mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Fix garbage regex
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
95d2f5fd92
commit
a8a8b8e0f9
@@ -28,8 +28,8 @@ class PagePreprocessingModel(BasePageModel):
|
|||||||
self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
|
self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
|
||||||
self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
|
self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
|
||||||
self.SLASH_NUMBER_GARBAGE_RE = re.compile(
|
self.SLASH_NUMBER_GARBAGE_RE = re.compile(
|
||||||
r"(?:/\w+\s+){5,}"
|
r"(?:/\w+\s*){2,}"
|
||||||
) # Five or more "/token " sequences
|
) # Two or more "/token " sequences
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||||
|
Loading…
Reference in New Issue
Block a user