diff --git a/docling/models/page_preprocessing_model.py b/docling/models/page_preprocessing_model.py
index 5f901f65..dc96c30f 100644
--- a/docling/models/page_preprocessing_model.py
+++ b/docling/models/page_preprocessing_model.py
@@ -1,3 +1,4 @@
+import re
 from pathlib import Path
 from typing import Iterable, Optional
 
@@ -21,6 +22,11 @@ class PagePreprocessingModel(BasePageModel):
     def __init__(self, options: PagePreprocessingOptions):
         self.options = options
 
+        # Pre-compiled regex patterns for efficiency
+        self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
+        self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
+        self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
+
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
@@ -67,7 +73,9 @@ class PagePreprocessingModel(BasePageModel):
             text_scores.append(score)
 
         conv_res.confidence.pages[page.page_no].parse_score = float(
-            np.nanmean(text_scores)
+            np.nanquantile(
+                text_scores, q=0.05
+            )  # To emphasise problems in the parse_score, we take the 5th-percentile score of all text cells.
         )
 
         # DEBUG code:
@@ -99,12 +107,26 @@ class PagePreprocessingModel(BasePageModel):
 
         return page
 
-    def rate_text_quality(self, text) -> ScoreValue:
-        """Rates the quality of a given text string by analyzing common PDF parsing issues."""
-
-        # Very poor-man rating function, must improve.
-        contains_glyph = text.find("GLYPH<") >= 0
-        if contains_glyph:
+    def rate_text_quality(self, text: str) -> float:
+        # Hard errors: if any of these patterns are found, return 0.0 immediately.
+        blacklist_chars = ["�"]
+        if (
+            self.GLYPH_RE.search(text)
+            or self.SLASH_G_RE.search(text)
+            or any([text.find(c) >= 0 for c in blacklist_chars])
+        ):
             return 0.0
 
-        return 1.0
+        penalty = 0.0
+
+        # Apply a penalty only if the fragmented-words pattern occurs at least three times.
+        frag_matches = self.FRAG_RE.findall(text)
+        if len(frag_matches) >= 3:
+            penalty += 0.1 * len(frag_matches)
+
+        # Additional heuristic: if the average token length is below 2, add a penalty.
+        tokens = text.split()
+        if tokens and (sum(map(len, tokens)) / len(tokens)) < 2:
+            penalty += 0.2
+
+        return max(1.0 - penalty, 0.0)
diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py
index 9c9326f4..2b427bd6 100644
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@@ -250,8 +250,9 @@ class StandardPdfPipeline(PaginatedPipeline):
             )
         )
         conv_res.confidence.parse_score = float(
-            np.nanmean(
-                [c.parse_score for c in conv_res.confidence.pages.values()]
+            np.nanquantile(
+                [c.parse_score for c in conv_res.confidence.pages.values()],
+                q=0.05,  # parse score should relate to worst 5% of pages.
             )
         )
         conv_res.confidence.table_score = float(
@@ -266,8 +267,9 @@
             )
 
         conv_res.confidence.overall_score = float(
-            np.nanmean(
-                [c.overall_score for c in conv_res.confidence.pages.values()]
+            np.nanquantile(
+                [c.overall_score for c in conv_res.confidence.pages.values()],
+                q=0.05,  # overall score should relate to worst 5% of page scores.
             )
         )
 
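
For illustration only (not part of the patch): a standalone restatement of the new rating heuristic, exercised on invented sample strings. The regex patterns and thresholds are copied from the diff; the function and the test strings are hypothetical and only show how the hard errors, fragmentation penalty, and short-token penalty combine.

import re

GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")

def rate_text_quality(text: str) -> float:
    # Hard errors: unresolved glyph references, /G-codes, or replacement characters score 0.0.
    if GLYPH_RE.search(text) or SLASH_G_RE.search(text) or "�" in text:
        return 0.0
    penalty = 0.0
    # Fragmented-word pattern must occur at least three times before it is penalised.
    frag_matches = FRAG_RE.findall(text)
    if len(frag_matches) >= 3:
        penalty += 0.1 * len(frag_matches)
    # A very short average token length is another sign of a broken extraction.
    tokens = text.split()
    if tokens and sum(map(len, tokens)) / len(tokens) < 2:
        penalty += 0.2
    return max(1.0 - penalty, 0.0)

print(rate_text_quality("A normal, well-extracted sentence."))        # 1.0
print(rate_text_quality("GLYPH<0041>GLYPH<0042>GLYPH<0043>"))          # 0.0 (hard error)
print(rate_text_quality("a b c d e f g h"))                            # 0.8 (short-token penalty)
print(rate_text_quality("w/or.ds/pl.it x/ab.cd/ef.gh y/ij.kl/mn.op"))  # ≈0.7 (three fragmented-word matches)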
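
A quick sketch of why the patch switches the aggregation from np.nanmean to np.nanquantile with q=0.05: the 5th percentile is dominated by the worst scores, so a single badly parsed page (or text cell) drags the confidence down instead of being averaged away. The per-page scores below are invented for illustration.

import numpy as np

# Invented per-page parse scores: one badly parsed page among several clean ones.
page_scores = [0.98, 0.97, 0.99, 0.96, 0.05]

print(float(np.nanmean(page_scores)))              # ~0.79 -- the bad page is averaged away
print(float(np.nanquantile(page_scores, q=0.05)))  # ~0.23 -- dominated by the worst page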