Mirror of https://github.com/DS4SD/docling.git (synced 2025-07-27 04:24:45 +00:00)
Add parse quality rules, use 5% percentile for overall and parse scores

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

commit 16c90b64f5
parent 83e0fa2f5e
@@ -1,3 +1,4 @@
+import re
 from pathlib import Path
 from typing import Iterable, Optional
 
@@ -21,6 +22,11 @@ class PagePreprocessingModel(BasePageModel):
     def __init__(self, options: PagePreprocessingOptions):
         self.options = options
 
+        # Pre-compiled regex patterns for efficiency
+        self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
+        self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
+        self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
+
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
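The three pre-compiled patterns target typical artifacts of broken PDF text extraction: unmapped glyph placeholders ("GLYPH<..>"), runs of raw glyph IDs ("/G12/G7..."), and words fragmented by stray slashes and dots. A quick standalone check against made-up sample strings (not taken from docling's test data):

import re

# Same patterns as compiled in PagePreprocessingModel.__init__ above.
GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")

samples = [
    "Revenue grew GLYPH<3a> percent",  # hypothetical unmapped-glyph artifact
    "/G12/G7/G40/G5",                  # hypothetical run of raw glyph IDs
    "T/he.qu/ick.br/own fox",          # hypothetical fragmented word
    "A perfectly normal sentence.",
]
for s in samples:
    print(
        bool(GLYPH_RE.search(s)),
        bool(SLASH_G_RE.search(s)),
        bool(FRAG_RE.search(s)),
        s,
    )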
@@ -67,7 +73,9 @@ class PagePreprocessingModel(BasePageModel):
             text_scores.append(score)
 
         conv_res.confidence.pages[page.page_no].parse_score = float(
-            np.nanmean(text_scores)
+            np.nanquantile(
+                text_scores, q=0.05
+            )  # To emphasise problems in the parse_score, we take the 5% percentile score of all text cells.
         )
 
         # DEBUG code:
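Switching the per-page parse_score from the mean to the 5% quantile of the cell scores means that a handful of badly parsed cells is enough to pull the page score down instead of being averaged away. A small illustration with invented cell scores:

import numpy as np

# Hypothetical per-cell quality scores for one page: 90 clean cells, 10 mangled ones.
text_scores = [1.0] * 90 + [0.0] * 10

print(float(np.nanmean(text_scores)))              # 0.9 -- the mean still looks healthy
print(float(np.nanquantile(text_scores, q=0.05)))  # 0.0 -- the 5% quantile flags the page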
@@ -99,12 +107,26 @@ class PagePreprocessingModel(BasePageModel):
 
         return page
 
-    def rate_text_quality(self, text) -> ScoreValue:
-        """Rates the quality of a given text string by analyzing common PDF parsing issues."""
-
-        # Very poor-man rating function, must improve.
-        contains_glyph = text.find("GLYPH<") >= 0
-        if contains_glyph:
+    def rate_text_quality(self, text: str) -> float:
+        # Hard errors: if any of these patterns are found, return 0.0 immediately.
+        blacklist_chars = ["�"]
+        if (
+            self.GLYPH_RE.search(text)
+            or self.SLASH_G_RE.search(text)
+            or any([text.find(c) >= 0 for c in blacklist_chars])
+        ):
             return 0.0
 
-        return 1.0
+        penalty = 0.0
+
+        # Apply a penalty only if the fragmented words pattern occurs at least three times.
+        frag_matches = self.FRAG_RE.findall(text)
+        if len(frag_matches) >= 3:
+            penalty += 0.1 * len(frag_matches)
+
+        # Additional heuristic: if the average token length is below 2, add a penalty.
+        tokens = text.split()
+        if tokens and (sum(map(len, tokens)) / len(tokens)) < 2:
+            penalty += 0.2
+
+        return max(1.0 - penalty, 0.0)
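For experimenting outside the model class, here is a standalone sketch of the same rating heuristic: hard errors (glyph placeholders, raw glyph-ID runs, the U+FFFD replacement character) return 0.0 immediately, while fragmented words and very short tokens only accumulate soft penalties. The regexes and thresholds mirror the diff above; the sample inputs are made up:

import re

GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")


def rate_text_quality(text: str) -> float:
    # Hard errors: any of these means the extracted text is unusable.
    if GLYPH_RE.search(text) or SLASH_G_RE.search(text) or "\ufffd" in text:
        return 0.0

    penalty = 0.0

    # Soft penalty: fragmented words, only if the pattern occurs at least three times.
    frag_matches = FRAG_RE.findall(text)
    if len(frag_matches) >= 3:
        penalty += 0.1 * len(frag_matches)

    # Soft penalty: an average token length below 2 suggests shattered text.
    tokens = text.split()
    if tokens and (sum(map(len, tokens)) / len(tokens)) < 2:
        penalty += 0.2

    return max(1.0 - penalty, 0.0)


print(rate_text_quality("A clean, well-parsed sentence."))  # 1.0
print(rate_text_quality("Revenue grew GLYPH<3a> percent"))  # 0.0 (hard error)
print(rate_text_quality("a b c d e f"))                     # 0.8 (short-token penalty)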
@@ -250,8 +250,9 @@ class StandardPdfPipeline(PaginatedPipeline):
             )
         )
         conv_res.confidence.parse_score = float(
-            np.nanmean(
-                [c.parse_score for c in conv_res.confidence.pages.values()]
+            np.nanquantile(
+                [c.parse_score for c in conv_res.confidence.pages.values()],
+                q=0.05,  # parse score should relate to worst 5% of pages.
             )
         )
         conv_res.confidence.table_score = float(
@@ -266,8 +267,9 @@ class StandardPdfPipeline(PaginatedPipeline):
         )
 
         conv_res.confidence.overall_score = float(
-            np.nanmean(
-                [c.overall_score for c in conv_res.confidence.pages.values()]
+            np.nanquantile(
+                [c.overall_score for c in conv_res.confidence.pages.values()],
+                q=0.05,  # overall score should relate to worst 5% of page scores.
             )
         )
 
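At document level the same idea applies: parse_score and overall_score now track the worst 5% of pages rather than the average page. A minimal sketch with invented per-page scores (assuming the per-page confidence fields are already populated):

import numpy as np

# Hypothetical per-page parse scores for a 40-page document: 37 good pages, 3 bad ones.
page_parse_scores = [0.98] * 37 + [0.1, 0.1, 0.1]

print(round(float(np.nanmean(page_parse_scores)), 3))              # 0.914 -- mean barely reacts
print(round(float(np.nanquantile(page_parse_scores, q=0.05)), 3))  # 0.1 -- quantile tracks the bad pages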