mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Introduce mean_score and low_score, consistent aggregate computations
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
f9496e4a91
commit
2a6537289b
@ -315,29 +315,82 @@ class QualityGrade(str, Enum):
|
||||
|
||||
|
||||
class PageConfidenceScores(BaseModel):
|
||||
overall_score: ScoreValue = np.nan
|
||||
|
||||
parse_score: ScoreValue = np.nan
|
||||
layout_score: ScoreValue = np.nan
|
||||
table_score: ScoreValue = np.nan
|
||||
ocr_score: ScoreValue = np.nan
|
||||
|
||||
@computed_field # type: ignore
|
||||
@property
|
||||
def grade(self) -> QualityGrade:
|
||||
if self.overall_score < 0.5:
|
||||
def _score_to_grade(self, score: ScoreValue) -> QualityGrade:
|
||||
if score < 0.5:
|
||||
return QualityGrade.POOR
|
||||
elif self.overall_score < 0.8:
|
||||
elif score < 0.8:
|
||||
return QualityGrade.FAIR
|
||||
elif self.overall_score < 0.9:
|
||||
elif score < 0.9:
|
||||
return QualityGrade.GOOD
|
||||
elif self.overall_score >= 0.9:
|
||||
elif score >= 0.9:
|
||||
return QualityGrade.EXCELLENT
|
||||
|
||||
return QualityGrade.UNSPECIFIED
|
||||
|
||||
@computed_field # type: ignore
|
||||
@property
|
||||
def mean_grade(self) -> QualityGrade:
|
||||
return self._score_to_grade(self.mean_score)
|
||||
|
||||
@computed_field # type: ignore
|
||||
@property
|
||||
def low_grade(self) -> QualityGrade:
|
||||
return self._score_to_grade(self.low_score)
|
||||
|
||||
@computed_field # type: ignore
|
||||
@property
|
||||
def mean_score(self) -> ScoreValue:
|
||||
return ScoreValue(
|
||||
np.nanmean(
|
||||
[
|
||||
self.ocr_score,
|
||||
self.table_score,
|
||||
self.layout_score,
|
||||
self.parse_score,
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
@computed_field # type: ignore
|
||||
@property
|
||||
def low_score(self) -> ScoreValue:
|
||||
return ScoreValue(
|
||||
np.nanquantile(
|
||||
[
|
||||
self.ocr_score,
|
||||
self.table_score,
|
||||
self.layout_score,
|
||||
self.parse_score,
|
||||
],
|
||||
q=0.05,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
class ConfidenceReport(PageConfidenceScores):
|
||||
pages: Dict[int, PageConfidenceScores] = Field(
|
||||
default_factory=lambda: defaultdict(PageConfidenceScores)
|
||||
)
|
||||
|
||||
@computed_field # type: ignore
|
||||
@property
|
||||
def mean_score(self) -> ScoreValue:
|
||||
return ScoreValue(
|
||||
np.nanmean(
|
||||
[c.mean_score for c in self.pages.values()],
|
||||
)
|
||||
)
|
||||
|
||||
@computed_field # type: ignore
|
||||
@property
|
||||
def low_score(self) -> ScoreValue:
|
||||
return ScoreValue(
|
||||
np.nanmean(
|
||||
[c.low_score for c in self.pages.values()],
|
||||
)
|
||||
)
|
||||
|
@ -153,17 +153,4 @@ class PageAssembleModel(BasePageModel):
|
||||
elements=elements, headers=headers, body=body
|
||||
)
|
||||
|
||||
# Aggregate page score
|
||||
scores = conv_res.confidence.pages[page.page_no]
|
||||
scores.overall_score = float(
|
||||
np.nanmean(
|
||||
[
|
||||
scores.ocr_score,
|
||||
scores.table_score,
|
||||
scores.layout_score,
|
||||
scores.parse_score,
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
yield page
|
||||
|
@ -269,13 +269,6 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
)
|
||||
)
|
||||
|
||||
conv_res.confidence.overall_score = float(
|
||||
np.nanquantile(
|
||||
[c.overall_score for c in conv_res.confidence.pages.values()],
|
||||
q=0.05, # overall score should relate to worst 5% of page scores.
|
||||
)
|
||||
)
|
||||
|
||||
return conv_res
|
||||
|
||||
@classmethod
|
||||
|
Loading…
Reference in New Issue
Block a user