Add OCR confidence and parse confidence (stub)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-04-07 14:49:53 +02:00
parent c907af0928
commit 83e0fa2f5e
3 changed files with 36 additions and 9 deletions

View File

@ -189,6 +189,10 @@ class LayoutModel(BasePageModel):
np.mean([c.confidence for c in processed_clusters])
)
conv_res.confidence.pages[page.page_no].ocr_score = float(
np.mean([c.confidence for c in processed_cells if c.from_ocr])
)
page.cells = processed_cells
page.predictions.layout = LayoutPrediction(
clusters=processed_clusters

View File

@ -160,13 +160,15 @@ class PageAssembleModel(BasePageModel):
# Aggregate page score
scores = conv_res.confidence.pages[page.page_no]
scores.overall_score = float(np.nanmean(
scores.overall_score = float(
np.nanmean(
[
scores.ocr_score,
scores.table_score,
scores.layout_score,
scores.parse_score,
]
))
)
)
yield page

View File

@ -1,10 +1,11 @@
from pathlib import Path
from typing import Iterable, Optional
import numpy as np
from PIL import ImageDraw
from pydantic import BaseModel
from docling.datamodel.base_models import Page
from docling.datamodel.base_models import Page, ScoreValue
from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
@ -59,6 +60,16 @@ class PagePreprocessingModel(BasePageModel):
if self.options.create_parsed_page:
page.parsed_page = page._backend.get_segmented_page()
# Rate the text quality from the PDF parser, and aggregate on page
text_scores = []
for c in page.cells:
score = self.rate_text_quality(c.text)
text_scores.append(score)
conv_res.confidence.pages[page.page_no].parse_score = float(
np.nanmean(text_scores)
)
# DEBUG code:
def draw_text_boxes(image, cells, show: bool = False):
draw = ImageDraw.Draw(image)
@ -87,3 +98,13 @@ class PagePreprocessingModel(BasePageModel):
draw_text_boxes(page.get_image(scale=1.0), page.cells)
return page
def rate_text_quality(self, text) -> ScoreValue:
    """Rate the quality of a text string produced by the PDF parser.

    This is a stub heuristic: the text is scored as either fully bad or
    fully good depending on whether it contains the literal marker
    ``GLYPH<``.

    Args:
        text: The text content to rate (a ``str``).

    Returns:
        ``0.0`` if ``text`` contains ``"GLYPH<"``, otherwise ``1.0``
        (an empty string therefore rates ``1.0``).
    """
    # Very rough placeholder rating; must be improved.
    # NOTE(review): "GLYPH<" is presumably what the parser emits for glyphs
    # it could not map to text -- confirm against the PDF backend.
    # A membership test is the idiomatic substring check; str.find() is only
    # needed when the match position matters.
    return 0.0 if "GLYPH<" in text else 1.0