Add OCR confidence and parse confidence (stub)

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-04-07 14:49:53 +02:00
parent c907af0928
commit 83e0fa2f5e
3 changed files with 36 additions and 9 deletions

View File

@ -189,6 +189,10 @@ class LayoutModel(BasePageModel):
np.mean([c.confidence for c in processed_clusters]) np.mean([c.confidence for c in processed_clusters])
) )
conv_res.confidence.pages[page.page_no].ocr_score = float(
np.mean([c.confidence for c in processed_cells if c.from_ocr])
)
page.cells = processed_cells page.cells = processed_cells
page.predictions.layout = LayoutPrediction( page.predictions.layout = LayoutPrediction(
clusters=processed_clusters clusters=processed_clusters

View File

@ -160,13 +160,15 @@ class PageAssembleModel(BasePageModel):
# Aggregate page score # Aggregate page score
scores = conv_res.confidence.pages[page.page_no] scores = conv_res.confidence.pages[page.page_no]
scores.overall_score = float(np.nanmean( scores.overall_score = float(
[ np.nanmean(
scores.ocr_score, [
scores.table_score, scores.ocr_score,
scores.layout_score, scores.table_score,
scores.parse_score, scores.layout_score,
] scores.parse_score,
)) ]
)
)
yield page yield page

View File

@ -1,10 +1,11 @@
from pathlib import Path from pathlib import Path
from typing import Iterable, Optional from typing import Iterable, Optional
import numpy as np
from PIL import ImageDraw from PIL import ImageDraw
from pydantic import BaseModel from pydantic import BaseModel
from docling.datamodel.base_models import Page from docling.datamodel.base_models import Page, ScoreValue
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel from docling.models.base_model import BasePageModel
@ -59,6 +60,16 @@ class PagePreprocessingModel(BasePageModel):
if self.options.create_parsed_page: if self.options.create_parsed_page:
page.parsed_page = page._backend.get_segmented_page() page.parsed_page = page._backend.get_segmented_page()
# Rate the text quality from the PDF parser, and aggregate on page
text_scores = []
for c in page.cells:
score = self.rate_text_quality(c.text)
text_scores.append(score)
conv_res.confidence.pages[page.page_no].parse_score = float(
np.nanmean(text_scores)
)
# DEBUG code: # DEBUG code:
def draw_text_boxes(image, cells, show: bool = False): def draw_text_boxes(image, cells, show: bool = False):
draw = ImageDraw.Draw(image) draw = ImageDraw.Draw(image)
@ -87,3 +98,13 @@ class PagePreprocessingModel(BasePageModel):
draw_text_boxes(page.get_image(scale=1.0), page.cells) draw_text_boxes(page.get_image(scale=1.0), page.cells)
return page return page
def rate_text_quality(self, text: str) -> ScoreValue:
    """Rate the quality of a text string produced by the PDF parser.

    This is a deliberately crude placeholder heuristic and should be
    improved: the only signal inspected is whether the text contains the
    marker substring ``"GLYPH<"``.
    NOTE(review): presumably that marker is emitted by the PDF backend for
    glyphs it could not map to characters -- confirm against the backend.

    Args:
        text: The text to rate.

    Returns:
        ``0.0`` if ``text`` contains ``"GLYPH<"``, otherwise ``1.0``.
    """
    # Only presence matters, not position, so use a membership test
    # instead of comparing the index returned by str.find().
    if "GLYPH<" in text:
        return 0.0
    return 1.0