diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py
index fbafdfc1..3ecb2648 100644
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@@ -189,6 +189,10 @@ class LayoutModel(BasePageModel):
                     np.mean([c.confidence for c in processed_clusters])
                 )
 
+                conv_res.confidence.pages[page.page_no].ocr_score = float(
+                    np.mean([c.confidence for c in processed_cells if c.from_ocr])
+                )
+
                 page.cells = processed_cells
                 page.predictions.layout = LayoutPrediction(
                     clusters=processed_clusters
diff --git a/docling/models/page_assemble_model.py b/docling/models/page_assemble_model.py
index a71ba8a9..a67cc1c2 100644
--- a/docling/models/page_assemble_model.py
+++ b/docling/models/page_assemble_model.py
@@ -160,13 +160,15 @@ class PageAssembleModel(BasePageModel):
 
                 # Aggregate page score
                 scores = conv_res.confidence.pages[page.page_no]
-                scores.overall_score = float(np.nanmean(
-                    [
-                        scores.ocr_score,
-                        scores.table_score,
-                        scores.layout_score,
-                        scores.parse_score,
-                    ]
-                ))
+                scores.overall_score = float(
+                    np.nanmean(
+                        [
+                            scores.ocr_score,
+                            scores.table_score,
+                            scores.layout_score,
+                            scores.parse_score,
+                        ]
+                    )
+                )
 
                 yield page
diff --git a/docling/models/page_preprocessing_model.py b/docling/models/page_preprocessing_model.py
index d1b29e38..5f901f65 100644
--- a/docling/models/page_preprocessing_model.py
+++ b/docling/models/page_preprocessing_model.py
@@ -1,10 +1,11 @@
 from pathlib import Path
 from typing import Iterable, Optional
 
+import numpy as np
 from PIL import ImageDraw
 from pydantic import BaseModel
 
-from docling.datamodel.base_models import Page
+from docling.datamodel.base_models import Page, ScoreValue
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
@@ -59,6 +60,16 @@ class PagePreprocessingModel(BasePageModel):
             if self.options.create_parsed_page:
                 page.parsed_page = page._backend.get_segmented_page()
 
+            # Rate the text quality from the PDF parser and aggregate it per page
+            text_scores = []
+            for c in page.cells:
+                score = self.rate_text_quality(c.text)
+                text_scores.append(score)
+
+            conv_res.confidence.pages[page.page_no].parse_score = float(
+                np.nanmean(text_scores)
+            )
+
             # DEBUG code:
             def draw_text_boxes(image, cells, show: bool = False):
                 draw = ImageDraw.Draw(image)
@@ -87,3 +98,13 @@
             draw_text_boxes(page.get_image(scale=1.0), page.cells)
 
         return page
+
+    def rate_text_quality(self, text: str) -> ScoreValue:
+        """Rates the quality of a given text string by checking for common PDF parsing issues."""
+
+        # Very crude first-cut rating function; must be improved.
+        contains_glyph = "GLYPH<" in text
+        if contains_glyph:
+            return 0.0
+
+        return 1.0
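
Note (illustrative sketch, not part of the patch): the aggregation above relies on
np.nanmean so that sub-scores which were never computed are skipped rather than
counted as zero. A minimal standalone example, assuming uncomputed scores default
to NaN (the variable names mirror the fields used in PageAssembleModel):

    import numpy as np

    # NaN marks a sub-score that was never computed, e.g. no OCR cells on the page.
    ocr_score = float("nan")
    table_score = 0.9
    layout_score = 0.85
    parse_score = 1.0  # rate_text_quality found no "GLYPH<" artifacts

    # np.nanmean ignores the NaN entry: (0.9 + 0.85 + 1.0) / 3 ≈ 0.9167
    overall_score = float(np.nanmean([ocr_score, table_score, layout_score, parse_score]))
    print(overall_score)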