mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Add OCR confidence and parse confidence (stub)
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
c907af0928
commit
83e0fa2f5e
@ -189,6 +189,10 @@ class LayoutModel(BasePageModel):
|
|||||||
np.mean([c.confidence for c in processed_clusters])
|
np.mean([c.confidence for c in processed_clusters])
|
||||||
)
|
)
|
||||||
|
|
||||||
|
conv_res.confidence.pages[page.page_no].ocr_score = float(
|
||||||
|
np.mean([c.confidence for c in processed_cells if c.from_ocr])
|
||||||
|
)
|
||||||
|
|
||||||
page.cells = processed_cells
|
page.cells = processed_cells
|
||||||
page.predictions.layout = LayoutPrediction(
|
page.predictions.layout = LayoutPrediction(
|
||||||
clusters=processed_clusters
|
clusters=processed_clusters
|
||||||
|
@ -160,13 +160,15 @@ class PageAssembleModel(BasePageModel):
|
|||||||
|
|
||||||
# Aggregate page score
|
# Aggregate page score
|
||||||
scores = conv_res.confidence.pages[page.page_no]
|
scores = conv_res.confidence.pages[page.page_no]
|
||||||
scores.overall_score = float(np.nanmean(
|
scores.overall_score = float(
|
||||||
[
|
np.nanmean(
|
||||||
scores.ocr_score,
|
[
|
||||||
scores.table_score,
|
scores.ocr_score,
|
||||||
scores.layout_score,
|
scores.table_score,
|
||||||
scores.parse_score,
|
scores.layout_score,
|
||||||
]
|
scores.parse_score,
|
||||||
))
|
]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
yield page
|
yield page
|
||||||
|
@ -1,10 +1,11 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterable, Optional
|
from typing import Iterable, Optional
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
from PIL import ImageDraw
|
from PIL import ImageDraw
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page, ScoreValue
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_model import BasePageModel
|
from docling.models.base_model import BasePageModel
|
||||||
@ -59,6 +60,16 @@ class PagePreprocessingModel(BasePageModel):
|
|||||||
if self.options.create_parsed_page:
|
if self.options.create_parsed_page:
|
||||||
page.parsed_page = page._backend.get_segmented_page()
|
page.parsed_page = page._backend.get_segmented_page()
|
||||||
|
|
||||||
|
# Rate the text quality from the PDF parser, and aggregate on page
|
||||||
|
text_scores = []
|
||||||
|
for c in page.cells:
|
||||||
|
score = self.rate_text_quality(c.text)
|
||||||
|
text_scores.append(score)
|
||||||
|
|
||||||
|
conv_res.confidence.pages[page.page_no].parse_score = float(
|
||||||
|
np.nanmean(text_scores)
|
||||||
|
)
|
||||||
|
|
||||||
# DEBUG code:
|
# DEBUG code:
|
||||||
def draw_text_boxes(image, cells, show: bool = False):
|
def draw_text_boxes(image, cells, show: bool = False):
|
||||||
draw = ImageDraw.Draw(image)
|
draw = ImageDraw.Draw(image)
|
||||||
@ -87,3 +98,13 @@ class PagePreprocessingModel(BasePageModel):
|
|||||||
draw_text_boxes(page.get_image(scale=1.0), page.cells)
|
draw_text_boxes(page.get_image(scale=1.0), page.cells)
|
||||||
|
|
||||||
return page
|
return page
|
||||||
|
|
||||||
|
def rate_text_quality(self, text) -> ScoreValue:
    """Rate the quality of a parsed text string.

    Very rough heuristic that must be improved: the only signal checked
    is whether the text contains the marker ``GLYPH<``.

    Args:
        text: The text of a single cell, as a ``str``.

    Returns:
        ``0.0`` if ``text`` contains the substring ``GLYPH<``, otherwise
        ``1.0``.
    """
    # NOTE(review): "GLYPH<" is presumably the placeholder the PDF parser
    # emits for glyphs it could not map to text -- confirm against the
    # backend before relying on this interpretation.
    if "GLYPH<" in text:
        return 0.0

    return 1.0
|
||||||
|
Loading…
Reference in New Issue
Block a user