Introduce an OCR confidence threshold and propagate OCR cell confidence to orphan clusters in layout post-processing

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-12-16 14:42:01 +01:00
parent c020f2cba3
commit efc25225ac
7 changed files with 31 additions and 19 deletions

View File

@ -139,6 +139,8 @@ class EasyOcrOptions(OcrOptions):
use_gpu: Optional[bool] = None use_gpu: Optional[bool] = None
confidence_threshold: float = 0.75
model_storage_directory: Optional[str] = None model_storage_directory: Optional[str] = None
download_enabled: bool = True download_enabled: bool = True

View File

@ -117,6 +117,7 @@ class EasyOcrModel(BaseOcrModel):
), ),
) )
for ix, line in enumerate(result) for ix, line in enumerate(result)
if line[2] >= self.options.confidence_threshold
] ]
all_ocr_cells.extend(cells) all_ocr_cells.extend(cells)

View File

@ -56,8 +56,6 @@ class LayoutModel(BasePageModel):
artifact_path=str(artifacts_path), artifact_path=str(artifacts_path),
device=device, device=device,
num_threads=accelerator_options.num_threads, num_threads=accelerator_options.num_threads,
base_threshold=0.6,
blacklist_classes={"Form", "Key-Value Region"},
) )
def draw_clusters_and_cells_side_by_side( def draw_clusters_and_cells_side_by_side(

View File

@ -259,9 +259,15 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
tbl = doc.add_table(data=tbl_data, prov=prov, label=item_label) tbl = doc.add_table(data=tbl_data, prov=prov, label=item_label)
tbl.captions.extend(caption_refs) tbl.captions.extend(caption_refs)
elif ptype in ["form", "key_value_region"]: elif ptype in [DocItemLabel.FORM.value, DocItemLabel.KEY_VALUE_REGION.value]:
label = DocItemLabel(ptype) label = DocItemLabel(ptype)
container_el = doc.add_group(label=GroupLabel.UNSPECIFIED, name=label) group_label = GroupLabel.UNSPECIFIED
# Pick the group label that corresponds to the item label. These must be
# assignments (`=`): a bare `==` comparison is evaluated and discarded,
# which would leave group_label at the default set just above.
if label == DocItemLabel.FORM:
    group_label = GroupLabel.FORM_AREA
elif label == DocItemLabel.KEY_VALUE_REGION:
    group_label = GroupLabel.KEY_VALUE_AREA
container_el = doc.add_group(label=group_label)
_add_child_elements(container_el, doc, obj, pelem) _add_child_elements(container_el, doc, obj, pelem)

View File

@ -7,7 +7,7 @@ from typing import Dict, List, Set, Tuple
from docling_core.types.doc import DocItemLabel, Size from docling_core.types.doc import DocItemLabel, Size
from rtree import index from rtree import index
from docling.datamodel.base_models import BoundingBox, Cell, Cluster from docling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -255,16 +255,21 @@ class LayoutPostprocessor:
unassigned = self._find_unassigned_cells(clusters) unassigned = self._find_unassigned_cells(clusters)
if unassigned: if unassigned:
next_id = max((c.id for c in clusters), default=0) + 1 next_id = max((c.id for c in clusters), default=0) + 1
orphan_clusters = [ orphan_clusters = []
Cluster( for i, cell in enumerate(unassigned):
id=next_id + i, conf = 1.0
label=DocItemLabel.TEXT, if isinstance(cell, OcrCell):
bbox=cell.bbox, conf = cell.confidence
confidence=0.0,
cells=[cell], orphan_clusters.append(
Cluster(
id=next_id + i,
label=DocItemLabel.TEXT,
bbox=cell.bbox,
confidence=conf,
cells=[cell],
)
) )
for i, cell in enumerate(unassigned)
]
clusters.extend(orphan_clusters) clusters.extend(orphan_clusters)
# Iterative refinement # Iterative refinement

8
poetry.lock generated
View File

@ -888,13 +888,13 @@ files = [
[[package]] [[package]]
name = "docling-core" name = "docling-core"
version = "2.10.0" version = "2.11.0"
description = "A python library to define and validate data types in Docling." description = "A python library to define and validate data types in Docling."
optional = false optional = false
python-versions = "<4.0,>=3.9" python-versions = "<4.0,>=3.9"
files = [ files = [
{file = "docling_core-2.10.0-py3-none-any.whl", hash = "sha256:b4fe310cd0f1edde7d727e15cb39f8b5a31d2bd5b1ac5af3f4670ac5209c9057"}, {file = "docling_core-2.11.0-py3-none-any.whl", hash = "sha256:6a8890a1b68d88dd2d2e43febd97ec6b71aa18f265ed308e86b44186f33e8e22"},
{file = "docling_core-2.10.0.tar.gz", hash = "sha256:f9b33074de048afb4cb6be784d52f97f8723d1d41737096e575629e0bb30add8"}, {file = "docling_core-2.11.0.tar.gz", hash = "sha256:d69993f2561ec1a17ceb8f06b20e33c7a3a9ba0d117542eb1dfb43ddc5a9b734"},
] ]
[package.dependencies] [package.dependencies]
@ -7613,4 +7613,4 @@ tesserocr = ["tesserocr"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.9" python-versions = "^3.9"
content-hash = "c99badc27c127051233e278f497b98acda8239697ce1cded43a2b05eab28795e" content-hash = "8a45a0b5be48d121e8a4611c4535022d9393d511e63d68157bc33568a009f3c3"

View File

@ -25,7 +25,7 @@ packages = [{include = "docling"}]
# actual dependencies: # actual dependencies:
###################### ######################
python = "^3.9" python = "^3.9"
docling-core = { version = "^2.10.0", extras = ["chunking"] } docling-core = { version = "^2.11.0", extras = ["chunking"] }
pydantic = "^2.0.0" pydantic = "^2.0.0"
docling-ibm-models = "^3.1.0" docling-ibm-models = "^3.1.0"
deepsearch-glm = "^1.0.0" deepsearch-glm = "^1.0.0"