From efc25225ac6badd3b2651a072182191f281dcb1a Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Mon, 16 Dec 2024 14:42:01 +0100 Subject: [PATCH] Introduce OCR confidence, propagate to orphan in post-processing Signed-off-by: Christoph Auer Signed-off-by: Christoph Auer --- docling/datamodel/pipeline_options.py | 2 ++ docling/models/easyocr_model.py | 1 + docling/models/layout_model.py | 2 -- docling/utils/glm_utils.py | 10 ++++++++-- docling/utils/layout_postprocessor.py | 25 +++++++++++++++---------- poetry.lock | 8 ++++---- pyproject.toml | 2 +- 7 files changed, 31 insertions(+), 19 deletions(-) diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index cf1689da..1f83af7f 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -139,6 +139,8 @@ class EasyOcrOptions(OcrOptions): use_gpu: Optional[bool] = None + confidence_threshold: float = 0.75 + model_storage_directory: Optional[str] = None download_enabled: bool = True diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py index 5de1409c..0d13f2b3 100644 --- a/docling/models/easyocr_model.py +++ b/docling/models/easyocr_model.py @@ -117,6 +117,7 @@ class EasyOcrModel(BaseOcrModel): ), ) for ix, line in enumerate(result) + if line[2] >= self.options.confidence_threshold ] all_ocr_cells.extend(cells) diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index 3dc83eba..014cddd3 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -56,8 +56,6 @@ class LayoutModel(BasePageModel): artifact_path=str(artifacts_path), device=device, num_threads=accelerator_options.num_threads, - base_threshold=0.6, - blacklist_classes={"Form", "Key-Value Region"}, ) def draw_clusters_and_cells_side_by_side( diff --git a/docling/utils/glm_utils.py b/docling/utils/glm_utils.py index 3289017b..548c39f5 100644 --- a/docling/utils/glm_utils.py +++ b/docling/utils/glm_utils.py @@ 
-259,9 +259,15 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument: tbl = doc.add_table(data=tbl_data, prov=prov, label=item_label) tbl.captions.extend(caption_refs) - elif ptype in ["form", "key_value_region"]: + elif ptype in [DocItemLabel.FORM.value, DocItemLabel.KEY_VALUE_REGION.value]: label = DocItemLabel(ptype) - container_el = doc.add_group(label=GroupLabel.UNSPECIFIED, name=label) + group_label = GroupLabel.UNSPECIFIED + if label == DocItemLabel.FORM: + group_label = GroupLabel.FORM_AREA + elif label == DocItemLabel.KEY_VALUE_REGION: + group_label = GroupLabel.KEY_VALUE_AREA + + container_el = doc.add_group(label=group_label) _add_child_elements(container_el, doc, obj, pelem) diff --git a/docling/utils/layout_postprocessor.py b/docling/utils/layout_postprocessor.py index ca2a44ef..faf0808c 100644 --- a/docling/utils/layout_postprocessor.py +++ b/docling/utils/layout_postprocessor.py @@ -7,7 +7,7 @@ from typing import Dict, List, Set, Tuple from docling_core.types.doc import DocItemLabel, Size from rtree import index -from docling.datamodel.base_models import BoundingBox, Cell, Cluster +from docling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell _log = logging.getLogger(__name__) @@ -255,16 +255,21 @@ class LayoutPostprocessor: unassigned = self._find_unassigned_cells(clusters) if unassigned: next_id = max((c.id for c in clusters), default=0) + 1 - orphan_clusters = [ - Cluster( - id=next_id + i, - label=DocItemLabel.TEXT, - bbox=cell.bbox, - confidence=0.0, - cells=[cell], + orphan_clusters = [] + for i, cell in enumerate(unassigned): + conf = 1.0 + if isinstance(cell, OcrCell): + conf = cell.confidence + + orphan_clusters.append( + Cluster( + id=next_id + i, + label=DocItemLabel.TEXT, + bbox=cell.bbox, + confidence=conf, + cells=[cell], + ) ) - for i, cell in enumerate(unassigned) - ] clusters.extend(orphan_clusters) # Iterative refinement diff --git a/poetry.lock b/poetry.lock index c9ebd4f4..66eb0239 
100644 --- a/poetry.lock +++ b/poetry.lock @@ -888,13 +888,13 @@ files = [ [[package]] name = "docling-core" -version = "2.10.0" +version = "2.11.0" description = "A python library to define and validate data types in Docling." optional = false python-versions = "<4.0,>=3.9" files = [ - {file = "docling_core-2.10.0-py3-none-any.whl", hash = "sha256:b4fe310cd0f1edde7d727e15cb39f8b5a31d2bd5b1ac5af3f4670ac5209c9057"}, - {file = "docling_core-2.10.0.tar.gz", hash = "sha256:f9b33074de048afb4cb6be784d52f97f8723d1d41737096e575629e0bb30add8"}, + {file = "docling_core-2.11.0-py3-none-any.whl", hash = "sha256:6a8890a1b68d88dd2d2e43febd97ec6b71aa18f265ed308e86b44186f33e8e22"}, + {file = "docling_core-2.11.0.tar.gz", hash = "sha256:d69993f2561ec1a17ceb8f06b20e33c7a3a9ba0d117542eb1dfb43ddc5a9b734"}, ] [package.dependencies] @@ -7613,4 +7613,4 @@ tesserocr = ["tesserocr"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "c99badc27c127051233e278f497b98acda8239697ce1cded43a2b05eab28795e" +content-hash = "8a45a0b5be48d121e8a4611c4535022d9393d511e63d68157bc33568a009f3c3" diff --git a/pyproject.toml b/pyproject.toml index f3d2efdb..da97348a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ packages = [{include = "docling"}] # actual dependencies: ###################### python = "^3.9" -docling-core = { version = "^2.10.0", extras = ["chunking"] } +docling-core = { version = "^2.11.0", extras = ["chunking"] } pydantic = "^2.0.0" docling-ibm-models = "^3.1.0" deepsearch-glm = "^1.0.0"