mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Introduce OCR confidence, propagate to orphan in post-processing
Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
c020f2cba3
commit
efc25225ac
@ -139,6 +139,8 @@ class EasyOcrOptions(OcrOptions):
|
||||
|
||||
use_gpu: Optional[bool] = None
|
||||
|
||||
confidence_threshold: float = 0.75
|
||||
|
||||
model_storage_directory: Optional[str] = None
|
||||
download_enabled: bool = True
|
||||
|
||||
|
@ -117,6 +117,7 @@ class EasyOcrModel(BaseOcrModel):
|
||||
),
|
||||
)
|
||||
for ix, line in enumerate(result)
|
||||
if line[2] >= self.options.confidence_threshold
|
||||
]
|
||||
all_ocr_cells.extend(cells)
|
||||
|
||||
|
@ -56,8 +56,6 @@ class LayoutModel(BasePageModel):
|
||||
artifact_path=str(artifacts_path),
|
||||
device=device,
|
||||
num_threads=accelerator_options.num_threads,
|
||||
base_threshold=0.6,
|
||||
blacklist_classes={"Form", "Key-Value Region"},
|
||||
)
|
||||
|
||||
def draw_clusters_and_cells_side_by_side(
|
||||
|
@ -259,9 +259,15 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
|
||||
tbl = doc.add_table(data=tbl_data, prov=prov, label=item_label)
|
||||
tbl.captions.extend(caption_refs)
|
||||
|
||||
elif ptype in ["form", "key_value_region"]:
|
||||
elif ptype in [DocItemLabel.FORM.value, DocItemLabel.KEY_VALUE_REGION.value]:
|
||||
label = DocItemLabel(ptype)
|
||||
container_el = doc.add_group(label=GroupLabel.UNSPECIFIED, name=label)
|
||||
# Pick the group label for the container: fall back to UNSPECIFIED and
# refine it for form / key-value regions.
group_label = GroupLabel.UNSPECIFIED
if label == DocItemLabel.FORM:
    # Must be an assignment: `group_label == ...` is a comparison whose
    # result is discarded, which left the label stuck at UNSPECIFIED.
    group_label = GroupLabel.FORM_AREA
elif label == DocItemLabel.KEY_VALUE_REGION:
    group_label = GroupLabel.KEY_VALUE_AREA
|
||||
|
||||
container_el = doc.add_group(label=group_label)
|
||||
|
||||
_add_child_elements(container_el, doc, obj, pelem)
|
||||
|
||||
|
@ -7,7 +7,7 @@ from typing import Dict, List, Set, Tuple
|
||||
from docling_core.types.doc import DocItemLabel, Size
|
||||
from rtree import index
|
||||
|
||||
from docling.datamodel.base_models import BoundingBox, Cell, Cluster
|
||||
from docling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@ -255,16 +255,21 @@ class LayoutPostprocessor:
|
||||
unassigned = self._find_unassigned_cells(clusters)
|
||||
if unassigned:
|
||||
next_id = max((c.id for c in clusters), default=0) + 1
|
||||
orphan_clusters = [
|
||||
Cluster(
|
||||
id=next_id + i,
|
||||
label=DocItemLabel.TEXT,
|
||||
bbox=cell.bbox,
|
||||
confidence=0.0,
|
||||
cells=[cell],
|
||||
orphan_clusters = []
|
||||
for i, cell in enumerate(unassigned):
|
||||
conf = 1.0
|
||||
if isinstance(cell, OcrCell):
|
||||
conf = cell.confidence
|
||||
|
||||
orphan_clusters.append(
|
||||
Cluster(
|
||||
id=next_id + i,
|
||||
label=DocItemLabel.TEXT,
|
||||
bbox=cell.bbox,
|
||||
confidence=conf,
|
||||
cells=[cell],
|
||||
)
|
||||
)
|
||||
for i, cell in enumerate(unassigned)
|
||||
]
|
||||
clusters.extend(orphan_clusters)
|
||||
|
||||
# Iterative refinement
|
||||
|
8
poetry.lock
generated
8
poetry.lock
generated
@ -888,13 +888,13 @@ files = [
|
||||
|
||||
[[package]]
|
||||
name = "docling-core"
|
||||
version = "2.10.0"
|
||||
version = "2.11.0"
|
||||
description = "A python library to define and validate data types in Docling."
|
||||
optional = false
|
||||
python-versions = "<4.0,>=3.9"
|
||||
files = [
|
||||
{file = "docling_core-2.10.0-py3-none-any.whl", hash = "sha256:b4fe310cd0f1edde7d727e15cb39f8b5a31d2bd5b1ac5af3f4670ac5209c9057"},
|
||||
{file = "docling_core-2.10.0.tar.gz", hash = "sha256:f9b33074de048afb4cb6be784d52f97f8723d1d41737096e575629e0bb30add8"},
|
||||
{file = "docling_core-2.11.0-py3-none-any.whl", hash = "sha256:6a8890a1b68d88dd2d2e43febd97ec6b71aa18f265ed308e86b44186f33e8e22"},
|
||||
{file = "docling_core-2.11.0.tar.gz", hash = "sha256:d69993f2561ec1a17ceb8f06b20e33c7a3a9ba0d117542eb1dfb43ddc5a9b734"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@ -7613,4 +7613,4 @@ tesserocr = ["tesserocr"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "c99badc27c127051233e278f497b98acda8239697ce1cded43a2b05eab28795e"
|
||||
content-hash = "8a45a0b5be48d121e8a4611c4535022d9393d511e63d68157bc33568a009f3c3"
|
||||
|
@ -25,7 +25,7 @@ packages = [{include = "docling"}]
|
||||
# actual dependencies:
|
||||
######################
|
||||
python = "^3.9"
|
||||
docling-core = { version = "^2.10.0", extras = ["chunking"] }
|
||||
docling-core = { version = "^2.11.0", extras = ["chunking"] }
|
||||
pydantic = "^2.0.0"
|
||||
docling-ibm-models = "^3.1.0"
|
||||
deepsearch-glm = "^1.0.0"
|
||||
|
Loading…
Reference in New Issue
Block a user