mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 12:34:22 +00:00
Introduce OCR confidence, propagate to orphan in post-processing
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
c020f2cba3
commit
efc25225ac
@ -139,6 +139,8 @@ class EasyOcrOptions(OcrOptions):
|
|||||||
|
|
||||||
use_gpu: Optional[bool] = None
|
use_gpu: Optional[bool] = None
|
||||||
|
|
||||||
|
confidence_threshold: float = 0.75
|
||||||
|
|
||||||
model_storage_directory: Optional[str] = None
|
model_storage_directory: Optional[str] = None
|
||||||
download_enabled: bool = True
|
download_enabled: bool = True
|
||||||
|
|
||||||
|
@ -117,6 +117,7 @@ class EasyOcrModel(BaseOcrModel):
|
|||||||
),
|
),
|
||||||
)
|
)
|
||||||
for ix, line in enumerate(result)
|
for ix, line in enumerate(result)
|
||||||
|
if line[2] >= self.options.confidence_threshold
|
||||||
]
|
]
|
||||||
all_ocr_cells.extend(cells)
|
all_ocr_cells.extend(cells)
|
||||||
|
|
||||||
|
@ -56,8 +56,6 @@ class LayoutModel(BasePageModel):
|
|||||||
artifact_path=str(artifacts_path),
|
artifact_path=str(artifacts_path),
|
||||||
device=device,
|
device=device,
|
||||||
num_threads=accelerator_options.num_threads,
|
num_threads=accelerator_options.num_threads,
|
||||||
base_threshold=0.6,
|
|
||||||
blacklist_classes={"Form", "Key-Value Region"},
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def draw_clusters_and_cells_side_by_side(
|
def draw_clusters_and_cells_side_by_side(
|
||||||
|
@ -259,9 +259,15 @@ def to_docling_document(doc_glm, update_name_label=False) -> DoclingDocument:
|
|||||||
tbl = doc.add_table(data=tbl_data, prov=prov, label=item_label)
|
tbl = doc.add_table(data=tbl_data, prov=prov, label=item_label)
|
||||||
tbl.captions.extend(caption_refs)
|
tbl.captions.extend(caption_refs)
|
||||||
|
|
||||||
elif ptype in ["form", "key_value_region"]:
|
elif ptype in [DocItemLabel.FORM.value, DocItemLabel.KEY_VALUE_REGION.value]:
|
||||||
label = DocItemLabel(ptype)
|
label = DocItemLabel(ptype)
|
||||||
container_el = doc.add_group(label=GroupLabel.UNSPECIFIED, name=label)
|
group_label = GroupLabel.UNSPECIFIED
|
||||||
|
if label == DocItemLabel.FORM:
|
||||||
|
group_label = GroupLabel.FORM_AREA
|
||||||
|
elif label == DocItemLabel.KEY_VALUE_REGION:
|
||||||
|
group_label = GroupLabel.KEY_VALUE_AREA
|
||||||
|
|
||||||
|
container_el = doc.add_group(label=group_label)
|
||||||
|
|
||||||
_add_child_elements(container_el, doc, obj, pelem)
|
_add_child_elements(container_el, doc, obj, pelem)
|
||||||
|
|
||||||
|
@ -7,7 +7,7 @@ from typing import Dict, List, Set, Tuple
|
|||||||
from docling_core.types.doc import DocItemLabel, Size
|
from docling_core.types.doc import DocItemLabel, Size
|
||||||
from rtree import index
|
from rtree import index
|
||||||
|
|
||||||
from docling.datamodel.base_models import BoundingBox, Cell, Cluster
|
from docling.datamodel.base_models import BoundingBox, Cell, Cluster, OcrCell
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -255,16 +255,21 @@ class LayoutPostprocessor:
|
|||||||
unassigned = self._find_unassigned_cells(clusters)
|
unassigned = self._find_unassigned_cells(clusters)
|
||||||
if unassigned:
|
if unassigned:
|
||||||
next_id = max((c.id for c in clusters), default=0) + 1
|
next_id = max((c.id for c in clusters), default=0) + 1
|
||||||
orphan_clusters = [
|
orphan_clusters = []
|
||||||
|
for i, cell in enumerate(unassigned):
|
||||||
|
conf = 1.0
|
||||||
|
if isinstance(cell, OcrCell):
|
||||||
|
conf = cell.confidence
|
||||||
|
|
||||||
|
orphan_clusters.append(
|
||||||
Cluster(
|
Cluster(
|
||||||
id=next_id + i,
|
id=next_id + i,
|
||||||
label=DocItemLabel.TEXT,
|
label=DocItemLabel.TEXT,
|
||||||
bbox=cell.bbox,
|
bbox=cell.bbox,
|
||||||
confidence=0.0,
|
confidence=conf,
|
||||||
cells=[cell],
|
cells=[cell],
|
||||||
)
|
)
|
||||||
for i, cell in enumerate(unassigned)
|
)
|
||||||
]
|
|
||||||
clusters.extend(orphan_clusters)
|
clusters.extend(orphan_clusters)
|
||||||
|
|
||||||
# Iterative refinement
|
# Iterative refinement
|
||||||
|
8
poetry.lock
generated
8
poetry.lock
generated
@ -888,13 +888,13 @@ files = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "docling-core"
|
name = "docling-core"
|
||||||
version = "2.10.0"
|
version = "2.11.0"
|
||||||
description = "A python library to define and validate data types in Docling."
|
description = "A python library to define and validate data types in Docling."
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = "<4.0,>=3.9"
|
python-versions = "<4.0,>=3.9"
|
||||||
files = [
|
files = [
|
||||||
{file = "docling_core-2.10.0-py3-none-any.whl", hash = "sha256:b4fe310cd0f1edde7d727e15cb39f8b5a31d2bd5b1ac5af3f4670ac5209c9057"},
|
{file = "docling_core-2.11.0-py3-none-any.whl", hash = "sha256:6a8890a1b68d88dd2d2e43febd97ec6b71aa18f265ed308e86b44186f33e8e22"},
|
||||||
{file = "docling_core-2.10.0.tar.gz", hash = "sha256:f9b33074de048afb4cb6be784d52f97f8723d1d41737096e575629e0bb30add8"},
|
{file = "docling_core-2.11.0.tar.gz", hash = "sha256:d69993f2561ec1a17ceb8f06b20e33c7a3a9ba0d117542eb1dfb43ddc5a9b734"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
@ -7613,4 +7613,4 @@ tesserocr = ["tesserocr"]
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.9"
|
python-versions = "^3.9"
|
||||||
content-hash = "c99badc27c127051233e278f497b98acda8239697ce1cded43a2b05eab28795e"
|
content-hash = "8a45a0b5be48d121e8a4611c4535022d9393d511e63d68157bc33568a009f3c3"
|
||||||
|
@ -25,7 +25,7 @@ packages = [{include = "docling"}]
|
|||||||
# actual dependencies:
|
# actual dependencies:
|
||||||
######################
|
######################
|
||||||
python = "^3.9"
|
python = "^3.9"
|
||||||
docling-core = { version = "^2.10.0", extras = ["chunking"] }
|
docling-core = { version = "^2.11.0", extras = ["chunking"] }
|
||||||
pydantic = "^2.0.0"
|
pydantic = "^2.0.0"
|
||||||
docling-ibm-models = "^3.1.0"
|
docling-ibm-models = "^3.1.0"
|
||||||
deepsearch-glm = "^1.0.0"
|
deepsearch-glm = "^1.0.0"
|
||||||
|
Loading…
Reference in New Issue
Block a user