feat!: Docling v2 (#117)

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Co-authored-by: Maxim Lysak <mly@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
This commit is contained in:
Christoph Auer
2024-10-16 21:02:03 +02:00
committed by GitHub
parent d504432c1e
commit 7d3be0edeb
144 changed files with 15180 additions and 3828 deletions

View File

@@ -1,9 +1,10 @@
import logging
from typing import Any, Dict, Iterable, List, Tuple, Union
from docling_core.types.doc.base import BaseCell, BaseText, Ref, Table, TableCell
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling_core.types.legacy_doc.base import BaseCell, BaseText, Ref, Table
from docling.datamodel.base_models import BoundingBox, CoordOrigin, OcrCell
from docling.datamodel.base_models import OcrCell
from docling.datamodel.document import ConversionResult, Page
_log = logging.getLogger(__name__)
@@ -40,7 +41,7 @@ def generate_multimodal_pages(
end_ix = 0
doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []
doc = doc_result.output
doc = doc_result.legacy_document
def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):
segments = []

View File

@@ -2,6 +2,7 @@ import copy
import logging
import networkx as nx
from docling_core.types.doc import DocItemLabel
logger = logging.getLogger("layout_utils")
@@ -370,7 +371,7 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
"Treating cluster " + str(ix) + ", type " + str(new_cluster["type"])
)
logger.debug(" with cells: " + str(new_cluster["cell_ids"]))
if len(cluster["cell_ids"]) == 0 and cluster["type"] != "Picture":
if len(cluster["cell_ids"]) == 0 and cluster["type"] != DocItemLabel.PICTURE:
logger.debug(" Empty non-picture, removed")
continue ## Skip this former cluster, now without cells.
new_bbox = adapt_bbox(raw_cells, new_cluster, orphan_cell_indices)
@@ -380,14 +381,14 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices):
def adapt_bbox(raw_cells, cluster, orphan_cell_indices):
if not (cluster["type"] in ["Table", "Picture"]):
if not (cluster["type"] in [DocItemLabel.TABLE, DocItemLabel.PICTURE]):
## A text-like cluster. The bbox only needs to be around the text cells:
logger.debug(" Initial bbox: " + str(cluster["bbox"]))
new_bbox = surrounding_list(
[raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]]
)
logger.debug(" New bounding box:" + str(new_bbox))
if cluster["type"] == "Picture":
if cluster["type"] == DocItemLabel.PICTURE:
## We only make the bbox completely comprise included text cells:
logger.debug(" Picture")
if len(cluster["cell_ids"]) != 0:
@@ -587,7 +588,7 @@ def set_orphan_as_text(
max_id = -1
figures = []
for cluster in cluster_predictions:
if cluster["type"] == "Picture":
if cluster["type"] == DocItemLabel.PICTURE:
figures.append(cluster)
if cluster["id"] > max_id:
@@ -638,13 +639,13 @@ def set_orphan_as_text(
# if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans:
if fig_flag == False and lines_detector == False:
# get class from low confidence detections if not set as text:
class_type = "Text"
class_type = DocItemLabel.TEXT
for cluster in cluster_predictions_low:
intersection = compute_intersection(
orph_cell["bbox"], cluster["bbox"]
)
class_type = "Text"
class_type = DocItemLabel.TEXT
if (
cluster["confidence"] > 0.1
and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4
@@ -718,7 +719,9 @@ def merge_cells(cluster_predictions):
if cluster["id"] == node:
lines.append(cluster)
cluster_predictions.remove(cluster)
new_merged_cluster = build_cluster_from_lines(lines, "Text", max_id)
new_merged_cluster = build_cluster_from_lines(
lines, DocItemLabel.TEXT, max_id
)
cluster_predictions.append(new_merged_cluster)
return cluster_predictions
@@ -753,9 +756,9 @@ def clean_up_clusters(
# remove clusters that might appear inside tables, or images (such as pdf cells in graphs)
elif img_table == True:
if (
cluster_1["type"] == "Text"
and cluster_2["type"] == "Picture"
or cluster_2["type"] == "Table"
cluster_1["type"] == DocItemLabel.TEXT
and cluster_2["type"] == DocItemLabel.PICTURE
or cluster_2["type"] == DocItemLabel.TABLE
):
if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > 0.5:
DuplicateDeletedClusterIDs.append(cluster_1["id"])
@@ -771,7 +774,10 @@ def clean_up_clusters(
DuplicateDeletedClusterIDs.append(cluster_1["id"])
# remove tables that have one pdf cell
if one_cell_table == True:
if cluster_1["type"] == "Table" and len(cluster_1["cell_ids"]) < 2:
if (
cluster_1["type"] == DocItemLabel.TABLE
and len(cluster_1["cell_ids"]) < 2
):
DuplicateDeletedClusterIDs.append(cluster_1["id"])
DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs))