diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 73477a75..90d9de98 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -6,6 +6,7 @@ from typing import Annotated, Any, Dict, List, Optional, Tuple, Union from docling_core.types.experimental.base import BoundingBox, Size from docling_core.types.experimental.document import BaseFigureData, TableCell +from docling_core.types.experimental.labels import PageLabel from PIL.Image import Image from pydantic import BaseModel, ConfigDict, Field, model_validator from typing_extensions import Self @@ -50,14 +51,14 @@ class OcrCell(Cell): class Cluster(BaseModel): id: int - label: str + label: PageLabel bbox: BoundingBox confidence: float = 1.0 cells: List[Cell] = [] class BasePageElement(BaseModel): - label: str + label: PageLabel id: int page_no: int cluster: Cluster diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index b7f4a7b1..735d23e9 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -12,6 +12,7 @@ from docling_core.types import Table as DsSchemaTable from docling_core.types.doc.base import BoundingBox as DsBoundingBox from docling_core.types.doc.base import Figure, TableCell from docling_core.types.experimental.document import DoclingDocument, FileInfo +from docling_core.types.experimental.labels import PageLabel from pydantic import BaseModel from typing_extensions import deprecated @@ -34,21 +35,21 @@ from docling.utils.utils import create_file_hash _log = logging.getLogger(__name__) layout_label_to_ds_type = { - "Title": "title", - "Document Index": "table-of-path_or_stream", - "Section-header": "subtitle-level-1", - "Checkbox-Selected": "checkbox-selected", - "Checkbox-Unselected": "checkbox-unselected", - "Caption": "caption", - "Page-header": "page-header", - "Page-footer": "page-footer", - "Footnote": "footnote", - "Table": "table", - "Formula": "equation", - "List-item": "paragraph", - "Code": "paragraph", - "Picture": "figure", - "Text": "paragraph", + PageLabel.TITLE: "title", + PageLabel.DOCUMENT_INDEX: "table-of-contents", + PageLabel.SECTION_HEADER: "subtitle-level-1", + PageLabel.CHECKBOX_SELECTED: "checkbox-selected", + PageLabel.CHECKBOX_UNSELECTED: "checkbox-unselected", + PageLabel.CAPTION: "caption", + PageLabel.PAGE_HEADER: "page-header", + PageLabel.PAGE_FOOTER: "page-footer", + PageLabel.FOOTNOTE: "footnote", + PageLabel.TABLE: "table", + PageLabel.FORMULA: "equation", + PageLabel.LIST_ITEM: "paragraph", + PageLabel.CODE: "paragraph", + PageLabel.PICTURE: "figure", + PageLabel.TEXT: "paragraph", } _EMPTY_DOC = DsDocument( diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index 1e3e249c..987cb830 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -5,6 +5,7 @@ import time from typing import Iterable, List from docling_core.types.experimental.base import CoordOrigin +from docling_core.types.experimental.labels import PageLabel from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor from PIL import ImageDraw @@ -23,23 +24,23 @@ _log = logging.getLogger(__name__) class LayoutModel: TEXT_ELEM_LABELS = [ - "Text", - "Footnote", - "Caption", - "Checkbox-Unselected", - "Checkbox-Selected", - "Section-header", - "Page-header", - "Page-footer", - "Code", - "List-item", + PageLabel.TEXT, + PageLabel.FOOTNOTE, + PageLabel.CAPTION, + PageLabel.CHECKBOX_UNSELECTED, + PageLabel.CHECKBOX_SELECTED, + PageLabel.SECTION_HEADER, + PageLabel.PAGE_HEADER, + PageLabel.PAGE_FOOTER, + PageLabel.CODE, + PageLabel.LIST_ITEM, # "Formula", ] - PAGE_HEADER_LABELS = ["Page-header", "Page-footer"] + PAGE_HEADER_LABELS = [PageLabel.PAGE_HEADER, PageLabel.PAGE_FOOTER] - TABLE_LABEL = "Table" - FIGURE_LABEL = "Picture" - FORMULA_LABEL = "Formula" + TABLE_LABEL = PageLabel.TABLE + FIGURE_LABEL = PageLabel.PICTURE + FORMULA_LABEL = PageLabel.FORMULA def __init__(self, config): self.config = config @@ -50,27 +51,27 @@ class LayoutModel: def postprocess(self, clusters: List[Cluster], cells: List[Cell], page_height): MIN_INTERSECTION = 0.2 CLASS_THRESHOLDS = { - "Caption": 0.35, - "Footnote": 0.35, - "Formula": 0.35, - "List-item": 0.35, - "Page-footer": 0.35, - "Page-header": 0.35, - "Picture": 0.2, # low threshold adjust to capture chemical structures for examples. - "Section-header": 0.45, - "Table": 0.35, - "Text": 0.45, - "Title": 0.45, - "Document Index": 0.45, - "Code": 0.45, - "Checkbox-Selected": 0.45, - "Checkbox-Unselected": 0.45, - "Form": 0.45, - "Key-Value Region": 0.45, + PageLabel.CAPTION: 0.35, + PageLabel.FOOTNOTE: 0.35, + PageLabel.FORMULA: 0.35, + PageLabel.LIST_ITEM: 0.35, + PageLabel.PAGE_FOOTER: 0.35, + PageLabel.PAGE_HEADER: 0.35, + PageLabel.PICTURE: 0.2, # low threshold adjust to capture chemical structures for examples. + PageLabel.SECTION_HEADER: 0.45, + PageLabel.TABLE: 0.35, + PageLabel.TEXT: 0.45, + PageLabel.TITLE: 0.45, + PageLabel.DOCUMENT_INDEX: 0.45, + PageLabel.CODE: 0.45, + PageLabel.CHECKBOX_SELECTED: 0.45, + PageLabel.CHECKBOX_UNSELECTED: 0.45, + PageLabel.FORM: 0.45, + PageLabel.KEY_VALUE_REGION: 0.45, } CLASS_REMAPPINGS = { - "Document Index": "Table", + PageLabel.DOCUMENT_INDEX: PageLabel.TABLE, } _log.debug("================= Start postprocess function ====================") @@ -257,7 +258,7 @@ class LayoutModel: coord=c["bbox"], origin=CoordOrigin.BOTTOMLEFT ).to_top_left_origin(page_height), confidence=c["confidence"], - label=c["type"], + label=PageLabel(c["type"]), cells=cluster_cells, ) clusters_out_new.append(c_new) @@ -270,9 +271,12 @@ class LayoutModel: for ix, pred_item in enumerate( self.layout_predictor.predict(page.get_image(scale=1.0)) ): + label = PageLabel( + pred_item["label"].lower().replace(" ", "_").replace("-", "_") + ) # Temporary, until docling-ibm-model uses docling-core types cluster = Cluster( id=ix, - label=pred_item["label"], + label=label, confidence=pred_item["confidence"], bbox=BoundingBox.model_validate(pred_item), cells=[], diff --git a/docling/utils/layout_utils.py b/docling/utils/layout_utils.py index 8b7a6b6f..bc18aabb 100644 --- a/docling/utils/layout_utils.py +++ b/docling/utils/layout_utils.py @@ -2,6 +2,7 @@ import copy import logging import networkx as nx +from docling_core.types.experimental.labels import PageLabel logger = logging.getLogger("layout_utils") @@ -370,7 +371,7 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices): "Treating cluster " + str(ix) + ", type " + str(new_cluster["type"]) ) logger.debug(" with cells: " + str(new_cluster["cell_ids"])) - if len(cluster["cell_ids"]) == 0 and cluster["type"] != "Picture": + if len(cluster["cell_ids"]) == 0 and cluster["type"] != PageLabel.PICTURE: logger.debug(" Empty non-picture, removed") continue ## Skip this former cluster, now without cells. new_bbox = adapt_bbox(raw_cells, new_cluster, orphan_cell_indices) @@ -380,14 +381,14 @@ def adapt_bboxes(raw_cells, clusters, orphan_cell_indices): def adapt_bbox(raw_cells, cluster, orphan_cell_indices): - if not (cluster["type"] in ["Table", "Picture"]): + if not (cluster["type"] in [PageLabel.TABLE, PageLabel.PICTURE]): ## A text-like cluster. The bbox only needs to be around the text cells: logger.debug(" Initial bbox: " + str(cluster["bbox"])) new_bbox = surrounding_list( [raw_cells[cid]["bbox"] for cid in cluster["cell_ids"]] ) logger.debug(" New bounding box:" + str(new_bbox)) - if cluster["type"] == "Picture": + if cluster["type"] == PageLabel.PICTURE: ## We only make the bbox completely comprise included text cells: logger.debug(" Picture") if len(cluster["cell_ids"]) != 0: @@ -587,7 +588,7 @@ def set_orphan_as_text( max_id = -1 figures = [] for cluster in cluster_predictions: - if cluster["type"] == "Picture": + if cluster["type"] == PageLabel.PICTURE: figures.append(cluster) if cluster["id"] > max_id: @@ -638,13 +639,13 @@ def set_orphan_as_text( # if fig_flag == False and raw_cells[orph_id]["text"] not in line_orphans: if fig_flag == False and lines_detector == False: # get class from low confidence detections if not set as text: - class_type = "Text" + class_type = PageLabel.TEXT for cluster in cluster_predictions_low: intersection = compute_intersection( orph_cell["bbox"], cluster["bbox"] ) - class_type = "Text" + class_type = PageLabel.TEXT if ( cluster["confidence"] > 0.1 and bb_iou(cluster["bbox"], orph_cell["bbox"]) > 0.4 @@ -718,7 +719,7 @@ def merge_cells(cluster_predictions): if cluster["id"] == node: lines.append(cluster) cluster_predictions.remove(cluster) - new_merged_cluster = build_cluster_from_lines(lines, "Text", max_id) + new_merged_cluster = build_cluster_from_lines(lines, PageLabel.TEXT, max_id) cluster_predictions.append(new_merged_cluster) return cluster_predictions @@ -753,9 +754,9 @@ def clean_up_clusters( # remove clusters that might appear inside tables, or images (such as pdf cells in graphs) elif img_table == True: if ( - cluster_1["type"] == "Text" - and cluster_2["type"] == "Picture" - or cluster_2["type"] == "Table" + cluster_1["type"] == PageLabel.TEXT + and cluster_2["type"] == PageLabel.PICTURE + or cluster_2["type"] == PageLabel.TABLE ): if bb_iou(cluster_1["bbox"], cluster_2["bbox"]) > 0.5: DuplicateDeletedClusterIDs.append(cluster_1["id"]) @@ -771,7 +772,10 @@ def clean_up_clusters( DuplicateDeletedClusterIDs.append(cluster_1["id"]) # remove tables that have one pdf cell if one_cell_table == True: - if cluster_1["type"] == "Table" and len(cluster_1["cell_ids"]) < 2: + if ( + cluster_1["type"] == PageLabel.TABLE + and len(cluster_1["cell_ids"]) < 2 + ): DuplicateDeletedClusterIDs.append(cluster_1["id"]) DuplicateDeletedClusterIDs = list(set(DuplicateDeletedClusterIDs)) diff --git a/examples/batch_convert.py b/examples/batch_convert.py index 94abf716..3c037c78 100644 --- a/examples/batch_convert.py +++ b/examples/batch_convert.py @@ -48,6 +48,14 @@ def export_documents( ) ) + # Export Docling document format to doctags (experimental): + with (output_dir / f"{doc_filename}.experimental.doctags").open("w") as fp: + fp.write(conv_res.experimental.export_to_document_tokens()) + + # Export Docling document format to markdown (experimental): + with (output_dir / f"{doc_filename}.experimental.md").open("w") as fp: + fp.write(conv_res.experimental.export_to_markdown()) + # Export Text format: with (output_dir / f"{doc_filename}.txt").open("w") as fp: fp.write(conv_res.render_as_text()) diff --git a/poetry.lock b/poetry.lock index c512531c..ad5be5fd 100644 --- a/poetry.lock +++ b/poetry.lock @@ -862,7 +862,7 @@ files = [] develop = false [package.dependencies] -docling-core = {git = "ssh://git@github.com/DS4SD/docling-core.git", branch = "cau/new-format-dev"} +docling-core = {git = "ssh://git@github.com/DS4SD/docling-core.git", rev = "a83ff0056138d83ac2cb52bfb2ab1728ff86972f"} docutils = "!=0.21" matplotlib = "^3.7.1" networkx = "^3.1" @@ -882,7 +882,7 @@ toolkit = ["deepsearch-toolkit (>=0.31.0)"] type = "git" url = "ssh://git@github.com/DS4SD/deepsearch-glm.git" reference = "cau/new-format-dev" -resolved_reference = "60e4bda21fbe7ee8849d27a9321ba37cca04e7aa" +resolved_reference = "c26b52e8faf789cb31fcbed816d25e775391832f" [[package]] name = "deprecated" @@ -960,8 +960,8 @@ tabulate = "^0.9.0" [package.source] type = "git" url = "ssh://git@github.com/DS4SD/docling-core.git" -reference = "cau/new-format-dev" -resolved_reference = "0a1e6ce9559ffccf50c5e63c33962ac8fde35648" +reference = "a83ff0056138d83ac2cb52bfb2ab1728ff86972f" +resolved_reference = "a83ff0056138d83ac2cb52bfb2ab1728ff86972f" [[package]] name = "docling-ibm-models" @@ -1059,12 +1059,12 @@ files = [ [[package]] name = "easyocr" -version = "1.7.1" +version = "1.7.2" description = "End-to-End Multi-Lingual Optical Character Recognition (OCR) Solution" optional = false python-versions = "*" files = [ - {file = "easyocr-1.7.1-py3-none-any.whl", hash = "sha256:5b0a2e7cfdfc6c1ec99d9583663e570e4189dca6fbf373f074b21b8809e44d2b"}, + {file = "easyocr-1.7.2-py3-none-any.whl", hash = "sha256:5be12f9b0e595d443c9c3d10b0542074b50f0ec2d98b141a109cd961fd1c177c"}, ] [package.dependencies] @@ -7314,4 +7314,4 @@ examples = ["langchain-huggingface", "langchain-milvus", "langchain-text-splitte [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "1b908180d822d74ae8033e8b6c650b8d00b4365fc7dd36cea6505305651b79b6" +content-hash = "325aebca1bdc6e0cfeb8fc59a84102a804d750211fe8e59cd4cb15876c1ca12e" diff --git a/pyproject.toml b/pyproject.toml index 9f0e9cb7..e5734ebd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ packages = [{include = "docling"}] [tool.poetry.dependencies] python = "^3.10" pydantic = "^2.0.0" -docling-core = {git = "ssh://git@github.com/DS4SD/docling-core.git", branch = "cau/new-format-dev"} +docling-core = {git = "ssh://git@github.com/DS4SD/docling-core.git", rev = "a83ff0056138d83ac2cb52bfb2ab1728ff86972f"} docling-ibm-models = "^1.2.0" deepsearch-glm = {git = "ssh://git@github.com/DS4SD/deepsearch-glm.git", branch = "cau/new-format-dev"}