Merge from main

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-17 00:58:25 +00:00 · 2024-10-08 14:42:33 +02:00
parent 1d55cbdca9 d412c363d7
commit c0447206af
49 changed files with 1621 additions and 1432 deletions
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -8,14 +8,8 @@ from docling_core.types.experimental import (
    BasePictureData,
    BaseTableData,
    DescriptionItem,
-    DocItemLabel,
    DoclingDocument,
-    DocumentOrigin,
-    ImageRef,
-    PictureItem,
-    SectionHeaderItem,
    TableCell,
-    TableItem,
 )
 from docling_core.types.experimental.labels import DocItemLabel, GroupLabel

--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -11,22 +11,18 @@ from docling_core.types.experimental import (
    DoclingDocument,
    DocumentOrigin,
    GroupLabel,
-    ImageRef,
-    PictureItem,
    ProvenanceItem,
    TableCell,
-    TableItem,
 )
 from docling_core.types.experimental.base import BoundingBox, CoordOrigin, Size
 from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
-from pptx.util import Inches

 from docling.backend.abstract_backend import (
    DeclarativeDocumentBackend,
    PaginatedDocumentBackend,
 )
-from docling.datamodel.base_models import FormatToMimeType, InputFormat
+from docling.datamodel.base_models import InputFormat

 _log = logging.getLogger(__name__)

--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -11,11 +11,7 @@ from docling_core.types.experimental import (
    DocItemLabel,
    DoclingDocument,
    GroupLabel,
-    ImageRef,
-    PictureItem,
-    SectionHeaderItem,
    TableCell,
-    TableItem,
 )
 from lxml import etree

--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -1,15 +1,12 @@
-import warnings
 from enum import Enum, auto
 from io import BytesIO
-from pathlib import Path
-from typing import Annotated, Dict, List, Optional, Union
+from typing import Dict, List, Optional, Union

 from docling_core.types.experimental import BoundingBox, Size
 from docling_core.types.experimental.document import BasePictureData, TableCell
 from docling_core.types.experimental.labels import DocItemLabel
 from PIL.Image import Image
-from pydantic import BaseModel, ConfigDict, Field, model_validator
-from typing_extensions import Self
+from pydantic import BaseModel, ConfigDict


 class ConversionStatus(str, Enum):
@@ -173,40 +170,3 @@ class DocumentStream(BaseModel):

    filename: str
    stream: BytesIO
-
-
-class TableStructureOptions(BaseModel):
-    do_cell_matching: bool = (
-        True
-        # True:  Matches predictions back to PDF cells. Can break table output if PDF cells
-        #        are merged across table columns.
-        # False: Let table structure model define the text cells, ignore PDF cells.
-    )
-
-
-class PipelineOptions(BaseModel): ...
-
-
-class PdfPipelineOptions(PipelineOptions):
-    artifacts_path: Optional[Union[Path, str]] = None
-    do_table_structure: bool = True  # True: perform table structure extraction
-    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
-
-    table_structure_options: TableStructureOptions = TableStructureOptions()
-
-    keep_page_images: Annotated[
-        bool,
-        Field(
-            deprecated="`keep_page_images` is depreacted, set the value of `images_scale` instead"
-        ),
-    ] = False  # False: page images are removed in the assemble step
-    images_scale: Optional[float] = None  # if set, the scale for generated images
-
-    @model_validator(mode="after")
-    def set_page_images_from_deprecated(self) -> Self:
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore", DeprecationWarning)
-            default_scale = 1.0
-            if self.keep_page_images and self.images_scale is None:
-                self.images_scale = default_scale
-        return self
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -0,0 +1,67 @@
+"""Option models used to configure docling conversion pipelines."""
+
+import warnings
+from enum import Enum, auto
+from pathlib import Path
+from typing import Annotated, Optional, Union
+
+from pydantic import BaseModel, Field, model_validator
+from typing_extensions import Self
+
+
+class TableFormerMode(str, Enum):
+    """Table structure model variant; ACCURATE selects the "fat" artifacts sub-directory."""
+
+    # NOTE(review): with the `str` mixin, auto() presumably yields stringified
+    # integers rather than the member names -- confirm before serializing these.
+    FAST = auto()
+    ACCURATE = auto()
+
+
+class TableStructureOptions(BaseModel):
+    """Options for table structure extraction."""
+
+    do_cell_matching: bool = (
+        True
+        # True:  Matches predictions back to PDF cells. Can break table output if PDF cells
+        #        are merged across table columns.
+        # False: Let table structure model define the text cells, ignore PDF cells.
+    )
+    mode: TableFormerMode = TableFormerMode.FAST
+
+
+# Common base class for pipeline option models.
+class PipelineOptions(BaseModel): ...
+
+
+class PdfPipelineOptions(PipelineOptions):
+    """Options for the PDF conversion pipeline."""
+
+    artifacts_path: Optional[Union[Path, str]] = None
+    do_table_structure: bool = True  # True: perform table structure extraction
+    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
+
+    table_structure_options: TableStructureOptions = TableStructureOptions()
+
+    keep_page_images: Annotated[
+        bool,
+        Field(
+            deprecated="`keep_page_images` is deprecated, set the value of `images_scale` instead"
+        ),
+    ] = False  # False: page images are removed in the assemble step
+    images_scale: Optional[float] = None  # if set, the scale for generated images
+
+    @model_validator(mode="after")
+    def set_page_images_from_deprecated(self) -> Self:
+        """Map the deprecated `keep_page_images` flag onto `images_scale`.
+
+        When `keep_page_images` is set and `images_scale` is None, `images_scale`
+        becomes 1.0; an explicitly set `images_scale` is left untouched.
+        """
+        with warnings.catch_warnings():
+            # Silence the DeprecationWarning raised by reading the deprecated field.
+            warnings.simplefilter("ignore", DeprecationWarning)
+            default_scale = 1.0
+            if self.keep_page_images and self.images_scale is None:
+                self.images_scale = default_scale
+        return self
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -8,12 +8,13 @@ import requests
 from pydantic import AnyHttpUrl, BaseModel, ConfigDict, TypeAdapter, ValidationError

 from docling.backend.abstract_backend import AbstractDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, InputFormat, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus, InputFormat
 from docling.datamodel.document import (
    ConversionResult,
    DocumentConversionInput,
    InputDocument,
 )
+from docling.datamodel.pipeline_options import PipelineOptions
 from docling.datamodel.settings import settings
 from docling.pipeline.base_model_pipeline import BaseModelPipeline
 from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@@ -73,6 +73,7 @@ class LayoutModel(AbstractPageModel):

        CLASS_REMAPPINGS = {
            DocItemLabel.DOCUMENT_INDEX: DocItemLabel.TABLE,
+            DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
        }

        _log.debug("================= Start postprocess function ====================")
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@@ -1,4 +1,5 @@
 import copy
+from pathlib import Path
 from typing import Iterable, List

 import numpy
@@ -9,6 +10,7 @@ from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredic
 from PIL import ImageDraw

 from docling.datamodel.base_models import Page, Table, TableStructurePrediction
+from docling.datamodel.pipeline_options import TableFormerMode
 from docling.models.abstract_model import AbstractPageModel


@@ -16,10 +18,15 @@ class TableStructureModel(AbstractPageModel):
    def __init__(self, config):
        self.config = config
        self.do_cell_matching = config["do_cell_matching"]
+        self.mode = config["mode"]

        self.enabled = config["enabled"]
        if self.enabled:
-            artifacts_path = config["artifacts_path"]
+            artifacts_path: Path = config["artifacts_path"]
+
+            if self.mode == TableFormerMode.ACCURATE:
+                artifacts_path = artifacts_path / "fat"
+
            # Third Party
            import docling_ibm_models.tableformer.common as c

--- a/docling/pipeline/base_model_pipeline.py
+++ b/docling/pipeline/base_model_pipeline.py
@@ -12,9 +12,9 @@ from docling.datamodel.base_models import (
    DoclingComponentType,
    ErrorItem,
    Page,
-    PipelineOptions,
 )
 from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.pipeline_options import PipelineOptions
 from docling.datamodel.settings import settings
 from docling.utils.utils import chunkify

@@ -86,9 +86,6 @@ class PaginatedModelPipeline(BaseModelPipeline):  # TODO this is a bad name.
                end_pb_time = time.time() - start_pb_time
                _log.info(f"Finished converting page batch time={end_pb_time:.3f}")

-            # Free up mem resources of PDF backend
-            in_doc._backend.unload()
-
            conv_res = self.assemble_document(in_doc, conv_res)

            status = ConversionStatus.SUCCESS
@@ -113,6 +110,10 @@ class PaginatedModelPipeline(BaseModelPipeline):  # TODO this is a bad name.
                f"{trace}"
            )
            raise e
+        finally:
+            # Always unload the PDF backend, even in case of failure
+            if in_doc._backend:
+                in_doc._backend.unload()

        return conv_res

--- a/docling/pipeline/simple_model_pipeline.py
+++ b/docling/pipeline/simple_model_pipeline.py
@@ -4,12 +4,9 @@ from docling.backend.abstract_backend import (
    AbstractDocumentBackend,
    DeclarativeDocumentBackend,
 )
-from docling.datamodel.base_models import (
-    ConversionStatus,
-    PdfPipelineOptions,
-    PipelineOptions,
-)
+from docling.datamodel.base_models import ConversionStatus
 from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
 from docling.pipeline.base_model_pipeline import BaseModelPipeline

 _log = logging.getLogger(__name__)
--- a/docling/pipeline/standard_pdf_model_pipeline.py
+++ b/docling/pipeline/standard_pdf_model_pipeline.py
@@ -4,8 +4,9 @@ from typing import Optional

 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
-from docling.datamodel.base_models import AssembledUnit, Page, PdfPipelineOptions
+from docling.datamodel.base_models import AssembledUnit, Page
 from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.models.ds_glm_model import GlmModel
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
@@ -18,7 +19,7 @@ _log = logging.getLogger(__name__)


 class StandardPdfModelPipeline(PaginatedModelPipeline):
-    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
+    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5_pt"
    _table_model_path = "model_artifacts/tableformer"

    def __init__(self, pipeline_options: PdfPipelineOptions):
@@ -52,6 +53,7 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
                    / StandardPdfModelPipeline._table_model_path,
                    "enabled": pipeline_options.do_table_structure,
                    "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
+                    "mode": pipeline_options.table_structure_options.mode,
                }
            ),
            PageAssembleModel(config={"images_scale": pipeline_options.images_scale}),
@@ -64,7 +66,10 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
        from huggingface_hub import snapshot_download

        download_path = snapshot_download(
-            repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
+            repo_id="ds4sd/docling-models",
+            force_download=force,
+            local_dir=local_dir,
+            revision="v2.0.0",
        )

        return Path(download_path)