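Update datamodel structure

Move TableFormerMode, TableStructureOptions and PipelineOptions out of
docling/datamodel/base_models.py into the new module
docling/datamodel/pipeline_options.py, and redirect (or drop) the
corresponding imports in the converter, models, pipelines and examples.
Import BoundingBox and Figure from docling_core.types.legacy.base.
In the container build file, additionally install vim and procps and
comment out the deepsearch_glm model pre-download step.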

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-26 20:14:47 +00:00 · 2024-09-19 16:51:52 +02:00 · 2024-09-19 16:51:52 +02:00 · d8163b0865
commit d8163b0865
parent 14dcba11c0
10 changed files with 37 additions and 32 deletions
--- a/4
+++ b/4
@ -3,7 +3,7 @@ FROM python:3.11-slim-bookworm
 ENV GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=no"

 RUN apt-get update \
-    && apt-get install -y libgl1 libglib2.0-0 curl wget git \
+    && apt-get install -y libgl1 libglib2.0-0 curl wget git vim procps \
    && apt-get clean

 # This will install torch with *only* cpu support
@ -16,7 +16,7 @@ ENV TORCH_HOME=/tmp/

 COPY examples/minimal.py /root/minimal.py

-RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
+#RUN python -c 'from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models; load_pretrained_nlp_models(verbose=True);'
 RUN python -c 'from docling.document_converter import DocumentConverter; artifacts_path = DocumentConverter.download_models_hf(force=True);'

 # On container environments, always set a thread budget to avoid undesired thread congestion.
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@ -24,11 +24,6 @@ class DocInputType(str, Enum):
    STREAM = auto()


-class TableFormerMode(str, Enum):
-    FAST = auto()
-    ACCURATE = auto()
-
-
 class CoordOrigin(str, Enum):
    TOPLEFT = auto()
    BOTTOMLEFT = auto()
@ -303,23 +298,6 @@ class DocumentStream(BaseModel):
    stream: BytesIO


-class TableStructureOptions(BaseModel):
-    do_cell_matching: bool = (
-        True
-        # True:  Matches predictions back to PDF cells. Can break table output if PDF cells
-        #        are merged across table columns.
-        # False: Let table structure model define the text cells, ignore PDF cells.
-    )
-    mode: TableFormerMode = TableFormerMode.FAST
-
-
-class PipelineOptions(BaseModel):
-    do_table_structure: bool = True  # True: perform table structure extraction
-    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
-
-    table_structure_options: TableStructureOptions = TableStructureOptions()
-
-
 class AssembleOptions(BaseModel):
    keep_page_images: Annotated[
        bool,
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@ -4,14 +4,14 @@ from pathlib import Path, PurePath
 from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union

 from docling_core.types import BaseCell, BaseText
-from docling_core.types import BoundingBox as DsBoundingBox
 from docling_core.types import Document as DsDocument
 from docling_core.types import DocumentDescription as DsDocumentDescription
 from docling_core.types import FileInfoObject as DsFileInfoObject
 from docling_core.types import PageDimensions, PageReference, Prov, Ref
 from docling_core.types import Table as DsSchemaTable
 from docling_core.types import TableCell
-from docling_core.types.doc.base import Figure
+from docling_core.types.legacy.base import BoundingBox as DsBoundingBox
+from docling_core.types.legacy.base import Figure
 from pydantic import BaseModel
 from typing_extensions import deprecated

--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -0,0 +1,25 @@
+from enum import Enum, auto
+
+from pydantic import BaseModel
+
+
+class TableFormerMode(str, Enum):
+    FAST = auto()
+    ACCURATE = auto()
+
+
+class TableStructureOptions(BaseModel):
+    do_cell_matching: bool = (
+        True
+        # True:  Matches predictions back to PDF cells. Can break table output if PDF cells
+        #        are merged across table columns.
+        # False: Let table structure model define the text cells, ignore PDF cells.
+    )
+    mode: TableFormerMode = TableFormerMode.FAST
+
+
+class PipelineOptions(BaseModel):
+    do_table_structure: bool = True  # True: perform table structure extraction
+    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
+
+    table_structure_options: TableStructureOptions = TableStructureOptions()
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@ -18,13 +18,13 @@ from docling.datamodel.base_models import (
    DoclingComponentType,
    ErrorItem,
    Page,
-    PipelineOptions,
 )
 from docling.datamodel.document import (
    ConversionResult,
    DocumentConversionInput,
    InputDocument,
 )
+from docling.datamodel.pipeline_options import PipelineOptions
 from docling.datamodel.settings import settings
 from docling.models.ds_glm_model import GlmModel
 from docling.models.page_assemble_model import PageAssembleModel
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@ -11,9 +11,9 @@ from docling.datamodel.base_models import (
    Page,
    TableCell,
    TableElement,
-    TableFormerMode,
    TableStructurePrediction,
 )
+from docling.datamodel.pipeline_options import TableFormerMode


 class TableStructureModel:
--- a/docling/pipeline/base_model_pipeline.py
+++ b/docling/pipeline/base_model_pipeline.py
@ -1,7 +1,8 @@
 from pathlib import Path
 from typing import Callable, Iterable, List

-from docling.datamodel.base_models import Page, PipelineOptions
+from docling.datamodel.base_models import Page
+from docling.datamodel.pipeline_options import PipelineOptions


 class BaseModelPipeline:
--- a/docling/pipeline/standard_model_pipeline.py
+++ b/docling/pipeline/standard_model_pipeline.py
@ -1,6 +1,6 @@
 from pathlib import Path

-from docling.datamodel.base_models import PipelineOptions
+from docling.datamodel.pipeline_options import PipelineOptions
 from docling.models.easyocr_model import EasyOcrModel
 from docling.models.layout_model import LayoutModel
 from docling.models.table_structure_model import TableStructureModel
--- a/examples/batch_convert.py
+++ b/examples/batch_convert.py
@ -4,7 +4,7 @@ import time
 from pathlib import Path
 from typing import Iterable

-from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
 from docling.document_converter import DocumentConverter

--- a/examples/custom_convert.py
+++ b/examples/custom_convert.py
@ -6,8 +6,9 @@ from typing import Iterable

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
+from docling.datamodel.pipeline_options import PipelineOptions
 from docling.document_converter import DocumentConverter

 _log = logging.getLogger(__name__)