Refactor test data, legacy usage and more

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Christoph Auer
2024-10-10 13:54:44 +02:00
parent da0700f959
commit 7cad290ceb
46 changed files with 129 additions and 7646 deletions

View File

@@ -94,21 +94,21 @@ def export_documents(
                 fname = output_dir / f"{doc_filename}.txt"
                 with fname.open("w") as fp:
                     _log.info(f"writing Text output to {fname}")
-                    fp.write(conv_res.render_as_text())
+                    fp.write(conv_res.render_as_text_v1())
             # Export Markdown format:
             if export_md:
                 fname = output_dir / f"{doc_filename}.md"
                 with fname.open("w") as fp:
                     _log.info(f"writing Markdown output to {fname}")
-                    fp.write(conv_res.render_as_markdown())
+                    fp.write(conv_res.render_as_markdown_v1())
             # Export Document Tags format:
             if export_doctags:
                 fname = output_dir / f"{doc_filename}.doctags"
                 with fname.open("w") as fp:
                     _log.info(f"writing Doc Tags output to {fname}")
-                    fp.write(conv_res.render_as_doctags())
+                    fp.write(conv_res.render_as_doctags_v1())
         else:
             _log.warning(f"Document {conv_res.input.file} failed to convert.")
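For reference, a minimal sketch of the caller-side migration this hunk points at; `conv_res`, `output_dir` and `doc_filename` are the names already used above, and the `export_to_*` methods are the ones named in the @deprecated notices later in this diff:

    # Deprecated v1 path: still available, now rendered from conv_res.legacy_output.
    md_v1 = conv_res.render_as_markdown_v1()
    # Preferred path going forward: export directly from the new DoclingDocument.
    md_v2 = conv_res.output.export_to_markdown()
    (output_dir / f"{doc_filename}.md").write_text(md_v2)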

View File

@@ -1,4 +1,5 @@
 import logging
+from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
 from typing import Dict, Iterable, List, Optional, Tuple, Type, Union
@@ -61,7 +62,7 @@ layout_label_to_ds_type = {
     DocItemLabel.TEXT: "paragraph",
 }
-_EMPTY_DOC = DsDocument(
+_EMPTY_LEGACY_DOC = DsDocument(
     _name="",
     description=DsDocumentDescription(logs=[]),
     file_info=DsFileInfoObject(
@@ -155,6 +156,11 @@ class InputDocument(BaseModel):
     )
+class DocumentFormat(str, Enum):
+    V2 = "v2"
+    V1 = "v1"
 @deprecated("Use `ConversionResult` instead.")
 class ConvertedDocument(BaseModel):
     input: InputDocument
@@ -165,10 +171,10 @@ class ConvertedDocument(BaseModel):
     pages: List[Page] = []
     assembled: AssembledUnit = AssembledUnit()
-    output: DsDocument = _EMPTY_DOC
-    experimental: DoclingDocument = _EMPTY_DOCLING_DOC
+    legacy_output: DsDocument = _EMPTY_LEGACY_DOC
+    output: DoclingDocument = _EMPTY_DOCLING_DOC
-    def _to_ds_document(self) -> DsDocument:
+    def _to_legacy_document(self) -> DsDocument:
         title = ""
         desc = DsDocumentDescription(logs=[])
@@ -344,10 +350,12 @@ class ConvertedDocument(BaseModel):
         return ds_doc
-    def render_as_dict(self):
-        return self.output.model_dump(by_alias=True, exclude_none=True)
+    @deprecated("Use output.export_to_dict() instead.")
+    def render_as_dict_v1(self):
+        return self.legacy_output.model_dump(by_alias=True, exclude_none=True)
-    def render_as_markdown(
+    @deprecated("Use output.export_to_markdown() instead.")
+    def render_as_markdown_v1(
         self,
         delim: str = "\n\n",
         main_text_start: int = 0,
@@ -362,8 +370,8 @@ class ConvertedDocument(BaseModel):
         ],
         strict_text: bool = False,
         image_placeholder: str = "<!-- image -->",
-    ):
-        return self.output.export_to_markdown(
+    ) -> str:
+        return self.legacy_output.export_to_markdown(
             delim=delim,
             main_text_start=main_text_start,
             main_text_stop=main_text_stop,
@@ -372,7 +380,8 @@ class ConvertedDocument(BaseModel):
             image_placeholder=image_placeholder,
         )
-    def render_as_text(
+    @deprecated("Use output.export_to_text() instead.")
+    def render_as_text_v1(
         self,
         delim: str = "\n\n",
         main_text_start: int = 0,
@@ -383,8 +392,8 @@ class ConvertedDocument(BaseModel):
             "paragraph",
             "caption",
         ],
-    ):
-        return self.output.export_to_markdown(
+    ) -> str:
+        return self.legacy_output.export_to_markdown(
             delim=delim,
             main_text_start=main_text_start,
             main_text_stop=main_text_stop,
@@ -392,7 +401,8 @@ class ConvertedDocument(BaseModel):
             strict_text=True,
         )
-    def render_as_doctags(
+    @deprecated("Use output.export_to_document_tokens() instead.")
+    def render_as_doctags_v1(
         self,
         delim: str = "\n\n",
         main_text_start: int = 0,
@@ -415,7 +425,7 @@ class ConvertedDocument(BaseModel):
         add_table_cell_label: bool = True,
         add_table_cell_text: bool = True,
     ) -> str:
-        return self.output.export_to_document_tokens(
+        return self.legacy_output.export_to_document_tokens(
             delim=delim,
             main_text_start=main_text_start,
             main_text_stop=main_text_stop,

View File

@@ -9,7 +9,7 @@ from deepsearch_glm.utils.doc_utils import (
 )
 from deepsearch_glm.utils.load_pretrained_models import load_pretrained_nlp_models
 from docling_core.types import BaseText
-from docling_core.types import Document as DsDocument
+from docling_core.types import Document as DsLegacyDocument
 from docling_core.types import Ref
 from docling_core.types.experimental import BoundingBox, CoordOrigin
 from docling_core.types.experimental.document import DoclingDocument
@@ -32,8 +32,8 @@ class GlmModel:
     def __call__(
         self, conv_res: ConversionResult
-    ) -> Tuple[DsDocument, DoclingDocument]:
-        ds_doc = conv_res._to_ds_document()
+    ) -> Tuple[DsLegacyDocument, DoclingDocument]:
+        ds_doc = conv_res._to_legacy_document()
         ds_doc_dict = ds_doc.model_dump(by_alias=True)
         glm_doc = self.model.apply_on_doc(ds_doc_dict)
@@ -42,7 +42,7 @@ class GlmModel:
         )
         docling_doc: DoclingDocument = to_docling_document(glm_doc)  # Experimental
-        exported_doc = DsDocument.model_validate(ds_doc_dict)
+        legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict)
         # DEBUG code:
         def draw_clusters_and_cells(ds_document, page_no):
@@ -92,4 +92,4 @@ class GlmModel:
         # draw_clusters_and_cells(ds_doc, 0)
         # draw_clusters_and_cells(exported_doc, 0)
-        return (exported_doc, docling_doc)
+        return (legacy_doc, docling_doc)

View File

@@ -44,7 +44,7 @@ class SimpleModelPipeline(BaseModelPipeline):
         # the backend is expected to be of type DeclarativeDocumentBackend, which can output
         # a DoclingDocument straight.
-        conv_res.experimental = in_doc._backend.convert()
+        conv_res.output = in_doc._backend.convert()
         # Do other stuff with conv_res.experimental

View File

@@ -124,7 +124,7 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
             elements=all_elements, headers=all_headers, body=all_body
         )
-        conv_res.output, conv_res.experimental = self.glm_model(conv_res)
+        conv_res.legacy_output, conv_res.output = self.glm_model(conv_res)
         return conv_res
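For context, a minimal sketch of how downstream code reads the two documents once this pipeline change is in place; the converter invocation is illustrative only, while `legacy_output` and `output` are the fields renamed in this commit:

    conv_res = converter.convert(source)   # hypothetical converter call, not part of this diff
    legacy_doc = conv_res.legacy_output    # deprecated DsDocument (v1 schema)
    docling_doc = conv_res.output          # new DoclingDocument produced by the GLM step
    md = docling_doc.export_to_markdown()  # new export API referenced in document.py above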

View File

@@ -41,7 +41,7 @@ def generate_multimodal_pages(
     end_ix = 0
     doc_items: List[Tuple[int, Union[BaseCell, BaseText]]] = []
-    doc = doc_result.output
+    doc = doc_result.legacy_output
     def _process_page_segments(doc_items: list[Tuple[int, BaseCell]], page: Page):
         segments = []