Optionally produce legacy_doc

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-27 04:24:45 +00:00 · 2024-10-11 12:57:47 +02:00 · 2024-10-11 12:57:47 +02:00 · 52713f0cf5
commit 52713f0cf5
parent 025983f07b
7 changed files with 23 additions and 9 deletions
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@ -172,7 +172,7 @@ class ConvertedDocument(BaseModel):
    pages: List[Page] = []
    assembled: AssembledUnit = AssembledUnit()
-    legacy_output: DsDocument = _EMPTY_LEGACY_DOC
+    legacy_output: Optional[DsDocument] = None  # _EMPTY_LEGACY_DOC
    output: DoclingDocument = _EMPTY_DOCLING_DOC
    def _to_legacy_document(self) -> DsDocument:
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -59,7 +59,10 @@ class TesseractOcrOptions(OcrOptions):
    )
-class PipelineOptions(BaseModel): ...
+class PipelineOptions(BaseModel):
    create_legacy_output: bool = (
        True  # This default will be set to False in a future version of docling
    )
 class PdfPipelineOptions(PipelineOptions):
--- a/docling/models/ds_glm_model.py
+++ b/docling/models/ds_glm_model.py
@ -22,6 +22,8 @@ from docling.datamodel.document import ConversionResult
 class GlmModel:
    def __init__(self, config):
        self.config = config
        self.create_legacy_output = config.get("create_legacy_output", True)
        self.model_names = self.config.get(
            "model_names", ""
        )  # "language;term;reference"
@ -42,7 +44,10 @@ class GlmModel:
        )
        docling_doc: DoclingDocument = to_docling_document(glm_doc)  # Experimental
-        legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict)
+        legacy_doc: DsLegacyDocument = None
        if self.create_legacy_output:
            legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict)
        # DEBUG code:
        def draw_clusters_and_cells(ds_document, page_no):
@ -92,4 +97,4 @@ class GlmModel:
        # draw_clusters_and_cells(ds_doc, 0)
        # draw_clusters_and_cells(exported_doc, 0)
-        return (legacy_doc, docling_doc)
+        return (docling_doc, legacy_doc)
--- a/docling/pipeline/standard_pdf_model_pipeline.py
+++ b/docling/pipeline/standard_pdf_model_pipeline.py
@ -37,7 +37,9 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
            artifacts_path = self.download_models_hf()
        self.artifacts_path = Path(artifacts_path)
-        self.glm_model = GlmModel(config={})
+        self.glm_model = GlmModel(
            config={"create_legacy_output": pipeline_options.create_legacy_output}
        )
        ocr_model: BaseOcrModel
        if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
@ -128,7 +130,7 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
            elements=all_elements, headers=all_headers, body=all_body
        )
-        conv_res.legacy_output, conv_res.output = self.glm_model(conv_res)
+        conv_res.output, conv_res.legacy_output = self.glm_model(conv_res)
        return conv_res
--- a/examples/run_with_formats.py
+++ b/examples/run_with_formats.py
@ -1,3 +1,4 @@
 import json
 import logging
 from pathlib import Path
@ -54,12 +55,15 @@ doc_converter = DocumentConverter(  # all of the below is optional, has internal
 conv_results = doc_converter.convert_batch(input)
 for res in conv_results:
-    out_path = Path("./scratch") / f"{res.input.file.name}.experimental.md"
+    out_path = Path("./scratch")
    print(
        f"Document {res.input.file.name} converted with status {res.status}."
        f"\nSaved markdown output to: {str(out_path)}"
    )
    # print(res.experimental.export_to_markdown())
    # Export Docling document format to markdown (experimental):
-    with out_path.open("w") as fp:
+    with (out_path / f"{res.input.file.name}.md").open("w") as fp:
        fp.write(res.output.export_to_markdown())
    with (out_path / f"{res.input.file.name}.json").open("w") as fp:
        fp.write(json.dumps(res.output.export_to_dict()))
--- a/tests/data/word_sample.docx
+++ b/tests/data/word_sample.docx
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@ -16,7 +16,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
 from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
-GENERATE = True
+GENERATE = False
 # Debug