diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py index 721ccfcd..ede4e328 100644 --- a/docling/datamodel/document.py +++ b/docling/datamodel/document.py @@ -172,7 +172,7 @@ class ConvertedDocument(BaseModel): pages: List[Page] = [] assembled: AssembledUnit = AssembledUnit() - legacy_output: DsDocument = _EMPTY_LEGACY_DOC + legacy_output: Optional[DsDocument] = None # _EMPTY_LEGACY_DOC output: DoclingDocument = _EMPTY_DOCLING_DOC def _to_legacy_document(self) -> DsDocument: diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 4be6fcec..45a9771b 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -59,7 +59,10 @@ class TesseractOcrOptions(OcrOptions): ) -class PipelineOptions(BaseModel): ... +class PipelineOptions(BaseModel): + create_legacy_output: bool = ( + True # This default will be set to False in a future version of docling + ) class PdfPipelineOptions(PipelineOptions): diff --git a/docling/models/ds_glm_model.py b/docling/models/ds_glm_model.py index 06c0aea3..529b12ce 100644 --- a/docling/models/ds_glm_model.py +++ b/docling/models/ds_glm_model.py @@ -22,6 +22,8 @@ from docling.datamodel.document import ConversionResult class GlmModel: def __init__(self, config): self.config = config + self.create_legacy_output = config.get("create_legacy_output", True) + self.model_names = self.config.get( "model_names", "" ) # "language;term;reference" @@ -42,7 +44,10 @@ class GlmModel: ) docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental - legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict) + legacy_doc: DsLegacyDocument = None + + if self.create_legacy_output: + legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict) # DEBUG code: def draw_clusters_and_cells(ds_document, page_no): @@ -92,4 +97,4 @@ class GlmModel: # draw_clusters_and_cells(ds_doc, 0) # draw_clusters_and_cells(exported_doc, 0) - return (legacy_doc, docling_doc) 
+ return (docling_doc, legacy_doc) diff --git a/docling/pipeline/standard_pdf_model_pipeline.py b/docling/pipeline/standard_pdf_model_pipeline.py index dba1f3dc..c0dab3b4 100644 --- a/docling/pipeline/standard_pdf_model_pipeline.py +++ b/docling/pipeline/standard_pdf_model_pipeline.py @@ -37,7 +37,9 @@ class StandardPdfModelPipeline(PaginatedModelPipeline): artifacts_path = self.download_models_hf() self.artifacts_path = Path(artifacts_path) - self.glm_model = GlmModel(config={}) + self.glm_model = GlmModel( + config={"create_legacy_output": pipeline_options.create_legacy_output} + ) ocr_model: BaseOcrModel if isinstance(pipeline_options.ocr_options, EasyOcrOptions): @@ -128,7 +130,7 @@ class StandardPdfModelPipeline(PaginatedModelPipeline): elements=all_elements, headers=all_headers, body=all_body ) - conv_res.legacy_output, conv_res.output = self.glm_model(conv_res) + conv_res.output, conv_res.legacy_output = self.glm_model(conv_res) return conv_res diff --git a/examples/run_with_formats.py b/examples/run_with_formats.py index 398121bd..f086bae2 100644 --- a/examples/run_with_formats.py +++ b/examples/run_with_formats.py @@ -1,3 +1,4 @@ +import json import logging from pathlib import Path @@ -54,12 +55,15 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal conv_results = doc_converter.convert_batch(input) for res in conv_results: - out_path = Path("./scratch") / f"{res.input.file.name}.experimental.md" + out_path = Path("./scratch") print( f"Document {res.input.file.name} converted with status {res.status}." 
f"\nSaved markdown output to: {str(out_path)}" ) # print(res.experimental.export_to_markdown()) # Export Docling document format to markdown (experimental): - with out_path.open("w") as fp: + with (out_path / f"{res.input.file.name}.md").open("w") as fp: fp.write(res.output.export_to_markdown()) + + with (out_path / f"{res.input.file.name}.json").open("w") as fp: + fp.write(json.dumps(res.output.export_to_dict())) diff --git a/tests/data/word_sample.docx b/tests/data/word_sample.docx index 70ef7375..b1889405 100644 Binary files a/tests/data/word_sample.docx and b/tests/data/word_sample.docx differ diff --git a/tests/test_e2e_ocr_conversion.py b/tests/test_e2e_ocr_conversion.py index c0c0a497..9d98abfa 100644 --- a/tests/test_e2e_ocr_conversion.py +++ b/tests/test_e2e_ocr_conversion.py @@ -16,7 +16,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2 -GENERATE = True +GENERATE = False # Debug