mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Optionally produce legacy_doc
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
025983f07b
commit
52713f0cf5
@ -172,7 +172,7 @@ class ConvertedDocument(BaseModel):
|
||||
pages: List[Page] = []
|
||||
assembled: AssembledUnit = AssembledUnit()
|
||||
|
||||
legacy_output: DsDocument = _EMPTY_LEGACY_DOC
|
||||
legacy_output: Optional[DsDocument] = None # _EMPTY_LEGACY_DOC
|
||||
output: DoclingDocument = _EMPTY_DOCLING_DOC
|
||||
|
||||
def _to_legacy_document(self) -> DsDocument:
|
||||
|
@ -59,7 +59,10 @@ class TesseractOcrOptions(OcrOptions):
|
||||
)
|
||||
|
||||
|
||||
class PipelineOptions(BaseModel): ...
|
||||
class PipelineOptions(BaseModel):
|
||||
create_legacy_output: bool = (
|
||||
True # This default will be set to False in a future version of docling
|
||||
)
|
||||
|
||||
|
||||
class PdfPipelineOptions(PipelineOptions):
|
||||
|
@ -22,6 +22,8 @@ from docling.datamodel.document import ConversionResult
|
||||
class GlmModel:
|
||||
def __init__(self, config):
|
||||
self.config = config
|
||||
self.create_legacy_output = config.get("create_legacy_output", True)
|
||||
|
||||
self.model_names = self.config.get(
|
||||
"model_names", ""
|
||||
) # "language;term;reference"
|
||||
@ -42,6 +44,9 @@ class GlmModel:
|
||||
)
|
||||
|
||||
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
||||
legacy_doc: DsLegacyDocument = None
|
||||
|
||||
if self.create_legacy_output:
|
||||
legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict)
|
||||
|
||||
# DEBUG code:
|
||||
@ -92,4 +97,4 @@ class GlmModel:
|
||||
# draw_clusters_and_cells(ds_doc, 0)
|
||||
# draw_clusters_and_cells(exported_doc, 0)
|
||||
|
||||
return (legacy_doc, docling_doc)
|
||||
return (docling_doc, legacy_doc)
|
||||
|
@ -37,7 +37,9 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
|
||||
artifacts_path = self.download_models_hf()
|
||||
|
||||
self.artifacts_path = Path(artifacts_path)
|
||||
self.glm_model = GlmModel(config={})
|
||||
self.glm_model = GlmModel(
|
||||
config={"create_legacy_output": pipeline_options.create_legacy_output}
|
||||
)
|
||||
|
||||
ocr_model: BaseOcrModel
|
||||
if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
|
||||
@ -128,7 +130,7 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
|
||||
elements=all_elements, headers=all_headers, body=all_body
|
||||
)
|
||||
|
||||
conv_res.legacy_output, conv_res.output = self.glm_model(conv_res)
|
||||
conv_res.output, conv_res.legacy_output = self.glm_model(conv_res)
|
||||
|
||||
return conv_res
|
||||
|
||||
|
@ -1,3 +1,4 @@
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
@ -54,12 +55,15 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
|
||||
conv_results = doc_converter.convert_batch(input)
|
||||
|
||||
for res in conv_results:
|
||||
out_path = Path("./scratch") / f"{res.input.file.name}.experimental.md"
|
||||
out_path = Path("./scratch")
|
||||
print(
|
||||
f"Document {res.input.file.name} converted with status {res.status}."
|
||||
f"\nSaved markdown output to: {str(out_path)}"
|
||||
)
|
||||
# print(res.experimental.export_to_markdown())
|
||||
# Export Docling document format to markdown (experimental):
|
||||
with out_path.open("w") as fp:
|
||||
with (out_path / f"{res.input.file.name}.md").open("w") as fp:
|
||||
fp.write(res.output.export_to_markdown())
|
||||
|
||||
with (out_path / f"{res.input.file.name}.json").open("w") as fp:
|
||||
fp.write(json.dumps(res.output.export_to_dict()))
|
||||
|
Binary file not shown.
@ -16,7 +16,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
||||
|
||||
GENERATE = True
|
||||
GENERATE = False
|
||||
|
||||
|
||||
# Debug
|
||||
|
Loading…
Reference in New Issue
Block a user