Optionally produce legacy_doc

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-11 12:57:47 +02:00
parent 025983f07b
commit 52713f0cf5
7 changed files with 23 additions and 9 deletions

View File

@ -172,7 +172,7 @@ class ConvertedDocument(BaseModel):
pages: List[Page] = [] pages: List[Page] = []
assembled: AssembledUnit = AssembledUnit() assembled: AssembledUnit = AssembledUnit()
legacy_output: DsDocument = _EMPTY_LEGACY_DOC legacy_output: Optional[DsDocument] = None # _EMPTY_LEGACY_DOC
output: DoclingDocument = _EMPTY_DOCLING_DOC output: DoclingDocument = _EMPTY_DOCLING_DOC
def _to_legacy_document(self) -> DsDocument: def _to_legacy_document(self) -> DsDocument:

View File

@ -59,7 +59,10 @@ class TesseractOcrOptions(OcrOptions):
) )
class PipelineOptions(BaseModel): ... class PipelineOptions(BaseModel):
create_legacy_output: bool = (
True # This default will be set to False in a future version of docling
)
class PdfPipelineOptions(PipelineOptions): class PdfPipelineOptions(PipelineOptions):

View File

@ -22,6 +22,8 @@ from docling.datamodel.document import ConversionResult
class GlmModel: class GlmModel:
def __init__(self, config): def __init__(self, config):
self.config = config self.config = config
self.create_legacy_output = config.get("create_legacy_output", True)
self.model_names = self.config.get( self.model_names = self.config.get(
"model_names", "" "model_names", ""
) # "language;term;reference" ) # "language;term;reference"
@ -42,7 +44,10 @@ class GlmModel:
) )
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict) legacy_doc: DsLegacyDocument = None
if self.create_legacy_output:
legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict)
# DEBUG code: # DEBUG code:
def draw_clusters_and_cells(ds_document, page_no): def draw_clusters_and_cells(ds_document, page_no):
@ -92,4 +97,4 @@ class GlmModel:
# draw_clusters_and_cells(ds_doc, 0) # draw_clusters_and_cells(ds_doc, 0)
# draw_clusters_and_cells(exported_doc, 0) # draw_clusters_and_cells(exported_doc, 0)
return (legacy_doc, docling_doc) return (docling_doc, legacy_doc)

View File

@ -37,7 +37,9 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
artifacts_path = self.download_models_hf() artifacts_path = self.download_models_hf()
self.artifacts_path = Path(artifacts_path) self.artifacts_path = Path(artifacts_path)
self.glm_model = GlmModel(config={}) self.glm_model = GlmModel(
config={"create_legacy_output": pipeline_options.create_legacy_output}
)
ocr_model: BaseOcrModel ocr_model: BaseOcrModel
if isinstance(pipeline_options.ocr_options, EasyOcrOptions): if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
@ -128,7 +130,7 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
elements=all_elements, headers=all_headers, body=all_body elements=all_elements, headers=all_headers, body=all_body
) )
conv_res.legacy_output, conv_res.output = self.glm_model(conv_res) conv_res.output, conv_res.legacy_output = self.glm_model(conv_res)
return conv_res return conv_res

View File

@ -1,3 +1,4 @@
import json
import logging import logging
from pathlib import Path from pathlib import Path
@ -54,12 +55,15 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
conv_results = doc_converter.convert_batch(input) conv_results = doc_converter.convert_batch(input)
for res in conv_results: for res in conv_results:
out_path = Path("./scratch") / f"{res.input.file.name}.experimental.md" out_path = Path("./scratch")
print( print(
f"Document {res.input.file.name} converted with status {res.status}." f"Document {res.input.file.name} converted with status {res.status}."
f"\nSaved markdown output to: {str(out_path)}" f"\nSaved markdown output to: {str(out_path)}"
) )
# print(res.experimental.export_to_markdown()) # print(res.experimental.export_to_markdown())
# Export Docling document format to markdown (experimental): # Export Docling document format to markdown (experimental):
with out_path.open("w") as fp: with (out_path / f"{res.input.file.name}.md").open("w") as fp:
fp.write(res.output.export_to_markdown()) fp.write(res.output.export_to_markdown())
with (out_path / f"{res.input.file.name}.json").open("w") as fp:
fp.write(json.dumps(res.output.export_to_dict()))

Binary file not shown.

View File

@ -16,7 +16,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2 from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
GENERATE = True GENERATE = False
# Debug # Debug