mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Optionally produce legacy_doc
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
025983f07b
commit
52713f0cf5
@ -172,7 +172,7 @@ class ConvertedDocument(BaseModel):
|
|||||||
pages: List[Page] = []
|
pages: List[Page] = []
|
||||||
assembled: AssembledUnit = AssembledUnit()
|
assembled: AssembledUnit = AssembledUnit()
|
||||||
|
|
||||||
legacy_output: DsDocument = _EMPTY_LEGACY_DOC
|
legacy_output: Optional[DsDocument] = None # _EMPTY_LEGACY_DOC
|
||||||
output: DoclingDocument = _EMPTY_DOCLING_DOC
|
output: DoclingDocument = _EMPTY_DOCLING_DOC
|
||||||
|
|
||||||
def _to_legacy_document(self) -> DsDocument:
|
def _to_legacy_document(self) -> DsDocument:
|
||||||
|
@ -59,7 +59,10 @@ class TesseractOcrOptions(OcrOptions):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class PipelineOptions(BaseModel): ...
|
class PipelineOptions(BaseModel):
|
||||||
|
create_legacy_output: bool = (
|
||||||
|
True # This default will be set to False in a future version of docling
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class PdfPipelineOptions(PipelineOptions):
|
class PdfPipelineOptions(PipelineOptions):
|
||||||
|
@ -22,6 +22,8 @@ from docling.datamodel.document import ConversionResult
|
|||||||
class GlmModel:
|
class GlmModel:
|
||||||
def __init__(self, config):
|
def __init__(self, config):
|
||||||
self.config = config
|
self.config = config
|
||||||
|
self.create_legacy_output = config.get("create_legacy_output", True)
|
||||||
|
|
||||||
self.model_names = self.config.get(
|
self.model_names = self.config.get(
|
||||||
"model_names", ""
|
"model_names", ""
|
||||||
) # "language;term;reference"
|
) # "language;term;reference"
|
||||||
@ -42,7 +44,10 @@ class GlmModel:
|
|||||||
)
|
)
|
||||||
|
|
||||||
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
docling_doc: DoclingDocument = to_docling_document(glm_doc) # Experimental
|
||||||
legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict)
|
legacy_doc: DsLegacyDocument = None
|
||||||
|
|
||||||
|
if self.create_legacy_output:
|
||||||
|
legacy_doc = DsLegacyDocument.model_validate(ds_doc_dict)
|
||||||
|
|
||||||
# DEBUG code:
|
# DEBUG code:
|
||||||
def draw_clusters_and_cells(ds_document, page_no):
|
def draw_clusters_and_cells(ds_document, page_no):
|
||||||
@ -92,4 +97,4 @@ class GlmModel:
|
|||||||
# draw_clusters_and_cells(ds_doc, 0)
|
# draw_clusters_and_cells(ds_doc, 0)
|
||||||
# draw_clusters_and_cells(exported_doc, 0)
|
# draw_clusters_and_cells(exported_doc, 0)
|
||||||
|
|
||||||
return (legacy_doc, docling_doc)
|
return (docling_doc, legacy_doc)
|
||||||
|
@ -37,7 +37,9 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
|
|||||||
artifacts_path = self.download_models_hf()
|
artifacts_path = self.download_models_hf()
|
||||||
|
|
||||||
self.artifacts_path = Path(artifacts_path)
|
self.artifacts_path = Path(artifacts_path)
|
||||||
self.glm_model = GlmModel(config={})
|
self.glm_model = GlmModel(
|
||||||
|
config={"create_legacy_output": pipeline_options.create_legacy_output}
|
||||||
|
)
|
||||||
|
|
||||||
ocr_model: BaseOcrModel
|
ocr_model: BaseOcrModel
|
||||||
if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
|
if isinstance(pipeline_options.ocr_options, EasyOcrOptions):
|
||||||
@ -128,7 +130,7 @@ class StandardPdfModelPipeline(PaginatedModelPipeline):
|
|||||||
elements=all_elements, headers=all_headers, body=all_body
|
elements=all_elements, headers=all_headers, body=all_body
|
||||||
)
|
)
|
||||||
|
|
||||||
conv_res.legacy_output, conv_res.output = self.glm_model(conv_res)
|
conv_res.output, conv_res.legacy_output = self.glm_model(conv_res)
|
||||||
|
|
||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
@ -54,12 +55,15 @@ doc_converter = DocumentConverter( # all of the below is optional, has internal
|
|||||||
conv_results = doc_converter.convert_batch(input)
|
conv_results = doc_converter.convert_batch(input)
|
||||||
|
|
||||||
for res in conv_results:
|
for res in conv_results:
|
||||||
out_path = Path("./scratch") / f"{res.input.file.name}.experimental.md"
|
out_path = Path("./scratch")
|
||||||
print(
|
print(
|
||||||
f"Document {res.input.file.name} converted with status {res.status}."
|
f"Document {res.input.file.name} converted with status {res.status}."
|
||||||
f"\nSaved markdown output to: {str(out_path)}"
|
f"\nSaved markdown output to: {str(out_path)}"
|
||||||
)
|
)
|
||||||
# print(res.experimental.export_to_markdown())
|
# print(res.experimental.export_to_markdown())
|
||||||
# Export Docling document format to markdown (experimental):
|
# Export Docling document format to markdown (experimental):
|
||||||
with out_path.open("w") as fp:
|
with (out_path / f"{res.input.file.name}.md").open("w") as fp:
|
||||||
fp.write(res.output.export_to_markdown())
|
fp.write(res.output.export_to_markdown())
|
||||||
|
|
||||||
|
with (out_path / f"{res.input.file.name}.json").open("w") as fp:
|
||||||
|
fp.write(json.dumps(res.output.export_to_dict()))
|
||||||
|
Binary file not shown.
@ -16,7 +16,7 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
|
|||||||
|
|
||||||
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2
|
||||||
|
|
||||||
GENERATE = True
|
GENERATE = False
|
||||||
|
|
||||||
|
|
||||||
# Debug
|
# Debug
|
||||||
|
Loading…
Reference in New Issue
Block a user