mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 20:58:11 +00:00
feat: Implement new reading-order model (#916)
* Implement new reading-order model, replacing DS GLM model (WIP) Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update reading-order model branch Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update lockfile [skip ci] Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add captions, footnotes and merges [skip ci] Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Updates for reading-order implementation Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Updates for reading-order implementation Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update tests and lockfile Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes, update tests Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add normalization, update tests again Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update tests with code Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Push final lockfile Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * sanitize text Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * Include furniture, Update tests with furniture Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix content_layer assignment Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * chore: Delete empty file docling/models/ds_glm_model.py Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Signed-off-by: Nikos Livathinos <nli@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Nikos Livathinos <nli@zurich.ibm.com>
This commit is contained in:
@@ -5,16 +5,18 @@ from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
import yaml
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
|
||||
from docling.datamodel.base_models import ConversionStatus
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
USE_V2 = True
|
||||
USE_LEGACY = True
|
||||
USE_LEGACY = False
|
||||
|
||||
|
||||
def export_documents(
|
||||
@@ -33,26 +35,31 @@ def export_documents(
|
||||
doc_filename = conv_res.input.file.stem
|
||||
|
||||
if USE_V2:
|
||||
# Export Docling document format to JSON:
|
||||
with (output_dir / f"{doc_filename}.json").open("w") as fp:
|
||||
fp.write(json.dumps(conv_res.document.export_to_dict()))
|
||||
conv_res.document.save_as_json(
|
||||
output_dir / f"{doc_filename}.json",
|
||||
image_mode=ImageRefMode.PLACEHOLDER,
|
||||
)
|
||||
conv_res.document.save_as_html(
|
||||
output_dir / f"{doc_filename}.html",
|
||||
image_mode=ImageRefMode.EMBEDDED,
|
||||
)
|
||||
conv_res.document.save_as_document_tokens(
|
||||
output_dir / f"{doc_filename}.doctags.txt"
|
||||
)
|
||||
conv_res.document.save_as_markdown(
|
||||
output_dir / f"{doc_filename}.md",
|
||||
image_mode=ImageRefMode.PLACEHOLDER,
|
||||
)
|
||||
conv_res.document.save_as_markdown(
|
||||
output_dir / f"{doc_filename}.txt",
|
||||
image_mode=ImageRefMode.PLACEHOLDER,
|
||||
strict_text=True,
|
||||
)
|
||||
|
||||
# Export Docling document format to YAML:
|
||||
with (output_dir / f"{doc_filename}.yaml").open("w") as fp:
|
||||
fp.write(yaml.safe_dump(conv_res.document.export_to_dict()))
|
||||
|
||||
# Export Docling document format to doctags:
|
||||
with (output_dir / f"{doc_filename}.doctags.txt").open("w") as fp:
|
||||
fp.write(conv_res.document.export_to_document_tokens())
|
||||
|
||||
# Export Docling document format to markdown:
|
||||
with (output_dir / f"{doc_filename}.md").open("w") as fp:
|
||||
fp.write(conv_res.document.export_to_markdown())
|
||||
|
||||
# Export Docling document format to text:
|
||||
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
|
||||
fp.write(conv_res.document.export_to_markdown(strict_text=True))
|
||||
|
||||
if USE_LEGACY:
|
||||
# Export Deep Search document JSON format:
|
||||
with (output_dir / f"{doc_filename}.legacy.json").open(
|
||||
@@ -119,13 +126,20 @@ def main():
|
||||
# settings.debug.visualize_tables = True
|
||||
# settings.debug.visualize_cells = True
|
||||
|
||||
doc_converter = DocumentConverter()
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.generate_page_images = True
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
|
||||
}
|
||||
)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
conv_results = doc_converter.convert_all(
|
||||
input_doc_paths,
|
||||
raises_on_error=False, # to let conversion run through all and examine results at the end
|
||||
raises_on_error=True, # raise on the first conversion error instead of running through all documents and examining results at the end
|
||||
)
|
||||
success_count, partial_success_count, failure_count = export_documents(
|
||||
conv_results, output_dir=Path("scratch")
|
||||
|
||||
Reference in New Issue
Block a user