From 27b896b9388364d2077950b9c59a5fd2d4d6a5e6 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Mon, 10 Feb 2025 16:59:52 +0100 Subject: [PATCH] Updates for reading-order implementation Signed-off-by: Christoph Auer --- docs/examples/batch_convert.py | 22 +++++++++++++++------- poetry.lock | 2 +- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/docs/examples/batch_convert.py b/docs/examples/batch_convert.py index 536b9aae..c21a4645 100644 --- a/docs/examples/batch_convert.py +++ b/docs/examples/batch_convert.py @@ -7,10 +7,11 @@ from typing import Iterable import yaml from docling_core.types.doc import ImageRefMode -from docling.datamodel.base_models import ConversionStatus +from docling.datamodel.base_models import ConversionStatus, InputFormat from docling.datamodel.document import ConversionResult +from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.settings import settings -from docling.document_converter import DocumentConverter +from docling.document_converter import DocumentConverter, PdfFormatOption _log = logging.getLogger(__name__) @@ -38,6 +39,10 @@ def export_documents( output_dir / f"{doc_filename}.json", image_mode=ImageRefMode.PLACEHOLDER, ) + conv_res.document.save_as_html( + output_dir / f"{doc_filename}.html", + image_mode=ImageRefMode.EMBEDDED, + ) conv_res.document.save_as_document_tokens( output_dir / f"{doc_filename}.doctags.txt" ) @@ -50,10 +55,6 @@ def export_documents( image_mode=ImageRefMode.PLACEHOLDER, strict_text=True, ) - conv_res.document.save_as_html( - output_dir / f"{doc_filename}.html", - image_mode=ImageRefMode.EMBEDDED, - ) # Export Docling document format to YAML: with (output_dir / f"{doc_filename}.yaml").open("w") as fp: @@ -125,7 +126,14 @@ def main(): # settings.debug.visualize_tables = True # settings.debug.visualize_cells = True - doc_converter = DocumentConverter() + pipeline_options = PdfPipelineOptions() + pipeline_options.generate_page_images = True + + doc_converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) + } + ) start_time = time.time() diff --git a/poetry.lock b/poetry.lock index f2007744..d7b9fb92 100644 --- a/poetry.lock +++ b/poetry.lock @@ -879,7 +879,7 @@ transformers = [ type = "git" url = "https://github.com/DS4SD/docling-ibm-models.git" reference = "dev/add-reading-order" -resolved_reference = "2f88418a493321a1ee82a01604f617488f6a8feb" +resolved_reference = "6892adfa4fcf0878b938e8efc1407dec46e96bdd" [[package]] name = "docling-parse"