feat: Upgrade docling-parse PDF backend and interface to use page-by-page parsing (#44)

* Use docling-parse page-by-page

  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Propagate document_hash to PDF backends, use docling-parse 1.0.0

  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Upgrade lockfile

  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* repin after more packages on pypi

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
2025-12-08 20:58:11 +00:00 · 2024-08-22 13:49:37 +02:00
parent f7c50c8b0e
commit a8c6b29a67
8 changed files with 73 additions and 51 deletions
--- a/examples/batch_convert.py
+++ b/examples/batch_convert.py
@@ -1,10 +1,15 @@
 import json
 import logging
 import time
+from io import BytesIO
 from pathlib import Path
 from typing import Iterable

-from docling.datamodel.base_models import ConversionStatus
+from docling.datamodel.base_models import (
+    ConversionStatus,
+    DocumentStream,
+    PipelineOptions,
+)
 from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
 from docling.document_converter import DocumentConverter

@@ -52,7 +57,11 @@ def main():
        Path("./test/data/redp5695.pdf"),
    ]

-    doc_converter = DocumentConverter()
+    # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
+    # docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
+    # input = DocumentConversionInput.from_streams(docs)
+
+    doc_converter = DocumentConverter(pipeline_options=PipelineOptions(do_ocr=False))

    input = DocumentConversionInput.from_paths(input_doc_paths)