feat: Upgrade docling-parse PDF backend and interface to use page-by-page parsing (#44)

* Use docling-parse page-by-page

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Propagate document_hash to PDF backends, use docling-parse 1.0.0

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Upgrade lockfile

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Repin dependencies after more packages became available on PyPI

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2024-08-22 13:49:37 +02:00
committed by GitHub
parent f7c50c8b0e
commit a8c6b29a67
8 changed files with 73 additions and 51 deletions

View File

@@ -1,10 +1,15 @@
import json
import logging
import time
from io import BytesIO
from pathlib import Path
from typing import Iterable
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.base_models import (
ConversionStatus,
DocumentStream,
PipelineOptions,
)
from docling.datamodel.document import ConvertedDocument, DocumentConversionInput
from docling.document_converter import DocumentConverter
@@ -52,7 +57,11 @@ def main():
Path("./test/data/redp5695.pdf"),
]
doc_converter = DocumentConverter()
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
# input = DocumentConversionInput.from_streams(docs)
doc_converter = DocumentConverter(pipeline_options=PipelineOptions(do_ocr=False))
input = DocumentConversionInput.from_paths(input_doc_paths)