mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
feat: Add DoclingParseV4 backend, using high-level docling-parse API (#905)
Some checks failed
Run Docs CD / build-deploy-docs (push) Failing after 1m25s
Run Docs CI / build-docs (push) Failing after 52s
Some checks failed
Run Docs CD / build-deploy-docs (push) Failing after 1m25s
Run Docs CI / build-docs (push) Failing after 52s
* Add DoclingParseV3 backend implementation Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Use docling-core with docling-parse types Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes and test updates Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix streams Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix streams Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Reset tests Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * update test cases Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * update test units Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add back DoclingParse v1 backend, pipeline options Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update locks Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: update docling-core to 2.22.0 Update dependency library docling-core to latest release 2.22.0 Fix regression tests and ground truth files Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * Ground-truth files updated Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update tests, use TextCell.from_ocr property Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Text fixes, new test data Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Rename docling backend to v4 Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Test all backends, fixes Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Reset all tests to use docling-parse v1 for now Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes for DPv4 backend init, better test coverage Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * test_input_doc use default backend Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
@@ -1,10 +1,14 @@
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.datamodel.document import InputDocument, _DocumentConversionInput
|
||||
from docling.datamodel.settings import DocumentLimits
|
||||
from docling.document_converter import PdfFormatOption
|
||||
|
||||
|
||||
def test_in_doc_from_valid_path():
|
||||
@@ -40,7 +44,39 @@ def test_in_doc_from_invalid_buf():
|
||||
assert doc.valid == False
|
||||
|
||||
|
||||
def test_image_in_pdf_backend():
|
||||
|
||||
in_doc = InputDocument(
|
||||
path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||
format=InputFormat.IMAGE,
|
||||
backend=PyPdfiumDocumentBackend,
|
||||
)
|
||||
|
||||
assert in_doc.valid
|
||||
in_doc = InputDocument(
|
||||
path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||
format=InputFormat.IMAGE,
|
||||
backend=DoclingParseDocumentBackend,
|
||||
)
|
||||
assert in_doc.valid
|
||||
|
||||
in_doc = InputDocument(
|
||||
path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||
format=InputFormat.IMAGE,
|
||||
backend=DoclingParseV2DocumentBackend,
|
||||
)
|
||||
assert in_doc.valid
|
||||
|
||||
in_doc = InputDocument(
|
||||
path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
|
||||
format=InputFormat.IMAGE,
|
||||
backend=DoclingParseV4DocumentBackend,
|
||||
)
|
||||
assert in_doc.valid
|
||||
|
||||
|
||||
def test_in_doc_with_page_range():
|
||||
|
||||
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
|
||||
limits = DocumentLimits()
|
||||
limits.page_range = (1, 10)
|
||||
@@ -192,7 +228,7 @@ def _make_input_doc(path):
|
||||
in_doc = InputDocument(
|
||||
path_or_stream=path,
|
||||
format=InputFormat.PDF,
|
||||
backend=PyPdfiumDocumentBackend,
|
||||
backend=PdfFormatOption().backend, # use default
|
||||
)
|
||||
return in_doc
|
||||
|
||||
@@ -202,6 +238,6 @@ def _make_input_doc_from_stream(doc_stream):
|
||||
path_or_stream=doc_stream.stream,
|
||||
format=InputFormat.PDF,
|
||||
filename=doc_stream.name,
|
||||
backend=PyPdfiumDocumentBackend,
|
||||
backend=PdfFormatOption().backend, # use default
|
||||
)
|
||||
return in_doc
|
||||
|
||||
Reference in New Issue
Block a user