Fixes for DPv4 backend init, better test coverage

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-03-17 09:26:31 +01:00
parent e34c0750a7
commit fe45d30942
3 changed files with 54 additions and 14 deletions

View File

@ -13,6 +13,7 @@ from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Size
from docling.utils.locks import pypdfium2_lock
if TYPE_CHECKING:
from docling.datamodel.document import InputDocument
@ -138,9 +139,10 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
with pypdfium2_lock:
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
self.parser = DoclingPdfParser(loglevel="fatal")
self.dp_doc: PdfDocument = self.parser.load(path_or_stream=path_or_stream)
self.dp_doc: PdfDocument = self.parser.load(path_or_stream=self.path_or_stream)
success = self.dp_doc is not None
if not success:
@ -162,14 +164,15 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
def load_page(
    self, page_no: int, create_words: bool = True, create_textlines: bool = True
) -> DoclingParseV4PageBackend:
    """Create the page backend for the page at index ``page_no``.

    Args:
        page_no: Page index as used for ``self._pdoc``; the docling-parse
            document is queried with ``page_no + 1``.
        create_words: Forwarded to ``self.dp_doc.get_page``.
        create_textlines: Forwarded to ``self.dp_doc.get_page``.

    Returns:
        A ``DoclingParseV4PageBackend`` built from the parsed page and the
        matching pypdfium2 page object.
    """
    # Single return path, entirely under pypdfium2_lock. An earlier unlocked
    # return in front of this block made the locked variant unreachable.
    # NOTE(review): presumably the lock is needed because pdfium is not
    # thread-safe - confirm against docling.utils.locks.
    with pypdfium2_lock:
        return DoclingParseV4PageBackend(
            self.dp_doc.get_page(
                page_no + 1,
                create_words=create_words,
                create_textlines=create_textlines,
            ),
            self._pdoc[page_no],
        )
def is_valid(self) -> bool:
return self.page_count() > 0
@ -177,5 +180,6 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
def unload(self):
    """Release the parsed document and the pypdfium2 document handle.

    Calls the parent ``unload``, unloads ``self.dp_doc``, closes
    ``self._pdoc`` while holding ``pypdfium2_lock`` and finally clears the
    ``_pdoc`` reference.
    """
    super().unload()
    self.dp_doc.unload()
    # Close exactly once, and only under the lock; a second, unlocked
    # close() call on the same handle was redundant.
    with pypdfium2_lock:
        self._pdoc.close()
    self._pdoc = None

View File

@ -1,6 +1,9 @@
from io import BytesIO
from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import InputDocument, _DocumentConversionInput
@ -40,7 +43,39 @@ def test_in_doc_from_invalid_buf():
assert doc.valid == False
def test_image_in_pdf_backend():
    """Check that each PDF backend yields a valid InputDocument for a PNG image."""
    image_path = Path("tests/data/2305.03393v1-pg9-img.png")
    # Same backends, same order as the individual checks they replace.
    backend_classes = (
        PyPdfiumDocumentBackend,
        DoclingParseDocumentBackend,
        DoclingParseV2DocumentBackend,
        DoclingParseV4DocumentBackend,
    )
    for backend_cls in backend_classes:
        in_doc = InputDocument(
            path_or_stream=image_path,
            format=InputFormat.IMAGE,
            backend=backend_cls,
        )
        assert in_doc.valid
def test_in_doc_with_page_range():
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
limits = DocumentLimits()
limits.page_range = (1, 10)
@ -192,7 +227,7 @@ def _make_input_doc(path):
in_doc = InputDocument(
path_or_stream=path,
format=InputFormat.PDF,
backend=PyPdfiumDocumentBackend,
backend=DoclingParseV4DocumentBackend,
)
return in_doc
@ -202,6 +237,6 @@ def _make_input_doc_from_stream(doc_stream):
path_or_stream=doc_stream.stream,
format=InputFormat.PDF,
filename=doc_stream.name,
backend=PyPdfiumDocumentBackend,
backend=DoclingParseV4DocumentBackend,
)
return in_doc

View File

@ -4,6 +4,7 @@ from pathlib import Path
import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
@ -30,7 +31,7 @@ def get_converter():
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
InputFormat.IMAGE: PdfFormatOption(
pipeline_options=pipeline_options, backend=DoclingParseDocumentBackend
pipeline_options=pipeline_options,
),
}
)