Fix DoclingParse v4 (DPv4) backend initialization and improve test coverage

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2025-03-17 09:26:31 +01:00
parent e34c0750a7
commit fe45d30942
3 changed files with 54 additions and 14 deletions

View File

@ -13,6 +13,7 @@ from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Size from docling.datamodel.base_models import Size
from docling.utils.locks import pypdfium2_lock
if TYPE_CHECKING: if TYPE_CHECKING:
from docling.datamodel.document import InputDocument from docling.datamodel.document import InputDocument
@ -138,9 +139,10 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream) super().__init__(in_doc, path_or_stream)
with pypdfium2_lock:
self._pdoc = pdfium.PdfDocument(self.path_or_stream) self._pdoc = pdfium.PdfDocument(self.path_or_stream)
self.parser = DoclingPdfParser(loglevel="fatal") self.parser = DoclingPdfParser(loglevel="fatal")
self.dp_doc: PdfDocument = self.parser.load(path_or_stream=path_or_stream) self.dp_doc: PdfDocument = self.parser.load(path_or_stream=self.path_or_stream)
success = self.dp_doc is not None success = self.dp_doc is not None
if not success: if not success:
@ -162,6 +164,7 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
def load_page( def load_page(
self, page_no: int, create_words: bool = True, create_textlines: bool = True self, page_no: int, create_words: bool = True, create_textlines: bool = True
) -> DoclingParseV4PageBackend: ) -> DoclingParseV4PageBackend:
with pypdfium2_lock:
return DoclingParseV4PageBackend( return DoclingParseV4PageBackend(
self.dp_doc.get_page( self.dp_doc.get_page(
page_no + 1, page_no + 1,
@ -177,5 +180,6 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
def unload(self): def unload(self):
super().unload() super().unload()
self.dp_doc.unload() self.dp_doc.unload()
with pypdfium2_lock:
self._pdoc.close() self._pdoc.close()
self._pdoc = None self._pdoc = None

View File

@ -1,6 +1,9 @@
from io import BytesIO from io import BytesIO
from pathlib import Path from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import DocumentStream, InputFormat from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import InputDocument, _DocumentConversionInput from docling.datamodel.document import InputDocument, _DocumentConversionInput
@ -40,7 +43,39 @@ def test_in_doc_from_invalid_buf():
assert doc.valid == False assert doc.valid == False
def test_image_in_pdf_backend():
    """Check that every PDF backend accepts a PNG image as a valid input.

    Each backend class is handed the same image file with
    ``InputFormat.IMAGE`` and the resulting ``InputDocument`` must report
    itself as valid.
    """
    image_path = Path("tests/data/2305.03393v1-pg9-img.png")

    # Same order as the backends were originally exercised one by one.
    pdf_backends = (
        PyPdfiumDocumentBackend,
        DoclingParseDocumentBackend,
        DoclingParseV2DocumentBackend,
        DoclingParseV4DocumentBackend,
    )

    for backend_cls in pdf_backends:
        in_doc = InputDocument(
            path_or_stream=image_path,
            format=InputFormat.IMAGE,
            backend=backend_cls,
        )
        assert in_doc.valid
def test_in_doc_with_page_range(): def test_in_doc_with_page_range():
test_doc_path = Path("./tests/data/pdf/2206.01062.pdf") test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
limits = DocumentLimits() limits = DocumentLimits()
limits.page_range = (1, 10) limits.page_range = (1, 10)
@ -192,7 +227,7 @@ def _make_input_doc(path):
in_doc = InputDocument( in_doc = InputDocument(
path_or_stream=path, path_or_stream=path,
format=InputFormat.PDF, format=InputFormat.PDF,
backend=PyPdfiumDocumentBackend, backend=DoclingParseV4DocumentBackend,
) )
return in_doc return in_doc
@ -202,6 +237,6 @@ def _make_input_doc_from_stream(doc_stream):
path_or_stream=doc_stream.stream, path_or_stream=doc_stream.stream,
format=InputFormat.PDF, format=InputFormat.PDF,
filename=doc_stream.name, filename=doc_stream.name,
backend=PyPdfiumDocumentBackend, backend=DoclingParseV4DocumentBackend,
) )
return in_doc return in_doc

View File

@ -4,6 +4,7 @@ from pathlib import Path
import pytest import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption from docling.document_converter import DocumentConverter, PdfFormatOption
@ -30,7 +31,7 @@ def get_converter():
format_options={ format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options), InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
InputFormat.IMAGE: PdfFormatOption( InputFormat.IMAGE: PdfFormatOption(
pipeline_options=pipeline_options, backend=DoclingParseDocumentBackend pipeline_options=pipeline_options,
), ),
} }
) )