Fix DoclingParseV4 (DPv4) backend initialization and extend backend test coverage

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-26 03:55:00 +00:00 · 2025-03-17 09:26:31 +01:00 · 2025-03-17 09:26:31 +01:00 · fe45d30942
commit fe45d30942
parent e34c0750a7
3 changed files with 54 additions and 14 deletions
--- a/docling/backend/docling_parse_v4_backend.py
+++ b/docling/backend/docling_parse_v4_backend.py
@ -13,6 +13,7 @@ from pypdfium2 import PdfPage

 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
 from docling.datamodel.base_models import Size
+from docling.utils.locks import pypdfium2_lock

 if TYPE_CHECKING:
    from docling.datamodel.document import InputDocument
@ -138,9 +139,10 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        super().__init__(in_doc, path_or_stream)

-        self._pdoc = pdfium.PdfDocument(self.path_or_stream)
+        with pypdfium2_lock:
+            self._pdoc = pdfium.PdfDocument(self.path_or_stream)
        self.parser = DoclingPdfParser(loglevel="fatal")
-        self.dp_doc: PdfDocument = self.parser.load(path_or_stream=path_or_stream)
+        self.dp_doc: PdfDocument = self.parser.load(path_or_stream=self.path_or_stream)
        success = self.dp_doc is not None

        if not success:
@ -162,14 +164,15 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
    def load_page(
        self, page_no: int, create_words: bool = True, create_textlines: bool = True
    ) -> DoclingParseV4PageBackend:
-        return DoclingParseV4PageBackend(
-            self.dp_doc.get_page(
-                page_no + 1,
-                create_words=create_words,
-                create_textlines=create_textlines,
-            ),
-            self._pdoc[page_no],
-        )
+        with pypdfium2_lock:
+            return DoclingParseV4PageBackend(
+                self.dp_doc.get_page(
+                    page_no + 1,
+                    create_words=create_words,
+                    create_textlines=create_textlines,
+                ),
+                self._pdoc[page_no],
+            )

    def is_valid(self) -> bool:
        return self.page_count() > 0
@ -177,5 +180,6 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
    def unload(self):
        super().unload()
        self.dp_doc.unload()
-        self._pdoc.close()
+        with pypdfium2_lock:
+            self._pdoc.close()
        self._pdoc = None
--- a/tests/test_input_doc.py
+++ b/tests/test_input_doc.py
@ -1,6 +1,9 @@
 from io import BytesIO
 from pathlib import Path

+from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
+from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import DocumentStream, InputFormat
 from docling.datamodel.document import InputDocument, _DocumentConversionInput
@ -40,7 +43,39 @@ def test_in_doc_from_invalid_buf():
    assert doc.valid == False


+def test_image_in_pdf_backend():
+
+    in_doc = InputDocument(
+        path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
+        format=InputFormat.IMAGE,
+        backend=PyPdfiumDocumentBackend,
+    )
+
+    assert in_doc.valid
+    in_doc = InputDocument(
+        path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
+        format=InputFormat.IMAGE,
+        backend=DoclingParseDocumentBackend,
+    )
+    assert in_doc.valid
+
+    in_doc = InputDocument(
+        path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
+        format=InputFormat.IMAGE,
+        backend=DoclingParseV2DocumentBackend,
+    )
+    assert in_doc.valid
+
+    in_doc = InputDocument(
+        path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"),
+        format=InputFormat.IMAGE,
+        backend=DoclingParseV4DocumentBackend,
+    )
+    assert in_doc.valid
+
+
 def test_in_doc_with_page_range():
+
    test_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
    limits = DocumentLimits()
    limits.page_range = (1, 10)
@ -192,7 +227,7 @@ def _make_input_doc(path):
    in_doc = InputDocument(
        path_or_stream=path,
        format=InputFormat.PDF,
-        backend=PyPdfiumDocumentBackend,
+        backend=DoclingParseV4DocumentBackend,
    )
    return in_doc

@ -202,6 +237,6 @@ def _make_input_doc_from_stream(doc_stream):
        path_or_stream=doc_stream.stream,
        format=InputFormat.PDF,
        filename=doc_stream.name,
-        backend=PyPdfiumDocumentBackend,
+        backend=DoclingParseV4DocumentBackend,
    )
    return in_doc
--- a/tests/test_legacy_format_transform.py
+++ b/tests/test_legacy_format_transform.py
@ -4,6 +4,7 @@ from pathlib import Path
 import pytest

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.document_converter import DocumentConverter, PdfFormatOption
@ -30,7 +31,7 @@ def get_converter():
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
            InputFormat.IMAGE: PdfFormatOption(
-                pipeline_options=pipeline_options, backend=DoclingParseDocumentBackend
+                pipeline_options=pipeline_options,
            ),
        }
    )