diff --git a/docling/backend/docling_parse_v4_backend.py b/docling/backend/docling_parse_v4_backend.py index 3a8498af..9ec0aee2 100644 --- a/docling/backend/docling_parse_v4_backend.py +++ b/docling/backend/docling_parse_v4_backend.py @@ -13,6 +13,7 @@ from pypdfium2 import PdfPage from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend from docling.datamodel.base_models import Size +from docling.utils.locks import pypdfium2_lock if TYPE_CHECKING: from docling.datamodel.document import InputDocument @@ -138,9 +139,10 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): super().__init__(in_doc, path_or_stream) - self._pdoc = pdfium.PdfDocument(self.path_or_stream) + with pypdfium2_lock: + self._pdoc = pdfium.PdfDocument(self.path_or_stream) self.parser = DoclingPdfParser(loglevel="fatal") - self.dp_doc: PdfDocument = self.parser.load(path_or_stream=path_or_stream) + self.dp_doc: PdfDocument = self.parser.load(path_or_stream=self.path_or_stream) success = self.dp_doc is not None if not success: @@ -162,14 +164,15 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend): def load_page( self, page_no: int, create_words: bool = True, create_textlines: bool = True ) -> DoclingParseV4PageBackend: - return DoclingParseV4PageBackend( - self.dp_doc.get_page( - page_no + 1, - create_words=create_words, - create_textlines=create_textlines, - ), - self._pdoc[page_no], - ) + with pypdfium2_lock: + return DoclingParseV4PageBackend( + self.dp_doc.get_page( + page_no + 1, + create_words=create_words, + create_textlines=create_textlines, + ), + self._pdoc[page_no], + ) def is_valid(self) -> bool: return self.page_count() > 0 @@ -177,5 +180,6 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend): def unload(self): super().unload() self.dp_doc.unload() - self._pdoc.close() + with pypdfium2_lock: + self._pdoc.close() self._pdoc = None diff --git a/tests/test_input_doc.py b/tests/test_input_doc.py index 66afe286..4887ee66 100644 --- a/tests/test_input_doc.py +++ b/tests/test_input_doc.py @@ -1,6 +1,9 @@ from io import BytesIO from pathlib import Path +from docling.backend.docling_parse_backend import DoclingParseDocumentBackend +from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend +from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.datamodel.base_models import DocumentStream, InputFormat from docling.datamodel.document import InputDocument, _DocumentConversionInput @@ -40,7 +43,39 @@ def test_in_doc_from_invalid_buf(): assert doc.valid == False +def test_image_in_pdf_backend(): + + in_doc = InputDocument( + path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"), + format=InputFormat.IMAGE, + backend=PyPdfiumDocumentBackend, + ) + + assert in_doc.valid + in_doc = InputDocument( + path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"), + format=InputFormat.IMAGE, + backend=DoclingParseDocumentBackend, + ) + assert in_doc.valid + + in_doc = InputDocument( + path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"), + format=InputFormat.IMAGE, + backend=DoclingParseV2DocumentBackend, + ) + assert in_doc.valid + + in_doc = InputDocument( + path_or_stream=Path("tests/data/2305.03393v1-pg9-img.png"), + format=InputFormat.IMAGE, + backend=DoclingParseV4DocumentBackend, + ) + assert in_doc.valid + + def test_in_doc_with_page_range(): + test_doc_path = Path("./tests/data/pdf/2206.01062.pdf") limits = DocumentLimits() limits.page_range = (1, 10) @@ -192,7 +227,7 @@ def _make_input_doc(path): in_doc = InputDocument( path_or_stream=path, format=InputFormat.PDF, - backend=PyPdfiumDocumentBackend, + backend=DoclingParseV4DocumentBackend, ) return in_doc @@ -202,6 +237,6 @@ def _make_input_doc_from_stream(doc_stream): path_or_stream=doc_stream.stream, format=InputFormat.PDF, filename=doc_stream.name, - backend=PyPdfiumDocumentBackend, + backend=DoclingParseV4DocumentBackend, ) return in_doc diff --git a/tests/test_legacy_format_transform.py b/tests/test_legacy_format_transform.py index e3cc27e1..c46f8990 100644 --- a/tests/test_legacy_format_transform.py +++ b/tests/test_legacy_format_transform.py @@ -4,6 +4,7 @@ from pathlib import Path import pytest from docling.backend.docling_parse_backend import DoclingParseDocumentBackend +from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.document_converter import DocumentConverter, PdfFormatOption @@ -30,7 +31,7 @@ def get_converter(): format_options={ InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options), InputFormat.IMAGE: PdfFormatOption( - pipeline_options=pipeline_options, backend=DoclingParseDocumentBackend + pipeline_options=pipeline_options, ), } )