feat(pdf): Support for password-protected PDF documents (#2499)

* add test and example for PDF with password
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* use docling-parse with new password feature
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* add pdfbackendoptions
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* generalize backend_options and add PdfBackendOptions
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* add pdf-password option
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* update exception test
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* fix docs description
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
---------
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2025-12-08 12:48:28 +00:00 · 2025-10-22 12:48:01 +02:00
parent 89820d01b5
commit bbe82a68d0
16 changed files with 201 additions and 113 deletions
--- a/tests/data/pdf_password/2206.01062_pg3.pdf
+++ b/tests/data/pdf_password/2206.01062_pg3.pdf
--- a/tests/data/pdf_password/README.md
+++ b/tests/data/pdf_password/README.md
@@ -0,0 +1,4 @@
+This folder contains password-protected (locked) test documents.
+
+- Opening password: `1234`
+- Owner password: `owner`
--- a/tests/test_input_doc.py
+++ b/tests/test_input_doc.py
@@ -130,7 +130,7 @@ def test_in_doc_with_backend_options():
    assert not doc.backend_options.enable_local_fetch
    assert not doc.backend_options.enable_remote_fetch

-    with pytest.raises(ValueError, match="Incompatible types"):
+    with pytest.raises(AttributeError, match="no attribute 'source_uri'"):
        doc = InputDocument(
            path_or_stream=test_doc_path,
            format=InputFormat.HTML,
--- a/tests/test_pdf_password.py
+++ b/tests/test_pdf_password.py
@@ -0,0 +1,63 @@
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Iterable
+
+import pytest
+
+from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+from docling.backend.pypdfium2_backend import (
+    PyPdfiumDocumentBackend,
+)
+from docling.datamodel.backend_options import PdfBackendOptions
+from docling.datamodel.base_models import ConversionStatus, InputFormat
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling.document_converter import DocumentConverter, PdfFormatOption
+
+
+@pytest.fixture
+def test_doc_path():
+    """Provide the location of the password-protected sample PDF."""
+    locked_pdf = Path("./tests/data/pdf_password/2206.01062_pg3.pdf")
+    return locked_pdf
+
+
+@dataclass
+class TestOption:
+    """A named PDF format configuration used to parametrize the tests.
+
+    Attributes:
+        options: Format option (pipeline, backend, backend options) under test.
+        name: Short label; used as the pytest parameter id.
+    """
+
+    # The class name matches pytest's default ``Test*`` collection pattern.
+    # Opt out explicitly so pytest does not try to collect this dataclass and
+    # emit a PytestCollectionWarning about its generated ``__init__``.
+    # (Unannotated, so the dataclass machinery does not treat it as a field.)
+    __test__ = False
+
+    options: PdfFormatOption
+    name: str
+
+
+def converter_opts_gen() -> Iterable[TestOption]:
+    """Yield one password-enabled PDF format option per backend under test.
+
+    Both options share the same pipeline options (OCR and table structure
+    disabled) and the same backend options carrying the document password.
+    """
+    shared_pipeline = PdfPipelineOptions(
+        do_ocr=False,
+        do_table_structure=False,
+    )
+    shared_backend_opts = PdfBackendOptions(password="1234")
+
+    # (label, backend class) pairs, in the order they are yielded.
+    backends = (
+        ("PyPdfium", PyPdfiumDocumentBackend),
+        ("DoclingParseV4", DoclingParseV4DocumentBackend),
+    )
+
+    for label, backend_cls in backends:
+        yield TestOption(
+            options=PdfFormatOption(
+                pipeline_options=shared_pipeline,
+                backend=backend_cls,
+                backend_options=shared_backend_opts,
+            ),
+            name=label,
+        )
+
+
+@pytest.mark.parametrize("test_options", converter_opts_gen(), ids=lambda o: o.name)
+def test_get_text_from_rect(test_doc_path: Path, test_options: TestOption):
+    """Convert the password-protected PDF and expect a successful result.
+
+    Runs once per backend configuration produced by ``converter_opts_gen``.
+    This is a plain synchronous test, so it carries no asyncio marker.
+
+    NOTE(review): the function name looks copied from another test; this
+    test exercises a full conversion, not text extraction from a rectangle.
+    """
+    converter = DocumentConverter(
+        format_options={InputFormat.PDF: test_options.options}
+    )
+
+    res = converter.convert(test_doc_path)
+    assert res.status == ConversionStatus.SUCCESS