feat(pdf): Support for password-protected PDF documents (#2499)

* add test and example for PDF with password

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* use docling-parse with new password feature

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add pdfbackendoptions

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* generalize backend_options and add PdfBackendOptions

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add pdf-password option

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update exception test

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix docs description

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2025-10-22 12:48:01 +02:00
committed by GitHub
parent 89820d01b5
commit bbe82a68d0
16 changed files with 201 additions and 113 deletions

Binary file not shown.

4
tests/data/pdf_password/README.md vendored Normal file
View File

@@ -0,0 +1,4 @@
This folder contains password-protected test documents, locked with the following passwords:
- Opening password: `1234`
- Owner password: `owner`

View File

@@ -130,7 +130,7 @@ def test_in_doc_with_backend_options():
assert not doc.backend_options.enable_local_fetch
assert not doc.backend_options.enable_remote_fetch
with pytest.raises(ValueError, match="Incompatible types"):
with pytest.raises(AttributeError, match="no attribute 'source_uri'"):
doc = InputDocument(
path_or_stream=test_doc_path,
format=InputFormat.HTML,

View File

@@ -0,0 +1,63 @@
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable
import pytest
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.pypdfium2_backend import (
PyPdfiumDocumentBackend,
)
from docling.datamodel.backend_options import PdfBackendOptions
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
@pytest.fixture
def test_doc_path():
    """Fixture returning the relative path of the locked sample PDF.

    The file lives under ``tests/data/pdf_password``; the path is relative,
    so it resolves against the current working directory.
    """
    locked_pdf = Path("./tests/data/pdf_password/2206.01062_pg3.pdf")
    return locked_pdf
@dataclass
class TestOption:
    """One named converter configuration used to parametrize the PDF tests.

    The class name starts with ``Test``, which matches pytest's default
    class-collection pattern, so collection is disabled explicitly below.
    """

    # Mark this as a plain data holder rather than a test class; without it
    # pytest tries to collect the class and warns about the generated __init__.
    # The attribute is unannotated on purpose so it is not a dataclass field.
    __test__ = False

    # Format option handed to the converter for PDF input.
    options: PdfFormatOption
    # Short label used as the pytest parameter id.
    name: str
def converter_opts_gen() -> Iterable[TestOption]:
    """Yield one ``TestOption`` per PDF backend, all using the same password.

    Both yielded options share a single ``PdfPipelineOptions`` instance (OCR and
    table structure switched off) and a single ``PdfBackendOptions`` instance
    carrying the password. The PyPdfium option is produced first, followed by
    the DoclingParseV4 one.
    """
    shared_pipeline = PdfPipelineOptions(do_ocr=False, do_table_structure=False)
    shared_backend_opts = PdfBackendOptions(password="1234")

    # (label, backend class) pairs in the order they must be yielded.
    labelled_backends = (
        ("PyPdfium", PyPdfiumDocumentBackend),
        ("DoclingParseV4", DoclingParseV4DocumentBackend),
    )
    for label, backend_cls in labelled_backends:
        format_option = PdfFormatOption(
            pipeline_options=shared_pipeline,
            backend=backend_cls,
            backend_options=shared_backend_opts,
        )
        yield TestOption(options=format_option, name=label)
@pytest.mark.parametrize("test_options", converter_opts_gen(), ids=lambda o: o.name)
def test_get_text_from_rect(test_doc_path: Path, test_options: TestOption):
    """Check that the document at ``test_doc_path`` converts successfully.

    Runs once per option produced by ``converter_opts_gen``; the option's
    ``name`` is used as the test id.

    The former ``@pytest.mark.asyncio`` marker was dropped: this is a plain
    synchronous function, so the marker had nothing to act on.

    NOTE(review): the function name refers to "text from rect", but the body
    only checks the conversion status. The name is kept so existing test ids
    stay stable.
    """
    converter = DocumentConverter(
        format_options={InputFormat.PDF: test_options.options}
    )
    res = converter.convert(test_doc_path)
    assert res.status == ConversionStatus.SUCCESS