feat(pdf): Support for password-protected PDF documents (#2499)

* add test and example for PDF with password

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* use docling-parse with new password feature

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add pdfbackendoptions

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* generalize backend_options and add PdfBackendOptions

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add pdf-password option

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update exception test

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix docs description

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2025-12-11 22:28:31 +00:00 · 2025-10-22 12:48:01 +02:00
parent 89820d01b5
commit bbe82a68d0
16 changed files with 201 additions and 113 deletions
--- a/docling/backend/abstract_backend.py
+++ b/docling/backend/abstract_backend.py
@@ -5,7 +5,11 @@ from typing import TYPE_CHECKING, Union

 from docling_core.types.doc import DoclingDocument

-from docling.datamodel.backend_options import BackendOptions, DeclarativeBackendOptions
+from docling.datamodel.backend_options import (
+    BackendOptions,
+    BaseBackendOptions,
+    DeclarativeBackendOptions,
+)

 if TYPE_CHECKING:
    from docling.datamodel.base_models import InputFormat
@@ -14,11 +18,17 @@ if TYPE_CHECKING:

 class AbstractDocumentBackend(ABC):
    @abstractmethod
-    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+    def __init__(
+        self,
+        in_doc: "InputDocument",
+        path_or_stream: Union[BytesIO, Path],
+        options: BaseBackendOptions = BaseBackendOptions(),
+    ):
        self.file = in_doc.file
        self.path_or_stream = path_or_stream
        self.document_hash = in_doc.document_hash
        self.input_format = in_doc.format
+        self.options = options

    @abstractmethod
    def is_valid(self) -> bool:
@@ -67,13 +77,8 @@ class DeclarativeDocumentBackend(AbstractDocumentBackend):
        path_or_stream: Union[BytesIO, Path],
        options: BackendOptions = DeclarativeBackendOptions(),
    ) -> None:
-        super().__init__(in_doc, path_or_stream)
-        self.options: BackendOptions = options
+        super().__init__(in_doc, path_or_stream, options)

    @abstractmethod
    def convert(self) -> DoclingDocument:
        pass
-
-    @classmethod
-    def get_default_options(cls) -> BackendOptions:
-        return DeclarativeBackendOptions()
--- a/docling/backend/docling_parse_v4_backend.py
+++ b/docling/backend/docling_parse_v4_backend.py
@@ -12,6 +12,7 @@ from PIL import Image
 from pypdfium2 import PdfPage

 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.datamodel.backend_options import PdfBackendOptions
 from docling.datamodel.base_models import Size
 from docling.utils.locks import pypdfium2_lock

@@ -189,13 +190,23 @@ class DoclingParseV4PageBackend(PdfPageBackend):


 class DoclingParseV4DocumentBackend(PdfDocumentBackend):
-    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
-        super().__init__(in_doc, path_or_stream)
+    def __init__(
+        self,
+        in_doc: "InputDocument",
+        path_or_stream: Union[BytesIO, Path],
+        options: PdfBackendOptions = PdfBackendOptions(),
+    ):
+        super().__init__(in_doc, path_or_stream, options)

+        password = (
+            self.options.password.get_secret_value() if self.options.password else None
+        )
        with pypdfium2_lock:
-            self._pdoc = pdfium.PdfDocument(self.path_or_stream)
+            self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
        self.parser = DoclingPdfParser(loglevel="fatal")
-        self.dp_doc: PdfDocument = self.parser.load(path_or_stream=self.path_or_stream)
+        self.dp_doc: PdfDocument = self.parser.load(
+            path_or_stream=self.path_or_stream, password=password
+        )
        success = self.dp_doc is not None

        if not success:
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -246,11 +246,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
    def supported_formats(cls) -> set[InputFormat]:
        return {InputFormat.HTML}

-    @classmethod
-    @override
-    def get_default_options(cls) -> HTMLBackendOptions:
-        return HTMLBackendOptions()
-
    @override
    def convert(self) -> DoclingDocument:
        _log.debug("Starting HTML conversion...")
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@@ -536,11 +536,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
    def supported_formats(cls) -> set[InputFormat]:
        return {InputFormat.MD}

-    @classmethod
-    @override
-    def get_default_options(cls) -> MarkdownBackendOptions:
-        return MarkdownBackendOptions()
-
    def convert(self) -> DoclingDocument:
        _log.debug("converting Markdown...")

--- a/docling/backend/pdf_backend.py
+++ b/docling/backend/pdf_backend.py
@@ -9,6 +9,7 @@ from docling_core.types.doc.page import SegmentedPdfPage, TextCell
 from PIL import Image

 from docling.backend.abstract_backend import PaginatedDocumentBackend
+from docling.datamodel.backend_options import PdfBackendOptions
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument

@@ -50,8 +51,14 @@ class PdfPageBackend(ABC):


 class PdfDocumentBackend(PaginatedDocumentBackend):
-    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
-        super().__init__(in_doc, path_or_stream)
+    def __init__(
+        self,
+        in_doc: InputDocument,
+        path_or_stream: Union[BytesIO, Path],
+        options: PdfBackendOptions = PdfBackendOptions(),
+    ):
+        super().__init__(in_doc, path_or_stream, options)
+        self.options: PdfBackendOptions

        if self.input_format is not InputFormat.PDF:
            if self.input_format is InputFormat.IMAGE:
--- a/docling/backend/pypdfium2_backend.py
+++ b/docling/backend/pypdfium2_backend.py
@@ -20,6 +20,7 @@ from pypdfium2 import PdfTextPage
 from pypdfium2._helpers.misc import PdfiumError

 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.datamodel.backend_options import PdfBackendOptions
 from docling.utils.locks import pypdfium2_lock


@@ -370,12 +371,20 @@ class PyPdfiumPageBackend(PdfPageBackend):


 class PyPdfiumDocumentBackend(PdfDocumentBackend):
-    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
-        super().__init__(in_doc, path_or_stream)
+    def __init__(
+        self,
+        in_doc: "InputDocument",
+        path_or_stream: Union[BytesIO, Path],
+        options: PdfBackendOptions = PdfBackendOptions(),
+    ):
+        super().__init__(in_doc, path_or_stream, options)

+        password = (
+            self.options.password.get_secret_value() if self.options.password else None
+        )
        try:
            with pypdfium2_lock:
-                self._pdoc = pdfium.PdfDocument(self.path_or_stream)
+                self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
        except PdfiumError as e:
            raise RuntimeError(
                f"pypdfium could not load document with hash {self.document_hash}"
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -51,6 +51,7 @@ from docling.datamodel.asr_model_specs import (
    WHISPER_TURBO_NATIVE,
    AsrModelType,
 )
+from docling.datamodel.backend_options import PdfBackendOptions
 from docling.datamodel.base_models import (
    ConversionStatus,
    FormatToExtensions,
@@ -404,6 +405,9 @@ def convert(  # noqa: C901
    pdf_backend: Annotated[
        PdfBackend, typer.Option(..., help="The PDF backend to use.")
    ] = PdfBackend.DLPARSE_V4,
+    pdf_password: Annotated[
+        Optional[str], typer.Option(..., help="Password for protected PDF documents")
+    ] = None,
    table_mode: Annotated[
        TableFormerMode,
        typer.Option(..., help="The mode to use in the table structure model."),
@@ -628,6 +632,9 @@ def convert(  # noqa: C901
        pipeline_options: PipelineOptions

        format_options: Dict[InputFormat, FormatOption] = {}
+        pdf_backend_options: Optional[PdfBackendOptions] = PdfBackendOptions(
+            password=pdf_password
+        )

        if pipeline == ProcessingPipeline.STANDARD:
            pipeline_options = PdfPipelineOptions(
@@ -658,8 +665,10 @@ def convert(  # noqa: C901
            backend: Type[PdfDocumentBackend]
            if pdf_backend == PdfBackend.DLPARSE_V1:
                backend = DoclingParseDocumentBackend
+                pdf_backend_options = None
            elif pdf_backend == PdfBackend.DLPARSE_V2:
                backend = DoclingParseV2DocumentBackend
+                pdf_backend_options = None
            elif pdf_backend == PdfBackend.DLPARSE_V4:
                backend = DoclingParseV4DocumentBackend  # type: ignore
            elif pdf_backend == PdfBackend.PYPDFIUM2:
@@ -670,6 +679,7 @@ def convert(  # noqa: C901
            pdf_format_option = PdfFormatOption(
                pipeline_options=pipeline_options,
                backend=backend,  # pdf_backend
+                backend_options=pdf_backend_options,
            )

            # METS GBS options
@@ -816,7 +826,7 @@ def convert(  # noqa: C901
            _log.error(f"{asr_model} is not known")
            raise ValueError(f"{asr_model} is not known")

-        _log.info(f"ASR pipeline_options: {asr_pipeline_options}")
+        _log.debug(f"ASR pipeline_options: {asr_pipeline_options}")

        audio_format_option = AudioFormatOption(
            pipeline_cls=AsrPipeline,
--- a/docling/datamodel/backend_options.py
+++ b/docling/datamodel/backend_options.py
@@ -1,7 +1,7 @@
 from pathlib import PurePath
 from typing import Annotated, Literal, Optional, Union

-from pydantic import AnyUrl, BaseModel, Field
+from pydantic import AnyUrl, BaseModel, Field, SecretStr


 class BaseBackendOptions(BaseModel):
@@ -64,7 +64,19 @@ class MarkdownBackendOptions(BaseBackendOptions):
    )


+class PdfBackendOptions(BaseBackendOptions):
+    """Backend options for pdf document backends."""
+
+    kind: Literal["pdf"] = Field("pdf", exclude=True, repr=False)
+    password: Optional[SecretStr] = None
+
+
 BackendOptions = Annotated[
-    Union[DeclarativeBackendOptions, HTMLBackendOptions, MarkdownBackendOptions],
+    Union[
+        DeclarativeBackendOptions,
+        HTMLBackendOptions,
+        MarkdownBackendOptions,
+        PdfBackendOptions,
+    ],
    Field(discriminator="kind"),
 ]
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -114,7 +114,7 @@ class InputDocument(BaseModel):
    ]
    valid: bool = Field(True, description="Whether this is is a valid input document.")
    backend_options: Optional[BackendOptions] = Field(
-        None, description="Custom options for declarative backends."
+        None, description="Custom options for backends."
    )
    limits: DocumentLimits = Field(
        DocumentLimits(), description="Limits in the input document for the conversion."
@@ -146,15 +146,6 @@ class InputDocument(BaseModel):
        self.limits = limits or DocumentLimits()
        self.format = format

-        # check for backend incompatibilities
-        if issubclass(backend, DeclarativeDocumentBackend) and backend_options:
-            if not issubclass(
-                type(backend_options), type(backend.get_default_options())
-            ):
-                raise ValueError(
-                    "Incompatible types between backend and backend_options arguments."
-                )
-
        try:
            if isinstance(path_or_stream, Path):
                self.file = path_or_stream
@@ -214,7 +205,7 @@ class InputDocument(BaseModel):
        backend: Type[AbstractDocumentBackend],
        path_or_stream: Union[BytesIO, Path],
    ) -> None:
-        if issubclass(backend, DeclarativeDocumentBackend) and self.backend_options:
+        if self.backend_options:
            self._backend = backend(
                self,
                path_or_stream=path_or_stream,
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -31,7 +31,12 @@ from docling.backend.noop_backend import NoOpBackend
 from docling.backend.webvtt_backend import WebVTTDocumentBackend
 from docling.backend.xml.jats_backend import JatsDocumentBackend
 from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
-from docling.datamodel.backend_options import BackendOptions, HTMLBackendOptions
+from docling.datamodel.backend_options import (
+    BackendOptions,
+    HTMLBackendOptions,
+    MarkdownBackendOptions,
+    PdfBackendOptions,
+)
 from docling.datamodel.base_models import (
    BaseFormatOption,
    ConversionStatus,
@@ -98,7 +103,7 @@ class PowerpointFormatOption(FormatOption):
 class MarkdownFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
-    backend_options: HTMLBackendOptions = HTMLBackendOptions()
+    backend_options: Optional[MarkdownBackendOptions] = None


 class AsciiDocFormatOption(FormatOption):
@@ -109,7 +114,7 @@ class AsciiDocFormatOption(FormatOption):
 class HTMLFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
-    backend_options: HTMLBackendOptions = HTMLBackendOptions()
+    backend_options: Optional[HTMLBackendOptions] = None


 class PatentUsptoFormatOption(FormatOption):
@@ -130,6 +135,7 @@ class ImageFormatOption(FormatOption):
 class PdfFormatOption(FormatOption):
    pipeline_cls: Type = StandardPdfPipeline
    backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
+    backend_options: Optional[PdfBackendOptions] = None


 class AudioFormatOption(FormatOption):
@@ -139,48 +145,24 @@ class AudioFormatOption(FormatOption):

 def _get_default_option(format: InputFormat) -> FormatOption:
    format_to_default_options = {
-        InputFormat.CSV: FormatOption(
-            pipeline_cls=SimplePipeline, backend=CsvDocumentBackend
-        ),
-        InputFormat.XLSX: FormatOption(
-            pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
-        ),
-        InputFormat.DOCX: FormatOption(
-            pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
-        ),
-        InputFormat.PPTX: FormatOption(
-            pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
-        ),
-        InputFormat.MD: FormatOption(
-            pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
-        ),
-        InputFormat.ASCIIDOC: FormatOption(
-            pipeline_cls=SimplePipeline, backend=AsciiDocBackend
-        ),
-        InputFormat.HTML: FormatOption(
-            pipeline_cls=SimplePipeline,
-            backend=HTMLDocumentBackend,
-            backend_options=HTMLBackendOptions(),
-        ),
-        InputFormat.XML_USPTO: FormatOption(
-            pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
-        ),
-        InputFormat.XML_JATS: FormatOption(
-            pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
-        ),
+        InputFormat.CSV: CsvFormatOption(),
+        InputFormat.XLSX: ExcelFormatOption(),
+        InputFormat.DOCX: WordFormatOption(),
+        InputFormat.PPTX: PowerpointFormatOption(),
+        InputFormat.MD: MarkdownFormatOption(),
+        InputFormat.ASCIIDOC: AsciiDocFormatOption(),
+        InputFormat.HTML: HTMLFormatOption(),
+        InputFormat.XML_USPTO: PatentUsptoFormatOption(),
+        InputFormat.XML_JATS: XMLJatsFormatOption(),
        InputFormat.METS_GBS: FormatOption(
            pipeline_cls=StandardPdfPipeline, backend=MetsGbsDocumentBackend
        ),
-        InputFormat.IMAGE: FormatOption(
-            pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
-        ),
-        InputFormat.PDF: FormatOption(
-            pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
-        ),
+        InputFormat.IMAGE: ImageFormatOption(),
+        InputFormat.PDF: PdfFormatOption(),
        InputFormat.JSON_DOCLING: FormatOption(
            pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
        ),
-        InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
+        InputFormat.AUDIO: AudioFormatOption(),
        InputFormat.VTT: FormatOption(
            pipeline_cls=SimplePipeline, backend=WebVTTDocumentBackend
        ),