feat(pdf): Support for password-protected PDF documents (#2499)

* add test and example for PDF with password

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* use docling-parse with new password feature

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add PdfBackendOptions

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* generalize backend_options and add PdfBackendOptions

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add pdf-password option

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update exception test

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix docs description

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2025-10-22 12:48:01 +02:00
committed by GitHub
parent 89820d01b5
commit bbe82a68d0
16 changed files with 201 additions and 113 deletions

View File

@@ -5,7 +5,11 @@ from typing import TYPE_CHECKING, Union
from docling_core.types.doc import DoclingDocument
from docling.datamodel.backend_options import BackendOptions, DeclarativeBackendOptions
from docling.datamodel.backend_options import (
BackendOptions,
BaseBackendOptions,
DeclarativeBackendOptions,
)
if TYPE_CHECKING:
from docling.datamodel.base_models import InputFormat
@@ -14,11 +18,17 @@ if TYPE_CHECKING:
class AbstractDocumentBackend(ABC):
@abstractmethod
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
def __init__(
self,
in_doc: "InputDocument",
path_or_stream: Union[BytesIO, Path],
options: BaseBackendOptions = BaseBackendOptions(),
):
self.file = in_doc.file
self.path_or_stream = path_or_stream
self.document_hash = in_doc.document_hash
self.input_format = in_doc.format
self.options = options
@abstractmethod
def is_valid(self) -> bool:
@@ -67,13 +77,8 @@ class DeclarativeDocumentBackend(AbstractDocumentBackend):
path_or_stream: Union[BytesIO, Path],
options: BackendOptions = DeclarativeBackendOptions(),
) -> None:
super().__init__(in_doc, path_or_stream)
self.options: BackendOptions = options
super().__init__(in_doc, path_or_stream, options)
@abstractmethod
def convert(self) -> DoclingDocument:
pass
@classmethod
def get_default_options(cls) -> BackendOptions:
return DeclarativeBackendOptions()

View File

@@ -12,6 +12,7 @@ from PIL import Image
from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.backend_options import PdfBackendOptions
from docling.datamodel.base_models import Size
from docling.utils.locks import pypdfium2_lock
@@ -189,13 +190,23 @@ class DoclingParseV4PageBackend(PdfPageBackend):
class DoclingParseV4DocumentBackend(PdfDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
def __init__(
self,
in_doc: "InputDocument",
path_or_stream: Union[BytesIO, Path],
options: PdfBackendOptions = PdfBackendOptions(),
):
super().__init__(in_doc, path_or_stream, options)
password = (
self.options.password.get_secret_value() if self.options.password else None
)
with pypdfium2_lock:
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
self.parser = DoclingPdfParser(loglevel="fatal")
self.dp_doc: PdfDocument = self.parser.load(path_or_stream=self.path_or_stream)
self.dp_doc: PdfDocument = self.parser.load(
path_or_stream=self.path_or_stream, password=password
)
success = self.dp_doc is not None
if not success:

View File

@@ -246,11 +246,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.HTML}
@classmethod
@override
def get_default_options(cls) -> HTMLBackendOptions:
return HTMLBackendOptions()
@override
def convert(self) -> DoclingDocument:
_log.debug("Starting HTML conversion...")

View File

@@ -536,11 +536,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.MD}
@classmethod
@override
def get_default_options(cls) -> MarkdownBackendOptions:
return MarkdownBackendOptions()
def convert(self) -> DoclingDocument:
_log.debug("converting Markdown...")

View File

@@ -9,6 +9,7 @@ from docling_core.types.doc.page import SegmentedPdfPage, TextCell
from PIL import Image
from docling.backend.abstract_backend import PaginatedDocumentBackend
from docling.datamodel.backend_options import PdfBackendOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
@@ -50,8 +51,14 @@ class PdfPageBackend(ABC):
class PdfDocumentBackend(PaginatedDocumentBackend):
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
def __init__(
self,
in_doc: InputDocument,
path_or_stream: Union[BytesIO, Path],
options: PdfBackendOptions = PdfBackendOptions(),
):
super().__init__(in_doc, path_or_stream, options)
self.options: PdfBackendOptions
if self.input_format is not InputFormat.PDF:
if self.input_format is InputFormat.IMAGE:

View File

@@ -20,6 +20,7 @@ from pypdfium2 import PdfTextPage
from pypdfium2._helpers.misc import PdfiumError
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.backend_options import PdfBackendOptions
from docling.utils.locks import pypdfium2_lock
@@ -370,12 +371,20 @@ class PyPdfiumPageBackend(PdfPageBackend):
class PyPdfiumDocumentBackend(PdfDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
def __init__(
self,
in_doc: "InputDocument",
path_or_stream: Union[BytesIO, Path],
options: PdfBackendOptions = PdfBackendOptions(),
):
super().__init__(in_doc, path_or_stream, options)
password = (
self.options.password.get_secret_value() if self.options.password else None
)
try:
with pypdfium2_lock:
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
except PdfiumError as e:
raise RuntimeError(
f"pypdfium could not load document with hash {self.document_hash}"

View File

@@ -51,6 +51,7 @@ from docling.datamodel.asr_model_specs import (
WHISPER_TURBO_NATIVE,
AsrModelType,
)
from docling.datamodel.backend_options import PdfBackendOptions
from docling.datamodel.base_models import (
ConversionStatus,
FormatToExtensions,
@@ -404,6 +405,9 @@ def convert( # noqa: C901
pdf_backend: Annotated[
PdfBackend, typer.Option(..., help="The PDF backend to use.")
] = PdfBackend.DLPARSE_V4,
pdf_password: Annotated[
Optional[str], typer.Option(..., help="Password for protected PDF documents")
] = None,
table_mode: Annotated[
TableFormerMode,
typer.Option(..., help="The mode to use in the table structure model."),
@@ -628,6 +632,9 @@ def convert( # noqa: C901
pipeline_options: PipelineOptions
format_options: Dict[InputFormat, FormatOption] = {}
pdf_backend_options: Optional[PdfBackendOptions] = PdfBackendOptions(
password=pdf_password
)
if pipeline == ProcessingPipeline.STANDARD:
pipeline_options = PdfPipelineOptions(
@@ -658,8 +665,10 @@ def convert( # noqa: C901
backend: Type[PdfDocumentBackend]
if pdf_backend == PdfBackend.DLPARSE_V1:
backend = DoclingParseDocumentBackend
pdf_backend_options = None
elif pdf_backend == PdfBackend.DLPARSE_V2:
backend = DoclingParseV2DocumentBackend
pdf_backend_options = None
elif pdf_backend == PdfBackend.DLPARSE_V4:
backend = DoclingParseV4DocumentBackend # type: ignore
elif pdf_backend == PdfBackend.PYPDFIUM2:
@@ -670,6 +679,7 @@ def convert( # noqa: C901
pdf_format_option = PdfFormatOption(
pipeline_options=pipeline_options,
backend=backend, # pdf_backend
backend_options=pdf_backend_options,
)
# METS GBS options
@@ -816,7 +826,7 @@ def convert( # noqa: C901
_log.error(f"{asr_model} is not known")
raise ValueError(f"{asr_model} is not known")
_log.info(f"ASR pipeline_options: {asr_pipeline_options}")
_log.debug(f"ASR pipeline_options: {asr_pipeline_options}")
audio_format_option = AudioFormatOption(
pipeline_cls=AsrPipeline,

View File

@@ -1,7 +1,7 @@
from pathlib import PurePath
from typing import Annotated, Literal, Optional, Union
from pydantic import AnyUrl, BaseModel, Field
from pydantic import AnyUrl, BaseModel, Field, SecretStr
class BaseBackendOptions(BaseModel):
@@ -64,7 +64,19 @@ class MarkdownBackendOptions(BaseBackendOptions):
)
class PdfBackendOptions(BaseBackendOptions):
"""Backend options for pdf document backends."""
# Tag used to tell the option classes apart: the BackendOptions union is
# declared with Field(discriminator="kind"). The field is declared with
# exclude=True and repr=False.
kind: Literal["pdf"] = Field("pdf", exclude=True, repr=False)
# Password for protected PDF documents; None (the default) means no password.
# Held as a SecretStr, which the PDF backends unwrap with get_secret_value()
# before handing it to the underlying parser.
password: Optional[SecretStr] = None
BackendOptions = Annotated[
Union[DeclarativeBackendOptions, HTMLBackendOptions, MarkdownBackendOptions],
Union[
DeclarativeBackendOptions,
HTMLBackendOptions,
MarkdownBackendOptions,
PdfBackendOptions,
],
Field(discriminator="kind"),
]

View File

@@ -114,7 +114,7 @@ class InputDocument(BaseModel):
]
valid: bool = Field(True, description="Whether this is is a valid input document.")
backend_options: Optional[BackendOptions] = Field(
None, description="Custom options for declarative backends."
None, description="Custom options for backends."
)
limits: DocumentLimits = Field(
DocumentLimits(), description="Limits in the input document for the conversion."
@@ -146,15 +146,6 @@ class InputDocument(BaseModel):
self.limits = limits or DocumentLimits()
self.format = format
# check for backend incompatibilities
if issubclass(backend, DeclarativeDocumentBackend) and backend_options:
if not issubclass(
type(backend_options), type(backend.get_default_options())
):
raise ValueError(
"Incompatible types between backend and backend_options arguments."
)
try:
if isinstance(path_or_stream, Path):
self.file = path_or_stream
@@ -214,7 +205,7 @@ class InputDocument(BaseModel):
backend: Type[AbstractDocumentBackend],
path_or_stream: Union[BytesIO, Path],
) -> None:
if issubclass(backend, DeclarativeDocumentBackend) and self.backend_options:
if self.backend_options:
self._backend = backend(
self,
path_or_stream=path_or_stream,

View File

@@ -31,7 +31,12 @@ from docling.backend.noop_backend import NoOpBackend
from docling.backend.webvtt_backend import WebVTTDocumentBackend
from docling.backend.xml.jats_backend import JatsDocumentBackend
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
from docling.datamodel.backend_options import BackendOptions, HTMLBackendOptions
from docling.datamodel.backend_options import (
BackendOptions,
HTMLBackendOptions,
MarkdownBackendOptions,
PdfBackendOptions,
)
from docling.datamodel.base_models import (
BaseFormatOption,
ConversionStatus,
@@ -98,7 +103,7 @@ class PowerpointFormatOption(FormatOption):
class MarkdownFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
backend_options: HTMLBackendOptions = HTMLBackendOptions()
backend_options: Optional[MarkdownBackendOptions] = None
class AsciiDocFormatOption(FormatOption):
@@ -109,7 +114,7 @@ class AsciiDocFormatOption(FormatOption):
class HTMLFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
backend_options: HTMLBackendOptions = HTMLBackendOptions()
backend_options: Optional[HTMLBackendOptions] = None
class PatentUsptoFormatOption(FormatOption):
@@ -130,6 +135,7 @@ class ImageFormatOption(FormatOption):
class PdfFormatOption(FormatOption):
pipeline_cls: Type = StandardPdfPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
backend_options: Optional[PdfBackendOptions] = None
class AudioFormatOption(FormatOption):
@@ -139,48 +145,24 @@ class AudioFormatOption(FormatOption):
def _get_default_option(format: InputFormat) -> FormatOption:
format_to_default_options = {
InputFormat.CSV: FormatOption(
pipeline_cls=SimplePipeline, backend=CsvDocumentBackend
),
InputFormat.XLSX: FormatOption(
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
),
InputFormat.DOCX: FormatOption(
pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
),
InputFormat.PPTX: FormatOption(
pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
),
InputFormat.MD: FormatOption(
pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
),
InputFormat.ASCIIDOC: FormatOption(
pipeline_cls=SimplePipeline, backend=AsciiDocBackend
),
InputFormat.HTML: FormatOption(
pipeline_cls=SimplePipeline,
backend=HTMLDocumentBackend,
backend_options=HTMLBackendOptions(),
),
InputFormat.XML_USPTO: FormatOption(
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
),
InputFormat.XML_JATS: FormatOption(
pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
),
InputFormat.CSV: CsvFormatOption(),
InputFormat.XLSX: ExcelFormatOption(),
InputFormat.DOCX: WordFormatOption(),
InputFormat.PPTX: PowerpointFormatOption(),
InputFormat.MD: MarkdownFormatOption(),
InputFormat.ASCIIDOC: AsciiDocFormatOption(),
InputFormat.HTML: HTMLFormatOption(),
InputFormat.XML_USPTO: PatentUsptoFormatOption(),
InputFormat.XML_JATS: XMLJatsFormatOption(),
InputFormat.METS_GBS: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=MetsGbsDocumentBackend
),
InputFormat.IMAGE: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
),
InputFormat.PDF: FormatOption(
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
),
InputFormat.IMAGE: ImageFormatOption(),
InputFormat.PDF: PdfFormatOption(),
InputFormat.JSON_DOCLING: FormatOption(
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
),
InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
InputFormat.AUDIO: AudioFormatOption(),
InputFormat.VTT: FormatOption(
pipeline_cls=SimplePipeline, backend=WebVTTDocumentBackend
),