mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-11 22:28:31 +00:00
feat(pdf): Support for password-protected PDF documents (#2499)
* add test and example for PDF with password

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* use docling-parse with new password feature

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add pdfbackendoptions

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* generalize backend_options and add PdfBackendOptions

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* add pdf-password option

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update exception test

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix docs description

  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
@@ -5,7 +5,11 @@ from typing import TYPE_CHECKING, Union
|
||||
|
||||
from docling_core.types.doc import DoclingDocument
|
||||
|
||||
from docling.datamodel.backend_options import BackendOptions, DeclarativeBackendOptions
|
||||
from docling.datamodel.backend_options import (
|
||||
BackendOptions,
|
||||
BaseBackendOptions,
|
||||
DeclarativeBackendOptions,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
@@ -14,11 +18,17 @@ if TYPE_CHECKING:
|
||||
|
||||
class AbstractDocumentBackend(ABC):
|
||||
@abstractmethod
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
def __init__(
|
||||
self,
|
||||
in_doc: "InputDocument",
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
options: BaseBackendOptions = BaseBackendOptions(),
|
||||
):
|
||||
self.file = in_doc.file
|
||||
self.path_or_stream = path_or_stream
|
||||
self.document_hash = in_doc.document_hash
|
||||
self.input_format = in_doc.format
|
||||
self.options = options
|
||||
|
||||
@abstractmethod
|
||||
def is_valid(self) -> bool:
|
||||
@@ -67,13 +77,8 @@ class DeclarativeDocumentBackend(AbstractDocumentBackend):
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
options: BackendOptions = DeclarativeBackendOptions(),
|
||||
) -> None:
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
self.options: BackendOptions = options
|
||||
super().__init__(in_doc, path_or_stream, options)
|
||||
|
||||
@abstractmethod
|
||||
def convert(self) -> DoclingDocument:
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
def get_default_options(cls) -> BackendOptions:
|
||||
return DeclarativeBackendOptions()
|
||||
|
||||
@@ -12,6 +12,7 @@ from PIL import Image
|
||||
from pypdfium2 import PdfPage
|
||||
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.datamodel.backend_options import PdfBackendOptions
|
||||
from docling.datamodel.base_models import Size
|
||||
from docling.utils.locks import pypdfium2_lock
|
||||
|
||||
@@ -189,13 +190,23 @@ class DoclingParseV4PageBackend(PdfPageBackend):
|
||||
|
||||
|
||||
class DoclingParseV4DocumentBackend(PdfDocumentBackend):
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
def __init__(
|
||||
self,
|
||||
in_doc: "InputDocument",
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
options: PdfBackendOptions = PdfBackendOptions(),
|
||||
):
|
||||
super().__init__(in_doc, path_or_stream, options)
|
||||
|
||||
password = (
|
||||
self.options.password.get_secret_value() if self.options.password else None
|
||||
)
|
||||
with pypdfium2_lock:
|
||||
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
||||
self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
|
||||
self.parser = DoclingPdfParser(loglevel="fatal")
|
||||
self.dp_doc: PdfDocument = self.parser.load(path_or_stream=self.path_or_stream)
|
||||
self.dp_doc: PdfDocument = self.parser.load(
|
||||
path_or_stream=self.path_or_stream, password=password
|
||||
)
|
||||
success = self.dp_doc is not None
|
||||
|
||||
if not success:
|
||||
|
||||
@@ -246,11 +246,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
def supported_formats(cls) -> set[InputFormat]:
|
||||
return {InputFormat.HTML}
|
||||
|
||||
@classmethod
|
||||
@override
|
||||
def get_default_options(cls) -> HTMLBackendOptions:
|
||||
return HTMLBackendOptions()
|
||||
|
||||
@override
|
||||
def convert(self) -> DoclingDocument:
|
||||
_log.debug("Starting HTML conversion...")
|
||||
|
||||
@@ -536,11 +536,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
|
||||
def supported_formats(cls) -> set[InputFormat]:
|
||||
return {InputFormat.MD}
|
||||
|
||||
@classmethod
|
||||
@override
|
||||
def get_default_options(cls) -> MarkdownBackendOptions:
|
||||
return MarkdownBackendOptions()
|
||||
|
||||
def convert(self) -> DoclingDocument:
|
||||
_log.debug("converting Markdown...")
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
||||
from PIL import Image
|
||||
|
||||
from docling.backend.abstract_backend import PaginatedDocumentBackend
|
||||
from docling.datamodel.backend_options import PdfBackendOptions
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import InputDocument
|
||||
|
||||
@@ -50,8 +51,14 @@ class PdfPageBackend(ABC):
|
||||
|
||||
|
||||
class PdfDocumentBackend(PaginatedDocumentBackend):
|
||||
def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
def __init__(
|
||||
self,
|
||||
in_doc: InputDocument,
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
options: PdfBackendOptions = PdfBackendOptions(),
|
||||
):
|
||||
super().__init__(in_doc, path_or_stream, options)
|
||||
self.options: PdfBackendOptions
|
||||
|
||||
if self.input_format is not InputFormat.PDF:
|
||||
if self.input_format is InputFormat.IMAGE:
|
||||
|
||||
@@ -20,6 +20,7 @@ from pypdfium2 import PdfTextPage
|
||||
from pypdfium2._helpers.misc import PdfiumError
|
||||
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.datamodel.backend_options import PdfBackendOptions
|
||||
from docling.utils.locks import pypdfium2_lock
|
||||
|
||||
|
||||
@@ -370,12 +371,20 @@ class PyPdfiumPageBackend(PdfPageBackend):
|
||||
|
||||
|
||||
class PyPdfiumDocumentBackend(PdfDocumentBackend):
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
def __init__(
|
||||
self,
|
||||
in_doc: "InputDocument",
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
options: PdfBackendOptions = PdfBackendOptions(),
|
||||
):
|
||||
super().__init__(in_doc, path_or_stream, options)
|
||||
|
||||
password = (
|
||||
self.options.password.get_secret_value() if self.options.password else None
|
||||
)
|
||||
try:
|
||||
with pypdfium2_lock:
|
||||
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
|
||||
self._pdoc = pdfium.PdfDocument(self.path_or_stream, password=password)
|
||||
except PdfiumError as e:
|
||||
raise RuntimeError(
|
||||
f"pypdfium could not load document with hash {self.document_hash}"
|
||||
|
||||
@@ -51,6 +51,7 @@ from docling.datamodel.asr_model_specs import (
|
||||
WHISPER_TURBO_NATIVE,
|
||||
AsrModelType,
|
||||
)
|
||||
from docling.datamodel.backend_options import PdfBackendOptions
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
FormatToExtensions,
|
||||
@@ -404,6 +405,9 @@ def convert( # noqa: C901
|
||||
pdf_backend: Annotated[
|
||||
PdfBackend, typer.Option(..., help="The PDF backend to use.")
|
||||
] = PdfBackend.DLPARSE_V4,
|
||||
pdf_password: Annotated[
|
||||
Optional[str], typer.Option(..., help="Password for protected PDF documents")
|
||||
] = None,
|
||||
table_mode: Annotated[
|
||||
TableFormerMode,
|
||||
typer.Option(..., help="The mode to use in the table structure model."),
|
||||
@@ -628,6 +632,9 @@ def convert( # noqa: C901
|
||||
pipeline_options: PipelineOptions
|
||||
|
||||
format_options: Dict[InputFormat, FormatOption] = {}
|
||||
pdf_backend_options: Optional[PdfBackendOptions] = PdfBackendOptions(
|
||||
password=pdf_password
|
||||
)
|
||||
|
||||
if pipeline == ProcessingPipeline.STANDARD:
|
||||
pipeline_options = PdfPipelineOptions(
|
||||
@@ -658,8 +665,10 @@ def convert( # noqa: C901
|
||||
backend: Type[PdfDocumentBackend]
|
||||
if pdf_backend == PdfBackend.DLPARSE_V1:
|
||||
backend = DoclingParseDocumentBackend
|
||||
pdf_backend_options = None
|
||||
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
||||
backend = DoclingParseV2DocumentBackend
|
||||
pdf_backend_options = None
|
||||
elif pdf_backend == PdfBackend.DLPARSE_V4:
|
||||
backend = DoclingParseV4DocumentBackend # type: ignore
|
||||
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
||||
@@ -670,6 +679,7 @@ def convert( # noqa: C901
|
||||
pdf_format_option = PdfFormatOption(
|
||||
pipeline_options=pipeline_options,
|
||||
backend=backend, # pdf_backend
|
||||
backend_options=pdf_backend_options,
|
||||
)
|
||||
|
||||
# METS GBS options
|
||||
@@ -816,7 +826,7 @@ def convert( # noqa: C901
|
||||
_log.error(f"{asr_model} is not known")
|
||||
raise ValueError(f"{asr_model} is not known")
|
||||
|
||||
_log.info(f"ASR pipeline_options: {asr_pipeline_options}")
|
||||
_log.debug(f"ASR pipeline_options: {asr_pipeline_options}")
|
||||
|
||||
audio_format_option = AudioFormatOption(
|
||||
pipeline_cls=AsrPipeline,
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
from pathlib import PurePath
|
||||
from typing import Annotated, Literal, Optional, Union
|
||||
|
||||
from pydantic import AnyUrl, BaseModel, Field
|
||||
from pydantic import AnyUrl, BaseModel, Field, SecretStr
|
||||
|
||||
|
||||
class BaseBackendOptions(BaseModel):
|
||||
@@ -64,7 +64,19 @@ class MarkdownBackendOptions(BaseBackendOptions):
|
||||
)
|
||||
|
||||
|
||||
class PdfBackendOptions(BaseBackendOptions):
|
||||
"""Backend options for pdf document backends."""
|
||||
|
||||
kind: Literal["pdf"] = Field("pdf", exclude=True, repr=False)
|
||||
password: Optional[SecretStr] = None
|
||||
|
||||
|
||||
BackendOptions = Annotated[
|
||||
Union[DeclarativeBackendOptions, HTMLBackendOptions, MarkdownBackendOptions],
|
||||
Union[
|
||||
DeclarativeBackendOptions,
|
||||
HTMLBackendOptions,
|
||||
MarkdownBackendOptions,
|
||||
PdfBackendOptions,
|
||||
],
|
||||
Field(discriminator="kind"),
|
||||
]
|
||||
|
||||
@@ -114,7 +114,7 @@ class InputDocument(BaseModel):
|
||||
]
|
||||
valid: bool = Field(True, description="Whether this is is a valid input document.")
|
||||
backend_options: Optional[BackendOptions] = Field(
|
||||
None, description="Custom options for declarative backends."
|
||||
None, description="Custom options for backends."
|
||||
)
|
||||
limits: DocumentLimits = Field(
|
||||
DocumentLimits(), description="Limits in the input document for the conversion."
|
||||
@@ -146,15 +146,6 @@ class InputDocument(BaseModel):
|
||||
self.limits = limits or DocumentLimits()
|
||||
self.format = format
|
||||
|
||||
# check for backend incompatibilities
|
||||
if issubclass(backend, DeclarativeDocumentBackend) and backend_options:
|
||||
if not issubclass(
|
||||
type(backend_options), type(backend.get_default_options())
|
||||
):
|
||||
raise ValueError(
|
||||
"Incompatible types between backend and backend_options arguments."
|
||||
)
|
||||
|
||||
try:
|
||||
if isinstance(path_or_stream, Path):
|
||||
self.file = path_or_stream
|
||||
@@ -214,7 +205,7 @@ class InputDocument(BaseModel):
|
||||
backend: Type[AbstractDocumentBackend],
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
) -> None:
|
||||
if issubclass(backend, DeclarativeDocumentBackend) and self.backend_options:
|
||||
if self.backend_options:
|
||||
self._backend = backend(
|
||||
self,
|
||||
path_or_stream=path_or_stream,
|
||||
|
||||
@@ -31,7 +31,12 @@ from docling.backend.noop_backend import NoOpBackend
|
||||
from docling.backend.webvtt_backend import WebVTTDocumentBackend
|
||||
from docling.backend.xml.jats_backend import JatsDocumentBackend
|
||||
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
|
||||
from docling.datamodel.backend_options import BackendOptions, HTMLBackendOptions
|
||||
from docling.datamodel.backend_options import (
|
||||
BackendOptions,
|
||||
HTMLBackendOptions,
|
||||
MarkdownBackendOptions,
|
||||
PdfBackendOptions,
|
||||
)
|
||||
from docling.datamodel.base_models import (
|
||||
BaseFormatOption,
|
||||
ConversionStatus,
|
||||
@@ -98,7 +103,7 @@ class PowerpointFormatOption(FormatOption):
|
||||
class MarkdownFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
|
||||
backend_options: HTMLBackendOptions = HTMLBackendOptions()
|
||||
backend_options: Optional[MarkdownBackendOptions] = None
|
||||
|
||||
|
||||
class AsciiDocFormatOption(FormatOption):
|
||||
@@ -109,7 +114,7 @@ class AsciiDocFormatOption(FormatOption):
|
||||
class HTMLFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
||||
backend_options: HTMLBackendOptions = HTMLBackendOptions()
|
||||
backend_options: Optional[HTMLBackendOptions] = None
|
||||
|
||||
|
||||
class PatentUsptoFormatOption(FormatOption):
|
||||
@@ -130,6 +135,7 @@ class ImageFormatOption(FormatOption):
|
||||
class PdfFormatOption(FormatOption):
|
||||
pipeline_cls: Type = StandardPdfPipeline
|
||||
backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
|
||||
backend_options: Optional[PdfBackendOptions] = None
|
||||
|
||||
|
||||
class AudioFormatOption(FormatOption):
|
||||
@@ -139,48 +145,24 @@ class AudioFormatOption(FormatOption):
|
||||
|
||||
def _get_default_option(format: InputFormat) -> FormatOption:
|
||||
format_to_default_options = {
|
||||
InputFormat.CSV: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=CsvDocumentBackend
|
||||
),
|
||||
InputFormat.XLSX: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
|
||||
),
|
||||
InputFormat.DOCX: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
|
||||
),
|
||||
InputFormat.PPTX: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
|
||||
),
|
||||
InputFormat.MD: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
|
||||
),
|
||||
InputFormat.ASCIIDOC: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=AsciiDocBackend
|
||||
),
|
||||
InputFormat.HTML: FormatOption(
|
||||
pipeline_cls=SimplePipeline,
|
||||
backend=HTMLDocumentBackend,
|
||||
backend_options=HTMLBackendOptions(),
|
||||
),
|
||||
InputFormat.XML_USPTO: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=PatentUsptoDocumentBackend
|
||||
),
|
||||
InputFormat.XML_JATS: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=JatsDocumentBackend
|
||||
),
|
||||
InputFormat.CSV: CsvFormatOption(),
|
||||
InputFormat.XLSX: ExcelFormatOption(),
|
||||
InputFormat.DOCX: WordFormatOption(),
|
||||
InputFormat.PPTX: PowerpointFormatOption(),
|
||||
InputFormat.MD: MarkdownFormatOption(),
|
||||
InputFormat.ASCIIDOC: AsciiDocFormatOption(),
|
||||
InputFormat.HTML: HTMLFormatOption(),
|
||||
InputFormat.XML_USPTO: PatentUsptoFormatOption(),
|
||||
InputFormat.XML_JATS: XMLJatsFormatOption(),
|
||||
InputFormat.METS_GBS: FormatOption(
|
||||
pipeline_cls=StandardPdfPipeline, backend=MetsGbsDocumentBackend
|
||||
),
|
||||
InputFormat.IMAGE: FormatOption(
|
||||
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
|
||||
),
|
||||
InputFormat.PDF: FormatOption(
|
||||
pipeline_cls=StandardPdfPipeline, backend=DoclingParseV4DocumentBackend
|
||||
),
|
||||
InputFormat.IMAGE: ImageFormatOption(),
|
||||
InputFormat.PDF: PdfFormatOption(),
|
||||
InputFormat.JSON_DOCLING: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
|
||||
),
|
||||
InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
|
||||
InputFormat.AUDIO: AudioFormatOption(),
|
||||
InputFormat.VTT: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=WebVTTDocumentBackend
|
||||
),
|
||||
|
||||
Reference in New Issue
Block a user