mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-17 00:58:25 +00:00
Update examples and test cases
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -20,7 +20,7 @@ class AbstractDocumentBackend(ABC):
|
||||
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def is_paginated(cls) -> bool:
|
||||
def supports_pagination(cls) -> bool:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
|
||||
@@ -50,7 +50,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
def is_valid(self) -> bool:
|
||||
return True
|
||||
|
||||
def is_paginated(cls) -> bool:
|
||||
def supports_pagination(cls) -> bool:
|
||||
return False
|
||||
|
||||
def unload(self):
|
||||
|
||||
@@ -57,7 +57,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
def is_valid(self) -> bool:
|
||||
return self.valid
|
||||
|
||||
def is_paginated(cls) -> bool:
|
||||
def supports_pagination(cls) -> bool:
|
||||
return True # True? if so, how to handle pages...
|
||||
|
||||
def unload(self):
|
||||
|
||||
@@ -50,7 +50,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
def is_valid(self) -> bool:
|
||||
return True
|
||||
|
||||
def is_paginated(cls) -> bool:
|
||||
def supports_pagination(cls) -> bool:
|
||||
return False
|
||||
|
||||
def unload(self):
|
||||
|
||||
@@ -4,7 +4,7 @@ from typing import Iterable, Optional, Set
|
||||
from docling_core.types.experimental import BoundingBox, Size
|
||||
from PIL import Image
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.abstract_backend import PaginatedDocumentBackend
|
||||
from docling.datamodel.base_models import Cell, InputFormat
|
||||
|
||||
|
||||
@@ -41,7 +41,7 @@ class PdfPageBackend(ABC):
|
||||
pass
|
||||
|
||||
|
||||
class PdfDocumentBackend(AbstractDocumentBackend):
|
||||
class PdfDocumentBackend(PaginatedDocumentBackend):
|
||||
@abstractmethod
|
||||
def load_page(self, page_no: int) -> PdfPageBackend:
|
||||
pass
|
||||
@@ -55,5 +55,5 @@ class PdfDocumentBackend(AbstractDocumentBackend):
|
||||
return {InputFormat.PDF}
|
||||
|
||||
@classmethod
|
||||
def is_paginated(cls) -> bool:
|
||||
def supports_pagination(cls) -> bool:
|
||||
return True
|
||||
|
||||
@@ -12,9 +12,10 @@ from docling_core.utils.file import resolve_file_source
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.pdf_document_converter import PdfDocumentConverter
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
||||
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
||||
@@ -195,9 +196,13 @@ def convert(
|
||||
do_table_structure=True,
|
||||
)
|
||||
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
|
||||
doc_converter = PdfDocumentConverter(
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=pdf_backend,
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_options=pipeline_options, backend=pdf_backend
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
# Define input files
|
||||
|
||||
@@ -168,5 +168,5 @@ class Page(BaseModel):
|
||||
class DocumentStream(BaseModel):
|
||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||
|
||||
filename: str
|
||||
name: str
|
||||
stream: BytesIO
|
||||
|
||||
@@ -74,14 +74,6 @@ _EMPTY_DOCLING_DOC = DoclingDocument(
|
||||
description=DescriptionItem(), name="dummy"
|
||||
) # TODO: Stub
|
||||
|
||||
_input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
|
||||
InputFormat.PDF: DoclingParseDocumentBackend,
|
||||
InputFormat.HTML: HTMLDocumentBackend,
|
||||
InputFormat.DOCX: MsWordDocumentBackend,
|
||||
InputFormat.PPTX: MsPowerpointDocumentBackend,
|
||||
InputFormat.IMAGE: None,
|
||||
}
|
||||
|
||||
|
||||
class InputDocument(BaseModel):
|
||||
file: PurePath = None
|
||||
@@ -110,14 +102,12 @@ class InputDocument(BaseModel):
|
||||
|
||||
try:
|
||||
if isinstance(path_or_stream, Path):
|
||||
|
||||
self.file = path_or_stream
|
||||
self.filesize = path_or_stream.stat().st_size
|
||||
if self.filesize > self.limits.max_file_size:
|
||||
self.valid = False
|
||||
else:
|
||||
self.document_hash = create_file_hash(path_or_stream)
|
||||
|
||||
self._init_doc(backend, path_or_stream)
|
||||
|
||||
elif isinstance(path_or_stream, BytesIO):
|
||||
@@ -128,12 +118,11 @@ class InputDocument(BaseModel):
|
||||
self.valid = False
|
||||
else:
|
||||
self.document_hash = create_file_hash(path_or_stream)
|
||||
|
||||
self._init_doc(backend, path_or_stream)
|
||||
|
||||
# For paginated backends, check if the maximum page count is exceeded.
|
||||
if self.valid and self._backend.is_valid():
|
||||
if self._backend.is_paginated():
|
||||
if self._backend.supports_pagination():
|
||||
self.page_count = self._backend.page_count()
|
||||
if not self.page_count <= self.limits.max_num_pages:
|
||||
self.valid = False
|
||||
@@ -156,12 +145,10 @@ class InputDocument(BaseModel):
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
) -> None:
|
||||
if backend is None:
|
||||
backend = _input_format_default_backends.get(self.format)
|
||||
if backend is None:
|
||||
self.valid = False
|
||||
raise RuntimeError(
|
||||
f"Could not find suitable backend for file: {self.file}"
|
||||
)
|
||||
raise RuntimeError(
|
||||
f"No backend configuration provided for file {self.file} with format {self.format}. "
|
||||
f"Please check your format configuration on DocumentConverter."
|
||||
)
|
||||
|
||||
self._backend = backend(
|
||||
path_or_stream=path_or_stream, document_hash=self.document_hash
|
||||
@@ -473,47 +460,45 @@ class DocumentConversionInput(BaseModel):
|
||||
) -> Iterable[InputDocument]:
|
||||
|
||||
for obj in self._path_or_stream_iterator:
|
||||
format = self._guess_format(obj)
|
||||
if format not in format_options.keys():
|
||||
_log.debug(
|
||||
f"Skipping input document {obj.name} because its format is not in the whitelist."
|
||||
)
|
||||
continue
|
||||
else:
|
||||
backend = format_options.get(format).backend
|
||||
|
||||
if isinstance(obj, Path):
|
||||
|
||||
mime = filetype.guess_mime(str(obj))
|
||||
if mime is None:
|
||||
if obj.suffix == ".html":
|
||||
mime = "text/html"
|
||||
|
||||
format = MimeTypeToFormat.get(mime)
|
||||
if format not in format_options.keys():
|
||||
continue
|
||||
else:
|
||||
backend = format_options.get(format).backend
|
||||
|
||||
yield InputDocument(
|
||||
path_or_stream=obj,
|
||||
format=format,
|
||||
filename=obj.name,
|
||||
limits=self.limits,
|
||||
backend=backend,
|
||||
)
|
||||
elif isinstance(obj, DocumentStream):
|
||||
mime = filetype.guess_mime(obj.stream.read(8192))
|
||||
obj.stream.seek(0)
|
||||
|
||||
if mime is None:
|
||||
if obj.suffix == ".html":
|
||||
mime = "text/html"
|
||||
|
||||
format = MimeTypeToFormat.get(mime)
|
||||
if format not in format_options.keys():
|
||||
continue
|
||||
else:
|
||||
backend = format_options.get(format).backend
|
||||
|
||||
yield InputDocument(
|
||||
path_or_stream=obj.stream,
|
||||
format=format,
|
||||
filename=obj.filename,
|
||||
filename=obj.name,
|
||||
limits=self.limits,
|
||||
backend=backend,
|
||||
)
|
||||
|
||||
def _guess_format(self, obj):
    """Guess the input format of a path or an in-memory document stream.

    The MIME type is sniffed with ``filetype.guess_mime`` and then mapped
    through ``MimeTypeToFormat``. When sniffing yields nothing, a ``.html``
    file-name suffix is treated as ``text/html``.

    Args:
        obj: Either a ``Path`` pointing to a file, or a ``DocumentStream``
            carrying a ``name`` and a ``stream``.

    Returns:
        The entry of ``MimeTypeToFormat`` for the detected MIME type, or
        ``None`` when the MIME type is unknown or has no mapping.

    Raises:
        RuntimeError: If ``obj`` is neither a ``Path`` nor a
            ``DocumentStream``.
    """
    if isinstance(obj, Path):
        mime = filetype.guess_mime(str(obj))
        suffix = obj.suffix
    elif isinstance(obj, DocumentStream):
        # Only the leading bytes are needed for sniffing. Rewind afterwards
        # so that the backend later reads the stream from its beginning.
        mime = filetype.guess_mime(obj.stream.read(8192))
        obj.stream.seek(0)
        # A DocumentStream has no ``suffix``; derive it from its name.
        suffix = Path(obj.name).suffix
    else:
        # Previously this branch was a no-op and the code below failed with
        # an UnboundLocalError on ``mime``; fail with a clear message instead.
        raise RuntimeError(
            f"Cannot guess format for input of unsupported type {type(obj).__name__}."
        )

    # filetype does not recognise HTML; fall back to the file-name suffix.
    if mime is None and suffix == ".html":
        mime = "text/html"

    return MimeTypeToFormat.get(mime)
|
||||
|
||||
@classmethod
|
||||
def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
|
||||
paths = [Path(p) for p in paths]
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import warnings
|
||||
from enum import Enum, auto
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Optional, Self, Union
|
||||
from typing import Annotated, Optional, Union
|
||||
|
||||
from pydantic import BaseModel, Field, model_validator
|
||||
|
||||
@@ -40,7 +40,7 @@ class PdfPipelineOptions(PipelineOptions):
|
||||
images_scale: Optional[float] = None # if set, the scale for generated images
|
||||
|
||||
@model_validator(mode="after")
|
||||
def set_page_images_from_deprecated(self) -> Self:
|
||||
def set_page_images_from_deprecated(self) -> "PdfPipelineOptions":
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore", DeprecationWarning)
|
||||
default_scale = 1.0
|
||||
|
||||
@@ -5,10 +5,21 @@ from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Type
|
||||
|
||||
import requests
|
||||
from pydantic import AnyHttpUrl, BaseModel, ConfigDict, TypeAdapter, ValidationError
|
||||
from pydantic import (
|
||||
AnyHttpUrl,
|
||||
BaseModel,
|
||||
ConfigDict,
|
||||
TypeAdapter,
|
||||
ValidationError,
|
||||
field_validator,
|
||||
model_validator,
|
||||
)
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import (
|
||||
ConversionResult,
|
||||
@@ -28,50 +39,53 @@ _log = logging.getLogger(__name__)
|
||||
class FormatOption(BaseModel):
|
||||
pipeline_cls: Type[BaseModelPipeline]
|
||||
pipeline_options: Optional[PipelineOptions] = None
|
||||
backend: Optional[Type[AbstractDocumentBackend]] = None
|
||||
backend: Type[AbstractDocumentBackend]
|
||||
|
||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
pipeline_cls: Type[BaseModelPipeline],
|
||||
pipeline_options: Optional[PipelineOptions] = None,
|
||||
backend: Optional[Type[AbstractDocumentBackend]] = None,
|
||||
):
|
||||
if pipeline_options is None:
|
||||
pipeline_options = pipeline_cls.get_default_options()
|
||||
@model_validator(mode="after")
|
||||
def set_optional_field_default(self) -> "FormatOption":
|
||||
if self.pipeline_options is None:
|
||||
self.pipeline_options = self.pipeline_cls.get_default_options()
|
||||
return self
|
||||
|
||||
super().__init__(
|
||||
pipeline_cls=pipeline_cls,
|
||||
pipeline_options=pipeline_options,
|
||||
backend=backend,
|
||||
)
|
||||
|
||||
class WordFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimpleModelPipeline
|
||||
backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
|
||||
|
||||
|
||||
class PowerpointFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimpleModelPipeline
|
||||
backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
|
||||
|
||||
|
||||
class HTMLFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimpleModelPipeline
|
||||
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
||||
|
||||
|
||||
class PdfFormatOption(FormatOption):
|
||||
def __init__(
|
||||
self,
|
||||
pipeline_cls: Optional[Type[BaseModelPipeline]] = None,
|
||||
pipeline_options: Optional[PipelineOptions] = None,
|
||||
backend: Optional[Type[AbstractDocumentBackend]] = None,
|
||||
):
|
||||
if pipeline_cls is None:
|
||||
pipeline_cls = StandardPdfModelPipeline
|
||||
if backend is None:
|
||||
backend = DoclingParseDocumentBackend
|
||||
super().__init__(
|
||||
pipeline_cls=pipeline_cls,
|
||||
pipeline_options=pipeline_options,
|
||||
backend=backend,
|
||||
)
|
||||
pipeline_cls: Type = StandardPdfModelPipeline
|
||||
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
|
||||
|
||||
|
||||
_format_to_default_options = {
|
||||
InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline),
|
||||
InputFormat.PPTX: FormatOption(pipeline_cls=SimpleModelPipeline),
|
||||
InputFormat.HTML: FormatOption(pipeline_cls=SimpleModelPipeline),
|
||||
InputFormat.IMAGE: FormatOption(pipeline_cls=StandardPdfModelPipeline),
|
||||
InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline),
|
||||
InputFormat.DOCX: FormatOption(
|
||||
pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend
|
||||
),
|
||||
InputFormat.PPTX: FormatOption(
|
||||
pipeline_cls=SimpleModelPipeline, backend=MsPowerpointDocumentBackend
|
||||
),
|
||||
InputFormat.HTML: FormatOption(
|
||||
pipeline_cls=SimpleModelPipeline, backend=HTMLDocumentBackend
|
||||
),
|
||||
InputFormat.IMAGE: FormatOption(
|
||||
pipeline_cls=StandardPdfModelPipeline, backend=DoclingParseDocumentBackend
|
||||
),
|
||||
InputFormat.PDF: FormatOption(
|
||||
pipeline_cls=StandardPdfModelPipeline, backend=DoclingParseDocumentBackend
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -61,8 +61,13 @@ class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
|
||||
_log.info(f"Processing document {in_doc.file.name}")
|
||||
|
||||
if not isinstance(in_doc._backend, PdfDocumentBackend):
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
return conv_res
|
||||
raise RuntimeError(
|
||||
f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. "
|
||||
f"Can not convert this with a PDF pipeline. "
|
||||
f"Please check your format configuration on DocumentConverter."
|
||||
)
|
||||
# conv_res.status = ConversionStatus.FAILURE
|
||||
# return conv_res
|
||||
|
||||
for i in range(0, in_doc.page_count):
|
||||
conv_res.pages.append(Page(page_no=i))
|
||||
|
||||
@@ -32,8 +32,13 @@ class SimpleModelPipeline(BaseModelPipeline):
|
||||
return conv_res
|
||||
|
||||
if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
return conv_res
|
||||
raise RuntimeError(
|
||||
f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a declarative backend. "
|
||||
f"Can not convert this with simple pipeline. "
|
||||
f"Please check your format configuration on DocumentConverter."
|
||||
)
|
||||
# conv_res.status = ConversionStatus.FAILURE
|
||||
# return conv_res
|
||||
|
||||
# Instead of running a page-level pipeline to build up the document structure,
|
||||
# the backend is expected to be of type DeclarativeDocumentBackend, which can output
|
||||
|
||||
Reference in New Issue
Block a user