Update examples and test cases

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2024-10-09 15:20:27 +02:00
parent 080042d06d
commit 0dfbd0b6fc
25 changed files with 181 additions and 150 deletions

View File

@@ -20,7 +20,7 @@ class AbstractDocumentBackend(ABC):
@classmethod
@abstractmethod
def is_paginated(cls) -> bool:
def supports_pagination(cls) -> bool:
pass
@abstractmethod

View File

@@ -50,7 +50,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def is_valid(self) -> bool:
return True
def is_paginated(cls) -> bool:
def supports_pagination(cls) -> bool:
return False
def unload(self):

View File

@@ -57,7 +57,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
def is_valid(self) -> bool:
return self.valid
def is_paginated(cls) -> bool:
def supports_pagination(cls) -> bool:
return True # True? if so, how to handle pages...
def unload(self):

View File

@@ -50,7 +50,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def is_valid(self) -> bool:
return True
def is_paginated(cls) -> bool:
def supports_pagination(cls) -> bool:
return False
def unload(self):

View File

@@ -4,7 +4,7 @@ from typing import Iterable, Optional, Set
from docling_core.types.experimental import BoundingBox, Size
from PIL import Image
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.abstract_backend import PaginatedDocumentBackend
from docling.datamodel.base_models import Cell, InputFormat
@@ -41,7 +41,7 @@ class PdfPageBackend(ABC):
pass
class PdfDocumentBackend(AbstractDocumentBackend):
class PdfDocumentBackend(PaginatedDocumentBackend):
@abstractmethod
def load_page(self, page_no: int) -> PdfPageBackend:
pass
@@ -55,5 +55,5 @@ class PdfDocumentBackend(AbstractDocumentBackend):
return {InputFormat.PDF}
@classmethod
def is_paginated(cls) -> bool:
def supports_pagination(cls) -> bool:
return True

View File

@@ -12,9 +12,10 @@ from docling_core.utils.file import resolve_file_source
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.pdf_document_converter import PdfDocumentConverter
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -195,9 +196,13 @@ def convert(
do_table_structure=True,
)
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
doc_converter = PdfDocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=pdf_backend,
doc_converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options, backend=pdf_backend
)
}
)
# Define input files

View File

@@ -168,5 +168,5 @@ class Page(BaseModel):
class DocumentStream(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)
filename: str
name: str
stream: BytesIO

View File

@@ -74,14 +74,6 @@ _EMPTY_DOCLING_DOC = DoclingDocument(
description=DescriptionItem(), name="dummy"
) # TODO: Stub
_input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
InputFormat.PDF: DoclingParseDocumentBackend,
InputFormat.HTML: HTMLDocumentBackend,
InputFormat.DOCX: MsWordDocumentBackend,
InputFormat.PPTX: MsPowerpointDocumentBackend,
InputFormat.IMAGE: None,
}
class InputDocument(BaseModel):
file: PurePath = None
@@ -110,14 +102,12 @@ class InputDocument(BaseModel):
try:
if isinstance(path_or_stream, Path):
self.file = path_or_stream
self.filesize = path_or_stream.stat().st_size
if self.filesize > self.limits.max_file_size:
self.valid = False
else:
self.document_hash = create_file_hash(path_or_stream)
self._init_doc(backend, path_or_stream)
elif isinstance(path_or_stream, BytesIO):
@@ -128,12 +118,11 @@ class InputDocument(BaseModel):
self.valid = False
else:
self.document_hash = create_file_hash(path_or_stream)
self._init_doc(backend, path_or_stream)
# For paginated backends, check if the maximum page count is exceeded.
if self.valid and self._backend.is_valid():
if self._backend.is_paginated():
if self._backend.supports_pagination():
self.page_count = self._backend.page_count()
if not self.page_count <= self.limits.max_num_pages:
self.valid = False
@@ -156,12 +145,10 @@ class InputDocument(BaseModel):
path_or_stream: Union[BytesIO, Path],
) -> None:
if backend is None:
backend = _input_format_default_backends.get(self.format)
if backend is None:
self.valid = False
raise RuntimeError(
f"Could not find suitable backend for file: {self.file}"
)
raise RuntimeError(
f"No backend configuration provided for file {self.file} with format {self.format}. "
f"Please check your format configuration on DocumentConverter."
)
self._backend = backend(
path_or_stream=path_or_stream, document_hash=self.document_hash
@@ -473,47 +460,45 @@ class DocumentConversionInput(BaseModel):
) -> Iterable[InputDocument]:
for obj in self._path_or_stream_iterator:
format = self._guess_format(obj)
if format not in format_options.keys():
_log.debug(
f"Skipping input document {obj.name} because its format is not in the whitelist."
)
continue
else:
backend = format_options.get(format).backend
if isinstance(obj, Path):
mime = filetype.guess_mime(str(obj))
if mime is None:
if obj.suffix == ".html":
mime = "text/html"
format = MimeTypeToFormat.get(mime)
if format not in format_options.keys():
continue
else:
backend = format_options.get(format).backend
yield InputDocument(
path_or_stream=obj,
format=format,
filename=obj.name,
limits=self.limits,
backend=backend,
)
elif isinstance(obj, DocumentStream):
mime = filetype.guess_mime(obj.stream.read(8192))
obj.stream.seek(0)
if mime is None:
if obj.suffix == ".html":
mime = "text/html"
format = MimeTypeToFormat.get(mime)
if format not in format_options.keys():
continue
else:
backend = format_options.get(format).backend
yield InputDocument(
path_or_stream=obj.stream,
format=format,
filename=obj.filename,
filename=obj.name,
limits=self.limits,
backend=backend,
)
def _guess_format(self, obj: Union[Path, DocumentStream]):
    """Guess the input format of a path or an in-memory document stream.

    The MIME type is sniffed with ``filetype.guess_mime``. When sniffing
    yields nothing, a ``.html`` extension is mapped to ``text/html``.

    Args:
        obj: Either a ``Path`` to a file, or a ``DocumentStream`` whose
            ``name`` supplies the extension and whose ``stream`` supplies
            the bytes to sniff.

    Returns:
        The result of ``MimeTypeToFormat.get(mime)`` for the detected MIME
        type (presumably ``None`` when the type is not in that mapping --
        confirm against the ``MimeTypeToFormat`` definition).

    Raises:
        RuntimeError: If ``obj`` is neither a ``Path`` nor a
            ``DocumentStream``.
    """
    if isinstance(obj, Path):
        mime = filetype.guess_mime(str(obj))
        suffix = obj.suffix
    elif isinstance(obj, DocumentStream):
        header = obj.stream.read(8192)
        # Rewind after sniffing; otherwise whoever consumes the stream next
        # would start reading 8192 bytes into the document.
        obj.stream.seek(0)
        mime = filetype.guess_mime(header)
        # DocumentStream declares only `name` and `stream`, so the extension
        # has to be derived from the name rather than read from `obj.suffix`.
        suffix = Path(obj.name).suffix
    else:
        # Previously a no-op branch that left `mime` unbound and failed later
        # with an opaque UnboundLocalError; fail here with a clear message.
        raise RuntimeError(
            f"Unexpected input object type for format detection: {type(obj).__name__}"
        )

    # Extension fallback for content that carries no recognizable signature.
    if mime is None and suffix == ".html":
        mime = "text/html"

    return MimeTypeToFormat.get(mime)
@classmethod
def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
paths = [Path(p) for p in paths]

View File

@@ -1,7 +1,7 @@
import warnings
from enum import Enum, auto
from pathlib import Path
from typing import Annotated, Optional, Self, Union
from typing import Annotated, Optional, Union
from pydantic import BaseModel, Field, model_validator
@@ -40,7 +40,7 @@ class PdfPipelineOptions(PipelineOptions):
images_scale: Optional[float] = None # if set, the scale for generated images
@model_validator(mode="after")
def set_page_images_from_deprecated(self) -> Self:
def set_page_images_from_deprecated(self) -> "PdfPipelineOptions":
with warnings.catch_warnings():
warnings.simplefilter("ignore", DeprecationWarning)
default_scale = 1.0

View File

@@ -5,10 +5,21 @@ from pathlib import Path
from typing import Dict, Iterable, List, Optional, Type
import requests
from pydantic import AnyHttpUrl, BaseModel, ConfigDict, TypeAdapter, ValidationError
from pydantic import (
AnyHttpUrl,
BaseModel,
ConfigDict,
TypeAdapter,
ValidationError,
field_validator,
model_validator,
)
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import (
ConversionResult,
@@ -28,50 +39,53 @@ _log = logging.getLogger(__name__)
class FormatOption(BaseModel):
pipeline_cls: Type[BaseModelPipeline]
pipeline_options: Optional[PipelineOptions] = None
backend: Optional[Type[AbstractDocumentBackend]] = None
backend: Type[AbstractDocumentBackend]
model_config = ConfigDict(arbitrary_types_allowed=True)
def __init__(
self,
pipeline_cls: Type[BaseModelPipeline],
pipeline_options: Optional[PipelineOptions] = None,
backend: Optional[Type[AbstractDocumentBackend]] = None,
):
if pipeline_options is None:
pipeline_options = pipeline_cls.get_default_options()
@model_validator(mode="after")
def set_optional_field_default(self) -> "FormatOption":
if self.pipeline_options is None:
self.pipeline_options = self.pipeline_cls.get_default_options()
return self
super().__init__(
pipeline_cls=pipeline_cls,
pipeline_options=pipeline_options,
backend=backend,
)
# FormatOption subclass that presets the pipeline class to SimpleModelPipeline
# and the backend to MsWordDocumentBackend, so neither has to be passed explicitly.
class WordFormatOption(FormatOption):
pipeline_cls: Type = SimpleModelPipeline
backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
# FormatOption subclass that presets the pipeline class to SimpleModelPipeline
# and the backend to MsPowerpointDocumentBackend, so neither has to be passed explicitly.
class PowerpointFormatOption(FormatOption):
pipeline_cls: Type = SimpleModelPipeline
backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
# FormatOption subclass that presets the pipeline class to SimpleModelPipeline
# and the backend to HTMLDocumentBackend, so neither has to be passed explicitly.
class HTMLFormatOption(FormatOption):
pipeline_cls: Type = SimpleModelPipeline
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
class PdfFormatOption(FormatOption):
def __init__(
self,
pipeline_cls: Optional[Type[BaseModelPipeline]] = None,
pipeline_options: Optional[PipelineOptions] = None,
backend: Optional[Type[AbstractDocumentBackend]] = None,
):
if pipeline_cls is None:
pipeline_cls = StandardPdfModelPipeline
if backend is None:
backend = DoclingParseDocumentBackend
super().__init__(
pipeline_cls=pipeline_cls,
pipeline_options=pipeline_options,
backend=backend,
)
pipeline_cls: Type = StandardPdfModelPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
_format_to_default_options = {
InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline),
InputFormat.PPTX: FormatOption(pipeline_cls=SimpleModelPipeline),
InputFormat.HTML: FormatOption(pipeline_cls=SimpleModelPipeline),
InputFormat.IMAGE: FormatOption(pipeline_cls=StandardPdfModelPipeline),
InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline),
InputFormat.DOCX: FormatOption(
pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend
),
InputFormat.PPTX: FormatOption(
pipeline_cls=SimpleModelPipeline, backend=MsPowerpointDocumentBackend
),
InputFormat.HTML: FormatOption(
pipeline_cls=SimpleModelPipeline, backend=HTMLDocumentBackend
),
InputFormat.IMAGE: FormatOption(
pipeline_cls=StandardPdfModelPipeline, backend=DoclingParseDocumentBackend
),
InputFormat.PDF: FormatOption(
pipeline_cls=StandardPdfModelPipeline, backend=DoclingParseDocumentBackend
),
}

View File

@@ -61,8 +61,13 @@ class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
_log.info(f"Processing document {in_doc.file.name}")
if not isinstance(in_doc._backend, PdfDocumentBackend):
conv_res.status = ConversionStatus.FAILURE
return conv_res
raise RuntimeError(
f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. "
f"Can not convert this with a PDF pipeline. "
f"Please check your format configuration on DocumentConverter."
)
# conv_res.status = ConversionStatus.FAILURE
# return conv_res
for i in range(0, in_doc.page_count):
conv_res.pages.append(Page(page_no=i))

View File

@@ -32,8 +32,13 @@ class SimpleModelPipeline(BaseModelPipeline):
return conv_res
if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
conv_res.status = ConversionStatus.FAILURE
return conv_res
raise RuntimeError(
f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a declarative backend. "
f"Can not convert this with simple pipeline. "
f"Please check your format configuration on DocumentConverter."
)
# conv_res.status = ConversionStatus.FAILURE
# return conv_res
# Instead of running a page-level pipeline to build up the document structure,
# the backend is expected to be of type DeclarativeDocumentBackend, which can output