Update examples and test cases

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-09 15:20:27 +02:00
parent 080042d06d
commit 0dfbd0b6fc
25 changed files with 181 additions and 150 deletions

View File

@ -20,12 +20,12 @@ repos:
# pass_filenames: false
# language: system
# files: '\.py$'
# - id: mypy
# name: MyPy
# entry: poetry run mypy docling
# pass_filenames: false
# language: system
# files: '\.py$'
# - id: mypy
# name: MyPy
# entry: poetry run mypy docling
# pass_filenames: false
# language: system
# files: '\.py$'
- id: nbqa_black
name: nbQA Black
entry: poetry run nbqa black examples

View File

@ -20,7 +20,7 @@ class AbstractDocumentBackend(ABC):
@classmethod
@abstractmethod
def is_paginated(cls) -> bool:
def supports_pagination(cls) -> bool:
pass
@abstractmethod

View File

@ -50,7 +50,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def is_valid(self) -> bool:
return True
def is_paginated(cls) -> bool:
def supports_pagination(cls) -> bool:
return False
def unload(self):

View File

@ -57,7 +57,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
def is_valid(self) -> bool:
return self.valid
def is_paginated(cls) -> bool:
def supports_pagination(cls) -> bool:
return True # True? if so, how to handle pages...
def unload(self):

View File

@ -50,7 +50,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def is_valid(self) -> bool:
return True
def is_paginated(cls) -> bool:
def supports_pagination(cls) -> bool:
return False
def unload(self):

View File

@ -4,7 +4,7 @@ from typing import Iterable, Optional, Set
from docling_core.types.experimental import BoundingBox, Size
from PIL import Image
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.abstract_backend import PaginatedDocumentBackend
from docling.datamodel.base_models import Cell, InputFormat
@ -41,7 +41,7 @@ class PdfPageBackend(ABC):
pass
class PdfDocumentBackend(AbstractDocumentBackend):
class PdfDocumentBackend(PaginatedDocumentBackend):
@abstractmethod
def load_page(self, page_no: int) -> PdfPageBackend:
pass
@ -55,5 +55,5 @@ class PdfDocumentBackend(AbstractDocumentBackend):
return {InputFormat.PDF}
@classmethod
def is_paginated(cls) -> bool:
def supports_pagination(cls) -> bool:
return True

View File

@ -12,9 +12,10 @@ from docling_core.utils.file import resolve_file_source
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.pdf_document_converter import PdfDocumentConverter
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@ -195,9 +196,13 @@ def convert(
do_table_structure=True,
)
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
doc_converter = PdfDocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=pdf_backend,
doc_converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options, backend=pdf_backend
)
}
)
# Define input files

View File

@ -168,5 +168,5 @@ class Page(BaseModel):
class DocumentStream(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)
filename: str
name: str
stream: BytesIO

View File

@ -74,14 +74,6 @@ _EMPTY_DOCLING_DOC = DoclingDocument(
description=DescriptionItem(), name="dummy"
) # TODO: Stub
_input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
InputFormat.PDF: DoclingParseDocumentBackend,
InputFormat.HTML: HTMLDocumentBackend,
InputFormat.DOCX: MsWordDocumentBackend,
InputFormat.PPTX: MsPowerpointDocumentBackend,
InputFormat.IMAGE: None,
}
class InputDocument(BaseModel):
file: PurePath = None
@ -110,14 +102,12 @@ class InputDocument(BaseModel):
try:
if isinstance(path_or_stream, Path):
self.file = path_or_stream
self.filesize = path_or_stream.stat().st_size
if self.filesize > self.limits.max_file_size:
self.valid = False
else:
self.document_hash = create_file_hash(path_or_stream)
self._init_doc(backend, path_or_stream)
elif isinstance(path_or_stream, BytesIO):
@ -128,12 +118,11 @@ class InputDocument(BaseModel):
self.valid = False
else:
self.document_hash = create_file_hash(path_or_stream)
self._init_doc(backend, path_or_stream)
# For paginated backends, check if the maximum page count is exceeded.
if self.valid and self._backend.is_valid():
if self._backend.is_paginated():
if self._backend.supports_pagination():
self.page_count = self._backend.page_count()
if not self.page_count <= self.limits.max_num_pages:
self.valid = False
@ -156,12 +145,10 @@ class InputDocument(BaseModel):
path_or_stream: Union[BytesIO, Path],
) -> None:
if backend is None:
backend = _input_format_default_backends.get(self.format)
if backend is None:
self.valid = False
raise RuntimeError(
f"Could not find suitable backend for file: {self.file}"
)
raise RuntimeError(
f"No backend configuration provided for file {self.file} with format {self.format}. "
f"Please check your format configuration on DocumentConverter."
)
self._backend = backend(
path_or_stream=path_or_stream, document_hash=self.document_hash
@ -473,47 +460,45 @@ class DocumentConversionInput(BaseModel):
) -> Iterable[InputDocument]:
for obj in self._path_or_stream_iterator:
format = self._guess_format(obj)
if format not in format_options.keys():
_log.debug(
f"Skipping input document {obj.name} because its format is not in the whitelist."
)
continue
else:
backend = format_options.get(format).backend
if isinstance(obj, Path):
mime = filetype.guess_mime(str(obj))
if mime is None:
if obj.suffix == ".html":
mime = "text/html"
format = MimeTypeToFormat.get(mime)
if format not in format_options.keys():
continue
else:
backend = format_options.get(format).backend
yield InputDocument(
path_or_stream=obj,
format=format,
filename=obj.name,
limits=self.limits,
backend=backend,
)
elif isinstance(obj, DocumentStream):
mime = filetype.guess_mime(obj.stream.read(8192))
obj.stream.seek(0)
if mime is None:
if obj.suffix == ".html":
mime = "text/html"
format = MimeTypeToFormat.get(mime)
if format not in format_options.keys():
continue
else:
backend = format_options.get(format).backend
yield InputDocument(
path_or_stream=obj.stream,
format=format,
filename=obj.filename,
filename=obj.name,
limits=self.limits,
backend=backend,
)
def _guess_format(self, obj):
    """Guess the input format of a file path or an in-memory document stream.

    The MIME type is sniffed with ``filetype.guess_mime``. If sniffing yields
    nothing and the file name ends in ``.html``, ``text/html`` is assumed.
    The MIME type is then looked up in ``MimeTypeToFormat``.

    Args:
        obj: A ``Path`` or a ``DocumentStream``.

    Returns:
        The result of ``MimeTypeToFormat.get(mime)`` for the detected MIME type.

    Raises:
        TypeError: If ``obj`` is neither a ``Path`` nor a ``DocumentStream``.
    """
    if isinstance(obj, Path):
        mime = filetype.guess_mime(str(obj))
        suffix = obj.suffix
    elif isinstance(obj, DocumentStream):
        # Only peek at the header bytes, then rewind so that whoever consumes
        # the stream afterwards reads it from the beginning.
        mime = filetype.guess_mime(obj.stream.read(8192))
        obj.stream.seek(0)
        # DocumentStream carries a plain string `name` rather than a path, so
        # the suffix has to be derived from it.
        suffix = Path(obj.name).suffix
    else:
        # Previously this fell through with `mime` unbound and failed later
        # with an opaque UnboundLocalError; fail with a clear message instead.
        raise TypeError(
            f"Cannot guess format of unsupported input type {type(obj).__name__}; "
            f"expected Path or DocumentStream."
        )

    # HTML has no reliable magic bytes, so fall back to the file extension.
    if mime is None and suffix == ".html":
        mime = "text/html"

    return MimeTypeToFormat.get(mime)
@classmethod
def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
paths = [Path(p) for p in paths]

View File

@ -1,7 +1,7 @@
import warnings
from enum import Enum, auto
from pathlib import Path
from typing import Annotated, Optional, Self, Union
from typing import Annotated, Optional, Union
from pydantic import BaseModel, Field, model_validator
@ -40,7 +40,7 @@ class PdfPipelineOptions(PipelineOptions):
images_scale: Optional[float] = None # if set, the scale for generated images
@model_validator(mode="after")
def set_page_images_from_deprecated(self) -> Self:
def set_page_images_from_deprecated(self) -> "PdfPipelineOptions":
with warnings.catch_warnings():
warnings.simplefilter("ignore", DeprecationWarning)
default_scale = 1.0

View File

@ -5,10 +5,21 @@ from pathlib import Path
from typing import Dict, Iterable, List, Optional, Type
import requests
from pydantic import AnyHttpUrl, BaseModel, ConfigDict, TypeAdapter, ValidationError
from pydantic import (
AnyHttpUrl,
BaseModel,
ConfigDict,
TypeAdapter,
ValidationError,
field_validator,
model_validator,
)
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import (
ConversionResult,
@ -28,50 +39,53 @@ _log = logging.getLogger(__name__)
class FormatOption(BaseModel):
pipeline_cls: Type[BaseModelPipeline]
pipeline_options: Optional[PipelineOptions] = None
backend: Optional[Type[AbstractDocumentBackend]] = None
backend: Type[AbstractDocumentBackend]
model_config = ConfigDict(arbitrary_types_allowed=True)
def __init__(
self,
pipeline_cls: Type[BaseModelPipeline],
pipeline_options: Optional[PipelineOptions] = None,
backend: Optional[Type[AbstractDocumentBackend]] = None,
):
if pipeline_options is None:
pipeline_options = pipeline_cls.get_default_options()
@model_validator(mode="after")
def set_optional_field_default(self) -> "FormatOption":
    """Fill in ``pipeline_options`` when the caller did not provide any.

    Returns:
        This same instance, with ``pipeline_options`` set.
    """
    if self.pipeline_options is None:
        # Ask the configured pipeline class for its default options.
        self.pipeline_options = self.pipeline_cls.get_default_options()
    return self
super().__init__(
pipeline_cls=pipeline_cls,
pipeline_options=pipeline_options,
backend=backend,
)
class WordFormatOption(FormatOption):
    """FormatOption whose defaults are SimpleModelPipeline and MsWordDocumentBackend."""

    # Default pipeline class for this format option.
    pipeline_cls: Type = SimpleModelPipeline
    # Default backend class for this format option.
    backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
class PowerpointFormatOption(FormatOption):
    """FormatOption whose defaults are SimpleModelPipeline and MsPowerpointDocumentBackend."""

    # Default pipeline class for this format option.
    pipeline_cls: Type = SimpleModelPipeline
    # Default backend class for this format option.
    backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
class HTMLFormatOption(FormatOption):
    """FormatOption whose defaults are SimpleModelPipeline and HTMLDocumentBackend."""

    # Default pipeline class for this format option.
    pipeline_cls: Type = SimpleModelPipeline
    # Default backend class for this format option.
    backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
class PdfFormatOption(FormatOption):
def __init__(
self,
pipeline_cls: Optional[Type[BaseModelPipeline]] = None,
pipeline_options: Optional[PipelineOptions] = None,
backend: Optional[Type[AbstractDocumentBackend]] = None,
):
if pipeline_cls is None:
pipeline_cls = StandardPdfModelPipeline
if backend is None:
backend = DoclingParseDocumentBackend
super().__init__(
pipeline_cls=pipeline_cls,
pipeline_options=pipeline_options,
backend=backend,
)
pipeline_cls: Type = StandardPdfModelPipeline
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
_format_to_default_options = {
InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline),
InputFormat.PPTX: FormatOption(pipeline_cls=SimpleModelPipeline),
InputFormat.HTML: FormatOption(pipeline_cls=SimpleModelPipeline),
InputFormat.IMAGE: FormatOption(pipeline_cls=StandardPdfModelPipeline),
InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline),
InputFormat.DOCX: FormatOption(
pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend
),
InputFormat.PPTX: FormatOption(
pipeline_cls=SimpleModelPipeline, backend=MsPowerpointDocumentBackend
),
InputFormat.HTML: FormatOption(
pipeline_cls=SimpleModelPipeline, backend=HTMLDocumentBackend
),
InputFormat.IMAGE: FormatOption(
pipeline_cls=StandardPdfModelPipeline, backend=DoclingParseDocumentBackend
),
InputFormat.PDF: FormatOption(
pipeline_cls=StandardPdfModelPipeline, backend=DoclingParseDocumentBackend
),
}

View File

@ -61,8 +61,13 @@ class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
_log.info(f"Processing document {in_doc.file.name}")
if not isinstance(in_doc._backend, PdfDocumentBackend):
conv_res.status = ConversionStatus.FAILURE
return conv_res
raise RuntimeError(
f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. "
f"Can not convert this with a PDF pipeline. "
f"Please check your format configuration on DocumentConverter."
)
# conv_res.status = ConversionStatus.FAILURE
# return conv_res
for i in range(0, in_doc.page_count):
conv_res.pages.append(Page(page_no=i))

View File

@ -32,8 +32,13 @@ class SimpleModelPipeline(BaseModelPipeline):
return conv_res
if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
conv_res.status = ConversionStatus.FAILURE
return conv_res
raise RuntimeError(
f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a declarative backend. "
f"Can not convert this with simple pipeline. "
f"Please check your format configuration on DocumentConverter."
)
# conv_res.status = ConversionStatus.FAILURE
# return conv_res
# Instead of running a page-level pipeline to build up the document structure,
# the backend is expected to be of type DeclarativeDocumentBackend, which can output

View File

@ -12,7 +12,7 @@ from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__)
USE_EXPERIMENTAL = False
USE_EXPERIMENTAL = True
def export_documents(

View File

@ -7,7 +7,7 @@ from typing import Iterable
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, FormatOption
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
_log = logging.getLogger(__name__)
@ -104,9 +104,7 @@ def main():
doc_converter = DocumentConverter(
format_options={
InputFormat.PDF: FormatOption(
pipeline_cls=StandardPdfModelPipeline, pipeline_options=pipeline_options
)
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)

View File

@ -3,9 +3,15 @@ from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
from docling.document_converter import (
DocumentConverter,
FormatOption,
PdfFormatOption,
WordFormatOption,
)
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
@ -22,23 +28,25 @@ input_paths = [
]
input = DocumentConversionInput.from_paths(input_paths)
# for defaults use:
doc_converter = DocumentConverter()
## for defaults use:
# doc_converter = DocumentConverter()
# to customize use:
# doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
# formats=[
# InputFormat.PDF,
# InputFormat.DOCX,
# ], # whitelist formats, other files are ignored.
# format_options={
# InputFormat.PDF: PdfFormatOption(backend=DoclingParseDocumentBackend),
# InputFormat.DOCX: FormatOption(
# pipeline_cls=StandardPdfModelPipeline, backend=MsWordDocumentBackend
# ),
# # InputFormat.IMAGE: PdfFormatOption(),
# },
# )
## to customize use:
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
formats=[
InputFormat.PDF,
InputFormat.DOCX,
], # whitelist formats, other files are ignored.
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
InputFormat.DOCX: WordFormatOption(
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
),
# InputFormat.IMAGE: PdfFormatOption(),
},
)
conv_results = doc_converter.convert(input)

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,9 +1,10 @@
from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PipelineOptions
from docling.document_converter import DocumentConverter
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from .verify_utils import verify_conversion_result
@ -22,14 +23,17 @@ def get_pdf_paths():
def get_converter():
pipeline_options = PipelineOptions()
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
converter = DocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options, backend=DoclingParseDocumentBackend
)
}
)
return converter

View File

@ -4,10 +4,10 @@ from pathlib import Path
import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import DocumentStream
from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import PipelineOptions
from docling.document_converter import DocumentConverter
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from .verify_utils import verify_conversion_result
@ -21,14 +21,17 @@ def get_pdf_path():
@pytest.fixture
def converter():
pipeline_options = PipelineOptions()
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
converter = DocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options, backend=DoclingParseDocumentBackend
)
}
)
return converter
@ -61,7 +64,7 @@ def test_batch_bytes(converter: DocumentConverter):
print(f"converting {pdf_path}")
buf = BytesIO(pdf_path.open("rb").read())
docs = [DocumentStream(filename=pdf_path.name, stream=buf)]
docs = [DocumentStream(name=pdf_path.name, stream=buf)]
conv_input = DocumentConversionInput.from_streams(docs)
results = converter.convert(conv_input)

View File

@ -3,10 +3,10 @@ from pathlib import Path
import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PipelineOptions, TableFormerMode
from docling.document_converter import DocumentConverter
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
from docling.document_converter import DocumentConverter, PdfFormatOption
@pytest.fixture
@ -17,15 +17,19 @@ def test_doc_path():
def get_converters_with_table_options():
for cell_matching in [True, False]:
for mode in [TableFormerMode.FAST, TableFormerMode.ACCURATE]:
pipeline_options = PipelineOptions()
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = cell_matching
pipeline_options.table_structure_options.mode = mode
converter = DocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options,
backend=DoclingParseDocumentBackend,
)
}
)
yield converter