Update examples and test cases

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-09 15:20:27 +02:00
parent 080042d06d
commit 0dfbd0b6fc
25 changed files with 181 additions and 150 deletions

View File

@ -20,12 +20,12 @@ repos:
# pass_filenames: false # pass_filenames: false
# language: system # language: system
# files: '\.py$' # files: '\.py$'
# - id: mypy # - id: mypy
# name: MyPy # name: MyPy
# entry: poetry run mypy docling # entry: poetry run mypy docling
# pass_filenames: false # pass_filenames: false
# language: system # language: system
# files: '\.py$' # files: '\.py$'
- id: nbqa_black - id: nbqa_black
name: nbQA Black name: nbQA Black
entry: poetry run nbqa black examples entry: poetry run nbqa black examples

View File

@ -20,7 +20,7 @@ class AbstractDocumentBackend(ABC):
@classmethod @classmethod
@abstractmethod @abstractmethod
def is_paginated(cls) -> bool: def supports_pagination(cls) -> bool:
pass pass
@abstractmethod @abstractmethod

View File

@ -50,7 +50,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def is_valid(self) -> bool: def is_valid(self) -> bool:
return True return True
def is_paginated(cls) -> bool: def supports_pagination(cls) -> bool:
return False return False
def unload(self): def unload(self):

View File

@ -57,7 +57,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.valid return self.valid
def is_paginated(cls) -> bool: def supports_pagination(cls) -> bool:
return True # True? if so, how to handle pages... return True # True? if so, how to handle pages...
def unload(self): def unload(self):

View File

@ -50,7 +50,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def is_valid(self) -> bool: def is_valid(self) -> bool:
return True return True
def is_paginated(cls) -> bool: def supports_pagination(cls) -> bool:
return False return False
def unload(self): def unload(self):

View File

@ -4,7 +4,7 @@ from typing import Iterable, Optional, Set
from docling_core.types.experimental import BoundingBox, Size from docling_core.types.experimental import BoundingBox, Size
from PIL import Image from PIL import Image
from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.abstract_backend import PaginatedDocumentBackend
from docling.datamodel.base_models import Cell, InputFormat from docling.datamodel.base_models import Cell, InputFormat
@ -41,7 +41,7 @@ class PdfPageBackend(ABC):
pass pass
class PdfDocumentBackend(AbstractDocumentBackend): class PdfDocumentBackend(PaginatedDocumentBackend):
@abstractmethod @abstractmethod
def load_page(self, page_no: int) -> PdfPageBackend: def load_page(self, page_no: int) -> PdfPageBackend:
pass pass
@ -55,5 +55,5 @@ class PdfDocumentBackend(AbstractDocumentBackend):
return {InputFormat.PDF} return {InputFormat.PDF}
@classmethod @classmethod
def is_paginated(cls) -> bool: def supports_pagination(cls) -> bool:
return True return True

View File

@ -12,9 +12,10 @@ from docling_core.utils.file import resolve_file_source
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult, DocumentConversionInput from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.pdf_document_converter import PdfDocumentConverter from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch") warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr") warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@ -195,9 +196,13 @@ def convert(
do_table_structure=True, do_table_structure=True,
) )
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
doc_converter = PdfDocumentConverter(
pipeline_options=pipeline_options, doc_converter = DocumentConverter(
pdf_backend=pdf_backend, format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options, backend=pdf_backend
)
}
) )
# Define input files # Define input files

View File

@ -168,5 +168,5 @@ class Page(BaseModel):
class DocumentStream(BaseModel): class DocumentStream(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True) model_config = ConfigDict(arbitrary_types_allowed=True)
filename: str name: str
stream: BytesIO stream: BytesIO

View File

@ -74,14 +74,6 @@ _EMPTY_DOCLING_DOC = DoclingDocument(
description=DescriptionItem(), name="dummy" description=DescriptionItem(), name="dummy"
) # TODO: Stub ) # TODO: Stub
_input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
InputFormat.PDF: DoclingParseDocumentBackend,
InputFormat.HTML: HTMLDocumentBackend,
InputFormat.DOCX: MsWordDocumentBackend,
InputFormat.PPTX: MsPowerpointDocumentBackend,
InputFormat.IMAGE: None,
}
class InputDocument(BaseModel): class InputDocument(BaseModel):
file: PurePath = None file: PurePath = None
@ -110,14 +102,12 @@ class InputDocument(BaseModel):
try: try:
if isinstance(path_or_stream, Path): if isinstance(path_or_stream, Path):
self.file = path_or_stream self.file = path_or_stream
self.filesize = path_or_stream.stat().st_size self.filesize = path_or_stream.stat().st_size
if self.filesize > self.limits.max_file_size: if self.filesize > self.limits.max_file_size:
self.valid = False self.valid = False
else: else:
self.document_hash = create_file_hash(path_or_stream) self.document_hash = create_file_hash(path_or_stream)
self._init_doc(backend, path_or_stream) self._init_doc(backend, path_or_stream)
elif isinstance(path_or_stream, BytesIO): elif isinstance(path_or_stream, BytesIO):
@ -128,12 +118,11 @@ class InputDocument(BaseModel):
self.valid = False self.valid = False
else: else:
self.document_hash = create_file_hash(path_or_stream) self.document_hash = create_file_hash(path_or_stream)
self._init_doc(backend, path_or_stream) self._init_doc(backend, path_or_stream)
# For paginated backends, check if the maximum page count is exceeded. # For paginated backends, check if the maximum page count is exceeded.
if self.valid and self._backend.is_valid(): if self.valid and self._backend.is_valid():
if self._backend.is_paginated(): if self._backend.supports_pagination():
self.page_count = self._backend.page_count() self.page_count = self._backend.page_count()
if not self.page_count <= self.limits.max_num_pages: if not self.page_count <= self.limits.max_num_pages:
self.valid = False self.valid = False
@ -156,12 +145,10 @@ class InputDocument(BaseModel):
path_or_stream: Union[BytesIO, Path], path_or_stream: Union[BytesIO, Path],
) -> None: ) -> None:
if backend is None: if backend is None:
backend = _input_format_default_backends.get(self.format) raise RuntimeError(
if backend is None: f"No backend configuration provided for file {self.file} with format {self.format}. "
self.valid = False f"Please check your format configuration on DocumentConverter."
raise RuntimeError( )
f"Could not find suitable backend for file: {self.file}"
)
self._backend = backend( self._backend = backend(
path_or_stream=path_or_stream, document_hash=self.document_hash path_or_stream=path_or_stream, document_hash=self.document_hash
@ -473,47 +460,45 @@ class DocumentConversionInput(BaseModel):
) -> Iterable[InputDocument]: ) -> Iterable[InputDocument]:
for obj in self._path_or_stream_iterator: for obj in self._path_or_stream_iterator:
format = self._guess_format(obj)
if format not in format_options.keys():
_log.debug(
f"Skipping input document {obj.name} because its format is not in the whitelist."
)
continue
else:
backend = format_options.get(format).backend
if isinstance(obj, Path): if isinstance(obj, Path):
mime = filetype.guess_mime(str(obj))
if mime is None:
if obj.suffix == ".html":
mime = "text/html"
format = MimeTypeToFormat.get(mime)
if format not in format_options.keys():
continue
else:
backend = format_options.get(format).backend
yield InputDocument( yield InputDocument(
path_or_stream=obj, path_or_stream=obj,
format=format, format=format,
filename=obj.name,
limits=self.limits, limits=self.limits,
backend=backend, backend=backend,
) )
elif isinstance(obj, DocumentStream): elif isinstance(obj, DocumentStream):
mime = filetype.guess_mime(obj.stream.read(8192))
obj.stream.seek(0)
if mime is None:
if obj.suffix == ".html":
mime = "text/html"
format = MimeTypeToFormat.get(mime)
if format not in format_options.keys():
continue
else:
backend = format_options.get(format).backend
yield InputDocument( yield InputDocument(
path_or_stream=obj.stream, path_or_stream=obj.stream,
format=format, format=format,
filename=obj.filename, filename=obj.name,
limits=self.limits, limits=self.limits,
backend=backend, backend=backend,
) )
def _guess_format(self, obj):
    """Guess the input format of a file path or an in-memory document stream.

    The MIME type is sniffed with ``filetype.guess_mime``. When sniffing
    yields nothing, a ``.html`` suffix on the path (or on the stream's
    ``name``) is mapped to ``text/html``.

    Args:
        obj: A ``Path`` or a ``DocumentStream``.

    Returns:
        The result of ``MimeTypeToFormat.get(mime)`` for the detected MIME
        type (presumably ``None`` for unmapped types -- verify against the
        mapping), or ``None`` when ``obj`` is neither a ``Path`` nor a
        ``DocumentStream``.
    """
    if isinstance(obj, Path):
        mime = filetype.guess_mime(str(obj))
        suffix = obj.suffix
    elif isinstance(obj, DocumentStream):
        # Sniff only the leading bytes, then rewind so that later consumers
        # of the stream (hashing, backend loading) start from the beginning.
        mime = filetype.guess_mime(obj.stream.read(8192))
        obj.stream.seek(0)
        # DocumentStream has no ``suffix`` attribute; derive it from ``name``.
        suffix = Path(obj.name).suffix
    else:
        # Unsupported input type: report "format unknown" instead of failing
        # later on an unbound ``mime`` variable.
        return None

    # Fallback for HTML, applied only when MIME sniffing found nothing.
    if mime is None and suffix == ".html":
        mime = "text/html"

    return MimeTypeToFormat.get(mime)
@classmethod @classmethod
def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None): def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
paths = [Path(p) for p in paths] paths = [Path(p) for p in paths]

View File

@ -1,7 +1,7 @@
import warnings import warnings
from enum import Enum, auto from enum import Enum, auto
from pathlib import Path from pathlib import Path
from typing import Annotated, Optional, Self, Union from typing import Annotated, Optional, Union
from pydantic import BaseModel, Field, model_validator from pydantic import BaseModel, Field, model_validator
@ -40,7 +40,7 @@ class PdfPipelineOptions(PipelineOptions):
images_scale: Optional[float] = None # if set, the scale for generated images images_scale: Optional[float] = None # if set, the scale for generated images
@model_validator(mode="after") @model_validator(mode="after")
def set_page_images_from_deprecated(self) -> Self: def set_page_images_from_deprecated(self) -> "PdfPipelineOptions":
with warnings.catch_warnings(): with warnings.catch_warnings():
warnings.simplefilter("ignore", DeprecationWarning) warnings.simplefilter("ignore", DeprecationWarning)
default_scale = 1.0 default_scale = 1.0

View File

@ -5,10 +5,21 @@ from pathlib import Path
from typing import Dict, Iterable, List, Optional, Type from typing import Dict, Iterable, List, Optional, Type
import requests import requests
from pydantic import AnyHttpUrl, BaseModel, ConfigDict, TypeAdapter, ValidationError from pydantic import (
AnyHttpUrl,
BaseModel,
ConfigDict,
TypeAdapter,
ValidationError,
field_validator,
model_validator,
)
from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import ConversionStatus, InputFormat from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ( from docling.datamodel.document import (
ConversionResult, ConversionResult,
@ -28,50 +39,53 @@ _log = logging.getLogger(__name__)
class FormatOption(BaseModel): class FormatOption(BaseModel):
pipeline_cls: Type[BaseModelPipeline] pipeline_cls: Type[BaseModelPipeline]
pipeline_options: Optional[PipelineOptions] = None pipeline_options: Optional[PipelineOptions] = None
backend: Optional[Type[AbstractDocumentBackend]] = None backend: Type[AbstractDocumentBackend]
model_config = ConfigDict(arbitrary_types_allowed=True) model_config = ConfigDict(arbitrary_types_allowed=True)
def __init__( @model_validator(mode="after")
self, def set_optional_field_default(self) -> "FormatOption":
pipeline_cls: Type[BaseModelPipeline], if self.pipeline_options is None:
pipeline_options: Optional[PipelineOptions] = None, self.pipeline_options = self.pipeline_cls.get_default_options()
backend: Optional[Type[AbstractDocumentBackend]] = None, return self
):
if pipeline_options is None:
pipeline_options = pipeline_cls.get_default_options()
super().__init__(
pipeline_cls=pipeline_cls, class WordFormatOption(FormatOption):
pipeline_options=pipeline_options, pipeline_cls: Type = SimpleModelPipeline
backend=backend, backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
)
class PowerpointFormatOption(FormatOption):
pipeline_cls: Type = SimpleModelPipeline
backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
class HTMLFormatOption(FormatOption):
pipeline_cls: Type = SimpleModelPipeline
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
class PdfFormatOption(FormatOption): class PdfFormatOption(FormatOption):
def __init__( pipeline_cls: Type = StandardPdfModelPipeline
self, backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
pipeline_cls: Optional[Type[BaseModelPipeline]] = None,
pipeline_options: Optional[PipelineOptions] = None,
backend: Optional[Type[AbstractDocumentBackend]] = None,
):
if pipeline_cls is None:
pipeline_cls = StandardPdfModelPipeline
if backend is None:
backend = DoclingParseDocumentBackend
super().__init__(
pipeline_cls=pipeline_cls,
pipeline_options=pipeline_options,
backend=backend,
)
_format_to_default_options = { _format_to_default_options = {
InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline), InputFormat.DOCX: FormatOption(
InputFormat.PPTX: FormatOption(pipeline_cls=SimpleModelPipeline), pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend
InputFormat.HTML: FormatOption(pipeline_cls=SimpleModelPipeline), ),
InputFormat.IMAGE: FormatOption(pipeline_cls=StandardPdfModelPipeline), InputFormat.PPTX: FormatOption(
InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline), pipeline_cls=SimpleModelPipeline, backend=MsPowerpointDocumentBackend
),
InputFormat.HTML: FormatOption(
pipeline_cls=SimpleModelPipeline, backend=HTMLDocumentBackend
),
InputFormat.IMAGE: FormatOption(
pipeline_cls=StandardPdfModelPipeline, backend=DoclingParseDocumentBackend
),
InputFormat.PDF: FormatOption(
pipeline_cls=StandardPdfModelPipeline, backend=DoclingParseDocumentBackend
),
} }

View File

@ -61,8 +61,13 @@ class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
_log.info(f"Processing document {in_doc.file.name}") _log.info(f"Processing document {in_doc.file.name}")
if not isinstance(in_doc._backend, PdfDocumentBackend): if not isinstance(in_doc._backend, PdfDocumentBackend):
conv_res.status = ConversionStatus.FAILURE raise RuntimeError(
return conv_res f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. "
f"Can not convert this with a PDF pipeline. "
f"Please check your format configuration on DocumentConverter."
)
# conv_res.status = ConversionStatus.FAILURE
# return conv_res
for i in range(0, in_doc.page_count): for i in range(0, in_doc.page_count):
conv_res.pages.append(Page(page_no=i)) conv_res.pages.append(Page(page_no=i))

View File

@ -32,8 +32,13 @@ class SimpleModelPipeline(BaseModelPipeline):
return conv_res return conv_res
if not isinstance(in_doc._backend, DeclarativeDocumentBackend): if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
conv_res.status = ConversionStatus.FAILURE raise RuntimeError(
return conv_res f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a declarative backend. "
f"Can not convert this with simple pipeline. "
f"Please check your format configuration on DocumentConverter."
)
# conv_res.status = ConversionStatus.FAILURE
# return conv_res
# Instead of running a page-level pipeline to build up the document structure, # Instead of running a page-level pipeline to build up the document structure,
# the backend is expected to be of type DeclarativeDocumentBackend, which can output # the backend is expected to be of type DeclarativeDocumentBackend, which can output

View File

@ -12,7 +12,7 @@ from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
USE_EXPERIMENTAL = False USE_EXPERIMENTAL = True
def export_documents( def export_documents(

View File

@ -7,7 +7,7 @@ from typing import Iterable
from docling.datamodel.base_models import ConversionStatus, InputFormat from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult, DocumentConversionInput from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, FormatOption from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
@ -104,9 +104,7 @@ def main():
doc_converter = DocumentConverter( doc_converter = DocumentConverter(
format_options={ format_options={
InputFormat.PDF: FormatOption( InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
pipeline_cls=StandardPdfModelPipeline, pipeline_options=pipeline_options
)
} }
) )

View File

@ -3,9 +3,15 @@ from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import DocumentConversionInput from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption from docling.document_converter import (
DocumentConverter,
FormatOption,
PdfFormatOption,
WordFormatOption,
)
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
@ -22,23 +28,25 @@ input_paths = [
] ]
input = DocumentConversionInput.from_paths(input_paths) input = DocumentConversionInput.from_paths(input_paths)
# for defaults use: ## for defaults use:
doc_converter = DocumentConverter() # doc_converter = DocumentConverter()
# to customize use: ## to customize use:
# doc_converter = DocumentConverter( # all of the below is optional, has internal defaults. doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
# formats=[ formats=[
# InputFormat.PDF, InputFormat.PDF,
# InputFormat.DOCX, InputFormat.DOCX,
# ], # whitelist formats, other files are ignored. ], # whitelist formats, other files are ignored.
# format_options={ format_options={
# InputFormat.PDF: PdfFormatOption(backend=DoclingParseDocumentBackend), InputFormat.PDF: PdfFormatOption(
# InputFormat.DOCX: FormatOption( pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
# pipeline_cls=StandardPdfModelPipeline, backend=MsWordDocumentBackend ), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
# ), InputFormat.DOCX: WordFormatOption(
# # InputFormat.IMAGE: PdfFormatOption(), pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
# }, ),
# ) # InputFormat.IMAGE: PdfFormatOption(),
},
)
conv_results = doc_converter.convert(input) conv_results = doc_converter.convert(input)

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,9 +1,10 @@
from pathlib import Path from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter, PdfFormatOption
from .verify_utils import verify_conversion_result from .verify_utils import verify_conversion_result
@ -22,14 +23,17 @@ def get_pdf_paths():
def get_converter(): def get_converter():
pipeline_options = PipelineOptions() pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True pipeline_options.table_structure_options.do_cell_matching = True
converter = DocumentConverter( converter = DocumentConverter(
pipeline_options=pipeline_options, format_options={
pdf_backend=DoclingParseDocumentBackend, InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options, backend=DoclingParseDocumentBackend
)
}
) )
return converter return converter

View File

@ -4,10 +4,10 @@ from pathlib import Path
import pytest import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import DocumentStream from docling.datamodel.base_models import DocumentStream, InputFormat
from docling.datamodel.document import ConversionResult, DocumentConversionInput from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.datamodel.pipeline_options import PipelineOptions from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter, PdfFormatOption
from .verify_utils import verify_conversion_result from .verify_utils import verify_conversion_result
@ -21,14 +21,17 @@ def get_pdf_path():
@pytest.fixture @pytest.fixture
def converter(): def converter():
pipeline_options = PipelineOptions() pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True pipeline_options.table_structure_options.do_cell_matching = True
converter = DocumentConverter( converter = DocumentConverter(
pipeline_options=pipeline_options, format_options={
pdf_backend=DoclingParseDocumentBackend, InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options, backend=DoclingParseDocumentBackend
)
}
) )
return converter return converter
@ -61,7 +64,7 @@ def test_batch_bytes(converter: DocumentConverter):
print(f"converting {pdf_path}") print(f"converting {pdf_path}")
buf = BytesIO(pdf_path.open("rb").read()) buf = BytesIO(pdf_path.open("rb").read())
docs = [DocumentStream(filename=pdf_path.name, stream=buf)] docs = [DocumentStream(name=pdf_path.name, stream=buf)]
conv_input = DocumentConversionInput.from_streams(docs) conv_input = DocumentConversionInput.from_streams(docs)
results = converter.convert(conv_input) results = converter.convert(conv_input)

View File

@ -3,10 +3,10 @@ from pathlib import Path
import pytest import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import ConversionStatus from docling.datamodel.base_models import ConversionStatus, InputFormat
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PipelineOptions, TableFormerMode from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter, PdfFormatOption
@pytest.fixture @pytest.fixture
@ -17,15 +17,19 @@ def test_doc_path():
def get_converters_with_table_options(): def get_converters_with_table_options():
for cell_matching in [True, False]: for cell_matching in [True, False]:
for mode in [TableFormerMode.FAST, TableFormerMode.ACCURATE]: for mode in [TableFormerMode.FAST, TableFormerMode.ACCURATE]:
pipeline_options = PipelineOptions() pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = cell_matching pipeline_options.table_structure_options.do_cell_matching = cell_matching
pipeline_options.table_structure_options.mode = mode pipeline_options.table_structure_options.mode = mode
converter = DocumentConverter( converter = DocumentConverter(
pipeline_options=pipeline_options, format_options={
pdf_backend=DoclingParseDocumentBackend, InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options,
backend=DoclingParseDocumentBackend,
)
}
) )
yield converter yield converter