mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Update examples and test cases
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
080042d06d
commit
0dfbd0b6fc
@ -20,7 +20,7 @@ class AbstractDocumentBackend(ABC):
|
||||
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def is_paginated(cls) -> bool:
|
||||
def supports_pagination(cls) -> bool:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
|
@ -50,7 +50,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
def is_valid(self) -> bool:
|
||||
return True
|
||||
|
||||
def is_paginated(cls) -> bool:
|
||||
def supports_pagination(cls) -> bool:
|
||||
return False
|
||||
|
||||
def unload(self):
|
||||
|
@ -57,7 +57,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
||||
def is_valid(self) -> bool:
|
||||
return self.valid
|
||||
|
||||
def is_paginated(cls) -> bool:
|
||||
def supports_pagination(cls) -> bool:
|
||||
return True # True? if so, how to handle pages...
|
||||
|
||||
def unload(self):
|
||||
|
@ -50,7 +50,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
def is_valid(self) -> bool:
|
||||
return True
|
||||
|
||||
def is_paginated(cls) -> bool:
|
||||
def supports_pagination(cls) -> bool:
|
||||
return False
|
||||
|
||||
def unload(self):
|
||||
|
@ -4,7 +4,7 @@ from typing import Iterable, Optional, Set
|
||||
from docling_core.types.experimental import BoundingBox, Size
|
||||
from PIL import Image
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.abstract_backend import PaginatedDocumentBackend
|
||||
from docling.datamodel.base_models import Cell, InputFormat
|
||||
|
||||
|
||||
@ -41,7 +41,7 @@ class PdfPageBackend(ABC):
|
||||
pass
|
||||
|
||||
|
||||
class PdfDocumentBackend(AbstractDocumentBackend):
|
||||
class PdfDocumentBackend(PaginatedDocumentBackend):
|
||||
@abstractmethod
|
||||
def load_page(self, page_no: int) -> PdfPageBackend:
|
||||
pass
|
||||
@ -55,5 +55,5 @@ class PdfDocumentBackend(AbstractDocumentBackend):
|
||||
return {InputFormat.PDF}
|
||||
|
||||
@classmethod
|
||||
def is_paginated(cls) -> bool:
|
||||
def supports_pagination(cls) -> bool:
|
||||
return True
|
||||
|
@ -12,9 +12,10 @@ from docling_core.utils.file import resolve_file_source
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.pdf_document_converter import PdfDocumentConverter
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
||||
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
||||
@ -195,9 +196,13 @@ def convert(
|
||||
do_table_structure=True,
|
||||
)
|
||||
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
|
||||
doc_converter = PdfDocumentConverter(
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=pdf_backend,
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_options=pipeline_options, backend=pdf_backend
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
# Define input files
|
||||
|
@ -168,5 +168,5 @@ class Page(BaseModel):
|
||||
class DocumentStream(BaseModel):
|
||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||
|
||||
filename: str
|
||||
name: str
|
||||
stream: BytesIO
|
||||
|
@ -74,14 +74,6 @@ _EMPTY_DOCLING_DOC = DoclingDocument(
|
||||
description=DescriptionItem(), name="dummy"
|
||||
) # TODO: Stub
|
||||
|
||||
_input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
|
||||
InputFormat.PDF: DoclingParseDocumentBackend,
|
||||
InputFormat.HTML: HTMLDocumentBackend,
|
||||
InputFormat.DOCX: MsWordDocumentBackend,
|
||||
InputFormat.PPTX: MsPowerpointDocumentBackend,
|
||||
InputFormat.IMAGE: None,
|
||||
}
|
||||
|
||||
|
||||
class InputDocument(BaseModel):
|
||||
file: PurePath = None
|
||||
@ -110,14 +102,12 @@ class InputDocument(BaseModel):
|
||||
|
||||
try:
|
||||
if isinstance(path_or_stream, Path):
|
||||
|
||||
self.file = path_or_stream
|
||||
self.filesize = path_or_stream.stat().st_size
|
||||
if self.filesize > self.limits.max_file_size:
|
||||
self.valid = False
|
||||
else:
|
||||
self.document_hash = create_file_hash(path_or_stream)
|
||||
|
||||
self._init_doc(backend, path_or_stream)
|
||||
|
||||
elif isinstance(path_or_stream, BytesIO):
|
||||
@ -128,12 +118,11 @@ class InputDocument(BaseModel):
|
||||
self.valid = False
|
||||
else:
|
||||
self.document_hash = create_file_hash(path_or_stream)
|
||||
|
||||
self._init_doc(backend, path_or_stream)
|
||||
|
||||
# For paginated backends, check if the maximum page count is exceeded.
|
||||
if self.valid and self._backend.is_valid():
|
||||
if self._backend.is_paginated():
|
||||
if self._backend.supports_pagination():
|
||||
self.page_count = self._backend.page_count()
|
||||
if not self.page_count <= self.limits.max_num_pages:
|
||||
self.valid = False
|
||||
@ -156,11 +145,9 @@ class InputDocument(BaseModel):
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
) -> None:
|
||||
if backend is None:
|
||||
backend = _input_format_default_backends.get(self.format)
|
||||
if backend is None:
|
||||
self.valid = False
|
||||
raise RuntimeError(
|
||||
f"Could not find suitable backend for file: {self.file}"
|
||||
f"No backend configuration provided for file {self.file} with format {self.format}. "
|
||||
f"Please check your format configuration on DocumentConverter."
|
||||
)
|
||||
|
||||
self._backend = backend(
|
||||
@ -473,47 +460,45 @@ class DocumentConversionInput(BaseModel):
|
||||
) -> Iterable[InputDocument]:
|
||||
|
||||
for obj in self._path_or_stream_iterator:
|
||||
if isinstance(obj, Path):
|
||||
|
||||
mime = filetype.guess_mime(str(obj))
|
||||
if mime is None:
|
||||
if obj.suffix == ".html":
|
||||
mime = "text/html"
|
||||
|
||||
format = MimeTypeToFormat.get(mime)
|
||||
format = self._guess_format(obj)
|
||||
if format not in format_options.keys():
|
||||
_log.debug(
|
||||
f"Skipping input document {obj.name} because its format is not in the whitelist."
|
||||
)
|
||||
continue
|
||||
else:
|
||||
backend = format_options.get(format).backend
|
||||
|
||||
if isinstance(obj, Path):
|
||||
yield InputDocument(
|
||||
path_or_stream=obj,
|
||||
format=format,
|
||||
filename=obj.name,
|
||||
limits=self.limits,
|
||||
backend=backend,
|
||||
)
|
||||
elif isinstance(obj, DocumentStream):
|
||||
mime = filetype.guess_mime(obj.stream.read(8192))
|
||||
obj.stream.seek(0)
|
||||
|
||||
if mime is None:
|
||||
if obj.suffix == ".html":
|
||||
mime = "text/html"
|
||||
|
||||
format = MimeTypeToFormat.get(mime)
|
||||
if format not in format_options.keys():
|
||||
continue
|
||||
else:
|
||||
backend = format_options.get(format).backend
|
||||
|
||||
yield InputDocument(
|
||||
path_or_stream=obj.stream,
|
||||
format=format,
|
||||
filename=obj.filename,
|
||||
filename=obj.name,
|
||||
limits=self.limits,
|
||||
backend=backend,
|
||||
)
|
||||
|
||||
def _guess_format(self, obj: "Union[Path, DocumentStream]") -> "Optional[InputFormat]":
    """Guess the input format of a file path or an in-memory document stream.

    The MIME type is sniffed with ``filetype.guess_mime``. When sniffing
    yields nothing, a ``.html`` suffix (taken from the path, or from the
    stream's ``name``) is mapped to ``text/html``.

    Args:
        obj: Either a ``Path`` pointing at the document, or a
            ``DocumentStream`` carrying its name and bytes.

    Returns:
        The entry of ``MimeTypeToFormat`` for the detected MIME type, or
        ``None`` when the MIME type is unknown or has no mapping.

    Raises:
        TypeError: If ``obj`` is neither a ``Path`` nor a ``DocumentStream``.
    """
    if isinstance(obj, Path):
        mime = filetype.guess_mime(str(obj))
        suffix = obj.suffix
    elif isinstance(obj, DocumentStream):
        mime = filetype.guess_mime(obj.stream.read(8192))
        # Rewind after sniffing: the same stream is handed on for hashing and
        # backend loading, which must see the document from its first byte.
        obj.stream.seek(0)
        # DocumentStream has no ``suffix`` attribute; derive it from the name.
        suffix = Path(obj.name).suffix
    else:
        raise TypeError(
            f"Cannot guess format for object of type {type(obj).__name__}; "
            f"expected Path or DocumentStream."
        )

    # HTML has no magic bytes to sniff, so fall back to the file extension.
    if mime is None and suffix == ".html":
        mime = "text/html"

    return MimeTypeToFormat.get(mime)
|
||||
|
||||
@classmethod
|
||||
def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
|
||||
paths = [Path(p) for p in paths]
|
||||
|
@ -1,7 +1,7 @@
|
||||
import warnings
|
||||
from enum import Enum, auto
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Optional, Self, Union
|
||||
from typing import Annotated, Optional, Union
|
||||
|
||||
from pydantic import BaseModel, Field, model_validator
|
||||
|
||||
@ -40,7 +40,7 @@ class PdfPipelineOptions(PipelineOptions):
|
||||
images_scale: Optional[float] = None # if set, the scale for generated images
|
||||
|
||||
@model_validator(mode="after")
|
||||
def set_page_images_from_deprecated(self) -> Self:
|
||||
def set_page_images_from_deprecated(self) -> "PdfPipelineOptions":
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore", DeprecationWarning)
|
||||
default_scale = 1.0
|
||||
|
@ -5,10 +5,21 @@ from pathlib import Path
|
||||
from typing import Dict, Iterable, List, Optional, Type
|
||||
|
||||
import requests
|
||||
from pydantic import AnyHttpUrl, BaseModel, ConfigDict, TypeAdapter, ValidationError
|
||||
from pydantic import (
|
||||
AnyHttpUrl,
|
||||
BaseModel,
|
||||
ConfigDict,
|
||||
TypeAdapter,
|
||||
ValidationError,
|
||||
field_validator,
|
||||
model_validator,
|
||||
)
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import (
|
||||
ConversionResult,
|
||||
@ -28,50 +39,53 @@ _log = logging.getLogger(__name__)
|
||||
class FormatOption(BaseModel):
|
||||
pipeline_cls: Type[BaseModelPipeline]
|
||||
pipeline_options: Optional[PipelineOptions] = None
|
||||
backend: Optional[Type[AbstractDocumentBackend]] = None
|
||||
backend: Type[AbstractDocumentBackend]
|
||||
|
||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
pipeline_cls: Type[BaseModelPipeline],
|
||||
pipeline_options: Optional[PipelineOptions] = None,
|
||||
backend: Optional[Type[AbstractDocumentBackend]] = None,
|
||||
):
|
||||
if pipeline_options is None:
|
||||
pipeline_options = pipeline_cls.get_default_options()
|
||||
@model_validator(mode="after")
|
||||
def set_optional_field_default(self) -> "FormatOption":
|
||||
if self.pipeline_options is None:
|
||||
self.pipeline_options = self.pipeline_cls.get_default_options()
|
||||
return self
|
||||
|
||||
super().__init__(
|
||||
pipeline_cls=pipeline_cls,
|
||||
pipeline_options=pipeline_options,
|
||||
backend=backend,
|
||||
)
|
||||
|
||||
class WordFormatOption(FormatOption):
    """Format option preconfigured for Word (DOCX) input.

    Defaults the pipeline class to ``SimpleModelPipeline`` and the backend
    to ``MsWordDocumentBackend``; both remain overridable per instance.
    """

    pipeline_cls: Type = SimpleModelPipeline
    backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
|
||||
|
||||
|
||||
class PowerpointFormatOption(FormatOption):
    """Format option preconfigured for PowerPoint (PPTX) input.

    Defaults the pipeline class to ``SimpleModelPipeline`` and the backend
    to ``MsPowerpointDocumentBackend``; both remain overridable per instance.
    """

    pipeline_cls: Type = SimpleModelPipeline
    backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
|
||||
|
||||
|
||||
class HTMLFormatOption(FormatOption):
    """Format option preconfigured for HTML input.

    Defaults the pipeline class to ``SimpleModelPipeline`` and the backend
    to ``HTMLDocumentBackend``; both remain overridable per instance.
    """

    pipeline_cls: Type = SimpleModelPipeline
    backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
||||
|
||||
|
||||
class PdfFormatOption(FormatOption):
|
||||
def __init__(
|
||||
self,
|
||||
pipeline_cls: Optional[Type[BaseModelPipeline]] = None,
|
||||
pipeline_options: Optional[PipelineOptions] = None,
|
||||
backend: Optional[Type[AbstractDocumentBackend]] = None,
|
||||
):
|
||||
if pipeline_cls is None:
|
||||
pipeline_cls = StandardPdfModelPipeline
|
||||
if backend is None:
|
||||
backend = DoclingParseDocumentBackend
|
||||
super().__init__(
|
||||
pipeline_cls=pipeline_cls,
|
||||
pipeline_options=pipeline_options,
|
||||
backend=backend,
|
||||
)
|
||||
pipeline_cls: Type = StandardPdfModelPipeline
|
||||
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
|
||||
|
||||
|
||||
_format_to_default_options = {
|
||||
InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline),
|
||||
InputFormat.PPTX: FormatOption(pipeline_cls=SimpleModelPipeline),
|
||||
InputFormat.HTML: FormatOption(pipeline_cls=SimpleModelPipeline),
|
||||
InputFormat.IMAGE: FormatOption(pipeline_cls=StandardPdfModelPipeline),
|
||||
InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline),
|
||||
InputFormat.DOCX: FormatOption(
|
||||
pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend
|
||||
),
|
||||
InputFormat.PPTX: FormatOption(
|
||||
pipeline_cls=SimpleModelPipeline, backend=MsPowerpointDocumentBackend
|
||||
),
|
||||
InputFormat.HTML: FormatOption(
|
||||
pipeline_cls=SimpleModelPipeline, backend=HTMLDocumentBackend
|
||||
),
|
||||
InputFormat.IMAGE: FormatOption(
|
||||
pipeline_cls=StandardPdfModelPipeline, backend=DoclingParseDocumentBackend
|
||||
),
|
||||
InputFormat.PDF: FormatOption(
|
||||
pipeline_cls=StandardPdfModelPipeline, backend=DoclingParseDocumentBackend
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
|
@ -61,8 +61,13 @@ class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
|
||||
_log.info(f"Processing document {in_doc.file.name}")
|
||||
|
||||
if not isinstance(in_doc._backend, PdfDocumentBackend):
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
return conv_res
|
||||
raise RuntimeError(
|
||||
f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. "
|
||||
f"Can not convert this with a PDF pipeline. "
|
||||
f"Please check your format configuration on DocumentConverter."
|
||||
)
|
||||
# conv_res.status = ConversionStatus.FAILURE
|
||||
# return conv_res
|
||||
|
||||
for i in range(0, in_doc.page_count):
|
||||
conv_res.pages.append(Page(page_no=i))
|
||||
|
@ -32,8 +32,13 @@ class SimpleModelPipeline(BaseModelPipeline):
|
||||
return conv_res
|
||||
|
||||
if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
return conv_res
|
||||
raise RuntimeError(
|
||||
f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a declarative backend. "
|
||||
f"Can not convert this with simple pipeline. "
|
||||
f"Please check your format configuration on DocumentConverter."
|
||||
)
|
||||
# conv_res.status = ConversionStatus.FAILURE
|
||||
# return conv_res
|
||||
|
||||
# Instead of running a page-level pipeline to build up the document structure,
|
||||
# the backend is expected to be of type DeclarativeDocumentBackend, which can output
|
||||
|
@ -12,7 +12,7 @@ from docling.document_converter import DocumentConverter
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
USE_EXPERIMENTAL = False
|
||||
USE_EXPERIMENTAL = True
|
||||
|
||||
|
||||
def export_documents(
|
||||
|
@ -7,7 +7,7 @@ from typing import Iterable
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, FormatOption
|
||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@ -104,9 +104,7 @@ def main():
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: FormatOption(
|
||||
pipeline_cls=StandardPdfModelPipeline, pipeline_options=pipeline_options
|
||||
)
|
||||
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
|
||||
}
|
||||
)
|
||||
|
||||
|
@ -3,9 +3,15 @@ from pathlib import Path
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import DocumentConversionInput
|
||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||
from docling.document_converter import (
|
||||
DocumentConverter,
|
||||
FormatOption,
|
||||
PdfFormatOption,
|
||||
WordFormatOption,
|
||||
)
|
||||
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
|
||||
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
||||
|
||||
@ -22,23 +28,25 @@ input_paths = [
|
||||
]
|
||||
input = DocumentConversionInput.from_paths(input_paths)
|
||||
|
||||
# for defaults use:
|
||||
doc_converter = DocumentConverter()
|
||||
## for defaults use:
|
||||
# doc_converter = DocumentConverter()
|
||||
|
||||
# to customize use:
|
||||
# doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
|
||||
# formats=[
|
||||
# InputFormat.PDF,
|
||||
# InputFormat.DOCX,
|
||||
# ], # whitelist formats, other files are ignored.
|
||||
# format_options={
|
||||
# InputFormat.PDF: PdfFormatOption(backend=DoclingParseDocumentBackend),
|
||||
# InputFormat.DOCX: FormatOption(
|
||||
# pipeline_cls=StandardPdfModelPipeline, backend=MsWordDocumentBackend
|
||||
# ),
|
||||
# # InputFormat.IMAGE: PdfFormatOption(),
|
||||
# },
|
||||
# )
|
||||
## to customize use:
|
||||
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
|
||||
formats=[
|
||||
InputFormat.PDF,
|
||||
InputFormat.DOCX,
|
||||
], # whitelist formats, other files are ignored.
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
|
||||
), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
|
||||
InputFormat.DOCX: WordFormatOption(
|
||||
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
|
||||
),
|
||||
# InputFormat.IMAGE: PdfFormatOption(),
|
||||
},
|
||||
)
|
||||
|
||||
conv_results = doc_converter.convert(input)
|
||||
|
||||
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -1,9 +1,10 @@
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
from .verify_utils import verify_conversion_result
|
||||
|
||||
@ -22,14 +23,17 @@ def get_pdf_paths():
|
||||
|
||||
def get_converter():
|
||||
|
||||
pipeline_options = PipelineOptions()
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.do_ocr = False
|
||||
pipeline_options.do_table_structure = True
|
||||
pipeline_options.table_structure_options.do_cell_matching = True
|
||||
|
||||
converter = DocumentConverter(
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=DoclingParseDocumentBackend,
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_options=pipeline_options, backend=DoclingParseDocumentBackend
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
return converter
|
||||
|
@ -4,10 +4,10 @@ from pathlib import Path
|
||||
import pytest
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import DocumentStream
|
||||
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
from .verify_utils import verify_conversion_result
|
||||
|
||||
@ -21,14 +21,17 @@ def get_pdf_path():
|
||||
@pytest.fixture
|
||||
def converter():
|
||||
|
||||
pipeline_options = PipelineOptions()
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.do_ocr = False
|
||||
pipeline_options.do_table_structure = True
|
||||
pipeline_options.table_structure_options.do_cell_matching = True
|
||||
|
||||
converter = DocumentConverter(
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=DoclingParseDocumentBackend,
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_options=pipeline_options, backend=DoclingParseDocumentBackend
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
return converter
|
||||
@ -61,7 +64,7 @@ def test_batch_bytes(converter: DocumentConverter):
|
||||
print(f"converting {pdf_path}")
|
||||
|
||||
buf = BytesIO(pdf_path.open("rb").read())
|
||||
docs = [DocumentStream(filename=pdf_path.name, stream=buf)]
|
||||
docs = [DocumentStream(name=pdf_path.name, stream=buf)]
|
||||
conv_input = DocumentConversionInput.from_streams(docs)
|
||||
|
||||
results = converter.convert(conv_input)
|
||||
|
@ -3,10 +3,10 @@ from pathlib import Path
|
||||
import pytest
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import PipelineOptions, TableFormerMode
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@ -17,15 +17,19 @@ def test_doc_path():
|
||||
def get_converters_with_table_options():
|
||||
for cell_matching in [True, False]:
|
||||
for mode in [TableFormerMode.FAST, TableFormerMode.ACCURATE]:
|
||||
pipeline_options = PipelineOptions()
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.do_ocr = False
|
||||
pipeline_options.do_table_structure = True
|
||||
pipeline_options.table_structure_options.do_cell_matching = cell_matching
|
||||
pipeline_options.table_structure_options.mode = mode
|
||||
|
||||
converter = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=DoclingParseDocumentBackend,
|
||||
backend=DoclingParseDocumentBackend,
|
||||
)
|
||||
}
|
||||
)
|
||||
|
||||
yield converter
|
||||
|
Loading…
Reference in New Issue
Block a user