mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Update examples and test cases
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
080042d06d
commit
0dfbd0b6fc
@ -20,12 +20,12 @@ repos:
|
|||||||
# pass_filenames: false
|
# pass_filenames: false
|
||||||
# language: system
|
# language: system
|
||||||
# files: '\.py$'
|
# files: '\.py$'
|
||||||
# - id: mypy
|
# - id: mypy
|
||||||
# name: MyPy
|
# name: MyPy
|
||||||
# entry: poetry run mypy docling
|
# entry: poetry run mypy docling
|
||||||
# pass_filenames: false
|
# pass_filenames: false
|
||||||
# language: system
|
# language: system
|
||||||
# files: '\.py$'
|
# files: '\.py$'
|
||||||
- id: nbqa_black
|
- id: nbqa_black
|
||||||
name: nbQA Black
|
name: nbQA Black
|
||||||
entry: poetry run nbqa black examples
|
entry: poetry run nbqa black examples
|
||||||
|
@ -20,7 +20,7 @@ class AbstractDocumentBackend(ABC):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def is_paginated(cls) -> bool:
|
def supports_pagination(cls) -> bool:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
|
@ -50,7 +50,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def is_valid(self) -> bool:
|
def is_valid(self) -> bool:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def is_paginated(cls) -> bool:
|
def supports_pagination(cls) -> bool:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def unload(self):
|
def unload(self):
|
||||||
|
@ -57,7 +57,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
def is_valid(self) -> bool:
|
def is_valid(self) -> bool:
|
||||||
return self.valid
|
return self.valid
|
||||||
|
|
||||||
def is_paginated(cls) -> bool:
|
def supports_pagination(cls) -> bool:
|
||||||
return True # True? if so, how to handle pages...
|
return True # True? if so, how to handle pages...
|
||||||
|
|
||||||
def unload(self):
|
def unload(self):
|
||||||
|
@ -50,7 +50,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def is_valid(self) -> bool:
|
def is_valid(self) -> bool:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def is_paginated(cls) -> bool:
|
def supports_pagination(cls) -> bool:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def unload(self):
|
def unload(self):
|
||||||
|
@ -4,7 +4,7 @@ from typing import Iterable, Optional, Set
|
|||||||
from docling_core.types.experimental import BoundingBox, Size
|
from docling_core.types.experimental import BoundingBox, Size
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docling.backend.abstract_backend import PaginatedDocumentBackend
|
||||||
from docling.datamodel.base_models import Cell, InputFormat
|
from docling.datamodel.base_models import Cell, InputFormat
|
||||||
|
|
||||||
|
|
||||||
@ -41,7 +41,7 @@ class PdfPageBackend(ABC):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class PdfDocumentBackend(AbstractDocumentBackend):
|
class PdfDocumentBackend(PaginatedDocumentBackend):
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def load_page(self, page_no: int) -> PdfPageBackend:
|
def load_page(self, page_no: int) -> PdfPageBackend:
|
||||||
pass
|
pass
|
||||||
@ -55,5 +55,5 @@ class PdfDocumentBackend(AbstractDocumentBackend):
|
|||||||
return {InputFormat.PDF}
|
return {InputFormat.PDF}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def is_paginated(cls) -> bool:
|
def supports_pagination(cls) -> bool:
|
||||||
return True
|
return True
|
||||||
|
@ -12,9 +12,10 @@ from docling_core.utils.file import resolve_file_source
|
|||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
|
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||||
from docling.pdf_document_converter import PdfDocumentConverter
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
||||||
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
||||||
@ -195,9 +196,13 @@ def convert(
|
|||||||
do_table_structure=True,
|
do_table_structure=True,
|
||||||
)
|
)
|
||||||
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
|
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
|
||||||
doc_converter = PdfDocumentConverter(
|
|
||||||
pipeline_options=pipeline_options,
|
doc_converter = DocumentConverter(
|
||||||
pdf_backend=pdf_backend,
|
format_options={
|
||||||
|
InputFormat.PDF: PdfFormatOption(
|
||||||
|
pipeline_options=pipeline_options, backend=pdf_backend
|
||||||
|
)
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
# Define input files
|
# Define input files
|
||||||
|
@ -168,5 +168,5 @@ class Page(BaseModel):
|
|||||||
class DocumentStream(BaseModel):
|
class DocumentStream(BaseModel):
|
||||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||||
|
|
||||||
filename: str
|
name: str
|
||||||
stream: BytesIO
|
stream: BytesIO
|
||||||
|
@ -74,14 +74,6 @@ _EMPTY_DOCLING_DOC = DoclingDocument(
|
|||||||
description=DescriptionItem(), name="dummy"
|
description=DescriptionItem(), name="dummy"
|
||||||
) # TODO: Stub
|
) # TODO: Stub
|
||||||
|
|
||||||
_input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
|
|
||||||
InputFormat.PDF: DoclingParseDocumentBackend,
|
|
||||||
InputFormat.HTML: HTMLDocumentBackend,
|
|
||||||
InputFormat.DOCX: MsWordDocumentBackend,
|
|
||||||
InputFormat.PPTX: MsPowerpointDocumentBackend,
|
|
||||||
InputFormat.IMAGE: None,
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class InputDocument(BaseModel):
|
class InputDocument(BaseModel):
|
||||||
file: PurePath = None
|
file: PurePath = None
|
||||||
@ -110,14 +102,12 @@ class InputDocument(BaseModel):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
if isinstance(path_or_stream, Path):
|
if isinstance(path_or_stream, Path):
|
||||||
|
|
||||||
self.file = path_or_stream
|
self.file = path_or_stream
|
||||||
self.filesize = path_or_stream.stat().st_size
|
self.filesize = path_or_stream.stat().st_size
|
||||||
if self.filesize > self.limits.max_file_size:
|
if self.filesize > self.limits.max_file_size:
|
||||||
self.valid = False
|
self.valid = False
|
||||||
else:
|
else:
|
||||||
self.document_hash = create_file_hash(path_or_stream)
|
self.document_hash = create_file_hash(path_or_stream)
|
||||||
|
|
||||||
self._init_doc(backend, path_or_stream)
|
self._init_doc(backend, path_or_stream)
|
||||||
|
|
||||||
elif isinstance(path_or_stream, BytesIO):
|
elif isinstance(path_or_stream, BytesIO):
|
||||||
@ -128,12 +118,11 @@ class InputDocument(BaseModel):
|
|||||||
self.valid = False
|
self.valid = False
|
||||||
else:
|
else:
|
||||||
self.document_hash = create_file_hash(path_or_stream)
|
self.document_hash = create_file_hash(path_or_stream)
|
||||||
|
|
||||||
self._init_doc(backend, path_or_stream)
|
self._init_doc(backend, path_or_stream)
|
||||||
|
|
||||||
# For paginated backends, check if the maximum page count is exceeded.
|
# For paginated backends, check if the maximum page count is exceeded.
|
||||||
if self.valid and self._backend.is_valid():
|
if self.valid and self._backend.is_valid():
|
||||||
if self._backend.is_paginated():
|
if self._backend.supports_pagination():
|
||||||
self.page_count = self._backend.page_count()
|
self.page_count = self._backend.page_count()
|
||||||
if not self.page_count <= self.limits.max_num_pages:
|
if not self.page_count <= self.limits.max_num_pages:
|
||||||
self.valid = False
|
self.valid = False
|
||||||
@ -156,12 +145,10 @@ class InputDocument(BaseModel):
|
|||||||
path_or_stream: Union[BytesIO, Path],
|
path_or_stream: Union[BytesIO, Path],
|
||||||
) -> None:
|
) -> None:
|
||||||
if backend is None:
|
if backend is None:
|
||||||
backend = _input_format_default_backends.get(self.format)
|
raise RuntimeError(
|
||||||
if backend is None:
|
f"No backend configuration provided for file {self.file} with format {self.format}. "
|
||||||
self.valid = False
|
f"Please check your format configuration on DocumentConverter."
|
||||||
raise RuntimeError(
|
)
|
||||||
f"Could not find suitable backend for file: {self.file}"
|
|
||||||
)
|
|
||||||
|
|
||||||
self._backend = backend(
|
self._backend = backend(
|
||||||
path_or_stream=path_or_stream, document_hash=self.document_hash
|
path_or_stream=path_or_stream, document_hash=self.document_hash
|
||||||
@ -473,47 +460,45 @@ class DocumentConversionInput(BaseModel):
|
|||||||
) -> Iterable[InputDocument]:
|
) -> Iterable[InputDocument]:
|
||||||
|
|
||||||
for obj in self._path_or_stream_iterator:
|
for obj in self._path_or_stream_iterator:
|
||||||
|
format = self._guess_format(obj)
|
||||||
|
if format not in format_options.keys():
|
||||||
|
_log.debug(
|
||||||
|
f"Skipping input document {obj.name} because its format is not in the whitelist."
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
backend = format_options.get(format).backend
|
||||||
|
|
||||||
if isinstance(obj, Path):
|
if isinstance(obj, Path):
|
||||||
|
|
||||||
mime = filetype.guess_mime(str(obj))
|
|
||||||
if mime is None:
|
|
||||||
if obj.suffix == ".html":
|
|
||||||
mime = "text/html"
|
|
||||||
|
|
||||||
format = MimeTypeToFormat.get(mime)
|
|
||||||
if format not in format_options.keys():
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
backend = format_options.get(format).backend
|
|
||||||
|
|
||||||
yield InputDocument(
|
yield InputDocument(
|
||||||
path_or_stream=obj,
|
path_or_stream=obj,
|
||||||
format=format,
|
format=format,
|
||||||
|
filename=obj.name,
|
||||||
limits=self.limits,
|
limits=self.limits,
|
||||||
backend=backend,
|
backend=backend,
|
||||||
)
|
)
|
||||||
elif isinstance(obj, DocumentStream):
|
elif isinstance(obj, DocumentStream):
|
||||||
mime = filetype.guess_mime(obj.stream.read(8192))
|
|
||||||
obj.stream.seek(0)
|
|
||||||
|
|
||||||
if mime is None:
|
|
||||||
if obj.suffix == ".html":
|
|
||||||
mime = "text/html"
|
|
||||||
|
|
||||||
format = MimeTypeToFormat.get(mime)
|
|
||||||
if format not in format_options.keys():
|
|
||||||
continue
|
|
||||||
else:
|
|
||||||
backend = format_options.get(format).backend
|
|
||||||
|
|
||||||
yield InputDocument(
|
yield InputDocument(
|
||||||
path_or_stream=obj.stream,
|
path_or_stream=obj.stream,
|
||||||
format=format,
|
format=format,
|
||||||
filename=obj.filename,
|
filename=obj.name,
|
||||||
limits=self.limits,
|
limits=self.limits,
|
||||||
backend=backend,
|
backend=backend,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def _guess_format(self, obj):
|
||||||
|
if isinstance(obj, Path):
|
||||||
|
mime = filetype.guess_mime(str(obj))
|
||||||
|
elif isinstance(obj, DocumentStream):
|
||||||
|
mime = filetype.guess_mime(obj.stream.read(8192))
|
||||||
|
else:
|
||||||
|
1 == 1 # alert!!
|
||||||
|
if mime is None:
|
||||||
|
if obj.suffix == ".html":
|
||||||
|
mime = "text/html"
|
||||||
|
format = MimeTypeToFormat.get(mime)
|
||||||
|
return format
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
|
def from_paths(cls, paths: Iterable[Path], limits: Optional[DocumentLimits] = None):
|
||||||
paths = [Path(p) for p in paths]
|
paths = [Path(p) for p in paths]
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
import warnings
|
import warnings
|
||||||
from enum import Enum, auto
|
from enum import Enum, auto
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Annotated, Optional, Self, Union
|
from typing import Annotated, Optional, Union
|
||||||
|
|
||||||
from pydantic import BaseModel, Field, model_validator
|
from pydantic import BaseModel, Field, model_validator
|
||||||
|
|
||||||
@ -40,7 +40,7 @@ class PdfPipelineOptions(PipelineOptions):
|
|||||||
images_scale: Optional[float] = None # if set, the scale for generated images
|
images_scale: Optional[float] = None # if set, the scale for generated images
|
||||||
|
|
||||||
@model_validator(mode="after")
|
@model_validator(mode="after")
|
||||||
def set_page_images_from_deprecated(self) -> Self:
|
def set_page_images_from_deprecated(self) -> "PdfPipelineOptions":
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
warnings.simplefilter("ignore", DeprecationWarning)
|
warnings.simplefilter("ignore", DeprecationWarning)
|
||||||
default_scale = 1.0
|
default_scale = 1.0
|
||||||
|
@ -5,10 +5,21 @@ from pathlib import Path
|
|||||||
from typing import Dict, Iterable, List, Optional, Type
|
from typing import Dict, Iterable, List, Optional, Type
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from pydantic import AnyHttpUrl, BaseModel, ConfigDict, TypeAdapter, ValidationError
|
from pydantic import (
|
||||||
|
AnyHttpUrl,
|
||||||
|
BaseModel,
|
||||||
|
ConfigDict,
|
||||||
|
TypeAdapter,
|
||||||
|
ValidationError,
|
||||||
|
field_validator,
|
||||||
|
model_validator,
|
||||||
|
)
|
||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
|
from docling.backend.html_backend import HTMLDocumentBackend
|
||||||
|
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||||
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||||
from docling.datamodel.document import (
|
from docling.datamodel.document import (
|
||||||
ConversionResult,
|
ConversionResult,
|
||||||
@ -28,50 +39,53 @@ _log = logging.getLogger(__name__)
|
|||||||
class FormatOption(BaseModel):
|
class FormatOption(BaseModel):
|
||||||
pipeline_cls: Type[BaseModelPipeline]
|
pipeline_cls: Type[BaseModelPipeline]
|
||||||
pipeline_options: Optional[PipelineOptions] = None
|
pipeline_options: Optional[PipelineOptions] = None
|
||||||
backend: Optional[Type[AbstractDocumentBackend]] = None
|
backend: Type[AbstractDocumentBackend]
|
||||||
|
|
||||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||||
|
|
||||||
def __init__(
|
@model_validator(mode="after")
|
||||||
self,
|
def set_optional_field_default(self) -> "FormatOption":
|
||||||
pipeline_cls: Type[BaseModelPipeline],
|
if self.pipeline_options is None:
|
||||||
pipeline_options: Optional[PipelineOptions] = None,
|
self.pipeline_options = self.pipeline_cls.get_default_options()
|
||||||
backend: Optional[Type[AbstractDocumentBackend]] = None,
|
return self
|
||||||
):
|
|
||||||
if pipeline_options is None:
|
|
||||||
pipeline_options = pipeline_cls.get_default_options()
|
|
||||||
|
|
||||||
super().__init__(
|
|
||||||
pipeline_cls=pipeline_cls,
|
class WordFormatOption(FormatOption):
|
||||||
pipeline_options=pipeline_options,
|
pipeline_cls: Type = SimpleModelPipeline
|
||||||
backend=backend,
|
backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
|
||||||
)
|
|
||||||
|
|
||||||
|
class PowerpointFormatOption(FormatOption):
|
||||||
|
pipeline_cls: Type = SimpleModelPipeline
|
||||||
|
backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
|
||||||
|
|
||||||
|
|
||||||
|
class HTMLFormatOption(FormatOption):
|
||||||
|
pipeline_cls: Type = SimpleModelPipeline
|
||||||
|
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
||||||
|
|
||||||
|
|
||||||
class PdfFormatOption(FormatOption):
|
class PdfFormatOption(FormatOption):
|
||||||
def __init__(
|
pipeline_cls: Type = StandardPdfModelPipeline
|
||||||
self,
|
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
|
||||||
pipeline_cls: Optional[Type[BaseModelPipeline]] = None,
|
|
||||||
pipeline_options: Optional[PipelineOptions] = None,
|
|
||||||
backend: Optional[Type[AbstractDocumentBackend]] = None,
|
|
||||||
):
|
|
||||||
if pipeline_cls is None:
|
|
||||||
pipeline_cls = StandardPdfModelPipeline
|
|
||||||
if backend is None:
|
|
||||||
backend = DoclingParseDocumentBackend
|
|
||||||
super().__init__(
|
|
||||||
pipeline_cls=pipeline_cls,
|
|
||||||
pipeline_options=pipeline_options,
|
|
||||||
backend=backend,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
_format_to_default_options = {
|
_format_to_default_options = {
|
||||||
InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline),
|
InputFormat.DOCX: FormatOption(
|
||||||
InputFormat.PPTX: FormatOption(pipeline_cls=SimpleModelPipeline),
|
pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend
|
||||||
InputFormat.HTML: FormatOption(pipeline_cls=SimpleModelPipeline),
|
),
|
||||||
InputFormat.IMAGE: FormatOption(pipeline_cls=StandardPdfModelPipeline),
|
InputFormat.PPTX: FormatOption(
|
||||||
InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline),
|
pipeline_cls=SimpleModelPipeline, backend=MsPowerpointDocumentBackend
|
||||||
|
),
|
||||||
|
InputFormat.HTML: FormatOption(
|
||||||
|
pipeline_cls=SimpleModelPipeline, backend=HTMLDocumentBackend
|
||||||
|
),
|
||||||
|
InputFormat.IMAGE: FormatOption(
|
||||||
|
pipeline_cls=StandardPdfModelPipeline, backend=DoclingParseDocumentBackend
|
||||||
|
),
|
||||||
|
InputFormat.PDF: FormatOption(
|
||||||
|
pipeline_cls=StandardPdfModelPipeline, backend=DoclingParseDocumentBackend
|
||||||
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -61,8 +61,13 @@ class PaginatedModelPipeline(BaseModelPipeline): # TODO this is a bad name.
|
|||||||
_log.info(f"Processing document {in_doc.file.name}")
|
_log.info(f"Processing document {in_doc.file.name}")
|
||||||
|
|
||||||
if not isinstance(in_doc._backend, PdfDocumentBackend):
|
if not isinstance(in_doc._backend, PdfDocumentBackend):
|
||||||
conv_res.status = ConversionStatus.FAILURE
|
raise RuntimeError(
|
||||||
return conv_res
|
f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a PDF backend. "
|
||||||
|
f"Can not convert this with a PDF pipeline. "
|
||||||
|
f"Please check your format configuration on DocumentConverter."
|
||||||
|
)
|
||||||
|
# conv_res.status = ConversionStatus.FAILURE
|
||||||
|
# return conv_res
|
||||||
|
|
||||||
for i in range(0, in_doc.page_count):
|
for i in range(0, in_doc.page_count):
|
||||||
conv_res.pages.append(Page(page_no=i))
|
conv_res.pages.append(Page(page_no=i))
|
||||||
|
@ -32,8 +32,13 @@ class SimpleModelPipeline(BaseModelPipeline):
|
|||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
|
if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
|
||||||
conv_res.status = ConversionStatus.FAILURE
|
raise RuntimeError(
|
||||||
return conv_res
|
f"The selected backend {type(in_doc._backend).__name__} for {in_doc.file} is not a declarative backend. "
|
||||||
|
f"Can not convert this with simple pipeline. "
|
||||||
|
f"Please check your format configuration on DocumentConverter."
|
||||||
|
)
|
||||||
|
# conv_res.status = ConversionStatus.FAILURE
|
||||||
|
# return conv_res
|
||||||
|
|
||||||
# Instead of running a page-level pipeline to build up the document structure,
|
# Instead of running a page-level pipeline to build up the document structure,
|
||||||
# the backend is expected to be of type DeclarativeDocumentBackend, which can output
|
# the backend is expected to be of type DeclarativeDocumentBackend, which can output
|
||||||
|
@ -12,7 +12,7 @@ from docling.document_converter import DocumentConverter
|
|||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
USE_EXPERIMENTAL = False
|
USE_EXPERIMENTAL = True
|
||||||
|
|
||||||
|
|
||||||
def export_documents(
|
def export_documents(
|
||||||
|
@ -7,7 +7,7 @@ from typing import Iterable
|
|||||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.document_converter import DocumentConverter, FormatOption
|
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||||
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
@ -104,9 +104,7 @@ def main():
|
|||||||
|
|
||||||
doc_converter = DocumentConverter(
|
doc_converter = DocumentConverter(
|
||||||
format_options={
|
format_options={
|
||||||
InputFormat.PDF: FormatOption(
|
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
|
||||||
pipeline_cls=StandardPdfModelPipeline, pipeline_options=pipeline_options
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -3,9 +3,15 @@ from pathlib import Path
|
|||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||||
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import DocumentConversionInput
|
from docling.datamodel.document import DocumentConversionInput
|
||||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
from docling.document_converter import (
|
||||||
|
DocumentConverter,
|
||||||
|
FormatOption,
|
||||||
|
PdfFormatOption,
|
||||||
|
WordFormatOption,
|
||||||
|
)
|
||||||
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
|
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
|
||||||
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
||||||
|
|
||||||
@ -22,23 +28,25 @@ input_paths = [
|
|||||||
]
|
]
|
||||||
input = DocumentConversionInput.from_paths(input_paths)
|
input = DocumentConversionInput.from_paths(input_paths)
|
||||||
|
|
||||||
# for defaults use:
|
## for defaults use:
|
||||||
doc_converter = DocumentConverter()
|
# doc_converter = DocumentConverter()
|
||||||
|
|
||||||
# to customize use:
|
## to customize use:
|
||||||
# doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
|
doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
|
||||||
# formats=[
|
formats=[
|
||||||
# InputFormat.PDF,
|
InputFormat.PDF,
|
||||||
# InputFormat.DOCX,
|
InputFormat.DOCX,
|
||||||
# ], # whitelist formats, other files are ignored.
|
], # whitelist formats, other files are ignored.
|
||||||
# format_options={
|
format_options={
|
||||||
# InputFormat.PDF: PdfFormatOption(backend=DoclingParseDocumentBackend),
|
InputFormat.PDF: PdfFormatOption(
|
||||||
# InputFormat.DOCX: FormatOption(
|
pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
|
||||||
# pipeline_cls=StandardPdfModelPipeline, backend=MsWordDocumentBackend
|
), # PdfFormatOption(backend=PyPdfiumDocumentBackend),
|
||||||
# ),
|
InputFormat.DOCX: WordFormatOption(
|
||||||
# # InputFormat.IMAGE: PdfFormatOption(),
|
pipeline_cls=SimpleModelPipeline # , backend=MsWordDocumentBackend
|
||||||
# },
|
),
|
||||||
# )
|
# InputFormat.IMAGE: PdfFormatOption(),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
conv_results = doc_converter.convert(input)
|
conv_results = doc_converter.convert(input)
|
||||||
|
|
||||||
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -1,9 +1,10 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import PipelineOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
|
||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
from .verify_utils import verify_conversion_result
|
from .verify_utils import verify_conversion_result
|
||||||
|
|
||||||
@ -22,14 +23,17 @@ def get_pdf_paths():
|
|||||||
|
|
||||||
def get_converter():
|
def get_converter():
|
||||||
|
|
||||||
pipeline_options = PipelineOptions()
|
pipeline_options = PdfPipelineOptions()
|
||||||
pipeline_options.do_ocr = False
|
pipeline_options.do_ocr = False
|
||||||
pipeline_options.do_table_structure = True
|
pipeline_options.do_table_structure = True
|
||||||
pipeline_options.table_structure_options.do_cell_matching = True
|
pipeline_options.table_structure_options.do_cell_matching = True
|
||||||
|
|
||||||
converter = DocumentConverter(
|
converter = DocumentConverter(
|
||||||
pipeline_options=pipeline_options,
|
format_options={
|
||||||
pdf_backend=DoclingParseDocumentBackend,
|
InputFormat.PDF: PdfFormatOption(
|
||||||
|
pipeline_options=pipeline_options, backend=DoclingParseDocumentBackend
|
||||||
|
)
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
return converter
|
return converter
|
||||||
|
@ -4,10 +4,10 @@ from pathlib import Path
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.datamodel.base_models import DocumentStream
|
from docling.datamodel.base_models import DocumentStream, InputFormat
|
||||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||||
from docling.datamodel.pipeline_options import PipelineOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
|
||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
from .verify_utils import verify_conversion_result
|
from .verify_utils import verify_conversion_result
|
||||||
|
|
||||||
@ -21,14 +21,17 @@ def get_pdf_path():
|
|||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def converter():
|
def converter():
|
||||||
|
|
||||||
pipeline_options = PipelineOptions()
|
pipeline_options = PdfPipelineOptions()
|
||||||
pipeline_options.do_ocr = False
|
pipeline_options.do_ocr = False
|
||||||
pipeline_options.do_table_structure = True
|
pipeline_options.do_table_structure = True
|
||||||
pipeline_options.table_structure_options.do_cell_matching = True
|
pipeline_options.table_structure_options.do_cell_matching = True
|
||||||
|
|
||||||
converter = DocumentConverter(
|
converter = DocumentConverter(
|
||||||
pipeline_options=pipeline_options,
|
format_options={
|
||||||
pdf_backend=DoclingParseDocumentBackend,
|
InputFormat.PDF: PdfFormatOption(
|
||||||
|
pipeline_options=pipeline_options, backend=DoclingParseDocumentBackend
|
||||||
|
)
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
return converter
|
return converter
|
||||||
@ -61,7 +64,7 @@ def test_batch_bytes(converter: DocumentConverter):
|
|||||||
print(f"converting {pdf_path}")
|
print(f"converting {pdf_path}")
|
||||||
|
|
||||||
buf = BytesIO(pdf_path.open("rb").read())
|
buf = BytesIO(pdf_path.open("rb").read())
|
||||||
docs = [DocumentStream(filename=pdf_path.name, stream=buf)]
|
docs = [DocumentStream(name=pdf_path.name, stream=buf)]
|
||||||
conv_input = DocumentConversionInput.from_streams(docs)
|
conv_input = DocumentConversionInput.from_streams(docs)
|
||||||
|
|
||||||
results = converter.convert(conv_input)
|
results = converter.convert(conv_input)
|
||||||
|
@ -3,10 +3,10 @@ from pathlib import Path
|
|||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.datamodel.base_models import ConversionStatus
|
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import PipelineOptions, TableFormerMode
|
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
|
||||||
from docling.document_converter import DocumentConverter
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
@ -17,15 +17,19 @@ def test_doc_path():
|
|||||||
def get_converters_with_table_options():
|
def get_converters_with_table_options():
|
||||||
for cell_matching in [True, False]:
|
for cell_matching in [True, False]:
|
||||||
for mode in [TableFormerMode.FAST, TableFormerMode.ACCURATE]:
|
for mode in [TableFormerMode.FAST, TableFormerMode.ACCURATE]:
|
||||||
pipeline_options = PipelineOptions()
|
pipeline_options = PdfPipelineOptions()
|
||||||
pipeline_options.do_ocr = False
|
pipeline_options.do_ocr = False
|
||||||
pipeline_options.do_table_structure = True
|
pipeline_options.do_table_structure = True
|
||||||
pipeline_options.table_structure_options.do_cell_matching = cell_matching
|
pipeline_options.table_structure_options.do_cell_matching = cell_matching
|
||||||
pipeline_options.table_structure_options.mode = mode
|
pipeline_options.table_structure_options.mode = mode
|
||||||
|
|
||||||
converter = DocumentConverter(
|
converter = DocumentConverter(
|
||||||
pipeline_options=pipeline_options,
|
format_options={
|
||||||
pdf_backend=DoclingParseDocumentBackend,
|
InputFormat.PDF: PdfFormatOption(
|
||||||
|
pipeline_options=pipeline_options,
|
||||||
|
backend=DoclingParseDocumentBackend,
|
||||||
|
)
|
||||||
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
yield converter
|
yield converter
|
||||||
|
Loading…
Reference in New Issue
Block a user