Fundamental refactoring for multi-format support

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-01 16:27:22 +02:00
parent cd06d89c2a
commit 1fa7cd9855
34 changed files with 2102 additions and 365 deletions

View File

@ -67,11 +67,12 @@ pip install docling
### Convert a single document
To convert individual PDF documents, use `convert_single()`, for example:
```python
from docling.document_converter import DocumentConverter
from docling.pdf_document_converter import PdfDocumentConverter
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
converter = DocumentConverter()
converter = PdfDocumentConverter()
result = converter.convert_single(source)
print(result.render_as_markdown()) # output: "## Docling Technical Report[...]"
print(result.render_as_doctags()) # output: "<document><title><page_1><loc_20>..."

View File

@ -1,13 +1,11 @@
from abc import ABC, abstractmethod
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Any, Iterable, Optional, Union
from typing import Set, Union
from docling_core.types.experimental import BoundingBox, Size
from PIL import Image
from docling_core.types.experimental import DoclingDocument
if TYPE_CHECKING:
from docling.datamodel.base_models import Cell
from docling.datamodel.base_models import InputFormat
class AbstractDocumentBackend(ABC):
@ -20,6 +18,11 @@ class AbstractDocumentBackend(ABC):
def is_valid(self) -> bool:
pass
@classmethod
@abstractmethod
def is_paginated(cls) -> bool:
pass
@abstractmethod
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
@ -27,45 +30,19 @@ class AbstractDocumentBackend(ABC):
self.path_or_stream = None
class PdfPageBackend(ABC):
@classmethod
@abstractmethod
def get_text_in_rect(self, bbox: "BoundingBox") -> str:
pass
@abstractmethod
def get_text_cells(self) -> Iterable["Cell"]:
pass
@abstractmethod
def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
pass
@abstractmethod
def get_page_image(
self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
) -> Image.Image:
pass
@abstractmethod
def get_size(self) -> "Size":
pass
@abstractmethod
def is_valid(self) -> bool:
pass
@abstractmethod
def unload(self):
def supported_formats(cls) -> Set[InputFormat]:
pass
class PdfDocumentBackend(AbstractDocumentBackend):
@abstractmethod
def load_page(self, page_no: int) -> PdfPageBackend:
pass
class DeclarativeDocumentBackend(AbstractDocumentBackend):
"""DeclarativeDocumentBackend.
A declarative document backend is a backend that can transform to DoclingDocument
straight without a recognition pipeline.
"""
@abstractmethod
def page_count(self) -> int:
def convert(self) -> DoclingDocument:
pass

View File

@ -10,7 +10,7 @@ from docling_parse.docling_parse import pdf_parser
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell
_log = logging.getLogger(__name__)

View File

@ -0,0 +1,40 @@
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from docling_core.types.experimental import (
DescriptionItem,
DocItemLabel,
DoclingDocument,
)
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
class HTMLDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend for HTML input.

    This is a placeholder implementation: ``convert`` does not read the input
    yet and returns a fixed dummy document.
    """

    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
        super().__init__(path_or_stream, document_hash)

    def is_valid(self) -> bool:
        """Report the input as valid; no validation is performed here."""
        return True

    @classmethod
    def is_paginated(cls) -> bool:
        """Return ``False``: this backend does not expose pages."""
        # The base class declares this as an abstract classmethod, and callers
        # expect a real bool rather than an implicit ``None``.
        return False

    def unload(self):
        """Close the input if it is an in-memory stream and drop the reference."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()

        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of input formats handled by this backend."""
        return {InputFormat.HTML}

    def convert(self) -> DoclingDocument:
        """Build and return the (currently dummy) ``DoclingDocument``."""
        # access self.path_or_stream to load stuff
        doc = DoclingDocument(description=DescriptionItem(), name="dummy")
        doc.add_text(text="I am a HTML document.", label=DocItemLabel.TEXT)
        return doc

View File

@ -0,0 +1,38 @@
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from docling_core.types.experimental import (
DescriptionItem,
DocItemLabel,
DoclingDocument,
)
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
class MsPowerpointDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend for PowerPoint (PPTX) input.

    This is a placeholder implementation: ``convert`` does not read the input
    yet and returns a fixed dummy document.
    """

    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
        super().__init__(path_or_stream, document_hash)

    def is_valid(self) -> bool:
        """Report the input as valid; no validation is performed here."""
        return True

    @classmethod
    def is_paginated(cls) -> bool:
        """Return ``False``: this backend does not expose pages."""
        # The base class declares this as an abstract classmethod, and callers
        # expect a real bool rather than an implicit ``None``.
        return False

    def unload(self):
        """Close the input if it is an in-memory stream and drop the reference."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()

        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of input formats handled by this backend."""
        return {InputFormat.PPTX}

    def convert(self) -> DoclingDocument:
        """Build and return the (currently dummy) ``DoclingDocument``."""
        doc = DoclingDocument(description=DescriptionItem(), name="dummy")
        doc.add_text(text="I am a Powerpoint document.", label=DocItemLabel.TEXT)
        return doc

View File

@ -0,0 +1,38 @@
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from docling_core.types.experimental import (
DescriptionItem,
DocItemLabel,
DoclingDocument,
)
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
class MsWordDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend for Word (DOCX) input.

    This is a placeholder implementation: ``convert`` does not read the input
    yet and returns a fixed dummy document.
    """

    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
        super().__init__(path_or_stream, document_hash)

    def is_valid(self) -> bool:
        """Report the input as valid; no validation is performed here."""
        return True

    @classmethod
    def is_paginated(cls) -> bool:
        """Return ``False``: this backend does not expose pages."""
        # The base class declares this as an abstract classmethod, and callers
        # expect a real bool rather than an implicit ``None``.
        return False

    def unload(self):
        """Close the input if it is an in-memory stream and drop the reference."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()

        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of input formats handled by this backend."""
        return {InputFormat.DOCX}

    def convert(self) -> DoclingDocument:
        """Build and return the (currently dummy) ``DoclingDocument``."""
        doc = DoclingDocument(description=DescriptionItem(), name="dummy")
        doc.add_text(text="I am a Word document.", label=DocItemLabel.TEXT)
        return doc

View File

@ -0,0 +1,59 @@
from abc import ABC, abstractmethod
from typing import Iterable, Optional, Set
from docling_core.types.experimental import BoundingBox, Size
from PIL import Image
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.datamodel.base_models import Cell, InputFormat
class PdfPageBackend(ABC):
    """Abstract interface for a single page served by a PDF backend.

    Every method is abstract; concrete backends supply the implementations.
    """

    @abstractmethod
    def get_text_in_rect(self, bbox: "BoundingBox") -> str:
        """Return a string for the region described by ``bbox``."""

    @abstractmethod
    def get_text_cells(self) -> Iterable["Cell"]:
        """Return the page's ``Cell`` objects."""

    # NOTE(review): the parameter is literally named ``float`` and annotated
    # ``int``; presumably a scale factor was intended -- confirm before renaming,
    # since the name is part of the interface.
    @abstractmethod
    def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
        """Return bounding boxes of bitmap regions."""

    @abstractmethod
    def get_page_image(
        self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
    ) -> Image.Image:
        """Return an image of the page for the given ``scale`` and optional ``cropbox``."""

    @abstractmethod
    def get_size(self) -> "Size":
        """Return the page size."""

    @abstractmethod
    def is_valid(self) -> bool:
        """Return whether the page is valid."""

    @abstractmethod
    def unload(self):
        """Release the resources held for this page."""
class PdfDocumentBackend(AbstractDocumentBackend):
    """Abstract document backend for PDF input.

    Fixes the supported format to PDF and declares the backend as paginated;
    page loading and page counting are left to concrete subclasses.
    """

    @classmethod
    def is_paginated(cls) -> bool:
        """PDF backends expose pages."""
        return True

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of input formats handled by PDF backends."""
        return {InputFormat.PDF}

    @abstractmethod
    def page_count(self) -> int:
        """Return the number of pages in the document."""

    @abstractmethod
    def load_page(self, page_no: int) -> PdfPageBackend:
        """Return the page backend for page ``page_no``."""

View File

@ -8,10 +8,10 @@ import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c
from docling_core.types.experimental import BoundingBox, CoordOrigin, Size
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage, PdfTextPage
from pypdfium2 import PdfTextPage
from pypdfium2._helpers.misc import PdfiumError
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell
_log = logging.getLogger(__name__)

View File

@ -12,9 +12,9 @@ from docling_core.utils.file import resolve_file_source
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter
from docling.pdf_document_converter import PdfDocumentConverter
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@ -190,12 +190,12 @@ def convert(
case _:
raise RuntimeError(f"Unexpected backend type {backend}")
pipeline_options = PipelineOptions(
pipeline_options = PdfPipelineOptions(
do_ocr=ocr,
do_table_structure=True,
)
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
doc_converter = DocumentConverter(
doc_converter = PdfDocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=pdf_backend,
)

View File

@ -1,8 +1,8 @@
import copy
import warnings
from enum import Enum, auto
from io import BytesIO
from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
from pathlib import Path
from typing import Annotated, Dict, List, Optional, Union
from docling_core.types.experimental import BoundingBox, Size
from docling_core.types.experimental.document import BasePictureData, TableCell
@ -11,8 +11,6 @@ from PIL.Image import Image
from pydantic import BaseModel, ConfigDict, Field, model_validator
from typing_extensions import Self
from docling.backend.abstract_backend import PdfPageBackend
class ConversionStatus(str, Enum):
PENDING = auto()
@ -30,13 +28,29 @@ class InputFormat(str, Enum):
PDF = auto()
FormatToMimeType = {
InputFormat.DOCX: {
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
},
InputFormat.PPTX: {
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
},
InputFormat.HTML: {"text/html", "application/xhtml+xml"},
InputFormat.IMAGE: {"image/png", "image/jpeg"},
InputFormat.PDF: {"application/pdf"},
}
MimeTypeToFormat = {
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
}
class DocInputType(str, Enum):
PATH = auto()
STREAM = auto()
class DoclingComponentType(str, Enum):
PDF_BACKEND = auto()
DOCUMENT_BACKEND = auto()
MODEL = auto()
DOC_ASSEMBLER = auto()
@ -128,13 +142,13 @@ class Page(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)
page_no: int
page_hash: Optional[str] = None
# page_hash: Optional[str] = None
size: Optional[Size] = None
cells: List[Cell] = []
predictions: PagePredictions = PagePredictions()
assembled: Optional[AssembledUnit] = None
_backend: Optional[PdfPageBackend] = (
_backend: Optional["PdfPageBackend"] = (
None # Internal PDF backend. By default it is cleared during assembling.
)
_default_image_scale: float = 1.0 # Default image scale for external usage.
@ -170,14 +184,16 @@ class TableStructureOptions(BaseModel):
)
class PipelineOptions(BaseModel):
class PipelineOptions(BaseModel): ...
class PdfPipelineOptions(PipelineOptions):
artifacts_path: Optional[Union[Path, str]] = None
do_table_structure: bool = True # True: perform table structure extraction
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
table_structure_options: TableStructureOptions = TableStructureOptions()
class AssembleOptions(BaseModel):
keep_page_images: Annotated[
bool,
Field(

View File

@ -1,9 +1,10 @@
import logging
from io import BytesIO
from pathlib import Path, PurePath
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
from typing import Dict, Iterable, List, Optional, Tuple, Type, Union
from docling_core.types import BaseCell, BaseText
import filetype
from docling_core.types import BaseText
from docling_core.types import Document as DsDocument
from docling_core.types import DocumentDescription as DsDocumentDescription
from docling_core.types import FileInfoObject as DsFileInfoObject
@ -19,8 +20,11 @@ from docling_core.types.experimental import (
from pydantic import BaseModel
from typing_extensions import deprecated
from docling.backend.abstract_backend import AbstractDocumentBackend, PdfDocumentBackend
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import (
AssembledUnit,
ConversionStatus,
@ -28,13 +32,14 @@ from docling.datamodel.base_models import (
ErrorItem,
FigureElement,
InputFormat,
MimeTypeToFormat,
Page,
PageElement,
Table,
TextElement,
)
from docling.datamodel.settings import DocumentLimits
from docling.utils.utils import create_file_hash
from docling.utils.utils import create_file_hash, create_hash
_log = logging.getLogger(__name__)
@ -71,8 +76,9 @@ _EMPTY_DOCLING_DOC = DoclingDocument(
_input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
InputFormat.PDF: DoclingParseDocumentBackend,
InputFormat.DOCX: None,
InputFormat.PPTX: None,
InputFormat.HTML: HTMLDocumentBackend,
InputFormat.DOCX: MsWordDocumentBackend,
InputFormat.PPTX: MsPowerpointDocumentBackend,
InputFormat.IMAGE: None,
}
@ -80,13 +86,14 @@ _input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]]
class InputDocument(BaseModel):
file: PurePath = None
document_hash: Optional[str] = None
valid: bool = False
valid: bool = True
limits: DocumentLimits = DocumentLimits()
format: Optional[InputFormat] = None
filesize: Optional[int] = None
page_count: Optional[int] = None
page_count: int = 0
_backend: PdfDocumentBackend = None # Internal PDF backend used
_backend: AbstractDocumentBackend = None # Internal PDF backend used
def __init__(
self,
@ -94,27 +101,31 @@ class InputDocument(BaseModel):
filename: Optional[str] = None,
limits: Optional[DocumentLimits] = None,
backend: Optional[Type[AbstractDocumentBackend]] = None,
format: Optional[InputFormat] = None,
):
super().__init__()
if not backend:
backend = _input_format_default_backends[InputFormat.PDF]
self.limits = limits or DocumentLimits()
try:
if isinstance(path_or_stream, Path):
mime = filetype.guess_mime(str(path_or_stream))
if mime is None:
if path_or_stream.suffix == ".html":
mime = "text/html"
self.file = path_or_stream
self.filesize = path_or_stream.stat().st_size
if self.filesize > self.limits.max_file_size:
self.valid = False
else:
self.document_hash = create_file_hash(path_or_stream)
self._backend = backend(
path_or_stream=path_or_stream, document_hash=self.document_hash
)
self._init_doc(backend, mime, path_or_stream)
elif isinstance(path_or_stream, BytesIO):
mime = filetype.guess_mime(path_or_stream.read(8192))
self.file = PurePath(filename)
self.filesize = path_or_stream.getbuffer().nbytes
@ -122,15 +133,15 @@ class InputDocument(BaseModel):
self.valid = False
else:
self.document_hash = create_file_hash(path_or_stream)
self._backend = backend(
path_or_stream=path_or_stream, document_hash=self.document_hash
)
if self.document_hash and self._backend.page_count() > 0:
self._init_doc(backend, mime, path_or_stream)
# For paginated backends, check if the maximum page count is exceeded.
if self.valid and self._backend.is_valid():
if self._backend.is_paginated():
self.page_count = self._backend.page_count()
if self.page_count <= self.limits.max_num_pages:
self.valid = True
if not self.page_count <= self.limits.max_num_pages:
self.valid = False
except (FileNotFoundError, OSError) as e:
_log.exception(
@ -144,6 +155,27 @@ class InputDocument(BaseModel):
)
# raise
def _init_doc(
    self,
    backend: AbstractDocumentBackend,
    mime: str,
    path_or_stream: Union[BytesIO, Path],
) -> None:
    """Resolve the input format from ``mime`` and instantiate the backend.

    ``backend`` is the backend class handed to the constructor (it may be
    falsy, in which case the default backend registered for the detected
    format is used). ``mime`` is looked up in ``MimeTypeToFormat``; an
    unknown or missing MIME type leaves ``self.format`` as ``None``.

    The document is marked invalid when the format is unknown or when the
    chosen backend does not list the format as supported.

    Raises:
        RuntimeError: if the format is known but no backend was given and
            no default backend is registered for it.
    """
    self.format = MimeTypeToFormat.get(mime)

    # Unknown format: nothing to instantiate.
    if self.format is None:
        # TODO decide if to raise exception here too.
        self.valid = False
        return

    if not backend:
        backend = _input_format_default_backends.get(self.format)
    if backend is None:
        raise RuntimeError(
            f"Could not find suitable default backend for format: {self.format}"
        )

    # The selected backend must declare support for the detected format.
    if self.format not in backend.supported_formats():
        # TODO decide if to raise exception here too.
        self.valid = False
        return

    self._backend = backend(
        path_or_stream=path_or_stream, document_hash=self.document_hash
    )
@deprecated("Use `ConversionResult` instead.")
class ConvertedDocument(BaseModel):
@ -163,7 +195,11 @@ class ConvertedDocument(BaseModel):
desc = DsDocumentDescription(logs=[])
page_hashes = [
PageReference(hash=p.page_hash, page=p.page_no + 1, model="default")
PageReference(
hash=create_hash(self.input.document_hash + ":" + str(p.page_no)),
page=p.page_no + 1,
model="default",
)
for p in self.pages
]
@ -441,25 +477,21 @@ class DocumentConversionInput(BaseModel):
_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
limits: Optional[DocumentLimits] = DocumentLimits()
DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
def docs(
self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
self, backend: Optional[Type[AbstractDocumentBackend]] = None
) -> Iterable[InputDocument]:
pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND
for obj in self._path_or_stream_iterator:
if isinstance(obj, Path):
yield InputDocument(
path_or_stream=obj, limits=self.limits, backend=pdf_backend
path_or_stream=obj, limits=self.limits, backend=backend
)
elif isinstance(obj, DocumentStream):
yield InputDocument(
path_or_stream=obj.stream,
filename=obj.filename,
limits=self.limits,
backend=pdf_backend,
backend=backend,
)
@classmethod

View File

@ -1,81 +1,78 @@
import functools
import logging
import tempfile
import time
import traceback
from pathlib import Path
from typing import Iterable, Optional, Type, Union
from typing import Dict, Iterable, List, Optional, Type
import requests
from PIL import ImageDraw
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
from pydantic import AnyHttpUrl, BaseModel, ConfigDict, TypeAdapter, ValidationError
from docling.backend.abstract_backend import PdfDocumentBackend
from docling.datamodel.base_models import (
AssembledUnit,
AssembleOptions,
ConversionStatus,
DoclingComponentType,
ErrorItem,
Page,
PipelineOptions,
)
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.datamodel.base_models import ConversionStatus, InputFormat, PipelineOptions
from docling.datamodel.document import (
ConversionResult,
DocumentConversionInput,
InputDocument,
)
from docling.datamodel.settings import settings
from docling.models.ds_glm_model import GlmModel
from docling.models.page_assemble_model import PageAssembleModel
from docling.pipeline.base_model_pipeline import BaseModelPipeline
from docling.pipeline.standard_model_pipeline import StandardModelPipeline
from docling.utils.utils import chunkify, create_hash
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
from docling.utils.utils import chunkify
_log = logging.getLogger(__name__)
class DocumentConverter:
_default_download_filename = "file.pdf"
class FormatOption(BaseModel):
pipeline_cls: Type[BaseModelPipeline]
pipeline_options: Optional[PipelineOptions] = None
backend: Optional[Type[AbstractDocumentBackend]]
model_config = ConfigDict(arbitrary_types_allowed=True)
def __init__(
self,
artifacts_path: Optional[Union[Path, str]] = None,
pipeline_options: PipelineOptions = PipelineOptions(),
pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
assemble_options: AssembleOptions = AssembleOptions(),
pipeline_cls: Type[BaseModelPipeline],
pipeline_options: Optional[PipelineOptions] = None,
backend: Optional[Type[AbstractDocumentBackend]] = None,
):
if not artifacts_path:
artifacts_path = self.download_models_hf()
if pipeline_options is None:
pipeline_options = pipeline_cls.get_default_options()
artifacts_path = Path(artifacts_path)
self.model_pipeline = pipeline_cls(
artifacts_path=artifacts_path, pipeline_options=pipeline_options
super().__init__(
pipeline_cls=pipeline_cls,
pipeline_options=pipeline_options,
backend=backend,
)
self.page_assemble_model = PageAssembleModel(config={})
self.glm_model = GlmModel(config={})
self.pdf_backend = pdf_backend
self.assemble_options = assemble_options
@staticmethod
def download_models_hf(
local_dir: Optional[Path] = None, force: bool = False
) -> Path:
from huggingface_hub import snapshot_download
_format_to_default_options = {
InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline),
InputFormat.PPTX: FormatOption(pipeline_cls=SimpleModelPipeline),
InputFormat.HTML: FormatOption(pipeline_cls=SimpleModelPipeline),
InputFormat.IMAGE: None,
InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline),
}
download_path = snapshot_download(
repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
class DocumentConverter:
_default_download_filename = "file"
def __init__(
    self,
    formats: List[InputFormat] = [e for e in InputFormat],
    format_options: Dict[InputFormat, FormatOption] = _format_to_default_options,
):
    """Create a converter for the given formats and per-format options.

    Args:
        formats: Input formats this converter accepts. Defaults to every
            member of ``InputFormat``.
        format_options: Mapping from input format to its ``FormatOption``.
            Defaults to the module-level default table.
    """
    # Both defaults are mutable objects created once at definition time.
    # Store shallow copies so that mutating one converter's configuration
    # never leaks into other instances or into the module-level defaults.
    self.formats = list(formats)
    self.format_to_options = dict(format_options)
    # Pipelines are instantiated on first use and cached per pipeline class.
    self.initialized_pipelines: Dict[Type[BaseModelPipeline], BaseModelPipeline] = (
        {}
    )
return Path(download_path)
def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
for input_batch in chunkify(
input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
input.docs(), settings.perf.doc_batch_size # pass format_options
):
_log.info(f"Going to convert document batch...")
# parallel processing only within input_batch
@ -84,8 +81,8 @@ class DocumentConverter:
# ) as pool:
# yield from pool.map(self.process_document, input_batch)
# Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
yield from map(self._process_document, input_batch)
# Note: PDF backends are not thread-safe, thread pool usage was disabled.
yield from map(self.process_document, input_batch)
def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
"""Convert a single document.
@ -137,156 +134,42 @@ class DocumentConverter:
raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
return conv_res
def _process_document(self, in_doc: InputDocument) -> ConversionResult:
def _get_pipeline(self, doc: InputDocument) -> Optional[BaseModelPipeline]:
    """Return the (lazily created) pipeline instance for ``doc``'s format.

    Looks up the ``FormatOption`` registered for ``doc.format`` and returns
    the cached pipeline for its pipeline class, creating it on first use.

    Returns:
        The pipeline instance, or ``None`` when no option is registered for
        the document's format.
    """
    fopt = self.format_to_options.get(doc.format)
    if fopt is None:
        return None

    pipeline_class = fopt.pipeline_cls
    if pipeline_class not in self.initialized_pipelines:
        # Honor the options configured on the format option; fall back to the
        # pipeline's defaults only when none are set.
        pipeline_options = fopt.pipeline_options
        if pipeline_options is None:
            pipeline_options = pipeline_class.get_default_options()
        # NOTE: the cache is keyed by pipeline class, so if several formats
        # share a class the options of the first one requested are used.
        self.initialized_pipelines[pipeline_class] = pipeline_class(
            pipeline_options=pipeline_options
        )
    return self.initialized_pipelines[pipeline_class]
def process_document(self, in_doc: InputDocument) -> ConversionResult:
start_doc_time = time.time()
conv_res = ConversionResult(input=in_doc)
_log.info(f"Processing document {in_doc.file.name}")
if not in_doc.valid:
conv_res.status = ConversionStatus.FAILURE
return conv_res
for i in range(0, in_doc.page_count):
conv_res.pages.append(Page(page_no=i))
all_assembled_pages = []
try:
# Iterate batches of pages (page_batch_size) in the doc
for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
start_pb_time = time.time()
# Pipeline
# 1. Initialise the page resources
init_pages = map(
functools.partial(self._initialize_page, in_doc), page_batch
)
# 2. Populate page image
pages_with_images = map(
functools.partial(self._populate_page_images, in_doc), init_pages
)
# 3. Populate programmatic page cells
pages_with_cells = map(
functools.partial(self._parse_page_cells, in_doc),
pages_with_images,
)
# 4. Run pipeline stages
pipeline_pages = self.model_pipeline.apply(pages_with_cells)
# 5. Assemble page elements (per page)
assembled_pages = self.page_assemble_model(pipeline_pages)
# exhaust assembled_pages
for assembled_page in assembled_pages:
# Free up mem resources before moving on with next batch
# Remove page images (can be disabled)
if self.assemble_options.images_scale is None:
assembled_page._image_cache = {}
# Unload backend
assembled_page._backend.unload()
all_assembled_pages.append(assembled_page)
end_pb_time = time.time() - start_pb_time
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")
# Free up mem resources of PDF backend
in_doc._backend.unload()
conv_res.pages = all_assembled_pages
self._assemble_doc(conv_res)
status = ConversionStatus.SUCCESS
for page in conv_res.pages:
if not page._backend.is_valid():
conv_res.errors.append(
ErrorItem(
component_type=DoclingComponentType.PDF_BACKEND,
module_name=type(page._backend).__name__,
error_message=f"Page {page.page_no} failed to parse.",
)
)
status = ConversionStatus.PARTIAL_SUCCESS
conv_res.status = status
except Exception as e:
conv_res.status = ConversionStatus.FAILURE
trace = "\n".join(traceback.format_exception(e))
_log.info(
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
f"{trace}"
)
conv_res = self._execute_pipeline(in_doc)
end_doc_time = time.time() - start_doc_time
_log.info(
f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
)
_log.info(f"Finished converting document in {end_doc_time:.2f} seconds.")
return conv_res
# Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
def _initialize_page(self, doc: InputDocument, page: Page) -> Page:
page._backend = doc._backend.load_page(page.page_no)
page.size = page._backend.get_size()
page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
def _execute_pipeline(self, in_doc: InputDocument) -> ConversionResult:
if in_doc.valid and in_doc.format in self.formats:
pipeline = self._get_pipeline(in_doc)
if pipeline is None: # Can't find a default pipeline. Should this raise?
conv_res = ConversionResult(input=in_doc)
conv_res.status = ConversionStatus.FAILURE
return conv_res
return page
conv_res = pipeline.execute(in_doc)
else: # invalid doc or not of desired format
conv_res = ConversionResult(input=in_doc)
conv_res.status = ConversionStatus.FAILURE
# TODO add error log why it failed.
# Generate the page image and store it in the page object
def _populate_page_images(self, doc: InputDocument, page: Page) -> Page:
# default scale
page.get_image(
scale=1.0
) # puts the page image on the image cache at default scale
# user requested scales
if self.assemble_options.images_scale is not None:
page._default_image_scale = self.assemble_options.images_scale
page.get_image(
scale=self.assemble_options.images_scale
) # this will trigger storing the image in the internal cache
return page
# Extract and populate the page cells and store it in the page object
def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
page.cells = page._backend.get_text_cells()
# DEBUG code:
def draw_text_boxes(image, cells):
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
image.show()
# draw_text_boxes(page.get_image(scale=1.0), cells)
return page
def _assemble_doc(self, conv_res: ConversionResult):
all_elements = []
all_headers = []
all_body = []
for p in conv_res.pages:
for el in p.assembled.body:
all_body.append(el)
for el in p.assembled.headers:
all_headers.append(el)
for el in p.assembled.elements:
all_elements.append(el)
conv_res.assembled = AssembledUnit(
elements=all_elements, headers=all_headers, body=all_body
)
conv_res.output, conv_res.experimental = self.glm_model(conv_res)
return conv_res

View File

@ -0,0 +1,10 @@
from abc import ABC, abstractmethod
from typing import Iterable
from docling.datamodel.base_models import Page
class AbstractPageModel(ABC):
    """Interface for callables that map an iterable of pages to an iterable of pages."""

    @abstractmethod
    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Process ``page_batch`` and return the resulting pages."""

View File

@ -3,7 +3,6 @@ import logging
from abc import abstractmethod
from typing import Iterable, List, Tuple
import numpy
import numpy as np
from docling_core.types.experimental import BoundingBox, CoordOrigin
from PIL import Image, ImageDraw
@ -11,11 +10,12 @@ from rtree import index
from scipy.ndimage import find_objects, label
from docling.datamodel.base_models import OcrCell, Page
from docling.models.abstract_model import AbstractPageModel
_log = logging.getLogger(__name__)
class BaseOcrModel:
class BaseOcrModel(AbstractPageModel):
def __init__(self, config):
self.config = config
self.enabled = config["enabled"]

View File

@ -16,12 +16,13 @@ from docling.datamodel.base_models import (
LayoutPrediction,
Page,
)
from docling.models.abstract_model import AbstractPageModel
from docling.utils import layout_utils as lu
_log = logging.getLogger(__name__)
class LayoutModel:
class LayoutModel(AbstractPageModel):
TEXT_ELEM_LABELS = [
DocItemLabel.TEXT,

View File

@ -10,12 +10,13 @@ from docling.datamodel.base_models import (
Table,
TextElement,
)
from docling.models.abstract_model import AbstractPageModel
from docling.models.layout_model import LayoutModel
_log = logging.getLogger(__name__)
class PageAssembleModel:
class PageAssembleModel(AbstractPageModel):
def __init__(self, config):
self.config = config
@ -145,4 +146,11 @@ class PageAssembleModel:
elements=elements, headers=headers, body=body
)
# Remove page images (can be disabled)
if self.config["images_scale"] is None:
page._image_cache = {}
# Unload backend
page._backend.unload()
yield page

View File

@ -0,0 +1,50 @@
from typing import Iterable
from PIL import ImageDraw
from docling.datamodel.base_models import Page
from docling.models.abstract_model import AbstractPageModel
class PagePreprocessingModel(AbstractPageModel):
    """Page model that populates page images and text cells.

    Reads the ``"images_scale"`` entry of ``config`` to decide whether an
    additional image at a user-requested scale is rendered.
    """

    def __init__(self, config):
        self.config = config

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Lazily yield each page after image population and cell parsing."""
        for incoming in page_batch:
            with_images = self._populate_page_images(incoming)
            yield self._parse_page_cells(with_images)

    def _populate_page_images(self, page: Page) -> Page:
        """Generate the page image(s) and store them in the page object."""
        # Rendering at the default scale puts the image on the image cache.
        page.get_image(scale=1.0)

        requested_scale = self.config["images_scale"]
        if requested_scale is None:
            return page

        # A user-requested scale becomes the page default; fetching the image
        # triggers storing it in the internal cache as well.
        page._default_image_scale = requested_scale
        page.get_image(scale=requested_scale)
        return page

    def _parse_page_cells(self, page: Page) -> Page:
        """Extract the text cells from the page backend and store them on the page."""
        page.cells = page._backend.get_text_cells()

        # DEBUG code: helper kept for manual inspection, not called by default.
        def draw_text_boxes(image, cells):
            canvas = ImageDraw.Draw(image)
            for cell in cells:
                left, top, right, bottom = cell.bbox.as_tuple()
                canvas.rectangle([(left, top), (right, bottom)], outline="red")
            image.show()

        # draw_text_boxes(page.get_image(scale=1.0), cells)

        return page

View File

@ -9,9 +9,10 @@ from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredic
from PIL import ImageDraw
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
from docling.models.abstract_model import AbstractPageModel
class TableStructureModel:
class TableStructureModel(AbstractPageModel):
def __init__(self, config):
self.config = config
self.do_cell_matching = config["do_cell_matching"]

View File

@ -1,17 +1,117 @@
from pathlib import Path
import functools
import logging
import time
import traceback
from abc import ABC, abstractmethod
from typing import Callable, Iterable, List
from docling.datamodel.base_models import Page, PipelineOptions
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.datamodel.base_models import (
ConversionStatus,
DoclingComponentType,
ErrorItem,
Page,
PipelineOptions,
)
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.settings import settings
from docling.utils.utils import chunkify
_log = logging.getLogger(__name__)
# NOTE(review): two `class BaseModelPipeline` headers appear back to back in
# this span. The first takes `artifacts_path` in `__init__`; the second derives
# from ABC and does not. This looks like the old and new lines of a diff shown
# together — confirm which lines are live before relying on either.
class BaseModelPipeline:
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
self.model_pipe: List[Callable] = []
self.artifacts_path = artifacts_path
# Abstract base for conversion pipelines: keeps the pipeline options and the
# list of model callables, and declares the hooks subclasses must implement.
class BaseModelPipeline(ABC):
def __init__(self, pipeline_options: PipelineOptions):
self.pipeline_options = pipeline_options
# Starts empty here; the callables are assigned elsewhere.
self.model_pipe: List[Callable] = []
# NOTE(review): this `apply` signature has no body before the next decorator;
# it appears to be a leftover of an earlier method — confirm.
def apply(self, page_batch: Iterable[Page]) -> Iterable[Page]:
# Run the pipeline on one input document and return its ConversionResult.
@abstractmethod
def execute(self, in_doc: InputDocument) -> ConversionResult:
pass
# Receives the input document and the conversion result and returns a
# ConversionResult; what gets assembled is left to the subclass.
@abstractmethod
def assemble_document(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult:
pass
# Return the PipelineOptions instance the concrete pipeline uses by default.
@classmethod
@abstractmethod
def get_default_options(cls) -> PipelineOptions:
pass
# Tell whether the given backend can be handled by this pipeline class.
# No return annotation is declared here.
@classmethod
@abstractmethod
def is_backend_supported(cls, backend: AbstractDocumentBackend):
pass
class PaginatedModelPipeline(BaseModelPipeline):
    """Pipeline base that processes a document page by page, in batches.

    Subclasses provide ``initialize_page`` (and the abstract hooks inherited
    from ``BaseModelPipeline``) and fill ``self.model_pipe`` with callables
    that each take and return an iterable of pages.
    """

    def apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Chain every callable in ``self.model_pipe`` over ``page_batch``.

        Each callable receives the output of the previous one; the pages
        produced by the last callable are yielded.
        """
        for model in self.model_pipe:
            page_batch = model(page_batch)

        yield from page_batch

    def execute(self, in_doc: InputDocument) -> ConversionResult:
        """Convert ``in_doc`` and return the populated ``ConversionResult``.

        One ``Page`` per page of the input is created, the pages are pushed
        through the model pipeline in batches of
        ``settings.perf.page_batch_size``, the document backend is unloaded,
        and ``assemble_document`` is called. The status is ``SUCCESS`` unless
        some page backend reports itself invalid, in which case an error item
        is recorded per such page and the status is ``PARTIAL_SUCCESS``.

        Raises:
            Exception: any error raised during conversion is logged and
                re-raised after the status has been set to ``FAILURE``.
        """
        conv_res = ConversionResult(input=in_doc)

        _log.info(f"Processing document {in_doc.file.name}")

        for i in range(in_doc.page_count):
            conv_res.pages.append(Page(page_no=i))

        # Tracks whether unload() was already invoked, so the failure path
        # below releases the backend exactly once and never a second time.
        unload_attempted = False

        try:
            # Iterate batches of pages (page_batch_size) in the doc
            for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
                start_pb_time = time.time()

                # 1. Initialise the page resources
                init_pages = map(
                    functools.partial(self.initialize_page, in_doc), page_batch
                )

                # 2. Run pipeline stages
                pipeline_pages = self.apply_on_pages(init_pages)

                # The pipeline is lazy; draining it is what runs the models.
                for _ in pipeline_pages:
                    pass

                end_pb_time = time.time() - start_pb_time
                _log.info(f"Finished converting page batch time={end_pb_time:.3f}")

            # Free up mem resources of PDF backend
            unload_attempted = True
            in_doc._backend.unload()

            conv_res = self.assemble_document(in_doc, conv_res)

            status = ConversionStatus.SUCCESS
            for page in conv_res.pages:
                if not page._backend.is_valid():
                    conv_res.errors.append(
                        ErrorItem(
                            component_type=DoclingComponentType.DOCUMENT_BACKEND,
                            module_name=type(page._backend).__name__,
                            error_message=f"Page {page.page_no} failed to parse.",
                        )
                    )
                    status = ConversionStatus.PARTIAL_SUCCESS

            conv_res.status = status

        except Exception as e:
            conv_res.status = ConversionStatus.FAILURE
            trace = "\n".join(traceback.format_exception(e))
            _log.info(
                f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
                f"{trace}"
            )

            # A failure before the regular unload would otherwise leave the
            # document backend loaded. Release it here, but do not let a
            # cleanup problem hide the original error.
            if not unload_attempted:
                try:
                    in_doc._backend.unload()
                except Exception:
                    _log.warning(
                        "Could not unload the document backend after a failed conversion.",
                        exc_info=True,
                    )

            # Bare raise re-raises the active exception unchanged.
            raise

        return conv_res

    # Initialise and load resources for a page
    @abstractmethod
    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
        """Prepare ``page`` for processing using ``doc`` and return it."""
        pass

View File

@ -0,0 +1,57 @@
import logging
from docling.backend.abstract_backend import (
AbstractDocumentBackend,
DeclarativeDocumentBackend,
)
from docling.datamodel.base_models import (
ConversionStatus,
PdfPipelineOptions,
PipelineOptions,
)
from docling.datamodel.document import ConversionResult, InputDocument
from docling.pipeline.base_model_pipeline import BaseModelPipeline
_log = logging.getLogger(__name__)
class SimpleModelPipeline(BaseModelPipeline):
    """Pipeline for backends that produce the document directly.

    The input document's backend must be a ``DeclarativeDocumentBackend``;
    its ``convert()`` result is stored on ``conv_res.experimental``.
    """

    def __init__(self, pipeline_options: PipelineOptions):
        # Annotated with the generic PipelineOptions, matching what
        # get_default_options() returns for this pipeline.
        super().__init__(pipeline_options)

    def execute(self, in_doc: InputDocument) -> ConversionResult:
        """Convert ``in_doc`` through its declarative backend.

        Returns a ``ConversionResult`` whose status is ``FAILURE`` when the
        input is not valid or its backend is not a
        ``DeclarativeDocumentBackend``, and ``SUCCESS`` otherwise.
        """
        conv_res = ConversionResult(input=in_doc)

        _log.info(f"Processing document {in_doc.file.name}")

        if not in_doc.valid:
            conv_res.status = ConversionStatus.FAILURE
            return conv_res

        if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
            conv_res.status = ConversionStatus.FAILURE
            return conv_res

        conv_res.experimental = in_doc._backend.convert()

        # Do other stuff with conv_res.experimental

        conv_res = self.assemble_document(in_doc, conv_res)

        conv_res.status = ConversionStatus.SUCCESS

        return conv_res

    def assemble_document(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        """Return ``conv_res`` unchanged; nothing is assembled here."""
        return conv_res

    @classmethod
    def get_default_options(cls) -> PipelineOptions:
        """Return a default-constructed ``PipelineOptions``."""
        return PipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend) -> bool:
        """Return True when ``backend`` is a ``DeclarativeDocumentBackend``."""
        return isinstance(backend, DeclarativeDocumentBackend)

View File

@ -1,38 +0,0 @@
from pathlib import Path
from docling.datamodel.base_models import PipelineOptions
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.table_structure_model import TableStructureModel
from docling.pipeline.base_model_pipeline import BaseModelPipeline
class StandardModelPipeline(BaseModelPipeline):
    """Pipeline wiring EasyOcrModel, LayoutModel and TableStructureModel.

    The layout and table models receive their artifact locations as
    sub-paths of the ``artifacts_path`` given to the constructor.
    """

    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
    _table_model_path = "model_artifacts/tableformer"

    def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
        super().__init__(artifacts_path, pipeline_options)

        # Models are built one after another, in pipeline order.
        ocr_model = EasyOcrModel(
            config={
                "lang": ["fr", "de", "es", "en"],
                "enabled": pipeline_options.do_ocr,
            }
        )

        layout_dir = artifacts_path / StandardModelPipeline._layout_model_path
        layout_model = LayoutModel(config={"artifacts_path": layout_dir})

        table_dir = artifacts_path / StandardModelPipeline._table_model_path
        table_options = pipeline_options.table_structure_options
        table_model = TableStructureModel(
            config={
                "artifacts_path": table_dir,
                "enabled": pipeline_options.do_table_structure,
                "do_cell_matching": table_options.do_cell_matching,
            }
        )

        self.model_pipe = [ocr_model, layout_model, table_model]

View File

@ -0,0 +1,108 @@
import logging
from pathlib import Path
from typing import Optional
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import AssembledUnit, Page, PdfPipelineOptions
from docling.datamodel.document import ConversionResult, InputDocument
from docling.models.ds_glm_model import GlmModel
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.page_assemble_model import PageAssembleModel
from docling.models.page_preprocessing_model import PagePreprocessingModel
from docling.models.table_structure_model import TableStructureModel
from docling.pipeline.base_model_pipeline import PaginatedModelPipeline
_log = logging.getLogger(__name__)
class StandardPdfModelPipeline(PaginatedModelPipeline):
    """Paginated pipeline for PDF backends.

    Per page it runs, in order: PagePreprocessingModel, EasyOcrModel,
    LayoutModel, TableStructureModel and PageAssembleModel. At document
    level the collected page elements are handed to GlmModel.
    """

    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
    _table_model_path = "model_artifacts/tableformer"

    def __init__(self, pipeline_options: PdfPipelineOptions):
        """Build the model chain.

        When ``pipeline_options.artifacts_path`` is not set, the model
        artifacts are fetched with ``download_models_hf()``; otherwise the
        configured location is used.
        """
        super().__init__(pipeline_options)

        # Bind the local in both cases. Previously it was only assigned when
        # no path was configured, so passing an explicit artifacts_path
        # raised UnboundLocalError on the next line.
        artifacts_path = pipeline_options.artifacts_path
        if not artifacts_path:
            artifacts_path = self.download_models_hf()

        # Normalise once and use the Path below, so a string location also
        # works with the "/" joins.
        self.artifacts_path = Path(artifacts_path)

        self.glm_model = GlmModel(config={})

        self.model_pipe = [
            PagePreprocessingModel(
                config={"images_scale": pipeline_options.images_scale}
            ),
            EasyOcrModel(
                config={
                    "lang": ["fr", "de", "es", "en"],
                    "enabled": pipeline_options.do_ocr,
                }
            ),
            LayoutModel(
                config={
                    "artifacts_path": self.artifacts_path
                    / StandardPdfModelPipeline._layout_model_path
                }
            ),
            TableStructureModel(
                config={
                    "artifacts_path": self.artifacts_path
                    / StandardPdfModelPipeline._table_model_path,
                    "enabled": pipeline_options.do_table_structure,
                    "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
                }
            ),
            PageAssembleModel(config={"images_scale": pipeline_options.images_scale}),
        ]

    @staticmethod
    def download_models_hf(
        local_dir: Optional[Path] = None, force: bool = False
    ) -> Path:
        """Download the ``ds4sd/docling-models`` snapshot and return its path.

        ``local_dir`` and ``force`` are forwarded to
        ``huggingface_hub.snapshot_download`` as ``local_dir`` and
        ``force_download``.
        """
        # Imported lazily, as in the original, so the dependency is only
        # needed when a download is actually requested.
        from huggingface_hub import snapshot_download

        download_path = snapshot_download(
            repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
        )

        return Path(download_path)

    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
        """Load the page backend for ``page.page_no`` and record the page size."""
        page._backend = doc._backend.load_page(page.page_no)
        page.size = page._backend.get_size()

        return page

    def assemble_document(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        """Merge the per-page assembled units and run the GLM model.

        The ``elements``, ``headers`` and ``body`` lists of every page are
        concatenated in page order into one ``AssembledUnit``. The result of
        ``self.glm_model(conv_res)`` is then unpacked into
        ``conv_res.output`` and ``conv_res.experimental``.
        """
        all_elements = []
        all_headers = []
        all_body = []

        for p in conv_res.pages:
            all_body.extend(p.assembled.body)
            all_headers.extend(p.assembled.headers)
            all_elements.extend(p.assembled.elements)

        conv_res.assembled = AssembledUnit(
            elements=all_elements, headers=all_headers, body=all_body
        )

        conv_res.output, conv_res.experimental = self.glm_model(conv_res)

        return conv_res

    @classmethod
    def get_default_options(cls) -> PdfPipelineOptions:
        """Return a default-constructed ``PdfPipelineOptions``."""
        return PdfPipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend) -> bool:
        """Return True when ``backend`` is a ``PdfDocumentBackend``."""
        return isinstance(backend, PdfDocumentBackend)

View File

@ -6,9 +6,9 @@ from typing import Iterable
import yaml
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter
from docling.pdf_document_converter import PdfDocumentConverter
_log = logging.getLogger(__name__)
@ -107,7 +107,11 @@ def main():
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
# input = DocumentConversionInput.from_streams(docs)
doc_converter = DocumentConverter()
doc_converter = PdfDocumentConverter(
pipeline_options=PdfPipelineOptions(),
pdf_backend=DocumentConversionInput.DEFAULT_BACKEND,
pipeline_cls=StandardModelPipeline,
)
input = DocumentConversionInput.from_paths(input_doc_paths)

View File

@ -6,9 +6,9 @@ from typing import Iterable
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter
from docling.pdf_document_converter import PdfDocumentConverter
_log = logging.getLogger(__name__)
@ -93,12 +93,12 @@ def main():
# Docling Parse without OCR
# -------------------------
pipeline_options = PipelineOptions()
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
doc_converter = DocumentConverter(
doc_converter = PdfDocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)

View File

@ -4,14 +4,14 @@ from pathlib import Path
from typing import Tuple
from docling.datamodel.base_models import (
AssembleOptions,
ConversionStatus,
FigureElement,
PageElement,
PdfPipelineOptions,
Table,
)
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter
from docling.pdf_document_converter import PdfDocumentConverter
_log = logging.getLogger(__name__)
@ -30,12 +30,12 @@ def main():
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will destroy them for cleaning up memory.
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
# This is done by setting PipelineOptions.images_scale, which also defines the scale of images.
# scale=1 corresponds to a standard 72 DPI image
assemble_options = AssembleOptions()
assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
doc_converter = DocumentConverter(assemble_options=assemble_options)
doc_converter = PdfDocumentConverter(pipeline_options=pipeline_options)
start_time = time.time()

View File

@ -5,9 +5,9 @@ from pathlib import Path
import pandas as pd
from docling.datamodel.base_models import AssembleOptions, ConversionStatus
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter
from docling.pdf_document_converter import PdfDocumentConverter
from docling.utils.export import generate_multimodal_pages
_log = logging.getLogger(__name__)
@ -27,12 +27,12 @@ def main():
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will destroy them for cleaning up memory.
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
# This is done by setting PipelineOptions.images_scale, which also defines the scale of images.
# scale=1 corresponds to a standard 72 DPI image
assemble_options = AssembleOptions()
assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
doc_converter = DocumentConverter(assemble_options=assemble_options)
doc_converter = PdfDocumentConverter(pipeline_options=pipeline_options)
start_time = time.time()

View File

@ -7,7 +7,7 @@ import pandas as pd
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter
from docling.pdf_document_converter import PdfDocumentConverter
_log = logging.getLogger(__name__)
@ -22,7 +22,7 @@ def main():
input_files = DocumentConversionInput.from_paths(input_doc_paths)
doc_converter = DocumentConverter()
doc_converter = PdfDocumentConverter()
start_time = time.time()

View File

@ -1,6 +1,6 @@
from docling.document_converter import DocumentConverter
from docling.pdf_document_converter import PdfDocumentConverter
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
converter = DocumentConverter()
converter = PdfDocumentConverter()
doc = converter.convert_single(source)
print(doc.render_as_markdown()) # output: ## Docling Technical Report [...]"

View File

@ -0,0 +1,41 @@
from pathlib import Path
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import (
InputFormat,
PdfPipelineOptions,
PipelineOptions,
)
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter, FormatOption
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
# Example: convert documents of several formats with one DocumentConverter
# and print each result as markdown.
input_paths = [
    Path("tests/data/wiki_duck.html"),
    Path("tests/data/word_sample.docx"),
    Path("tests/data/powerpoint_sample.pptx"),
    Path("tests/data/2206.01062.pdf"),
]
# Named `conversion_input` rather than `input` so the builtin is not shadowed.
conversion_input = DocumentConversionInput.from_paths(input_paths)

# for defaults use:
doc_converter = DocumentConverter()

# to customize use:
# doc_converter = DocumentConverter(  # all of the below is optional, has internal defaults.
#     formats=[InputFormat.PDF, InputFormat.DOCX],
#     format_options={
#         InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend),
#         InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend)
#     }
# )

conv_results = doc_converter.convert(conversion_input)

for res in conv_results:
    print(
        f"Document {res.input.file.name} converted with status {res.status}. Content:"
    )
    print(res.experimental.export_to_markdown())

Binary file not shown.

1311
tests/data/wiki_duck.html Normal file

File diff suppressed because one or more lines are too long

BIN
tests/data/word_sample.docx Normal file

Binary file not shown.

View File

@ -2,9 +2,9 @@ from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import PipelineOptions
from docling.datamodel.base_models import PdfPipelineOptions
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter
from docling.pdf_document_converter import PdfDocumentConverter
from .verify_utils import verify_conversion_result
@ -23,12 +23,12 @@ def get_pdf_paths():
def get_converter():
pipeline_options = PipelineOptions()
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
converter = DocumentConverter(
converter = PdfDocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)

View File

@ -5,9 +5,9 @@ import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import DocumentStream, PipelineOptions
from docling.datamodel.base_models import DocumentStream, PdfPipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter
from docling.pdf_document_converter import PdfDocumentConverter
from .verify_utils import verify_conversion_result
@ -21,12 +21,12 @@ def get_pdf_path():
@pytest.fixture
def converter():
pipeline_options = PipelineOptions()
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
converter = DocumentConverter(
converter = PdfDocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)
@ -34,7 +34,7 @@ def converter():
return converter
def test_convert_single(converter: DocumentConverter):
def test_convert_single(converter: PdfDocumentConverter):
pdf_path = get_pdf_path()
print(f"converting {pdf_path}")
@ -43,7 +43,7 @@ def test_convert_single(converter: DocumentConverter):
verify_conversion_result(input_path=pdf_path, doc_result=doc_result)
def test_batch_path(converter: DocumentConverter):
def test_batch_path(converter: PdfDocumentConverter):
pdf_path = get_pdf_path()
print(f"converting {pdf_path}")
@ -55,7 +55,7 @@ def test_batch_path(converter: DocumentConverter):
verify_conversion_result(input_path=pdf_path, doc_result=doc_result)
def test_batch_bytes(converter: DocumentConverter):
def test_batch_bytes(converter: PdfDocumentConverter):
pdf_path = get_pdf_path()
print(f"converting {pdf_path}")