Fundamental refactoring for multi-format support

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-01 16:27:22 +02:00
parent cd06d89c2a
commit 1fa7cd9855
34 changed files with 2102 additions and 365 deletions


@ -67,11 +67,12 @@ pip install docling
### Convert a single document
To convert individual PDF documents, use `convert_single()`, for example:
```python
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
-converter = DocumentConverter()
+converter = PdfDocumentConverter()
result = converter.convert_single(source)
print(result.render_as_markdown()) # output: "## Docling Technical Report[...]"
print(result.render_as_doctags()) # output: "<document><title><page_1><loc_20>..."


@ -1,13 +1,11 @@
from abc import ABC, abstractmethod
from io import BytesIO
from pathlib import Path
-from typing import TYPE_CHECKING, Any, Iterable, Optional, Union
+from typing import Set, Union
-from docling_core.types.experimental import BoundingBox, Size
+from docling_core.types.experimental import DoclingDocument
-from PIL import Image
-if TYPE_CHECKING:
-from docling.datamodel.base_models import Cell
+from docling.datamodel.base_models import InputFormat
class AbstractDocumentBackend(ABC):
@ -20,6 +18,11 @@ class AbstractDocumentBackend(ABC):
def is_valid(self) -> bool:
pass
+@classmethod
+@abstractmethod
+def is_paginated(cls) -> bool:
+pass
@abstractmethod
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
@ -27,45 +30,19 @@ class AbstractDocumentBackend(ABC):
self.path_or_stream = None
+@classmethod
+@abstractmethod
+def supported_formats(cls) -> Set[InputFormat]:
+pass
-class PdfPageBackend(ABC):
-@abstractmethod
-def get_text_in_rect(self, bbox: "BoundingBox") -> str:
-pass
-@abstractmethod
-def get_text_cells(self) -> Iterable["Cell"]:
-pass
-@abstractmethod
-def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
-pass
-@abstractmethod
-def get_page_image(
-self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
-) -> Image.Image:
-pass
-@abstractmethod
-def get_size(self) -> "Size":
-pass
-@abstractmethod
-def is_valid(self) -> bool:
-pass
-@abstractmethod
-def unload(self):
-pass
-class PdfDocumentBackend(AbstractDocumentBackend):
-@abstractmethod
-def load_page(self, page_no: int) -> PdfPageBackend:
-pass
-@abstractmethod
-def page_count(self) -> int:
-pass
+class DeclarativeDocumentBackend(AbstractDocumentBackend):
+"""DeclarativeDocumentBackend.
+A declarative document backend is a backend that can transform to DoclingDocument
+straight without a recognition pipeline.
+"""
+@abstractmethod
+def convert(self) -> DoclingDocument:
+pass
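
To make the new contract concrete, here is a minimal sketch (not part of this commit) of a declarative backend. It only mirrors the shape of the HTML, Word and PowerPoint backends added below; the `EchoDocumentBackend` name and its line-by-line conversion are purely illustrative.

```python
from io import BytesIO
from pathlib import Path
from typing import Set, Union

from docling_core.types.experimental import DescriptionItem, DocItemLabel, DoclingDocument

from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat


class EchoDocumentBackend(DeclarativeDocumentBackend):
    """Illustrative backend: turns each non-empty input line into a text item."""

    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
        super().__init__(path_or_stream, document_hash)

    def is_valid(self) -> bool:
        return True

    @classmethod
    def is_paginated(cls) -> bool:
        return False  # declarative backends convert the whole document in one go

    def unload(self):
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()
        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        return {InputFormat.HTML}  # illustration only; a real backend declares its own format

    def convert(self) -> DoclingDocument:
        doc = DoclingDocument(description=DescriptionItem(), name="echo")
        raw = (
            self.path_or_stream.read_text(errors="ignore")
            if isinstance(self.path_or_stream, Path)
            else self.path_or_stream.getvalue().decode(errors="ignore")
        )
        for line in raw.splitlines():
            if line.strip():
                doc.add_text(text=line, label=DocItemLabel.TEXT)
        return doc
```

Because such a backend produces a DoclingDocument directly, it can be driven by the new SimpleModelPipeline without any page-level models.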


@ -10,7 +10,7 @@ from docling_parse.docling_parse import pdf_parser
from PIL import Image, ImageDraw
from pypdfium2 import PdfPage
-from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
+from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell
_log = logging.getLogger(__name__)


@ -0,0 +1,40 @@
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from docling_core.types.experimental import (
DescriptionItem,
DocItemLabel,
DoclingDocument,
)
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
class HTMLDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream, document_hash)
def is_valid(self) -> bool:
return True
@classmethod
def is_paginated(cls) -> bool:
return False
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
self.path_or_stream = None
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
return {InputFormat.HTML}
def convert(self) -> DoclingDocument:
# access self.path_or_stream to load stuff
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
doc.add_text(text="I am a HTML document.", label=DocItemLabel.TEXT)
return doc


@ -0,0 +1,38 @@
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from docling_core.types.experimental import (
DescriptionItem,
DocItemLabel,
DoclingDocument,
)
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
class MsPowerpointDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream, document_hash)
def is_valid(self) -> bool:
return True
@classmethod
def is_paginated(cls) -> bool:
return False
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
self.path_or_stream = None
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
return {InputFormat.PPTX}
def convert(self) -> DoclingDocument:
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
doc.add_text(text="I am a Powerpoint document.", label=DocItemLabel.TEXT)
return doc


@ -0,0 +1,38 @@
from io import BytesIO
from pathlib import Path
from typing import Set, Union
from docling_core.types.experimental import (
DescriptionItem,
DocItemLabel,
DoclingDocument,
)
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
class MsWordDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
super().__init__(path_or_stream, document_hash)
def is_valid(self) -> bool:
return True
@classmethod
def is_paginated(cls) -> bool:
return False
def unload(self):
if isinstance(self.path_or_stream, BytesIO):
self.path_or_stream.close()
self.path_or_stream = None
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
return {InputFormat.DOCX}
def convert(self) -> DoclingDocument:
doc = DoclingDocument(description=DescriptionItem(), name="dummy")
doc.add_text(text="I am a Word document.", label=DocItemLabel.TEXT)
return doc


@ -0,0 +1,59 @@
from abc import ABC, abstractmethod
from typing import Iterable, Optional, Set
from docling_core.types.experimental import BoundingBox, Size
from PIL import Image
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.datamodel.base_models import Cell, InputFormat
class PdfPageBackend(ABC):
@abstractmethod
def get_text_in_rect(self, bbox: "BoundingBox") -> str:
pass
@abstractmethod
def get_text_cells(self) -> Iterable["Cell"]:
pass
@abstractmethod
def get_bitmap_rects(self, scale: float = 1) -> Iterable["BoundingBox"]:
pass
@abstractmethod
def get_page_image(
self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
) -> Image.Image:
pass
@abstractmethod
def get_size(self) -> "Size":
pass
@abstractmethod
def is_valid(self) -> bool:
pass
@abstractmethod
def unload(self):
pass
class PdfDocumentBackend(AbstractDocumentBackend):
@abstractmethod
def load_page(self, page_no: int) -> PdfPageBackend:
pass
@abstractmethod
def page_count(self) -> int:
pass
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
return {InputFormat.PDF}
@classmethod
def is_paginated(cls) -> bool:
return True
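
For contrast with the declarative path, the following sketch (not part of the commit) walks a PDF's pages through the PdfDocumentBackend/PdfPageBackend interface. It assumes DoclingParseDocumentBackend keeps the `path_or_stream`/`document_hash` constructor used by InputDocument; the file path is just an example from this repo's test data.

```python
from pathlib import Path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.utils.utils import create_file_hash

pdf_path = Path("tests/data/2206.01062.pdf")  # example test file
backend = DoclingParseDocumentBackend(
    path_or_stream=pdf_path, document_hash=create_file_hash(pdf_path)
)

assert backend.is_paginated()  # PdfDocumentBackend reports itself as paginated

for page_no in range(backend.page_count()):
    page = backend.load_page(page_no)
    cells = list(page.get_text_cells())
    print(f"page {page_no}: size={page.get_size()}, {len(cells)} text cells")
    page.unload()  # free per-page resources, as the pipeline does after assembling

backend.unload()
```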


@ -8,10 +8,10 @@ import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c
from docling_core.types.experimental import BoundingBox, CoordOrigin, Size
from PIL import Image, ImageDraw
-from pypdfium2 import PdfPage, PdfTextPage
+from pypdfium2 import PdfTextPage
from pypdfium2._helpers.misc import PdfiumError
-from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
+from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell
_log = logging.getLogger(__name__)


@ -12,9 +12,9 @@ from docling_core.utils.file import resolve_file_source
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@ -190,12 +190,12 @@ def convert(
case _:
raise RuntimeError(f"Unexpected backend type {backend}")
-pipeline_options = PipelineOptions(
+pipeline_options = PdfPipelineOptions(
do_ocr=ocr,
do_table_structure=True,
)
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
-doc_converter = DocumentConverter(
+doc_converter = PdfDocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=pdf_backend,
)


@ -1,8 +1,8 @@
-import copy
import warnings
from enum import Enum, auto
from io import BytesIO
-from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
+from pathlib import Path
+from typing import Annotated, Dict, List, Optional, Union
from docling_core.types.experimental import BoundingBox, Size
from docling_core.types.experimental.document import BasePictureData, TableCell
@ -11,8 +11,6 @@ from PIL.Image import Image
from pydantic import BaseModel, ConfigDict, Field, model_validator
from typing_extensions import Self
-from docling.backend.abstract_backend import PdfPageBackend
class ConversionStatus(str, Enum):
PENDING = auto()
@ -30,13 +28,29 @@ class InputFormat(str, Enum):
PDF = auto()
FormatToMimeType = {
InputFormat.DOCX: {
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
},
InputFormat.PPTX: {
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
},
InputFormat.HTML: {"text/html", "application/xhtml+xml"},
InputFormat.IMAGE: {"image/png", "image/jpeg"},
InputFormat.PDF: {"application/pdf"},
}
MimeTypeToFormat = {
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
}
class DocInputType(str, Enum):
PATH = auto()
STREAM = auto()
class DoclingComponentType(str, Enum):
-PDF_BACKEND = auto()
+DOCUMENT_BACKEND = auto()
MODEL = auto()
DOC_ASSEMBLER = auto()
@ -128,13 +142,13 @@ class Page(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)
page_no: int
-page_hash: Optional[str] = None
+# page_hash: Optional[str] = None
size: Optional[Size] = None
cells: List[Cell] = []
predictions: PagePredictions = PagePredictions()
assembled: Optional[AssembledUnit] = None
-_backend: Optional[PdfPageBackend] = (
+_backend: Optional["PdfPageBackend"] = (
None # Internal PDF backend. By default it is cleared during assembling.
)
_default_image_scale: float = 1.0 # Default image scale for external usage.
@ -170,14 +184,16 @@ class TableStructureOptions(BaseModel):
)
-class PipelineOptions(BaseModel):
+class PipelineOptions(BaseModel): ...
+class PdfPipelineOptions(PipelineOptions):
+artifacts_path: Optional[Union[Path, str]] = None
do_table_structure: bool = True # True: perform table structure extraction
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
table_structure_options: TableStructureOptions = TableStructureOptions()
-class AssembleOptions(BaseModel):
keep_page_images: Annotated[
bool,
Field(
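
The new FormatToMimeType/MimeTypeToFormat tables are what drives format detection in InputDocument below. As a rough illustration (the `guess_format` helper is hypothetical, not part of the commit), resolving an InputFormat from a path looks like this:

```python
from pathlib import Path
from typing import Optional

import filetype

from docling.datamodel.base_models import InputFormat, MimeTypeToFormat


def guess_format(path: Path) -> Optional[InputFormat]:
    """Mirror of the detection logic in InputDocument.__init__ / _init_doc."""
    mime = filetype.guess_mime(str(path))
    if mime is None and path.suffix == ".html":
        mime = "text/html"  # filetype cannot sniff plain HTML, so fall back to the suffix
    return MimeTypeToFormat.get(mime)


print(guess_format(Path("tests/data/word_sample.docx")))  # InputFormat.DOCX
print(guess_format(Path("tests/data/wiki_duck.html")))  # InputFormat.HTML
```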


@ -1,9 +1,10 @@
import logging
from io import BytesIO
from pathlib import Path, PurePath
-from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
+from typing import Dict, Iterable, List, Optional, Tuple, Type, Union
-from docling_core.types import BaseCell, BaseText
+import filetype
+from docling_core.types import BaseText
from docling_core.types import Document as DsDocument
from docling_core.types import DocumentDescription as DsDocumentDescription
from docling_core.types import FileInfoObject as DsFileInfoObject
@ -19,8 +20,11 @@ from docling_core.types.experimental import (
from pydantic import BaseModel
from typing_extensions import deprecated
-from docling.backend.abstract_backend import AbstractDocumentBackend, PdfDocumentBackend
+from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.html_backend import HTMLDocumentBackend
+from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
+from docling.backend.msword_backend import MsWordDocumentBackend
from docling.datamodel.base_models import (
AssembledUnit,
ConversionStatus,
@ -28,13 +32,14 @@ from docling.datamodel.base_models import (
ErrorItem,
FigureElement,
InputFormat,
+MimeTypeToFormat,
Page,
PageElement,
Table,
TextElement,
)
from docling.datamodel.settings import DocumentLimits
-from docling.utils.utils import create_file_hash
+from docling.utils.utils import create_file_hash, create_hash
_log = logging.getLogger(__name__)
@ -71,8 +76,9 @@ _EMPTY_DOCLING_DOC = DoclingDocument(
_input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
InputFormat.PDF: DoclingParseDocumentBackend,
-InputFormat.DOCX: None,
-InputFormat.PPTX: None,
+InputFormat.HTML: HTMLDocumentBackend,
+InputFormat.DOCX: MsWordDocumentBackend,
+InputFormat.PPTX: MsPowerpointDocumentBackend,
InputFormat.IMAGE: None,
}
@ -80,13 +86,14 @@ _input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]]
class InputDocument(BaseModel):
file: PurePath = None
document_hash: Optional[str] = None
-valid: bool = False
+valid: bool = True
limits: DocumentLimits = DocumentLimits()
+format: Optional[InputFormat] = None
filesize: Optional[int] = None
-page_count: Optional[int] = None
+page_count: int = 0
-_backend: PdfDocumentBackend = None # Internal PDF backend used
+_backend: AbstractDocumentBackend = None # Internal PDF backend used
def __init__(
self,
@ -94,27 +101,31 @@ class InputDocument(BaseModel):
filename: Optional[str] = None,
limits: Optional[DocumentLimits] = None,
backend: Optional[Type[AbstractDocumentBackend]] = None,
+format: Optional[InputFormat] = None,
):
super().__init__()
-if not backend:
-backend = _input_format_default_backends[InputFormat.PDF]
self.limits = limits or DocumentLimits()
try:
if isinstance(path_or_stream, Path):
+mime = filetype.guess_mime(str(path_or_stream))
+if mime is None:
+if path_or_stream.suffix == ".html":
+mime = "text/html"
self.file = path_or_stream
self.filesize = path_or_stream.stat().st_size
if self.filesize > self.limits.max_file_size:
self.valid = False
else:
self.document_hash = create_file_hash(path_or_stream)
-self._backend = backend(
-path_or_stream=path_or_stream, document_hash=self.document_hash
-)
+self._init_doc(backend, mime, path_or_stream)
elif isinstance(path_or_stream, BytesIO):
+mime = filetype.guess_mime(path_or_stream.read(8192))
self.file = PurePath(filename)
self.filesize = path_or_stream.getbuffer().nbytes
@ -122,15 +133,15 @@ class InputDocument(BaseModel):
self.valid = False
else:
self.document_hash = create_file_hash(path_or_stream)
-self._backend = backend(
-path_or_stream=path_or_stream, document_hash=self.document_hash
-)
-if self.document_hash and self._backend.page_count() > 0:
-self.page_count = self._backend.page_count()
-if self.page_count <= self.limits.max_num_pages:
-self.valid = True
+self._init_doc(backend, mime, path_or_stream)
+# For paginated backends, check if the maximum page count is exceeded.
+if self.valid and self._backend.is_valid():
+if self._backend.is_paginated():
+self.page_count = self._backend.page_count()
+if not self.page_count <= self.limits.max_num_pages:
+self.valid = False
except (FileNotFoundError, OSError) as e:
_log.exception(
@ -144,6 +155,27 @@ class InputDocument(BaseModel):
)
# raise
def _init_doc(
self,
backend: AbstractDocumentBackend,
mime: str,
path_or_stream: Union[BytesIO, Path],
) -> None:
self.format = MimeTypeToFormat.get(mime)
if self.format is not None:
backend = backend or _input_format_default_backends.get(self.format)
if backend is None:
raise RuntimeError(
f"Could not find suitable default backend for format: {self.format}"
)
if self.format is None or self.format not in backend.supported_formats():
# TODO decide if to raise exception here too.
self.valid = False
else:
self._backend = backend(
path_or_stream=path_or_stream, document_hash=self.document_hash
)
@deprecated("Use `ConversionResult` instead.")
class ConvertedDocument(BaseModel):
@ -163,7 +195,11 @@ class ConvertedDocument(BaseModel):
desc = DsDocumentDescription(logs=[])
page_hashes = [
-PageReference(hash=p.page_hash, page=p.page_no + 1, model="default")
+PageReference(
+hash=create_hash(self.input.document_hash + ":" + str(p.page_no)),
+page=p.page_no + 1,
+model="default",
+)
for p in self.pages
]
@ -441,25 +477,21 @@ class DocumentConversionInput(BaseModel):
_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
limits: Optional[DocumentLimits] = DocumentLimits()
-DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
def docs(
-self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
+self, backend: Optional[Type[AbstractDocumentBackend]] = None
) -> Iterable[InputDocument]:
-pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND
for obj in self._path_or_stream_iterator:
if isinstance(obj, Path):
yield InputDocument(
-path_or_stream=obj, limits=self.limits, backend=pdf_backend
+path_or_stream=obj, limits=self.limits, backend=backend
)
elif isinstance(obj, DocumentStream):
yield InputDocument(
path_or_stream=obj.stream,
filename=obj.filename,
limits=self.limits,
-backend=pdf_backend,
+backend=backend,
)
@classmethod


@ -1,81 +1,78 @@
-import functools
import logging
import tempfile
import time
-import traceback
from pathlib import Path
-from typing import Iterable, Optional, Type, Union
+from typing import Dict, Iterable, List, Optional, Type
import requests
-from PIL import ImageDraw
-from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
+from pydantic import AnyHttpUrl, BaseModel, ConfigDict, TypeAdapter, ValidationError
-from docling.backend.abstract_backend import PdfDocumentBackend
+from docling.backend.abstract_backend import AbstractDocumentBackend
-from docling.datamodel.base_models import (
-AssembledUnit,
-AssembleOptions,
-ConversionStatus,
-DoclingComponentType,
-ErrorItem,
-Page,
-PipelineOptions,
-)
+from docling.datamodel.base_models import ConversionStatus, InputFormat, PipelineOptions
from docling.datamodel.document import (
ConversionResult,
DocumentConversionInput,
InputDocument,
)
from docling.datamodel.settings import settings
-from docling.models.ds_glm_model import GlmModel
-from docling.models.page_assemble_model import PageAssembleModel
from docling.pipeline.base_model_pipeline import BaseModelPipeline
-from docling.pipeline.standard_model_pipeline import StandardModelPipeline
+from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
+from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
-from docling.utils.utils import chunkify, create_hash
+from docling.utils.utils import chunkify
_log = logging.getLogger(__name__)
-class DocumentConverter:
-_default_download_filename = "file.pdf"
-def __init__(
-self,
-artifacts_path: Optional[Union[Path, str]] = None,
-pipeline_options: PipelineOptions = PipelineOptions(),
-pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
-pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
-assemble_options: AssembleOptions = AssembleOptions(),
-):
-if not artifacts_path:
-artifacts_path = self.download_models_hf()
-artifacts_path = Path(artifacts_path)
-self.model_pipeline = pipeline_cls(
-artifacts_path=artifacts_path, pipeline_options=pipeline_options
-)
-self.page_assemble_model = PageAssembleModel(config={})
-self.glm_model = GlmModel(config={})
-self.pdf_backend = pdf_backend
-self.assemble_options = assemble_options
-@staticmethod
-def download_models_hf(
-local_dir: Optional[Path] = None, force: bool = False
-) -> Path:
-from huggingface_hub import snapshot_download
-download_path = snapshot_download(
-repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
-)
-return Path(download_path)
+class FormatOption(BaseModel):
+pipeline_cls: Type[BaseModelPipeline]
+pipeline_options: Optional[PipelineOptions] = None
+backend: Optional[Type[AbstractDocumentBackend]]
+model_config = ConfigDict(arbitrary_types_allowed=True)
+def __init__(
+self,
+pipeline_cls: Type[BaseModelPipeline],
+pipeline_options: Optional[PipelineOptions] = None,
+backend: Optional[Type[AbstractDocumentBackend]] = None,
+):
+if pipeline_options is None:
+pipeline_options = pipeline_cls.get_default_options()
+super().__init__(
+pipeline_cls=pipeline_cls,
+pipeline_options=pipeline_options,
+backend=backend,
+)
+_format_to_default_options = {
+InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline),
+InputFormat.PPTX: FormatOption(pipeline_cls=SimpleModelPipeline),
+InputFormat.HTML: FormatOption(pipeline_cls=SimpleModelPipeline),
+InputFormat.IMAGE: None,
+InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline),
+}
+class DocumentConverter:
+_default_download_filename = "file"
+def __init__(
+self,
+formats: List[InputFormat] = [e for e in InputFormat],
+format_options: Dict[InputFormat, FormatOption] = _format_to_default_options,
+):
+self.formats = formats
+self.format_to_options = format_options
+self.initialized_pipelines: Dict[Type[BaseModelPipeline], BaseModelPipeline] = (
+{}
+)
def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
for input_batch in chunkify(
-input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
+input.docs(), settings.perf.doc_batch_size # pass format_options
):
_log.info(f"Going to convert document batch...")
# parallel processing only within input_batch
@ -84,8 +81,8 @@ class DocumentConverter:
# ) as pool:
#     yield from pool.map(self.process_document, input_batch)
-# Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
+# Note: PDF backends are not thread-safe, thread pool usage was disabled.
-yield from map(self._process_document, input_batch)
+yield from map(self.process_document, input_batch)
def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
"""Convert a single document.
@ -137,156 +134,42 @@ class DocumentConverter:
raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
return conv_res
+def _get_pipeline(self, doc: InputDocument) -> Optional[BaseModelPipeline]:
+pipeline_class = None
+fopt = self.format_to_options.get(doc.format)
+if fopt is None:
+return None
+else:
+pipeline_class = fopt.pipeline_cls
+if pipeline_class not in self.initialized_pipelines:
+self.initialized_pipelines[pipeline_class] = pipeline_class(
+pipeline_options=pipeline_class.get_default_options()
-def _process_document(self, in_doc: InputDocument) -> ConversionResult:
-start_doc_time = time.time()
-conv_res = ConversionResult(input=in_doc)
-_log.info(f"Processing document {in_doc.file.name}")
-if not in_doc.valid:
conv_res.status = ConversionStatus.FAILURE
return conv_res
for i in range(0, in_doc.page_count):
conv_res.pages.append(Page(page_no=i))
all_assembled_pages = []
try:
# Iterate batches of pages (page_batch_size) in the doc
for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
start_pb_time = time.time()
# Pipeline
# 1. Initialise the page resources
init_pages = map(
functools.partial(self._initialize_page, in_doc), page_batch
)
# 2. Populate page image
pages_with_images = map(
functools.partial(self._populate_page_images, in_doc), init_pages
)
# 3. Populate programmatic page cells
pages_with_cells = map(
functools.partial(self._parse_page_cells, in_doc),
pages_with_images,
)
# 4. Run pipeline stages
pipeline_pages = self.model_pipeline.apply(pages_with_cells)
# 5. Assemble page elements (per page)
assembled_pages = self.page_assemble_model(pipeline_pages)
# exhaust assembled_pages
for assembled_page in assembled_pages:
# Free up mem resources before moving on with next batch
# Remove page images (can be disabled)
if self.assemble_options.images_scale is None:
assembled_page._image_cache = {}
# Unload backend
assembled_page._backend.unload()
all_assembled_pages.append(assembled_page)
end_pb_time = time.time() - start_pb_time
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")
# Free up mem resources of PDF backend
in_doc._backend.unload()
conv_res.pages = all_assembled_pages
self._assemble_doc(conv_res)
status = ConversionStatus.SUCCESS
for page in conv_res.pages:
if not page._backend.is_valid():
conv_res.errors.append(
ErrorItem(
component_type=DoclingComponentType.PDF_BACKEND,
module_name=type(page._backend).__name__,
error_message=f"Page {page.page_no} failed to parse.",
)
)
status = ConversionStatus.PARTIAL_SUCCESS
conv_res.status = status
except Exception as e:
conv_res.status = ConversionStatus.FAILURE
trace = "\n".join(traceback.format_exception(e))
_log.info(
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
f"{trace}"
-)
+)
return self.initialized_pipelines[pipeline_class]
def process_document(self, in_doc: InputDocument) -> ConversionResult:
start_doc_time = time.time()
conv_res = self._execute_pipeline(in_doc)
end_doc_time = time.time() - start_doc_time
-_log.info(
-f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
-)
+_log.info(f"Finished converting document in {end_doc_time:.2f} seconds.")
return conv_res
+def _execute_pipeline(self, in_doc: InputDocument) -> ConversionResult:
+if in_doc.valid and in_doc.format in self.formats:
+pipeline = self._get_pipeline(in_doc)
+if pipeline is None:  # Can't find a default pipeline. Should this raise?
+conv_res = ConversionResult(input=in_doc)
+conv_res.status = ConversionStatus.FAILURE
+return conv_res
+conv_res = pipeline.execute(in_doc)
+else:  # invalid doc or not of desired format
+conv_res = ConversionResult(input=in_doc)
+conv_res.status = ConversionStatus.FAILURE
+# TODO add error log why it failed.
+return conv_res
-# Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
-def _initialize_page(self, doc: InputDocument, page: Page) -> Page:
-page._backend = doc._backend.load_page(page.page_no)
-page.size = page._backend.get_size()
-page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
-return page
-# Generate the page image and store it in the page object
def _populate_page_images(self, doc: InputDocument, page: Page) -> Page:
# default scale
page.get_image(
scale=1.0
) # puts the page image on the image cache at default scale
# user requested scales
if self.assemble_options.images_scale is not None:
page._default_image_scale = self.assemble_options.images_scale
page.get_image(
scale=self.assemble_options.images_scale
) # this will trigger storing the image in the internal cache
return page
# Extract and populate the page cells and store it in the page object
def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
page.cells = page._backend.get_text_cells()
# DEBUG code:
def draw_text_boxes(image, cells):
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
image.show()
# draw_text_boxes(page.get_image(scale=1.0), cells)
return page
def _assemble_doc(self, conv_res: ConversionResult):
all_elements = []
all_headers = []
all_body = []
for p in conv_res.pages:
for el in p.assembled.body:
all_body.append(el)
for el in p.assembled.headers:
all_headers.append(el)
for el in p.assembled.elements:
all_elements.append(el)
conv_res.assembled = AssembledUnit(
elements=all_elements, headers=all_headers, body=all_body
)
conv_res.output, conv_res.experimental = self.glm_model(conv_res)


@ -0,0 +1,10 @@
from abc import ABC, abstractmethod
from typing import Iterable
from docling.datamodel.base_models import Page
class AbstractPageModel(ABC):
@abstractmethod
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
pass
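
Every stage in a pipeline's model_pipe is expected to follow this one-method interface. A trivial, purely illustrative model (the class name is made up, not part of the commit) that passes pages through while logging their size could look like this:

```python
import logging
from typing import Iterable

from docling.datamodel.base_models import Page
from docling.models.abstract_model import AbstractPageModel

_log = logging.getLogger(__name__)


class PageSizeLoggerModel(AbstractPageModel):
    """No-op page model: yields pages unchanged, logging page number and size."""

    def __init__(self, config: dict):
        self.config = config

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        for page in page_batch:
            _log.info(f"page {page.page_no}: size={page.size}")
            yield page
```

A PaginatedModelPipeline chains such callables in apply_on_pages, so appending an instance to model_pipe is enough to plug it in.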


@ -3,7 +3,6 @@ import logging
from abc import abstractmethod
from typing import Iterable, List, Tuple
-import numpy
import numpy as np
from docling_core.types.experimental import BoundingBox, CoordOrigin
from PIL import Image, ImageDraw
@ -11,11 +10,12 @@ from rtree import index
from scipy.ndimage import find_objects, label
from docling.datamodel.base_models import OcrCell, Page
+from docling.models.abstract_model import AbstractPageModel
_log = logging.getLogger(__name__)
-class BaseOcrModel:
+class BaseOcrModel(AbstractPageModel):
def __init__(self, config):
self.config = config
self.enabled = config["enabled"]


@ -16,12 +16,13 @@ from docling.datamodel.base_models import (
LayoutPrediction,
Page,
)
+from docling.models.abstract_model import AbstractPageModel
from docling.utils import layout_utils as lu
_log = logging.getLogger(__name__)
-class LayoutModel:
+class LayoutModel(AbstractPageModel):
TEXT_ELEM_LABELS = [
DocItemLabel.TEXT,


@ -10,12 +10,13 @@ from docling.datamodel.base_models import (
Table,
TextElement,
)
+from docling.models.abstract_model import AbstractPageModel
from docling.models.layout_model import LayoutModel
_log = logging.getLogger(__name__)
-class PageAssembleModel:
+class PageAssembleModel(AbstractPageModel):
def __init__(self, config):
self.config = config
@ -145,4 +146,11 @@ class PageAssembleModel:
elements=elements, headers=headers, body=body
)
+# Remove page images (can be disabled)
+if self.config["images_scale"] is None:
+page._image_cache = {}
+# Unload backend
+page._backend.unload()
yield page


@ -0,0 +1,50 @@
from typing import Iterable
from PIL import ImageDraw
from docling.datamodel.base_models import Page
from docling.models.abstract_model import AbstractPageModel
class PagePreprocessingModel(AbstractPageModel):
def __init__(self, config):
self.config = config
def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for page in page_batch:
page = self._populate_page_images(page)
page = self._parse_page_cells(page)
yield page
# Generate the page image and store it in the page object
def _populate_page_images(self, page: Page) -> Page:
# default scale
page.get_image(
scale=1.0
) # puts the page image on the image cache at default scale
images_scale = self.config["images_scale"]
# user requested scales
if images_scale is not None:
page._default_image_scale = images_scale
page.get_image(
scale=images_scale
) # this will trigger storing the image in the internal cache
return page
# Extract and populate the page cells and store it in the page object
def _parse_page_cells(self, page: Page) -> Page:
page.cells = page._backend.get_text_cells()
# DEBUG code:
def draw_text_boxes(image, cells):
draw = ImageDraw.Draw(image)
for c in cells:
x0, y0, x1, y1 = c.bbox.as_tuple()
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
image.show()
# draw_text_boxes(page.get_image(scale=1.0), cells)
return page


@ -9,9 +9,10 @@ from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredic
from PIL import ImageDraw
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
+from docling.models.abstract_model import AbstractPageModel
-class TableStructureModel:
+class TableStructureModel(AbstractPageModel):
def __init__(self, config):
self.config = config
self.do_cell_matching = config["do_cell_matching"]


@ -1,17 +1,117 @@
-from pathlib import Path
+import functools
import logging
import time
import traceback
from abc import ABC, abstractmethod
from typing import Callable, Iterable, List
-from docling.datamodel.base_models import Page, PipelineOptions
+from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.datamodel.base_models import (
ConversionStatus,
DoclingComponentType,
ErrorItem,
Page,
PipelineOptions,
)
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.settings import settings
from docling.utils.utils import chunkify
_log = logging.getLogger(__name__)
-class BaseModelPipeline:
+class BaseModelPipeline(ABC):
-def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
+def __init__(self, pipeline_options: PipelineOptions):
-self.model_pipe: List[Callable] = []
-self.artifacts_path = artifacts_path
self.pipeline_options = pipeline_options
+self.model_pipe: List[Callable] = []
-def apply(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+@abstractmethod
def execute(self, in_doc: InputDocument) -> ConversionResult:
pass
@abstractmethod
def assemble_document(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult:
pass
@classmethod
@abstractmethod
def get_default_options(cls) -> PipelineOptions:
pass
@classmethod
@abstractmethod
def is_backend_supported(cls, backend: AbstractDocumentBackend):
pass
class PaginatedModelPipeline(BaseModelPipeline):
def apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
for model in self.model_pipe:
page_batch = model(page_batch)
yield from page_batch
def execute(self, in_doc: InputDocument) -> ConversionResult:
conv_res = ConversionResult(input=in_doc)
_log.info(f"Processing document {in_doc.file.name}")
for i in range(0, in_doc.page_count):
conv_res.pages.append(Page(page_no=i))
try:
# Iterate batches of pages (page_batch_size) in the doc
for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
start_pb_time = time.time()
# 1. Initialise the page resources
init_pages = map(
functools.partial(self.initialize_page, in_doc), page_batch
)
# 2. Run pipeline stages
pipeline_pages = self.apply_on_pages(init_pages)
for p in pipeline_pages:
pass
end_pb_time = time.time() - start_pb_time
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")
# Free up mem resources of PDF backend
in_doc._backend.unload()
conv_res = self.assemble_document(in_doc, conv_res)
status = ConversionStatus.SUCCESS
for page in conv_res.pages:
if not page._backend.is_valid():
conv_res.errors.append(
ErrorItem(
component_type=DoclingComponentType.DOCUMENT_BACKEND,
module_name=type(page._backend).__name__,
error_message=f"Page {page.page_no} failed to parse.",
)
)
status = ConversionStatus.PARTIAL_SUCCESS
conv_res.status = status
except Exception as e:
conv_res.status = ConversionStatus.FAILURE
trace = "\n".join(traceback.format_exception(e))
_log.info(
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
f"{trace}"
)
raise e
return conv_res
# Initialise and load resources for a page
@abstractmethod
def initialize_page(self, doc: InputDocument, page: Page) -> Page:
pass


@ -0,0 +1,57 @@
import logging
from docling.backend.abstract_backend import (
AbstractDocumentBackend,
DeclarativeDocumentBackend,
)
from docling.datamodel.base_models import (
ConversionStatus,
PdfPipelineOptions,
PipelineOptions,
)
from docling.datamodel.document import ConversionResult, InputDocument
from docling.pipeline.base_model_pipeline import BaseModelPipeline
_log = logging.getLogger(__name__)
class SimpleModelPipeline(BaseModelPipeline):
def __init__(self, pipeline_options: PdfPipelineOptions):
super().__init__(pipeline_options)
def execute(self, in_doc: InputDocument) -> ConversionResult:
conv_res = ConversionResult(input=in_doc)
_log.info(f"Processing document {in_doc.file.name}")
if not in_doc.valid:
conv_res.status = ConversionStatus.FAILURE
return conv_res
if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
conv_res.status = ConversionStatus.FAILURE
return conv_res
conv_res.experimental = in_doc._backend.convert()
# Do other stuff with conv_res.experimental
conv_res = self.assemble_document(in_doc, conv_res)
conv_res.status = ConversionStatus.SUCCESS
return conv_res
def assemble_document(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult:
return conv_res
@classmethod
def get_default_options(cls) -> PipelineOptions:
return PipelineOptions()
@classmethod
def is_backend_supported(cls, backend: AbstractDocumentBackend):
return isinstance(backend, DeclarativeDocumentBackend)


@ -1,38 +0,0 @@
from pathlib import Path
from docling.datamodel.base_models import PipelineOptions
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.table_structure_model import TableStructureModel
from docling.pipeline.base_model_pipeline import BaseModelPipeline
class StandardModelPipeline(BaseModelPipeline):
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
_table_model_path = "model_artifacts/tableformer"
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
super().__init__(artifacts_path, pipeline_options)
self.model_pipe = [
EasyOcrModel(
config={
"lang": ["fr", "de", "es", "en"],
"enabled": pipeline_options.do_ocr,
}
),
LayoutModel(
config={
"artifacts_path": artifacts_path
/ StandardModelPipeline._layout_model_path
}
),
TableStructureModel(
config={
"artifacts_path": artifacts_path
/ StandardModelPipeline._table_model_path,
"enabled": pipeline_options.do_table_structure,
"do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
}
),
]


@ -0,0 +1,108 @@
import logging
from pathlib import Path
from typing import Optional
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import AssembledUnit, Page, PdfPipelineOptions
from docling.datamodel.document import ConversionResult, InputDocument
from docling.models.ds_glm_model import GlmModel
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.page_assemble_model import PageAssembleModel
from docling.models.page_preprocessing_model import PagePreprocessingModel
from docling.models.table_structure_model import TableStructureModel
from docling.pipeline.base_model_pipeline import PaginatedModelPipeline
_log = logging.getLogger(__name__)
class StandardPdfModelPipeline(PaginatedModelPipeline):
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
_table_model_path = "model_artifacts/tableformer"
def __init__(self, pipeline_options: PdfPipelineOptions):
super().__init__(pipeline_options)
if not pipeline_options.artifacts_path:
artifacts_path = self.download_models_hf()
self.artifacts_path = Path(artifacts_path)
self.glm_model = GlmModel(config={})
self.model_pipe = [
PagePreprocessingModel(
config={"images_scale": pipeline_options.images_scale}
),
EasyOcrModel(
config={
"lang": ["fr", "de", "es", "en"],
"enabled": pipeline_options.do_ocr,
}
),
LayoutModel(
config={
"artifacts_path": artifacts_path
/ StandardPdfModelPipeline._layout_model_path
}
),
TableStructureModel(
config={
"artifacts_path": artifacts_path
/ StandardPdfModelPipeline._table_model_path,
"enabled": pipeline_options.do_table_structure,
"do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
}
),
PageAssembleModel(config={"images_scale": pipeline_options.images_scale}),
]
@staticmethod
def download_models_hf(
local_dir: Optional[Path] = None, force: bool = False
) -> Path:
from huggingface_hub import snapshot_download
download_path = snapshot_download(
repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
)
return Path(download_path)
def initialize_page(self, doc: InputDocument, page: Page) -> Page:
page._backend = doc._backend.load_page(page.page_no)
page.size = page._backend.get_size()
return page
def assemble_document(
self, in_doc: InputDocument, conv_res: ConversionResult
) -> ConversionResult:
all_elements = []
all_headers = []
all_body = []
for p in conv_res.pages:
for el in p.assembled.body:
all_body.append(el)
for el in p.assembled.headers:
all_headers.append(el)
for el in p.assembled.elements:
all_elements.append(el)
conv_res.assembled = AssembledUnit(
elements=all_elements, headers=all_headers, body=all_body
)
conv_res.output, conv_res.experimental = self.glm_model(conv_res)
return conv_res
@classmethod
def get_default_options(cls) -> PdfPipelineOptions:
return PdfPipelineOptions()
@classmethod
def is_backend_supported(cls, backend: AbstractDocumentBackend):
return isinstance(backend, PdfDocumentBackend)


@ -6,9 +6,9 @@ from typing import Iterable
import yaml
-from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter
_log = logging.getLogger(__name__)
@ -107,7 +107,11 @@ def main():
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
# input = DocumentConversionInput.from_streams(docs)
-doc_converter = DocumentConverter()
+doc_converter = PdfDocumentConverter(
+pipeline_options=PdfPipelineOptions(),
+pdf_backend=DocumentConversionInput.DEFAULT_BACKEND,
+pipeline_cls=StandardModelPipeline,
+)
input = DocumentConversionInput.from_paths(input_doc_paths)


@ -6,9 +6,9 @@ from typing import Iterable
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter
_log = logging.getLogger(__name__)
@ -93,12 +93,12 @@ def main():
# Docling Parse without OCR
# -------------------------
-pipeline_options = PipelineOptions()
+pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
-doc_converter = DocumentConverter(
+doc_converter = PdfDocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)


@ -4,14 +4,14 @@ from pathlib import Path
from typing import Tuple
from docling.datamodel.base_models import (
-AssembleOptions,
ConversionStatus,
FigureElement,
PageElement,
+PdfPipelineOptions,
Table,
)
from docling.datamodel.document import DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter
_log = logging.getLogger(__name__)
@ -30,12 +30,12 @@ def main():
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will destroy them for cleaning up memory.
-# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
+# This is done by setting PipelineOptions.images_scale, which also defines the scale of images.
# scale=1 correspond of a standard 72 DPI image
-assemble_options = AssembleOptions()
+pipeline_options = PdfPipelineOptions()
-assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
+pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
-doc_converter = DocumentConverter(assemble_options=assemble_options)
+doc_converter = PdfDocumentConverter(pipeline_options=pipeline_options)
start_time = time.time()

View File

@ -5,9 +5,9 @@ from pathlib import Path
import pandas as pd
-from docling.datamodel.base_models import AssembleOptions, ConversionStatus
+from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
from docling.datamodel.document import DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter
from docling.utils.export import generate_multimodal_pages
_log = logging.getLogger(__name__)
@ -27,12 +27,12 @@ def main():
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
# will destroy them for cleaning up memory.
-# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
+# This is done by setting PipelineOptions.images_scale, which also defines the scale of images.
# scale=1 correspond of a standard 72 DPI image
-assemble_options = AssembleOptions()
+pipeline_options = PdfPipelineOptions()
-assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
+pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
-doc_converter = DocumentConverter(assemble_options=assemble_options)
+doc_converter = PdfDocumentConverter(pipeline_options=pipeline_options)
start_time = time.time()


@ -7,7 +7,7 @@ import pandas as pd
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter
_log = logging.getLogger(__name__)
@ -22,7 +22,7 @@ def main():
input_files = DocumentConversionInput.from_paths(input_doc_paths)
-doc_converter = DocumentConverter()
+doc_converter = PdfDocumentConverter()
start_time = time.time()


@ -1,6 +1,6 @@
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
-converter = DocumentConverter()
+converter = PdfDocumentConverter()
doc = converter.convert_single(source)
print(doc.render_as_markdown()) # output: ## Docling Technical Report [...]"


@ -0,0 +1,41 @@
from pathlib import Path
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import (
InputFormat,
PdfPipelineOptions,
PipelineOptions,
)
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter, FormatOption
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
input_paths = [
Path("tests/data/wiki_duck.html"),
Path("tests/data/word_sample.docx"),
Path("tests/data/powerpoint_sample.pptx"),
Path("tests/data/2206.01062.pdf"),
]
input = DocumentConversionInput.from_paths(input_paths)
# for defaults use:
doc_converter = DocumentConverter()
# to customize use:
# doc_converter = DocumentConverter( # all of the below is optional, has internal defaults.
# formats=[InputFormat.PDF, InputFormat.DOCX],
# format_options={
# InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend),
# InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend)
# }
# )
conv_results = doc_converter.convert(input)
for res in conv_results:
print(
f"Document {res.input.file.name} converted with status {res.status}. Content:"
)
print(res.experimental.export_to_markdown())

Binary file not shown.

tests/data/wiki_duck.html (new file, 1311 lines): file diff suppressed because one or more lines are too long.

tests/data/word_sample.docx (new binary file): binary file not shown.


@ -2,9 +2,9 @@ from pathlib import Path
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import PipelineOptions
+from docling.datamodel.base_models import PdfPipelineOptions
from docling.datamodel.document import ConversionResult
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter
from .verify_utils import verify_conversion_result
@ -23,12 +23,12 @@ def get_pdf_paths():
def get_converter():
-pipeline_options = PipelineOptions()
+pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
-converter = DocumentConverter(
+converter = PdfDocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)


@ -5,9 +5,9 @@ import pytest
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import DocumentStream, PipelineOptions
+from docling.datamodel.base_models import DocumentStream, PdfPipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter
from .verify_utils import verify_conversion_result
@ -21,12 +21,12 @@ def get_pdf_path():
@pytest.fixture
def converter():
-pipeline_options = PipelineOptions()
+pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True
pipeline_options.table_structure_options.do_cell_matching = True
-converter = DocumentConverter(
+converter = PdfDocumentConverter(
pipeline_options=pipeline_options,
pdf_backend=DoclingParseDocumentBackend,
)
@ -34,7 +34,7 @@ def converter():
return converter
-def test_convert_single(converter: DocumentConverter):
+def test_convert_single(converter: PdfDocumentConverter):
pdf_path = get_pdf_path()
print(f"converting {pdf_path}")
@ -43,7 +43,7 @@ def test_convert_single(converter: DocumentConverter):
verify_conversion_result(input_path=pdf_path, doc_result=doc_result)
-def test_batch_path(converter: DocumentConverter):
+def test_batch_path(converter: PdfDocumentConverter):
pdf_path = get_pdf_path()
print(f"converting {pdf_path}")
@ -55,7 +55,7 @@ def test_batch_path(converter: DocumentConverter):
verify_conversion_result(input_path=pdf_path, doc_result=doc_result)
-def test_batch_bytes(converter: DocumentConverter):
+def test_batch_bytes(converter: PdfDocumentConverter):
pdf_path = get_pdf_path()
print(f"converting {pdf_path}")