mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Fundamental refactoring for multi-format support
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
cd06d89c2a
commit
1fa7cd9855
@ -67,11 +67,12 @@ pip install docling
|
||||
### Convert a single document
|
||||
|
||||
To convert individual PDF documents, use `convert_single()`, for example:
|
||||
|
||||
```python
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.pdf_document_converter import PdfDocumentConverter
|
||||
|
||||
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
||||
converter = DocumentConverter()
|
||||
converter = PdfDocumentConverter()
|
||||
result = converter.convert_single(source)
|
||||
print(result.render_as_markdown()) # output: "## Docling Technical Report[...]"
|
||||
print(result.render_as_doctags()) # output: "<document><title><page_1><loc_20>..."
|
||||
|
@ -1,13 +1,11 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, Iterable, Optional, Union
|
||||
from typing import Set, Union
|
||||
|
||||
from docling_core.types.experimental import BoundingBox, Size
|
||||
from PIL import Image
|
||||
from docling_core.types.experimental import DoclingDocument
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling.datamodel.base_models import Cell
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
|
||||
|
||||
class AbstractDocumentBackend(ABC):
|
||||
@ -20,6 +18,11 @@ class AbstractDocumentBackend(ABC):
|
||||
def is_valid(self) -> bool:
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def is_paginated(cls) -> bool:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def unload(self):
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
@ -27,45 +30,19 @@ class AbstractDocumentBackend(ABC):
|
||||
|
||||
self.path_or_stream = None
|
||||
|
||||
|
||||
class PdfPageBackend(ABC):
|
||||
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def get_text_in_rect(self, bbox: "BoundingBox") -> str:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_text_cells(self) -> Iterable["Cell"]:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_page_image(
|
||||
self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
|
||||
) -> Image.Image:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_size(self) -> "Size":
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def is_valid(self) -> bool:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def unload(self):
|
||||
def supported_formats(cls) -> Set[InputFormat]:
|
||||
pass
|
||||
|
||||
|
||||
class PdfDocumentBackend(AbstractDocumentBackend):
|
||||
@abstractmethod
|
||||
def load_page(self, page_no: int) -> PdfPageBackend:
|
||||
pass
|
||||
class DeclarativeDocumentBackend(AbstractDocumentBackend):
|
||||
"""DeclarativeDocumentBackend.
|
||||
|
||||
A declarative document backend is a backend that can transform to DoclingDocument
|
||||
straight without a recognition pipeline.
|
||||
"""
|
||||
|
||||
@abstractmethod
|
||||
def page_count(self) -> int:
|
||||
def convert(self) -> DoclingDocument:
|
||||
pass
|
||||
|
@ -10,7 +10,7 @@ from docling_parse.docling_parse import pdf_parser
|
||||
from PIL import Image, ImageDraw
|
||||
from pypdfium2 import PdfPage
|
||||
|
||||
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.datamodel.base_models import Cell
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
40
docling/backend/html_backend.py
Normal file
40
docling/backend/html_backend.py
Normal file
@ -0,0 +1,40 @@
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
|
||||
from docling_core.types.experimental import (
|
||||
DescriptionItem,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
)
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
|
||||
|
||||
class HTMLDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend for HTML input.

    The conversion is currently a stub: `convert()` does not read
    `self.path_or_stream` and returns a fixed one-item document.
    """

    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
        """Forward the input handle and document hash to the base backend."""
        super().__init__(path_or_stream, document_hash)

    def is_valid(self) -> bool:
        """Always report the backend as valid; no check is performed on the input."""
        return True

    @classmethod
    def is_paginated(cls) -> bool:
        """Return ``False``: this backend does not expose pages.

        Previously the body was the bare expression ``False`` (so the call
        returned ``None``) and the ``@classmethod`` decorator was missing.
        """
        return False

    def unload(self):
        """Close the input if it is an in-memory stream, then drop the reference."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()

        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of input formats handled here: ``{InputFormat.HTML}``."""
        return {InputFormat.HTML}

    def convert(self) -> DoclingDocument:
        """Build and return a `DoclingDocument` named "dummy" with one text item."""
        # access self.path_or_stream to load stuff
        doc = DoclingDocument(description=DescriptionItem(), name="dummy")
        doc.add_text(text="I am a HTML document.", label=DocItemLabel.TEXT)
        return doc
|
38
docling/backend/mspowerpoint_backend.py
Normal file
38
docling/backend/mspowerpoint_backend.py
Normal file
@ -0,0 +1,38 @@
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
|
||||
from docling_core.types.experimental import (
|
||||
DescriptionItem,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
)
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
|
||||
|
||||
class MsPowerpointDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend for PPTX input.

    The conversion is currently a stub: `convert()` does not read
    `self.path_or_stream` and returns a fixed one-item document.
    """

    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
        """Forward the input handle and document hash to the base backend."""
        super().__init__(path_or_stream, document_hash)

    def is_valid(self) -> bool:
        """Always report the backend as valid; no check is performed on the input."""
        return True

    @classmethod
    def is_paginated(cls) -> bool:
        """Return ``False``: this backend does not expose pages.

        Previously the body was the bare expression ``False`` (so the call
        returned ``None``) and the ``@classmethod`` decorator was missing.
        """
        return False

    def unload(self):
        """Close the input if it is an in-memory stream, then drop the reference."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()

        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of input formats handled here: ``{InputFormat.PPTX}``."""
        return {InputFormat.PPTX}

    def convert(self) -> DoclingDocument:
        """Build and return a `DoclingDocument` named "dummy" with one text item."""
        doc = DoclingDocument(description=DescriptionItem(), name="dummy")
        doc.add_text(text="I am a Powerpoint document.", label=DocItemLabel.TEXT)
        return doc
|
38
docling/backend/msword_backend.py
Normal file
38
docling/backend/msword_backend.py
Normal file
@ -0,0 +1,38 @@
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
|
||||
from docling_core.types.experimental import (
|
||||
DescriptionItem,
|
||||
DocItemLabel,
|
||||
DoclingDocument,
|
||||
)
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
|
||||
|
||||
class MsWordDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend for DOCX input.

    The conversion is currently a stub: `convert()` does not read
    `self.path_or_stream` and returns a fixed one-item document.
    """

    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
        """Forward the input handle and document hash to the base backend."""
        super().__init__(path_or_stream, document_hash)

    def is_valid(self) -> bool:
        """Always report the backend as valid; no check is performed on the input."""
        return True

    @classmethod
    def is_paginated(cls) -> bool:
        """Return ``False``: this backend does not expose pages.

        Previously the body was the bare expression ``False`` (so the call
        returned ``None``) and the ``@classmethod`` decorator was missing.
        """
        return False

    def unload(self):
        """Close the input if it is an in-memory stream, then drop the reference."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()

        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of input formats handled here: ``{InputFormat.DOCX}``."""
        return {InputFormat.DOCX}

    def convert(self) -> DoclingDocument:
        """Build and return a `DoclingDocument` named "dummy" with one text item."""
        doc = DoclingDocument(description=DescriptionItem(), name="dummy")
        doc.add_text(text="I am a Word document.", label=DocItemLabel.TEXT)
        return doc
|
59
docling/backend/pdf_backend.py
Normal file
59
docling/backend/pdf_backend.py
Normal file
@ -0,0 +1,59 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Iterable, Optional, Set
|
||||
|
||||
from docling_core.types.experimental import BoundingBox, Size
|
||||
from PIL import Image
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.datamodel.base_models import Cell, InputFormat
|
||||
|
||||
|
||||
class PdfPageBackend(ABC):
    """Abstract per-page interface that PDF backends implement.

    Every method is abstract and has no behavior of its own; the return
    annotations describe what implementations are expected to hand back.
    """

    @abstractmethod
    def get_text_in_rect(self, bbox: "BoundingBox") -> str:
        """Abstract hook: implementations return a string for the given `bbox`."""

    @abstractmethod
    def get_text_cells(self) -> Iterable["Cell"]:
        """Abstract hook: implementations return an iterable of `Cell` objects."""

    # NOTE(review): the parameter is named `float` and annotated `int`, which
    # shadows the builtin; it looks like `scale: float` was intended. Kept
    # unchanged because the name is part of the interface. TODO confirm.
    @abstractmethod
    def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
        """Abstract hook: implementations return an iterable of bounding boxes."""

    @abstractmethod
    def get_page_image(
        self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
    ) -> Image.Image:
        """Abstract hook: implementations return a PIL image.

        `scale` defaults to 1 and `cropbox` defaults to ``None``.
        """

    @abstractmethod
    def get_size(self) -> "Size":
        """Abstract hook: implementations return a `Size`."""

    @abstractmethod
    def is_valid(self) -> bool:
        """Abstract hook: implementations return a validity flag."""

    @abstractmethod
    def unload(self):
        """Abstract hook: implementations release whatever the page holds."""
|
||||
|
||||
|
||||
class PdfDocumentBackend(AbstractDocumentBackend):
    """Abstract document-level backend for PDF input.

    Page loading and page counting are left to subclasses; the supported
    format set and the pagination flag are fixed here.
    """

    @abstractmethod
    def load_page(self, page_no: int) -> PdfPageBackend:
        """Abstract hook: implementations return the `PdfPageBackend` for `page_no`."""

    @abstractmethod
    def page_count(self) -> int:
        """Abstract hook: implementations return an integer page count."""

    @classmethod
    def is_paginated(cls) -> bool:
        """Return ``True``: backends of this family are paginated."""
        return True

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the single-element set ``{InputFormat.PDF}``."""
        return {InputFormat.PDF}
|
@ -8,10 +8,10 @@ import pypdfium2 as pdfium
|
||||
import pypdfium2.raw as pdfium_c
|
||||
from docling_core.types.experimental import BoundingBox, CoordOrigin, Size
|
||||
from PIL import Image, ImageDraw
|
||||
from pypdfium2 import PdfPage, PdfTextPage
|
||||
from pypdfium2 import PdfTextPage
|
||||
from pypdfium2._helpers.misc import PdfiumError
|
||||
|
||||
from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
|
||||
from docling.datamodel.base_models import Cell
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
@ -12,9 +12,9 @@ from docling_core.utils.file import resolve_file_source
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
||||
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.pdf_document_converter import PdfDocumentConverter
|
||||
|
||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
||||
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
||||
@ -190,12 +190,12 @@ def convert(
|
||||
case _:
|
||||
raise RuntimeError(f"Unexpected backend type {backend}")
|
||||
|
||||
pipeline_options = PipelineOptions(
|
||||
pipeline_options = PdfPipelineOptions(
|
||||
do_ocr=ocr,
|
||||
do_table_structure=True,
|
||||
)
|
||||
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
|
||||
doc_converter = DocumentConverter(
|
||||
doc_converter = PdfDocumentConverter(
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=pdf_backend,
|
||||
)
|
||||
|
@ -1,8 +1,8 @@
|
||||
import copy
|
||||
import warnings
|
||||
from enum import Enum, auto
|
||||
from io import BytesIO
|
||||
from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
|
||||
from pathlib import Path
|
||||
from typing import Annotated, Dict, List, Optional, Union
|
||||
|
||||
from docling_core.types.experimental import BoundingBox, Size
|
||||
from docling_core.types.experimental.document import BasePictureData, TableCell
|
||||
@ -11,8 +11,6 @@ from PIL.Image import Image
|
||||
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
||||
from typing_extensions import Self
|
||||
|
||||
from docling.backend.abstract_backend import PdfPageBackend
|
||||
|
||||
|
||||
class ConversionStatus(str, Enum):
|
||||
PENDING = auto()
|
||||
@ -30,13 +28,29 @@ class InputFormat(str, Enum):
|
||||
PDF = auto()
|
||||
|
||||
|
||||
FormatToMimeType = {
|
||||
InputFormat.DOCX: {
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
},
|
||||
InputFormat.PPTX: {
|
||||
"application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
||||
},
|
||||
InputFormat.HTML: {"text/html", "application/xhtml+xml"},
|
||||
InputFormat.IMAGE: {"image/png", "image/jpeg"},
|
||||
InputFormat.PDF: {"application/pdf"},
|
||||
}
|
||||
MimeTypeToFormat = {
|
||||
mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
|
||||
}
|
||||
|
||||
|
||||
class DocInputType(str, Enum):
|
||||
PATH = auto()
|
||||
STREAM = auto()
|
||||
|
||||
|
||||
class DoclingComponentType(str, Enum):
|
||||
PDF_BACKEND = auto()
|
||||
DOCUMENT_BACKEND = auto()
|
||||
MODEL = auto()
|
||||
DOC_ASSEMBLER = auto()
|
||||
|
||||
@ -128,13 +142,13 @@ class Page(BaseModel):
|
||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||
|
||||
page_no: int
|
||||
page_hash: Optional[str] = None
|
||||
# page_hash: Optional[str] = None
|
||||
size: Optional[Size] = None
|
||||
cells: List[Cell] = []
|
||||
predictions: PagePredictions = PagePredictions()
|
||||
assembled: Optional[AssembledUnit] = None
|
||||
|
||||
_backend: Optional[PdfPageBackend] = (
|
||||
_backend: Optional["PdfPageBackend"] = (
|
||||
None # Internal PDF backend. By default it is cleared during assembling.
|
||||
)
|
||||
_default_image_scale: float = 1.0 # Default image scale for external usage.
|
||||
@ -170,14 +184,16 @@ class TableStructureOptions(BaseModel):
|
||||
)
|
||||
|
||||
|
||||
class PipelineOptions(BaseModel):
|
||||
class PipelineOptions(BaseModel): ...
|
||||
|
||||
|
||||
class PdfPipelineOptions(PipelineOptions):
|
||||
artifacts_path: Optional[Union[Path, str]] = None
|
||||
do_table_structure: bool = True # True: perform table structure extraction
|
||||
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
||||
|
||||
table_structure_options: TableStructureOptions = TableStructureOptions()
|
||||
|
||||
|
||||
class AssembleOptions(BaseModel):
|
||||
keep_page_images: Annotated[
|
||||
bool,
|
||||
Field(
|
||||
|
@ -1,9 +1,10 @@
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from pathlib import Path, PurePath
|
||||
from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
|
||||
from typing import Dict, Iterable, List, Optional, Tuple, Type, Union
|
||||
|
||||
from docling_core.types import BaseCell, BaseText
|
||||
import filetype
|
||||
from docling_core.types import BaseText
|
||||
from docling_core.types import Document as DsDocument
|
||||
from docling_core.types import DocumentDescription as DsDocumentDescription
|
||||
from docling_core.types import FileInfoObject as DsFileInfoObject
|
||||
@ -19,8 +20,11 @@ from docling_core.types.experimental import (
|
||||
from pydantic import BaseModel
|
||||
from typing_extensions import deprecated
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend, PdfDocumentBackend
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.datamodel.base_models import (
|
||||
AssembledUnit,
|
||||
ConversionStatus,
|
||||
@ -28,13 +32,14 @@ from docling.datamodel.base_models import (
|
||||
ErrorItem,
|
||||
FigureElement,
|
||||
InputFormat,
|
||||
MimeTypeToFormat,
|
||||
Page,
|
||||
PageElement,
|
||||
Table,
|
||||
TextElement,
|
||||
)
|
||||
from docling.datamodel.settings import DocumentLimits
|
||||
from docling.utils.utils import create_file_hash
|
||||
from docling.utils.utils import create_file_hash, create_hash
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@ -71,8 +76,9 @@ _EMPTY_DOCLING_DOC = DoclingDocument(
|
||||
|
||||
_input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
|
||||
InputFormat.PDF: DoclingParseDocumentBackend,
|
||||
InputFormat.DOCX: None,
|
||||
InputFormat.PPTX: None,
|
||||
InputFormat.HTML: HTMLDocumentBackend,
|
||||
InputFormat.DOCX: MsWordDocumentBackend,
|
||||
InputFormat.PPTX: MsPowerpointDocumentBackend,
|
||||
InputFormat.IMAGE: None,
|
||||
}
|
||||
|
||||
@ -80,13 +86,14 @@ _input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]]
|
||||
class InputDocument(BaseModel):
|
||||
file: PurePath = None
|
||||
document_hash: Optional[str] = None
|
||||
valid: bool = False
|
||||
valid: bool = True
|
||||
limits: DocumentLimits = DocumentLimits()
|
||||
format: Optional[InputFormat] = None
|
||||
|
||||
filesize: Optional[int] = None
|
||||
page_count: Optional[int] = None
|
||||
page_count: int = 0
|
||||
|
||||
_backend: PdfDocumentBackend = None # Internal PDF backend used
|
||||
_backend: AbstractDocumentBackend = None # Internal PDF backend used
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@ -94,27 +101,31 @@ class InputDocument(BaseModel):
|
||||
filename: Optional[str] = None,
|
||||
limits: Optional[DocumentLimits] = None,
|
||||
backend: Optional[Type[AbstractDocumentBackend]] = None,
|
||||
format: Optional[InputFormat] = None,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
if not backend:
|
||||
backend = _input_format_default_backends[InputFormat.PDF]
|
||||
|
||||
self.limits = limits or DocumentLimits()
|
||||
|
||||
try:
|
||||
if isinstance(path_or_stream, Path):
|
||||
mime = filetype.guess_mime(str(path_or_stream))
|
||||
if mime is None:
|
||||
if path_or_stream.suffix == ".html":
|
||||
mime = "text/html"
|
||||
|
||||
self.file = path_or_stream
|
||||
self.filesize = path_or_stream.stat().st_size
|
||||
if self.filesize > self.limits.max_file_size:
|
||||
self.valid = False
|
||||
else:
|
||||
self.document_hash = create_file_hash(path_or_stream)
|
||||
self._backend = backend(
|
||||
path_or_stream=path_or_stream, document_hash=self.document_hash
|
||||
)
|
||||
|
||||
self._init_doc(backend, mime, path_or_stream)
|
||||
|
||||
elif isinstance(path_or_stream, BytesIO):
|
||||
mime = filetype.guess_mime(path_or_stream.read(8192))
|
||||
|
||||
self.file = PurePath(filename)
|
||||
self.filesize = path_or_stream.getbuffer().nbytes
|
||||
|
||||
@ -122,15 +133,15 @@ class InputDocument(BaseModel):
|
||||
self.valid = False
|
||||
else:
|
||||
self.document_hash = create_file_hash(path_or_stream)
|
||||
self._backend = backend(
|
||||
path_or_stream=path_or_stream, document_hash=self.document_hash
|
||||
)
|
||||
|
||||
if self.document_hash and self._backend.page_count() > 0:
|
||||
self._init_doc(backend, mime, path_or_stream)
|
||||
|
||||
# For paginated backends, check if the maximum page count is exceeded.
|
||||
if self.valid and self._backend.is_valid():
|
||||
if self._backend.is_paginated():
|
||||
self.page_count = self._backend.page_count()
|
||||
|
||||
if self.page_count <= self.limits.max_num_pages:
|
||||
self.valid = True
|
||||
if not self.page_count <= self.limits.max_num_pages:
|
||||
self.valid = False
|
||||
|
||||
except (FileNotFoundError, OSError) as e:
|
||||
_log.exception(
|
||||
@ -144,6 +155,27 @@ class InputDocument(BaseModel):
|
||||
)
|
||||
# raise
|
||||
|
||||
def _init_doc(
|
||||
self,
|
||||
backend: AbstractDocumentBackend,
|
||||
mime: str,
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
) -> None:
|
||||
self.format = MimeTypeToFormat.get(mime)
|
||||
if self.format is not None:
|
||||
backend = backend or _input_format_default_backends.get(self.format)
|
||||
if backend is None:
|
||||
raise RuntimeError(
|
||||
f"Could not find suitable default backend for format: {self.format}"
|
||||
)
|
||||
if self.format is None or self.format not in backend.supported_formats():
|
||||
# TODO decide if to raise exception here too.
|
||||
self.valid = False
|
||||
else:
|
||||
self._backend = backend(
|
||||
path_or_stream=path_or_stream, document_hash=self.document_hash
|
||||
)
|
||||
|
||||
|
||||
@deprecated("Use `ConversionResult` instead.")
|
||||
class ConvertedDocument(BaseModel):
|
||||
@ -163,7 +195,11 @@ class ConvertedDocument(BaseModel):
|
||||
desc = DsDocumentDescription(logs=[])
|
||||
|
||||
page_hashes = [
|
||||
PageReference(hash=p.page_hash, page=p.page_no + 1, model="default")
|
||||
PageReference(
|
||||
hash=create_hash(self.input.document_hash + ":" + str(p.page_no)),
|
||||
page=p.page_no + 1,
|
||||
model="default",
|
||||
)
|
||||
for p in self.pages
|
||||
]
|
||||
|
||||
@ -441,25 +477,21 @@ class DocumentConversionInput(BaseModel):
|
||||
_path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
|
||||
limits: Optional[DocumentLimits] = DocumentLimits()
|
||||
|
||||
DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
|
||||
|
||||
def docs(
|
||||
self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
|
||||
self, backend: Optional[Type[AbstractDocumentBackend]] = None
|
||||
) -> Iterable[InputDocument]:
|
||||
|
||||
pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND
|
||||
|
||||
for obj in self._path_or_stream_iterator:
|
||||
if isinstance(obj, Path):
|
||||
yield InputDocument(
|
||||
path_or_stream=obj, limits=self.limits, backend=pdf_backend
|
||||
path_or_stream=obj, limits=self.limits, backend=backend
|
||||
)
|
||||
elif isinstance(obj, DocumentStream):
|
||||
yield InputDocument(
|
||||
path_or_stream=obj.stream,
|
||||
filename=obj.filename,
|
||||
limits=self.limits,
|
||||
backend=pdf_backend,
|
||||
backend=backend,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
|
@ -1,81 +1,78 @@
|
||||
import functools
|
||||
import logging
|
||||
import tempfile
|
||||
import time
|
||||
import traceback
|
||||
from pathlib import Path
|
||||
from typing import Iterable, Optional, Type, Union
|
||||
from typing import Dict, Iterable, List, Optional, Type
|
||||
|
||||
import requests
|
||||
from PIL import ImageDraw
|
||||
from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
|
||||
from pydantic import AnyHttpUrl, BaseModel, ConfigDict, TypeAdapter, ValidationError
|
||||
|
||||
from docling.backend.abstract_backend import PdfDocumentBackend
|
||||
from docling.datamodel.base_models import (
|
||||
AssembledUnit,
|
||||
AssembleOptions,
|
||||
ConversionStatus,
|
||||
DoclingComponentType,
|
||||
ErrorItem,
|
||||
Page,
|
||||
PipelineOptions,
|
||||
)
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat, PipelineOptions
|
||||
from docling.datamodel.document import (
|
||||
ConversionResult,
|
||||
DocumentConversionInput,
|
||||
InputDocument,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.ds_glm_model import GlmModel
|
||||
from docling.models.page_assemble_model import PageAssembleModel
|
||||
from docling.pipeline.base_model_pipeline import BaseModelPipeline
|
||||
from docling.pipeline.standard_model_pipeline import StandardModelPipeline
|
||||
from docling.utils.utils import chunkify, create_hash
|
||||
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
|
||||
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
||||
from docling.utils.utils import chunkify
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocumentConverter:
|
||||
_default_download_filename = "file.pdf"
|
||||
class FormatOption(BaseModel):
|
||||
pipeline_cls: Type[BaseModelPipeline]
|
||||
pipeline_options: Optional[PipelineOptions] = None
|
||||
backend: Optional[Type[AbstractDocumentBackend]]
|
||||
|
||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
artifacts_path: Optional[Union[Path, str]] = None,
|
||||
pipeline_options: PipelineOptions = PipelineOptions(),
|
||||
pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
|
||||
pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
|
||||
assemble_options: AssembleOptions = AssembleOptions(),
|
||||
pipeline_cls: Type[BaseModelPipeline],
|
||||
pipeline_options: Optional[PipelineOptions] = None,
|
||||
backend: Optional[Type[AbstractDocumentBackend]] = None,
|
||||
):
|
||||
if not artifacts_path:
|
||||
artifacts_path = self.download_models_hf()
|
||||
if pipeline_options is None:
|
||||
pipeline_options = pipeline_cls.get_default_options()
|
||||
|
||||
artifacts_path = Path(artifacts_path)
|
||||
|
||||
self.model_pipeline = pipeline_cls(
|
||||
artifacts_path=artifacts_path, pipeline_options=pipeline_options
|
||||
super().__init__(
|
||||
pipeline_cls=pipeline_cls,
|
||||
pipeline_options=pipeline_options,
|
||||
backend=backend,
|
||||
)
|
||||
|
||||
self.page_assemble_model = PageAssembleModel(config={})
|
||||
self.glm_model = GlmModel(config={})
|
||||
self.pdf_backend = pdf_backend
|
||||
self.assemble_options = assemble_options
|
||||
|
||||
@staticmethod
|
||||
def download_models_hf(
|
||||
local_dir: Optional[Path] = None, force: bool = False
|
||||
) -> Path:
|
||||
from huggingface_hub import snapshot_download
|
||||
_format_to_default_options = {
|
||||
InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline),
|
||||
InputFormat.PPTX: FormatOption(pipeline_cls=SimpleModelPipeline),
|
||||
InputFormat.HTML: FormatOption(pipeline_cls=SimpleModelPipeline),
|
||||
InputFormat.IMAGE: None,
|
||||
InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline),
|
||||
}
|
||||
|
||||
download_path = snapshot_download(
|
||||
repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
|
||||
|
||||
class DocumentConverter:
|
||||
_default_download_filename = "file"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
formats: List[InputFormat] = [e for e in InputFormat],
|
||||
format_options: Dict[InputFormat, FormatOption] = _format_to_default_options,
|
||||
):
|
||||
self.formats = formats
|
||||
self.format_to_options = format_options
|
||||
self.initialized_pipelines: Dict[Type[BaseModelPipeline], BaseModelPipeline] = (
|
||||
{}
|
||||
)
|
||||
|
||||
return Path(download_path)
|
||||
|
||||
def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:
|
||||
|
||||
for input_batch in chunkify(
|
||||
input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
|
||||
input.docs(), settings.perf.doc_batch_size # pass format_options
|
||||
):
|
||||
_log.info(f"Going to convert document batch...")
|
||||
# parallel processing only within input_batch
|
||||
@ -84,8 +81,8 @@ class DocumentConverter:
|
||||
# ) as pool:
|
||||
# yield from pool.map(self.process_document, input_batch)
|
||||
|
||||
# Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
|
||||
yield from map(self._process_document, input_batch)
|
||||
# Note: PDF backends are not thread-safe, thread pool usage was disabled.
|
||||
yield from map(self.process_document, input_batch)
|
||||
|
||||
def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
|
||||
"""Convert a single document.
|
||||
@ -137,156 +134,42 @@ class DocumentConverter:
|
||||
raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
|
||||
return conv_res
|
||||
|
||||
def _process_document(self, in_doc: InputDocument) -> ConversionResult:
|
||||
def _get_pipeline(self, doc: InputDocument) -> Optional[BaseModelPipeline]:
|
||||
pipeline_class = None
|
||||
fopt = self.format_to_options.get(doc.format)
|
||||
if fopt is None:
|
||||
return None
|
||||
else:
|
||||
pipeline_class = fopt.pipeline_cls
|
||||
|
||||
if pipeline_class not in self.initialized_pipelines:
|
||||
self.initialized_pipelines[pipeline_class] = pipeline_class(
|
||||
pipeline_options=pipeline_class.get_default_options()
|
||||
)
|
||||
return self.initialized_pipelines[pipeline_class]
|
||||
|
||||
def process_document(self, in_doc: InputDocument) -> ConversionResult:
|
||||
start_doc_time = time.time()
|
||||
conv_res = ConversionResult(input=in_doc)
|
||||
|
||||
_log.info(f"Processing document {in_doc.file.name}")
|
||||
|
||||
if not in_doc.valid:
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
return conv_res
|
||||
|
||||
for i in range(0, in_doc.page_count):
|
||||
conv_res.pages.append(Page(page_no=i))
|
||||
|
||||
all_assembled_pages = []
|
||||
|
||||
try:
|
||||
# Iterate batches of pages (page_batch_size) in the doc
|
||||
for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
|
||||
start_pb_time = time.time()
|
||||
# Pipeline
|
||||
|
||||
# 1. Initialise the page resources
|
||||
init_pages = map(
|
||||
functools.partial(self._initialize_page, in_doc), page_batch
|
||||
)
|
||||
|
||||
# 2. Populate page image
|
||||
pages_with_images = map(
|
||||
functools.partial(self._populate_page_images, in_doc), init_pages
|
||||
)
|
||||
|
||||
# 3. Populate programmatic page cells
|
||||
pages_with_cells = map(
|
||||
functools.partial(self._parse_page_cells, in_doc),
|
||||
pages_with_images,
|
||||
)
|
||||
|
||||
# 4. Run pipeline stages
|
||||
pipeline_pages = self.model_pipeline.apply(pages_with_cells)
|
||||
|
||||
# 5. Assemble page elements (per page)
|
||||
assembled_pages = self.page_assemble_model(pipeline_pages)
|
||||
|
||||
# exhaust assembled_pages
|
||||
for assembled_page in assembled_pages:
|
||||
# Free up mem resources before moving on with next batch
|
||||
|
||||
# Remove page images (can be disabled)
|
||||
if self.assemble_options.images_scale is None:
|
||||
assembled_page._image_cache = {}
|
||||
|
||||
# Unload backend
|
||||
assembled_page._backend.unload()
|
||||
|
||||
all_assembled_pages.append(assembled_page)
|
||||
|
||||
end_pb_time = time.time() - start_pb_time
|
||||
_log.info(f"Finished converting page batch time={end_pb_time:.3f}")
|
||||
|
||||
# Free up mem resources of PDF backend
|
||||
in_doc._backend.unload()
|
||||
|
||||
conv_res.pages = all_assembled_pages
|
||||
self._assemble_doc(conv_res)
|
||||
|
||||
status = ConversionStatus.SUCCESS
|
||||
for page in conv_res.pages:
|
||||
if not page._backend.is_valid():
|
||||
conv_res.errors.append(
|
||||
ErrorItem(
|
||||
component_type=DoclingComponentType.PDF_BACKEND,
|
||||
module_name=type(page._backend).__name__,
|
||||
error_message=f"Page {page.page_no} failed to parse.",
|
||||
)
|
||||
)
|
||||
status = ConversionStatus.PARTIAL_SUCCESS
|
||||
|
||||
conv_res.status = status
|
||||
|
||||
except Exception as e:
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
trace = "\n".join(traceback.format_exception(e))
|
||||
_log.info(
|
||||
f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
|
||||
f"{trace}"
|
||||
)
|
||||
conv_res = self._execute_pipeline(in_doc)
|
||||
|
||||
end_doc_time = time.time() - start_doc_time
|
||||
_log.info(
|
||||
f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
|
||||
)
|
||||
_log.info(f"Finished converting document in {end_doc_time:.2f} seconds.")
|
||||
|
||||
return conv_res
|
||||
|
||||
# Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
|
||||
def _initialize_page(self, doc: InputDocument, page: Page) -> Page:
|
||||
page._backend = doc._backend.load_page(page.page_no)
|
||||
page.size = page._backend.get_size()
|
||||
page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
|
||||
def _execute_pipeline(self, in_doc: InputDocument) -> ConversionResult:
|
||||
if in_doc.valid and in_doc.format in self.formats:
|
||||
pipeline = self._get_pipeline(in_doc)
|
||||
if pipeline is None: # Can't find a default pipeline. Should this raise?
|
||||
conv_res = ConversionResult(input=in_doc)
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
return conv_res
|
||||
|
||||
return page
|
||||
conv_res = pipeline.execute(in_doc)
|
||||
else: # invalid doc or not of desired format
|
||||
conv_res = ConversionResult(input=in_doc)
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
# TODO add error log why it failed.
|
||||
|
||||
# Generate the page image and store it in the page object
|
||||
def _populate_page_images(self, doc: InputDocument, page: Page) -> Page:
|
||||
# default scale
|
||||
page.get_image(
|
||||
scale=1.0
|
||||
) # puts the page image on the image cache at default scale
|
||||
|
||||
# user requested scales
|
||||
if self.assemble_options.images_scale is not None:
|
||||
page._default_image_scale = self.assemble_options.images_scale
|
||||
page.get_image(
|
||||
scale=self.assemble_options.images_scale
|
||||
) # this will trigger storing the image in the internal cache
|
||||
|
||||
return page
|
||||
|
||||
# Extract and populate the page cells and store it in the page object
|
||||
def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
|
||||
page.cells = page._backend.get_text_cells()
|
||||
|
||||
# DEBUG code:
|
||||
def draw_text_boxes(image, cells):
|
||||
draw = ImageDraw.Draw(image)
|
||||
for c in cells:
|
||||
x0, y0, x1, y1 = c.bbox.as_tuple()
|
||||
draw.rectangle([(x0, y0), (x1, y1)], outline="red")
|
||||
image.show()
|
||||
|
||||
# draw_text_boxes(page.get_image(scale=1.0), cells)
|
||||
|
||||
return page
|
||||
|
||||
def _assemble_doc(self, conv_res: ConversionResult):
|
||||
all_elements = []
|
||||
all_headers = []
|
||||
all_body = []
|
||||
|
||||
for p in conv_res.pages:
|
||||
|
||||
for el in p.assembled.body:
|
||||
all_body.append(el)
|
||||
for el in p.assembled.headers:
|
||||
all_headers.append(el)
|
||||
for el in p.assembled.elements:
|
||||
all_elements.append(el)
|
||||
|
||||
conv_res.assembled = AssembledUnit(
|
||||
elements=all_elements, headers=all_headers, body=all_body
|
||||
)
|
||||
|
||||
conv_res.output, conv_res.experimental = self.glm_model(conv_res)
|
||||
return conv_res
|
||||
|
10
docling/models/abstract_model.py
Normal file
10
docling/models/abstract_model.py
Normal file
@ -0,0 +1,10 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Iterable
|
||||
|
||||
from docling.datamodel.base_models import Page
|
||||
|
||||
|
||||
class AbstractPageModel(ABC):
    """Interface of a callable pipeline stage that operates on pages."""

    @abstractmethod
    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Process a batch of pages.

        Args:
            page_batch: The pages handed to this stage.

        Returns:
            The pages produced by this stage; the concrete processing is
            defined by each subclass.
        """
        ...
|
@ -3,7 +3,6 @@ import logging
|
||||
from abc import abstractmethod
|
||||
from typing import Iterable, List, Tuple
|
||||
|
||||
import numpy
|
||||
import numpy as np
|
||||
from docling_core.types.experimental import BoundingBox, CoordOrigin
|
||||
from PIL import Image, ImageDraw
|
||||
@ -11,11 +10,12 @@ from rtree import index
|
||||
from scipy.ndimage import find_objects, label
|
||||
|
||||
from docling.datamodel.base_models import OcrCell, Page
|
||||
from docling.models.abstract_model import AbstractPageModel
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BaseOcrModel:
|
||||
class BaseOcrModel(AbstractPageModel):
|
||||
def __init__(self, config):
|
||||
self.config = config
|
||||
self.enabled = config["enabled"]
|
||||
|
@ -16,12 +16,13 @@ from docling.datamodel.base_models import (
|
||||
LayoutPrediction,
|
||||
Page,
|
||||
)
|
||||
from docling.models.abstract_model import AbstractPageModel
|
||||
from docling.utils import layout_utils as lu
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LayoutModel:
|
||||
class LayoutModel(AbstractPageModel):
|
||||
|
||||
TEXT_ELEM_LABELS = [
|
||||
DocItemLabel.TEXT,
|
||||
|
@ -10,12 +10,13 @@ from docling.datamodel.base_models import (
|
||||
Table,
|
||||
TextElement,
|
||||
)
|
||||
from docling.models.abstract_model import AbstractPageModel
|
||||
from docling.models.layout_model import LayoutModel
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PageAssembleModel:
|
||||
class PageAssembleModel(AbstractPageModel):
|
||||
def __init__(self, config):
|
||||
self.config = config
|
||||
|
||||
@ -145,4 +146,11 @@ class PageAssembleModel:
|
||||
elements=elements, headers=headers, body=body
|
||||
)
|
||||
|
||||
# Remove page images (can be disabled)
|
||||
if self.config["images_scale"] is None:
|
||||
page._image_cache = {}
|
||||
|
||||
# Unload backend
|
||||
page._backend.unload()
|
||||
|
||||
yield page
|
||||
|
50
docling/models/page_preprocessing_model.py
Normal file
50
docling/models/page_preprocessing_model.py
Normal file
@ -0,0 +1,50 @@
|
||||
from typing import Iterable
|
||||
|
||||
from PIL import ImageDraw
|
||||
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.models.abstract_model import AbstractPageModel
|
||||
|
||||
|
||||
class PagePreprocessingModel(AbstractPageModel):
    """Page stage that renders page images and extracts the text cells.

    ``config`` must provide the key ``"images_scale"``. When its value is not
    ``None``, the page image is additionally rendered at that scale and the
    scale becomes the page's default image scale.
    """

    def __init__(self, config):
        self.config = config

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Preprocess every page of ``page_batch`` and yield it lazily."""
        for page in page_batch:
            page = self._populate_page_images(page)
            page = self._parse_page_cells(page)
            yield page

    def _populate_page_images(self, page: Page) -> Page:
        """Generate the page image(s) and store them in the page object."""
        # Default scale: puts the page image on the image cache.
        page.get_image(scale=1.0)

        # User requested scale.
        images_scale = self.config["images_scale"]
        if images_scale is not None:
            page._default_image_scale = images_scale
            # This will trigger storing the image in the internal cache.
            page.get_image(scale=images_scale)

        return page

    def _parse_page_cells(self, page: Page) -> Page:
        """Extract the text cells from the page backend into ``page.cells``."""
        page.cells = page._backend.get_text_cells()

        # For debugging, the extracted cells can be visualised with:
        #   self._draw_text_boxes(page.get_image(scale=1.0), page.cells)

        return page

    @staticmethod
    def _draw_text_boxes(image, cells):
        """Debug helper: outline every cell's bbox in red and show the image.

        Not called during normal processing. It used to be a closure that was
        re-created for each page inside ``_parse_page_cells``.
        """
        draw = ImageDraw.Draw(image)
        for c in cells:
            x0, y0, x1, y1 = c.bbox.as_tuple()
            draw.rectangle([(x0, y0), (x1, y1)], outline="red")
        image.show()
|
@ -9,9 +9,10 @@ from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredic
|
||||
from PIL import ImageDraw
|
||||
|
||||
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
|
||||
from docling.models.abstract_model import AbstractPageModel
|
||||
|
||||
|
||||
class TableStructureModel:
|
||||
class TableStructureModel(AbstractPageModel):
|
||||
def __init__(self, config):
|
||||
self.config = config
|
||||
self.do_cell_matching = config["do_cell_matching"]
|
||||
|
@ -1,17 +1,117 @@
|
||||
from pathlib import Path
|
||||
import functools
|
||||
import logging
|
||||
import time
|
||||
import traceback
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Callable, Iterable, List
|
||||
|
||||
from docling.datamodel.base_models import Page, PipelineOptions
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
DoclingComponentType,
|
||||
ErrorItem,
|
||||
Page,
|
||||
PipelineOptions,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult, InputDocument
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.utils.utils import chunkify
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BaseModelPipeline:
|
||||
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
|
||||
self.model_pipe: List[Callable] = []
|
||||
self.artifacts_path = artifacts_path
|
||||
class BaseModelPipeline(ABC):
|
||||
def __init__(self, pipeline_options: PipelineOptions):
|
||||
self.pipeline_options = pipeline_options
|
||||
self.model_pipe: List[Callable] = []
|
||||
|
||||
def apply(self, page_batch: Iterable[Page]) -> Iterable[Page]:
|
||||
@abstractmethod
|
||||
def execute(self, in_doc: InputDocument) -> ConversionResult:
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def assemble_document(
|
||||
self, in_doc: InputDocument, conv_res: ConversionResult
|
||||
) -> ConversionResult:
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def get_default_options(cls) -> PipelineOptions:
|
||||
pass
|
||||
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def is_backend_supported(cls, backend: AbstractDocumentBackend):
|
||||
pass
|
||||
|
||||
|
||||
class PaginatedModelPipeline(BaseModelPipeline):
    """Pipeline base class for documents that are converted page by page.

    Subclasses fill ``self.model_pipe`` with page-level callables and
    implement ``initialize_page`` and ``assemble_document``.
    """

    def apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        """Chain all models of ``self.model_pipe`` over ``page_batch``.

        Each model receives the iterable returned by the previous one; the
        final iterable is yielded page by page.
        """
        for model in self.model_pipe:
            page_batch = model(page_batch)

        yield from page_batch

    def execute(self, in_doc: InputDocument) -> ConversionResult:
        """Convert ``in_doc`` batch-wise and return the conversion result.

        Args:
            in_doc: The input document; its backend is unloaded once page
                processing has finished or failed.

        Returns:
            The conversion result with status ``SUCCESS``, or
            ``PARTIAL_SUCCESS`` if any page backend reports itself invalid.

        Raises:
            Exception: Any error raised during processing is logged, the
                status is set to ``FAILURE`` and the error is re-raised.
        """
        conv_res = ConversionResult(input=in_doc)

        _log.info(f"Processing document {in_doc.file.name}")

        conv_res.pages.extend(Page(page_no=i) for i in range(in_doc.page_count))

        try:
            try:
                # Iterate batches of pages (page_batch_size) in the doc
                for page_batch in chunkify(
                    conv_res.pages, settings.perf.page_batch_size
                ):
                    start_pb_time = time.time()

                    # 1. Initialise the page resources
                    init_pages = map(
                        functools.partial(self.initialize_page, in_doc), page_batch
                    )

                    # 2. Run pipeline stages
                    pipeline_pages = self.apply_on_pages(init_pages)

                    # map() and the generator above are lazy: drain them so
                    # that the stages actually run on every page.
                    for _ in pipeline_pages:
                        pass

                    end_pb_time = time.time() - start_pb_time
                    _log.info(
                        f"Finished converting page batch time={end_pb_time:.3f}"
                    )
            finally:
                # Free up mem resources of the document backend, also when a
                # batch failed (previously the backend leaked on errors).
                in_doc._backend.unload()

            conv_res = self.assemble_document(in_doc, conv_res)

            status = ConversionStatus.SUCCESS
            for page in conv_res.pages:
                if not page._backend.is_valid():
                    conv_res.errors.append(
                        ErrorItem(
                            component_type=DoclingComponentType.DOCUMENT_BACKEND,
                            module_name=type(page._backend).__name__,
                            error_message=f"Page {page.page_no} failed to parse.",
                        )
                    )
                    status = ConversionStatus.PARTIAL_SUCCESS

            conv_res.status = status

        except Exception:
            conv_res.status = ConversionStatus.FAILURE
            # format_exc() works on every Python 3 version, unlike the
            # single-argument traceback.format_exception(e) (3.10+ only).
            trace = traceback.format_exc()
            _log.error(
                f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
                f"{trace}"
            )
            # Bare raise keeps the original traceback untouched.
            raise

        return conv_res

    @abstractmethod
    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
        """Initialise and load resources for ``page`` of ``doc``."""
        pass
|
||||
|
57
docling/pipeline/simple_model_pipeline.py
Normal file
57
docling/pipeline/simple_model_pipeline.py
Normal file
@ -0,0 +1,57 @@
|
||||
import logging
|
||||
|
||||
from docling.backend.abstract_backend import (
|
||||
AbstractDocumentBackend,
|
||||
DeclarativeDocumentBackend,
|
||||
)
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
PdfPipelineOptions,
|
||||
PipelineOptions,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult, InputDocument
|
||||
from docling.pipeline.base_model_pipeline import BaseModelPipeline
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SimpleModelPipeline(BaseModelPipeline):
    """Pipeline for backends that can convert a document directly.

    The whole conversion is delegated to ``in_doc._backend.convert()``; the
    backend must be a ``DeclarativeDocumentBackend``.
    """

    def __init__(self, pipeline_options: PipelineOptions):
        # Annotated with the generic options type, matching
        # get_default_options(); this pipeline is not PDF-specific.
        super().__init__(pipeline_options)

    def execute(self, in_doc: InputDocument) -> ConversionResult:
        """Convert ``in_doc`` through its declarative backend.

        Returns:
            A result with status ``FAILURE`` if the document is invalid or its
            backend is not declarative, otherwise ``SUCCESS`` with the
            converted document stored in ``experimental``.
        """
        conv_res = ConversionResult(input=in_doc)

        _log.info(f"Processing document {in_doc.file.name}")

        if not in_doc.valid:
            _log.warning(f"Document {in_doc.file.name} is not valid, skipping.")
            conv_res.status = ConversionStatus.FAILURE
            return conv_res

        if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
            _log.warning(
                f"Backend {type(in_doc._backend).__name__} of document "
                f"{in_doc.file.name} is not a DeclarativeDocumentBackend."
            )
            conv_res.status = ConversionStatus.FAILURE
            return conv_res

        conv_res.experimental = in_doc._backend.convert()

        # Do other stuff with conv_res.experimental

        conv_res = self.assemble_document(in_doc, conv_res)

        conv_res.status = ConversionStatus.SUCCESS

        return conv_res

    def assemble_document(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        """Return ``conv_res`` unchanged; there is nothing to assemble here."""
        return conv_res

    @classmethod
    def get_default_options(cls) -> PipelineOptions:
        """Return a default-constructed ``PipelineOptions``."""
        return PipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend) -> bool:
        """Return True if ``backend`` is a ``DeclarativeDocumentBackend``."""
        return isinstance(backend, DeclarativeDocumentBackend)
|
@ -1,38 +0,0 @@
|
||||
from pathlib import Path
|
||||
|
||||
from docling.datamodel.base_models import PipelineOptions
|
||||
from docling.models.easyocr_model import EasyOcrModel
|
||||
from docling.models.layout_model import LayoutModel
|
||||
from docling.models.table_structure_model import TableStructureModel
|
||||
from docling.pipeline.base_model_pipeline import BaseModelPipeline
|
||||
|
||||
|
||||
class StandardModelPipeline(BaseModelPipeline):
|
||||
_layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
|
||||
_table_model_path = "model_artifacts/tableformer"
|
||||
|
||||
def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
|
||||
super().__init__(artifacts_path, pipeline_options)
|
||||
|
||||
self.model_pipe = [
|
||||
EasyOcrModel(
|
||||
config={
|
||||
"lang": ["fr", "de", "es", "en"],
|
||||
"enabled": pipeline_options.do_ocr,
|
||||
}
|
||||
),
|
||||
LayoutModel(
|
||||
config={
|
||||
"artifacts_path": artifacts_path
|
||||
/ StandardModelPipeline._layout_model_path
|
||||
}
|
||||
),
|
||||
TableStructureModel(
|
||||
config={
|
||||
"artifacts_path": artifacts_path
|
||||
/ StandardModelPipeline._table_model_path,
|
||||
"enabled": pipeline_options.do_table_structure,
|
||||
"do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
|
||||
}
|
||||
),
|
||||
]
|
108
docling/pipeline/standard_pdf_model_pipeline.py
Normal file
108
docling/pipeline/standard_pdf_model_pipeline.py
Normal file
@ -0,0 +1,108 @@
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.datamodel.base_models import AssembledUnit, Page, PdfPipelineOptions
|
||||
from docling.datamodel.document import ConversionResult, InputDocument
|
||||
from docling.models.ds_glm_model import GlmModel
|
||||
from docling.models.easyocr_model import EasyOcrModel
|
||||
from docling.models.layout_model import LayoutModel
|
||||
from docling.models.page_assemble_model import PageAssembleModel
|
||||
from docling.models.page_preprocessing_model import PagePreprocessingModel
|
||||
from docling.models.table_structure_model import TableStructureModel
|
||||
from docling.pipeline.base_model_pipeline import PaginatedModelPipeline
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class StandardPdfModelPipeline(PaginatedModelPipeline):
    """Standard page-wise PDF pipeline.

    Runs page preprocessing, OCR, layout analysis, table structure
    recognition and page assembly, then builds the document with the GLM
    model.
    """

    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
    _table_model_path = "model_artifacts/tableformer"

    def __init__(self, pipeline_options: PdfPipelineOptions):
        """Set up all models of the pipeline.

        Args:
            pipeline_options: PDF pipeline options. If ``artifacts_path`` is
                not set, the model artifacts are downloaded with
                ``download_models_hf()``.
        """
        super().__init__(pipeline_options)

        # Honour a configured artifacts path. The name used to be assigned
        # only in the download branch, so a configured path raised
        # UnboundLocalError on the line that follows.
        artifacts_path = pipeline_options.artifacts_path
        if not artifacts_path:
            artifacts_path = self.download_models_hf()

        # Normalise to Path so the "/" joins below also work for str input.
        self.artifacts_path = Path(artifacts_path)
        self.glm_model = GlmModel(config={})

        self.model_pipe = [
            PagePreprocessingModel(
                config={"images_scale": pipeline_options.images_scale}
            ),
            EasyOcrModel(
                config={
                    "lang": ["fr", "de", "es", "en"],
                    "enabled": pipeline_options.do_ocr,
                }
            ),
            LayoutModel(
                config={
                    "artifacts_path": self.artifacts_path
                    / StandardPdfModelPipeline._layout_model_path
                }
            ),
            TableStructureModel(
                config={
                    "artifacts_path": self.artifacts_path
                    / StandardPdfModelPipeline._table_model_path,
                    "enabled": pipeline_options.do_table_structure,
                    "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
                }
            ),
            PageAssembleModel(config={"images_scale": pipeline_options.images_scale}),
        ]

    @staticmethod
    def download_models_hf(
        local_dir: Optional[Path] = None, force: bool = False
    ) -> Path:
        """Download the ``ds4sd/docling-models`` snapshot and return its path.

        Args:
            local_dir: Passed to ``snapshot_download`` as ``local_dir``.
            force: Passed to ``snapshot_download`` as ``force_download``.
        """
        # Imported lazily so huggingface_hub is only needed when downloading.
        from huggingface_hub import snapshot_download

        download_path = snapshot_download(
            repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
        )

        return Path(download_path)

    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
        """Load the page backend from the document backend and set the size."""
        page._backend = doc._backend.load_page(page.page_no)
        page.size = page._backend.get_size()

        return page

    def assemble_document(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        """Merge the per-page assembled units and run the GLM model.

        Stores the merged ``AssembledUnit`` in ``conv_res.assembled`` and the
        two GLM model results in ``conv_res.output`` and
        ``conv_res.experimental``.
        """
        all_elements = []
        all_headers = []
        all_body = []

        for p in conv_res.pages:
            all_body.extend(p.assembled.body)
            all_headers.extend(p.assembled.headers)
            all_elements.extend(p.assembled.elements)

        conv_res.assembled = AssembledUnit(
            elements=all_elements, headers=all_headers, body=all_body
        )

        conv_res.output, conv_res.experimental = self.glm_model(conv_res)

        return conv_res

    @classmethod
    def get_default_options(cls) -> PdfPipelineOptions:
        """Return a default-constructed ``PdfPipelineOptions``."""
        return PdfPipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend) -> bool:
        """Return True if ``backend`` is a ``PdfDocumentBackend``."""
        return isinstance(backend, PdfDocumentBackend)
|
@ -6,9 +6,9 @@ from typing import Iterable
|
||||
|
||||
import yaml
|
||||
|
||||
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
||||
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.pdf_document_converter import PdfDocumentConverter
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@ -107,7 +107,11 @@ def main():
|
||||
# docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
|
||||
# input = DocumentConversionInput.from_streams(docs)
|
||||
|
||||
doc_converter = DocumentConverter()
|
||||
doc_converter = PdfDocumentConverter(
|
||||
pipeline_options=PdfPipelineOptions(),
|
||||
pdf_backend=DocumentConversionInput.DEFAULT_BACKEND,
|
||||
pipeline_cls=StandardModelPipeline,
|
||||
)
|
||||
|
||||
input = DocumentConversionInput.from_paths(input_doc_paths)
|
||||
|
||||
|
@ -6,9 +6,9 @@ from typing import Iterable
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, PipelineOptions
|
||||
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.pdf_document_converter import PdfDocumentConverter
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@ -93,12 +93,12 @@ def main():
|
||||
|
||||
# Docling Parse without OCR
|
||||
# -------------------------
|
||||
pipeline_options = PipelineOptions()
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.do_ocr = False
|
||||
pipeline_options.do_table_structure = True
|
||||
pipeline_options.table_structure_options.do_cell_matching = True
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
doc_converter = PdfDocumentConverter(
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=DoclingParseDocumentBackend,
|
||||
)
|
||||
|
@ -4,14 +4,14 @@ from pathlib import Path
|
||||
from typing import Tuple
|
||||
|
||||
from docling.datamodel.base_models import (
|
||||
AssembleOptions,
|
||||
ConversionStatus,
|
||||
FigureElement,
|
||||
PageElement,
|
||||
PdfPipelineOptions,
|
||||
Table,
|
||||
)
|
||||
from docling.datamodel.document import DocumentConversionInput
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.pdf_document_converter import PdfDocumentConverter
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@ -30,12 +30,12 @@ def main():
|
||||
|
||||
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
|
||||
# will destroy them for cleaning up memory.
|
||||
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
|
||||
# This is done by setting PipelineOptions.images_scale, which also defines the scale of images.
|
||||
# scale=1 corresponds to a standard 72 DPI image
|
||||
assemble_options = AssembleOptions()
|
||||
assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
|
||||
|
||||
doc_converter = DocumentConverter(assemble_options=assemble_options)
|
||||
doc_converter = PdfDocumentConverter(pipeline_options=pipeline_options)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
|
@ -5,9 +5,9 @@ from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from docling.datamodel.base_models import AssembleOptions, ConversionStatus
|
||||
from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
|
||||
from docling.datamodel.document import DocumentConversionInput
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.pdf_document_converter import PdfDocumentConverter
|
||||
from docling.utils.export import generate_multimodal_pages
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
@ -27,12 +27,12 @@ def main():
|
||||
|
||||
# Important: For operating with page images, we must keep them, otherwise the DocumentConverter
|
||||
# will destroy them for cleaning up memory.
|
||||
# This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
|
||||
# This is done by setting PipelineOptions.images_scale, which also defines the scale of images.
|
||||
# scale=1 corresponds to a standard 72 DPI image
|
||||
assemble_options = AssembleOptions()
|
||||
assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE
|
||||
|
||||
doc_converter = DocumentConverter(assemble_options=assemble_options)
|
||||
doc_converter = PdfDocumentConverter(pipeline_options=pipeline_options)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
|
@ -7,7 +7,7 @@ import pandas as pd
|
||||
|
||||
from docling.datamodel.base_models import ConversionStatus
|
||||
from docling.datamodel.document import DocumentConversionInput
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.pdf_document_converter import PdfDocumentConverter
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
@ -22,7 +22,7 @@ def main():
|
||||
|
||||
input_files = DocumentConversionInput.from_paths(input_doc_paths)
|
||||
|
||||
doc_converter = DocumentConverter()
|
||||
doc_converter = PdfDocumentConverter()
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.pdf_document_converter import PdfDocumentConverter
|
||||
|
||||
source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
|
||||
converter = DocumentConverter()
|
||||
converter = PdfDocumentConverter()
|
||||
doc = converter.convert_single(source)
|
||||
print(doc.render_as_markdown()) # output: ## Docling Technical Report [...]"
|
||||
|
41
examples/run_with_formats.py
Normal file
41
examples/run_with_formats.py
Normal file
@ -0,0 +1,41 @@
|
||||
from pathlib import Path
|
||||
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import (
|
||||
InputFormat,
|
||||
PdfPipelineOptions,
|
||||
PipelineOptions,
|
||||
)
|
||||
from docling.datamodel.document import DocumentConversionInput
|
||||
from docling.document_converter import DocumentConverter, FormatOption
|
||||
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
|
||||
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
|
||||
|
||||
# Sample documents of several formats, converted with one converter instance.
input_paths = [
    Path("tests/data/wiki_duck.html"),
    Path("tests/data/word_sample.docx"),
    Path("tests/data/powerpoint_sample.pptx"),
    Path("tests/data/2206.01062.pdf"),
]
# Named conv_input rather than input so the builtin input() is not shadowed.
conv_input = DocumentConversionInput.from_paths(input_paths)

# for defaults use:
doc_converter = DocumentConverter()

# to customize use:
# doc_converter = DocumentConverter(  # all of the below is optional, has internal defaults.
#     formats=[InputFormat.PDF, InputFormat.DOCX],
#     format_options={
#         InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend),
#         InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend)
#     }
# )

conv_results = doc_converter.convert(conv_input)

for res in conv_results:
    print(
        f"Document {res.input.file.name} converted with status {res.status}. Content:"
    )
    print(res.experimental.export_to_markdown())
|
BIN
tests/data/powerpoint_sample.pptx
Normal file
BIN
tests/data/powerpoint_sample.pptx
Normal file
Binary file not shown.
1311
tests/data/wiki_duck.html
Normal file
1311
tests/data/wiki_duck.html
Normal file
File diff suppressed because one or more lines are too long
BIN
tests/data/word_sample.docx
Normal file
BIN
tests/data/word_sample.docx
Normal file
Binary file not shown.
@ -2,9 +2,9 @@ from pathlib import Path
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import PipelineOptions
|
||||
from docling.datamodel.base_models import PdfPipelineOptions
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.pdf_document_converter import PdfDocumentConverter
|
||||
|
||||
from .verify_utils import verify_conversion_result
|
||||
|
||||
@ -23,12 +23,12 @@ def get_pdf_paths():
|
||||
|
||||
def get_converter():
|
||||
|
||||
pipeline_options = PipelineOptions()
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.do_ocr = False
|
||||
pipeline_options.do_table_structure = True
|
||||
pipeline_options.table_structure_options.do_cell_matching = True
|
||||
|
||||
converter = DocumentConverter(
|
||||
converter = PdfDocumentConverter(
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=DoclingParseDocumentBackend,
|
||||
)
|
||||
|
@ -5,9 +5,9 @@ import pytest
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import DocumentStream, PipelineOptions
|
||||
from docling.datamodel.base_models import DocumentStream, PdfPipelineOptions
|
||||
from docling.datamodel.document import ConversionResult, DocumentConversionInput
|
||||
from docling.document_converter import DocumentConverter
|
||||
from docling.pdf_document_converter import PdfDocumentConverter
|
||||
|
||||
from .verify_utils import verify_conversion_result
|
||||
|
||||
@ -21,12 +21,12 @@ def get_pdf_path():
|
||||
@pytest.fixture
|
||||
def converter():
|
||||
|
||||
pipeline_options = PipelineOptions()
|
||||
pipeline_options = PdfPipelineOptions()
|
||||
pipeline_options.do_ocr = False
|
||||
pipeline_options.do_table_structure = True
|
||||
pipeline_options.table_structure_options.do_cell_matching = True
|
||||
|
||||
converter = DocumentConverter(
|
||||
converter = PdfDocumentConverter(
|
||||
pipeline_options=pipeline_options,
|
||||
pdf_backend=DoclingParseDocumentBackend,
|
||||
)
|
||||
@ -34,7 +34,7 @@ def converter():
|
||||
return converter
|
||||
|
||||
|
||||
def test_convert_single(converter: DocumentConverter):
|
||||
def test_convert_single(converter: PdfDocumentConverter):
|
||||
|
||||
pdf_path = get_pdf_path()
|
||||
print(f"converting {pdf_path}")
|
||||
@ -43,7 +43,7 @@ def test_convert_single(converter: DocumentConverter):
|
||||
verify_conversion_result(input_path=pdf_path, doc_result=doc_result)
|
||||
|
||||
|
||||
def test_batch_path(converter: DocumentConverter):
|
||||
def test_batch_path(converter: PdfDocumentConverter):
|
||||
|
||||
pdf_path = get_pdf_path()
|
||||
print(f"converting {pdf_path}")
|
||||
@ -55,7 +55,7 @@ def test_batch_path(converter: DocumentConverter):
|
||||
verify_conversion_result(input_path=pdf_path, doc_result=doc_result)
|
||||
|
||||
|
||||
def test_batch_bytes(converter: DocumentConverter):
|
||||
def test_batch_bytes(converter: PdfDocumentConverter):
|
||||
|
||||
pdf_path = get_pdf_path()
|
||||
print(f"converting {pdf_path}")
|
||||
|
Loading…
Reference in New Issue
Block a user