mirror of https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00

Fundamental refactoring for multi-format support

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

This commit is contained in:
parent cd06d89c2a
commit 1fa7cd9855
@@ -67,11 +67,12 @@ pip install docling

 ### Convert a single document

 To convert individual PDF documents, use `convert_single()`, for example:

 ```python
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter

 source = "https://arxiv.org/pdf/2408.09869"  # PDF path or URL
-converter = DocumentConverter()
+converter = PdfDocumentConverter()
 result = converter.convert_single(source)
 print(result.render_as_markdown())  # output: "## Docling Technical Report[...]"
 print(result.render_as_doctags())  # output: "<document><title><page_1><loc_20>..."
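The hunk above only switches the PDF quickstart to the renamed `PdfDocumentConverter`. The commit's main addition is a format-aware `DocumentConverter` configured per input format (defined further down in this diff). A minimal usage sketch, assuming the class stays importable from `docling.document_converter` (the diff does not show the module path):

```python
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, FormatOption  # import path assumed
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline

# Choose which formats to accept and which pipeline class handles each of them.
converter = DocumentConverter(
    formats=[InputFormat.PDF, InputFormat.HTML],
    format_options={
        InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline),
        InputFormat.HTML: FormatOption(pipeline_cls=SimpleModelPipeline),
    },
)

# convert_single() survives the refactoring and still accepts a path or URL.
result = converter.convert_single("https://arxiv.org/pdf/2408.09869")
print(result.render_as_markdown())
```

In `convert()`, documents whose format is disabled or has no `FormatOption` entry simply come back with `ConversionStatus.FAILURE`; `convert_single()` turns that status into a `RuntimeError`, as the unchanged check later in this diff shows.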
@@ -1,13 +1,11 @@
 from abc import ABC, abstractmethod
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Iterable, Optional, Union
+from typing import Set, Union

-from docling_core.types.experimental import BoundingBox, Size
-from PIL import Image
+from docling_core.types.experimental import DoclingDocument

-if TYPE_CHECKING:
-    from docling.datamodel.base_models import Cell
+from docling.datamodel.base_models import InputFormat


 class AbstractDocumentBackend(ABC):
@@ -20,6 +18,11 @@ class AbstractDocumentBackend(ABC):
     def is_valid(self) -> bool:
         pass

+    @classmethod
+    @abstractmethod
+    def is_paginated(cls) -> bool:
+        pass
+
     @abstractmethod
     def unload(self):
         if isinstance(self.path_or_stream, BytesIO):
@@ -27,45 +30,19 @@ class AbstractDocumentBackend(ABC):

         self.path_or_stream = None

-
-class PdfPageBackend(ABC):
-
-    @abstractmethod
-    def get_text_in_rect(self, bbox: "BoundingBox") -> str:
-        pass
-
-    @abstractmethod
-    def get_text_cells(self) -> Iterable["Cell"]:
-        pass
-
-    @abstractmethod
-    def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
-        pass
-
-    @abstractmethod
-    def get_page_image(
-        self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
-    ) -> Image.Image:
-        pass
-
-    @abstractmethod
-    def get_size(self) -> "Size":
-        pass
-
-    @abstractmethod
-    def is_valid(self) -> bool:
-        pass
-
-    @abstractmethod
-    def unload(self):
-        pass
+    @classmethod
+    @abstractmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        pass


-class PdfDocumentBackend(AbstractDocumentBackend):
-    @abstractmethod
-    def load_page(self, page_no: int) -> PdfPageBackend:
-        pass
+class DeclarativeDocumentBackend(AbstractDocumentBackend):
+    """DeclarativeDocumentBackend.
+
+    A declarative document backend is a backend that can transform to DoclingDocument
+    straight without a recognition pipeline.
+    """

     @abstractmethod
-    def page_count(self) -> int:
+    def convert(self) -> DoclingDocument:
         pass
@@ -10,7 +10,7 @@ from docling_parse.docling_parse import pdf_parser
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage

-from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
+from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
 from docling.datamodel.base_models import Cell

 _log = logging.getLogger(__name__)
docling/backend/html_backend.py (new file)
@@ -0,0 +1,40 @@
+from io import BytesIO
+from pathlib import Path
+from typing import Set, Union
+
+from docling_core.types.experimental import (
+    DescriptionItem,
+    DocItemLabel,
+    DoclingDocument,
+)
+
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+
+
+class HTMLDocumentBackend(DeclarativeDocumentBackend):
+    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+        super().__init__(path_or_stream, document_hash)
+
+    def is_valid(self) -> bool:
+        return True
+
+    def is_paginated(cls) -> bool:
+        False
+
+    def unload(self):
+        if isinstance(self.path_or_stream, BytesIO):
+            self.path_or_stream.close()
+
+        self.path_or_stream = None
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        return {InputFormat.HTML}
+
+    def convert(self) -> DoclingDocument:
+
+        # access self.path_or_stream to load stuff
+        doc = DoclingDocument(description=DescriptionItem(), name="dummy")
+        doc.add_text(text="I am a HTML document.", label=DocItemLabel.TEXT)
+        return doc
docling/backend/mspowerpoint_backend.py (new file)
@@ -0,0 +1,38 @@
+from io import BytesIO
+from pathlib import Path
+from typing import Set, Union
+
+from docling_core.types.experimental import (
+    DescriptionItem,
+    DocItemLabel,
+    DoclingDocument,
+)
+
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+
+
+class MsPowerpointDocumentBackend(DeclarativeDocumentBackend):
+    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+        super().__init__(path_or_stream, document_hash)
+
+    def is_valid(self) -> bool:
+        return True
+
+    def is_paginated(cls) -> bool:
+        False
+
+    def unload(self):
+        if isinstance(self.path_or_stream, BytesIO):
+            self.path_or_stream.close()
+
+        self.path_or_stream = None
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        return {InputFormat.PPTX}
+
+    def convert(self) -> DoclingDocument:
+        doc = DoclingDocument(description=DescriptionItem(), name="dummy")
+        doc.add_text(text="I am a Powerpoint document.", label=DocItemLabel.TEXT)
+        return doc
docling/backend/msword_backend.py (new file)
@@ -0,0 +1,38 @@
+from io import BytesIO
+from pathlib import Path
+from typing import Set, Union
+
+from docling_core.types.experimental import (
+    DescriptionItem,
+    DocItemLabel,
+    DoclingDocument,
+)
+
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+
+
+class MsWordDocumentBackend(DeclarativeDocumentBackend):
+    def __init__(self, path_or_stream: Union[BytesIO, Path], document_hash: str):
+        super().__init__(path_or_stream, document_hash)
+
+    def is_valid(self) -> bool:
+        return True
+
+    def is_paginated(cls) -> bool:
+        False
+
+    def unload(self):
+        if isinstance(self.path_or_stream, BytesIO):
+            self.path_or_stream.close()
+
+        self.path_or_stream = None
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        return {InputFormat.DOCX}
+
+    def convert(self) -> DoclingDocument:
+        doc = DoclingDocument(description=DescriptionItem(), name="dummy")
+        doc.add_text(text="I am a Word document.", label=DocItemLabel.TEXT)
+        return doc
docling/backend/pdf_backend.py (new file)
@@ -0,0 +1,59 @@
+from abc import ABC, abstractmethod
+from typing import Iterable, Optional, Set
+
+from docling_core.types.experimental import BoundingBox, Size
+from PIL import Image
+
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.datamodel.base_models import Cell, InputFormat
+
+
+class PdfPageBackend(ABC):
+
+    @abstractmethod
+    def get_text_in_rect(self, bbox: "BoundingBox") -> str:
+        pass
+
+    @abstractmethod
+    def get_text_cells(self) -> Iterable["Cell"]:
+        pass
+
+    @abstractmethod
+    def get_bitmap_rects(self, float: int = 1) -> Iterable["BoundingBox"]:
+        pass
+
+    @abstractmethod
+    def get_page_image(
+        self, scale: float = 1, cropbox: Optional["BoundingBox"] = None
+    ) -> Image.Image:
+        pass
+
+    @abstractmethod
+    def get_size(self) -> "Size":
+        pass
+
+    @abstractmethod
+    def is_valid(self) -> bool:
+        pass
+
+    @abstractmethod
+    def unload(self):
+        pass
+
+
+class PdfDocumentBackend(AbstractDocumentBackend):
+    @abstractmethod
+    def load_page(self, page_no: int) -> PdfPageBackend:
+        pass
+
+    @abstractmethod
+    def page_count(self) -> int:
+        pass
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        return {InputFormat.PDF}
+
+    @classmethod
+    def is_paginated(cls) -> bool:
+        return True
@@ -8,10 +8,10 @@ import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
 from docling_core.types.experimental import BoundingBox, CoordOrigin, Size
 from PIL import Image, ImageDraw
-from pypdfium2 import PdfPage, PdfTextPage
+from pypdfium2 import PdfTextPage
 from pypdfium2._helpers.misc import PdfiumError

-from docling.backend.abstract_backend import PdfDocumentBackend, PdfPageBackend
+from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
 from docling.datamodel.base_models import Cell

 _log = logging.getLogger(__name__)
@@ -12,9 +12,9 @@ from docling_core.utils.file import resolve_file_source

 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
 from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter

 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
 warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -190,12 +190,12 @@ def convert(
         case _:
             raise RuntimeError(f"Unexpected backend type {backend}")

-    pipeline_options = PipelineOptions(
+    pipeline_options = PdfPipelineOptions(
        do_ocr=ocr,
        do_table_structure=True,
    )
    pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
-    doc_converter = DocumentConverter(
+    doc_converter = PdfDocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=pdf_backend,
    )
@@ -1,8 +1,8 @@
-import copy
 import warnings
 from enum import Enum, auto
 from io import BytesIO
-from typing import Annotated, Any, Dict, List, Optional, Tuple, Union
+from pathlib import Path
+from typing import Annotated, Dict, List, Optional, Union

 from docling_core.types.experimental import BoundingBox, Size
 from docling_core.types.experimental.document import BasePictureData, TableCell
@@ -11,8 +11,6 @@ from PIL.Image import Image
 from pydantic import BaseModel, ConfigDict, Field, model_validator
 from typing_extensions import Self

-from docling.backend.abstract_backend import PdfPageBackend
-

 class ConversionStatus(str, Enum):
     PENDING = auto()
@@ -30,13 +28,29 @@ class InputFormat(str, Enum):
     PDF = auto()


+FormatToMimeType = {
+    InputFormat.DOCX: {
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    },
+    InputFormat.PPTX: {
+        "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+    },
+    InputFormat.HTML: {"text/html", "application/xhtml+xml"},
+    InputFormat.IMAGE: {"image/png", "image/jpeg"},
+    InputFormat.PDF: {"application/pdf"},
+}
+MimeTypeToFormat = {
+    mime: fmt for fmt, mimes in FormatToMimeType.items() for mime in mimes
+}
+
+
 class DocInputType(str, Enum):
     PATH = auto()
     STREAM = auto()


 class DoclingComponentType(str, Enum):
-    PDF_BACKEND = auto()
+    DOCUMENT_BACKEND = auto()
     MODEL = auto()
     DOC_ASSEMBLER = auto()

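The `FormatToMimeType` / `MimeTypeToFormat` tables added here are what `InputDocument._init_doc()` (added to `docling/datamodel/document.py` below) uses to turn a sniffed MIME type into an `InputFormat`. A quick sketch of that lookup, with a hypothetical file name:

```python
import filetype

from docling.datamodel.base_models import InputFormat, MimeTypeToFormat

# "report.docx" is a placeholder path; this mirrors the detection added to InputDocument.
mime = filetype.guess_mime("report.docx")
fmt = MimeTypeToFormat.get(mime)  # InputFormat.DOCX, or None if the MIME type is unknown

if fmt is None:
    print("unsupported or undetected format")
else:
    print(f"detected format: {fmt}")
```

Note that `filetype.guess_mime()` returns `None` for plain HTML, which is why the new `InputDocument.__init__` below falls back on the `.html` suffix.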
@@ -128,13 +142,13 @@ class Page(BaseModel):
     model_config = ConfigDict(arbitrary_types_allowed=True)

     page_no: int
-    page_hash: Optional[str] = None
+    # page_hash: Optional[str] = None
     size: Optional[Size] = None
     cells: List[Cell] = []
     predictions: PagePredictions = PagePredictions()
     assembled: Optional[AssembledUnit] = None

-    _backend: Optional[PdfPageBackend] = (
+    _backend: Optional["PdfPageBackend"] = (
         None  # Internal PDF backend. By default it is cleared during assembling.
     )
     _default_image_scale: float = 1.0  # Default image scale for external usage.
@@ -170,14 +184,16 @@ class TableStructureOptions(BaseModel):
     )


-class PipelineOptions(BaseModel):
+class PipelineOptions(BaseModel): ...
+
+
+class PdfPipelineOptions(PipelineOptions):
+    artifacts_path: Optional[Union[Path, str]] = None
     do_table_structure: bool = True  # True: perform table structure extraction
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text

     table_structure_options: TableStructureOptions = TableStructureOptions()


-class AssembleOptions(BaseModel):
     keep_page_images: Annotated[
         bool,
         Field(
@@ -1,9 +1,10 @@
 import logging
 from io import BytesIO
 from pathlib import Path, PurePath
-from typing import ClassVar, Dict, Iterable, List, Optional, Tuple, Type, Union
+from typing import Dict, Iterable, List, Optional, Tuple, Type, Union

-from docling_core.types import BaseCell, BaseText
+import filetype
+from docling_core.types import BaseText
 from docling_core.types import Document as DsDocument
 from docling_core.types import DocumentDescription as DsDocumentDescription
 from docling_core.types import FileInfoObject as DsFileInfoObject
@@ -19,8 +20,11 @@ from docling_core.types.experimental import (
 from pydantic import BaseModel
 from typing_extensions import deprecated

-from docling.backend.abstract_backend import AbstractDocumentBackend, PdfDocumentBackend
+from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.html_backend import HTMLDocumentBackend
+from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
+from docling.backend.msword_backend import MsWordDocumentBackend
 from docling.datamodel.base_models import (
     AssembledUnit,
     ConversionStatus,
@@ -28,13 +32,14 @@ from docling.datamodel.base_models import (
     ErrorItem,
     FigureElement,
     InputFormat,
+    MimeTypeToFormat,
     Page,
     PageElement,
     Table,
     TextElement,
 )
 from docling.datamodel.settings import DocumentLimits
-from docling.utils.utils import create_file_hash
+from docling.utils.utils import create_file_hash, create_hash

 _log = logging.getLogger(__name__)

@@ -71,8 +76,9 @@ _EMPTY_DOCLING_DOC = DoclingDocument(

 _input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]] = {
     InputFormat.PDF: DoclingParseDocumentBackend,
-    InputFormat.DOCX: None,
-    InputFormat.PPTX: None,
+    InputFormat.HTML: HTMLDocumentBackend,
+    InputFormat.DOCX: MsWordDocumentBackend,
+    InputFormat.PPTX: MsPowerpointDocumentBackend,
     InputFormat.IMAGE: None,
 }

@@ -80,13 +86,14 @@ _input_format_default_backends: Dict[InputFormat, Type[AbstractDocumentBackend]]
 class InputDocument(BaseModel):
     file: PurePath = None
     document_hash: Optional[str] = None
-    valid: bool = False
+    valid: bool = True
     limits: DocumentLimits = DocumentLimits()
+    format: Optional[InputFormat] = None

     filesize: Optional[int] = None
-    page_count: Optional[int] = None
+    page_count: int = 0

-    _backend: PdfDocumentBackend = None  # Internal PDF backend used
+    _backend: AbstractDocumentBackend = None  # Internal PDF backend used

     def __init__(
         self,
@@ -94,27 +101,31 @@ class InputDocument(BaseModel):
         filename: Optional[str] = None,
         limits: Optional[DocumentLimits] = None,
         backend: Optional[Type[AbstractDocumentBackend]] = None,
+        format: Optional[InputFormat] = None,
     ):
         super().__init__()

-        if not backend:
-            backend = _input_format_default_backends[InputFormat.PDF]
-
         self.limits = limits or DocumentLimits()

         try:
             if isinstance(path_or_stream, Path):
+                mime = filetype.guess_mime(str(path_or_stream))
+                if mime is None:
+                    if path_or_stream.suffix == ".html":
+                        mime = "text/html"
+
                 self.file = path_or_stream
                 self.filesize = path_or_stream.stat().st_size
                 if self.filesize > self.limits.max_file_size:
                     self.valid = False
                 else:
                     self.document_hash = create_file_hash(path_or_stream)
-                    self._backend = backend(
-                        path_or_stream=path_or_stream, document_hash=self.document_hash
-                    )
+                    self._init_doc(backend, mime, path_or_stream)

             elif isinstance(path_or_stream, BytesIO):
+                mime = filetype.guess_mime(path_or_stream.read(8192))
+
                 self.file = PurePath(filename)
                 self.filesize = path_or_stream.getbuffer().nbytes
@@ -122,15 +133,15 @@ class InputDocument(BaseModel):
                     self.valid = False
                 else:
                     self.document_hash = create_file_hash(path_or_stream)
-                    self._backend = backend(
-                        path_or_stream=path_or_stream, document_hash=self.document_hash
-                    )
+                    self._init_doc(backend, mime, path_or_stream)

-            if self.document_hash and self._backend.page_count() > 0:
-                self.page_count = self._backend.page_count()
-
-                if self.page_count <= self.limits.max_num_pages:
-                    self.valid = True
+            # For paginated backends, check if the maximum page count is exceeded.
+            if self.valid and self._backend.is_valid():
+                if self._backend.is_paginated():
+                    self.page_count = self._backend.page_count()
+                    if not self.page_count <= self.limits.max_num_pages:
+                        self.valid = False

         except (FileNotFoundError, OSError) as e:
             _log.exception(
@@ -144,6 +155,27 @@ class InputDocument(BaseModel):
             )
             # raise

+    def _init_doc(
+        self,
+        backend: AbstractDocumentBackend,
+        mime: str,
+        path_or_stream: Union[BytesIO, Path],
+    ) -> None:
+        self.format = MimeTypeToFormat.get(mime)
+        if self.format is not None:
+            backend = backend or _input_format_default_backends.get(self.format)
+            if backend is None:
+                raise RuntimeError(
+                    f"Could not find suitable default backend for format: {self.format}"
+                )
+        if self.format is None or self.format not in backend.supported_formats():
+            # TODO decide if to raise exception here too.
+            self.valid = False
+        else:
+            self._backend = backend(
+                path_or_stream=path_or_stream, document_hash=self.document_hash
+            )
+

 @deprecated("Use `ConversionResult` instead.")
 class ConvertedDocument(BaseModel):
@@ -163,7 +195,11 @@ class ConvertedDocument(BaseModel):
         desc = DsDocumentDescription(logs=[])

         page_hashes = [
-            PageReference(hash=p.page_hash, page=p.page_no + 1, model="default")
+            PageReference(
+                hash=create_hash(self.input.document_hash + ":" + str(p.page_no)),
+                page=p.page_no + 1,
+                model="default",
+            )
             for p in self.pages
         ]

@@ -441,25 +477,21 @@ class DocumentConversionInput(BaseModel):
     _path_or_stream_iterator: Iterable[Union[Path, DocumentStream]] = None
     limits: Optional[DocumentLimits] = DocumentLimits()

-    DEFAULT_BACKEND: ClassVar = DoclingParseDocumentBackend
-
     def docs(
-        self, pdf_backend: Optional[Type[PdfDocumentBackend]] = None
+        self, backend: Optional[Type[AbstractDocumentBackend]] = None
     ) -> Iterable[InputDocument]:

-        pdf_backend = pdf_backend or DocumentConversionInput.DEFAULT_BACKEND
-
         for obj in self._path_or_stream_iterator:
             if isinstance(obj, Path):
                 yield InputDocument(
-                    path_or_stream=obj, limits=self.limits, backend=pdf_backend
+                    path_or_stream=obj, limits=self.limits, backend=backend
                 )
             elif isinstance(obj, DocumentStream):
                 yield InputDocument(
                     path_or_stream=obj.stream,
                     filename=obj.filename,
                     limits=self.limits,
-                    backend=pdf_backend,
+                    backend=backend,
                 )

     @classmethod
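With the `DEFAULT_BACKEND` class variable gone, `docs()` now accepts any `AbstractDocumentBackend` subclass instead of a PDF-only backend. A small sketch of forcing one backend for a batch of local files; it assumes the pre-existing `DocumentConversionInput.from_paths()` constructor (not shown in this hunk) and uses placeholder file names:

```python
from pathlib import Path

from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.document import DocumentConversionInput

# from_paths() is assumed from the existing API; the HTML files are placeholders.
doc_input = DocumentConversionInput.from_paths([Path("page1.html"), Path("page2.html")])
for in_doc in doc_input.docs(backend=HTMLDocumentBackend):
    print(in_doc.file, in_doc.format, in_doc.valid)
```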
@@ -1,81 +1,78 @@
-import functools
 import logging
 import tempfile
 import time
-import traceback
 from pathlib import Path
-from typing import Iterable, Optional, Type, Union
+from typing import Dict, Iterable, List, Optional, Type

 import requests
-from PIL import ImageDraw
-from pydantic import AnyHttpUrl, TypeAdapter, ValidationError
+from pydantic import AnyHttpUrl, BaseModel, ConfigDict, TypeAdapter, ValidationError

-from docling.backend.abstract_backend import PdfDocumentBackend
-from docling.datamodel.base_models import (
-    AssembledUnit,
-    AssembleOptions,
-    ConversionStatus,
-    DoclingComponentType,
-    ErrorItem,
-    Page,
-    PipelineOptions,
-)
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.datamodel.base_models import ConversionStatus, InputFormat, PipelineOptions
 from docling.datamodel.document import (
     ConversionResult,
     DocumentConversionInput,
     InputDocument,
 )
 from docling.datamodel.settings import settings
-from docling.models.ds_glm_model import GlmModel
-from docling.models.page_assemble_model import PageAssembleModel
 from docling.pipeline.base_model_pipeline import BaseModelPipeline
-from docling.pipeline.standard_model_pipeline import StandardModelPipeline
-from docling.utils.utils import chunkify, create_hash
+from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
+from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline
+from docling.utils.utils import chunkify

 _log = logging.getLogger(__name__)


-class DocumentConverter:
-    _default_download_filename = "file.pdf"
+class FormatOption(BaseModel):
+    pipeline_cls: Type[BaseModelPipeline]
+    pipeline_options: Optional[PipelineOptions] = None
+    backend: Optional[Type[AbstractDocumentBackend]]
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)

     def __init__(
         self,
-        artifacts_path: Optional[Union[Path, str]] = None,
-        pipeline_options: PipelineOptions = PipelineOptions(),
-        pdf_backend: Type[PdfDocumentBackend] = DocumentConversionInput.DEFAULT_BACKEND,
-        pipeline_cls: Type[BaseModelPipeline] = StandardModelPipeline,
-        assemble_options: AssembleOptions = AssembleOptions(),
+        pipeline_cls: Type[BaseModelPipeline],
+        pipeline_options: Optional[PipelineOptions] = None,
+        backend: Optional[Type[AbstractDocumentBackend]] = None,
     ):
-        if not artifacts_path:
-            artifacts_path = self.download_models_hf()
+        if pipeline_options is None:
+            pipeline_options = pipeline_cls.get_default_options()

-        artifacts_path = Path(artifacts_path)
-
-        self.model_pipeline = pipeline_cls(
-            artifacts_path=artifacts_path, pipeline_options=pipeline_options
+        super().__init__(
+            pipeline_cls=pipeline_cls,
+            pipeline_options=pipeline_options,
+            backend=backend,
         )

-        self.page_assemble_model = PageAssembleModel(config={})
-        self.glm_model = GlmModel(config={})
-        self.pdf_backend = pdf_backend
-        self.assemble_options = assemble_options

-    @staticmethod
-    def download_models_hf(
-        local_dir: Optional[Path] = None, force: bool = False
-    ) -> Path:
-        from huggingface_hub import snapshot_download
+_format_to_default_options = {
+    InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline),
+    InputFormat.PPTX: FormatOption(pipeline_cls=SimpleModelPipeline),
+    InputFormat.HTML: FormatOption(pipeline_cls=SimpleModelPipeline),
+    InputFormat.IMAGE: None,
+    InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline),
+}

-        download_path = snapshot_download(
-            repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
-        )

-        return Path(download_path)
+class DocumentConverter:
+    _default_download_filename = "file"
+
+    def __init__(
+        self,
+        formats: List[InputFormat] = [e for e in InputFormat],
+        format_options: Dict[InputFormat, FormatOption] = _format_to_default_options,
+    ):
+        self.formats = formats
+        self.format_to_options = format_options
+        self.initialized_pipelines: Dict[Type[BaseModelPipeline], BaseModelPipeline] = (
+            {}
+        )

     def convert(self, input: DocumentConversionInput) -> Iterable[ConversionResult]:

         for input_batch in chunkify(
-            input.docs(pdf_backend=self.pdf_backend), settings.perf.doc_batch_size
+            input.docs(), settings.perf.doc_batch_size  # pass format_options
         ):
             _log.info(f"Going to convert document batch...")
             # parallel processing only within input_batch
@@ -84,8 +81,8 @@ class DocumentConverter:
             # ) as pool:
             #   yield from pool.map(self.process_document, input_batch)

-            # Note: Pdfium backend is not thread-safe, thread pool usage was disabled.
-            yield from map(self._process_document, input_batch)
+            # Note: PDF backends are not thread-safe, thread pool usage was disabled.
+            yield from map(self.process_document, input_batch)

     def convert_single(self, source: Path | AnyHttpUrl | str) -> ConversionResult:
         """Convert a single document.
@@ -137,156 +134,42 @@ class DocumentConverter:
             raise RuntimeError(f"Conversion failed with status: {conv_res.status}")
         return conv_res

-    def _process_document(self, in_doc: InputDocument) -> ConversionResult:
-        start_doc_time = time.time()
-        conv_res = ConversionResult(input=in_doc)
-
-        _log.info(f"Processing document {in_doc.file.name}")
-
-        if not in_doc.valid:
-            conv_res.status = ConversionStatus.FAILURE
-            return conv_res
-
-        for i in range(0, in_doc.page_count):
-            conv_res.pages.append(Page(page_no=i))
-
-        all_assembled_pages = []
-
-        try:
-            # Iterate batches of pages (page_batch_size) in the doc
-            for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
-                start_pb_time = time.time()
-                # Pipeline
-
-                # 1. Initialise the page resources
-                init_pages = map(
-                    functools.partial(self._initialize_page, in_doc), page_batch
-                )
-
-                # 2. Populate page image
-                pages_with_images = map(
-                    functools.partial(self._populate_page_images, in_doc), init_pages
-                )
-
-                # 3. Populate programmatic page cells
-                pages_with_cells = map(
-                    functools.partial(self._parse_page_cells, in_doc),
-                    pages_with_images,
-                )
-
-                # 4. Run pipeline stages
-                pipeline_pages = self.model_pipeline.apply(pages_with_cells)
-
-                # 5. Assemble page elements (per page)
-                assembled_pages = self.page_assemble_model(pipeline_pages)
-
-                # exhaust assembled_pages
-                for assembled_page in assembled_pages:
-                    # Free up mem resources before moving on with next batch
-
-                    # Remove page images (can be disabled)
-                    if self.assemble_options.images_scale is None:
-                        assembled_page._image_cache = {}
-
-                    # Unload backend
-                    assembled_page._backend.unload()
-
-                    all_assembled_pages.append(assembled_page)
-
-                end_pb_time = time.time() - start_pb_time
-                _log.info(f"Finished converting page batch time={end_pb_time:.3f}")
-
-            # Free up mem resources of PDF backend
-            in_doc._backend.unload()
-
-            conv_res.pages = all_assembled_pages
-            self._assemble_doc(conv_res)
-
-            status = ConversionStatus.SUCCESS
-            for page in conv_res.pages:
-                if not page._backend.is_valid():
-                    conv_res.errors.append(
-                        ErrorItem(
-                            component_type=DoclingComponentType.PDF_BACKEND,
-                            module_name=type(page._backend).__name__,
-                            error_message=f"Page {page.page_no} failed to parse.",
-                        )
-                    )
-                    status = ConversionStatus.PARTIAL_SUCCESS
-
-            conv_res.status = status
-
-        except Exception as e:
-            conv_res.status = ConversionStatus.FAILURE
-            trace = "\n".join(traceback.format_exception(e))
-            _log.info(
-                f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
-                f"{trace}"
-            )
-
-        end_doc_time = time.time() - start_doc_time
-        _log.info(
-            f"Finished converting document time-pages={end_doc_time:.2f}/{in_doc.page_count}"
-        )
-
-        return conv_res
-
-    # Initialise and load resources for a page, before downstream steps (populate images, cells, ...)
-    def _initialize_page(self, doc: InputDocument, page: Page) -> Page:
-        page._backend = doc._backend.load_page(page.page_no)
-        page.size = page._backend.get_size()
-        page.page_hash = create_hash(doc.document_hash + ":" + str(page.page_no))
-
-        return page
-
-    # Generate the page image and store it in the page object
-    def _populate_page_images(self, doc: InputDocument, page: Page) -> Page:
-        # default scale
-        page.get_image(
-            scale=1.0
-        )  # puts the page image on the image cache at default scale
-
-        # user requested scales
-        if self.assemble_options.images_scale is not None:
-            page._default_image_scale = self.assemble_options.images_scale
-            page.get_image(
-                scale=self.assemble_options.images_scale
-            )  # this will trigger storing the image in the internal cache
-
-        return page
-
-    # Extract and populate the page cells and store it in the page object
-    def _parse_page_cells(self, doc: InputDocument, page: Page) -> Page:
-        page.cells = page._backend.get_text_cells()
-
-        # DEBUG code:
-        def draw_text_boxes(image, cells):
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.bbox.as_tuple()
-                draw.rectangle([(x0, y0), (x1, y1)], outline="red")
-            image.show()
-
-        # draw_text_boxes(page.get_image(scale=1.0), cells)
-
-        return page
-
-    def _assemble_doc(self, conv_res: ConversionResult):
-        all_elements = []
-        all_headers = []
-        all_body = []
-
-        for p in conv_res.pages:
-
-            for el in p.assembled.body:
-                all_body.append(el)
-            for el in p.assembled.headers:
-                all_headers.append(el)
-            for el in p.assembled.elements:
-                all_elements.append(el)
-
-        conv_res.assembled = AssembledUnit(
-            elements=all_elements, headers=all_headers, body=all_body
-        )
-
-        conv_res.output, conv_res.experimental = self.glm_model(conv_res)
+    def _get_pipeline(self, doc: InputDocument) -> Optional[BaseModelPipeline]:
+        pipeline_class = None
+        fopt = self.format_to_options.get(doc.format)
+        if fopt is None:
+            return None
+        else:
+            pipeline_class = fopt.pipeline_cls
+
+        if pipeline_class not in self.initialized_pipelines:
+            self.initialized_pipelines[pipeline_class] = pipeline_class(
+                pipeline_options=pipeline_class.get_default_options()
+            )
+        return self.initialized_pipelines[pipeline_class]
+
+    def process_document(self, in_doc: InputDocument) -> ConversionResult:
+        start_doc_time = time.time()
+
+        conv_res = self._execute_pipeline(in_doc)
+
+        end_doc_time = time.time() - start_doc_time
+        _log.info(f"Finished converting document in {end_doc_time:.2f} seconds.")
+
+        return conv_res
+
+    def _execute_pipeline(self, in_doc: InputDocument) -> ConversionResult:
+        if in_doc.valid and in_doc.format in self.formats:
+            pipeline = self._get_pipeline(in_doc)
+            if pipeline is None:  # Can't find a default pipeline. Should this raise?
+                conv_res = ConversionResult(input=in_doc)
+                conv_res.status = ConversionStatus.FAILURE
+                return conv_res
+
+            conv_res = pipeline.execute(in_doc)
+        else:  # invalid doc or not of desired format
+            conv_res = ConversionResult(input=in_doc)
+            conv_res.status = ConversionStatus.FAILURE
+            # TODO add error log why it failed.
+
+        return conv_res
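One detail of `_get_pipeline()` above that is easy to miss: pipelines are cached by pipeline class, not by format, and are instantiated lazily with `get_default_options()`. A short sketch (import path of `DocumentConverter` assumed, as before):

```python
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter  # import path assumed

converter = DocumentConverter()  # all formats, default per-format options

# DOCX, PPTX and HTML all default to SimpleModelPipeline, so they will share one
# cached pipeline instance once the first such document is processed.
docx_opt = converter.format_to_options[InputFormat.DOCX]
html_opt = converter.format_to_options[InputFormat.HTML]
print(docx_opt.pipeline_cls is html_opt.pipeline_cls)  # True
print(converter.initialized_pipelines)  # {} until the first document is converted
```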
docling/models/abstract_model.py (new file)
@@ -0,0 +1,10 @@
+from abc import ABC, abstractmethod
+from typing import Iterable
+
+from docling.datamodel.base_models import Page
+
+
+class AbstractPageModel(ABC):
+    @abstractmethod
+    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+        pass
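`AbstractPageModel` is the single-method interface that the existing page models (`BaseOcrModel`, `LayoutModel`, `TableStructureModel`, `PageAssembleModel`) and the new `PagePreprocessingModel` are adapted to in the hunks below. A minimal custom stage, as a sketch with a made-up class name:

```python
from typing import Iterable

from docling.datamodel.base_models import Page
from docling.models.abstract_model import AbstractPageModel


class PageCountingModel(AbstractPageModel):
    """Hypothetical pass-through stage that just counts the pages it sees."""

    def __init__(self):
        self.seen = 0

    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
        for page in page_batch:
            self.seen += 1
            yield page  # hand the page on unchanged to the next stage
```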
@@ -3,7 +3,6 @@ import logging
 from abc import abstractmethod
 from typing import Iterable, List, Tuple

-import numpy
 import numpy as np
 from docling_core.types.experimental import BoundingBox, CoordOrigin
 from PIL import Image, ImageDraw
@@ -11,11 +10,12 @@ from rtree import index
 from scipy.ndimage import find_objects, label

 from docling.datamodel.base_models import OcrCell, Page
+from docling.models.abstract_model import AbstractPageModel

 _log = logging.getLogger(__name__)


-class BaseOcrModel:
+class BaseOcrModel(AbstractPageModel):
     def __init__(self, config):
         self.config = config
         self.enabled = config["enabled"]
@@ -16,12 +16,13 @@ from docling.datamodel.base_models import (
     LayoutPrediction,
     Page,
 )
+from docling.models.abstract_model import AbstractPageModel
 from docling.utils import layout_utils as lu

 _log = logging.getLogger(__name__)


-class LayoutModel:
+class LayoutModel(AbstractPageModel):

     TEXT_ELEM_LABELS = [
         DocItemLabel.TEXT,
@@ -10,12 +10,13 @@ from docling.datamodel.base_models import (
     Table,
     TextElement,
 )
+from docling.models.abstract_model import AbstractPageModel
 from docling.models.layout_model import LayoutModel

 _log = logging.getLogger(__name__)


-class PageAssembleModel:
+class PageAssembleModel(AbstractPageModel):
     def __init__(self, config):
         self.config = config

@@ -145,4 +146,11 @@ class PageAssembleModel:
                     elements=elements, headers=headers, body=body
                 )

+                # Remove page images (can be disabled)
+                if self.config["images_scale"] is None:
+                    page._image_cache = {}
+
+                # Unload backend
+                page._backend.unload()
+
                 yield page
docling/models/page_preprocessing_model.py (new file)
@@ -0,0 +1,50 @@
+from typing import Iterable
+
+from PIL import ImageDraw
+
+from docling.datamodel.base_models import Page
+from docling.models.abstract_model import AbstractPageModel
+
+
+class PagePreprocessingModel(AbstractPageModel):
+    def __init__(self, config):
+        self.config = config
+
+    def __call__(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+        for page in page_batch:
+            page = self._populate_page_images(page)
+            page = self._parse_page_cells(page)
+            yield page
+
+    # Generate the page image and store it in the page object
+    def _populate_page_images(self, page: Page) -> Page:
+        # default scale
+        page.get_image(
+            scale=1.0
+        )  # puts the page image on the image cache at default scale
+
+        images_scale = self.config["images_scale"]
+        # user requested scales
+        if images_scale is not None:
+            page._default_image_scale = images_scale
+            page.get_image(
+                scale=images_scale
+            )  # this will trigger storing the image in the internal cache
+
+        return page
+
+    # Extract and populate the page cells and store it in the page object
+    def _parse_page_cells(self, page: Page) -> Page:
+        page.cells = page._backend.get_text_cells()
+
+        # DEBUG code:
+        def draw_text_boxes(image, cells):
+            draw = ImageDraw.Draw(image)
+            for c in cells:
+                x0, y0, x1, y1 = c.bbox.as_tuple()
+                draw.rectangle([(x0, y0), (x1, y1)], outline="red")
+            image.show()
+
+        # draw_text_boxes(page.get_image(scale=1.0), cells)
+
+        return page
@@ -9,9 +9,10 @@ from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
 from PIL import ImageDraw

 from docling.datamodel.base_models import Page, Table, TableStructurePrediction
+from docling.models.abstract_model import AbstractPageModel


-class TableStructureModel:
+class TableStructureModel(AbstractPageModel):
     def __init__(self, config):
         self.config = config
         self.do_cell_matching = config["do_cell_matching"]
@@ -1,17 +1,117 @@
-from pathlib import Path
+import functools
+import logging
+import time
+import traceback
+from abc import ABC, abstractmethod
 from typing import Callable, Iterable, List

-from docling.datamodel.base_models import Page, PipelineOptions
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.datamodel.base_models import (
+    ConversionStatus,
+    DoclingComponentType,
+    ErrorItem,
+    Page,
+    PipelineOptions,
+)
+from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.settings import settings
+from docling.utils.utils import chunkify
+
+_log = logging.getLogger(__name__)


-class BaseModelPipeline:
-    def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
-        self.model_pipe: List[Callable] = []
-        self.artifacts_path = artifacts_path
+class BaseModelPipeline(ABC):
+    def __init__(self, pipeline_options: PipelineOptions):
         self.pipeline_options = pipeline_options
+        self.model_pipe: List[Callable] = []

-    def apply(self, page_batch: Iterable[Page]) -> Iterable[Page]:
+    @abstractmethod
+    def execute(self, in_doc: InputDocument) -> ConversionResult:
+        pass
+
+    @abstractmethod
+    def assemble_document(
+        self, in_doc: InputDocument, conv_res: ConversionResult
+    ) -> ConversionResult:
+        pass
+
+    @classmethod
+    @abstractmethod
+    def get_default_options(cls) -> PipelineOptions:
+        pass
+
+    @classmethod
+    @abstractmethod
+    def is_backend_supported(cls, backend: AbstractDocumentBackend):
+        pass
+
+
+class PaginatedModelPipeline(BaseModelPipeline):
+
+    def apply_on_pages(self, page_batch: Iterable[Page]) -> Iterable[Page]:
         for model in self.model_pipe:
             page_batch = model(page_batch)

         yield from page_batch
+
+    def execute(self, in_doc: InputDocument) -> ConversionResult:
+        conv_res = ConversionResult(input=in_doc)
+
+        _log.info(f"Processing document {in_doc.file.name}")
+
+        for i in range(0, in_doc.page_count):
+            conv_res.pages.append(Page(page_no=i))
+
+        try:
+            # Iterate batches of pages (page_batch_size) in the doc
+            for page_batch in chunkify(conv_res.pages, settings.perf.page_batch_size):
+                start_pb_time = time.time()
+
+                # 1. Initialise the page resources
+                init_pages = map(
+                    functools.partial(self.initialize_page, in_doc), page_batch
+                )
+
+                # 2. Run pipeline stages
+                pipeline_pages = self.apply_on_pages(init_pages)
+
+                for p in pipeline_pages:
+                    pass
+
+                end_pb_time = time.time() - start_pb_time
+                _log.info(f"Finished converting page batch time={end_pb_time:.3f}")
+
+            # Free up mem resources of PDF backend
+            in_doc._backend.unload()
+
+            conv_res = self.assemble_document(in_doc, conv_res)
+
+            status = ConversionStatus.SUCCESS
+            for page in conv_res.pages:
+                if not page._backend.is_valid():
+                    conv_res.errors.append(
+                        ErrorItem(
+                            component_type=DoclingComponentType.DOCUMENT_BACKEND,
+                            module_name=type(page._backend).__name__,
+                            error_message=f"Page {page.page_no} failed to parse.",
+                        )
+                    )
+                    status = ConversionStatus.PARTIAL_SUCCESS
+
+            conv_res.status = status
+
+        except Exception as e:
+            conv_res.status = ConversionStatus.FAILURE
+            trace = "\n".join(traceback.format_exception(e))
+            _log.info(
+                f"Encountered an error during conversion of document {in_doc.document_hash}:\n"
+                f"{trace}"
+            )
+            raise e
+
+        return conv_res
+
+    # Initialise and load resources for a page
+    @abstractmethod
+    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
+        pass
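`PaginatedModelPipeline.execute()` above owns the page batching and error handling; a concrete subclass only has to fill in the abstract methods. A hypothetical skeleton (this is not the real `StandardPdfModelPipeline` added below; `initialize_page` here just mirrors what the old `DocumentConverter._initialize_page` did):

```python
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import Page, PdfPipelineOptions, PipelineOptions
from docling.datamodel.document import ConversionResult, InputDocument
from docling.pipeline.base_model_pipeline import PaginatedModelPipeline


class MinimalPdfPipeline(PaginatedModelPipeline):
    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
        # Paginated backends expose load_page() / get_size(), see pdf_backend.py above.
        page._backend = doc._backend.load_page(page.page_no)
        page.size = page._backend.get_size()
        return page

    def assemble_document(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        return conv_res  # no-op assembly for the sketch

    @classmethod
    def get_default_options(cls) -> PipelineOptions:
        return PdfPipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
        return isinstance(backend, PdfDocumentBackend)
```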
docling/pipeline/simple_model_pipeline.py (new file)
@@ -0,0 +1,57 @@
+import logging
+
+from docling.backend.abstract_backend import (
+    AbstractDocumentBackend,
+    DeclarativeDocumentBackend,
+)
+from docling.datamodel.base_models import (
+    ConversionStatus,
+    PdfPipelineOptions,
+    PipelineOptions,
+)
+from docling.datamodel.document import ConversionResult, InputDocument
+from docling.pipeline.base_model_pipeline import BaseModelPipeline
+
+_log = logging.getLogger(__name__)
+
+
+class SimpleModelPipeline(BaseModelPipeline):
+
+    def __init__(self, pipeline_options: PdfPipelineOptions):
+        super().__init__(pipeline_options)
+
+    def execute(self, in_doc: InputDocument) -> ConversionResult:
+        conv_res = ConversionResult(input=in_doc)
+
+        _log.info(f"Processing document {in_doc.file.name}")
+
+        if not in_doc.valid:
+            conv_res.status = ConversionStatus.FAILURE
+            return conv_res
+
+        if not isinstance(in_doc._backend, DeclarativeDocumentBackend):
+            conv_res.status = ConversionStatus.FAILURE
+            return conv_res
+
+        conv_res.experimental = in_doc._backend.convert()
+
+        # Do other stuff with conv_res.experimental
+
+        conv_res = self.assemble_document(in_doc, conv_res)
+
+        conv_res.status = ConversionStatus.SUCCESS
+
+        return conv_res
+
+    def assemble_document(
+        self, in_doc: InputDocument, conv_res: ConversionResult
+    ) -> ConversionResult:
+        return conv_res
+
+    @classmethod
+    def get_default_options(cls) -> PipelineOptions:
+        return PipelineOptions()
+
+    @classmethod
+    def is_backend_supported(cls, backend: AbstractDocumentBackend):
+        return isinstance(backend, DeclarativeDocumentBackend)
(deleted file)
@@ -1,38 +0,0 @@
-from pathlib import Path
-
-from docling.datamodel.base_models import PipelineOptions
-from docling.models.easyocr_model import EasyOcrModel
-from docling.models.layout_model import LayoutModel
-from docling.models.table_structure_model import TableStructureModel
-from docling.pipeline.base_model_pipeline import BaseModelPipeline
-
-
-class StandardModelPipeline(BaseModelPipeline):
-    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
-    _table_model_path = "model_artifacts/tableformer"
-
-    def __init__(self, artifacts_path: Path, pipeline_options: PipelineOptions):
-        super().__init__(artifacts_path, pipeline_options)
-
-        self.model_pipe = [
-            EasyOcrModel(
-                config={
-                    "lang": ["fr", "de", "es", "en"],
-                    "enabled": pipeline_options.do_ocr,
-                }
-            ),
-            LayoutModel(
-                config={
-                    "artifacts_path": artifacts_path
-                    / StandardModelPipeline._layout_model_path
-                }
-            ),
-            TableStructureModel(
-                config={
-                    "artifacts_path": artifacts_path
-                    / StandardModelPipeline._table_model_path,
-                    "enabled": pipeline_options.do_table_structure,
-                    "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
-                }
-            ),
-        ]
108  docling/pipeline/standard_pdf_model_pipeline.py  Normal file
@@ -0,0 +1,108 @@
import logging
from pathlib import Path
from typing import Optional

from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import AssembledUnit, Page, PdfPipelineOptions
from docling.datamodel.document import ConversionResult, InputDocument
from docling.models.ds_glm_model import GlmModel
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.page_assemble_model import PageAssembleModel
from docling.models.page_preprocessing_model import PagePreprocessingModel
from docling.models.table_structure_model import TableStructureModel
from docling.pipeline.base_model_pipeline import PaginatedModelPipeline

_log = logging.getLogger(__name__)


class StandardPdfModelPipeline(PaginatedModelPipeline):
    _layout_model_path = "model_artifacts/layout/beehive_v0.0.5"
    _table_model_path = "model_artifacts/tableformer"

    def __init__(self, pipeline_options: PdfPipelineOptions):
        super().__init__(pipeline_options)

        if not pipeline_options.artifacts_path:
            artifacts_path = self.download_models_hf()

        self.artifacts_path = Path(artifacts_path)
        self.glm_model = GlmModel(config={})

        self.model_pipe = [
            PagePreprocessingModel(
                config={"images_scale": pipeline_options.images_scale}
            ),
            EasyOcrModel(
                config={
                    "lang": ["fr", "de", "es", "en"],
                    "enabled": pipeline_options.do_ocr,
                }
            ),
            LayoutModel(
                config={
                    "artifacts_path": artifacts_path
                    / StandardPdfModelPipeline._layout_model_path
                }
            ),
            TableStructureModel(
                config={
                    "artifacts_path": artifacts_path
                    / StandardPdfModelPipeline._table_model_path,
                    "enabled": pipeline_options.do_table_structure,
                    "do_cell_matching": pipeline_options.table_structure_options.do_cell_matching,
                }
            ),
            PageAssembleModel(config={"images_scale": pipeline_options.images_scale}),
        ]

    @staticmethod
    def download_models_hf(
        local_dir: Optional[Path] = None, force: bool = False
    ) -> Path:
        from huggingface_hub import snapshot_download

        download_path = snapshot_download(
            repo_id="ds4sd/docling-models", force_download=force, local_dir=local_dir
        )

        return Path(download_path)

    def initialize_page(self, doc: InputDocument, page: Page) -> Page:
        page._backend = doc._backend.load_page(page.page_no)
        page.size = page._backend.get_size()

        return page

    def assemble_document(
        self, in_doc: InputDocument, conv_res: ConversionResult
    ) -> ConversionResult:
        all_elements = []
        all_headers = []
        all_body = []

        for p in conv_res.pages:

            for el in p.assembled.body:
                all_body.append(el)
            for el in p.assembled.headers:
                all_headers.append(el)
            for el in p.assembled.elements:
                all_elements.append(el)

        conv_res.assembled = AssembledUnit(
            elements=all_elements, headers=all_headers, body=all_body
        )

        conv_res.output, conv_res.experimental = self.glm_model(conv_res)

        return conv_res

    @classmethod
    def get_default_options(cls) -> PdfPipelineOptions:
        return PdfPipelineOptions()

    @classmethod
    def is_backend_supported(cls, backend: AbstractDocumentBackend):
        return isinstance(backend, PdfDocumentBackend)
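As a small usage sketch (not part of the diff), the model weights could be pre-fetched explicitly before the first conversion; with default options the constructor above then resolves the same `ds4sd/docling-models` snapshot from the local Hugging Face cache.

```python
from docling.datamodel.base_models import PdfPipelineOptions
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline

# Pre-download the layout and TableFormer weights; snapshot_download caches them,
# so the implicit download in __init__ becomes a cheap cache lookup.
StandardPdfModelPipeline.download_models_hf()

pipeline = StandardPdfModelPipeline(PdfPipelineOptions())
```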
@@ -6,9 +6,9 @@ from typing import Iterable

import yaml

-from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter

_log = logging.getLogger(__name__)
@@ -107,7 +107,11 @@ def main():
    # docs = [DocumentStream(filename="my_doc.pdf", stream=buf)]
    # input = DocumentConversionInput.from_streams(docs)

-    doc_converter = DocumentConverter()
+    doc_converter = PdfDocumentConverter(
+        pipeline_options=PdfPipelineOptions(),
+        pdf_backend=DocumentConversionInput.DEFAULT_BACKEND,
+        pipeline_cls=StandardModelPipeline,
+    )

    input = DocumentConversionInput.from_paths(input_doc_paths)
@@ -6,9 +6,9 @@ from typing import Iterable

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter

_log = logging.getLogger(__name__)
@@ -93,12 +93,12 @@ def main():
    # Docling Parse without OCR
    # -------------------------
-    pipeline_options = PipelineOptions()
+    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

-    doc_converter = DocumentConverter(
+    doc_converter = PdfDocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )
@@ -4,14 +4,14 @@ from pathlib import Path
from typing import Tuple

from docling.datamodel.base_models import (
-    AssembleOptions,
    ConversionStatus,
    FigureElement,
    PageElement,
+    PdfPipelineOptions,
    Table,
)
from docling.datamodel.document import DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter

_log = logging.getLogger(__name__)
@@ -30,12 +30,12 @@ def main():
    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
    # will destroy them for cleaning up memory.
-    # This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
+    # This is done by setting PipelineOptions.images_scale, which also defines the scale of images.
    # scale=1 correspond of a standard 72 DPI image
-    assemble_options = AssembleOptions()
+    pipeline_options = PdfPipelineOptions()
-    assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
+    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE

-    doc_converter = DocumentConverter(assemble_options=assemble_options)
+    doc_converter = PdfDocumentConverter(pipeline_options=pipeline_options)

    start_time = time.time()
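Put differently, the examples that previously kept page images via `AssembleOptions` now do so through the pipeline options. A minimal sketch of the new pattern, with names taken from the hunks above (the scale value is an assumption):

```python
from docling.datamodel.base_models import PdfPipelineOptions
from docling.pdf_document_converter import PdfDocumentConverter

IMAGE_RESOLUTION_SCALE = 2.0  # assumption: 2x the standard 72 DPI baseline

pipeline_options = PdfPipelineOptions()
pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE  # keep page images at this scale

doc_converter = PdfDocumentConverter(pipeline_options=pipeline_options)
```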
@@ -5,9 +5,9 @@ from pathlib import Path

import pandas as pd

-from docling.datamodel.base_models import AssembleOptions, ConversionStatus
+from docling.datamodel.base_models import ConversionStatus, PdfPipelineOptions
from docling.datamodel.document import DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter
from docling.utils.export import generate_multimodal_pages

_log = logging.getLogger(__name__)
@@ -27,12 +27,12 @@ def main():
    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
    # will destroy them for cleaning up memory.
-    # This is done by setting AssembleOptions.images_scale, which also defines the scale of images.
+    # This is done by setting PipelineOptions.images_scale, which also defines the scale of images.
    # scale=1 correspond of a standard 72 DPI image
-    assemble_options = AssembleOptions()
+    pipeline_options = PdfPipelineOptions()
-    assemble_options.images_scale = IMAGE_RESOLUTION_SCALE
+    pipeline_options.images_scale = IMAGE_RESOLUTION_SCALE

-    doc_converter = DocumentConverter(assemble_options=assemble_options)
+    doc_converter = PdfDocumentConverter(pipeline_options=pipeline_options)

    start_time = time.time()
@@ -7,7 +7,7 @@ import pandas as pd

from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter

_log = logging.getLogger(__name__)
@@ -22,7 +22,7 @@ def main():
    input_files = DocumentConversionInput.from_paths(input_doc_paths)

-    doc_converter = DocumentConverter()
+    doc_converter = PdfDocumentConverter()

    start_time = time.time()
@@ -1,6 +1,6 @@
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter

source = "https://arxiv.org/pdf/2408.09869" # PDF path or URL
-converter = DocumentConverter()
+converter = PdfDocumentConverter()
doc = converter.convert_single(source)
print(doc.render_as_markdown()) # output: ## Docling Technical Report [...]"
41  examples/run_with_formats.py  Normal file
@@ -0,0 +1,41 @@
from pathlib import Path

from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import (
    InputFormat,
    PdfPipelineOptions,
    PipelineOptions,
)
from docling.datamodel.document import DocumentConversionInput
from docling.document_converter import DocumentConverter, FormatOption
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline

input_paths = [
    Path("tests/data/wiki_duck.html"),
    Path("tests/data/word_sample.docx"),
    Path("tests/data/powerpoint_sample.pptx"),
    Path("tests/data/2206.01062.pdf"),
]
input = DocumentConversionInput.from_paths(input_paths)

# for defaults use:
doc_converter = DocumentConverter()

# to customize use:
# doc_converter = DocumentConverter(  # all of the below is optional, has internal defaults.
#     formats=[InputFormat.PDF, InputFormat.DOCX],
#     format_options={
#         InputFormat.PDF: FormatOption(pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend),
#         InputFormat.DOCX: FormatOption(pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend)
#     }
# )

conv_results = doc_converter.convert(input)

for res in conv_results:
    print(
        f"Document {res.input.file.name} converted with status {res.status}. Content:"
    )
    print(res.experimental.export_to_markdown())
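For reference, the customization path sketched in the comments above would look roughly like this when enabled; the names all come from the example itself, so treat it as a sketch of the commented-out form rather than an additional API.

```python
from docling.backend.msword_backend import MsWordDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, FormatOption
from docling.pipeline.simple_model_pipeline import SimpleModelPipeline
from docling.pipeline.standard_pdf_model_pipeline import StandardPdfModelPipeline

# Restrict the converter to PDF and DOCX, and pin a pipeline/backend pair per format;
# anything not overridden falls back to the converter's internal defaults.
doc_converter = DocumentConverter(
    formats=[InputFormat.PDF, InputFormat.DOCX],
    format_options={
        InputFormat.PDF: FormatOption(
            pipeline_cls=StandardPdfModelPipeline, backend=PyPdfiumDocumentBackend
        ),
        InputFormat.DOCX: FormatOption(
            pipeline_cls=SimpleModelPipeline, backend=MsWordDocumentBackend
        ),
    },
)
```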
BIN  tests/data/powerpoint_sample.pptx  Normal file
Binary file not shown.
1311  tests/data/wiki_duck.html  Normal file
File diff suppressed because one or more lines are too long
BIN  tests/data/word_sample.docx  Normal file
Binary file not shown.
@@ -2,9 +2,9 @@ from pathlib import Path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import PipelineOptions
+from docling.datamodel.base_models import PdfPipelineOptions
from docling.datamodel.document import ConversionResult
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter

from .verify_utils import verify_conversion_result
@@ -23,12 +23,12 @@ def get_pdf_paths():

def get_converter():

-    pipeline_options = PipelineOptions()
+    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

-    converter = DocumentConverter(
+    converter = PdfDocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )
@@ -5,9 +5,9 @@ import pytest

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import DocumentStream, PipelineOptions
+from docling.datamodel.base_models import DocumentStream, PdfPipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
-from docling.document_converter import DocumentConverter
+from docling.pdf_document_converter import PdfDocumentConverter

from .verify_utils import verify_conversion_result
@@ -21,12 +21,12 @@ def get_pdf_path():
@pytest.fixture
def converter():

-    pipeline_options = PipelineOptions()
+    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

-    converter = DocumentConverter(
+    converter = PdfDocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )
@@ -34,7 +34,7 @@ def converter():
    return converter


-def test_convert_single(converter: DocumentConverter):
+def test_convert_single(converter: PdfDocumentConverter):

    pdf_path = get_pdf_path()
    print(f"converting {pdf_path}")
@@ -43,7 +43,7 @@ def test_convert_single(converter: DocumentConverter):
    verify_conversion_result(input_path=pdf_path, doc_result=doc_result)


-def test_batch_path(converter: DocumentConverter):
+def test_batch_path(converter: PdfDocumentConverter):

    pdf_path = get_pdf_path()
    print(f"converting {pdf_path}")
@@ -55,7 +55,7 @@ def test_batch_path(converter: DocumentConverter):
    verify_conversion_result(input_path=pdf_path, doc_result=doc_result)


-def test_batch_bytes(converter: DocumentConverter):
+def test_batch_bytes(converter: PdfDocumentConverter):

    pdf_path = get_pdf_path()
    print(f"converting {pdf_path}")