Merge branch 'release_v3' of github.com:DS4SD/docling into cau/layout-postprocessing

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer
2024-12-04 14:21:09 +01:00
27 changed files with 1581 additions and 835 deletions

View File

@@ -1,5 +1,4 @@
from enum import Enum, auto
from io import BytesIO
from typing import TYPE_CHECKING, Dict, List, Optional, Union
from docling_core.types.doc import (
@@ -9,6 +8,9 @@ from docling_core.types.doc import (
Size,
TableCell,
)
from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from this location
DocumentStream,
)
from PIL.Image import Image
from pydantic import BaseModel, ConfigDict
@@ -22,6 +24,7 @@ class ConversionStatus(str, Enum):
FAILURE = auto()
SUCCESS = auto()
PARTIAL_SUCCESS = auto()
SKIPPED = auto()
class InputFormat(str, Enum):
@@ -93,6 +96,7 @@ class DoclingComponentType(str, Enum):
DOCUMENT_BACKEND = auto()
MODEL = auto()
DOC_ASSEMBLER = auto()
USER_INPUT = auto()
class ErrorItem(BaseModel):
@@ -214,10 +218,3 @@ class Page(BaseModel):
@property
def image(self) -> Optional[Image]:
return self.get_image(scale=self._default_image_scale)
class DocumentStream(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)
name: str
stream: BytesIO

View File

@@ -3,7 +3,7 @@ import re
from enum import Enum
from io import BytesIO
from pathlib import Path, PurePath
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
import filetype
from docling_core.types.doc import (
@@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
)
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
from docling_core.utils.file import resolve_file_source
from docling_core.utils.file import resolve_source_to_stream
from pydantic import BaseModel
from typing_extensions import deprecated
@@ -166,12 +166,6 @@ class InputDocument(BaseModel):
backend: Type[AbstractDocumentBackend],
path_or_stream: Union[BytesIO, Path],
) -> None:
if backend is None:
raise RuntimeError(
f"No backend configuration provided for file {self.file.name} with format {self.format}. "
f"Please check your format configuration on DocumentConverter."
)
self._backend = backend(self, path_or_stream=path_or_stream)
if not self._backend.is_valid():
self.valid = False
@@ -452,6 +446,25 @@ class ConversionResult(BaseModel):
return ds_doc
class _DummyBackend(AbstractDocumentBackend):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
def is_valid(self) -> bool:
return False
@classmethod
def supported_formats(cls) -> Set[InputFormat]:
return set()
@classmethod
def supports_pagination(cls) -> bool:
return False
def unload(self):
return super().unload()
class _DocumentConversionInput(BaseModel):
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
@@ -461,13 +474,14 @@ class _DocumentConversionInput(BaseModel):
self, format_options: Dict[InputFormat, "FormatOption"]
) -> Iterable[InputDocument]:
for item in self.path_or_stream_iterator:
obj = resolve_file_source(item) if isinstance(item, str) else item
obj = resolve_source_to_stream(item) if isinstance(item, str) else item
format = self._guess_format(obj)
backend: Type[AbstractDocumentBackend]
if format not in format_options.keys():
_log.info(
f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
_log.error(
f"Input document {obj.name} does not match any allowed format."
)
continue
backend = _DummyBackend
else:
backend = format_options[format].backend

View File

@@ -6,11 +6,15 @@ from pydantic import BaseModel, ConfigDict, Field
class TableFormerMode(str, Enum):
"""Modes for the TableFormer model."""
FAST = "fast"
ACCURATE = "accurate"
class TableStructureOptions(BaseModel):
"""Options for the table structure."""
do_cell_matching: bool = (
True
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
@@ -21,6 +25,8 @@ class TableStructureOptions(BaseModel):
class OcrOptions(BaseModel):
"""OCR options."""
kind: str
lang: List[str]
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
@@ -30,6 +36,8 @@ class OcrOptions(BaseModel):
class RapidOcrOptions(OcrOptions):
"""Options for the RapidOCR engine."""
kind: Literal["rapidocr"] = "rapidocr"
# English and chinese are the most commly used models and have been tested with RapidOCR.
@@ -66,6 +74,8 @@ class RapidOcrOptions(OcrOptions):
class EasyOcrOptions(OcrOptions):
"""Options for the EasyOCR engine."""
kind: Literal["easyocr"] = "easyocr"
lang: List[str] = ["fr", "de", "es", "en"]
use_gpu: bool = True # same default as easyocr.Reader
@@ -79,6 +89,8 @@ class EasyOcrOptions(OcrOptions):
class TesseractCliOcrOptions(OcrOptions):
"""Options for the TesseractCli engine."""
kind: Literal["tesseract"] = "tesseract"
lang: List[str] = ["fra", "deu", "spa", "eng"]
tesseract_cmd: str = "tesseract"
@@ -90,6 +102,8 @@ class TesseractCliOcrOptions(OcrOptions):
class TesseractOcrOptions(OcrOptions):
"""Options for the Tesseract engine."""
kind: Literal["tesserocr"] = "tesserocr"
lang: List[str] = ["fra", "deu", "spa", "eng"]
path: Optional[str] = None
@@ -100,6 +114,8 @@ class TesseractOcrOptions(OcrOptions):
class OcrMacOptions(OcrOptions):
"""Options for the Mac OCR engine."""
kind: Literal["ocrmac"] = "ocrmac"
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
recognition: str = "accurate"
@@ -111,12 +127,16 @@ class OcrMacOptions(OcrOptions):
class PipelineOptions(BaseModel):
"""Base pipeline options."""
create_legacy_output: bool = (
True # This defautl will be set to False on a future version of docling
)
class PdfPipelineOptions(PipelineOptions):
"""Options for the PDF pipeline."""
artifacts_path: Optional[Union[Path, str]] = None
do_table_structure: bool = True # True: perform table structure extraction
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text