mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-15 16:18:22 +00:00
Merge branch 'release_v3' of github.com:DS4SD/docling into cau/layout-postprocessing
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -1,5 +1,4 @@
|
||||
from enum import Enum, auto
|
||||
from io import BytesIO
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
@@ -9,6 +8,9 @@ from docling_core.types.doc import (
|
||||
Size,
|
||||
TableCell,
|
||||
)
|
||||
from docling_core.types.io import ( # DO ΝΟΤ REMOVE; explicitly exposed from this location
|
||||
DocumentStream,
|
||||
)
|
||||
from PIL.Image import Image
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
|
||||
@@ -22,6 +24,7 @@ class ConversionStatus(str, Enum):
|
||||
FAILURE = auto()
|
||||
SUCCESS = auto()
|
||||
PARTIAL_SUCCESS = auto()
|
||||
SKIPPED = auto()
|
||||
|
||||
|
||||
class InputFormat(str, Enum):
|
||||
@@ -93,6 +96,7 @@ class DoclingComponentType(str, Enum):
|
||||
DOCUMENT_BACKEND = auto()
|
||||
MODEL = auto()
|
||||
DOC_ASSEMBLER = auto()
|
||||
USER_INPUT = auto()
|
||||
|
||||
|
||||
class ErrorItem(BaseModel):
|
||||
@@ -214,10 +218,3 @@ class Page(BaseModel):
|
||||
@property
|
||||
def image(self) -> Optional[Image]:
|
||||
return self.get_image(scale=self._default_image_scale)
|
||||
|
||||
|
||||
class DocumentStream(BaseModel):
|
||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||
|
||||
name: str
|
||||
stream: BytesIO
|
||||
|
||||
@@ -3,7 +3,7 @@ import re
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
from pathlib import Path, PurePath
|
||||
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
|
||||
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
|
||||
|
||||
import filetype
|
||||
from docling_core.types.doc import (
|
||||
@@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
|
||||
)
|
||||
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
||||
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
||||
from docling_core.utils.file import resolve_file_source
|
||||
from docling_core.utils.file import resolve_source_to_stream
|
||||
from pydantic import BaseModel
|
||||
from typing_extensions import deprecated
|
||||
|
||||
@@ -166,12 +166,6 @@ class InputDocument(BaseModel):
|
||||
backend: Type[AbstractDocumentBackend],
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
) -> None:
|
||||
if backend is None:
|
||||
raise RuntimeError(
|
||||
f"No backend configuration provided for file {self.file.name} with format {self.format}. "
|
||||
f"Please check your format configuration on DocumentConverter."
|
||||
)
|
||||
|
||||
self._backend = backend(self, path_or_stream=path_or_stream)
|
||||
if not self._backend.is_valid():
|
||||
self.valid = False
|
||||
@@ -452,6 +446,25 @@ class ConversionResult(BaseModel):
|
||||
return ds_doc
|
||||
|
||||
|
||||
class _DummyBackend(AbstractDocumentBackend):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def is_valid(self) -> bool:
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def supported_formats(cls) -> Set[InputFormat]:
|
||||
return set()
|
||||
|
||||
@classmethod
|
||||
def supports_pagination(cls) -> bool:
|
||||
return False
|
||||
|
||||
def unload(self):
|
||||
return super().unload()
|
||||
|
||||
|
||||
class _DocumentConversionInput(BaseModel):
|
||||
|
||||
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
||||
@@ -461,13 +474,14 @@ class _DocumentConversionInput(BaseModel):
|
||||
self, format_options: Dict[InputFormat, "FormatOption"]
|
||||
) -> Iterable[InputDocument]:
|
||||
for item in self.path_or_stream_iterator:
|
||||
obj = resolve_file_source(item) if isinstance(item, str) else item
|
||||
obj = resolve_source_to_stream(item) if isinstance(item, str) else item
|
||||
format = self._guess_format(obj)
|
||||
backend: Type[AbstractDocumentBackend]
|
||||
if format not in format_options.keys():
|
||||
_log.info(
|
||||
f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
|
||||
_log.error(
|
||||
f"Input document {obj.name} does not match any allowed format."
|
||||
)
|
||||
continue
|
||||
backend = _DummyBackend
|
||||
else:
|
||||
backend = format_options[format].backend
|
||||
|
||||
|
||||
@@ -6,11 +6,15 @@ from pydantic import BaseModel, ConfigDict, Field
|
||||
|
||||
|
||||
class TableFormerMode(str, Enum):
|
||||
"""Modes for the TableFormer model."""
|
||||
|
||||
FAST = "fast"
|
||||
ACCURATE = "accurate"
|
||||
|
||||
|
||||
class TableStructureOptions(BaseModel):
|
||||
"""Options for the table structure."""
|
||||
|
||||
do_cell_matching: bool = (
|
||||
True
|
||||
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
|
||||
@@ -21,6 +25,8 @@ class TableStructureOptions(BaseModel):
|
||||
|
||||
|
||||
class OcrOptions(BaseModel):
|
||||
"""OCR options."""
|
||||
|
||||
kind: str
|
||||
lang: List[str]
|
||||
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
|
||||
@@ -30,6 +36,8 @@ class OcrOptions(BaseModel):
|
||||
|
||||
|
||||
class RapidOcrOptions(OcrOptions):
|
||||
"""Options for the RapidOCR engine."""
|
||||
|
||||
kind: Literal["rapidocr"] = "rapidocr"
|
||||
|
||||
# English and chinese are the most commly used models and have been tested with RapidOCR.
|
||||
@@ -66,6 +74,8 @@ class RapidOcrOptions(OcrOptions):
|
||||
|
||||
|
||||
class EasyOcrOptions(OcrOptions):
|
||||
"""Options for the EasyOCR engine."""
|
||||
|
||||
kind: Literal["easyocr"] = "easyocr"
|
||||
lang: List[str] = ["fr", "de", "es", "en"]
|
||||
use_gpu: bool = True # same default as easyocr.Reader
|
||||
@@ -79,6 +89,8 @@ class EasyOcrOptions(OcrOptions):
|
||||
|
||||
|
||||
class TesseractCliOcrOptions(OcrOptions):
|
||||
"""Options for the TesseractCli engine."""
|
||||
|
||||
kind: Literal["tesseract"] = "tesseract"
|
||||
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
||||
tesseract_cmd: str = "tesseract"
|
||||
@@ -90,6 +102,8 @@ class TesseractCliOcrOptions(OcrOptions):
|
||||
|
||||
|
||||
class TesseractOcrOptions(OcrOptions):
|
||||
"""Options for the Tesseract engine."""
|
||||
|
||||
kind: Literal["tesserocr"] = "tesserocr"
|
||||
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
||||
path: Optional[str] = None
|
||||
@@ -100,6 +114,8 @@ class TesseractOcrOptions(OcrOptions):
|
||||
|
||||
|
||||
class OcrMacOptions(OcrOptions):
|
||||
"""Options for the Mac OCR engine."""
|
||||
|
||||
kind: Literal["ocrmac"] = "ocrmac"
|
||||
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
|
||||
recognition: str = "accurate"
|
||||
@@ -111,12 +127,16 @@ class OcrMacOptions(OcrOptions):
|
||||
|
||||
|
||||
class PipelineOptions(BaseModel):
|
||||
"""Base pipeline options."""
|
||||
|
||||
create_legacy_output: bool = (
|
||||
True # This defautl will be set to False on a future version of docling
|
||||
)
|
||||
|
||||
|
||||
class PdfPipelineOptions(PipelineOptions):
|
||||
"""Options for the PDF pipeline."""
|
||||
|
||||
artifacts_path: Optional[Union[Path, str]] = None
|
||||
do_table_structure: bool = True # True: perform table structure extraction
|
||||
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
||||
|
||||
Reference in New Issue
Block a user