mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-13 23:28:21 +00:00
Merge branch 'release_v3' of github.com:DS4SD/docling into cau/layout-postprocessing
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -2,6 +2,7 @@ import importlib
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import tempfile
|
||||
import time
|
||||
import warnings
|
||||
from enum import Enum
|
||||
@@ -9,7 +10,7 @@ from pathlib import Path
|
||||
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
||||
|
||||
import typer
|
||||
from docling_core.utils.file import resolve_file_source
|
||||
from docling_core.utils.file import resolve_source_to_path
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
@@ -32,6 +33,7 @@ from docling.datamodel.pipeline_options import (
|
||||
TesseractCliOcrOptions,
|
||||
TesseractOcrOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||
|
||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
||||
@@ -212,6 +214,24 @@ def convert(
|
||||
help="Set the verbosity level. -v for info logging, -vv for debug logging.",
|
||||
),
|
||||
] = 0,
|
||||
debug_visualize_cells: Annotated[
|
||||
bool,
|
||||
typer.Option(..., help="Enable debug output which visualizes the PDF cells"),
|
||||
] = False,
|
||||
debug_visualize_ocr: Annotated[
|
||||
bool,
|
||||
typer.Option(..., help="Enable debug output which visualizes the OCR cells"),
|
||||
] = False,
|
||||
debug_visualize_layout: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
..., help="Enable debug output which visualizes the layout clusters"
|
||||
),
|
||||
] = False,
|
||||
debug_visualize_tables: Annotated[
|
||||
bool,
|
||||
typer.Option(..., help="Enable debug output which visualizes the table cells"),
|
||||
] = False,
|
||||
version: Annotated[
|
||||
Optional[bool],
|
||||
typer.Option(
|
||||
@@ -229,98 +249,106 @@ def convert(
|
||||
elif verbose == 2:
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
settings.debug.visualize_cells = debug_visualize_cells
|
||||
settings.debug.visualize_layout = debug_visualize_layout
|
||||
settings.debug.visualize_tables = debug_visualize_tables
|
||||
settings.debug.visualize_ocr = debug_visualize_ocr
|
||||
|
||||
if from_formats is None:
|
||||
from_formats = [e for e in InputFormat]
|
||||
|
||||
input_doc_paths: List[Path] = []
|
||||
for src in input_sources:
|
||||
source = resolve_file_source(source=src)
|
||||
if not source.exists():
|
||||
err_console.print(
|
||||
f"[red]Error: The input file {source} does not exist.[/red]"
|
||||
)
|
||||
raise typer.Abort()
|
||||
elif source.is_dir():
|
||||
for fmt in from_formats:
|
||||
for ext in FormatToExtensions[fmt]:
|
||||
input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
|
||||
input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
|
||||
with tempfile.TemporaryDirectory() as tempdir:
|
||||
input_doc_paths: List[Path] = []
|
||||
for src in input_sources:
|
||||
source = resolve_source_to_path(source=src, workdir=Path(tempdir))
|
||||
if not source.exists():
|
||||
err_console.print(
|
||||
f"[red]Error: The input file {source} does not exist.[/red]"
|
||||
)
|
||||
raise typer.Abort()
|
||||
elif source.is_dir():
|
||||
for fmt in from_formats:
|
||||
for ext in FormatToExtensions[fmt]:
|
||||
input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
|
||||
input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
|
||||
else:
|
||||
input_doc_paths.append(source)
|
||||
|
||||
if to_formats is None:
|
||||
to_formats = [OutputFormat.MARKDOWN]
|
||||
|
||||
export_json = OutputFormat.JSON in to_formats
|
||||
export_md = OutputFormat.MARKDOWN in to_formats
|
||||
export_txt = OutputFormat.TEXT in to_formats
|
||||
export_doctags = OutputFormat.DOCTAGS in to_formats
|
||||
|
||||
if ocr_engine == OcrEngine.EASYOCR:
|
||||
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
|
||||
elif ocr_engine == OcrEngine.TESSERACT_CLI:
|
||||
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
|
||||
elif ocr_engine == OcrEngine.TESSERACT:
|
||||
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
|
||||
elif ocr_engine == OcrEngine.OCRMAC:
|
||||
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
|
||||
elif ocr_engine == OcrEngine.RAPIDOCR:
|
||||
ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
|
||||
else:
|
||||
input_doc_paths.append(source)
|
||||
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
||||
|
||||
if to_formats is None:
|
||||
to_formats = [OutputFormat.MARKDOWN]
|
||||
ocr_lang_list = _split_list(ocr_lang)
|
||||
if ocr_lang_list is not None:
|
||||
ocr_options.lang = ocr_lang_list
|
||||
|
||||
export_json = OutputFormat.JSON in to_formats
|
||||
export_md = OutputFormat.MARKDOWN in to_formats
|
||||
export_txt = OutputFormat.TEXT in to_formats
|
||||
export_doctags = OutputFormat.DOCTAGS in to_formats
|
||||
|
||||
if ocr_engine == OcrEngine.EASYOCR:
|
||||
ocr_options: OcrOptions = EasyOcrOptions(force_full_page_ocr=force_ocr)
|
||||
elif ocr_engine == OcrEngine.TESSERACT_CLI:
|
||||
ocr_options = TesseractCliOcrOptions(force_full_page_ocr=force_ocr)
|
||||
elif ocr_engine == OcrEngine.TESSERACT:
|
||||
ocr_options = TesseractOcrOptions(force_full_page_ocr=force_ocr)
|
||||
elif ocr_engine == OcrEngine.OCRMAC:
|
||||
ocr_options = OcrMacOptions(force_full_page_ocr=force_ocr)
|
||||
elif ocr_engine == OcrEngine.RAPIDOCR:
|
||||
ocr_options = RapidOcrOptions(force_full_page_ocr=force_ocr)
|
||||
else:
|
||||
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
||||
|
||||
ocr_lang_list = _split_list(ocr_lang)
|
||||
if ocr_lang_list is not None:
|
||||
ocr_options.lang = ocr_lang_list
|
||||
|
||||
pipeline_options = PdfPipelineOptions(
|
||||
do_ocr=ocr,
|
||||
ocr_options=ocr_options,
|
||||
do_table_structure=True,
|
||||
)
|
||||
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
||||
pipeline_options.table_structure_options.mode = table_mode
|
||||
|
||||
if artifacts_path is not None:
|
||||
pipeline_options.artifacts_path = artifacts_path
|
||||
|
||||
if pdf_backend == PdfBackend.DLPARSE_V1:
|
||||
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
||||
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
||||
backend = DoclingParseV2DocumentBackend
|
||||
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
||||
backend = PyPdfiumDocumentBackend
|
||||
else:
|
||||
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
||||
|
||||
format_options: Dict[InputFormat, FormatOption] = {
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_options=pipeline_options,
|
||||
backend=backend, # pdf_backend
|
||||
pipeline_options = PdfPipelineOptions(
|
||||
do_ocr=ocr,
|
||||
ocr_options=ocr_options,
|
||||
do_table_structure=True,
|
||||
)
|
||||
}
|
||||
doc_converter = DocumentConverter(
|
||||
allowed_formats=from_formats,
|
||||
format_options=format_options,
|
||||
)
|
||||
pipeline_options.table_structure_options.do_cell_matching = (
|
||||
True # do_cell_matching
|
||||
)
|
||||
pipeline_options.table_structure_options.mode = table_mode
|
||||
|
||||
start_time = time.time()
|
||||
if artifacts_path is not None:
|
||||
pipeline_options.artifacts_path = artifacts_path
|
||||
|
||||
conv_results = doc_converter.convert_all(
|
||||
input_doc_paths, raises_on_error=abort_on_error
|
||||
)
|
||||
if pdf_backend == PdfBackend.DLPARSE_V1:
|
||||
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
||||
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
||||
backend = DoclingParseV2DocumentBackend
|
||||
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
||||
backend = PyPdfiumDocumentBackend
|
||||
else:
|
||||
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
||||
|
||||
output.mkdir(parents=True, exist_ok=True)
|
||||
export_documents(
|
||||
conv_results,
|
||||
output_dir=output,
|
||||
export_json=export_json,
|
||||
export_md=export_md,
|
||||
export_txt=export_txt,
|
||||
export_doctags=export_doctags,
|
||||
)
|
||||
format_options: Dict[InputFormat, FormatOption] = {
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_options=pipeline_options,
|
||||
backend=backend, # pdf_backend
|
||||
)
|
||||
}
|
||||
doc_converter = DocumentConverter(
|
||||
allowed_formats=from_formats,
|
||||
format_options=format_options,
|
||||
)
|
||||
|
||||
end_time = time.time() - start_time
|
||||
start_time = time.time()
|
||||
|
||||
conv_results = doc_converter.convert_all(
|
||||
input_doc_paths, raises_on_error=abort_on_error
|
||||
)
|
||||
|
||||
output.mkdir(parents=True, exist_ok=True)
|
||||
export_documents(
|
||||
conv_results,
|
||||
output_dir=output,
|
||||
export_json=export_json,
|
||||
export_md=export_md,
|
||||
export_txt=export_txt,
|
||||
export_doctags=export_doctags,
|
||||
)
|
||||
|
||||
end_time = time.time() - start_time
|
||||
|
||||
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
from enum import Enum, auto
|
||||
from io import BytesIO
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
@@ -9,6 +8,9 @@ from docling_core.types.doc import (
|
||||
Size,
|
||||
TableCell,
|
||||
)
|
||||
from docling_core.types.io import ( # DO NOT REMOVE; explicitly exposed from this location
|
||||
DocumentStream,
|
||||
)
|
||||
from PIL.Image import Image
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
|
||||
@@ -22,6 +24,7 @@ class ConversionStatus(str, Enum):
|
||||
FAILURE = auto()
|
||||
SUCCESS = auto()
|
||||
PARTIAL_SUCCESS = auto()
|
||||
SKIPPED = auto()
|
||||
|
||||
|
||||
class InputFormat(str, Enum):
|
||||
@@ -93,6 +96,7 @@ class DoclingComponentType(str, Enum):
|
||||
DOCUMENT_BACKEND = auto()
|
||||
MODEL = auto()
|
||||
DOC_ASSEMBLER = auto()
|
||||
USER_INPUT = auto()
|
||||
|
||||
|
||||
class ErrorItem(BaseModel):
|
||||
@@ -214,10 +218,3 @@ class Page(BaseModel):
|
||||
@property
|
||||
def image(self) -> Optional[Image]:
|
||||
return self.get_image(scale=self._default_image_scale)
|
||||
|
||||
|
||||
class DocumentStream(BaseModel):
|
||||
model_config = ConfigDict(arbitrary_types_allowed=True)
|
||||
|
||||
name: str
|
||||
stream: BytesIO
|
||||
|
||||
@@ -3,7 +3,7 @@ import re
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
from pathlib import Path, PurePath
|
||||
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Type, Union
|
||||
from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Set, Type, Union
|
||||
|
||||
import filetype
|
||||
from docling_core.types.doc import (
|
||||
@@ -32,7 +32,7 @@ from docling_core.types.legacy_doc.document import (
|
||||
)
|
||||
from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
|
||||
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
|
||||
from docling_core.utils.file import resolve_file_source
|
||||
from docling_core.utils.file import resolve_source_to_stream
|
||||
from pydantic import BaseModel
|
||||
from typing_extensions import deprecated
|
||||
|
||||
@@ -166,12 +166,6 @@ class InputDocument(BaseModel):
|
||||
backend: Type[AbstractDocumentBackend],
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
) -> None:
|
||||
if backend is None:
|
||||
raise RuntimeError(
|
||||
f"No backend configuration provided for file {self.file.name} with format {self.format}. "
|
||||
f"Please check your format configuration on DocumentConverter."
|
||||
)
|
||||
|
||||
self._backend = backend(self, path_or_stream=path_or_stream)
|
||||
if not self._backend.is_valid():
|
||||
self.valid = False
|
||||
@@ -452,6 +446,25 @@ class ConversionResult(BaseModel):
|
||||
return ds_doc
|
||||
|
||||
|
||||
class _DummyBackend(AbstractDocumentBackend):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
def is_valid(self) -> bool:
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def supported_formats(cls) -> Set[InputFormat]:
|
||||
return set()
|
||||
|
||||
@classmethod
|
||||
def supports_pagination(cls) -> bool:
|
||||
return False
|
||||
|
||||
def unload(self):
|
||||
return super().unload()
|
||||
|
||||
|
||||
class _DocumentConversionInput(BaseModel):
|
||||
|
||||
path_or_stream_iterator: Iterable[Union[Path, str, DocumentStream]]
|
||||
@@ -461,13 +474,14 @@ class _DocumentConversionInput(BaseModel):
|
||||
self, format_options: Dict[InputFormat, "FormatOption"]
|
||||
) -> Iterable[InputDocument]:
|
||||
for item in self.path_or_stream_iterator:
|
||||
obj = resolve_file_source(item) if isinstance(item, str) else item
|
||||
obj = resolve_source_to_stream(item) if isinstance(item, str) else item
|
||||
format = self._guess_format(obj)
|
||||
backend: Type[AbstractDocumentBackend]
|
||||
if format not in format_options.keys():
|
||||
_log.info(
|
||||
f"Skipping input document {obj.name} because it isn't matching any of the allowed formats."
|
||||
_log.error(
|
||||
f"Input document {obj.name} does not match any allowed format."
|
||||
)
|
||||
continue
|
||||
backend = _DummyBackend
|
||||
else:
|
||||
backend = format_options[format].backend
|
||||
|
||||
|
||||
@@ -6,11 +6,15 @@ from pydantic import BaseModel, ConfigDict, Field
|
||||
|
||||
|
||||
class TableFormerMode(str, Enum):
|
||||
"""Modes for the TableFormer model."""
|
||||
|
||||
FAST = "fast"
|
||||
ACCURATE = "accurate"
|
||||
|
||||
|
||||
class TableStructureOptions(BaseModel):
|
||||
"""Options for the table structure."""
|
||||
|
||||
do_cell_matching: bool = (
|
||||
True
|
||||
# True: Matches predictions back to PDF cells. Can break table output if PDF cells
|
||||
@@ -21,6 +25,8 @@ class TableStructureOptions(BaseModel):
|
||||
|
||||
|
||||
class OcrOptions(BaseModel):
|
||||
"""OCR options."""
|
||||
|
||||
kind: str
|
||||
lang: List[str]
|
||||
force_full_page_ocr: bool = False # If enabled a full page OCR is always applied
|
||||
@@ -30,6 +36,8 @@ class OcrOptions(BaseModel):
|
||||
|
||||
|
||||
class RapidOcrOptions(OcrOptions):
|
||||
"""Options for the RapidOCR engine."""
|
||||
|
||||
kind: Literal["rapidocr"] = "rapidocr"
|
||||
|
||||
# English and Chinese are the most commonly used models and have been tested with RapidOCR.
|
||||
@@ -66,6 +74,8 @@ class RapidOcrOptions(OcrOptions):
|
||||
|
||||
|
||||
class EasyOcrOptions(OcrOptions):
|
||||
"""Options for the EasyOCR engine."""
|
||||
|
||||
kind: Literal["easyocr"] = "easyocr"
|
||||
lang: List[str] = ["fr", "de", "es", "en"]
|
||||
use_gpu: bool = True # same default as easyocr.Reader
|
||||
@@ -79,6 +89,8 @@ class EasyOcrOptions(OcrOptions):
|
||||
|
||||
|
||||
class TesseractCliOcrOptions(OcrOptions):
|
||||
"""Options for the TesseractCli engine."""
|
||||
|
||||
kind: Literal["tesseract"] = "tesseract"
|
||||
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
||||
tesseract_cmd: str = "tesseract"
|
||||
@@ -90,6 +102,8 @@ class TesseractCliOcrOptions(OcrOptions):
|
||||
|
||||
|
||||
class TesseractOcrOptions(OcrOptions):
|
||||
"""Options for the Tesseract engine."""
|
||||
|
||||
kind: Literal["tesserocr"] = "tesserocr"
|
||||
lang: List[str] = ["fra", "deu", "spa", "eng"]
|
||||
path: Optional[str] = None
|
||||
@@ -100,6 +114,8 @@ class TesseractOcrOptions(OcrOptions):
|
||||
|
||||
|
||||
class OcrMacOptions(OcrOptions):
|
||||
"""Options for the Mac OCR engine."""
|
||||
|
||||
kind: Literal["ocrmac"] = "ocrmac"
|
||||
lang: List[str] = ["fr-FR", "de-DE", "es-ES", "en-US"]
|
||||
recognition: str = "accurate"
|
||||
@@ -111,12 +127,16 @@ class OcrMacOptions(OcrOptions):
|
||||
|
||||
|
||||
class PipelineOptions(BaseModel):
|
||||
"""Base pipeline options."""
|
||||
|
||||
create_legacy_output: bool = (
|
||||
True # This default will be set to False in a future version of docling
|
||||
)
|
||||
|
||||
|
||||
class PdfPipelineOptions(PipelineOptions):
|
||||
"""Options for the PDF pipeline."""
|
||||
|
||||
artifacts_path: Optional[Union[Path, str]] = None
|
||||
do_table_structure: bool = True # True: perform table structure extraction
|
||||
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
||||
|
||||
@@ -15,7 +15,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, DocumentStream, InputFormat
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
DoclingComponentType,
|
||||
DocumentStream,
|
||||
ErrorItem,
|
||||
InputFormat,
|
||||
)
|
||||
from docling.datamodel.document import (
|
||||
ConversionResult,
|
||||
InputDocument,
|
||||
@@ -23,6 +29,7 @@ from docling.datamodel.document import (
|
||||
)
|
||||
from docling.datamodel.pipeline_options import PipelineOptions
|
||||
from docling.datamodel.settings import DocumentLimits, settings
|
||||
from docling.exceptions import ConversionError
|
||||
from docling.pipeline.base_pipeline import BasePipeline
|
||||
from docling.pipeline.simple_pipeline import SimplePipeline
|
||||
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||
@@ -85,32 +92,37 @@ class ImageFormatOption(FormatOption):
|
||||
backend: Type[AbstractDocumentBackend] = DoclingParseDocumentBackend
|
||||
|
||||
|
||||
_format_to_default_options = {
|
||||
InputFormat.XLSX: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
|
||||
),
|
||||
InputFormat.DOCX: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
|
||||
),
|
||||
InputFormat.PPTX: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
|
||||
),
|
||||
InputFormat.MD: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
|
||||
),
|
||||
InputFormat.ASCIIDOC: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=AsciiDocBackend
|
||||
),
|
||||
InputFormat.HTML: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
||||
),
|
||||
InputFormat.IMAGE: FormatOption(
|
||||
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
|
||||
),
|
||||
InputFormat.PDF: FormatOption(
|
||||
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
|
||||
),
|
||||
}
|
||||
def _get_default_option(format: InputFormat) -> FormatOption:
|
||||
format_to_default_options = {
|
||||
InputFormat.XLSX: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
|
||||
),
|
||||
InputFormat.DOCX: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MsWordDocumentBackend
|
||||
),
|
||||
InputFormat.PPTX: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MsPowerpointDocumentBackend
|
||||
),
|
||||
InputFormat.MD: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MarkdownDocumentBackend
|
||||
),
|
||||
InputFormat.ASCIIDOC: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=AsciiDocBackend
|
||||
),
|
||||
InputFormat.HTML: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=HTMLDocumentBackend
|
||||
),
|
||||
InputFormat.IMAGE: FormatOption(
|
||||
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
|
||||
),
|
||||
InputFormat.PDF: FormatOption(
|
||||
pipeline_cls=StandardPdfPipeline, backend=DoclingParseDocumentBackend
|
||||
),
|
||||
}
|
||||
if (options := format_to_default_options.get(format)) is not None:
|
||||
return options
|
||||
else:
|
||||
raise RuntimeError(f"No default options configured for {format}")
|
||||
|
||||
|
||||
class DocumentConverter:
|
||||
@@ -121,36 +133,26 @@ class DocumentConverter:
|
||||
allowed_formats: Optional[List[InputFormat]] = None,
|
||||
format_options: Optional[Dict[InputFormat, FormatOption]] = None,
|
||||
):
|
||||
self.allowed_formats = allowed_formats
|
||||
self.format_to_options = format_options
|
||||
|
||||
if self.allowed_formats is None:
|
||||
# if self.format_to_options is not None:
|
||||
# self.allowed_formats = self.format_to_options.keys()
|
||||
# else:
|
||||
self.allowed_formats = [e for e in InputFormat] # all formats
|
||||
|
||||
if self.format_to_options is None:
|
||||
self.format_to_options = _format_to_default_options
|
||||
else:
|
||||
for f in self.allowed_formats:
|
||||
if f not in self.format_to_options.keys():
|
||||
_log.debug(f"Requested format {f} will use default options.")
|
||||
self.format_to_options[f] = _format_to_default_options[f]
|
||||
|
||||
remove_keys = []
|
||||
for f in self.format_to_options.keys():
|
||||
if f not in self.allowed_formats:
|
||||
remove_keys.append(f)
|
||||
|
||||
for f in remove_keys:
|
||||
self.format_to_options.pop(f)
|
||||
|
||||
self.allowed_formats = (
|
||||
allowed_formats if allowed_formats is not None else [e for e in InputFormat]
|
||||
)
|
||||
self.format_to_options = {
|
||||
format: (
|
||||
_get_default_option(format=format)
|
||||
if (custom_option := (format_options or {}).get(format)) is None
|
||||
else custom_option
|
||||
)
|
||||
for format in self.allowed_formats
|
||||
}
|
||||
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
|
||||
|
||||
def initialize_pipeline(self, format: InputFormat):
|
||||
"""Initialize the conversion pipeline for the selected format."""
|
||||
self._get_pipeline(doc_format=format)
|
||||
pipeline = self._get_pipeline(doc_format=format)
|
||||
if pipeline is None:
|
||||
raise ConversionError(
|
||||
f"No pipeline could be initialized for format {format}"
|
||||
)
|
||||
|
||||
@validate_call(config=ConfigDict(strict=True))
|
||||
def convert(
|
||||
@@ -186,22 +188,28 @@ class DocumentConverter:
|
||||
limits=limits,
|
||||
)
|
||||
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
|
||||
|
||||
had_result = False
|
||||
for conv_res in conv_res_iter:
|
||||
had_result = True
|
||||
if raises_on_error and conv_res.status not in {
|
||||
ConversionStatus.SUCCESS,
|
||||
ConversionStatus.PARTIAL_SUCCESS,
|
||||
}:
|
||||
raise RuntimeError(
|
||||
raise ConversionError(
|
||||
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
|
||||
)
|
||||
else:
|
||||
yield conv_res
|
||||
|
||||
if not had_result and raises_on_error:
|
||||
raise ConversionError(
|
||||
f"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
|
||||
)
|
||||
|
||||
def _convert(
|
||||
self, conv_input: _DocumentConversionInput, raises_on_error: bool
|
||||
) -> Iterator[ConversionResult]:
|
||||
assert self.format_to_options is not None
|
||||
|
||||
start_time = time.monotonic()
|
||||
|
||||
for input_batch in chunkify(
|
||||
@@ -223,27 +231,22 @@ class DocumentConverter:
|
||||
):
|
||||
elapsed = time.monotonic() - start_time
|
||||
start_time = time.monotonic()
|
||||
|
||||
if item is not None:
|
||||
_log.info(
|
||||
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
|
||||
)
|
||||
yield item
|
||||
else:
|
||||
_log.info(f"Skipped a document. We lost {elapsed:.2f} sec.")
|
||||
_log.info(
|
||||
f"Finished converting document {item.input.file.name} in {elapsed:.2f} sec."
|
||||
)
|
||||
yield item
|
||||
|
||||
def _get_pipeline(self, doc_format: InputFormat) -> Optional[BasePipeline]:
|
||||
assert self.format_to_options is not None
|
||||
|
||||
fopt = self.format_to_options.get(doc_format)
|
||||
|
||||
if fopt is None:
|
||||
raise RuntimeError(f"Could not get pipeline for {doc_format}")
|
||||
return None
|
||||
else:
|
||||
pipeline_class = fopt.pipeline_cls
|
||||
pipeline_options = fopt.pipeline_options
|
||||
|
||||
assert pipeline_options is not None
|
||||
if pipeline_options is None:
|
||||
return None
|
||||
# TODO this will ignore if different options have been defined for the same pipeline class.
|
||||
if (
|
||||
pipeline_class not in self.initialized_pipelines
|
||||
@@ -257,11 +260,26 @@ class DocumentConverter:
|
||||
|
||||
def _process_document(
|
||||
self, in_doc: InputDocument, raises_on_error: bool
|
||||
) -> Optional[ConversionResult]:
|
||||
assert self.allowed_formats is not None
|
||||
assert in_doc.format in self.allowed_formats
|
||||
) -> ConversionResult:
|
||||
|
||||
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
||||
valid = (
|
||||
self.allowed_formats is not None and in_doc.format in self.allowed_formats
|
||||
)
|
||||
if valid:
|
||||
conv_res = self._execute_pipeline(in_doc, raises_on_error=raises_on_error)
|
||||
else:
|
||||
error_message = f"File format not allowed: {in_doc.file}"
|
||||
if raises_on_error:
|
||||
raise ConversionError(error_message)
|
||||
else:
|
||||
error_item = ErrorItem(
|
||||
component_type=DoclingComponentType.USER_INPUT,
|
||||
module_name="",
|
||||
error_message=error_message,
|
||||
)
|
||||
conv_res = ConversionResult(
|
||||
input=in_doc, status=ConversionStatus.SKIPPED, errors=[error_item]
|
||||
)
|
||||
|
||||
return conv_res
|
||||
|
||||
@@ -270,26 +288,28 @@ class DocumentConverter:
|
||||
) -> ConversionResult:
|
||||
if in_doc.valid:
|
||||
pipeline = self._get_pipeline(in_doc.format)
|
||||
if pipeline is None: # Can't find a default pipeline. Should this raise?
|
||||
if pipeline is not None:
|
||||
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
|
||||
else:
|
||||
if raises_on_error:
|
||||
raise RuntimeError(
|
||||
raise ConversionError(
|
||||
f"No pipeline could be initialized for {in_doc.file}."
|
||||
)
|
||||
else:
|
||||
conv_res = ConversionResult(input=in_doc)
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
return conv_res
|
||||
|
||||
conv_res = pipeline.execute(in_doc, raises_on_error=raises_on_error)
|
||||
|
||||
conv_res = ConversionResult(
|
||||
input=in_doc,
|
||||
status=ConversionStatus.FAILURE,
|
||||
)
|
||||
else:
|
||||
if raises_on_error:
|
||||
raise RuntimeError(f"Input document {in_doc.file} is not valid.")
|
||||
raise ConversionError(f"Input document {in_doc.file} is not valid.")
|
||||
|
||||
else:
|
||||
# invalid doc or not of desired format
|
||||
conv_res = ConversionResult(input=in_doc)
|
||||
conv_res.status = ConversionStatus.FAILURE
|
||||
conv_res = ConversionResult(
|
||||
input=in_doc,
|
||||
status=ConversionStatus.FAILURE,
|
||||
)
|
||||
# TODO add error log why it failed.
|
||||
|
||||
return conv_res
|
||||
|
||||
6
docling/exceptions.py
Normal file
6
docling/exceptions.py
Normal file
@@ -0,0 +1,6 @@
|
||||
class BaseError(RuntimeError):
|
||||
pass
|
||||
|
||||
|
||||
class ConversionError(BaseError):
|
||||
pass
|
||||
@@ -1,5 +1,7 @@
|
||||
import csv
|
||||
import io
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from subprocess import DEVNULL, PIPE, Popen
|
||||
from typing import Iterable, Optional, Tuple
|
||||
@@ -95,7 +97,7 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
# _log.info(decoded_data)
|
||||
|
||||
# Read the TSV file generated by Tesseract
|
||||
df = pd.read_csv(io.StringIO(decoded_data), sep="\t")
|
||||
df = pd.read_csv(io.StringIO(decoded_data), quoting=csv.QUOTE_NONE, sep="\t")
|
||||
|
||||
# Display the dataframe (optional)
|
||||
# _log.info("df: ", df.head())
|
||||
@@ -130,14 +132,17 @@ class TesseractOcrCliModel(BaseOcrModel):
|
||||
high_res_image = page._backend.get_page_image(
|
||||
scale=self.scale, cropbox=ocr_rect
|
||||
)
|
||||
|
||||
with tempfile.NamedTemporaryFile(
|
||||
suffix=".png", mode="w"
|
||||
) as image_file:
|
||||
fname = image_file.name
|
||||
high_res_image.save(fname)
|
||||
try:
|
||||
with tempfile.NamedTemporaryFile(
|
||||
suffix=".png", mode="w+b", delete=False
|
||||
) as image_file:
|
||||
fname = image_file.name
|
||||
high_res_image.save(image_file)
|
||||
|
||||
df = self._run_tesseract(fname)
|
||||
finally:
|
||||
if os.path.exists(fname):
|
||||
os.remove(fname)
|
||||
|
||||
# _log.info(df)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user