mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Refactor: Address minor code quality issues and remove deprecated features
This commit is contained in:
parent
56a0e104f7
commit
9ba627b40a
@ -23,21 +23,13 @@ from docling_core.utils.file import resolve_source_to_path
|
|||||||
from pydantic import TypeAdapter
|
from pydantic import TypeAdapter
|
||||||
from rich.console import Console
|
from rich.console import Console
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|
||||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
|
||||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
|
||||||
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
||||||
from docling.datamodel.asr_model_specs import (
|
|
||||||
WHISPER_BASE,
|
|
||||||
WHISPER_LARGE,
|
|
||||||
WHISPER_MEDIUM,
|
|
||||||
WHISPER_SMALL,
|
|
||||||
WHISPER_TINY,
|
|
||||||
WHISPER_TURBO,
|
|
||||||
AsrModelType,
|
|
||||||
)
|
|
||||||
from docling.datamodel.base_models import (
|
from docling.datamodel.base_models import (
|
||||||
ConversionStatus,
|
ConversionStatus,
|
||||||
FormatToExtensions,
|
FormatToExtensions,
|
||||||
@ -45,35 +37,13 @@ from docling.datamodel.base_models import (
|
|||||||
OutputFormat,
|
OutputFormat,
|
||||||
)
|
)
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
|
||||||
AsrPipelineOptions,
|
|
||||||
EasyOcrOptions,
|
|
||||||
OcrOptions,
|
|
||||||
PaginatedPipelineOptions,
|
|
||||||
PdfBackend,
|
|
||||||
PdfPipelineOptions,
|
|
||||||
PipelineOptions,
|
|
||||||
ProcessingPipeline,
|
|
||||||
TableFormerMode,
|
|
||||||
VlmPipelineOptions,
|
|
||||||
)
|
|
||||||
from docling.datamodel.settings import settings
|
|
||||||
from docling.datamodel.vlm_model_specs import (
|
|
||||||
GRANITE_VISION_OLLAMA,
|
|
||||||
GRANITE_VISION_TRANSFORMERS,
|
|
||||||
SMOLDOCLING_MLX,
|
|
||||||
SMOLDOCLING_TRANSFORMERS,
|
|
||||||
VlmModelType,
|
|
||||||
)
|
|
||||||
from docling.document_converter import (
|
|
||||||
AudioFormatOption,
|
|
||||||
DocumentConverter,
|
|
||||||
FormatOption,
|
|
||||||
PdfFormatOption,
|
|
||||||
)
|
|
||||||
from docling.models.factories import get_ocr_factory
|
from docling.models.factories import get_ocr_factory
|
||||||
from docling.pipeline.asr_pipeline import AsrPipeline
|
|
||||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
|
||||||
|
|
||||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
||||||
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
||||||
@ -190,79 +160,10 @@ def export_documents(
|
|||||||
failure_count = 0
|
failure_count = 0
|
||||||
|
|
||||||
for conv_res in conv_results:
|
for conv_res in conv_results:
|
||||||
if conv_res.status == ConversionStatus.SUCCESS:
|
if conv_res.status != ConversionStatus.SUCCESS:
|
||||||
success_count += 1
|
|
||||||
doc_filename = conv_res.input.file.stem
|
|
||||||
|
|
||||||
# Export JSON format:
|
|
||||||
if export_json:
|
|
||||||
fname = output_dir / f"{doc_filename}.json"
|
|
||||||
_log.info(f"writing JSON output to {fname}")
|
|
||||||
conv_res.document.save_as_json(
|
|
||||||
filename=fname, image_mode=image_export_mode
|
|
||||||
)
|
|
||||||
|
|
||||||
# Export HTML format:
|
|
||||||
if export_html:
|
|
||||||
fname = output_dir / f"{doc_filename}.html"
|
|
||||||
_log.info(f"writing HTML output to {fname}")
|
|
||||||
conv_res.document.save_as_html(
|
|
||||||
filename=fname, image_mode=image_export_mode, split_page_view=False
|
|
||||||
)
|
|
||||||
|
|
||||||
# Export HTML format:
|
|
||||||
if export_html_split_page:
|
|
||||||
fname = output_dir / f"{doc_filename}.html"
|
|
||||||
_log.info(f"writing HTML output to {fname}")
|
|
||||||
if show_layout:
|
|
||||||
ser = HTMLDocSerializer(
|
|
||||||
doc=conv_res.document,
|
|
||||||
params=HTMLParams(
|
|
||||||
image_mode=image_export_mode,
|
|
||||||
output_style=HTMLOutputStyle.SPLIT_PAGE,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
visualizer = LayoutVisualizer()
|
|
||||||
visualizer.params.show_label = False
|
|
||||||
ser_res = ser.serialize(
|
|
||||||
visualizer=visualizer,
|
|
||||||
)
|
|
||||||
with open(fname, "w") as fw:
|
|
||||||
fw.write(ser_res.text)
|
|
||||||
else:
|
|
||||||
conv_res.document.save_as_html(
|
|
||||||
filename=fname,
|
|
||||||
image_mode=image_export_mode,
|
|
||||||
split_page_view=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Export Text format:
|
|
||||||
if export_txt:
|
|
||||||
fname = output_dir / f"{doc_filename}.txt"
|
|
||||||
_log.info(f"writing TXT output to {fname}")
|
|
||||||
conv_res.document.save_as_markdown(
|
|
||||||
filename=fname,
|
|
||||||
strict_text=True,
|
|
||||||
image_mode=ImageRefMode.PLACEHOLDER,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Export Markdown format:
|
|
||||||
if export_md:
|
|
||||||
fname = output_dir / f"{doc_filename}.md"
|
|
||||||
_log.info(f"writing Markdown output to {fname}")
|
|
||||||
conv_res.document.save_as_markdown(
|
|
||||||
filename=fname, image_mode=image_export_mode
|
|
||||||
)
|
|
||||||
|
|
||||||
# Export Document Tags format:
|
|
||||||
if export_doctags:
|
|
||||||
fname = output_dir / f"{doc_filename}.doctags"
|
|
||||||
_log.info(f"writing Doc Tags output to {fname}")
|
|
||||||
conv_res.document.save_as_document_tokens(filename=fname)
|
|
||||||
|
|
||||||
else:
|
|
||||||
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
||||||
failure_count += 1
|
failure_count += 1
|
||||||
|
continue
|
||||||
|
|
||||||
_log.info(
|
_log.info(
|
||||||
f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
|
f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
|
||||||
@ -270,9 +171,7 @@ def export_documents(
|
|||||||
|
|
||||||
|
|
||||||
def _split_list(raw: Optional[str]) -> Optional[List[str]]:
|
def _split_list(raw: Optional[str]) -> Optional[List[str]]:
|
||||||
if raw is None:
|
return re.split(r"[;,]", raw) if raw else None
|
||||||
return None
|
|
||||||
return re.split(r"[;,]", raw)
|
|
||||||
|
|
||||||
|
|
||||||
@app.command(no_args_is_help=True)
|
@app.command(no_args_is_help=True)
|
||||||
@ -485,11 +384,11 @@ def convert( # noqa: C901
|
|||||||
settings.debug.visualize_tables = debug_visualize_tables
|
settings.debug.visualize_tables = debug_visualize_tables
|
||||||
settings.debug.visualize_ocr = debug_visualize_ocr
|
settings.debug.visualize_ocr = debug_visualize_ocr
|
||||||
|
|
||||||
if from_formats is None:
|
if not from_formats:
|
||||||
from_formats = list(InputFormat)
|
from_formats = list(InputFormat)
|
||||||
|
|
||||||
parsed_headers: Optional[Dict[str, str]] = None
|
parsed_headers: Optional[Dict[str, str]] = None
|
||||||
if headers is not None:
|
if headers:
|
||||||
headers_t = TypeAdapter(Dict[str, str])
|
headers_t = TypeAdapter(Dict[str, str])
|
||||||
parsed_headers = headers_t.validate_json(headers)
|
parsed_headers = headers_t.validate_json(headers)
|
||||||
|
|
||||||
@ -532,7 +431,7 @@ def convert( # noqa: C901
|
|||||||
_log.info(err) # will print more details if verbose is activated
|
_log.info(err) # will print more details if verbose is activated
|
||||||
raise typer.Abort()
|
raise typer.Abort()
|
||||||
|
|
||||||
if to_formats is None:
|
if not to_formats:
|
||||||
to_formats = [OutputFormat.MARKDOWN]
|
to_formats = [OutputFormat.MARKDOWN]
|
||||||
|
|
||||||
export_json = OutputFormat.JSON in to_formats
|
export_json = OutputFormat.JSON in to_formats
|
||||||
@ -549,7 +448,7 @@ def convert( # noqa: C901
|
|||||||
)
|
)
|
||||||
|
|
||||||
ocr_lang_list = _split_list(ocr_lang)
|
ocr_lang_list = _split_list(ocr_lang)
|
||||||
if ocr_lang_list is not None:
|
if ocr_lang_list:
|
||||||
ocr_options.lang = ocr_lang_list
|
ocr_options.lang = ocr_lang_list
|
||||||
|
|
||||||
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
||||||
@ -585,15 +484,14 @@ def convert( # noqa: C901
|
|||||||
pipeline_options.images_scale = 2
|
pipeline_options.images_scale = 2
|
||||||
|
|
||||||
backend: Type[PdfDocumentBackend]
|
backend: Type[PdfDocumentBackend]
|
||||||
if pdf_backend == PdfBackend.DLPARSE_V1:
|
backend_map = {
|
||||||
backend = DoclingParseDocumentBackend
|
PdfBackend.DLPARSE_V1: DoclingParseDocumentBackend,
|
||||||
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
PdfBackend.DLPARSE_V2: DoclingParseV2DocumentBackend,
|
||||||
backend = DoclingParseV2DocumentBackend
|
PdfBackend.DLPARSE_V4: DoclingParseV4DocumentBackend,
|
||||||
elif pdf_backend == PdfBackend.DLPARSE_V4:
|
PdfBackend.PYPDFIUM2: PyPdfiumDocumentBackend,
|
||||||
backend = DoclingParseV4DocumentBackend # type: ignore
|
}
|
||||||
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
backend = backend_map.get(pdf_backend)
|
||||||
backend = PyPdfiumDocumentBackend # type: ignore
|
if not backend:
|
||||||
else:
|
|
||||||
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
||||||
|
|
||||||
pdf_format_option = PdfFormatOption(
|
pdf_format_option = PdfFormatOption(
|
||||||
@ -611,13 +509,14 @@ def convert( # noqa: C901
|
|||||||
enable_remote_services=enable_remote_services,
|
enable_remote_services=enable_remote_services,
|
||||||
)
|
)
|
||||||
|
|
||||||
if vlm_model == VlmModelType.GRANITE_VISION:
|
vlm_model_map = {
|
||||||
pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
|
VlmModelType.GRANITE_VISION: GRANITE_VISION_TRANSFORMERS,
|
||||||
elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
|
VlmModelType.GRANITE_VISION_OLLAMA: GRANITE_VISION_OLLAMA,
|
||||||
pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
|
VlmModelType.SMOLDOCLING: SMOLDOCLING_TRANSFORMERS,
|
||||||
elif vlm_model == VlmModelType.SMOLDOCLING:
|
}
|
||||||
pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
|
pipeline_options.vlm_options = vlm_model_map.get(vlm_model)
|
||||||
if sys.platform == "darwin":
|
|
||||||
|
if vlm_model == VlmModelType.SMOLDOCLING and sys.platform == "darwin":
|
||||||
try:
|
try:
|
||||||
import mlx_vlm
|
import mlx_vlm
|
||||||
|
|
||||||
@ -643,19 +542,16 @@ def convert( # noqa: C901
|
|||||||
# artifacts_path = artifacts_path
|
# artifacts_path = artifacts_path
|
||||||
)
|
)
|
||||||
|
|
||||||
if asr_model == AsrModelType.WHISPER_TINY:
|
asr_model_map = {
|
||||||
pipeline_options.asr_options = WHISPER_TINY
|
AsrModelType.WHISPER_TINY: WHISPER_TINY,
|
||||||
elif asr_model == AsrModelType.WHISPER_SMALL:
|
AsrModelType.WHISPER_SMALL: WHISPER_SMALL,
|
||||||
pipeline_options.asr_options = WHISPER_SMALL
|
AsrModelType.WHISPER_MEDIUM: WHISPER_MEDIUM,
|
||||||
elif asr_model == AsrModelType.WHISPER_MEDIUM:
|
AsrModelType.WHISPER_BASE: WHISPER_BASE,
|
||||||
pipeline_options.asr_options = WHISPER_MEDIUM
|
AsrModelType.WHISPER_LARGE: WHISPER_LARGE,
|
||||||
elif asr_model == AsrModelType.WHISPER_BASE:
|
AsrModelType.WHISPER_TURBO: WHISPER_TURBO,
|
||||||
pipeline_options.asr_options = WHISPER_BASE
|
}
|
||||||
elif asr_model == AsrModelType.WHISPER_LARGE:
|
pipeline_options.asr_options = asr_model_map.get(asr_model)
|
||||||
pipeline_options.asr_options = WHISPER_LARGE
|
if not pipeline_options.asr_options:
|
||||||
elif asr_model == AsrModelType.WHISPER_TURBO:
|
|
||||||
pipeline_options.asr_options = WHISPER_TURBO
|
|
||||||
else:
|
|
||||||
_log.error(f"{asr_model} is not known")
|
_log.error(f"{asr_model} is not known")
|
||||||
raise ValueError(f"{asr_model} is not known")
|
raise ValueError(f"{asr_model} is not known")
|
||||||
|
|
||||||
@ -670,9 +566,8 @@ def convert( # noqa: C901
|
|||||||
InputFormat.AUDIO: audio_format_option,
|
InputFormat.AUDIO: audio_format_option,
|
||||||
}
|
}
|
||||||
|
|
||||||
if artifacts_path is not None:
|
if artifacts_path:
|
||||||
pipeline_options.artifacts_path = artifacts_path
|
pipeline_options.artifacts_path = artifacts_path
|
||||||
# audio_pipeline_options.artifacts_path = artifacts_path
|
|
||||||
|
|
||||||
doc_converter = DocumentConverter(
|
doc_converter = DocumentConverter(
|
||||||
allowed_formats=from_formats,
|
allowed_formats=from_formats,
|
||||||
|
@ -14,31 +14,15 @@ from typing_extensions import deprecated
|
|||||||
from docling.datamodel import asr_model_specs
|
from docling.datamodel import asr_model_specs
|
||||||
|
|
||||||
# Import the following for backwards compatibility
|
# Import the following for backwards compatibility
|
||||||
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
|
||||||
from docling.datamodel.pipeline_options_asr_model import (
|
|
||||||
InlineAsrOptions,
|
|
||||||
)
|
|
||||||
from docling.datamodel.pipeline_options_vlm_model import (
|
|
||||||
ApiVlmOptions,
|
|
||||||
InferenceFramework,
|
|
||||||
InlineVlmOptions,
|
|
||||||
ResponseFormat,
|
|
||||||
)
|
|
||||||
from docling.datamodel.vlm_model_specs import (
|
|
||||||
GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
|
|
||||||
GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
|
|
||||||
SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
|
|
||||||
SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
|
|
||||||
VlmModelType,
|
|
||||||
)
|
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class BaseOptions(BaseModel):
|
|
||||||
"""Base class for options."""
|
|
||||||
|
|
||||||
kind: ClassVar[str]
|
|
||||||
|
|
||||||
|
|
||||||
class TableFormerMode(str, Enum):
|
class TableFormerMode(str, Enum):
|
||||||
@ -200,16 +184,7 @@ class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
|
|||||||
return self.repo_id.replace("/", "--")
|
return self.repo_id.replace("/", "--")
|
||||||
|
|
||||||
|
|
||||||
# SmolVLM
|
|
||||||
smolvlm_picture_description = PictureDescriptionVlmOptions(
|
|
||||||
repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
|
|
||||||
)
|
|
||||||
|
|
||||||
# GraniteVision
|
|
||||||
granite_picture_description = PictureDescriptionVlmOptions(
|
|
||||||
repo_id="ibm-granite/granite-vision-3.2-2b-preview",
|
|
||||||
prompt="What is shown in this image?",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# Define an enum for the backend options
|
# Define an enum for the backend options
|
||||||
@ -223,15 +198,7 @@ class PdfBackend(str, Enum):
|
|||||||
|
|
||||||
|
|
||||||
# Define an enum for the ocr engines
|
# Define an enum for the ocr engines
|
||||||
@deprecated("Use ocr_factory.registered_enum")
|
|
||||||
class OcrEngine(str, Enum):
|
|
||||||
"""Enum of valid OCR engines."""
|
|
||||||
|
|
||||||
EASYOCR = "easyocr"
|
|
||||||
TESSERACT_CLI = "tesseract_cli"
|
|
||||||
TESSERACT = "tesseract"
|
|
||||||
OCRMAC = "ocrmac"
|
|
||||||
RAPIDOCR = "rapidocr"
|
|
||||||
|
|
||||||
|
|
||||||
class PipelineOptions(BaseModel):
|
class PipelineOptions(BaseModel):
|
||||||
@ -246,68 +213,10 @@ class PipelineOptions(BaseModel):
|
|||||||
allow_external_plugins: bool = False
|
allow_external_plugins: bool = False
|
||||||
|
|
||||||
|
|
||||||
class PaginatedPipelineOptions(PipelineOptions):
|
|
||||||
artifacts_path: Optional[Union[Path, str]] = None
|
|
||||||
|
|
||||||
images_scale: float = 1.0
|
|
||||||
generate_page_images: bool = False
|
|
||||||
generate_picture_images: bool = False
|
|
||||||
|
|
||||||
|
|
||||||
class VlmPipelineOptions(PaginatedPipelineOptions):
|
|
||||||
generate_page_images: bool = True
|
|
||||||
force_backend_text: bool = (
|
|
||||||
False # (To be used with vlms, or other generative models)
|
|
||||||
)
|
|
||||||
# If True, text from backend will be used instead of generated text
|
|
||||||
vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
|
|
||||||
smoldocling_vlm_conversion_options
|
|
||||||
)
|
|
||||||
|
|
||||||
|
class VlmPipelineOptions(PipelineOptions):
|
||||||
class AsrPipelineOptions(PipelineOptions):
|
|
||||||
asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
|
|
||||||
artifacts_path: Optional[Union[Path, str]] = None
|
artifacts_path: Optional[Union[Path, str]] = None
|
||||||
|
|
||||||
|
|
||||||
class PdfPipelineOptions(PaginatedPipelineOptions):
|
|
||||||
"""Options for the PDF pipeline."""
|
|
||||||
|
|
||||||
do_table_structure: bool = True # True: perform table structure extraction
|
|
||||||
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
|
||||||
do_code_enrichment: bool = False # True: perform code OCR
|
|
||||||
do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
|
|
||||||
do_picture_classification: bool = False # True: classify pictures in documents
|
|
||||||
do_picture_description: bool = False # True: run describe pictures in documents
|
|
||||||
force_backend_text: bool = (
|
|
||||||
False # (To be used with vlms, or other generative models)
|
|
||||||
)
|
|
||||||
# If True, text from backend will be used instead of generated text
|
|
||||||
|
|
||||||
table_structure_options: TableStructureOptions = TableStructureOptions()
|
|
||||||
ocr_options: OcrOptions = EasyOcrOptions()
|
|
||||||
picture_description_options: PictureDescriptionBaseOptions = (
|
|
||||||
smolvlm_picture_description
|
|
||||||
)
|
|
||||||
|
|
||||||
images_scale: float = 1.0
|
|
||||||
generate_page_images: bool = False
|
|
||||||
generate_picture_images: bool = False
|
|
||||||
generate_table_images: bool = Field(
|
|
||||||
default=False,
|
|
||||||
deprecated=(
|
|
||||||
"Field `generate_table_images` is deprecated. "
|
|
||||||
"To obtain table images, set `PdfPipelineOptions.generate_page_images = True` "
|
|
||||||
"before conversion and then use the `TableItem.get_image` function."
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
generate_parsed_pages: Literal[True] = (
|
|
||||||
True # Always True since parsed_page is now mandatory
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class ProcessingPipeline(str, Enum):
|
|
||||||
STANDARD = "standard"
|
|
||||||
VLM = "vlm"
|
|
||||||
ASR = "asr"
|
|
||||||
|
@ -6,23 +6,14 @@ from pydantic import BaseModel, PlainValidator
|
|||||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||||
|
|
||||||
|
|
||||||
def _validate_page_range(v: Tuple[int, int]) -> Tuple[int, int]:
|
|
||||||
if v[0] < 1 or v[1] < v[0]:
|
|
||||||
raise ValueError(
|
|
||||||
"Invalid page range: start must be ≥ 1 and end must be ≥ start."
|
|
||||||
)
|
|
||||||
return v
|
|
||||||
|
|
||||||
|
|
||||||
PageRange = Annotated[Tuple[int, int], PlainValidator(_validate_page_range)]
|
|
||||||
|
|
||||||
DEFAULT_PAGE_RANGE: PageRange = (1, sys.maxsize)
|
|
||||||
|
|
||||||
|
|
||||||
class DocumentLimits(BaseModel):
|
class DocumentLimits(BaseModel):
|
||||||
max_num_pages: int = sys.maxsize
|
max_num_pages: int = sys.maxsize
|
||||||
max_file_size: int = sys.maxsize
|
max_file_size: int = sys.maxsize
|
||||||
page_range: PageRange = DEFAULT_PAGE_RANGE
|
|
||||||
|
|
||||||
|
|
||||||
class BatchConcurrencySettings(BaseModel):
|
class BatchConcurrencySettings(BaseModel):
|
||||||
@ -32,14 +23,7 @@ class BatchConcurrencySettings(BaseModel):
|
|||||||
page_batch_concurrency: int = 2
|
page_batch_concurrency: int = 2
|
||||||
elements_batch_size: int = 16
|
elements_batch_size: int = 16
|
||||||
|
|
||||||
# doc_batch_size: int = 1
|
|
||||||
# doc_batch_concurrency: int = 1
|
|
||||||
# page_batch_size: int = 1
|
|
||||||
# page_batch_concurrency: int = 1
|
|
||||||
|
|
||||||
# model_concurrency: int = 2
|
|
||||||
|
|
||||||
# To force models into single core: export OMP_NUM_THREADS=1
|
|
||||||
|
|
||||||
|
|
||||||
class DebugSettings(BaseModel):
|
class DebugSettings(BaseModel):
|
||||||
|
@ -65,65 +65,8 @@ class FormatOption(BaseModel):
|
|||||||
return self
|
return self
|
||||||
|
|
||||||
|
|
||||||
class CsvFormatOption(FormatOption):
|
|
||||||
pipeline_cls: Type = SimplePipeline
|
|
||||||
backend: Type[AbstractDocumentBackend] = CsvDocumentBackend
|
|
||||||
|
|
||||||
|
|
||||||
class ExcelFormatOption(FormatOption):
|
|
||||||
pipeline_cls: Type = SimplePipeline
|
|
||||||
backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
|
|
||||||
|
|
||||||
|
|
||||||
class WordFormatOption(FormatOption):
|
|
||||||
pipeline_cls: Type = SimplePipeline
|
|
||||||
backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
|
|
||||||
|
|
||||||
|
|
||||||
class PowerpointFormatOption(FormatOption):
|
|
||||||
pipeline_cls: Type = SimplePipeline
|
|
||||||
backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
|
|
||||||
|
|
||||||
|
|
||||||
class MarkdownFormatOption(FormatOption):
|
|
||||||
pipeline_cls: Type = SimplePipeline
|
|
||||||
backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
|
|
||||||
|
|
||||||
|
|
||||||
class AsciiDocFormatOption(FormatOption):
|
|
||||||
pipeline_cls: Type = SimplePipeline
|
|
||||||
backend: Type[AbstractDocumentBackend] = AsciiDocBackend
|
|
||||||
|
|
||||||
|
|
||||||
class HTMLFormatOption(FormatOption):
|
|
||||||
pipeline_cls: Type = SimplePipeline
|
|
||||||
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
|
||||||
|
|
||||||
|
|
||||||
class PatentUsptoFormatOption(FormatOption):
|
|
||||||
pipeline_cls: Type = SimplePipeline
|
|
||||||
backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
|
|
||||||
|
|
||||||
|
|
||||||
class XMLJatsFormatOption(FormatOption):
|
|
||||||
pipeline_cls: Type = SimplePipeline
|
|
||||||
backend: Type[AbstractDocumentBackend] = JatsDocumentBackend
|
|
||||||
|
|
||||||
|
|
||||||
class ImageFormatOption(FormatOption):
|
|
||||||
pipeline_cls: Type = StandardPdfPipeline
|
|
||||||
backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
|
|
||||||
|
|
||||||
|
|
||||||
class PdfFormatOption(FormatOption):
|
|
||||||
pipeline_cls: Type = StandardPdfPipeline
|
|
||||||
backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
|
|
||||||
|
|
||||||
|
|
||||||
class AudioFormatOption(FormatOption):
|
|
||||||
pipeline_cls: Type = AsrPipeline
|
|
||||||
backend: Type[AbstractDocumentBackend] = NoOpBackend
|
|
||||||
|
|
||||||
|
|
||||||
def _get_default_option(format: InputFormat) -> FormatOption:
|
def _get_default_option(format: InputFormat) -> FormatOption:
|
||||||
format_to_default_options = {
|
format_to_default_options = {
|
||||||
@ -167,12 +110,11 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
|||||||
}
|
}
|
||||||
if (options := format_to_default_options.get(format)) is not None:
|
if (options := format_to_default_options.get(format)) is not None:
|
||||||
return options
|
return options
|
||||||
else:
|
|
||||||
raise RuntimeError(f"No default options configured for {format}")
|
raise RuntimeError(f"No default options configured for {format}")
|
||||||
|
|
||||||
|
|
||||||
class DocumentConverter:
|
class DocumentConverter:
|
||||||
_default_download_filename = "file"
|
_default_filename = "file"
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -194,10 +136,7 @@ class DocumentConverter:
|
|||||||
Tuple[Type[BasePipeline], str], BasePipeline
|
Tuple[Type[BasePipeline], str], BasePipeline
|
||||||
] = {}
|
] = {}
|
||||||
|
|
||||||
def _get_initialized_pipelines(
|
|
||||||
self,
|
|
||||||
) -> dict[tuple[Type[BasePipeline], str], BasePipeline]:
|
|
||||||
return self.initialized_pipelines
|
|
||||||
|
|
||||||
def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
|
def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
|
||||||
"""Generate a hash of pipeline options to use as part of the cache key."""
|
"""Generate a hash of pipeline options to use as part of the cache key."""
|
||||||
@ -217,7 +156,7 @@ class DocumentConverter:
|
|||||||
@validate_call(config=ConfigDict(strict=True))
|
@validate_call(config=ConfigDict(strict=True))
|
||||||
def convert(
|
def convert(
|
||||||
self,
|
self,
|
||||||
source: Union[Path, str, DocumentStream], # TODO review naming
|
documents: Union[Path, str, DocumentStream], # TODO review naming
|
||||||
headers: Optional[Dict[str, str]] = None,
|
headers: Optional[Dict[str, str]] = None,
|
||||||
raises_on_error: bool = True,
|
raises_on_error: bool = True,
|
||||||
max_num_pages: int = sys.maxsize,
|
max_num_pages: int = sys.maxsize,
|
||||||
@ -225,7 +164,7 @@ class DocumentConverter:
|
|||||||
page_range: PageRange = DEFAULT_PAGE_RANGE,
|
page_range: PageRange = DEFAULT_PAGE_RANGE,
|
||||||
) -> ConversionResult:
|
) -> ConversionResult:
|
||||||
all_res = self.convert_all(
|
all_res = self.convert_all(
|
||||||
source=[source],
|
documents=[documents],
|
||||||
raises_on_error=raises_on_error,
|
raises_on_error=raises_on_error,
|
||||||
max_num_pages=max_num_pages,
|
max_num_pages=max_num_pages,
|
||||||
max_file_size=max_file_size,
|
max_file_size=max_file_size,
|
||||||
@ -237,7 +176,7 @@ class DocumentConverter:
|
|||||||
@validate_call(config=ConfigDict(strict=True))
|
@validate_call(config=ConfigDict(strict=True))
|
||||||
def convert_all(
|
def convert_all(
|
||||||
self,
|
self,
|
||||||
source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
|
documents: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
|
||||||
headers: Optional[Dict[str, str]] = None,
|
headers: Optional[Dict[str, str]] = None,
|
||||||
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
|
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
|
||||||
max_num_pages: int = sys.maxsize,
|
max_num_pages: int = sys.maxsize,
|
||||||
@ -249,28 +188,10 @@ class DocumentConverter:
|
|||||||
max_file_size=max_file_size,
|
max_file_size=max_file_size,
|
||||||
page_range=page_range,
|
page_range=page_range,
|
||||||
)
|
)
|
||||||
conv_input = _DocumentConversionInput(
|
"""Converts a batch of documents.
|
||||||
path_or_stream_iterator=source, limits=limits, headers=headers
|
|
||||||
)
|
|
||||||
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
|
|
||||||
|
|
||||||
had_result = False
|
Note: PDF backends are not thread-safe, so thread pool usage is disabled.
|
||||||
for conv_res in conv_res_iter:
|
"""
|
||||||
had_result = True
|
|
||||||
if raises_on_error and conv_res.status not in {
|
|
||||||
ConversionStatus.SUCCESS,
|
|
||||||
ConversionStatus.PARTIAL_SUCCESS,
|
|
||||||
}:
|
|
||||||
raise ConversionError(
|
|
||||||
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
yield conv_res
|
|
||||||
|
|
||||||
if not had_result and raises_on_error:
|
|
||||||
raise ConversionError(
|
|
||||||
"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
|
|
||||||
)
|
|
||||||
|
|
||||||
def _convert(
|
def _convert(
|
||||||
self, conv_input: _DocumentConversionInput, raises_on_error: bool
|
self, conv_input: _DocumentConversionInput, raises_on_error: bool
|
||||||
@ -380,5 +301,6 @@ class DocumentConverter:
|
|||||||
status=ConversionStatus.FAILURE,
|
status=ConversionStatus.FAILURE,
|
||||||
)
|
)
|
||||||
# TODO add error log why it failed.
|
# TODO add error log why it failed.
|
||||||
|
_log.error(f"Input document {in_doc.file} is not valid.")
|
||||||
|
|
||||||
return conv_res
|
return conv_res
|
||||||
|
Loading…
Reference in New Issue
Block a user