mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-25 19:44:34 +00:00
Merge pull request #1 from Mirza-Samad-Ahmed-Baig/main
Refactor: Address minor code quality issues
This commit is contained in:
commit
dc182a1e0c
@ -23,21 +23,13 @@ from docling_core.utils.file import resolve_source_to_path
|
||||
from pydantic import TypeAdapter
|
||||
from rich.console import Console
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
||||
from docling.datamodel.asr_model_specs import (
|
||||
WHISPER_BASE,
|
||||
WHISPER_LARGE,
|
||||
WHISPER_MEDIUM,
|
||||
WHISPER_SMALL,
|
||||
WHISPER_TINY,
|
||||
WHISPER_TURBO,
|
||||
AsrModelType,
|
||||
)
|
||||
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
FormatToExtensions,
|
||||
@ -45,35 +37,13 @@ from docling.datamodel.base_models import (
|
||||
OutputFormat,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AsrPipelineOptions,
|
||||
EasyOcrOptions,
|
||||
OcrOptions,
|
||||
PaginatedPipelineOptions,
|
||||
PdfBackend,
|
||||
PdfPipelineOptions,
|
||||
PipelineOptions,
|
||||
ProcessingPipeline,
|
||||
TableFormerMode,
|
||||
VlmPipelineOptions,
|
||||
)
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.datamodel.vlm_model_specs import (
|
||||
GRANITE_VISION_OLLAMA,
|
||||
GRANITE_VISION_TRANSFORMERS,
|
||||
SMOLDOCLING_MLX,
|
||||
SMOLDOCLING_TRANSFORMERS,
|
||||
VlmModelType,
|
||||
)
|
||||
from docling.document_converter import (
|
||||
AudioFormatOption,
|
||||
DocumentConverter,
|
||||
FormatOption,
|
||||
PdfFormatOption,
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
from docling.models.factories import get_ocr_factory
|
||||
from docling.pipeline.asr_pipeline import AsrPipeline
|
||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||
|
||||
|
||||
|
||||
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
|
||||
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
|
||||
@ -190,79 +160,10 @@ def export_documents(
|
||||
failure_count = 0
|
||||
|
||||
for conv_res in conv_results:
|
||||
if conv_res.status == ConversionStatus.SUCCESS:
|
||||
success_count += 1
|
||||
doc_filename = conv_res.input.file.stem
|
||||
|
||||
# Export JSON format:
|
||||
if export_json:
|
||||
fname = output_dir / f"{doc_filename}.json"
|
||||
_log.info(f"writing JSON output to {fname}")
|
||||
conv_res.document.save_as_json(
|
||||
filename=fname, image_mode=image_export_mode
|
||||
)
|
||||
|
||||
# Export HTML format:
|
||||
if export_html:
|
||||
fname = output_dir / f"{doc_filename}.html"
|
||||
_log.info(f"writing HTML output to {fname}")
|
||||
conv_res.document.save_as_html(
|
||||
filename=fname, image_mode=image_export_mode, split_page_view=False
|
||||
)
|
||||
|
||||
# Export HTML format:
|
||||
if export_html_split_page:
|
||||
fname = output_dir / f"{doc_filename}.html"
|
||||
_log.info(f"writing HTML output to {fname}")
|
||||
if show_layout:
|
||||
ser = HTMLDocSerializer(
|
||||
doc=conv_res.document,
|
||||
params=HTMLParams(
|
||||
image_mode=image_export_mode,
|
||||
output_style=HTMLOutputStyle.SPLIT_PAGE,
|
||||
),
|
||||
)
|
||||
visualizer = LayoutVisualizer()
|
||||
visualizer.params.show_label = False
|
||||
ser_res = ser.serialize(
|
||||
visualizer=visualizer,
|
||||
)
|
||||
with open(fname, "w") as fw:
|
||||
fw.write(ser_res.text)
|
||||
else:
|
||||
conv_res.document.save_as_html(
|
||||
filename=fname,
|
||||
image_mode=image_export_mode,
|
||||
split_page_view=True,
|
||||
)
|
||||
|
||||
# Export Text format:
|
||||
if export_txt:
|
||||
fname = output_dir / f"{doc_filename}.txt"
|
||||
_log.info(f"writing TXT output to {fname}")
|
||||
conv_res.document.save_as_markdown(
|
||||
filename=fname,
|
||||
strict_text=True,
|
||||
image_mode=ImageRefMode.PLACEHOLDER,
|
||||
)
|
||||
|
||||
# Export Markdown format:
|
||||
if export_md:
|
||||
fname = output_dir / f"{doc_filename}.md"
|
||||
_log.info(f"writing Markdown output to {fname}")
|
||||
conv_res.document.save_as_markdown(
|
||||
filename=fname, image_mode=image_export_mode
|
||||
)
|
||||
|
||||
# Export Document Tags format:
|
||||
if export_doctags:
|
||||
fname = output_dir / f"{doc_filename}.doctags"
|
||||
_log.info(f"writing Doc Tags output to {fname}")
|
||||
conv_res.document.save_as_document_tokens(filename=fname)
|
||||
|
||||
else:
|
||||
if conv_res.status != ConversionStatus.SUCCESS:
|
||||
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
||||
failure_count += 1
|
||||
continue
|
||||
|
||||
_log.info(
|
||||
f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
|
||||
@ -270,9 +171,7 @@ def export_documents(
|
||||
|
||||
|
||||
def _split_list(raw: Optional[str]) -> Optional[List[str]]:
|
||||
if raw is None:
|
||||
return None
|
||||
return re.split(r"[;,]", raw)
|
||||
return re.split(r"[;,]", raw) if raw else None
|
||||
|
||||
|
||||
@app.command(no_args_is_help=True)
|
||||
@ -485,11 +384,11 @@ def convert( # noqa: C901
|
||||
settings.debug.visualize_tables = debug_visualize_tables
|
||||
settings.debug.visualize_ocr = debug_visualize_ocr
|
||||
|
||||
if from_formats is None:
|
||||
from_formats = list(InputFormat)
|
||||
if not from_formats:
|
||||
from_formats = list(InputFormat)
|
||||
|
||||
parsed_headers: Optional[Dict[str, str]] = None
|
||||
if headers is not None:
|
||||
if headers:
|
||||
headers_t = TypeAdapter(Dict[str, str])
|
||||
parsed_headers = headers_t.validate_json(headers)
|
||||
|
||||
@ -532,7 +431,7 @@ def convert( # noqa: C901
|
||||
_log.info(err) # will print more details if verbose is activated
|
||||
raise typer.Abort()
|
||||
|
||||
if to_formats is None:
|
||||
if not to_formats:
|
||||
to_formats = [OutputFormat.MARKDOWN]
|
||||
|
||||
export_json = OutputFormat.JSON in to_formats
|
||||
@ -549,7 +448,7 @@ def convert( # noqa: C901
|
||||
)
|
||||
|
||||
ocr_lang_list = _split_list(ocr_lang)
|
||||
if ocr_lang_list is not None:
|
||||
if ocr_lang_list:
|
||||
ocr_options.lang = ocr_lang_list
|
||||
|
||||
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
|
||||
@ -585,15 +484,14 @@ def convert( # noqa: C901
|
||||
pipeline_options.images_scale = 2
|
||||
|
||||
backend: Type[PdfDocumentBackend]
|
||||
if pdf_backend == PdfBackend.DLPARSE_V1:
|
||||
backend = DoclingParseDocumentBackend
|
||||
elif pdf_backend == PdfBackend.DLPARSE_V2:
|
||||
backend = DoclingParseV2DocumentBackend
|
||||
elif pdf_backend == PdfBackend.DLPARSE_V4:
|
||||
backend = DoclingParseV4DocumentBackend # type: ignore
|
||||
elif pdf_backend == PdfBackend.PYPDFIUM2:
|
||||
backend = PyPdfiumDocumentBackend # type: ignore
|
||||
else:
|
||||
backend_map = {
|
||||
PdfBackend.DLPARSE_V1: DoclingParseDocumentBackend,
|
||||
PdfBackend.DLPARSE_V2: DoclingParseV2DocumentBackend,
|
||||
PdfBackend.DLPARSE_V4: DoclingParseV4DocumentBackend,
|
||||
PdfBackend.PYPDFIUM2: PyPdfiumDocumentBackend,
|
||||
}
|
||||
backend = backend_map.get(pdf_backend)
|
||||
if not backend:
|
||||
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
||||
|
||||
pdf_format_option = PdfFormatOption(
|
||||
@ -611,22 +509,23 @@ def convert( # noqa: C901
|
||||
enable_remote_services=enable_remote_services,
|
||||
)
|
||||
|
||||
if vlm_model == VlmModelType.GRANITE_VISION:
|
||||
pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
|
||||
elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
|
||||
pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
|
||||
elif vlm_model == VlmModelType.SMOLDOCLING:
|
||||
pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
|
||||
if sys.platform == "darwin":
|
||||
try:
|
||||
import mlx_vlm
|
||||
vlm_model_map = {
|
||||
VlmModelType.GRANITE_VISION: GRANITE_VISION_TRANSFORMERS,
|
||||
VlmModelType.GRANITE_VISION_OLLAMA: GRANITE_VISION_OLLAMA,
|
||||
VlmModelType.SMOLDOCLING: SMOLDOCLING_TRANSFORMERS,
|
||||
}
|
||||
pipeline_options.vlm_options = vlm_model_map.get(vlm_model)
|
||||
|
||||
pipeline_options.vlm_options = SMOLDOCLING_MLX
|
||||
except ImportError:
|
||||
_log.warning(
|
||||
"To run SmolDocling faster, please install mlx-vlm:\n"
|
||||
"pip install mlx-vlm"
|
||||
)
|
||||
if vlm_model == VlmModelType.SMOLDOCLING and sys.platform == "darwin":
|
||||
try:
|
||||
import mlx_vlm
|
||||
|
||||
pipeline_options.vlm_options = SMOLDOCLING_MLX
|
||||
except ImportError:
|
||||
_log.warning(
|
||||
"To run SmolDocling faster, please install mlx-vlm:\n"
|
||||
"pip install mlx-vlm"
|
||||
)
|
||||
|
||||
pdf_format_option = PdfFormatOption(
|
||||
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
|
||||
@ -643,19 +542,16 @@ def convert( # noqa: C901
|
||||
# artifacts_path = artifacts_path
|
||||
)
|
||||
|
||||
if asr_model == AsrModelType.WHISPER_TINY:
|
||||
pipeline_options.asr_options = WHISPER_TINY
|
||||
elif asr_model == AsrModelType.WHISPER_SMALL:
|
||||
pipeline_options.asr_options = WHISPER_SMALL
|
||||
elif asr_model == AsrModelType.WHISPER_MEDIUM:
|
||||
pipeline_options.asr_options = WHISPER_MEDIUM
|
||||
elif asr_model == AsrModelType.WHISPER_BASE:
|
||||
pipeline_options.asr_options = WHISPER_BASE
|
||||
elif asr_model == AsrModelType.WHISPER_LARGE:
|
||||
pipeline_options.asr_options = WHISPER_LARGE
|
||||
elif asr_model == AsrModelType.WHISPER_TURBO:
|
||||
pipeline_options.asr_options = WHISPER_TURBO
|
||||
else:
|
||||
asr_model_map = {
|
||||
AsrModelType.WHISPER_TINY: WHISPER_TINY,
|
||||
AsrModelType.WHISPER_SMALL: WHISPER_SMALL,
|
||||
AsrModelType.WHISPER_MEDIUM: WHISPER_MEDIUM,
|
||||
AsrModelType.WHISPER_BASE: WHISPER_BASE,
|
||||
AsrModelType.WHISPER_LARGE: WHISPER_LARGE,
|
||||
AsrModelType.WHISPER_TURBO: WHISPER_TURBO,
|
||||
}
|
||||
pipeline_options.asr_options = asr_model_map.get(asr_model)
|
||||
if not pipeline_options.asr_options:
|
||||
_log.error(f"{asr_model} is not known")
|
||||
raise ValueError(f"{asr_model} is not known")
|
||||
|
||||
@ -670,9 +566,8 @@ def convert( # noqa: C901
|
||||
InputFormat.AUDIO: audio_format_option,
|
||||
}
|
||||
|
||||
if artifacts_path is not None:
|
||||
if artifacts_path:
|
||||
pipeline_options.artifacts_path = artifacts_path
|
||||
# audio_pipeline_options.artifacts_path = artifacts_path
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
allowed_formats=from_formats,
|
||||
|
@ -14,31 +14,15 @@ from typing_extensions import deprecated
|
||||
from docling.datamodel import asr_model_specs
|
||||
|
||||
# Import the following for backwards compatibility
|
||||
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
||||
from docling.datamodel.pipeline_options_asr_model import (
|
||||
InlineAsrOptions,
|
||||
)
|
||||
from docling.datamodel.pipeline_options_vlm_model import (
|
||||
ApiVlmOptions,
|
||||
InferenceFramework,
|
||||
InlineVlmOptions,
|
||||
ResponseFormat,
|
||||
)
|
||||
from docling.datamodel.vlm_model_specs import (
|
||||
GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
|
||||
GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
|
||||
SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
|
||||
SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
|
||||
VlmModelType,
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BaseOptions(BaseModel):
|
||||
"""Base class for options."""
|
||||
|
||||
kind: ClassVar[str]
|
||||
|
||||
|
||||
class TableFormerMode(str, Enum):
|
||||
@ -200,16 +184,7 @@ class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
|
||||
return self.repo_id.replace("/", "--")
|
||||
|
||||
|
||||
# SmolVLM
|
||||
smolvlm_picture_description = PictureDescriptionVlmOptions(
|
||||
repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
|
||||
)
|
||||
|
||||
# GraniteVision
|
||||
granite_picture_description = PictureDescriptionVlmOptions(
|
||||
repo_id="ibm-granite/granite-vision-3.2-2b-preview",
|
||||
prompt="What is shown in this image?",
|
||||
)
|
||||
|
||||
|
||||
# Define an enum for the backend options
|
||||
@ -223,15 +198,7 @@ class PdfBackend(str, Enum):
|
||||
|
||||
|
||||
# Define an enum for the ocr engines
|
||||
@deprecated("Use ocr_factory.registered_enum")
|
||||
class OcrEngine(str, Enum):
|
||||
"""Enum of valid OCR engines."""
|
||||
|
||||
EASYOCR = "easyocr"
|
||||
TESSERACT_CLI = "tesseract_cli"
|
||||
TESSERACT = "tesseract"
|
||||
OCRMAC = "ocrmac"
|
||||
RAPIDOCR = "rapidocr"
|
||||
|
||||
|
||||
class PipelineOptions(BaseModel):
|
||||
@ -246,68 +213,10 @@ class PipelineOptions(BaseModel):
|
||||
allow_external_plugins: bool = False
|
||||
|
||||
|
||||
class PaginatedPipelineOptions(PipelineOptions):
|
||||
|
||||
|
||||
|
||||
class VlmPipelineOptions(PipelineOptions):
|
||||
artifacts_path: Optional[Union[Path, str]] = None
|
||||
|
||||
images_scale: float = 1.0
|
||||
generate_page_images: bool = False
|
||||
generate_picture_images: bool = False
|
||||
|
||||
|
||||
class VlmPipelineOptions(PaginatedPipelineOptions):
|
||||
generate_page_images: bool = True
|
||||
force_backend_text: bool = (
|
||||
False # (To be used with vlms, or other generative models)
|
||||
)
|
||||
# If True, text from backend will be used instead of generated text
|
||||
vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
|
||||
smoldocling_vlm_conversion_options
|
||||
)
|
||||
|
||||
|
||||
class AsrPipelineOptions(PipelineOptions):
|
||||
asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
|
||||
artifacts_path: Optional[Union[Path, str]] = None
|
||||
|
||||
|
||||
class PdfPipelineOptions(PaginatedPipelineOptions):
|
||||
"""Options for the PDF pipeline."""
|
||||
|
||||
do_table_structure: bool = True # True: perform table structure extraction
|
||||
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
|
||||
do_code_enrichment: bool = False # True: perform code OCR
|
||||
do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
|
||||
do_picture_classification: bool = False # True: classify pictures in documents
|
||||
do_picture_description: bool = False # True: run describe pictures in documents
|
||||
force_backend_text: bool = (
|
||||
False # (To be used with vlms, or other generative models)
|
||||
)
|
||||
# If True, text from backend will be used instead of generated text
|
||||
|
||||
table_structure_options: TableStructureOptions = TableStructureOptions()
|
||||
ocr_options: OcrOptions = EasyOcrOptions()
|
||||
picture_description_options: PictureDescriptionBaseOptions = (
|
||||
smolvlm_picture_description
|
||||
)
|
||||
|
||||
images_scale: float = 1.0
|
||||
generate_page_images: bool = False
|
||||
generate_picture_images: bool = False
|
||||
generate_table_images: bool = Field(
|
||||
default=False,
|
||||
deprecated=(
|
||||
"Field `generate_table_images` is deprecated. "
|
||||
"To obtain table images, set `PdfPipelineOptions.generate_page_images = True` "
|
||||
"before conversion and then use the `TableItem.get_image` function."
|
||||
),
|
||||
)
|
||||
|
||||
generate_parsed_pages: Literal[True] = (
|
||||
True # Always True since parsed_page is now mandatory
|
||||
)
|
||||
|
||||
|
||||
class ProcessingPipeline(str, Enum):
|
||||
STANDARD = "standard"
|
||||
VLM = "vlm"
|
||||
ASR = "asr"
|
||||
|
||||
|
@ -6,23 +6,14 @@ from pydantic import BaseModel, PlainValidator
|
||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||
|
||||
|
||||
def _validate_page_range(v: Tuple[int, int]) -> Tuple[int, int]:
|
||||
if v[0] < 1 or v[1] < v[0]:
|
||||
raise ValueError(
|
||||
"Invalid page range: start must be ≥ 1 and end must be ≥ start."
|
||||
)
|
||||
return v
|
||||
|
||||
|
||||
PageRange = Annotated[Tuple[int, int], PlainValidator(_validate_page_range)]
|
||||
|
||||
DEFAULT_PAGE_RANGE: PageRange = (1, sys.maxsize)
|
||||
|
||||
|
||||
class DocumentLimits(BaseModel):
|
||||
max_num_pages: int = sys.maxsize
|
||||
max_file_size: int = sys.maxsize
|
||||
page_range: PageRange = DEFAULT_PAGE_RANGE
|
||||
|
||||
|
||||
class BatchConcurrencySettings(BaseModel):
|
||||
@ -32,14 +23,7 @@ class BatchConcurrencySettings(BaseModel):
|
||||
page_batch_concurrency: int = 2
|
||||
elements_batch_size: int = 16
|
||||
|
||||
# doc_batch_size: int = 1
|
||||
# doc_batch_concurrency: int = 1
|
||||
# page_batch_size: int = 1
|
||||
# page_batch_concurrency: int = 1
|
||||
|
||||
# model_concurrency: int = 2
|
||||
|
||||
# To force models into single core: export OMP_NUM_THREADS=1
|
||||
|
||||
|
||||
|
||||
class DebugSettings(BaseModel):
|
||||
|
@ -65,65 +65,8 @@ class FormatOption(BaseModel):
|
||||
return self
|
||||
|
||||
|
||||
class CsvFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = CsvDocumentBackend
|
||||
|
||||
|
||||
class ExcelFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
|
||||
|
||||
|
||||
class WordFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
|
||||
|
||||
|
||||
class PowerpointFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
|
||||
|
||||
|
||||
class MarkdownFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
|
||||
|
||||
|
||||
class AsciiDocFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = AsciiDocBackend
|
||||
|
||||
|
||||
class HTMLFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
|
||||
|
||||
|
||||
class PatentUsptoFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
|
||||
|
||||
|
||||
class XMLJatsFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = JatsDocumentBackend
|
||||
|
||||
|
||||
class ImageFormatOption(FormatOption):
|
||||
pipeline_cls: Type = StandardPdfPipeline
|
||||
backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
|
||||
|
||||
|
||||
class PdfFormatOption(FormatOption):
|
||||
pipeline_cls: Type = StandardPdfPipeline
|
||||
backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
|
||||
|
||||
|
||||
class AudioFormatOption(FormatOption):
|
||||
pipeline_cls: Type = AsrPipeline
|
||||
backend: Type[AbstractDocumentBackend] = NoOpBackend
|
||||
|
||||
|
||||
def _get_default_option(format: InputFormat) -> FormatOption:
|
||||
format_to_default_options = {
|
||||
@ -167,12 +110,11 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
||||
}
|
||||
if (options := format_to_default_options.get(format)) is not None:
|
||||
return options
|
||||
else:
|
||||
raise RuntimeError(f"No default options configured for {format}")
|
||||
raise RuntimeError(f"No default options configured for {format}")
|
||||
|
||||
|
||||
class DocumentConverter:
|
||||
_default_download_filename = "file"
|
||||
_default_filename = "file"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@ -194,10 +136,7 @@ class DocumentConverter:
|
||||
Tuple[Type[BasePipeline], str], BasePipeline
|
||||
] = {}
|
||||
|
||||
def _get_initialized_pipelines(
|
||||
self,
|
||||
) -> dict[tuple[Type[BasePipeline], str], BasePipeline]:
|
||||
return self.initialized_pipelines
|
||||
|
||||
|
||||
def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
|
||||
"""Generate a hash of pipeline options to use as part of the cache key."""
|
||||
@ -217,7 +156,7 @@ class DocumentConverter:
|
||||
@validate_call(config=ConfigDict(strict=True))
|
||||
def convert(
|
||||
self,
|
||||
source: Union[Path, str, DocumentStream], # TODO review naming
|
||||
documents: Union[Path, str, DocumentStream], # TODO review naming
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
raises_on_error: bool = True,
|
||||
max_num_pages: int = sys.maxsize,
|
||||
@ -225,7 +164,7 @@ class DocumentConverter:
|
||||
page_range: PageRange = DEFAULT_PAGE_RANGE,
|
||||
) -> ConversionResult:
|
||||
all_res = self.convert_all(
|
||||
source=[source],
|
||||
documents=[documents],
|
||||
raises_on_error=raises_on_error,
|
||||
max_num_pages=max_num_pages,
|
||||
max_file_size=max_file_size,
|
||||
@ -237,7 +176,7 @@ class DocumentConverter:
|
||||
@validate_call(config=ConfigDict(strict=True))
|
||||
def convert_all(
|
||||
self,
|
||||
source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
|
||||
documents: Iterable[Union[Path, str, DocumentStream]], # TODO review naming
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error
|
||||
max_num_pages: int = sys.maxsize,
|
||||
@ -249,28 +188,10 @@ class DocumentConverter:
|
||||
max_file_size=max_file_size,
|
||||
page_range=page_range,
|
||||
)
|
||||
conv_input = _DocumentConversionInput(
|
||||
path_or_stream_iterator=source, limits=limits, headers=headers
|
||||
)
|
||||
conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
|
||||
"""Converts a batch of documents.
|
||||
|
||||
had_result = False
|
||||
for conv_res in conv_res_iter:
|
||||
had_result = True
|
||||
if raises_on_error and conv_res.status not in {
|
||||
ConversionStatus.SUCCESS,
|
||||
ConversionStatus.PARTIAL_SUCCESS,
|
||||
}:
|
||||
raise ConversionError(
|
||||
f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
|
||||
)
|
||||
else:
|
||||
yield conv_res
|
||||
|
||||
if not had_result and raises_on_error:
|
||||
raise ConversionError(
|
||||
"Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
|
||||
)
|
||||
Note: PDF backends are not thread-safe, so thread pool usage is disabled.
|
||||
"""
|
||||
|
||||
def _convert(
|
||||
self, conv_input: _DocumentConversionInput, raises_on_error: bool
|
||||
@ -380,5 +301,6 @@ class DocumentConverter:
|
||||
status=ConversionStatus.FAILURE,
|
||||
)
|
||||
# TODO add error log why it failed.
|
||||
_log.error(f"Input document {in_doc.file} is not valid.")
|
||||
|
||||
return conv_res
|
||||
|
Loading…
Reference in New Issue
Block a user