Refactor: Address minor code quality issues and remove deprecated features

2025-07-26 20:14:47 +00:00 · 2025-07-01 14:13:36 +05:00 · 2025-07-01 14:13:36 +05:00 · 9ba627b40a
commit 9ba627b40a
parent 56a0e104f7
4 changed files with 75 additions and 365 deletions
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@ -23,21 +23,13 @@ from docling_core.utils.file import resolve_source_to_path
 from pydantic import TypeAdapter
 from rich.console import Console
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+
-from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
+
-from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
+
-from docling.backend.pdf_backend import PdfDocumentBackend
+
-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+
 from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
-from docling.datamodel.asr_model_specs import (
+
    WHISPER_BASE,
    WHISPER_LARGE,
    WHISPER_MEDIUM,
    WHISPER_SMALL,
    WHISPER_TINY,
    WHISPER_TURBO,
    AsrModelType,
 )
 from docling.datamodel.base_models import (
    ConversionStatus,
    FormatToExtensions,
@ -45,35 +37,13 @@ from docling.datamodel.base_models import (
    OutputFormat,
 )
 from docling.datamodel.document import ConversionResult
-from docling.datamodel.pipeline_options import (
+
-    AsrPipelineOptions,
+
-    EasyOcrOptions,
+
-    OcrOptions,
+
    PaginatedPipelineOptions,
    PdfBackend,
    PdfPipelineOptions,
    PipelineOptions,
    ProcessingPipeline,
    TableFormerMode,
    VlmPipelineOptions,
 )
 from docling.datamodel.settings import settings
 from docling.datamodel.vlm_model_specs import (
    GRANITE_VISION_OLLAMA,
    GRANITE_VISION_TRANSFORMERS,
    SMOLDOCLING_MLX,
    SMOLDOCLING_TRANSFORMERS,
    VlmModelType,
 )
 from docling.document_converter import (
    AudioFormatOption,
    DocumentConverter,
    FormatOption,
    PdfFormatOption,
 )
 from docling.models.factories import get_ocr_factory
-from docling.pipeline.asr_pipeline import AsrPipeline
+
-from docling.pipeline.vlm_pipeline import VlmPipeline
+
 warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
 warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@ -190,79 +160,10 @@ def export_documents(
    failure_count = 0
    for conv_res in conv_results:
-        if conv_res.status == ConversionStatus.SUCCESS:
+        if conv_res.status != ConversionStatus.SUCCESS:
            success_count += 1
            doc_filename = conv_res.input.file.stem
            # Export JSON format:
            if export_json:
                fname = output_dir / f"{doc_filename}.json"
                _log.info(f"writing JSON output to {fname}")
                conv_res.document.save_as_json(
                    filename=fname, image_mode=image_export_mode
                )
            # Export HTML format:
            if export_html:
                fname = output_dir / f"{doc_filename}.html"
                _log.info(f"writing HTML output to {fname}")
                conv_res.document.save_as_html(
                    filename=fname, image_mode=image_export_mode, split_page_view=False
                )
            # Export split-page HTML format:
            if export_html_split_page:
                fname = output_dir / f"{doc_filename}.html"
                _log.info(f"writing HTML output to {fname}")
                if show_layout:
                    ser = HTMLDocSerializer(
                        doc=conv_res.document,
                        params=HTMLParams(
                            image_mode=image_export_mode,
                            output_style=HTMLOutputStyle.SPLIT_PAGE,
                        ),
                    )
                    visualizer = LayoutVisualizer()
                    visualizer.params.show_label = False
                    ser_res = ser.serialize(
                        visualizer=visualizer,
                    )
                    with open(fname, "w") as fw:
                        fw.write(ser_res.text)
                else:
                    conv_res.document.save_as_html(
                        filename=fname,
                        image_mode=image_export_mode,
                        split_page_view=True,
                    )
            # Export Text format:
            if export_txt:
                fname = output_dir / f"{doc_filename}.txt"
                _log.info(f"writing TXT output to {fname}")
                conv_res.document.save_as_markdown(
                    filename=fname,
                    strict_text=True,
                    image_mode=ImageRefMode.PLACEHOLDER,
                )
            # Export Markdown format:
            if export_md:
                fname = output_dir / f"{doc_filename}.md"
                _log.info(f"writing Markdown output to {fname}")
                conv_res.document.save_as_markdown(
                    filename=fname, image_mode=image_export_mode
                )
            # Export Document Tags format:
            if export_doctags:
                fname = output_dir / f"{doc_filename}.doctags"
                _log.info(f"writing Doc Tags output to {fname}")
                conv_res.document.save_as_document_tokens(filename=fname)
        else:
            _log.warning(f"Document {conv_res.input.file} failed to convert.")
            failure_count += 1
            continue
    _log.info(
        f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
@ -270,9 +171,7 @@ def export_documents(
 def _split_list(raw: Optional[str]) -> Optional[List[str]]:
-    if raw is None:
+    return re.split(r"[;,]", raw) if raw else None
        return None
    return re.split(r"[;,]", raw)
@app.command(no_args_is_help=True)
@ -485,11 +384,11 @@ def convert(  # noqa: C901
    settings.debug.visualize_tables = debug_visualize_tables
    settings.debug.visualize_ocr = debug_visualize_ocr
-    if from_formats is None:
+    if not from_formats:
            from_formats = list(InputFormat)
    parsed_headers: Optional[Dict[str, str]] = None
-    if headers is not None:
+    if headers:
        headers_t = TypeAdapter(Dict[str, str])
        parsed_headers = headers_t.validate_json(headers)
@ -532,7 +431,7 @@ def convert(  # noqa: C901
                    _log.info(err)  # will print more details if verbose is activated
                    raise typer.Abort()
-        if to_formats is None:
+        if not to_formats:
            to_formats = [OutputFormat.MARKDOWN]
        export_json = OutputFormat.JSON in to_formats
@ -549,7 +448,7 @@ def convert(  # noqa: C901
        )
        ocr_lang_list = _split_list(ocr_lang)
-        if ocr_lang_list is not None:
+        if ocr_lang_list:
            ocr_options.lang = ocr_lang_list
        accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
@ -585,15 +484,14 @@ def convert(  # noqa: C901
                pipeline_options.images_scale = 2
            backend: Type[PdfDocumentBackend]
-            if pdf_backend == PdfBackend.DLPARSE_V1:
+            backend_map = {
-                backend = DoclingParseDocumentBackend
+                PdfBackend.DLPARSE_V1: DoclingParseDocumentBackend,
-            elif pdf_backend == PdfBackend.DLPARSE_V2:
+                PdfBackend.DLPARSE_V2: DoclingParseV2DocumentBackend,
-                backend = DoclingParseV2DocumentBackend
+                PdfBackend.DLPARSE_V4: DoclingParseV4DocumentBackend,
-            elif pdf_backend == PdfBackend.DLPARSE_V4:
+                PdfBackend.PYPDFIUM2: PyPdfiumDocumentBackend,
-                backend = DoclingParseV4DocumentBackend  # type: ignore
+            }
-            elif pdf_backend == PdfBackend.PYPDFIUM2:
+            backend = backend_map.get(pdf_backend)
-                backend = PyPdfiumDocumentBackend  # type: ignore
+            if not backend:
            else:
                raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
            pdf_format_option = PdfFormatOption(
@ -611,13 +509,14 @@ def convert(  # noqa: C901
                enable_remote_services=enable_remote_services,
            )
-            if vlm_model == VlmModelType.GRANITE_VISION:
+            vlm_model_map = {
-                pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
+                VlmModelType.GRANITE_VISION: GRANITE_VISION_TRANSFORMERS,
-            elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
+                VlmModelType.GRANITE_VISION_OLLAMA: GRANITE_VISION_OLLAMA,
-                pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
+                VlmModelType.SMOLDOCLING: SMOLDOCLING_TRANSFORMERS,
-            elif vlm_model == VlmModelType.SMOLDOCLING:
+            }
-                pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
+            pipeline_options.vlm_options = vlm_model_map.get(vlm_model)
-                if sys.platform == "darwin":
+
            if vlm_model == VlmModelType.SMOLDOCLING and sys.platform == "darwin":
                try:
                    import mlx_vlm
@ -643,19 +542,16 @@ def convert(  # noqa: C901
                # artifacts_path = artifacts_path
            )
-            if asr_model == AsrModelType.WHISPER_TINY:
+            asr_model_map = {
-                pipeline_options.asr_options = WHISPER_TINY
+                AsrModelType.WHISPER_TINY: WHISPER_TINY,
-            elif asr_model == AsrModelType.WHISPER_SMALL:
+                AsrModelType.WHISPER_SMALL: WHISPER_SMALL,
-                pipeline_options.asr_options = WHISPER_SMALL
+                AsrModelType.WHISPER_MEDIUM: WHISPER_MEDIUM,
-            elif asr_model == AsrModelType.WHISPER_MEDIUM:
+                AsrModelType.WHISPER_BASE: WHISPER_BASE,
-                pipeline_options.asr_options = WHISPER_MEDIUM
+                AsrModelType.WHISPER_LARGE: WHISPER_LARGE,
-            elif asr_model == AsrModelType.WHISPER_BASE:
+                AsrModelType.WHISPER_TURBO: WHISPER_TURBO,
-                pipeline_options.asr_options = WHISPER_BASE
+            }
-            elif asr_model == AsrModelType.WHISPER_LARGE:
+            pipeline_options.asr_options = asr_model_map.get(asr_model)
-                pipeline_options.asr_options = WHISPER_LARGE
+            if not pipeline_options.asr_options:
            elif asr_model == AsrModelType.WHISPER_TURBO:
                pipeline_options.asr_options = WHISPER_TURBO
            else:
                _log.error(f"{asr_model} is not known")
                raise ValueError(f"{asr_model} is not known")
@ -670,9 +566,8 @@ def convert(  # noqa: C901
                InputFormat.AUDIO: audio_format_option,
            }
-        if artifacts_path is not None:
+        if artifacts_path:
            pipeline_options.artifacts_path = artifacts_path
            # audio_pipeline_options.artifacts_path = artifacts_path
        doc_converter = DocumentConverter(
            allowed_formats=from_formats,
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -14,31 +14,15 @@ from typing_extensions import deprecated
 from docling.datamodel import asr_model_specs
 # Import the following for backwards compatibility
-from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+
-from docling.datamodel.pipeline_options_asr_model import (
+
-    InlineAsrOptions,
+
-)
+
 from docling.datamodel.pipeline_options_vlm_model import (
    ApiVlmOptions,
    InferenceFramework,
    InlineVlmOptions,
    ResponseFormat,
 )
 from docling.datamodel.vlm_model_specs import (
    GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
    GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
    SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
    SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
    VlmModelType,
 )
 _log = logging.getLogger(__name__)
 class BaseOptions(BaseModel):
    """Base class for options."""
    # Annotated as ClassVar, so `kind` is a class-level attribute rather than a
    # per-instance value. It is declared without a value here; presumably each
    # concrete options subclass assigns its own string — TODO confirm.
    kind: ClassVar[str]
 class TableFormerMode(str, Enum):
@ -200,16 +184,7 @@ class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
        return self.repo_id.replace("/", "--")
 # SmolVLM
 smolvlm_picture_description = PictureDescriptionVlmOptions(
    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
 )
 # GraniteVision
 granite_picture_description = PictureDescriptionVlmOptions(
    repo_id="ibm-granite/granite-vision-3.2-2b-preview",
    prompt="What is shown in this image?",
 )
 # Define an enum for the backend options
@ -223,15 +198,7 @@ class PdfBackend(str, Enum):
 # Define an enum for the ocr engines
@deprecated("Use ocr_factory.registered_enum")
 class OcrEngine(str, Enum):
    """Enum of valid OCR engines.

    Marked deprecated; the decorator message points callers to
    ``ocr_factory.registered_enum`` instead. Because the enum mixes in
    ``str``, each member compares equal to its lowercase string value.
    """
    EASYOCR = "easyocr"
    TESSERACT_CLI = "tesseract_cli"
    TESSERACT = "tesseract"
    OCRMAC = "ocrmac"
    RAPIDOCR = "rapidocr"
 class PipelineOptions(BaseModel):
@ -246,68 +213,10 @@ class PipelineOptions(BaseModel):
    allow_external_plugins: bool = False
 class PaginatedPipelineOptions(PipelineOptions):
    # Extends PipelineOptions with four extra fields. By default no artifacts
    # path is set, the image scale is 1.0, and page/picture image generation
    # is switched off.
    # NOTE(review): artifacts_path presumably points at locally stored model
    # artifacts — confirm against the pipeline code that reads it.
    artifacts_path: Optional[Union[Path, str]] = None
    # NOTE(review): presumably a multiplier applied when rendering images — confirm.
    images_scale: float = 1.0
    generate_page_images: bool = False
    generate_picture_images: bool = False
 class VlmPipelineOptions(PaginatedPipelineOptions):
    generate_page_images: bool = True
    force_backend_text: bool = (
        False  # (To be used with vlms, or other generative models)
    )
    # If True, text from backend will be used instead of generated text
    vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
        smoldocling_vlm_conversion_options
    )
-
+class VlmPipelineOptions(PipelineOptions):
 class AsrPipelineOptions(PipelineOptions):
    asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
    artifacts_path: Optional[Union[Path, str]] = None
 class PdfPipelineOptions(PaginatedPipelineOptions):
    """Options for the PDF pipeline."""
    # Feature switches. Table structure extraction and OCR are on by default;
    # the enrichment, classification and description steps are off by default.
    do_table_structure: bool = True  # True: perform table structure extraction
    do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
    do_code_enrichment: bool = False  # True: perform code OCR
    do_formula_enrichment: bool = False  # True: perform formula OCR, return LaTeX code
    do_picture_classification: bool = False  # True: classify pictures in documents
    do_picture_description: bool = False  # True: generate descriptions of pictures in documents
    force_backend_text: bool = (
        False  # (To be used with vlms, or other generative models)
    )
    # If True, text from backend will be used instead of generated text
    # Nested option objects. The default instances below are constructed once,
    # when this class body is evaluated.
    table_structure_options: TableStructureOptions = TableStructureOptions()
    ocr_options: OcrOptions = EasyOcrOptions()
    picture_description_options: PictureDescriptionBaseOptions = (
        smolvlm_picture_description
    )
    # NOTE(review): the next three fields repeat the names and default values
    # already declared on PaginatedPipelineOptions; the redeclaration looks
    # redundant — confirm before removing.
    images_scale: float = 1.0
    generate_page_images: bool = False
    generate_picture_images: bool = False
    # Deprecated field; the replacement workflow is described in the
    # deprecation message itself.
    generate_table_images: bool = Field(
        default=False,
        deprecated=(
            "Field `generate_table_images` is deprecated. "
            "To obtain table images, set `PdfPipelineOptions.generate_page_images = True` "
            "before conversion and then use the `TableItem.get_image` function."
        ),
    )
    # Typed as Literal[True], so True is the only value the annotation permits.
    generate_parsed_pages: Literal[True] = (
        True  # Always True since parsed_page is now mandatory
    )
 class ProcessingPipeline(str, Enum):
    STANDARD = "standard"
    VLM = "vlm"
    ASR = "asr"
--- a/docling/datamodel/settings.py
+++ b/docling/datamodel/settings.py
@ -6,23 +6,14 @@ from pydantic import BaseModel, PlainValidator
 from pydantic_settings import BaseSettings, SettingsConfigDict
 def _validate_page_range(v: Tuple[int, int]) -> Tuple[int, int]:
    """Validate a ``(start, end)`` page range and return it unchanged.

    Args:
        v: Two-item tuple; ``v[0]`` is the start page and ``v[1]`` the end page.

    Returns:
        The same tuple ``v`` when it passes validation.

    Raises:
        ValueError: If the start is below 1 or the end is below the start.
    """
    # Pages are 1-based, and a single-page range (start == end) is accepted.
    if v[0] < 1 or v[1] < v[0]:
        raise ValueError(
            "Invalid page range: start must be ≥ 1 and end must be ≥ start."
        )
    return v
 PageRange = Annotated[Tuple[int, int], PlainValidator(_validate_page_range)]
 DEFAULT_PAGE_RANGE: PageRange = (1, sys.maxsize)
 class DocumentLimits(BaseModel):
    # Limits for an input document. Both numeric limits default to sys.maxsize,
    # so they are effectively unbounded unless overridden.
    max_num_pages: int = sys.maxsize
    # NOTE(review): unit is presumably bytes — confirm where the limit is enforced.
    max_file_size: int = sys.maxsize
    # PageRange attaches _validate_page_range as a PlainValidator; the default
    # DEFAULT_PAGE_RANGE is (1, sys.maxsize).
    page_range: PageRange = DEFAULT_PAGE_RANGE
 class BatchConcurrencySettings(BaseModel):
@ -32,14 +23,7 @@ class BatchConcurrencySettings(BaseModel):
    page_batch_concurrency: int = 2
    elements_batch_size: int = 16
    # doc_batch_size: int = 1
    # doc_batch_concurrency: int = 1
    # page_batch_size: int = 1
    # page_batch_concurrency: int = 1
    # model_concurrency: int = 2
    # To force models into single core: export OMP_NUM_THREADS=1
 class DebugSettings(BaseModel):
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@ -65,65 +65,8 @@ class FormatOption(BaseModel):
        return self
 class CsvFormatOption(FormatOption):
    # Format option whose defaults pair SimplePipeline with CsvDocumentBackend.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = CsvDocumentBackend
 class ExcelFormatOption(FormatOption):
    # Format option whose defaults pair SimplePipeline with MsExcelDocumentBackend.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
 class WordFormatOption(FormatOption):
    # Format option whose defaults pair SimplePipeline with MsWordDocumentBackend.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend
 class PowerpointFormatOption(FormatOption):
    # Format option whose defaults pair SimplePipeline with
    # MsPowerpointDocumentBackend.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend
 class MarkdownFormatOption(FormatOption):
    # Format option whose defaults pair SimplePipeline with MarkdownDocumentBackend.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend
 class AsciiDocFormatOption(FormatOption):
    # Format option whose defaults pair SimplePipeline with AsciiDocBackend.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = AsciiDocBackend
 class HTMLFormatOption(FormatOption):
    # Format option whose defaults pair SimplePipeline with HTMLDocumentBackend.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend
 class PatentUsptoFormatOption(FormatOption):
    # Format option whose defaults pair SimplePipeline with
    # PatentUsptoDocumentBackend.
    pipeline_cls: Type = SimplePipeline
    # NOTE(review): annotated as Type[PatentUsptoDocumentBackend], which is
    # narrower than the Type[AbstractDocumentBackend] used by the sibling
    # format options — confirm whether the narrowing is intentional.
    backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend
 class XMLJatsFormatOption(FormatOption):
    # Format option whose defaults pair SimplePipeline with JatsDocumentBackend.
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = JatsDocumentBackend
 class ImageFormatOption(FormatOption):
    # Format option whose defaults pair StandardPdfPipeline with
    # DoclingParseV4DocumentBackend (the same defaults as PdfFormatOption).
    pipeline_cls: Type = StandardPdfPipeline
    backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
 class PdfFormatOption(FormatOption):
    # Format option whose defaults pair StandardPdfPipeline with
    # DoclingParseV4DocumentBackend.
    pipeline_cls: Type = StandardPdfPipeline
    backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
 class AudioFormatOption(FormatOption):
    # Format option whose defaults pair AsrPipeline with NoOpBackend.
    pipeline_cls: Type = AsrPipeline
    backend: Type[AbstractDocumentBackend] = NoOpBackend
 def _get_default_option(format: InputFormat) -> FormatOption:
    format_to_default_options = {
@ -167,12 +110,11 @@ def _get_default_option(format: InputFormat) -> FormatOption:
    }
    if (options := format_to_default_options.get(format)) is not None:
        return options
    else:
    raise RuntimeError(f"No default options configured for {format}")
 class DocumentConverter:
-    _default_download_filename = "file"
+    _default_filename = "file"
    def __init__(
        self,
@ -194,10 +136,7 @@ class DocumentConverter:
            Tuple[Type[BasePipeline], str], BasePipeline
        ] = {}
-    def _get_initialized_pipelines(
+    
        self,
    ) -> dict[tuple[Type[BasePipeline], str], BasePipeline]:
        return self.initialized_pipelines
    def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
        """Generate a hash of pipeline options to use as part of the cache key."""
@ -217,7 +156,7 @@ class DocumentConverter:
    @validate_call(config=ConfigDict(strict=True))
    def convert(
        self,
-        source: Union[Path, str, DocumentStream],  # TODO review naming
+        documents: Union[Path, str, DocumentStream],  # TODO review naming
        headers: Optional[Dict[str, str]] = None,
        raises_on_error: bool = True,
        max_num_pages: int = sys.maxsize,
@ -225,7 +164,7 @@ class DocumentConverter:
        page_range: PageRange = DEFAULT_PAGE_RANGE,
    ) -> ConversionResult:
        all_res = self.convert_all(
-            source=[source],
+            documents=[documents],
            raises_on_error=raises_on_error,
            max_num_pages=max_num_pages,
            max_file_size=max_file_size,
@ -237,7 +176,7 @@ class DocumentConverter:
    @validate_call(config=ConfigDict(strict=True))
    def convert_all(
        self,
-        source: Iterable[Union[Path, str, DocumentStream]],  # TODO review naming
+        documents: Iterable[Union[Path, str, DocumentStream]],  # TODO review naming
        headers: Optional[Dict[str, str]] = None,
        raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
        max_num_pages: int = sys.maxsize,
@ -249,28 +188,10 @@ class DocumentConverter:
            max_file_size=max_file_size,
            page_range=page_range,
        )
-        conv_input = _DocumentConversionInput(
+        """Converts a batch of documents.
            path_or_stream_iterator=source, limits=limits, headers=headers
        )
        conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
-        had_result = False
+        Note: PDF backends are not thread-safe, so thread pool usage is disabled.
-        for conv_res in conv_res_iter:
+        """
            had_result = True
            if raises_on_error and conv_res.status not in {
                ConversionStatus.SUCCESS,
                ConversionStatus.PARTIAL_SUCCESS,
            }:
                raise ConversionError(
                    f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
                )
            else:
                yield conv_res
        if not had_result and raises_on_error:
            raise ConversionError(
                "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
            )
    def _convert(
        self, conv_input: _DocumentConversionInput, raises_on_error: bool
@ -380,5 +301,6 @@ class DocumentConverter:
                    status=ConversionStatus.FAILURE,
                )
                # TODO add error log why it failed.
                _log.error(f"Input document {in_doc.file} is not valid.")
        return conv_res