Merge pull request #1 from Mirza-Samad-Ahmed-Baig/main

Refactor: Address minor code quality issues
MirzaSamad20 2025-07-01 14:56:59 +05:00 committed by GitHub
commit dc182a1e0c
4 changed files with 75 additions and 365 deletions

docling/cli/main.py

@@ -23,21 +23,13 @@ from docling_core.utils.file import resolve_source_to_path
from pydantic import TypeAdapter
from rich.console import Console
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.asr_model_specs import (
    WHISPER_BASE,
    WHISPER_LARGE,
    WHISPER_MEDIUM,
    WHISPER_SMALL,
    WHISPER_TINY,
    WHISPER_TURBO,
    AsrModelType,
)
from docling.datamodel.base_models import (
    ConversionStatus,
    FormatToExtensions,
@@ -45,35 +37,13 @@ from docling.datamodel.base_models import (
    OutputFormat,
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
    AsrPipelineOptions,
    EasyOcrOptions,
    OcrOptions,
    PaginatedPipelineOptions,
    PdfBackend,
    PdfPipelineOptions,
    PipelineOptions,
    ProcessingPipeline,
    TableFormerMode,
    VlmPipelineOptions,
)
from docling.datamodel.settings import settings
from docling.datamodel.vlm_model_specs import (
    GRANITE_VISION_OLLAMA,
    GRANITE_VISION_TRANSFORMERS,
    SMOLDOCLING_MLX,
    SMOLDOCLING_TRANSFORMERS,
    VlmModelType,
)
from docling.document_converter import (
    AudioFormatOption,
    DocumentConverter,
    FormatOption,
    PdfFormatOption,
)
from docling.models.factories import get_ocr_factory
from docling.pipeline.asr_pipeline import AsrPipeline
from docling.pipeline.vlm_pipeline import VlmPipeline
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -190,79 +160,10 @@ def export_documents(
     failure_count = 0

     for conv_res in conv_results:
-        if conv_res.status == ConversionStatus.SUCCESS:
-            success_count += 1
-            doc_filename = conv_res.input.file.stem
-
-            # Export JSON format:
-            if export_json:
-                fname = output_dir / f"{doc_filename}.json"
-                _log.info(f"writing JSON output to {fname}")
-                conv_res.document.save_as_json(
-                    filename=fname, image_mode=image_export_mode
-                )
-
-            # Export HTML format:
-            if export_html:
-                fname = output_dir / f"{doc_filename}.html"
-                _log.info(f"writing HTML output to {fname}")
-                conv_res.document.save_as_html(
-                    filename=fname, image_mode=image_export_mode, split_page_view=False
-                )
-
-            # Export HTML format:
-            if export_html_split_page:
-                fname = output_dir / f"{doc_filename}.html"
-                _log.info(f"writing HTML output to {fname}")
-                if show_layout:
-                    ser = HTMLDocSerializer(
-                        doc=conv_res.document,
-                        params=HTMLParams(
-                            image_mode=image_export_mode,
-                            output_style=HTMLOutputStyle.SPLIT_PAGE,
-                        ),
-                    )
-                    visualizer = LayoutVisualizer()
-                    visualizer.params.show_label = False
-                    ser_res = ser.serialize(
-                        visualizer=visualizer,
-                    )
-                    with open(fname, "w") as fw:
-                        fw.write(ser_res.text)
-                else:
-                    conv_res.document.save_as_html(
-                        filename=fname,
-                        image_mode=image_export_mode,
-                        split_page_view=True,
-                    )
-
-            # Export Text format:
-            if export_txt:
-                fname = output_dir / f"{doc_filename}.txt"
-                _log.info(f"writing TXT output to {fname}")
-                conv_res.document.save_as_markdown(
-                    filename=fname,
-                    strict_text=True,
-                    image_mode=ImageRefMode.PLACEHOLDER,
-                )
-
-            # Export Markdown format:
-            if export_md:
-                fname = output_dir / f"{doc_filename}.md"
-                _log.info(f"writing Markdown output to {fname}")
-                conv_res.document.save_as_markdown(
-                    filename=fname, image_mode=image_export_mode
-                )
-
-            # Export Document Tags format:
-            if export_doctags:
-                fname = output_dir / f"{doc_filename}.doctags"
-                _log.info(f"writing Doc Tags output to {fname}")
-                conv_res.document.save_as_document_tokens(filename=fname)
-        else:
+        if conv_res.status != ConversionStatus.SUCCESS:
+            _log.warning(f"Document {conv_res.input.file} failed to convert.")
+            failure_count += 1
+            continue

     _log.info(
         f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
@@ -270,9 +171,7 @@ def export_documents(
 def _split_list(raw: Optional[str]) -> Optional[List[str]]:
     if raw is None:
         return None
-    return re.split(r"[;,]", raw)
+    return re.split(r"[;,]", raw) if raw else None
@app.command(no_args_is_help=True)
@@ -485,11 +384,11 @@ def convert( # noqa: C901
     settings.debug.visualize_tables = debug_visualize_tables
     settings.debug.visualize_ocr = debug_visualize_ocr

-    if from_formats is None:
-        from_formats = list(InputFormat)
+    if not from_formats:
+        from_formats = list(InputFormat)

     parsed_headers: Optional[Dict[str, str]] = None
-    if headers is not None:
+    if headers:
         headers_t = TypeAdapter(Dict[str, str])
         parsed_headers = headers_t.validate_json(headers)
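Aside: several hunks in this file replace an "is not None" test with a plain truthiness test. The two are not equivalent for values that are present but falsy, which is easy to confirm:

for value in (None, "", '{"User-Agent": "docling"}'):
    print(repr(value), "is not None:", value is not None, "truthy:", bool(value))
# None                        is not None: False  truthy: False
# ''                          is not None: True   truthy: False  <- behavior change
# '{"User-Agent": "docling"}' is not None: True   truthy: True

For example, an empty --headers string is now silently skipped instead of being parsed (and failing JSON validation).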
@@ -532,7 +431,7 @@ def convert( # noqa: C901
         _log.info(err)  # will print more details if verbose is activated
         raise typer.Abort()

-    if to_formats is None:
+    if not to_formats:
         to_formats = [OutputFormat.MARKDOWN]

     export_json = OutputFormat.JSON in to_formats
@@ -549,7 +448,7 @@ def convert( # noqa: C901
     )

     ocr_lang_list = _split_list(ocr_lang)
-    if ocr_lang_list is not None:
+    if ocr_lang_list:
         ocr_options.lang = ocr_lang_list

     accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
@@ -585,15 +484,14 @@ def convert( # noqa: C901
             pipeline_options.images_scale = 2

         backend: Type[PdfDocumentBackend]
-        if pdf_backend == PdfBackend.DLPARSE_V1:
-            backend = DoclingParseDocumentBackend
-        elif pdf_backend == PdfBackend.DLPARSE_V2:
-            backend = DoclingParseV2DocumentBackend
-        elif pdf_backend == PdfBackend.DLPARSE_V4:
-            backend = DoclingParseV4DocumentBackend  # type: ignore
-        elif pdf_backend == PdfBackend.PYPDFIUM2:
-            backend = PyPdfiumDocumentBackend  # type: ignore
-        else:
+        backend_map = {
+            PdfBackend.DLPARSE_V1: DoclingParseDocumentBackend,
+            PdfBackend.DLPARSE_V2: DoclingParseV2DocumentBackend,
+            PdfBackend.DLPARSE_V4: DoclingParseV4DocumentBackend,
+            PdfBackend.PYPDFIUM2: PyPdfiumDocumentBackend,
+        }
+        backend = backend_map.get(pdf_backend)
+        if not backend:
             raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")

         pdf_format_option = PdfFormatOption(
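Aside: the mapping-based dispatch above is the standard replacement for an if/elif chain over an enum. A self-contained sketch of the pattern, with hypothetical backend classes standing in for the docling ones:

from enum import Enum

class Kind(str, Enum):
    V1 = "v1"
    V2 = "v2"

class BackendV1: ...
class BackendV2: ...

_BACKENDS = {Kind.V1: BackendV1, Kind.V2: BackendV2}

def pick_backend(kind: Kind) -> type:
    backend = _BACKENDS.get(kind)
    if backend is None:  # classes are always truthy, but "is None" states the intent
        raise RuntimeError(f"Unexpected PDF backend type {kind}")
    return backend

print(pick_backend(Kind.V2).__name__)  # BackendV2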
@@ -611,22 +509,23 @@ def convert( # noqa: C901
             enable_remote_services=enable_remote_services,
         )

-        if vlm_model == VlmModelType.GRANITE_VISION:
-            pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
-        elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
-            pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
-        elif vlm_model == VlmModelType.SMOLDOCLING:
-            pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
-            if sys.platform == "darwin":
-                try:
-                    import mlx_vlm
-
-                    pipeline_options.vlm_options = SMOLDOCLING_MLX
-                except ImportError:
-                    _log.warning(
-                        "To run SmolDocling faster, please install mlx-vlm:\n"
-                        "pip install mlx-vlm"
-                    )
+        vlm_model_map = {
+            VlmModelType.GRANITE_VISION: GRANITE_VISION_TRANSFORMERS,
+            VlmModelType.GRANITE_VISION_OLLAMA: GRANITE_VISION_OLLAMA,
+            VlmModelType.SMOLDOCLING: SMOLDOCLING_TRANSFORMERS,
+        }
+        pipeline_options.vlm_options = vlm_model_map.get(vlm_model)
+
+        if vlm_model == VlmModelType.SMOLDOCLING and sys.platform == "darwin":
+            try:
+                import mlx_vlm
+
+                pipeline_options.vlm_options = SMOLDOCLING_MLX
+            except ImportError:
+                _log.warning(
+                    "To run SmolDocling faster, please install mlx-vlm:\n"
+                    "pip install mlx-vlm"
+                )

         pdf_format_option = PdfFormatOption(
             pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
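Aside: the darwin branch is an optional-dependency upgrade: try the accelerated import and fall back to the portable default. An equivalent availability probe that avoids importing the module, sketched with placeholder spec names:

import importlib.util
import sys

vlm_spec = "SMOLDOCLING_TRANSFORMERS"  # placeholder for the default spec object
if sys.platform == "darwin" and importlib.util.find_spec("mlx_vlm") is not None:
    vlm_spec = "SMOLDOCLING_MLX"  # placeholder for the MLX-accelerated spec
print(vlm_spec)

Note that find_spec only checks that the package is installed; the try/except in the code above also catches packages that are present but fail to import.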
@@ -643,19 +542,16 @@ def convert( # noqa: C901
             # artifacts_path = artifacts_path
         )

-        if asr_model == AsrModelType.WHISPER_TINY:
-            pipeline_options.asr_options = WHISPER_TINY
-        elif asr_model == AsrModelType.WHISPER_SMALL:
-            pipeline_options.asr_options = WHISPER_SMALL
-        elif asr_model == AsrModelType.WHISPER_MEDIUM:
-            pipeline_options.asr_options = WHISPER_MEDIUM
-        elif asr_model == AsrModelType.WHISPER_BASE:
-            pipeline_options.asr_options = WHISPER_BASE
-        elif asr_model == AsrModelType.WHISPER_LARGE:
-            pipeline_options.asr_options = WHISPER_LARGE
-        elif asr_model == AsrModelType.WHISPER_TURBO:
-            pipeline_options.asr_options = WHISPER_TURBO
-        else:
+        asr_model_map = {
+            AsrModelType.WHISPER_TINY: WHISPER_TINY,
+            AsrModelType.WHISPER_SMALL: WHISPER_SMALL,
+            AsrModelType.WHISPER_MEDIUM: WHISPER_MEDIUM,
+            AsrModelType.WHISPER_BASE: WHISPER_BASE,
+            AsrModelType.WHISPER_LARGE: WHISPER_LARGE,
+            AsrModelType.WHISPER_TURBO: WHISPER_TURBO,
+        }
+        pipeline_options.asr_options = asr_model_map.get(asr_model)
+        if not pipeline_options.asr_options:
+            _log.error(f"{asr_model} is not known")
+            raise ValueError(f"{asr_model} is not known")
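Aside: when a dict replaces an exhaustive if/elif over an enum, it can be worth asserting at import time that every member is mapped, so a newly added model fails fast instead of hitting the "is not known" error at call time. A sketch with a hypothetical enum:

from enum import Enum, auto

class AsrKind(Enum):
    TINY = auto()
    SMALL = auto()

_SPECS = {AsrKind.TINY: "whisper-tiny", AsrKind.SMALL: "whisper-small"}

missing = set(AsrKind) - set(_SPECS)
assert not missing, f"unmapped ASR models: {missing}"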
@@ -670,9 +566,8 @@ def convert( # noqa: C901
         InputFormat.AUDIO: audio_format_option,
     }

-    if artifacts_path is not None:
+    if artifacts_path:
         pipeline_options.artifacts_path = artifacts_path
-        # audio_pipeline_options.artifacts_path = artifacts_path

     doc_converter = DocumentConverter(
         allowed_formats=from_formats,

docling/datamodel/pipeline_options.py

@@ -14,31 +14,15 @@ from typing_extensions import deprecated
from docling.datamodel import asr_model_specs
# Import the following for backwards compatibility
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.pipeline_options_asr_model import (
    InlineAsrOptions,
)
from docling.datamodel.pipeline_options_vlm_model import (
    ApiVlmOptions,
    InferenceFramework,
    InlineVlmOptions,
    ResponseFormat,
)
from docling.datamodel.vlm_model_specs import (
    GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
    GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
    SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
    SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
    VlmModelType,
)
_log = logging.getLogger(__name__)
class BaseOptions(BaseModel):
    """Base class for options."""

    kind: ClassVar[str]
class TableFormerMode(str, Enum):
@@ -200,16 +184,7 @@ class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
        return self.repo_id.replace("/", "--")


# SmolVLM
smolvlm_picture_description = PictureDescriptionVlmOptions(
    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
)

# GraniteVision
granite_picture_description = PictureDescriptionVlmOptions(
    repo_id="ibm-granite/granite-vision-3.2-2b-preview",
    prompt="What is shown in this image?",
)
# Define an enum for the backend options
@@ -223,15 +198,7 @@ class PdfBackend(str, Enum):
# Define an enum for the ocr engines
@deprecated("Use ocr_factory.registered_enum")
class OcrEngine(str, Enum):
    """Enum of valid OCR engines."""

    EASYOCR = "easyocr"
    TESSERACT_CLI = "tesseract_cli"
    TESSERACT = "tesseract"
    OCRMAC = "ocrmac"
    RAPIDOCR = "rapidocr"
class PipelineOptions(BaseModel):
@@ -246,68 +213,10 @@ class PipelineOptions(BaseModel):
     allow_external_plugins: bool = False


-class PaginatedPipelineOptions(PipelineOptions):
+class VlmPipelineOptions(PipelineOptions):
     artifacts_path: Optional[Union[Path, str]] = None

     images_scale: float = 1.0
     generate_page_images: bool = False
     generate_picture_images: bool = False


-class VlmPipelineOptions(PaginatedPipelineOptions):
     generate_page_images: bool = True
     force_backend_text: bool = (
         False  # (To be used with vlms, or other generative models)
     )
     # If True, text from backend will be used instead of generated text
     vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
         smoldocling_vlm_conversion_options
     )


 class AsrPipelineOptions(PipelineOptions):
     asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
     artifacts_path: Optional[Union[Path, str]] = None


 class PdfPipelineOptions(PaginatedPipelineOptions):
     """Options for the PDF pipeline."""

     do_table_structure: bool = True  # True: perform table structure extraction
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
     do_code_enrichment: bool = False  # True: perform code OCR
     do_formula_enrichment: bool = False  # True: perform formula OCR, return Latex code
     do_picture_classification: bool = False  # True: classify pictures in documents
     do_picture_description: bool = False  # True: run describe pictures in documents
     force_backend_text: bool = (
         False  # (To be used with vlms, or other generative models)
     )
     # If True, text from backend will be used instead of generated text

     table_structure_options: TableStructureOptions = TableStructureOptions()
     ocr_options: OcrOptions = EasyOcrOptions()
     picture_description_options: PictureDescriptionBaseOptions = (
         smolvlm_picture_description
     )

     images_scale: float = 1.0
     generate_page_images: bool = False
     generate_picture_images: bool = False
     generate_table_images: bool = Field(
         default=False,
         deprecated=(
             "Field `generate_table_images` is deprecated. "
             "To obtain table images, set `PdfPipelineOptions.generate_page_images = True` "
             "before conversion and then use the `TableItem.get_image` function."
         ),
     )

     generate_parsed_pages: Literal[True] = (
         True  # Always True since parsed_page is now mandatory
     )


 class ProcessingPipeline(str, Enum):
     STANDARD = "standard"
     VLM = "vlm"
     ASR = "asr"

docling/datamodel/settings.py

@@ -6,23 +6,14 @@ from pydantic import BaseModel, PlainValidator
from pydantic_settings import BaseSettings, SettingsConfigDict
def _validate_page_range(v: Tuple[int, int]) -> Tuple[int, int]:
    if v[0] < 1 or v[1] < v[0]:
        raise ValueError(
            "Invalid page range: start must be ≥ 1 and end must be ≥ start."
        )
    return v


PageRange = Annotated[Tuple[int, int], PlainValidator(_validate_page_range)]

DEFAULT_PAGE_RANGE: PageRange = (1, sys.maxsize)


class DocumentLimits(BaseModel):
    max_num_pages: int = sys.maxsize
    max_file_size: int = sys.maxsize
    page_range: PageRange = DEFAULT_PAGE_RANGE
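Aside: PageRange wires the plain validation function into pydantic via Annotated + PlainValidator, so every model field typed PageRange gets the check for free. A runnable sketch of the same wiring, with simplified names:

from typing import Annotated, Tuple
from pydantic import BaseModel, PlainValidator, ValidationError

def check(v):
    if v[0] < 1 or v[1] < v[0]:
        raise ValueError("start must be >= 1 and end must be >= start")
    return v

Range = Annotated[Tuple[int, int], PlainValidator(check)]

class Limits(BaseModel):
    page_range: Range = (1, 10)

print(Limits(page_range=(2, 5)).page_range)  # (2, 5)
try:
    Limits(page_range=(0, 5))
except ValidationError as e:
    print("rejected:", e.errors()[0]["msg"])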
class BatchConcurrencySettings(BaseModel):
@@ -32,14 +23,7 @@ class BatchConcurrencySettings(BaseModel):
     page_batch_concurrency: int = 2
     elements_batch_size: int = 16

-    # doc_batch_size: int = 1
-    # doc_batch_concurrency: int = 1
-    # page_batch_size: int = 1
-    # page_batch_concurrency: int = 1
-    # model_concurrency: int = 2
-
-    # To force models into single core: export OMP_NUM_THREADS=1
class DebugSettings(BaseModel):

docling/document_converter.py

@@ -65,65 +65,8 @@ class FormatOption(BaseModel):
        return self


class CsvFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = CsvDocumentBackend


class ExcelFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend


class WordFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend


class PowerpointFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend


class MarkdownFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend


class AsciiDocFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = AsciiDocBackend


class HTMLFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend


class PatentUsptoFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend


class XMLJatsFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = JatsDocumentBackend


class ImageFormatOption(FormatOption):
    pipeline_cls: Type = StandardPdfPipeline
    backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend


class PdfFormatOption(FormatOption):
    pipeline_cls: Type = StandardPdfPipeline
    backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend


class AudioFormatOption(FormatOption):
    pipeline_cls: Type = AsrPipeline
    backend: Type[AbstractDocumentBackend] = NoOpBackend
def _get_default_option(format: InputFormat) -> FormatOption:
    format_to_default_options = {
@@ -167,12 +110,11 @@ def _get_default_option(format: InputFormat) -> FormatOption:
     }
     if (options := format_to_default_options.get(format)) is not None:
         return options
-    else:
-        raise RuntimeError(f"No default options configured for {format}")
+    raise RuntimeError(f"No default options configured for {format}")
 class DocumentConverter:
-    _default_download_filename = "file"
+    _default_filename = "file"

     def __init__(
         self,
@@ -194,10 +136,7 @@ class DocumentConverter:
             Tuple[Type[BasePipeline], str], BasePipeline
         ] = {}

-    def _get_initialized_pipelines(
-        self,
-    ) -> dict[tuple[Type[BasePipeline], str], BasePipeline]:
-        return self.initialized_pipelines

     def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
         """Generate a hash of pipeline options to use as part of the cache key."""
@@ -217,7 +156,7 @@ class DocumentConverter:
     @validate_call(config=ConfigDict(strict=True))
     def convert(
         self,
-        source: Union[Path, str, DocumentStream],  # TODO review naming
+        documents: Union[Path, str, DocumentStream],  # TODO review naming
         headers: Optional[Dict[str, str]] = None,
         raises_on_error: bool = True,
         max_num_pages: int = sys.maxsize,
@@ -225,7 +164,7 @@ class DocumentConverter:
         page_range: PageRange = DEFAULT_PAGE_RANGE,
     ) -> ConversionResult:
         all_res = self.convert_all(
-            source=[source],
+            documents=[documents],
             raises_on_error=raises_on_error,
             max_num_pages=max_num_pages,
             max_file_size=max_file_size,
@@ -237,7 +176,7 @@ class DocumentConverter:
     @validate_call(config=ConfigDict(strict=True))
     def convert_all(
         self,
-        source: Iterable[Union[Path, str, DocumentStream]],  # TODO review naming
+        documents: Iterable[Union[Path, str, DocumentStream]],  # TODO review naming
         headers: Optional[Dict[str, str]] = None,
         raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
         max_num_pages: int = sys.maxsize,
@@ -249,28 +188,10 @@ class DocumentConverter:
             max_file_size=max_file_size,
             page_range=page_range,
         )
-        conv_input = _DocumentConversionInput(
-            path_or_stream_iterator=source, limits=limits, headers=headers
-        )
-        conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
-
-        had_result = False
-        for conv_res in conv_res_iter:
-            had_result = True
-            if raises_on_error and conv_res.status not in {
-                ConversionStatus.SUCCESS,
-                ConversionStatus.PARTIAL_SUCCESS,
-            }:
-                raise ConversionError(
-                    f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
-                )
-            else:
-                yield conv_res
-
-        if not had_result and raises_on_error:
-            raise ConversionError(
-                "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
-            )
+        """Converts a batch of documents.
+
+        Note: PDF backends are not thread-safe, so thread pool usage is disabled.
+        """
     def _convert(
         self, conv_input: _DocumentConversionInput, raises_on_error: bool
@@ -380,5 +301,6 @@ class DocumentConverter:
                 status=ConversionStatus.FAILURE,
             )
             # TODO add error log why it failed.
+            _log.error(f"Input document {in_doc.file} is not valid.")
         return conv_res
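For context, a minimal end-to-end use of the converter and format options touched in this diff, following docling's documented API (the input file name is hypothetical):

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions(do_ocr=False)
converter = DocumentConverter(
    allowed_formats=[InputFormat.PDF],
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)},
)
result = converter.convert("report.pdf")  # hypothetical input path
print(result.document.export_to_markdown())

Since this diff renames convert's source parameter to documents, callers that pass it by keyword would need to change accordingly.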