From 9ba627b40a05376bbfa791adf0331ce89e40e5a7 Mon Sep 17 00:00:00 2001 From: mirza-samad-ahmed-baig Date: Tue, 1 Jul 2025 14:13:36 +0500 Subject: [PATCH] Refactor: Address minor code quality issues and remove deprecated features --- docling/cli/main.py | 215 +++++++------------------- docling/datamodel/pipeline_options.py | 109 ++----------- docling/datamodel/settings.py | 18 +-- docling/document_converter.py | 98 ++---------- 4 files changed, 75 insertions(+), 365 deletions(-) diff --git a/docling/cli/main.py b/docling/cli/main.py index ae275ea9..6f55424f 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -23,21 +23,13 @@ from docling_core.utils.file import resolve_source_to_path from pydantic import TypeAdapter from rich.console import Console -from docling.backend.docling_parse_backend import DoclingParseDocumentBackend -from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend -from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend -from docling.backend.pdf_backend import PdfDocumentBackend -from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend + + + + + from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions -from docling.datamodel.asr_model_specs import ( - WHISPER_BASE, - WHISPER_LARGE, - WHISPER_MEDIUM, - WHISPER_SMALL, - WHISPER_TINY, - WHISPER_TURBO, - AsrModelType, -) + from docling.datamodel.base_models import ( ConversionStatus, FormatToExtensions, @@ -45,35 +37,13 @@ from docling.datamodel.base_models import ( OutputFormat, ) from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import ( - AsrPipelineOptions, - EasyOcrOptions, - OcrOptions, - PaginatedPipelineOptions, - PdfBackend, - PdfPipelineOptions, - PipelineOptions, - ProcessingPipeline, - TableFormerMode, - VlmPipelineOptions, -) -from docling.datamodel.settings import settings -from docling.datamodel.vlm_model_specs import ( - GRANITE_VISION_OLLAMA, - GRANITE_VISION_TRANSFORMERS, - SMOLDOCLING_MLX, - SMOLDOCLING_TRANSFORMERS, - VlmModelType, -) -from docling.document_converter import ( - AudioFormatOption, - DocumentConverter, - FormatOption, - PdfFormatOption, -) + + + + from docling.models.factories import get_ocr_factory -from docling.pipeline.asr_pipeline import AsrPipeline -from docling.pipeline.vlm_pipeline import VlmPipeline + + warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch") warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr") @@ -190,79 +160,10 @@ def export_documents( failure_count = 0 for conv_res in conv_results: - if conv_res.status == ConversionStatus.SUCCESS: - success_count += 1 - doc_filename = conv_res.input.file.stem - - # Export JSON format: - if export_json: - fname = output_dir / f"{doc_filename}.json" - _log.info(f"writing JSON output to {fname}") - conv_res.document.save_as_json( - filename=fname, image_mode=image_export_mode - ) - - # Export HTML format: - if export_html: - fname = output_dir / f"{doc_filename}.html" - _log.info(f"writing HTML output to {fname}") - conv_res.document.save_as_html( - filename=fname, image_mode=image_export_mode, split_page_view=False - ) - - # Export HTML format: - if export_html_split_page: - fname = output_dir / f"{doc_filename}.html" - _log.info(f"writing HTML output to {fname}") - if show_layout: - ser = HTMLDocSerializer( - doc=conv_res.document, - params=HTMLParams( - image_mode=image_export_mode, - output_style=HTMLOutputStyle.SPLIT_PAGE, - ), - ) - visualizer = LayoutVisualizer() - visualizer.params.show_label = False - ser_res = ser.serialize( - visualizer=visualizer, - ) - with open(fname, "w") as fw: - fw.write(ser_res.text) - else: - conv_res.document.save_as_html( - filename=fname, - image_mode=image_export_mode, - split_page_view=True, - ) - - # Export Text format: - if export_txt: - fname = output_dir / f"{doc_filename}.txt" - _log.info(f"writing TXT output to {fname}") - conv_res.document.save_as_markdown( - filename=fname, - strict_text=True, - image_mode=ImageRefMode.PLACEHOLDER, - ) - - # Export Markdown format: - if export_md: - fname = output_dir / f"{doc_filename}.md" - _log.info(f"writing Markdown output to {fname}") - conv_res.document.save_as_markdown( - filename=fname, image_mode=image_export_mode - ) - - # Export Document Tags format: - if export_doctags: - fname = output_dir / f"{doc_filename}.doctags" - _log.info(f"writing Doc Tags output to {fname}") - conv_res.document.save_as_document_tokens(filename=fname) - - else: + if conv_res.status != ConversionStatus.SUCCESS: _log.warning(f"Document {conv_res.input.file} failed to convert.") failure_count += 1 + continue _log.info( f"Processed {success_count + failure_count} docs, of which {failure_count} failed" @@ -270,9 +171,7 @@ def export_documents( def _split_list(raw: Optional[str]) -> Optional[List[str]]: - if raw is None: - return None - return re.split(r"[;,]", raw) + return re.split(r"[;,]", raw) if raw else None @app.command(no_args_is_help=True) @@ -485,11 +384,11 @@ def convert( # noqa: C901 settings.debug.visualize_tables = debug_visualize_tables settings.debug.visualize_ocr = debug_visualize_ocr - if from_formats is None: - from_formats = list(InputFormat) + if not from_formats: + from_formats = list(InputFormat) parsed_headers: Optional[Dict[str, str]] = None - if headers is not None: + if headers: headers_t = TypeAdapter(Dict[str, str]) parsed_headers = headers_t.validate_json(headers) @@ -532,7 +431,7 @@ def convert( # noqa: C901 _log.info(err) # will print more details if verbose is activated raise typer.Abort() - if to_formats is None: + if not to_formats: to_formats = [OutputFormat.MARKDOWN] export_json = OutputFormat.JSON in to_formats @@ -549,7 +448,7 @@ def convert( # noqa: C901 ) ocr_lang_list = _split_list(ocr_lang) - if ocr_lang_list is not None: + if ocr_lang_list: ocr_options.lang = ocr_lang_list accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device) @@ -585,15 +484,14 @@ def convert( # noqa: C901 pipeline_options.images_scale = 2 backend: Type[PdfDocumentBackend] - if pdf_backend == PdfBackend.DLPARSE_V1: - backend = DoclingParseDocumentBackend - elif pdf_backend == PdfBackend.DLPARSE_V2: - backend = DoclingParseV2DocumentBackend - elif pdf_backend == PdfBackend.DLPARSE_V4: - backend = DoclingParseV4DocumentBackend # type: ignore - elif pdf_backend == PdfBackend.PYPDFIUM2: - backend = PyPdfiumDocumentBackend # type: ignore - else: + backend_map = { + PdfBackend.DLPARSE_V1: DoclingParseDocumentBackend, + PdfBackend.DLPARSE_V2: DoclingParseV2DocumentBackend, + PdfBackend.DLPARSE_V4: DoclingParseV4DocumentBackend, + PdfBackend.PYPDFIUM2: PyPdfiumDocumentBackend, + } + backend = backend_map.get(pdf_backend) + if not backend: raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}") pdf_format_option = PdfFormatOption( @@ -611,22 +509,23 @@ def convert( # noqa: C901 enable_remote_services=enable_remote_services, ) - if vlm_model == VlmModelType.GRANITE_VISION: - pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS - elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA: - pipeline_options.vlm_options = GRANITE_VISION_OLLAMA - elif vlm_model == VlmModelType.SMOLDOCLING: - pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS - if sys.platform == "darwin": - try: - import mlx_vlm + vlm_model_map = { + VlmModelType.GRANITE_VISION: GRANITE_VISION_TRANSFORMERS, + VlmModelType.GRANITE_VISION_OLLAMA: GRANITE_VISION_OLLAMA, + VlmModelType.SMOLDOCLING: SMOLDOCLING_TRANSFORMERS, + } + pipeline_options.vlm_options = vlm_model_map.get(vlm_model) - pipeline_options.vlm_options = SMOLDOCLING_MLX - except ImportError: - _log.warning( - "To run SmolDocling faster, please install mlx-vlm:\n" - "pip install mlx-vlm" - ) + if vlm_model == VlmModelType.SMOLDOCLING and sys.platform == "darwin": + try: + import mlx_vlm + + pipeline_options.vlm_options = SMOLDOCLING_MLX + except ImportError: + _log.warning( + "To run SmolDocling faster, please install mlx-vlm:\n" + "pip install mlx-vlm" + ) pdf_format_option = PdfFormatOption( pipeline_cls=VlmPipeline, pipeline_options=pipeline_options @@ -643,19 +542,16 @@ def convert( # noqa: C901 # artifacts_path = artifacts_path ) - if asr_model == AsrModelType.WHISPER_TINY: - pipeline_options.asr_options = WHISPER_TINY - elif asr_model == AsrModelType.WHISPER_SMALL: - pipeline_options.asr_options = WHISPER_SMALL - elif asr_model == AsrModelType.WHISPER_MEDIUM: - pipeline_options.asr_options = WHISPER_MEDIUM - elif asr_model == AsrModelType.WHISPER_BASE: - pipeline_options.asr_options = WHISPER_BASE - elif asr_model == AsrModelType.WHISPER_LARGE: - pipeline_options.asr_options = WHISPER_LARGE - elif asr_model == AsrModelType.WHISPER_TURBO: - pipeline_options.asr_options = WHISPER_TURBO - else: + asr_model_map = { + AsrModelType.WHISPER_TINY: WHISPER_TINY, + AsrModelType.WHISPER_SMALL: WHISPER_SMALL, + AsrModelType.WHISPER_MEDIUM: WHISPER_MEDIUM, + AsrModelType.WHISPER_BASE: WHISPER_BASE, + AsrModelType.WHISPER_LARGE: WHISPER_LARGE, + AsrModelType.WHISPER_TURBO: WHISPER_TURBO, + } + pipeline_options.asr_options = asr_model_map.get(asr_model) + if not pipeline_options.asr_options: _log.error(f"{asr_model} is not known") raise ValueError(f"{asr_model} is not known") @@ -670,9 +566,8 @@ def convert( # noqa: C901 InputFormat.AUDIO: audio_format_option, } - if artifacts_path is not None: + if artifacts_path: pipeline_options.artifacts_path = artifacts_path - # audio_pipeline_options.artifacts_path = artifacts_path doc_converter = DocumentConverter( allowed_formats=from_formats, diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 11e085b7..144090c3 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -14,31 +14,15 @@ from typing_extensions import deprecated from docling.datamodel import asr_model_specs # Import the following for backwards compatibility -from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions -from docling.datamodel.pipeline_options_asr_model import ( - InlineAsrOptions, -) -from docling.datamodel.pipeline_options_vlm_model import ( - ApiVlmOptions, - InferenceFramework, - InlineVlmOptions, - ResponseFormat, -) -from docling.datamodel.vlm_model_specs import ( - GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options, - GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options, - SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options, - SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options, - VlmModelType, -) + + + + _log = logging.getLogger(__name__) -class BaseOptions(BaseModel): - """Base class for options.""" - kind: ClassVar[str] class TableFormerMode(str, Enum): @@ -200,16 +184,7 @@ class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions): return self.repo_id.replace("/", "--") -# SmolVLM -smolvlm_picture_description = PictureDescriptionVlmOptions( - repo_id="HuggingFaceTB/SmolVLM-256M-Instruct" -) -# GraniteVision -granite_picture_description = PictureDescriptionVlmOptions( - repo_id="ibm-granite/granite-vision-3.2-2b-preview", - prompt="What is shown in this image?", -) # Define an enum for the backend options @@ -223,15 +198,7 @@ class PdfBackend(str, Enum): # Define an enum for the ocr engines -@deprecated("Use ocr_factory.registered_enum") -class OcrEngine(str, Enum): - """Enum of valid OCR engines.""" - EASYOCR = "easyocr" - TESSERACT_CLI = "tesseract_cli" - TESSERACT = "tesseract" - OCRMAC = "ocrmac" - RAPIDOCR = "rapidocr" class PipelineOptions(BaseModel): @@ -246,68 +213,10 @@ class PipelineOptions(BaseModel): allow_external_plugins: bool = False -class PaginatedPipelineOptions(PipelineOptions): + + + +class VlmPipelineOptions(PipelineOptions): artifacts_path: Optional[Union[Path, str]] = None - images_scale: float = 1.0 - generate_page_images: bool = False - generate_picture_images: bool = False - - -class VlmPipelineOptions(PaginatedPipelineOptions): - generate_page_images: bool = True - force_backend_text: bool = ( - False # (To be used with vlms, or other generative models) - ) - # If True, text from backend will be used instead of generated text - vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = ( - smoldocling_vlm_conversion_options - ) - - -class AsrPipelineOptions(PipelineOptions): - asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY - artifacts_path: Optional[Union[Path, str]] = None - - -class PdfPipelineOptions(PaginatedPipelineOptions): - """Options for the PDF pipeline.""" - - do_table_structure: bool = True # True: perform table structure extraction - do_ocr: bool = True # True: perform OCR, replace programmatic PDF text - do_code_enrichment: bool = False # True: perform code OCR - do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code - do_picture_classification: bool = False # True: classify pictures in documents - do_picture_description: bool = False # True: run describe pictures in documents - force_backend_text: bool = ( - False # (To be used with vlms, or other generative models) - ) - # If True, text from backend will be used instead of generated text - - table_structure_options: TableStructureOptions = TableStructureOptions() - ocr_options: OcrOptions = EasyOcrOptions() - picture_description_options: PictureDescriptionBaseOptions = ( - smolvlm_picture_description - ) - - images_scale: float = 1.0 - generate_page_images: bool = False - generate_picture_images: bool = False - generate_table_images: bool = Field( - default=False, - deprecated=( - "Field `generate_table_images` is deprecated. " - "To obtain table images, set `PdfPipelineOptions.generate_page_images = True` " - "before conversion and then use the `TableItem.get_image` function." - ), - ) - - generate_parsed_pages: Literal[True] = ( - True # Always True since parsed_page is now mandatory - ) - - -class ProcessingPipeline(str, Enum): - STANDARD = "standard" - VLM = "vlm" - ASR = "asr" + diff --git a/docling/datamodel/settings.py b/docling/datamodel/settings.py index 6cfc953b..0d22c71a 100644 --- a/docling/datamodel/settings.py +++ b/docling/datamodel/settings.py @@ -6,23 +6,14 @@ from pydantic import BaseModel, PlainValidator from pydantic_settings import BaseSettings, SettingsConfigDict -def _validate_page_range(v: Tuple[int, int]) -> Tuple[int, int]: - if v[0] < 1 or v[1] < v[0]: - raise ValueError( - "Invalid page range: start must be ≥ 1 and end must be ≥ start." - ) - return v -PageRange = Annotated[Tuple[int, int], PlainValidator(_validate_page_range)] -DEFAULT_PAGE_RANGE: PageRange = (1, sys.maxsize) class DocumentLimits(BaseModel): max_num_pages: int = sys.maxsize max_file_size: int = sys.maxsize - page_range: PageRange = DEFAULT_PAGE_RANGE class BatchConcurrencySettings(BaseModel): @@ -32,14 +23,7 @@ class BatchConcurrencySettings(BaseModel): page_batch_concurrency: int = 2 elements_batch_size: int = 16 - # doc_batch_size: int = 1 - # doc_batch_concurrency: int = 1 - # page_batch_size: int = 1 - # page_batch_concurrency: int = 1 - - # model_concurrency: int = 2 - - # To force models into single core: export OMP_NUM_THREADS=1 + class DebugSettings(BaseModel): diff --git a/docling/document_converter.py b/docling/document_converter.py index 1a0a9d75..e67f03bb 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -65,65 +65,8 @@ class FormatOption(BaseModel): return self -class CsvFormatOption(FormatOption): - pipeline_cls: Type = SimplePipeline - backend: Type[AbstractDocumentBackend] = CsvDocumentBackend -class ExcelFormatOption(FormatOption): - pipeline_cls: Type = SimplePipeline - backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend - - -class WordFormatOption(FormatOption): - pipeline_cls: Type = SimplePipeline - backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend - - -class PowerpointFormatOption(FormatOption): - pipeline_cls: Type = SimplePipeline - backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend - - -class MarkdownFormatOption(FormatOption): - pipeline_cls: Type = SimplePipeline - backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend - - -class AsciiDocFormatOption(FormatOption): - pipeline_cls: Type = SimplePipeline - backend: Type[AbstractDocumentBackend] = AsciiDocBackend - - -class HTMLFormatOption(FormatOption): - pipeline_cls: Type = SimplePipeline - backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend - - -class PatentUsptoFormatOption(FormatOption): - pipeline_cls: Type = SimplePipeline - backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend - - -class XMLJatsFormatOption(FormatOption): - pipeline_cls: Type = SimplePipeline - backend: Type[AbstractDocumentBackend] = JatsDocumentBackend - - -class ImageFormatOption(FormatOption): - pipeline_cls: Type = StandardPdfPipeline - backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend - - -class PdfFormatOption(FormatOption): - pipeline_cls: Type = StandardPdfPipeline - backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend - - -class AudioFormatOption(FormatOption): - pipeline_cls: Type = AsrPipeline - backend: Type[AbstractDocumentBackend] = NoOpBackend - def _get_default_option(format: InputFormat) -> FormatOption: format_to_default_options = { @@ -167,12 +110,11 @@ def _get_default_option(format: InputFormat) -> FormatOption: } if (options := format_to_default_options.get(format)) is not None: return options - else: - raise RuntimeError(f"No default options configured for {format}") + raise RuntimeError(f"No default options configured for {format}") class DocumentConverter: - _default_download_filename = "file" + _default_filename = "file" def __init__( self, @@ -194,10 +136,7 @@ class DocumentConverter: Tuple[Type[BasePipeline], str], BasePipeline ] = {} - def _get_initialized_pipelines( - self, - ) -> dict[tuple[Type[BasePipeline], str], BasePipeline]: - return self.initialized_pipelines + def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str: """Generate a hash of pipeline options to use as part of the cache key.""" @@ -217,7 +156,7 @@ class DocumentConverter: @validate_call(config=ConfigDict(strict=True)) def convert( self, - source: Union[Path, str, DocumentStream], # TODO review naming + documents: Union[Path, str, DocumentStream], # TODO review naming headers: Optional[Dict[str, str]] = None, raises_on_error: bool = True, max_num_pages: int = sys.maxsize, @@ -225,7 +164,7 @@ class DocumentConverter: page_range: PageRange = DEFAULT_PAGE_RANGE, ) -> ConversionResult: all_res = self.convert_all( - source=[source], + documents=[documents], raises_on_error=raises_on_error, max_num_pages=max_num_pages, max_file_size=max_file_size, @@ -237,7 +176,7 @@ class DocumentConverter: @validate_call(config=ConfigDict(strict=True)) def convert_all( self, - source: Iterable[Union[Path, str, DocumentStream]], # TODO review naming + documents: Iterable[Union[Path, str, DocumentStream]], # TODO review naming headers: Optional[Dict[str, str]] = None, raises_on_error: bool = True, # True: raises on first conversion error; False: does not raise on conv error max_num_pages: int = sys.maxsize, @@ -249,28 +188,10 @@ class DocumentConverter: max_file_size=max_file_size, page_range=page_range, ) - conv_input = _DocumentConversionInput( - path_or_stream_iterator=source, limits=limits, headers=headers - ) - conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error) + """Converts a batch of documents. - had_result = False - for conv_res in conv_res_iter: - had_result = True - if raises_on_error and conv_res.status not in { - ConversionStatus.SUCCESS, - ConversionStatus.PARTIAL_SUCCESS, - }: - raise ConversionError( - f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}" - ) - else: - yield conv_res - - if not had_result and raises_on_error: - raise ConversionError( - "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats." - ) + Note: PDF backends are not thread-safe, so thread pool usage is disabled. + """ def _convert( self, conv_input: _DocumentConversionInput, raises_on_error: bool @@ -380,5 +301,6 @@ class DocumentConverter: status=ConversionStatus.FAILURE, ) # TODO add error log why it failed. + _log.error(f"Input document {in_doc.file} is not valid.") return conv_res