Merge pull request #1 from Mirza-Samad-Ahmed-Baig/main

Refactor: Address minor code quality issues
MirzaSamad20 2025-07-01 14:56:59 +05:00 committed by GitHub
commit dc182a1e0c
4 changed files with 75 additions and 365 deletions

docling/cli/main.py

@@ -23,21 +23,13 @@ from docling_core.utils.file import resolve_source_to_path
from pydantic import TypeAdapter
from rich.console import Console
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.asr_model_specs import (
    WHISPER_BASE,
    WHISPER_LARGE,
    WHISPER_MEDIUM,
    WHISPER_SMALL,
    WHISPER_TINY,
    WHISPER_TURBO,
    AsrModelType,
)
from docling.datamodel.base_models import (
    ConversionStatus,
    FormatToExtensions,
@@ -45,35 +37,13 @@ from docling.datamodel.base_models import (
    OutputFormat,
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
    AsrPipelineOptions,
    EasyOcrOptions,
    OcrOptions,
    PaginatedPipelineOptions,
    PdfBackend,
    PdfPipelineOptions,
    PipelineOptions,
    ProcessingPipeline,
    TableFormerMode,
    VlmPipelineOptions,
)
from docling.datamodel.settings import settings
from docling.datamodel.vlm_model_specs import (
    GRANITE_VISION_OLLAMA,
    GRANITE_VISION_TRANSFORMERS,
    SMOLDOCLING_MLX,
    SMOLDOCLING_TRANSFORMERS,
    VlmModelType,
)
from docling.document_converter import (
    AudioFormatOption,
    DocumentConverter,
    FormatOption,
    PdfFormatOption,
)
from docling.models.factories import get_ocr_factory
from docling.pipeline.asr_pipeline import AsrPipeline
from docling.pipeline.vlm_pipeline import VlmPipeline
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
@@ -190,79 +160,10 @@ def export_documents(
     failure_count = 0

     for conv_res in conv_results:
-        if conv_res.status == ConversionStatus.SUCCESS:
-            success_count += 1
-            doc_filename = conv_res.input.file.stem
-
-            # Export JSON format:
-            if export_json:
-                fname = output_dir / f"{doc_filename}.json"
-                _log.info(f"writing JSON output to {fname}")
-                conv_res.document.save_as_json(
-                    filename=fname, image_mode=image_export_mode
-                )
-
-            # Export HTML format:
-            if export_html:
-                fname = output_dir / f"{doc_filename}.html"
-                _log.info(f"writing HTML output to {fname}")
-                conv_res.document.save_as_html(
-                    filename=fname, image_mode=image_export_mode, split_page_view=False
-                )
-
-            # Export HTML format:
-            if export_html_split_page:
-                fname = output_dir / f"{doc_filename}.html"
-                _log.info(f"writing HTML output to {fname}")
-                if show_layout:
-                    ser = HTMLDocSerializer(
-                        doc=conv_res.document,
-                        params=HTMLParams(
-                            image_mode=image_export_mode,
-                            output_style=HTMLOutputStyle.SPLIT_PAGE,
-                        ),
-                    )
-                    visualizer = LayoutVisualizer()
-                    visualizer.params.show_label = False
-                    ser_res = ser.serialize(
-                        visualizer=visualizer,
-                    )
-                    with open(fname, "w") as fw:
-                        fw.write(ser_res.text)
-                else:
-                    conv_res.document.save_as_html(
-                        filename=fname,
-                        image_mode=image_export_mode,
-                        split_page_view=True,
-                    )
-
-            # Export Text format:
-            if export_txt:
-                fname = output_dir / f"{doc_filename}.txt"
-                _log.info(f"writing TXT output to {fname}")
-                conv_res.document.save_as_markdown(
-                    filename=fname,
-                    strict_text=True,
-                    image_mode=ImageRefMode.PLACEHOLDER,
-                )
-
-            # Export Markdown format:
-            if export_md:
-                fname = output_dir / f"{doc_filename}.md"
-                _log.info(f"writing Markdown output to {fname}")
-                conv_res.document.save_as_markdown(
-                    filename=fname, image_mode=image_export_mode
-                )
-
-            # Export Document Tags format:
-            if export_doctags:
-                fname = output_dir / f"{doc_filename}.doctags"
-                _log.info(f"writing Doc Tags output to {fname}")
-                conv_res.document.save_as_document_tokens(filename=fname)
-        else:
+        if conv_res.status != ConversionStatus.SUCCESS:
+            _log.warning(f"Document {conv_res.input.file} failed to convert.")
+            failure_count += 1
+            continue

     _log.info(
         f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
@@ -270,9 +171,7 @@ def export_documents(
 def _split_list(raw: Optional[str]) -> Optional[List[str]]:
     if raw is None:
         return None
-    return re.split(r"[;,]", raw)
+    return re.split(r"[;,]", raw) if raw else None
@app.command(no_args_is_help=True)
@@ -485,11 +384,11 @@ def convert( # noqa: C901
     settings.debug.visualize_tables = debug_visualize_tables
     settings.debug.visualize_ocr = debug_visualize_ocr

-    if from_formats is None:
-        from_formats = list(InputFormat)
+    if not from_formats:
+        from_formats = list(InputFormat)

     parsed_headers: Optional[Dict[str, str]] = None
-    if headers is not None:
+    if headers:
         headers_t = TypeAdapter(Dict[str, str])
         parsed_headers = headers_t.validate_json(headers)
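Aside: several hunks in this file replace an "is not None" test with a plain truthiness test. The two are not equivalent for values that are present but falsy, which is easy to confirm:

for value in (None, "", '{"User-Agent": "docling"}'):
    print(repr(value), "is not None:", value is not None, "truthy:", bool(value))
# None                        is not None: False  truthy: False
# ''                          is not None: True   truthy: False  <- behavior change
# '{"User-Agent": "docling"}' is not None: True   truthy: True

For example, an empty --headers string is now silently skipped instead of being parsed (and failing JSON validation).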
@@ -532,7 +431,7 @@ def convert( # noqa: C901
         _log.info(err)  # will print more details if verbose is activated
         raise typer.Abort()

-    if to_formats is None:
+    if not to_formats:
         to_formats = [OutputFormat.MARKDOWN]

     export_json = OutputFormat.JSON in to_formats
@@ -549,7 +448,7 @@ def convert( # noqa: C901
     )

     ocr_lang_list = _split_list(ocr_lang)
-    if ocr_lang_list is not None:
+    if ocr_lang_list:
         ocr_options.lang = ocr_lang_list

     accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
@@ -585,15 +484,14 @@ def convert( # noqa: C901
             pipeline_options.images_scale = 2

         backend: Type[PdfDocumentBackend]
-        if pdf_backend == PdfBackend.DLPARSE_V1:
-            backend = DoclingParseDocumentBackend
-        elif pdf_backend == PdfBackend.DLPARSE_V2:
-            backend = DoclingParseV2DocumentBackend
-        elif pdf_backend == PdfBackend.DLPARSE_V4:
-            backend = DoclingParseV4DocumentBackend  # type: ignore
-        elif pdf_backend == PdfBackend.PYPDFIUM2:
-            backend = PyPdfiumDocumentBackend  # type: ignore
-        else:
+        backend_map = {
+            PdfBackend.DLPARSE_V1: DoclingParseDocumentBackend,
+            PdfBackend.DLPARSE_V2: DoclingParseV2DocumentBackend,
+            PdfBackend.DLPARSE_V4: DoclingParseV4DocumentBackend,
+            PdfBackend.PYPDFIUM2: PyPdfiumDocumentBackend,
+        }
+        backend = backend_map.get(pdf_backend)
+        if not backend:
             raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")

         pdf_format_option = PdfFormatOption(
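Aside: the mapping-based dispatch above is the standard replacement for an if/elif chain over an enum. A self-contained sketch of the pattern, with hypothetical backend classes standing in for the docling ones:

from enum import Enum

class Kind(str, Enum):
    V1 = "v1"
    V2 = "v2"

class BackendV1: ...
class BackendV2: ...

_BACKENDS = {Kind.V1: BackendV1, Kind.V2: BackendV2}

def pick_backend(kind: Kind) -> type:
    backend = _BACKENDS.get(kind)
    if backend is None:  # classes are always truthy, but "is None" states the intent
        raise RuntimeError(f"Unexpected PDF backend type {kind}")
    return backend

print(pick_backend(Kind.V2).__name__)  # BackendV2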
@@ -611,22 +509,23 @@ def convert( # noqa: C901
             enable_remote_services=enable_remote_services,
         )

-        if vlm_model == VlmModelType.GRANITE_VISION:
-            pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
-        elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
-            pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
-        elif vlm_model == VlmModelType.SMOLDOCLING:
-            pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
-            if sys.platform == "darwin":
-                try:
-                    import mlx_vlm
-
-                    pipeline_options.vlm_options = SMOLDOCLING_MLX
-                except ImportError:
-                    _log.warning(
-                        "To run SmolDocling faster, please install mlx-vlm:\n"
-                        "pip install mlx-vlm"
-                    )
+        vlm_model_map = {
+            VlmModelType.GRANITE_VISION: GRANITE_VISION_TRANSFORMERS,
+            VlmModelType.GRANITE_VISION_OLLAMA: GRANITE_VISION_OLLAMA,
+            VlmModelType.SMOLDOCLING: SMOLDOCLING_TRANSFORMERS,
+        }
+        pipeline_options.vlm_options = vlm_model_map.get(vlm_model)
+
+        if vlm_model == VlmModelType.SMOLDOCLING and sys.platform == "darwin":
+            try:
+                import mlx_vlm
+
+                pipeline_options.vlm_options = SMOLDOCLING_MLX
+            except ImportError:
+                _log.warning(
+                    "To run SmolDocling faster, please install mlx-vlm:\n"
+                    "pip install mlx-vlm"
+                )

         pdf_format_option = PdfFormatOption(
             pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
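Aside: the darwin branch is an optional-dependency upgrade: try the accelerated import and fall back to the portable default. An equivalent availability probe that avoids importing the module, sketched with placeholder spec names:

import importlib.util
import sys

vlm_spec = "SMOLDOCLING_TRANSFORMERS"  # placeholder for the default spec object
if sys.platform == "darwin" and importlib.util.find_spec("mlx_vlm") is not None:
    vlm_spec = "SMOLDOCLING_MLX"  # placeholder for the MLX-accelerated spec
print(vlm_spec)

Note that find_spec only checks that the package is installed; the try/except in the code above also catches packages that are present but fail to import.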
@@ -643,19 +542,16 @@ def convert( # noqa: C901
             # artifacts_path = artifacts_path
         )

-        if asr_model == AsrModelType.WHISPER_TINY:
-            pipeline_options.asr_options = WHISPER_TINY
-        elif asr_model == AsrModelType.WHISPER_SMALL:
-            pipeline_options.asr_options = WHISPER_SMALL
-        elif asr_model == AsrModelType.WHISPER_MEDIUM:
-            pipeline_options.asr_options = WHISPER_MEDIUM
-        elif asr_model == AsrModelType.WHISPER_BASE:
-            pipeline_options.asr_options = WHISPER_BASE
-        elif asr_model == AsrModelType.WHISPER_LARGE:
-            pipeline_options.asr_options = WHISPER_LARGE
-        elif asr_model == AsrModelType.WHISPER_TURBO:
-            pipeline_options.asr_options = WHISPER_TURBO
-        else:
+        asr_model_map = {
+            AsrModelType.WHISPER_TINY: WHISPER_TINY,
+            AsrModelType.WHISPER_SMALL: WHISPER_SMALL,
+            AsrModelType.WHISPER_MEDIUM: WHISPER_MEDIUM,
+            AsrModelType.WHISPER_BASE: WHISPER_BASE,
+            AsrModelType.WHISPER_LARGE: WHISPER_LARGE,
+            AsrModelType.WHISPER_TURBO: WHISPER_TURBO,
+        }
+        pipeline_options.asr_options = asr_model_map.get(asr_model)
+        if not pipeline_options.asr_options:
+            _log.error(f"{asr_model} is not known")
+            raise ValueError(f"{asr_model} is not known")
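Aside: when a dict replaces an exhaustive if/elif over an enum, it can be worth asserting at import time that every member is mapped, so a newly added model fails fast instead of hitting the "is not known" error at call time. A sketch with a hypothetical enum:

from enum import Enum, auto

class AsrKind(Enum):
    TINY = auto()
    SMALL = auto()

_SPECS = {AsrKind.TINY: "whisper-tiny", AsrKind.SMALL: "whisper-small"}

missing = set(AsrKind) - set(_SPECS)
assert not missing, f"unmapped ASR models: {missing}"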
@@ -670,9 +566,8 @@ def convert( # noqa: C901
         InputFormat.AUDIO: audio_format_option,
     }

-    if artifacts_path is not None:
+    if artifacts_path:
         pipeline_options.artifacts_path = artifacts_path
-        # audio_pipeline_options.artifacts_path = artifacts_path

     doc_converter = DocumentConverter(
         allowed_formats=from_formats,

docling/datamodel/pipeline_options.py

@@ -14,31 +14,15 @@ from typing_extensions import deprecated
from docling.datamodel import asr_model_specs
# Import the following for backwards compatibility
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.pipeline_options_asr_model import (
    InlineAsrOptions,
)
from docling.datamodel.pipeline_options_vlm_model import (
    ApiVlmOptions,
    InferenceFramework,
    InlineVlmOptions,
    ResponseFormat,
)
from docling.datamodel.vlm_model_specs import (
    GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
    GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
    SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
    SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
    VlmModelType,
)
_log = logging.getLogger(__name__)
class BaseOptions(BaseModel):
    """Base class for options."""

    kind: ClassVar[str]
class TableFormerMode(str, Enum):
@@ -200,16 +184,7 @@ class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
        return self.repo_id.replace("/", "--")


# SmolVLM
smolvlm_picture_description = PictureDescriptionVlmOptions(
    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
)

# GraniteVision
granite_picture_description = PictureDescriptionVlmOptions(
    repo_id="ibm-granite/granite-vision-3.2-2b-preview",
    prompt="What is shown in this image?",
)
# Define an enum for the backend options
@@ -223,15 +198,7 @@ class PdfBackend(str, Enum):
# Define an enum for the ocr engines
@deprecated("Use ocr_factory.registered_enum")
class OcrEngine(str, Enum):
    """Enum of valid OCR engines."""

    EASYOCR = "easyocr"
    TESSERACT_CLI = "tesseract_cli"
    TESSERACT = "tesseract"
    OCRMAC = "ocrmac"
    RAPIDOCR = "rapidocr"
class PipelineOptions(BaseModel):
@@ -246,68 +213,10 @@ class PipelineOptions(BaseModel):
     allow_external_plugins: bool = False


-class PaginatedPipelineOptions(PipelineOptions):
+class VlmPipelineOptions(PipelineOptions):
     artifacts_path: Optional[Union[Path, str]] = None

     images_scale: float = 1.0
     generate_page_images: bool = False
     generate_picture_images: bool = False


-class VlmPipelineOptions(PaginatedPipelineOptions):
     generate_page_images: bool = True
     force_backend_text: bool = (
         False  # (To be used with vlms, or other generative models)
     )
     # If True, text from backend will be used instead of generated text
     vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
         smoldocling_vlm_conversion_options
     )


 class AsrPipelineOptions(PipelineOptions):
     asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
     artifacts_path: Optional[Union[Path, str]] = None


 class PdfPipelineOptions(PaginatedPipelineOptions):
     """Options for the PDF pipeline."""

     do_table_structure: bool = True  # True: perform table structure extraction
     do_ocr: bool = True  # True: perform OCR, replace programmatic PDF text
     do_code_enrichment: bool = False  # True: perform code OCR
     do_formula_enrichment: bool = False  # True: perform formula OCR, return Latex code
     do_picture_classification: bool = False  # True: classify pictures in documents
     do_picture_description: bool = False  # True: run describe pictures in documents
     force_backend_text: bool = (
         False  # (To be used with vlms, or other generative models)
     )
     # If True, text from backend will be used instead of generated text

     table_structure_options: TableStructureOptions = TableStructureOptions()
     ocr_options: OcrOptions = EasyOcrOptions()
     picture_description_options: PictureDescriptionBaseOptions = (
         smolvlm_picture_description
     )

     images_scale: float = 1.0
     generate_page_images: bool = False
     generate_picture_images: bool = False
     generate_table_images: bool = Field(
         default=False,
         deprecated=(
             "Field `generate_table_images` is deprecated. "
             "To obtain table images, set `PdfPipelineOptions.generate_page_images = True` "
             "before conversion and then use the `TableItem.get_image` function."
         ),
     )

     generate_parsed_pages: Literal[True] = (
         True  # Always True since parsed_page is now mandatory
     )


 class ProcessingPipeline(str, Enum):
     STANDARD = "standard"
     VLM = "vlm"
     ASR = "asr"

docling/datamodel/settings.py

@@ -6,23 +6,14 @@ from pydantic import BaseModel, PlainValidator
from pydantic_settings import BaseSettings, SettingsConfigDict
def _validate_page_range(v: Tuple[int, int]) -> Tuple[int, int]:
    if v[0] < 1 or v[1] < v[0]:
        raise ValueError(
            "Invalid page range: start must be ≥ 1 and end must be ≥ start."
        )
    return v


PageRange = Annotated[Tuple[int, int], PlainValidator(_validate_page_range)]

DEFAULT_PAGE_RANGE: PageRange = (1, sys.maxsize)


class DocumentLimits(BaseModel):
    max_num_pages: int = sys.maxsize
    max_file_size: int = sys.maxsize
    page_range: PageRange = DEFAULT_PAGE_RANGE
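Aside: PageRange wires the plain validation function into pydantic via Annotated + PlainValidator, so every model field typed PageRange gets the check for free. A runnable sketch of the same wiring, with simplified names:

from typing import Annotated, Tuple
from pydantic import BaseModel, PlainValidator, ValidationError

def check(v):
    if v[0] < 1 or v[1] < v[0]:
        raise ValueError("start must be >= 1 and end must be >= start")
    return v

Range = Annotated[Tuple[int, int], PlainValidator(check)]

class Limits(BaseModel):
    page_range: Range = (1, 10)

print(Limits(page_range=(2, 5)).page_range)  # (2, 5)
try:
    Limits(page_range=(0, 5))
except ValidationError as e:
    print("rejected:", e.errors()[0]["msg"])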
class BatchConcurrencySettings(BaseModel):
@@ -32,14 +23,7 @@ class BatchConcurrencySettings(BaseModel):
     page_batch_concurrency: int = 2
     elements_batch_size: int = 16

-    # doc_batch_size: int = 1
-    # doc_batch_concurrency: int = 1
-    # page_batch_size: int = 1
-    # page_batch_concurrency: int = 1
-    # model_concurrency: int = 2
-
-    # To force models into single core: export OMP_NUM_THREADS=1
class DebugSettings(BaseModel):

docling/document_converter.py

@@ -65,65 +65,8 @@ class FormatOption(BaseModel):
        return self


class CsvFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = CsvDocumentBackend


class ExcelFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend


class WordFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MsWordDocumentBackend


class PowerpointFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MsPowerpointDocumentBackend


class MarkdownFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = MarkdownDocumentBackend


class AsciiDocFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = AsciiDocBackend


class HTMLFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = HTMLDocumentBackend


class PatentUsptoFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[PatentUsptoDocumentBackend] = PatentUsptoDocumentBackend


class XMLJatsFormatOption(FormatOption):
    pipeline_cls: Type = SimplePipeline
    backend: Type[AbstractDocumentBackend] = JatsDocumentBackend


class ImageFormatOption(FormatOption):
    pipeline_cls: Type = StandardPdfPipeline
    backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend


class PdfFormatOption(FormatOption):
    pipeline_cls: Type = StandardPdfPipeline
    backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend


class AudioFormatOption(FormatOption):
    pipeline_cls: Type = AsrPipeline
    backend: Type[AbstractDocumentBackend] = NoOpBackend
def _get_default_option(format: InputFormat) -> FormatOption:
    format_to_default_options = {
@@ -167,12 +110,11 @@ def _get_default_option(format: InputFormat) -> FormatOption:
     }
     if (options := format_to_default_options.get(format)) is not None:
         return options
-    else:
-        raise RuntimeError(f"No default options configured for {format}")
+    raise RuntimeError(f"No default options configured for {format}")
 class DocumentConverter:
-    _default_download_filename = "file"
+    _default_filename = "file"

     def __init__(
         self,
@@ -194,10 +136,7 @@ class DocumentConverter:
             Tuple[Type[BasePipeline], str], BasePipeline
         ] = {}

-    def _get_initialized_pipelines(
-        self,
-    ) -> dict[tuple[Type[BasePipeline], str], BasePipeline]:
-        return self.initialized_pipelines

     def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
         """Generate a hash of pipeline options to use as part of the cache key."""
@@ -217,7 +156,7 @@ class DocumentConverter:
     @validate_call(config=ConfigDict(strict=True))
     def convert(
         self,
-        source: Union[Path, str, DocumentStream],  # TODO review naming
+        documents: Union[Path, str, DocumentStream],  # TODO review naming
         headers: Optional[Dict[str, str]] = None,
         raises_on_error: bool = True,
         max_num_pages: int = sys.maxsize,
@@ -225,7 +164,7 @@ class DocumentConverter:
         page_range: PageRange = DEFAULT_PAGE_RANGE,
     ) -> ConversionResult:
         all_res = self.convert_all(
-            source=[source],
+            documents=[documents],
             raises_on_error=raises_on_error,
             max_num_pages=max_num_pages,
             max_file_size=max_file_size,
@@ -237,7 +176,7 @@ class DocumentConverter:
     @validate_call(config=ConfigDict(strict=True))
     def convert_all(
         self,
-        source: Iterable[Union[Path, str, DocumentStream]],  # TODO review naming
+        documents: Iterable[Union[Path, str, DocumentStream]],  # TODO review naming
         headers: Optional[Dict[str, str]] = None,
         raises_on_error: bool = True,  # True: raises on first conversion error; False: does not raise on conv error
         max_num_pages: int = sys.maxsize,
@@ -249,28 +188,10 @@ class DocumentConverter:
             max_file_size=max_file_size,
             page_range=page_range,
         )
-        conv_input = _DocumentConversionInput(
-            path_or_stream_iterator=source, limits=limits, headers=headers
-        )
-        conv_res_iter = self._convert(conv_input, raises_on_error=raises_on_error)
-
-        had_result = False
-        for conv_res in conv_res_iter:
-            had_result = True
-            if raises_on_error and conv_res.status not in {
-                ConversionStatus.SUCCESS,
-                ConversionStatus.PARTIAL_SUCCESS,
-            }:
-                raise ConversionError(
-                    f"Conversion failed for: {conv_res.input.file} with status: {conv_res.status}"
-                )
-            else:
-                yield conv_res
-
-        if not had_result and raises_on_error:
-            raise ConversionError(
-                "Conversion failed because the provided file has no recognizable format or it wasn't in the list of allowed formats."
-            )
+        """Converts a batch of documents.
+
+        Note: PDF backends are not thread-safe, so thread pool usage is disabled.
+        """
     def _convert(
         self, conv_input: _DocumentConversionInput, raises_on_error: bool
@@ -380,5 +301,6 @@ class DocumentConverter:
                 status=ConversionStatus.FAILURE,
             )
             # TODO add error log why it failed.
+            _log.error(f"Input document {in_doc.file} is not valid.")
         return conv_res
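For context, a minimal end-to-end use of the converter and format options touched in this diff, following docling's documented API (the input file name is hypothetical):

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions(do_ocr=False)
converter = DocumentConverter(
    allowed_formats=[InputFormat.PDF],
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)},
)
result = converter.convert("report.pdf")  # hypothetical input path
print(result.document.export_to_markdown())

Since this diff renames convert's source parameter to documents, callers that pass it by keyword would need to change accordingly.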