From 2c9123419f541feda8cc98c53aeb37288fabcaee Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Thu, 11 Sep 2025 15:09:00 +0200 Subject: [PATCH] feat: enrichment steps on all convert pipelines (incl docx, html, etc) (#2251) * allow enrichment on all convert pipelines Signed-off-by: Michele Dolfi * set options in CLI Signed-off-by: Michele Dolfi --------- Signed-off-by: Michele Dolfi --- docling/cli/main.py | 29 +++++++ docling/datamodel/pipeline_options.py | 23 ++--- docling/models/base_model.py | 29 ++++++- docling/pipeline/asr_pipeline.py | 14 +--- docling/pipeline/base_extraction_pipeline.py | 20 ++++- docling/pipeline/base_pipeline.py | 84 +++++++++++++++++-- docling/pipeline/extraction_vlm_pipeline.py | 22 ++--- docling/pipeline/simple_pipeline.py | 12 +-- docling/pipeline/standard_pdf_pipeline.py | 61 ++------------ .../threaded_standard_pdf_pipeline.py | 78 ++++------------- docling/pipeline/vlm_pipeline.py | 18 +--- docs/examples/enrich_simple_pipeline.py | 35 ++++++++ 12 files changed, 235 insertions(+), 190 deletions(-) create mode 100644 docs/examples/enrich_simple_pipeline.py diff --git a/docling/cli/main.py b/docling/cli/main.py index 82c57efb..692efc30 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -48,6 +48,7 @@ from docling.datamodel.base_models import ( from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ( AsrPipelineOptions, + ConvertPipelineOptions, EasyOcrOptions, OcrOptions, PaginatedPipelineOptions, @@ -71,8 +72,13 @@ from docling.datamodel.vlm_model_specs import ( from docling.document_converter import ( AudioFormatOption, DocumentConverter, + ExcelFormatOption, FormatOption, + HTMLFormatOption, + MarkdownFormatOption, PdfFormatOption, + PowerpointFormatOption, + WordFormatOption, ) from docling.models.factories import get_ocr_factory from docling.pipeline.asr_pipeline import AsrPipeline @@ -626,10 +632,33 @@ def convert( # noqa: C901 backend=MetsGbsDocumentBackend, ) + # SimplePipeline options + simple_format_option = ConvertPipelineOptions( + do_picture_description=enrich_picture_description, + do_picture_classification=enrich_picture_classes, + ) + if artifacts_path is not None: + simple_format_option.artifacts_path = artifacts_path + format_options = { InputFormat.PDF: pdf_format_option, InputFormat.IMAGE: pdf_format_option, InputFormat.METS_GBS: mets_gbs_format_option, + InputFormat.DOCX: WordFormatOption( + pipeline_options=simple_format_option + ), + InputFormat.PPTX: PowerpointFormatOption( + pipeline_options=simple_format_option + ), + InputFormat.XLSX: ExcelFormatOption( + pipeline_options=simple_format_option + ), + InputFormat.HTML: HTMLFormatOption( + pipeline_options=simple_format_option + ), + InputFormat.MD: MarkdownFormatOption( + pipeline_options=simple_format_option + ), } elif pipeline == ProcessingPipeline.VLM: diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 58392a5a..278713d3 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -259,11 +259,21 @@ class PipelineOptions(BaseOptions): accelerator_options: AcceleratorOptions = AcceleratorOptions() enable_remote_services: bool = False allow_external_plugins: bool = False - - -class PaginatedPipelineOptions(PipelineOptions): artifacts_path: Optional[Union[Path, str]] = None + +class ConvertPipelineOptions(PipelineOptions): + """Base convert pipeline options.""" + + do_picture_classification: bool 
= False # True: classify pictures in documents + + do_picture_description: bool = False # True: run describe pictures in documents + picture_description_options: PictureDescriptionBaseOptions = ( + smolvlm_picture_description + ) + + +class PaginatedPipelineOptions(ConvertPipelineOptions): images_scale: float = 1.0 generate_page_images: bool = False generate_picture_images: bool = False @@ -295,13 +305,11 @@ class LayoutOptions(BaseModel): class AsrPipelineOptions(PipelineOptions): asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY - artifacts_path: Optional[Union[Path, str]] = None class VlmExtractionPipelineOptions(PipelineOptions): """Options for extraction pipeline.""" - artifacts_path: Optional[Union[Path, str]] = None vlm_options: Union[InlineVlmOptions] = NU_EXTRACT_2B_TRANSFORMERS @@ -312,8 +320,6 @@ class PdfPipelineOptions(PaginatedPipelineOptions): do_ocr: bool = True # True: perform OCR, replace programmatic PDF text do_code_enrichment: bool = False # True: perform code OCR do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code - do_picture_classification: bool = False # True: classify pictures in documents - do_picture_description: bool = False # True: run describe pictures in documents force_backend_text: bool = ( False # (To be used with vlms, or other generative models) ) @@ -321,9 +327,6 @@ class PdfPipelineOptions(PaginatedPipelineOptions): table_structure_options: TableStructureOptions = TableStructureOptions() ocr_options: OcrOptions = EasyOcrOptions() - picture_description_options: PictureDescriptionBaseOptions = ( - smolvlm_picture_description - ) layout_options: LayoutOptions = LayoutOptions() images_scale: float = 1.0 diff --git a/docling/models/base_model.py b/docling/models/base_model.py index dabad80b..bc78b78b 100644 --- a/docling/models/base_model.py +++ b/docling/models/base_model.py @@ -4,7 +4,13 @@ from collections.abc import Iterable from typing import Any, Generic, Optional, Protocol, Type, Union import numpy as np -from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem +from docling_core.types.doc import ( + BoundingBox, + DocItem, + DoclingDocument, + NodeItem, + PictureItem, +) from PIL.Image import Image from typing_extensions import TypeVar @@ -164,8 +170,17 @@ class BaseItemAndImageEnrichmentModel( return None assert isinstance(element, DocItem) - element_prov = element.prov[0] + # Allow the case of documents without page images but embedded images (e.g. 
Word and HTML docs) + if len(element.prov) == 0 and isinstance(element, PictureItem): + embedded_im = element.get_image(conv_res.document) + if embedded_im is not None: + return ItemAndImageEnrichmentElement(item=element, image=embedded_im) + else: + return None + + # Crop the image from the page + element_prov = element.prov[0] bbox = element_prov.bbox width = bbox.r - bbox.l height = bbox.t - bbox.b @@ -183,4 +198,14 @@ class BaseItemAndImageEnrichmentModel( cropped_image = conv_res.pages[page_ix].get_image( scale=self.images_scale, cropbox=expanded_bbox ) + + # Allow for images being embedded without the page backend or page images + if cropped_image is None and isinstance(element, PictureItem): + embedded_im = element.get_image(conv_res.document) + if embedded_im is not None: + return ItemAndImageEnrichmentElement(item=element, image=embedded_im) + else: + return None + + # Return the proper cropped image return ItemAndImageEnrichmentElement(item=element, image=cropped_image) diff --git a/docling/pipeline/asr_pipeline.py b/docling/pipeline/asr_pipeline.py index 94fa6341..d043f9bb 100644 --- a/docling/pipeline/asr_pipeline.py +++ b/docling/pipeline/asr_pipeline.py @@ -208,25 +208,13 @@ class AsrPipeline(BasePipeline): self.pipeline_options: AsrPipelineOptions = pipeline_options - artifacts_path: Optional[Path] = None - if pipeline_options.artifacts_path is not None: - artifacts_path = Path(pipeline_options.artifacts_path).expanduser() - elif settings.artifacts_path is not None: - artifacts_path = Path(settings.artifacts_path).expanduser() - - if artifacts_path is not None and not artifacts_path.is_dir(): - raise RuntimeError( - f"The value of {artifacts_path=} is not valid. " - "When defined, it must point to a folder containing all models required by the pipeline." - ) - if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions): asr_options: InlineAsrNativeWhisperOptions = ( self.pipeline_options.asr_options ) self._model = _NativeWhisperModel( enabled=True, # must be always enabled for this pipeline to make sense.
- artifacts_path=artifacts_path, + artifacts_path=self.artifacts_path, accelerator_options=pipeline_options.accelerator_options, asr_options=asr_options, ) diff --git a/docling/pipeline/base_extraction_pipeline.py b/docling/pipeline/base_extraction_pipeline.py index 959c1b11..95a16c72 100644 --- a/docling/pipeline/base_extraction_pipeline.py +++ b/docling/pipeline/base_extraction_pipeline.py @@ -1,19 +1,33 @@ import logging from abc import ABC, abstractmethod +from pathlib import Path from typing import Optional from docling.datamodel.base_models import ConversionStatus, ErrorItem from docling.datamodel.document import InputDocument from docling.datamodel.extraction import ExtractionResult, ExtractionTemplateType -from docling.datamodel.pipeline_options import BaseOptions +from docling.datamodel.pipeline_options import BaseOptions, PipelineOptions +from docling.datamodel.settings import settings _log = logging.getLogger(__name__) class BaseExtractionPipeline(ABC): - def __init__(self, pipeline_options: BaseOptions): + def __init__(self, pipeline_options: PipelineOptions): self.pipeline_options = pipeline_options + self.artifacts_path: Optional[Path] = None + if pipeline_options.artifacts_path is not None: + self.artifacts_path = Path(pipeline_options.artifacts_path).expanduser() + elif settings.artifacts_path is not None: + self.artifacts_path = Path(settings.artifacts_path).expanduser() + + if self.artifacts_path is not None and not self.artifacts_path.is_dir(): + raise RuntimeError( + f"The value of {self.artifacts_path=} is not valid. " + "When defined, it must point to a folder containing all models required by the pipeline." + ) + def execute( self, in_doc: InputDocument, @@ -54,5 +68,5 @@ class BaseExtractionPipeline(ABC): @classmethod @abstractmethod - def get_default_options(cls) -> BaseOptions: + def get_default_options(cls) -> PipelineOptions: pass diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py index 0ac6b4ba..0c35d24c 100644 --- a/docling/pipeline/base_pipeline.py +++ b/docling/pipeline/base_pipeline.py @@ -4,7 +4,8 @@ import time import traceback from abc import ABC, abstractmethod from collections.abc import Iterable -from typing import Any, Callable, List +from pathlib import Path +from typing import Any, Callable, List, Optional from docling_core.types.doc import NodeItem @@ -20,9 +21,19 @@ from docling.datamodel.base_models import ( Page, ) from docling.datamodel.document import ConversionResult, InputDocument -from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions +from docling.datamodel.pipeline_options import ( + ConvertPipelineOptions, + PdfPipelineOptions, + PipelineOptions, +) from docling.datamodel.settings import settings from docling.models.base_model import GenericEnrichmentModel +from docling.models.document_picture_classifier import ( + DocumentPictureClassifier, + DocumentPictureClassifierOptions, +) +from docling.models.factories import get_picture_description_factory +from docling.models.picture_description_base_model import PictureDescriptionBaseModel from docling.utils.profiling import ProfilingScope, TimeRecorder from docling.utils.utils import chunkify @@ -36,6 +47,18 @@ class BasePipeline(ABC): self.build_pipe: List[Callable] = [] self.enrichment_pipe: List[GenericEnrichmentModel[Any]] = [] + self.artifacts_path: Optional[Path] = None + if pipeline_options.artifacts_path is not None: + self.artifacts_path = Path(pipeline_options.artifacts_path).expanduser() + elif settings.artifacts_path is 
not None: + self.artifacts_path = Path(settings.artifacts_path).expanduser() + + if self.artifacts_path is not None and not self.artifacts_path.is_dir(): + raise RuntimeError( + f"The value of {self.artifacts_path=} is not valid. " + "When defined, it must point to a folder containing all models required by the pipeline." + ) + def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult: conv_res = ConversionResult(input=in_doc) @@ -108,15 +131,58 @@ class BasePipeline(ABC): def is_backend_supported(cls, backend: AbstractDocumentBackend): pass - # def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]: - # for model in self.build_pipe: - # element_batch = model(element_batch) - # - # yield from element_batch + +class ConvertPipeline(BasePipeline): + def __init__(self, pipeline_options: ConvertPipelineOptions): + super().__init__(pipeline_options) + self.pipeline_options: ConvertPipelineOptions + + # ------ Common enrichment models working on all backends + + # Picture description model + if ( + picture_description_model := self._get_picture_description_model( + artifacts_path=self.artifacts_path + ) + ) is None: + raise RuntimeError( + f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}." + ) + + self.enrichment_pipe = [ + # Document Picture Classifier + DocumentPictureClassifier( + enabled=pipeline_options.do_picture_classification, + artifacts_path=self.artifacts_path, + options=DocumentPictureClassifierOptions(), + accelerator_options=pipeline_options.accelerator_options, + ), + # Document Picture description + picture_description_model, + ] + + def _get_picture_description_model( + self, artifacts_path: Optional[Path] = None + ) -> Optional[PictureDescriptionBaseModel]: + factory = get_picture_description_factory( + allow_external_plugins=self.pipeline_options.allow_external_plugins + ) + return factory.create_instance( + options=self.pipeline_options.picture_description_options, + enabled=self.pipeline_options.do_picture_description, + enable_remote_services=self.pipeline_options.enable_remote_services, + artifacts_path=artifacts_path, + accelerator_options=self.pipeline_options.accelerator_options, + ) + + @classmethod + @abstractmethod + def get_default_options(cls) -> ConvertPipelineOptions: + pass -class PaginatedPipeline(BasePipeline): # TODO this is a bad name. - def __init__(self, pipeline_options: PipelineOptions): +class PaginatedPipeline(ConvertPipeline): # TODO this is a bad name. 
+ def __init__(self, pipeline_options: ConvertPipelineOptions): super().__init__(pipeline_options) self.keep_backend = False diff --git a/docling/pipeline/extraction_vlm_pipeline.py b/docling/pipeline/extraction_vlm_pipeline.py index 68222cd5..47aba8cd 100644 --- a/docling/pipeline/extraction_vlm_pipeline.py +++ b/docling/pipeline/extraction_vlm_pipeline.py @@ -1,7 +1,6 @@ import inspect import json import logging -from pathlib import Path from typing import Optional from PIL.Image import Image @@ -16,7 +15,10 @@ from docling.datamodel.extraction import ( ExtractionResult, ExtractionTemplateType, ) -from docling.datamodel.pipeline_options import BaseOptions, VlmExtractionPipelineOptions +from docling.datamodel.pipeline_options import ( + PipelineOptions, + VlmExtractionPipelineOptions, +) from docling.datamodel.settings import settings from docling.models.vlm_models_inline.nuextract_transformers_model import ( NuExtractTransformersModel, @@ -35,22 +37,10 @@ class ExtractionVlmPipeline(BaseExtractionPipeline): self.accelerator_options = pipeline_options.accelerator_options self.pipeline_options: VlmExtractionPipelineOptions - artifacts_path: Optional[Path] = None - if pipeline_options.artifacts_path is not None: - artifacts_path = Path(pipeline_options.artifacts_path).expanduser() - elif settings.artifacts_path is not None: - artifacts_path = Path(settings.artifacts_path).expanduser() - - if artifacts_path is not None and not artifacts_path.is_dir(): - raise RuntimeError( - f"The value of {artifacts_path=} is not valid. " - "When defined, it must point to a folder containing all models required by the pipeline." - ) - # Create VLM model instance self.vlm_model = NuExtractTransformersModel( enabled=True, - artifacts_path=artifacts_path, # Will download automatically + artifacts_path=self.artifacts_path, # Will download automatically accelerator_options=self.accelerator_options, vlm_options=pipeline_options.vlm_options, ) @@ -203,5 +193,5 @@ class ExtractionVlmPipeline(BaseExtractionPipeline): raise ValueError(f"Unsupported template type: {type(template)}") @classmethod - def get_default_options(cls) -> BaseOptions: + def get_default_options(cls) -> PipelineOptions: return VlmExtractionPipelineOptions() diff --git a/docling/pipeline/simple_pipeline.py b/docling/pipeline/simple_pipeline.py index 2e8f0ea0..0e3f1b6f 100644 --- a/docling/pipeline/simple_pipeline.py +++ b/docling/pipeline/simple_pipeline.py @@ -6,21 +6,21 @@ from docling.backend.abstract_backend import ( ) from docling.datamodel.base_models import ConversionStatus from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import PipelineOptions -from docling.pipeline.base_pipeline import BasePipeline +from docling.datamodel.pipeline_options import ConvertPipelineOptions +from docling.pipeline.base_pipeline import ConvertPipeline from docling.utils.profiling import ProfilingScope, TimeRecorder _log = logging.getLogger(__name__) -class SimplePipeline(BasePipeline): +class SimplePipeline(ConvertPipeline): """SimpleModelPipeline. This class is used at the moment for formats / backends which produce straight DoclingDocument output. 
""" - def __init__(self, pipeline_options: PipelineOptions): + def __init__(self, pipeline_options: ConvertPipelineOptions): super().__init__(pipeline_options) def _build_document(self, conv_res: ConversionResult) -> ConversionResult: @@ -47,8 +47,8 @@ class SimplePipeline(BasePipeline): return ConversionStatus.SUCCESS @classmethod - def get_default_options(cls) -> PipelineOptions: - return PipelineOptions() + def get_default_options(cls) -> ConvertPipelineOptions: + return ConvertPipelineOptions() @classmethod def is_backend_supported(cls, backend: AbstractDocumentBackend): diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index c04ddca9..1722ca5b 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -15,18 +15,13 @@ from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.settings import settings from docling.models.base_ocr_model import BaseOcrModel from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions -from docling.models.document_picture_classifier import ( - DocumentPictureClassifier, - DocumentPictureClassifierOptions, -) -from docling.models.factories import get_ocr_factory, get_picture_description_factory +from docling.models.factories import get_ocr_factory from docling.models.layout_model import LayoutModel from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions from docling.models.page_preprocessing_model import ( PagePreprocessingModel, PagePreprocessingOptions, ) -from docling.models.picture_description_base_model import PictureDescriptionBaseModel from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions from docling.models.table_structure_model import TableStructureModel from docling.pipeline.base_pipeline import PaginatedPipeline @@ -41,18 +36,6 @@ class StandardPdfPipeline(PaginatedPipeline): super().__init__(pipeline_options) self.pipeline_options: PdfPipelineOptions - artifacts_path: Optional[Path] = None - if pipeline_options.artifacts_path is not None: - artifacts_path = Path(pipeline_options.artifacts_path).expanduser() - elif settings.artifacts_path is not None: - artifacts_path = Path(settings.artifacts_path).expanduser() - - if artifacts_path is not None and not artifacts_path.is_dir(): - raise RuntimeError( - f"The value of {artifacts_path=} is not valid. " - "When defined, it must point to a folder containing all models required by the pipeline." 
- ) - with warnings.catch_warnings(): # deprecated generate_table_images warnings.filterwarnings("ignore", category=DeprecationWarning) self.keep_images = ( @@ -63,7 +46,7 @@ class StandardPdfPipeline(PaginatedPipeline): self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions()) - ocr_model = self.get_ocr_model(artifacts_path=artifacts_path) + ocr_model = self.get_ocr_model(artifacts_path=self.artifacts_path) self.build_pipe = [ # Pre-processing @@ -76,14 +59,14 @@ class StandardPdfPipeline(PaginatedPipeline): ocr_model, # Layout model LayoutModel( - artifacts_path=artifacts_path, + artifacts_path=self.artifacts_path, accelerator_options=pipeline_options.accelerator_options, options=pipeline_options.layout_options, ), # Table structure model TableStructureModel( enabled=pipeline_options.do_table_structure, - artifacts_path=artifacts_path, + artifacts_path=self.artifacts_path, options=pipeline_options.table_structure_options, accelerator_options=pipeline_options.accelerator_options, ), @@ -91,37 +74,19 @@ class StandardPdfPipeline(PaginatedPipeline): PageAssembleModel(options=PageAssembleOptions()), ] - # Picture description model - if ( - picture_description_model := self.get_picture_description_model( - artifacts_path=artifacts_path - ) - ) is None: - raise RuntimeError( - f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}." - ) - self.enrichment_pipe = [ # Code Formula Enrichment Model CodeFormulaModel( enabled=pipeline_options.do_code_enrichment or pipeline_options.do_formula_enrichment, - artifacts_path=artifacts_path, + artifacts_path=self.artifacts_path, options=CodeFormulaModelOptions( do_code_enrichment=pipeline_options.do_code_enrichment, do_formula_enrichment=pipeline_options.do_formula_enrichment, ), accelerator_options=pipeline_options.accelerator_options, ), - # Document Picture Classifier - DocumentPictureClassifier( - enabled=pipeline_options.do_picture_classification, - artifacts_path=artifacts_path, - options=DocumentPictureClassifierOptions(), - accelerator_options=pipeline_options.accelerator_options, - ), - # Document Picture description - picture_description_model, + *self.enrichment_pipe, ] if ( @@ -158,20 +123,6 @@ class StandardPdfPipeline(PaginatedPipeline): accelerator_options=self.pipeline_options.accelerator_options, ) - def get_picture_description_model( - self, artifacts_path: Optional[Path] = None - ) -> Optional[PictureDescriptionBaseModel]: - factory = get_picture_description_factory( - allow_external_plugins=self.pipeline_options.allow_external_plugins - ) - return factory.create_instance( - options=self.pipeline_options.picture_description_options, - enabled=self.pipeline_options.do_picture_description, - enable_remote_services=self.pipeline_options.enable_remote_services, - artifacts_path=artifacts_path, - accelerator_options=self.pipeline_options.accelerator_options, - ) - def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page: with TimeRecorder(conv_res, "page_init"): page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore diff --git a/docling/pipeline/threaded_standard_pdf_pipeline.py b/docling/pipeline/threaded_standard_pdf_pipeline.py index d5328cfd..99e83770 100644 --- a/docling/pipeline/threaded_standard_pdf_pipeline.py +++ b/docling/pipeline/threaded_standard_pdf_pipeline.py @@ -32,21 +32,16 @@ from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions from 
docling.datamodel.settings import settings from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions -from docling.models.document_picture_classifier import ( - DocumentPictureClassifier, - DocumentPictureClassifierOptions, -) -from docling.models.factories import get_ocr_factory, get_picture_description_factory +from docling.models.factories import get_ocr_factory from docling.models.layout_model import LayoutModel from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions from docling.models.page_preprocessing_model import ( PagePreprocessingModel, PagePreprocessingOptions, ) -from docling.models.picture_description_base_model import PictureDescriptionBaseModel from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions from docling.models.table_structure_model import TableStructureModel -from docling.pipeline.base_pipeline import BasePipeline +from docling.pipeline.base_pipeline import ConvertPipeline from docling.utils.profiling import ProfilingScope, TimeRecorder from docling.utils.utils import chunkify @@ -294,7 +289,7 @@ class RunContext: # ────────────────────────────────────────────────────────────────────────────── -class ThreadedStandardPdfPipeline(BasePipeline): +class ThreadedStandardPdfPipeline(ConvertPipeline): """High-performance PDF pipeline with multi-threaded stages.""" def __init__(self, pipeline_options: ThreadedPdfPipelineOptions) -> None: @@ -310,7 +305,7 @@ class ThreadedStandardPdfPipeline(BasePipeline): # ──────────────────────────────────────────────────────────────────────── def _init_models(self) -> None: - art_path = self._resolve_artifacts_path() + art_path = self.artifacts_path self.keep_images = ( self.pipeline_options.generate_page_images or self.pipeline_options.generate_picture_images @@ -337,32 +332,20 @@ class ThreadedStandardPdfPipeline(BasePipeline): self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions()) # --- optional enrichment ------------------------------------------------ - self.enrichment_pipe = [] - code_formula = CodeFormulaModel( - enabled=self.pipeline_options.do_code_enrichment - or self.pipeline_options.do_formula_enrichment, - artifacts_path=art_path, - options=CodeFormulaModelOptions( - do_code_enrichment=self.pipeline_options.do_code_enrichment, - do_formula_enrichment=self.pipeline_options.do_formula_enrichment, + self.enrichment_pipe = [ + # Code Formula Enrichment Model + CodeFormulaModel( + enabled=self.pipeline_options.do_code_enrichment + or self.pipeline_options.do_formula_enrichment, + artifacts_path=self.artifacts_path, + options=CodeFormulaModelOptions( + do_code_enrichment=self.pipeline_options.do_code_enrichment, + do_formula_enrichment=self.pipeline_options.do_formula_enrichment, + ), + accelerator_options=self.pipeline_options.accelerator_options, ), - accelerator_options=self.pipeline_options.accelerator_options, - ) - if code_formula.enabled: - self.enrichment_pipe.append(code_formula) - - picture_classifier = DocumentPictureClassifier( - enabled=self.pipeline_options.do_picture_classification, - artifacts_path=art_path, - options=DocumentPictureClassifierOptions(), - accelerator_options=self.pipeline_options.accelerator_options, - ) - if picture_classifier.enabled: - self.enrichment_pipe.append(picture_classifier) - - picture_descr = self._make_picture_description_model(art_path) - if picture_descr and picture_descr.enabled: - self.enrichment_pipe.append(picture_descr) + *self.enrichment_pipe, + ] self.keep_backend = 
any( ( @@ -374,19 +357,6 @@ class ThreadedStandardPdfPipeline(BasePipeline): ) # ---------------------------------------------------------------- helpers - def _resolve_artifacts_path(self) -> Optional[Path]: - if self.pipeline_options.artifacts_path: - p = Path(self.pipeline_options.artifacts_path).expanduser() - elif settings.artifacts_path: - p = Path(settings.artifacts_path).expanduser() - else: - return None - if not p.is_dir(): - raise RuntimeError( - f"{p} does not exist or is not a directory containing the required models" - ) - return p - def _make_ocr_model(self, art_path: Optional[Path]) -> Any: factory = get_ocr_factory( allow_external_plugins=self.pipeline_options.allow_external_plugins @@ -398,20 +368,6 @@ class ThreadedStandardPdfPipeline(BasePipeline): accelerator_options=self.pipeline_options.accelerator_options, ) - def _make_picture_description_model( - self, art_path: Optional[Path] - ) -> Optional[PictureDescriptionBaseModel]: - factory = get_picture_description_factory( - allow_external_plugins=self.pipeline_options.allow_external_plugins - ) - return factory.create_instance( - options=self.pipeline_options.picture_description_options, - enabled=self.pipeline_options.do_picture_description, - enable_remote_services=self.pipeline_options.enable_remote_services, - artifacts_path=art_path, - accelerator_options=self.pipeline_options.accelerator_options, - ) - # ──────────────────────────────────────────────────────────────────────── # Build - thread pipeline # ──────────────────────────────────────────────────────────────────────── diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index d69f2485..fa75be99 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -54,18 +54,6 @@ class VlmPipeline(PaginatedPipeline): self.pipeline_options: VlmPipelineOptions - artifacts_path: Optional[Path] = None - if pipeline_options.artifacts_path is not None: - artifacts_path = Path(pipeline_options.artifacts_path).expanduser() - elif settings.artifacts_path is not None: - artifacts_path = Path(settings.artifacts_path).expanduser() - - if artifacts_path is not None and not artifacts_path.is_dir(): - raise RuntimeError( - f"The value of {artifacts_path=} is not valid. " - "When defined, it must point to a folder containing all models required by the pipeline." - ) - # force_backend_text = False - use text that is coming from VLM response # force_backend_text = True - get text from backend using bounding boxes predicted by SmolDocling doctags self.force_backend_text = ( @@ -89,7 +77,7 @@ class VlmPipeline(PaginatedPipeline): self.build_pipe = [ HuggingFaceMlxModel( enabled=True, # must be always enabled for this pipeline to make sense. - artifacts_path=artifacts_path, + artifacts_path=self.artifacts_path, accelerator_options=pipeline_options.accelerator_options, vlm_options=vlm_options, ), @@ -98,7 +86,7 @@ class VlmPipeline(PaginatedPipeline): self.build_pipe = [ HuggingFaceTransformersVlmModel( enabled=True, # must be always enabled for this pipeline to make sense. - artifacts_path=artifacts_path, + artifacts_path=self.artifacts_path, accelerator_options=pipeline_options.accelerator_options, vlm_options=vlm_options, ), @@ -109,7 +97,7 @@ class VlmPipeline(PaginatedPipeline): self.build_pipe = [ VllmVlmModel( enabled=True, # must be always enabled for this pipeline to make sense. 
- artifacts_path=artifacts_path, + artifacts_path=self.artifacts_path, accelerator_options=pipeline_options.accelerator_options, vlm_options=vlm_options, ), diff --git a/docs/examples/enrich_simple_pipeline.py b/docs/examples/enrich_simple_pipeline.py new file mode 100644 index 00000000..91af94ca --- /dev/null +++ b/docs/examples/enrich_simple_pipeline.py @@ -0,0 +1,35 @@ +import logging +from pathlib import Path + +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import ConvertPipelineOptions +from docling.document_converter import ( + DocumentConverter, + HTMLFormatOption, + WordFormatOption, +) + +_log = logging.getLogger(__name__) + + +def main(): + input_path = Path("tests/data/docx/word_sample.docx") + + pipeline_options = ConvertPipelineOptions() + pipeline_options.do_picture_classification = True + pipeline_options.do_picture_description = True + + doc_converter = DocumentConverter( + format_options={ + InputFormat.DOCX: WordFormatOption(pipeline_options=pipeline_options), + InputFormat.HTML: HTMLFormatOption(pipeline_options=pipeline_options), + }, + ) + + res = doc_converter.convert(input_path) + + print(res.document.export_to_markdown()) + + +if __name__ == "__main__": + main()