feat: enrichment steps on all convert pipelines (incl docx, html, etc) (#2251)

* allow enrichment on all convert pipelines

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* set options in CLI

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2025-09-11 15:09:00 +02:00
committed by GitHub
parent c6965495a2
commit 2c9123419f
12 changed files with 235 additions and 190 deletions

View File

@@ -48,6 +48,7 @@ from docling.datamodel.base_models import (
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
AsrPipelineOptions, AsrPipelineOptions,
ConvertPipelineOptions,
EasyOcrOptions, EasyOcrOptions,
OcrOptions, OcrOptions,
PaginatedPipelineOptions, PaginatedPipelineOptions,
@@ -71,8 +72,13 @@ from docling.datamodel.vlm_model_specs import (
from docling.document_converter import ( from docling.document_converter import (
AudioFormatOption, AudioFormatOption,
DocumentConverter, DocumentConverter,
ExcelFormatOption,
FormatOption, FormatOption,
HTMLFormatOption,
MarkdownFormatOption,
PdfFormatOption, PdfFormatOption,
PowerpointFormatOption,
WordFormatOption,
) )
from docling.models.factories import get_ocr_factory from docling.models.factories import get_ocr_factory
from docling.pipeline.asr_pipeline import AsrPipeline from docling.pipeline.asr_pipeline import AsrPipeline
@@ -626,10 +632,33 @@ def convert( # noqa: C901
backend=MetsGbsDocumentBackend, backend=MetsGbsDocumentBackend,
) )
# SimplePipeline options
simple_format_option = ConvertPipelineOptions(
do_picture_description=enrich_picture_description,
do_picture_classification=enrich_picture_classes,
)
if artifacts_path is not None:
simple_format_option.artifacts_path = artifacts_path
format_options = { format_options = {
InputFormat.PDF: pdf_format_option, InputFormat.PDF: pdf_format_option,
InputFormat.IMAGE: pdf_format_option, InputFormat.IMAGE: pdf_format_option,
InputFormat.METS_GBS: mets_gbs_format_option, InputFormat.METS_GBS: mets_gbs_format_option,
InputFormat.DOCX: WordFormatOption(
pipeline_options=simple_format_option
),
InputFormat.PPTX: PowerpointFormatOption(
pipeline_options=simple_format_option
),
InputFormat.XLSX: ExcelFormatOption(
pipeline_options=simple_format_option
),
InputFormat.HTML: HTMLFormatOption(
pipeline_options=simple_format_option
),
InputFormat.MD: MarkdownFormatOption(
pipeline_options=simple_format_option
),
} }
elif pipeline == ProcessingPipeline.VLM: elif pipeline == ProcessingPipeline.VLM:

View File

@@ -259,11 +259,21 @@ class PipelineOptions(BaseOptions):
accelerator_options: AcceleratorOptions = AcceleratorOptions() accelerator_options: AcceleratorOptions = AcceleratorOptions()
enable_remote_services: bool = False enable_remote_services: bool = False
allow_external_plugins: bool = False allow_external_plugins: bool = False
class PaginatedPipelineOptions(PipelineOptions):
artifacts_path: Optional[Union[Path, str]] = None artifacts_path: Optional[Union[Path, str]] = None
class ConvertPipelineOptions(PipelineOptions):
"""Base convert pipeline options."""
do_picture_classification: bool = False # True: classify pictures in documents
do_picture_description: bool = False # True: run describe pictures in documents
picture_description_options: PictureDescriptionBaseOptions = (
smolvlm_picture_description
)
class PaginatedPipelineOptions(ConvertPipelineOptions):
images_scale: float = 1.0 images_scale: float = 1.0
generate_page_images: bool = False generate_page_images: bool = False
generate_picture_images: bool = False generate_picture_images: bool = False
@@ -295,13 +305,11 @@ class LayoutOptions(BaseModel):
class AsrPipelineOptions(PipelineOptions): class AsrPipelineOptions(PipelineOptions):
asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
artifacts_path: Optional[Union[Path, str]] = None
class VlmExtractionPipelineOptions(PipelineOptions): class VlmExtractionPipelineOptions(PipelineOptions):
"""Options for extraction pipeline.""" """Options for extraction pipeline."""
artifacts_path: Optional[Union[Path, str]] = None
vlm_options: Union[InlineVlmOptions] = NU_EXTRACT_2B_TRANSFORMERS vlm_options: Union[InlineVlmOptions] = NU_EXTRACT_2B_TRANSFORMERS
@@ -312,8 +320,6 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
do_code_enrichment: bool = False # True: perform code OCR do_code_enrichment: bool = False # True: perform code OCR
do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
do_picture_classification: bool = False # True: classify pictures in documents
do_picture_description: bool = False # True: run describe pictures in documents
force_backend_text: bool = ( force_backend_text: bool = (
False # (To be used with vlms, or other generative models) False # (To be used with vlms, or other generative models)
) )
@@ -321,9 +327,6 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
table_structure_options: TableStructureOptions = TableStructureOptions() table_structure_options: TableStructureOptions = TableStructureOptions()
ocr_options: OcrOptions = EasyOcrOptions() ocr_options: OcrOptions = EasyOcrOptions()
picture_description_options: PictureDescriptionBaseOptions = (
smolvlm_picture_description
)
layout_options: LayoutOptions = LayoutOptions() layout_options: LayoutOptions = LayoutOptions()
images_scale: float = 1.0 images_scale: float = 1.0

View File

@@ -4,7 +4,13 @@ from collections.abc import Iterable
from typing import Any, Generic, Optional, Protocol, Type, Union from typing import Any, Generic, Optional, Protocol, Type, Union
import numpy as np import numpy as np
from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem from docling_core.types.doc import (
BoundingBox,
DocItem,
DoclingDocument,
NodeItem,
PictureItem,
)
from PIL.Image import Image from PIL.Image import Image
from typing_extensions import TypeVar from typing_extensions import TypeVar
@@ -164,8 +170,17 @@ class BaseItemAndImageEnrichmentModel(
return None return None
assert isinstance(element, DocItem) assert isinstance(element, DocItem)
element_prov = element.prov[0]
# Allow the case of documents without page images but embedded images (e.g. Word and HTML docs)
if len(element.prov) == 0 and isinstance(element, PictureItem):
embedded_im = element.get_image(conv_res.document)
if embedded_im is not None:
return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
else:
return None
# Crop the image from the page
element_prov = element.prov[0]
bbox = element_prov.bbox bbox = element_prov.bbox
width = bbox.r - bbox.l width = bbox.r - bbox.l
height = bbox.t - bbox.b height = bbox.t - bbox.b
@@ -183,4 +198,14 @@ class BaseItemAndImageEnrichmentModel(
cropped_image = conv_res.pages[page_ix].get_image( cropped_image = conv_res.pages[page_ix].get_image(
scale=self.images_scale, cropbox=expanded_bbox scale=self.images_scale, cropbox=expanded_bbox
) )
# Allow for images being embedded without the page backend or page images
if cropped_image is None and isinstance(element, PictureItem):
embedded_im = element.get_image(conv_res.document)
if embedded_im is not None:
return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
else:
return None
# Return the proper cropped image
return ItemAndImageEnrichmentElement(item=element, image=cropped_image) return ItemAndImageEnrichmentElement(item=element, image=cropped_image)

View File

@@ -208,25 +208,13 @@ class AsrPipeline(BasePipeline):
self.pipeline_options: AsrPipelineOptions = pipeline_options self.pipeline_options: AsrPipelineOptions = pipeline_options
artifacts_path: Optional[Path] = None
if pipeline_options.artifacts_path is not None:
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
elif settings.artifacts_path is not None:
artifacts_path = Path(settings.artifacts_path).expanduser()
if artifacts_path is not None and not artifacts_path.is_dir():
raise RuntimeError(
f"The value of {artifacts_path=} is not valid. "
"When defined, it must point to a folder containing all models required by the pipeline."
)
if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions): if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions):
asr_options: InlineAsrNativeWhisperOptions = ( asr_options: InlineAsrNativeWhisperOptions = (
self.pipeline_options.asr_options self.pipeline_options.asr_options
) )
self._model = _NativeWhisperModel( self._model = _NativeWhisperModel(
enabled=True, # must be always enabled for this pipeline to make sense. enabled=True, # must be always enabled for this pipeline to make sense.
artifacts_path=artifacts_path, artifacts_path=self.artifacts_path,
accelerator_options=pipeline_options.accelerator_options, accelerator_options=pipeline_options.accelerator_options,
asr_options=asr_options, asr_options=asr_options,
) )

View File

@@ -1,19 +1,33 @@
import logging import logging
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from pathlib import Path
from typing import Optional from typing import Optional
from docling.datamodel.base_models import ConversionStatus, ErrorItem from docling.datamodel.base_models import ConversionStatus, ErrorItem
from docling.datamodel.document import InputDocument from docling.datamodel.document import InputDocument
from docling.datamodel.extraction import ExtractionResult, ExtractionTemplateType from docling.datamodel.extraction import ExtractionResult, ExtractionTemplateType
from docling.datamodel.pipeline_options import BaseOptions from docling.datamodel.pipeline_options import BaseOptions, PipelineOptions
from docling.datamodel.settings import settings
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
class BaseExtractionPipeline(ABC): class BaseExtractionPipeline(ABC):
def __init__(self, pipeline_options: BaseOptions): def __init__(self, pipeline_options: PipelineOptions):
self.pipeline_options = pipeline_options self.pipeline_options = pipeline_options
self.artifacts_path: Optional[Path] = None
if pipeline_options.artifacts_path is not None:
self.artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
elif settings.artifacts_path is not None:
self.artifacts_path = Path(settings.artifacts_path).expanduser()
if self.artifacts_path is not None and not self.artifacts_path.is_dir():
raise RuntimeError(
f"The value of {self.artifacts_path=} is not valid. "
"When defined, it must point to a folder containing all models required by the pipeline."
)
def execute( def execute(
self, self,
in_doc: InputDocument, in_doc: InputDocument,
@@ -54,5 +68,5 @@ class BaseExtractionPipeline(ABC):
@classmethod @classmethod
@abstractmethod @abstractmethod
def get_default_options(cls) -> BaseOptions: def get_default_options(cls) -> PipelineOptions:
pass pass

View File

@@ -4,7 +4,8 @@ import time
import traceback import traceback
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from collections.abc import Iterable from collections.abc import Iterable
from typing import Any, Callable, List from pathlib import Path
from typing import Any, Callable, List, Optional
from docling_core.types.doc import NodeItem from docling_core.types.doc import NodeItem
@@ -20,9 +21,19 @@ from docling.datamodel.base_models import (
Page, Page,
) )
from docling.datamodel.document import ConversionResult, InputDocument from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions from docling.datamodel.pipeline_options import (
ConvertPipelineOptions,
PdfPipelineOptions,
PipelineOptions,
)
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.models.base_model import GenericEnrichmentModel from docling.models.base_model import GenericEnrichmentModel
from docling.models.document_picture_classifier import (
DocumentPictureClassifier,
DocumentPictureClassifierOptions,
)
from docling.models.factories import get_picture_description_factory
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
from docling.utils.profiling import ProfilingScope, TimeRecorder from docling.utils.profiling import ProfilingScope, TimeRecorder
from docling.utils.utils import chunkify from docling.utils.utils import chunkify
@@ -36,6 +47,18 @@ class BasePipeline(ABC):
self.build_pipe: List[Callable] = [] self.build_pipe: List[Callable] = []
self.enrichment_pipe: List[GenericEnrichmentModel[Any]] = [] self.enrichment_pipe: List[GenericEnrichmentModel[Any]] = []
self.artifacts_path: Optional[Path] = None
if pipeline_options.artifacts_path is not None:
self.artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
elif settings.artifacts_path is not None:
self.artifacts_path = Path(settings.artifacts_path).expanduser()
if self.artifacts_path is not None and not self.artifacts_path.is_dir():
raise RuntimeError(
f"The value of {self.artifacts_path=} is not valid. "
"When defined, it must point to a folder containing all models required by the pipeline."
)
def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult: def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
conv_res = ConversionResult(input=in_doc) conv_res = ConversionResult(input=in_doc)
@@ -108,15 +131,58 @@ class BasePipeline(ABC):
def is_backend_supported(cls, backend: AbstractDocumentBackend): def is_backend_supported(cls, backend: AbstractDocumentBackend):
pass pass
# def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]:
# for model in self.build_pipe: class ConvertPipeline(BasePipeline):
# element_batch = model(element_batch) def __init__(self, pipeline_options: ConvertPipelineOptions):
# super().__init__(pipeline_options)
# yield from element_batch self.pipeline_options: ConvertPipelineOptions
# ------ Common enrichment models working on all backends
# Picture description model
if (
picture_description_model := self._get_picture_description_model(
artifacts_path=self.artifacts_path
)
) is None:
raise RuntimeError(
f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
)
self.enrichment_pipe = [
# Document Picture Classifier
DocumentPictureClassifier(
enabled=pipeline_options.do_picture_classification,
artifacts_path=self.artifacts_path,
options=DocumentPictureClassifierOptions(),
accelerator_options=pipeline_options.accelerator_options,
),
# Document Picture description
picture_description_model,
]
def _get_picture_description_model(
self, artifacts_path: Optional[Path] = None
) -> Optional[PictureDescriptionBaseModel]:
factory = get_picture_description_factory(
allow_external_plugins=self.pipeline_options.allow_external_plugins
)
return factory.create_instance(
options=self.pipeline_options.picture_description_options,
enabled=self.pipeline_options.do_picture_description,
enable_remote_services=self.pipeline_options.enable_remote_services,
artifacts_path=artifacts_path,
accelerator_options=self.pipeline_options.accelerator_options,
)
@classmethod
@abstractmethod
def get_default_options(cls) -> ConvertPipelineOptions:
pass
class PaginatedPipeline(BasePipeline): # TODO this is a bad name. class PaginatedPipeline(ConvertPipeline): # TODO this is a bad name.
def __init__(self, pipeline_options: PipelineOptions): def __init__(self, pipeline_options: ConvertPipelineOptions):
super().__init__(pipeline_options) super().__init__(pipeline_options)
self.keep_backend = False self.keep_backend = False

View File

@@ -1,7 +1,6 @@
import inspect import inspect
import json import json
import logging import logging
from pathlib import Path
from typing import Optional from typing import Optional
from PIL.Image import Image from PIL.Image import Image
@@ -16,7 +15,10 @@ from docling.datamodel.extraction import (
ExtractionResult, ExtractionResult,
ExtractionTemplateType, ExtractionTemplateType,
) )
from docling.datamodel.pipeline_options import BaseOptions, VlmExtractionPipelineOptions from docling.datamodel.pipeline_options import (
PipelineOptions,
VlmExtractionPipelineOptions,
)
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.models.vlm_models_inline.nuextract_transformers_model import ( from docling.models.vlm_models_inline.nuextract_transformers_model import (
NuExtractTransformersModel, NuExtractTransformersModel,
@@ -35,22 +37,10 @@ class ExtractionVlmPipeline(BaseExtractionPipeline):
self.accelerator_options = pipeline_options.accelerator_options self.accelerator_options = pipeline_options.accelerator_options
self.pipeline_options: VlmExtractionPipelineOptions self.pipeline_options: VlmExtractionPipelineOptions
artifacts_path: Optional[Path] = None
if pipeline_options.artifacts_path is not None:
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
elif settings.artifacts_path is not None:
artifacts_path = Path(settings.artifacts_path).expanduser()
if artifacts_path is not None and not artifacts_path.is_dir():
raise RuntimeError(
f"The value of {artifacts_path=} is not valid. "
"When defined, it must point to a folder containing all models required by the pipeline."
)
# Create VLM model instance # Create VLM model instance
self.vlm_model = NuExtractTransformersModel( self.vlm_model = NuExtractTransformersModel(
enabled=True, enabled=True,
artifacts_path=artifacts_path, # Will download automatically artifacts_path=self.artifacts_path, # Will download automatically
accelerator_options=self.accelerator_options, accelerator_options=self.accelerator_options,
vlm_options=pipeline_options.vlm_options, vlm_options=pipeline_options.vlm_options,
) )
@@ -203,5 +193,5 @@ class ExtractionVlmPipeline(BaseExtractionPipeline):
raise ValueError(f"Unsupported template type: {type(template)}") raise ValueError(f"Unsupported template type: {type(template)}")
@classmethod @classmethod
def get_default_options(cls) -> BaseOptions: def get_default_options(cls) -> PipelineOptions:
return VlmExtractionPipelineOptions() return VlmExtractionPipelineOptions()

View File

@@ -6,21 +6,21 @@ from docling.backend.abstract_backend import (
) )
from docling.datamodel.base_models import ConversionStatus from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PipelineOptions from docling.datamodel.pipeline_options import ConvertPipelineOptions
from docling.pipeline.base_pipeline import BasePipeline from docling.pipeline.base_pipeline import ConvertPipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder from docling.utils.profiling import ProfilingScope, TimeRecorder
_log = logging.getLogger(__name__) _log = logging.getLogger(__name__)
class SimplePipeline(BasePipeline): class SimplePipeline(ConvertPipeline):
"""SimpleModelPipeline. """SimpleModelPipeline.
This class is used at the moment for formats / backends This class is used at the moment for formats / backends
which produce straight DoclingDocument output. which produce straight DoclingDocument output.
""" """
def __init__(self, pipeline_options: PipelineOptions): def __init__(self, pipeline_options: ConvertPipelineOptions):
super().__init__(pipeline_options) super().__init__(pipeline_options)
def _build_document(self, conv_res: ConversionResult) -> ConversionResult: def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
@@ -47,8 +47,8 @@ class SimplePipeline(BasePipeline):
return ConversionStatus.SUCCESS return ConversionStatus.SUCCESS
@classmethod @classmethod
def get_default_options(cls) -> PipelineOptions: def get_default_options(cls) -> ConvertPipelineOptions:
return PipelineOptions() return ConvertPipelineOptions()
@classmethod @classmethod
def is_backend_supported(cls, backend: AbstractDocumentBackend): def is_backend_supported(cls, backend: AbstractDocumentBackend):

View File

@@ -15,18 +15,13 @@ from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel from docling.models.base_ocr_model import BaseOcrModel
from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
from docling.models.document_picture_classifier import ( from docling.models.factories import get_ocr_factory
DocumentPictureClassifier,
DocumentPictureClassifierOptions,
)
from docling.models.factories import get_ocr_factory, get_picture_description_factory
from docling.models.layout_model import LayoutModel from docling.models.layout_model import LayoutModel
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
from docling.models.page_preprocessing_model import ( from docling.models.page_preprocessing_model import (
PagePreprocessingModel, PagePreprocessingModel,
PagePreprocessingOptions, PagePreprocessingOptions,
) )
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
from docling.models.table_structure_model import TableStructureModel from docling.models.table_structure_model import TableStructureModel
from docling.pipeline.base_pipeline import PaginatedPipeline from docling.pipeline.base_pipeline import PaginatedPipeline
@@ -41,18 +36,6 @@ class StandardPdfPipeline(PaginatedPipeline):
super().__init__(pipeline_options) super().__init__(pipeline_options)
self.pipeline_options: PdfPipelineOptions self.pipeline_options: PdfPipelineOptions
artifacts_path: Optional[Path] = None
if pipeline_options.artifacts_path is not None:
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
elif settings.artifacts_path is not None:
artifacts_path = Path(settings.artifacts_path).expanduser()
if artifacts_path is not None and not artifacts_path.is_dir():
raise RuntimeError(
f"The value of {artifacts_path=} is not valid. "
"When defined, it must point to a folder containing all models required by the pipeline."
)
with warnings.catch_warnings(): # deprecated generate_table_images with warnings.catch_warnings(): # deprecated generate_table_images
warnings.filterwarnings("ignore", category=DeprecationWarning) warnings.filterwarnings("ignore", category=DeprecationWarning)
self.keep_images = ( self.keep_images = (
@@ -63,7 +46,7 @@ class StandardPdfPipeline(PaginatedPipeline):
self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions()) self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
ocr_model = self.get_ocr_model(artifacts_path=artifacts_path) ocr_model = self.get_ocr_model(artifacts_path=self.artifacts_path)
self.build_pipe = [ self.build_pipe = [
# Pre-processing # Pre-processing
@@ -76,14 +59,14 @@ class StandardPdfPipeline(PaginatedPipeline):
ocr_model, ocr_model,
# Layout model # Layout model
LayoutModel( LayoutModel(
artifacts_path=artifacts_path, artifacts_path=self.artifacts_path,
accelerator_options=pipeline_options.accelerator_options, accelerator_options=pipeline_options.accelerator_options,
options=pipeline_options.layout_options, options=pipeline_options.layout_options,
), ),
# Table structure model # Table structure model
TableStructureModel( TableStructureModel(
enabled=pipeline_options.do_table_structure, enabled=pipeline_options.do_table_structure,
artifacts_path=artifacts_path, artifacts_path=self.artifacts_path,
options=pipeline_options.table_structure_options, options=pipeline_options.table_structure_options,
accelerator_options=pipeline_options.accelerator_options, accelerator_options=pipeline_options.accelerator_options,
), ),
@@ -91,37 +74,19 @@ class StandardPdfPipeline(PaginatedPipeline):
PageAssembleModel(options=PageAssembleOptions()), PageAssembleModel(options=PageAssembleOptions()),
] ]
# Picture description model
if (
picture_description_model := self.get_picture_description_model(
artifacts_path=artifacts_path
)
) is None:
raise RuntimeError(
f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
)
self.enrichment_pipe = [ self.enrichment_pipe = [
# Code Formula Enrichment Model # Code Formula Enrichment Model
CodeFormulaModel( CodeFormulaModel(
enabled=pipeline_options.do_code_enrichment enabled=pipeline_options.do_code_enrichment
or pipeline_options.do_formula_enrichment, or pipeline_options.do_formula_enrichment,
artifacts_path=artifacts_path, artifacts_path=self.artifacts_path,
options=CodeFormulaModelOptions( options=CodeFormulaModelOptions(
do_code_enrichment=pipeline_options.do_code_enrichment, do_code_enrichment=pipeline_options.do_code_enrichment,
do_formula_enrichment=pipeline_options.do_formula_enrichment, do_formula_enrichment=pipeline_options.do_formula_enrichment,
), ),
accelerator_options=pipeline_options.accelerator_options, accelerator_options=pipeline_options.accelerator_options,
), ),
# Document Picture Classifier *self.enrichment_pipe,
DocumentPictureClassifier(
enabled=pipeline_options.do_picture_classification,
artifacts_path=artifacts_path,
options=DocumentPictureClassifierOptions(),
accelerator_options=pipeline_options.accelerator_options,
),
# Document Picture description
picture_description_model,
] ]
if ( if (
@@ -158,20 +123,6 @@ class StandardPdfPipeline(PaginatedPipeline):
accelerator_options=self.pipeline_options.accelerator_options, accelerator_options=self.pipeline_options.accelerator_options,
) )
def get_picture_description_model(
self, artifacts_path: Optional[Path] = None
) -> Optional[PictureDescriptionBaseModel]:
factory = get_picture_description_factory(
allow_external_plugins=self.pipeline_options.allow_external_plugins
)
return factory.create_instance(
options=self.pipeline_options.picture_description_options,
enabled=self.pipeline_options.do_picture_description,
enable_remote_services=self.pipeline_options.enable_remote_services,
artifacts_path=artifacts_path,
accelerator_options=self.pipeline_options.accelerator_options,
)
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page: def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
with TimeRecorder(conv_res, "page_init"): with TimeRecorder(conv_res, "page_init"):
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore

View File

@@ -32,21 +32,16 @@ from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
from docling.datamodel.settings import settings from docling.datamodel.settings import settings
from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
from docling.models.document_picture_classifier import ( from docling.models.factories import get_ocr_factory
DocumentPictureClassifier,
DocumentPictureClassifierOptions,
)
from docling.models.factories import get_ocr_factory, get_picture_description_factory
from docling.models.layout_model import LayoutModel from docling.models.layout_model import LayoutModel
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
from docling.models.page_preprocessing_model import ( from docling.models.page_preprocessing_model import (
PagePreprocessingModel, PagePreprocessingModel,
PagePreprocessingOptions, PagePreprocessingOptions,
) )
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
from docling.models.table_structure_model import TableStructureModel from docling.models.table_structure_model import TableStructureModel
from docling.pipeline.base_pipeline import BasePipeline from docling.pipeline.base_pipeline import ConvertPipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder from docling.utils.profiling import ProfilingScope, TimeRecorder
from docling.utils.utils import chunkify from docling.utils.utils import chunkify
@@ -294,7 +289,7 @@ class RunContext:
# ────────────────────────────────────────────────────────────────────────────── # ──────────────────────────────────────────────────────────────────────────────
class ThreadedStandardPdfPipeline(BasePipeline): class ThreadedStandardPdfPipeline(ConvertPipeline):
"""High-performance PDF pipeline with multi-threaded stages.""" """High-performance PDF pipeline with multi-threaded stages."""
def __init__(self, pipeline_options: ThreadedPdfPipelineOptions) -> None: def __init__(self, pipeline_options: ThreadedPdfPipelineOptions) -> None:
@@ -310,7 +305,7 @@ class ThreadedStandardPdfPipeline(BasePipeline):
# ──────────────────────────────────────────────────────────────────────── # ────────────────────────────────────────────────────────────────────────
def _init_models(self) -> None: def _init_models(self) -> None:
art_path = self._resolve_artifacts_path() art_path = self.artifacts_path
self.keep_images = ( self.keep_images = (
self.pipeline_options.generate_page_images self.pipeline_options.generate_page_images
or self.pipeline_options.generate_picture_images or self.pipeline_options.generate_picture_images
@@ -337,32 +332,20 @@ class ThreadedStandardPdfPipeline(BasePipeline):
self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions()) self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
# --- optional enrichment ------------------------------------------------ # --- optional enrichment ------------------------------------------------
self.enrichment_pipe = [] self.enrichment_pipe = [
code_formula = CodeFormulaModel( # Code Formula Enrichment Model
CodeFormulaModel(
enabled=self.pipeline_options.do_code_enrichment enabled=self.pipeline_options.do_code_enrichment
or self.pipeline_options.do_formula_enrichment, or self.pipeline_options.do_formula_enrichment,
artifacts_path=art_path, artifacts_path=self.artifacts_path,
options=CodeFormulaModelOptions( options=CodeFormulaModelOptions(
do_code_enrichment=self.pipeline_options.do_code_enrichment, do_code_enrichment=self.pipeline_options.do_code_enrichment,
do_formula_enrichment=self.pipeline_options.do_formula_enrichment, do_formula_enrichment=self.pipeline_options.do_formula_enrichment,
), ),
accelerator_options=self.pipeline_options.accelerator_options, accelerator_options=self.pipeline_options.accelerator_options,
) ),
if code_formula.enabled: *self.enrichment_pipe,
self.enrichment_pipe.append(code_formula) ]
picture_classifier = DocumentPictureClassifier(
enabled=self.pipeline_options.do_picture_classification,
artifacts_path=art_path,
options=DocumentPictureClassifierOptions(),
accelerator_options=self.pipeline_options.accelerator_options,
)
if picture_classifier.enabled:
self.enrichment_pipe.append(picture_classifier)
picture_descr = self._make_picture_description_model(art_path)
if picture_descr and picture_descr.enabled:
self.enrichment_pipe.append(picture_descr)
self.keep_backend = any( self.keep_backend = any(
( (
@@ -374,19 +357,6 @@ class ThreadedStandardPdfPipeline(BasePipeline):
) )
# ---------------------------------------------------------------- helpers # ---------------------------------------------------------------- helpers
def _resolve_artifacts_path(self) -> Optional[Path]:
if self.pipeline_options.artifacts_path:
p = Path(self.pipeline_options.artifacts_path).expanduser()
elif settings.artifacts_path:
p = Path(settings.artifacts_path).expanduser()
else:
return None
if not p.is_dir():
raise RuntimeError(
f"{p} does not exist or is not a directory containing the required models"
)
return p
def _make_ocr_model(self, art_path: Optional[Path]) -> Any: def _make_ocr_model(self, art_path: Optional[Path]) -> Any:
factory = get_ocr_factory( factory = get_ocr_factory(
allow_external_plugins=self.pipeline_options.allow_external_plugins allow_external_plugins=self.pipeline_options.allow_external_plugins
@@ -398,20 +368,6 @@ class ThreadedStandardPdfPipeline(BasePipeline):
accelerator_options=self.pipeline_options.accelerator_options, accelerator_options=self.pipeline_options.accelerator_options,
) )
def _make_picture_description_model(
    self, art_path: Optional[Path]
) -> Optional[PictureDescriptionBaseModel]:
    """Create the picture-description model selected by the pipeline options.

    Args:
        art_path: Directory with local model artifacts, or ``None``; passed
            through unchanged as ``artifacts_path``.

    Returns:
        Whatever the picture-description factory's ``create_instance``
        returns for ``pipeline_options.picture_description_options``.
    """
    # The factory is built first so that external plugins, when allowed by
    # the options, are taken into account for the lookup.
    factory = get_picture_description_factory(
        allow_external_plugins=self.pipeline_options.allow_external_plugins
    )
    return factory.create_instance(
        options=self.pipeline_options.picture_description_options,
        enabled=self.pipeline_options.do_picture_description,
        enable_remote_services=self.pipeline_options.enable_remote_services,
        artifacts_path=art_path,
        accelerator_options=self.pipeline_options.accelerator_options,
    )
# ──────────────────────────────────────────────────────────────────────── # ────────────────────────────────────────────────────────────────────────
# Build - thread pipeline # Build - thread pipeline
# ──────────────────────────────────────────────────────────────────────── # ────────────────────────────────────────────────────────────────────────

View File

@@ -54,18 +54,6 @@ class VlmPipeline(PaginatedPipeline):
self.pipeline_options: VlmPipelineOptions self.pipeline_options: VlmPipelineOptions
artifacts_path: Optional[Path] = None
if pipeline_options.artifacts_path is not None:
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
elif settings.artifacts_path is not None:
artifacts_path = Path(settings.artifacts_path).expanduser()
if artifacts_path is not None and not artifacts_path.is_dir():
raise RuntimeError(
f"The value of {artifacts_path=} is not valid. "
"When defined, it must point to a folder containing all models required by the pipeline."
)
# force_backend_text = False - use text that is coming from VLM response # force_backend_text = False - use text that is coming from VLM response
# force_backend_text = True - get text from backend using bounding boxes predicted by SmolDocling doctags # force_backend_text = True - get text from backend using bounding boxes predicted by SmolDocling doctags
self.force_backend_text = ( self.force_backend_text = (
@@ -89,7 +77,7 @@ class VlmPipeline(PaginatedPipeline):
self.build_pipe = [ self.build_pipe = [
HuggingFaceMlxModel( HuggingFaceMlxModel(
enabled=True, # must be always enabled for this pipeline to make sense. enabled=True, # must be always enabled for this pipeline to make sense.
artifacts_path=artifacts_path, artifacts_path=self.artifacts_path,
accelerator_options=pipeline_options.accelerator_options, accelerator_options=pipeline_options.accelerator_options,
vlm_options=vlm_options, vlm_options=vlm_options,
), ),
@@ -98,7 +86,7 @@ class VlmPipeline(PaginatedPipeline):
self.build_pipe = [ self.build_pipe = [
HuggingFaceTransformersVlmModel( HuggingFaceTransformersVlmModel(
enabled=True, # must be always enabled for this pipeline to make sense. enabled=True, # must be always enabled for this pipeline to make sense.
artifacts_path=artifacts_path, artifacts_path=self.artifacts_path,
accelerator_options=pipeline_options.accelerator_options, accelerator_options=pipeline_options.accelerator_options,
vlm_options=vlm_options, vlm_options=vlm_options,
), ),
@@ -109,7 +97,7 @@ class VlmPipeline(PaginatedPipeline):
self.build_pipe = [ self.build_pipe = [
VllmVlmModel( VllmVlmModel(
enabled=True, # must be always enabled for this pipeline to make sense. enabled=True, # must be always enabled for this pipeline to make sense.
artifacts_path=artifacts_path, artifacts_path=self.artifacts_path,
accelerator_options=pipeline_options.accelerator_options, accelerator_options=pipeline_options.accelerator_options,
vlm_options=vlm_options, vlm_options=vlm_options,
), ),

35
docs/examples/enrich_simple_pipeline.py vendored Normal file
View File

@@ -0,0 +1,35 @@
import logging
from pathlib import Path
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ConvertPipelineOptions
from docling.document_converter import (
DocumentConverter,
HTMLFormatOption,
WordFormatOption,
)
_log = logging.getLogger(__name__)


def main():
    """Convert a sample DOCX file with picture enrichment enabled.

    Builds a ``DocumentConverter`` whose DOCX and HTML format options share
    one ``ConvertPipelineOptions`` instance with picture classification and
    picture description switched on, converts the sample document and
    prints the result as Markdown.
    """
    # Relative path: the example expects to be run from the repository root.
    input_path = Path("tests/data/docx/word_sample.docx")

    pipeline_options = ConvertPipelineOptions()
    pipeline_options.do_picture_classification = True
    pipeline_options.do_picture_description = True

    doc_converter = DocumentConverter(
        format_options={
            InputFormat.DOCX: WordFormatOption(pipeline_options=pipeline_options),
            InputFormat.HTML: HTMLFormatOption(pipeline_options=pipeline_options),
        },
    )

    res = doc_converter.convert(input_path)
    print(res.document.export_to_markdown())


if __name__ == "__main__":
    main()