feat: enrichment steps on all convert pipelines (incl docx, html, etc) (#2251)

* allow enrichment on all convert pipelines

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* set options in CLI

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Author: Michele Dolfi
Date: 2025-09-11 15:09:00 +02:00
Committed by: GitHub
Parent: c6965495a2
Commit: 2c9123419f
12 changed files with 235 additions and 190 deletions

View File

@@ -48,6 +48,7 @@ from docling.datamodel.base_models import (
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AsrPipelineOptions,
ConvertPipelineOptions,
EasyOcrOptions,
OcrOptions,
PaginatedPipelineOptions,
@@ -71,8 +72,13 @@ from docling.datamodel.vlm_model_specs import (
from docling.document_converter import (
AudioFormatOption,
DocumentConverter,
ExcelFormatOption,
FormatOption,
HTMLFormatOption,
MarkdownFormatOption,
PdfFormatOption,
PowerpointFormatOption,
WordFormatOption,
)
from docling.models.factories import get_ocr_factory
from docling.pipeline.asr_pipeline import AsrPipeline
@@ -626,10 +632,33 @@ def convert( # noqa: C901
backend=MetsGbsDocumentBackend,
)
# SimplePipeline options
simple_format_option = ConvertPipelineOptions(
do_picture_description=enrich_picture_description,
do_picture_classification=enrich_picture_classes,
)
if artifacts_path is not None:
simple_format_option.artifacts_path = artifacts_path
format_options = {
InputFormat.PDF: pdf_format_option,
InputFormat.IMAGE: pdf_format_option,
InputFormat.METS_GBS: mets_gbs_format_option,
InputFormat.DOCX: WordFormatOption(
pipeline_options=simple_format_option
),
InputFormat.PPTX: PowerpointFormatOption(
pipeline_options=simple_format_option
),
InputFormat.XLSX: ExcelFormatOption(
pipeline_options=simple_format_option
),
InputFormat.HTML: HTMLFormatOption(
pipeline_options=simple_format_option
),
InputFormat.MD: MarkdownFormatOption(
pipeline_options=simple_format_option
),
}
elif pipeline == ProcessingPipeline.VLM:
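
For context, the hunk above routes the existing CLI enrichment flags (the enrich_picture_description and enrich_picture_classes parameters) into every SimplePipeline-backed format. A minimal sketch of the equivalent Python API call, assuming the flag-to-option mapping shown above; the input file name is only a placeholder:

# Roughly what running the CLI with picture enrichment enabled on a DOCX now does.
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ConvertPipelineOptions
from docling.document_converter import DocumentConverter, WordFormatOption

simple_opts = ConvertPipelineOptions(
    do_picture_classification=True,
    do_picture_description=True,
)
converter = DocumentConverter(
    format_options={InputFormat.DOCX: WordFormatOption(pipeline_options=simple_opts)}
)
result = converter.convert("report.docx")  # placeholder input file
print(result.document.export_to_markdown())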

View File

@@ -259,11 +259,21 @@ class PipelineOptions(BaseOptions):
accelerator_options: AcceleratorOptions = AcceleratorOptions()
enable_remote_services: bool = False
allow_external_plugins: bool = False
class PaginatedPipelineOptions(PipelineOptions):
artifacts_path: Optional[Union[Path, str]] = None
class ConvertPipelineOptions(PipelineOptions):
"""Base convert pipeline options."""
do_picture_classification: bool = False # True: classify pictures in documents
do_picture_description: bool = False # True: run describe pictures in documents
picture_description_options: PictureDescriptionBaseOptions = (
smolvlm_picture_description
)
class PaginatedPipelineOptions(ConvertPipelineOptions):
images_scale: float = 1.0
generate_page_images: bool = False
generate_picture_images: bool = False
@@ -295,13 +305,11 @@ class LayoutOptions(BaseModel):
class AsrPipelineOptions(PipelineOptions):
asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
artifacts_path: Optional[Union[Path, str]] = None
class VlmExtractionPipelineOptions(PipelineOptions):
"""Options for extraction pipeline."""
artifacts_path: Optional[Union[Path, str]] = None
vlm_options: Union[InlineVlmOptions] = NU_EXTRACT_2B_TRANSFORMERS
@@ -312,8 +320,6 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
do_ocr: bool = True # True: perform OCR, replace programmatic PDF text
do_code_enrichment: bool = False # True: perform code OCR
do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
do_picture_classification: bool = False # True: classify pictures in documents
do_picture_description: bool = False # True: run describe pictures in documents
force_backend_text: bool = (
False # (To be used with vlms, or other generative models)
)
@@ -321,9 +327,6 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
table_structure_options: TableStructureOptions = TableStructureOptions()
ocr_options: OcrOptions = EasyOcrOptions()
picture_description_options: PictureDescriptionBaseOptions = (
smolvlm_picture_description
)
layout_options: LayoutOptions = LayoutOptions()
images_scale: float = 1.0
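
A small sketch of how the reshaped hierarchy reads from the caller's side: ConvertPipelineOptions now carries the picture enrichment switches (defaulting to the smolvlm description preset), and PaginatedPipelineOptions, hence PdfPipelineOptions, inherits them rather than declaring its own copies. The option values below are illustrative only:

from docling.datamodel.pipeline_options import (
    ConvertPipelineOptions,
    PdfPipelineOptions,
)

# Shared switches, usable with SimplePipeline formats (DOCX, PPTX, XLSX, HTML, MD).
common_opts = ConvertPipelineOptions(
    do_picture_classification=True,
    do_picture_description=True,
)

# PdfPipelineOptions picks up the same fields through PaginatedPipelineOptions,
# so the picture enrichment knobs are spelled identically for PDF and non-PDF.
pdf_opts = PdfPipelineOptions(
    do_picture_classification=True,
    do_picture_description=True,
)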

View File

@@ -4,7 +4,13 @@ from collections.abc import Iterable
from typing import Any, Generic, Optional, Protocol, Type, Union
import numpy as np
from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
from docling_core.types.doc import (
BoundingBox,
DocItem,
DoclingDocument,
NodeItem,
PictureItem,
)
from PIL.Image import Image
from typing_extensions import TypeVar
@@ -164,8 +170,17 @@ class BaseItemAndImageEnrichmentModel(
return None
assert isinstance(element, DocItem)
element_prov = element.prov[0]
# Allow the case of documents without page images but with embedded images (e.g. Word and HTML docs)
if len(element.prov) == 0 and isinstance(element, PictureItem):
embedded_im = element.get_image(conv_res.document)
if embedded_im is not None:
return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
else:
return None
# Crop the image from the page
element_prov = element.prov[0]
bbox = element_prov.bbox
width = bbox.r - bbox.l
height = bbox.t - bbox.b
@@ -183,4 +198,14 @@ class BaseItemAndImageEnrichmentModel(
cropped_image = conv_res.pages[page_ix].get_image(
scale=self.images_scale, cropbox=expanded_bbox
)
# Allow for images being embedded without the page backend or page images
if cropped_image is None and isinstance(element, PictureItem):
embedded_im = element.get_image(conv_res.document)
if embedded_im is not None:
return ItemAndImageEnrichmentElement(item=element, image=embedded_im)
else:
return None
# Return the proper cropped image
return ItemAndImageEnrichmentElement(item=element, image=cropped_image)
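
The fallback above relies on PictureItem.get_image(document) returning the embedded image data when no rendered page image is available. A minimal sketch of checking which pictures of a backend-only conversion expose such an image, assuming the Word backend stores image data on the PictureItem (the sample path is taken from the example file added in this commit):

from docling.document_converter import DocumentConverter
from docling_core.types.doc import PictureItem

doc = DocumentConverter().convert("tests/data/docx/word_sample.docx").document
for item, _level in doc.iterate_items():
    if isinstance(item, PictureItem):
        image = item.get_image(doc)  # embedded image data, no page rendering involved
        print(item.self_ref, "no image" if image is None else image.size)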

View File

@@ -208,25 +208,13 @@ class AsrPipeline(BasePipeline):
self.pipeline_options: AsrPipelineOptions = pipeline_options
artifacts_path: Optional[Path] = None
if pipeline_options.artifacts_path is not None:
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
elif settings.artifacts_path is not None:
artifacts_path = Path(settings.artifacts_path).expanduser()
if artifacts_path is not None and not artifacts_path.is_dir():
raise RuntimeError(
f"The value of {artifacts_path=} is not valid. "
"When defined, it must point to a folder containing all models required by the pipeline."
)
if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions):
asr_options: InlineAsrNativeWhisperOptions = (
self.pipeline_options.asr_options
)
self._model = _NativeWhisperModel(
enabled=True, # must be always enabled for this pipeline to make sense.
artifacts_path=artifacts_path,
artifacts_path=self.artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
asr_options=asr_options,
)

View File

@@ -1,19 +1,33 @@
import logging
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Optional
from docling.datamodel.base_models import ConversionStatus, ErrorItem
from docling.datamodel.document import InputDocument
from docling.datamodel.extraction import ExtractionResult, ExtractionTemplateType
from docling.datamodel.pipeline_options import BaseOptions
from docling.datamodel.pipeline_options import BaseOptions, PipelineOptions
from docling.datamodel.settings import settings
_log = logging.getLogger(__name__)
class BaseExtractionPipeline(ABC):
def __init__(self, pipeline_options: BaseOptions):
def __init__(self, pipeline_options: PipelineOptions):
self.pipeline_options = pipeline_options
self.artifacts_path: Optional[Path] = None
if pipeline_options.artifacts_path is not None:
self.artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
elif settings.artifacts_path is not None:
self.artifacts_path = Path(settings.artifacts_path).expanduser()
if self.artifacts_path is not None and not self.artifacts_path.is_dir():
raise RuntimeError(
f"The value of {self.artifacts_path=} is not valid. "
"When defined, it must point to a folder containing all models required by the pipeline."
)
def execute(
self,
in_doc: InputDocument,
@@ -54,5 +68,5 @@ class BaseExtractionPipeline(ABC):
@classmethod
@abstractmethod
def get_default_options(cls) -> BaseOptions:
def get_default_options(cls) -> PipelineOptions:
pass

View File

@@ -4,7 +4,8 @@ import time
import traceback
from abc import ABC, abstractmethod
from collections.abc import Iterable
from typing import Any, Callable, List
from pathlib import Path
from typing import Any, Callable, List, Optional
from docling_core.types.doc import NodeItem
@@ -20,9 +21,19 @@ from docling.datamodel.base_models import (
Page,
)
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import PdfPipelineOptions, PipelineOptions
from docling.datamodel.pipeline_options import (
ConvertPipelineOptions,
PdfPipelineOptions,
PipelineOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_model import GenericEnrichmentModel
from docling.models.document_picture_classifier import (
DocumentPictureClassifier,
DocumentPictureClassifierOptions,
)
from docling.models.factories import get_picture_description_factory
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
from docling.utils.profiling import ProfilingScope, TimeRecorder
from docling.utils.utils import chunkify
@@ -36,6 +47,18 @@ class BasePipeline(ABC):
self.build_pipe: List[Callable] = []
self.enrichment_pipe: List[GenericEnrichmentModel[Any]] = []
self.artifacts_path: Optional[Path] = None
if pipeline_options.artifacts_path is not None:
self.artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
elif settings.artifacts_path is not None:
self.artifacts_path = Path(settings.artifacts_path).expanduser()
if self.artifacts_path is not None and not self.artifacts_path.is_dir():
raise RuntimeError(
f"The value of {self.artifacts_path=} is not valid. "
"When defined, it must point to a folder containing all models required by the pipeline."
)
def execute(self, in_doc: InputDocument, raises_on_error: bool) -> ConversionResult:
conv_res = ConversionResult(input=in_doc)
@@ -108,15 +131,58 @@ class BasePipeline(ABC):
def is_backend_supported(cls, backend: AbstractDocumentBackend):
pass
# def _apply_on_elements(self, element_batch: Iterable[NodeItem]) -> Iterable[Any]:
# for model in self.build_pipe:
# element_batch = model(element_batch)
#
# yield from element_batch
class ConvertPipeline(BasePipeline):
def __init__(self, pipeline_options: ConvertPipelineOptions):
super().__init__(pipeline_options)
self.pipeline_options: ConvertPipelineOptions
# ------ Common enrichment models working on all backends
# Picture description model
if (
picture_description_model := self._get_picture_description_model(
artifacts_path=self.artifacts_path
)
) is None:
raise RuntimeError(
f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
)
self.enrichment_pipe = [
# Document Picture Classifier
DocumentPictureClassifier(
enabled=pipeline_options.do_picture_classification,
artifacts_path=self.artifacts_path,
options=DocumentPictureClassifierOptions(),
accelerator_options=pipeline_options.accelerator_options,
),
# Document Picture description
picture_description_model,
]
def _get_picture_description_model(
self, artifacts_path: Optional[Path] = None
) -> Optional[PictureDescriptionBaseModel]:
factory = get_picture_description_factory(
allow_external_plugins=self.pipeline_options.allow_external_plugins
)
return factory.create_instance(
options=self.pipeline_options.picture_description_options,
enabled=self.pipeline_options.do_picture_description,
enable_remote_services=self.pipeline_options.enable_remote_services,
artifacts_path=artifacts_path,
accelerator_options=self.pipeline_options.accelerator_options,
)
@classmethod
@abstractmethod
def get_default_options(cls) -> ConvertPipelineOptions:
pass
class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
def __init__(self, pipeline_options: PipelineOptions):
class PaginatedPipeline(ConvertPipeline): # TODO this is a bad name.
def __init__(self, pipeline_options: ConvertPipelineOptions):
super().__init__(pipeline_options)
self.keep_backend = False
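
With this refactor, every ConvertPipeline subclass gets the picture classifier and picture description enrichment models for free, and artifacts_path resolution happens once in BasePipeline. A sketch of pointing a non-PDF pipeline at a local model folder, assuming artifacts_path is available on the shared options (as the CLI hunk above sets it on ConvertPipelineOptions); the cache path is only a placeholder:

from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ConvertPipelineOptions
from docling.document_converter import DocumentConverter, HTMLFormatOption

opts = ConvertPipelineOptions(do_picture_classification=True)
# The shared BasePipeline logic expands this with expanduser() and raises a
# RuntimeError unless it points to an existing folder of prefetched models.
opts.artifacts_path = Path("~/.cache/docling/models").expanduser()  # placeholder path

converter = DocumentConverter(
    format_options={InputFormat.HTML: HTMLFormatOption(pipeline_options=opts)}
)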

View File

@@ -1,7 +1,6 @@
import inspect
import json
import logging
from pathlib import Path
from typing import Optional
from PIL.Image import Image
@@ -16,7 +15,10 @@ from docling.datamodel.extraction import (
ExtractionResult,
ExtractionTemplateType,
)
from docling.datamodel.pipeline_options import BaseOptions, VlmExtractionPipelineOptions
from docling.datamodel.pipeline_options import (
PipelineOptions,
VlmExtractionPipelineOptions,
)
from docling.datamodel.settings import settings
from docling.models.vlm_models_inline.nuextract_transformers_model import (
NuExtractTransformersModel,
@@ -35,22 +37,10 @@ class ExtractionVlmPipeline(BaseExtractionPipeline):
self.accelerator_options = pipeline_options.accelerator_options
self.pipeline_options: VlmExtractionPipelineOptions
artifacts_path: Optional[Path] = None
if pipeline_options.artifacts_path is not None:
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
elif settings.artifacts_path is not None:
artifacts_path = Path(settings.artifacts_path).expanduser()
if artifacts_path is not None and not artifacts_path.is_dir():
raise RuntimeError(
f"The value of {artifacts_path=} is not valid. "
"When defined, it must point to a folder containing all models required by the pipeline."
)
# Create VLM model instance
self.vlm_model = NuExtractTransformersModel(
enabled=True,
artifacts_path=artifacts_path, # Will download automatically
artifacts_path=self.artifacts_path, # Will download automatically
accelerator_options=self.accelerator_options,
vlm_options=pipeline_options.vlm_options,
)
@@ -203,5 +193,5 @@ class ExtractionVlmPipeline(BaseExtractionPipeline):
raise ValueError(f"Unsupported template type: {type(template)}")
@classmethod
def get_default_options(cls) -> BaseOptions:
def get_default_options(cls) -> PipelineOptions:
return VlmExtractionPipelineOptions()

View File

@@ -6,21 +6,21 @@ from docling.backend.abstract_backend import (
)
from docling.datamodel.base_models import ConversionStatus
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PipelineOptions
from docling.pipeline.base_pipeline import BasePipeline
from docling.datamodel.pipeline_options import ConvertPipelineOptions
from docling.pipeline.base_pipeline import ConvertPipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder
_log = logging.getLogger(__name__)
class SimplePipeline(BasePipeline):
class SimplePipeline(ConvertPipeline):
"""SimpleModelPipeline.
This class is used at the moment for formats / backends
which produce straight DoclingDocument output.
"""
def __init__(self, pipeline_options: PipelineOptions):
def __init__(self, pipeline_options: ConvertPipelineOptions):
super().__init__(pipeline_options)
def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
@@ -47,8 +47,8 @@ class SimplePipeline(BasePipeline):
return ConversionStatus.SUCCESS
@classmethod
def get_default_options(cls) -> PipelineOptions:
return PipelineOptions()
def get_default_options(cls) -> ConvertPipelineOptions:
return ConvertPipelineOptions()
@classmethod
def is_backend_supported(cls, backend: AbstractDocumentBackend):

View File

@@ -15,18 +15,13 @@ from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
from docling.models.document_picture_classifier import (
DocumentPictureClassifier,
DocumentPictureClassifierOptions,
)
from docling.models.factories import get_ocr_factory, get_picture_description_factory
from docling.models.factories import get_ocr_factory
from docling.models.layout_model import LayoutModel
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
from docling.models.page_preprocessing_model import (
PagePreprocessingModel,
PagePreprocessingOptions,
)
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
from docling.models.table_structure_model import TableStructureModel
from docling.pipeline.base_pipeline import PaginatedPipeline
@@ -41,18 +36,6 @@ class StandardPdfPipeline(PaginatedPipeline):
super().__init__(pipeline_options)
self.pipeline_options: PdfPipelineOptions
artifacts_path: Optional[Path] = None
if pipeline_options.artifacts_path is not None:
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
elif settings.artifacts_path is not None:
artifacts_path = Path(settings.artifacts_path).expanduser()
if artifacts_path is not None and not artifacts_path.is_dir():
raise RuntimeError(
f"The value of {artifacts_path=} is not valid. "
"When defined, it must point to a folder containing all models required by the pipeline."
)
with warnings.catch_warnings(): # deprecated generate_table_images
warnings.filterwarnings("ignore", category=DeprecationWarning)
self.keep_images = (
@@ -63,7 +46,7 @@ class StandardPdfPipeline(PaginatedPipeline):
self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
ocr_model = self.get_ocr_model(artifacts_path=artifacts_path)
ocr_model = self.get_ocr_model(artifacts_path=self.artifacts_path)
self.build_pipe = [
# Pre-processing
@@ -76,14 +59,14 @@ class StandardPdfPipeline(PaginatedPipeline):
ocr_model,
# Layout model
LayoutModel(
artifacts_path=artifacts_path,
artifacts_path=self.artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
options=pipeline_options.layout_options,
),
# Table structure model
TableStructureModel(
enabled=pipeline_options.do_table_structure,
artifacts_path=artifacts_path,
artifacts_path=self.artifacts_path,
options=pipeline_options.table_structure_options,
accelerator_options=pipeline_options.accelerator_options,
),
@@ -91,37 +74,19 @@ class StandardPdfPipeline(PaginatedPipeline):
PageAssembleModel(options=PageAssembleOptions()),
]
# Picture description model
if (
picture_description_model := self.get_picture_description_model(
artifacts_path=artifacts_path
)
) is None:
raise RuntimeError(
f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
)
self.enrichment_pipe = [
# Code Formula Enrichment Model
CodeFormulaModel(
enabled=pipeline_options.do_code_enrichment
or pipeline_options.do_formula_enrichment,
artifacts_path=artifacts_path,
artifacts_path=self.artifacts_path,
options=CodeFormulaModelOptions(
do_code_enrichment=pipeline_options.do_code_enrichment,
do_formula_enrichment=pipeline_options.do_formula_enrichment,
),
accelerator_options=pipeline_options.accelerator_options,
),
# Document Picture Classifier
DocumentPictureClassifier(
enabled=pipeline_options.do_picture_classification,
artifacts_path=artifacts_path,
options=DocumentPictureClassifierOptions(),
accelerator_options=pipeline_options.accelerator_options,
),
# Document Picture description
picture_description_model,
*self.enrichment_pipe,
]
if (
@@ -158,20 +123,6 @@ class StandardPdfPipeline(PaginatedPipeline):
accelerator_options=self.pipeline_options.accelerator_options,
)
def get_picture_description_model(
self, artifacts_path: Optional[Path] = None
) -> Optional[PictureDescriptionBaseModel]:
factory = get_picture_description_factory(
allow_external_plugins=self.pipeline_options.allow_external_plugins
)
return factory.create_instance(
options=self.pipeline_options.picture_description_options,
enabled=self.pipeline_options.do_picture_description,
enable_remote_services=self.pipeline_options.enable_remote_services,
artifacts_path=artifacts_path,
accelerator_options=self.pipeline_options.accelerator_options,
)
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
with TimeRecorder(conv_res, "page_init"):
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
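
From a user's perspective the PDF pipeline is configured exactly as before; the picture enrichment models are simply inherited from ConvertPipeline and prepended with the PDF-only code/formula model. A short sketch with the confirmed option fields:

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pdf_opts = PdfPipelineOptions()
pdf_opts.do_code_enrichment = True         # stays PDF-specific
pdf_opts.do_formula_enrichment = True      # stays PDF-specific
pdf_opts.do_picture_classification = True  # now inherited from ConvertPipelineOptions
pdf_opts.do_picture_description = True     # now inherited from ConvertPipelineOptions

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_opts)}
)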

View File

@@ -32,21 +32,16 @@ from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
from docling.datamodel.settings import settings
from docling.models.code_formula_model import CodeFormulaModel, CodeFormulaModelOptions
from docling.models.document_picture_classifier import (
DocumentPictureClassifier,
DocumentPictureClassifierOptions,
)
from docling.models.factories import get_ocr_factory, get_picture_description_factory
from docling.models.factories import get_ocr_factory
from docling.models.layout_model import LayoutModel
from docling.models.page_assemble_model import PageAssembleModel, PageAssembleOptions
from docling.models.page_preprocessing_model import (
PagePreprocessingModel,
PagePreprocessingOptions,
)
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
from docling.models.readingorder_model import ReadingOrderModel, ReadingOrderOptions
from docling.models.table_structure_model import TableStructureModel
from docling.pipeline.base_pipeline import BasePipeline
from docling.pipeline.base_pipeline import ConvertPipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder
from docling.utils.utils import chunkify
@@ -294,7 +289,7 @@ class RunContext:
# ──────────────────────────────────────────────────────────────────────────────
class ThreadedStandardPdfPipeline(BasePipeline):
class ThreadedStandardPdfPipeline(ConvertPipeline):
"""High-performance PDF pipeline with multi-threaded stages."""
def __init__(self, pipeline_options: ThreadedPdfPipelineOptions) -> None:
@@ -310,7 +305,7 @@ class ThreadedStandardPdfPipeline(BasePipeline):
# ────────────────────────────────────────────────────────────────────────
def _init_models(self) -> None:
art_path = self._resolve_artifacts_path()
art_path = self.artifacts_path
self.keep_images = (
self.pipeline_options.generate_page_images
or self.pipeline_options.generate_picture_images
@@ -337,32 +332,20 @@ class ThreadedStandardPdfPipeline(BasePipeline):
self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
# --- optional enrichment ------------------------------------------------
self.enrichment_pipe = []
code_formula = CodeFormulaModel(
self.enrichment_pipe = [
# Code Formula Enrichment Model
CodeFormulaModel(
enabled=self.pipeline_options.do_code_enrichment
or self.pipeline_options.do_formula_enrichment,
artifacts_path=art_path,
artifacts_path=self.artifacts_path,
options=CodeFormulaModelOptions(
do_code_enrichment=self.pipeline_options.do_code_enrichment,
do_formula_enrichment=self.pipeline_options.do_formula_enrichment,
),
accelerator_options=self.pipeline_options.accelerator_options,
)
if code_formula.enabled:
self.enrichment_pipe.append(code_formula)
picture_classifier = DocumentPictureClassifier(
enabled=self.pipeline_options.do_picture_classification,
artifacts_path=art_path,
options=DocumentPictureClassifierOptions(),
accelerator_options=self.pipeline_options.accelerator_options,
)
if picture_classifier.enabled:
self.enrichment_pipe.append(picture_classifier)
picture_descr = self._make_picture_description_model(art_path)
if picture_descr and picture_descr.enabled:
self.enrichment_pipe.append(picture_descr)
),
*self.enrichment_pipe,
]
self.keep_backend = any(
(
@@ -374,19 +357,6 @@ class ThreadedStandardPdfPipeline(BasePipeline):
)
# ---------------------------------------------------------------- helpers
def _resolve_artifacts_path(self) -> Optional[Path]:
if self.pipeline_options.artifacts_path:
p = Path(self.pipeline_options.artifacts_path).expanduser()
elif settings.artifacts_path:
p = Path(settings.artifacts_path).expanduser()
else:
return None
if not p.is_dir():
raise RuntimeError(
f"{p} does not exist or is not a directory containing the required models"
)
return p
def _make_ocr_model(self, art_path: Optional[Path]) -> Any:
factory = get_ocr_factory(
allow_external_plugins=self.pipeline_options.allow_external_plugins
@@ -398,20 +368,6 @@ class ThreadedStandardPdfPipeline(BasePipeline):
accelerator_options=self.pipeline_options.accelerator_options,
)
def _make_picture_description_model(
self, art_path: Optional[Path]
) -> Optional[PictureDescriptionBaseModel]:
factory = get_picture_description_factory(
allow_external_plugins=self.pipeline_options.allow_external_plugins
)
return factory.create_instance(
options=self.pipeline_options.picture_description_options,
enabled=self.pipeline_options.do_picture_description,
enable_remote_services=self.pipeline_options.enable_remote_services,
artifacts_path=art_path,
accelerator_options=self.pipeline_options.accelerator_options,
)
# ────────────────────────────────────────────────────────────────────────
# Build - thread pipeline
# ────────────────────────────────────────────────────────────────────────
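
The threaded pipeline likewise drops its private classifier/description wiring and reuses the enrichment models built by ConvertPipeline. Selecting it should be unchanged; the sketch below assumes the pipeline_cls override on PdfFormatOption and the threaded_standard_pdf_pipeline module path:

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ThreadedPdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.threaded_standard_pdf_pipeline import ThreadedStandardPdfPipeline

threaded_opts = ThreadedPdfPipelineOptions(do_picture_classification=True)
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=ThreadedStandardPdfPipeline,  # pipeline_cls assumed available
            pipeline_options=threaded_opts,
        )
    }
)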

View File

@@ -54,18 +54,6 @@ class VlmPipeline(PaginatedPipeline):
self.pipeline_options: VlmPipelineOptions
artifacts_path: Optional[Path] = None
if pipeline_options.artifacts_path is not None:
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
elif settings.artifacts_path is not None:
artifacts_path = Path(settings.artifacts_path).expanduser()
if artifacts_path is not None and not artifacts_path.is_dir():
raise RuntimeError(
f"The value of {artifacts_path=} is not valid. "
"When defined, it must point to a folder containing all models required by the pipeline."
)
# force_backend_text = False - use text that is coming from VLM response
# force_backend_text = True - get text from backend using bounding boxes predicted by SmolDocling doctags
self.force_backend_text = (
@@ -89,7 +77,7 @@ class VlmPipeline(PaginatedPipeline):
self.build_pipe = [
HuggingFaceMlxModel(
enabled=True, # must be always enabled for this pipeline to make sense.
artifacts_path=artifacts_path,
artifacts_path=self.artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
vlm_options=vlm_options,
),
@@ -98,7 +86,7 @@ class VlmPipeline(PaginatedPipeline):
self.build_pipe = [
HuggingFaceTransformersVlmModel(
enabled=True, # must be always enabled for this pipeline to make sense.
artifacts_path=artifacts_path,
artifacts_path=self.artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
vlm_options=vlm_options,
),
@@ -109,7 +97,7 @@ class VlmPipeline(PaginatedPipeline):
self.build_pipe = [
VllmVlmModel(
enabled=True, # must be always enabled for this pipeline to make sense.
artifacts_path=artifacts_path,
artifacts_path=self.artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
vlm_options=vlm_options,
),

docs/examples/enrich_simple_pipeline.py (new file, 35 lines)
View File

@@ -0,0 +1,35 @@
import logging
from pathlib import Path
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import ConvertPipelineOptions
from docling.document_converter import (
DocumentConverter,
HTMLFormatOption,
WordFormatOption,
)
_log = logging.getLogger(__name__)
def main():
input_path = Path("tests/data/docx/word_sample.docx")
pipeline_options = ConvertPipelineOptions()
pipeline_options.do_picture_classification = True
pipeline_options.do_picture_description = True
doc_converter = DocumentConverter(
format_options={
InputFormat.DOCX: WordFormatOption(pipeline_options=pipeline_options),
InputFormat.HTML: HTMLFormatOption(pipeline_options=pipeline_options),
},
)
res = doc_converter.convert(input_path)
print(res.document.export_to_markdown())
if __name__ == "__main__":
main()
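
To see what the enrichment steps produced, the resulting document can be inspected for picture annotations. A small sketch of such a helper, assuming the annotation entries live on PictureItem.annotations as exported by docling_core; it could be called as print_picture_annotations(res.document) inside main() above:

from docling_core.types.doc import DoclingDocument, PictureItem


def print_picture_annotations(doc: DoclingDocument) -> None:
    for item, _level in doc.iterate_items():
        if isinstance(item, PictureItem):
            for annotation in item.annotations:
                # entries such as picture classification or description results
                print(item.self_ref, type(annotation).__name__)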