diff --git a/docling/pipeline/base_pipeline.py b/docling/pipeline/base_pipeline.py index d08cf85a..a0796349 100644 --- a/docling/pipeline/base_pipeline.py +++ b/docling/pipeline/base_pipeline.py @@ -116,10 +116,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name. def __init__(self, pipeline_options: PipelineOptions): super().__init__(pipeline_options) - self.keep_backend = ( - True # For now, need to be able to query for page size post prediction - ) - # self.keep_backend = False + self.keep_backend = False def _apply_on_pages( self, conv_res: ConversionResult, page_batch: Iterable[Page] diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index 04ae40d9..da512d3d 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -1,6 +1,7 @@ import itertools import logging import re +import warnings # from io import BytesIO from pathlib import Path @@ -28,7 +29,8 @@ from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend from docling.datamodel.base_models import Page from docling.datamodel.document import ConversionResult -from docling.datamodel.pipeline_options import PdfPipelineOptions +from docling.datamodel.pipeline_options import PdfPipelineOptions, VlmPipelineOptions +from docling.datamodel.settings import settings from docling.models.smol_docling_model import SmolDoclingModel from docling.pipeline.base_pipeline import PaginatedPipeline from docling.utils.profiling import ProfilingScope, TimeRecorder @@ -38,9 +40,29 @@ _log = logging.getLogger(__name__) class VlmPipeline(PaginatedPipeline): - def __init__(self, pipeline_options: PdfPipelineOptions): + def __init__(self, pipeline_options: VlmPipelineOptions): super().__init__(pipeline_options) - self.pipeline_options: PdfPipelineOptions + self.keep_backend = True + + warnings.warn( + "This API is currently experimental and may change in upcoming versions without notice.", + category=UserWarning, + stacklevel=2, + ) + + self.pipeline_options: VlmPipelineOptions + + artifacts_path: Optional[Path] = None + if pipeline_options.artifacts_path is not None: + artifacts_path = Path(pipeline_options.artifacts_path).expanduser() + elif settings.artifacts_path is not None: + artifacts_path = Path(settings.artifacts_path).expanduser() + + if artifacts_path is not None and not artifacts_path.is_dir(): + raise RuntimeError( + f"The value of {artifacts_path=} is not valid. " + "When defined, it must point to a folder containing all models required by the pipeline." + ) # force_backend_text = False - use text that is coming from SmolDocling # force_backend_text = True - get text from backend using bounding boxes predicted by SmolDoclingss