Moved keep_backend = True to vlm pipeline

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maksym Lysak 2025-02-13 17:53:03 +01:00
parent 853544ba11
commit 0c60ef199a
2 changed files with 26 additions and 7 deletions

View File

@ -116,10 +116,7 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
def __init__(self, pipeline_options: PipelineOptions):
super().__init__(pipeline_options)
self.keep_backend = (
True # For now, need to be able to query for page size post prediction
)
# self.keep_backend = False
self.keep_backend = False
def _apply_on_pages(
self, conv_res: ConversionResult, page_batch: Iterable[Page]

View File

@ -1,6 +1,7 @@
import itertools
import logging
import re
import warnings
# from io import BytesIO
from pathlib import Path
@ -28,7 +29,8 @@ from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.pipeline_options import PdfPipelineOptions, VlmPipelineOptions
from docling.datamodel.settings import settings
from docling.models.smol_docling_model import SmolDoclingModel
from docling.pipeline.base_pipeline import PaginatedPipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder
@ -38,9 +40,29 @@ _log = logging.getLogger(__name__)
class VlmPipeline(PaginatedPipeline):
def __init__(self, pipeline_options: PdfPipelineOptions):
def __init__(self, pipeline_options: VlmPipelineOptions):
super().__init__(pipeline_options)
self.pipeline_options: PdfPipelineOptions
self.keep_backend = True
warnings.warn(
"This API is currently experimental and may change in upcoming versions without notice.",
category=UserWarning,
stacklevel=2,
)
self.pipeline_options: VlmPipelineOptions
artifacts_path: Optional[Path] = None
if pipeline_options.artifacts_path is not None:
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
elif settings.artifacts_path is not None:
artifacts_path = Path(settings.artifacts_path).expanduser()
if artifacts_path is not None and not artifacts_path.is_dir():
raise RuntimeError(
f"The value of {artifacts_path=} is not valid. "
"When defined, it must point to a folder containing all models required by the pipeline."
)
# force_backend_text = False - use text that is coming from SmolDocling
# force_backend_text = True - get text from backend using bounding boxes predicted by SmolDocling