commit 0418d2887f
Peter W. J. Staar, 2025-07-23 14:06:06 +02:00, committed by GitHub
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
15 changed files with 506 additions and 79 deletions

View File

@ -63,6 +63,7 @@ from docling.datamodel.vlm_model_specs import (
GRANITE_VISION_TRANSFORMERS,
SMOLDOCLING_MLX,
SMOLDOCLING_TRANSFORMERS,
VLM2STAGE,
VlmModelType,
)
from docling.document_converter import (
@ -627,6 +628,12 @@ def convert( # noqa: C901
"To run SmolDocling faster, please install mlx-vlm:\n"
"pip install mlx-vlm"
)
elif vlm_model == VlmModelType.VLM2STAGE:
pipeline_options.vlm_options = VLM2STAGE
else:
raise ValueError(
f"{vlm_model} is not of type GRANITE_VISION, GRANITE_VISION_OLLAMA, SMOLDOCLING_TRANSFORMERS or VLM2STAGE"
)
pdf_format_option = PdfFormatOption(
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options

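For orientation, a minimal sketch of what the new `vlm2stage` CLI branch wires up when driven from Python instead; everything except VLM2STAGE is the pre-existing converter API, and the input file name is illustrative:

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel.vlm_model_specs import VLM2STAGE
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Select the new two-stage preset (layout model + VLM) for the VLM pipeline.
pipeline_options = VlmPipelineOptions(vlm_options=VLM2STAGE)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
        )
    }
)
result = converter.convert("example.pdf")  # hypothetical input document
print(result.document.export_to_markdown())
```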
View File

@ -11,12 +11,13 @@ from docling.datamodel.pipeline_options_asr_model import (
# ApiAsrOptions,
InferenceAsrFramework,
InlineAsrNativeWhisperOptions,
TransformersModelType,
InlineAsrOptions,
# TransformersModelType,
)
_log = logging.getLogger(__name__)
WHISPER_TINY = InlineAsrNativeWhisperOptions(
WHISPER_TINY: InlineAsrOptions = InlineAsrNativeWhisperOptions(
repo_id="tiny",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
@ -27,7 +28,7 @@ WHISPER_TINY = InlineAsrNativeWhisperOptions(
max_time_chunk=30.0,
)
WHISPER_SMALL = InlineAsrNativeWhisperOptions(
WHISPER_SMALL: InlineAsrOptions = InlineAsrNativeWhisperOptions(
repo_id="small",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
@ -38,7 +39,7 @@ WHISPER_SMALL = InlineAsrNativeWhisperOptions(
max_time_chunk=30.0,
)
WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
WHISPER_MEDIUM: InlineAsrOptions = InlineAsrNativeWhisperOptions(
repo_id="medium",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
@ -49,7 +50,7 @@ WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
max_time_chunk=30.0,
)
WHISPER_BASE = InlineAsrNativeWhisperOptions(
WHISPER_BASE: InlineAsrOptions = InlineAsrNativeWhisperOptions(
repo_id="base",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
@ -60,7 +61,7 @@ WHISPER_BASE = InlineAsrNativeWhisperOptions(
max_time_chunk=30.0,
)
WHISPER_LARGE = InlineAsrNativeWhisperOptions(
WHISPER_LARGE: InlineAsrOptions = InlineAsrNativeWhisperOptions(
repo_id="large",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,
@ -71,7 +72,7 @@ WHISPER_LARGE = InlineAsrNativeWhisperOptions(
max_time_chunk=30.0,
)
WHISPER_TURBO = InlineAsrNativeWhisperOptions(
WHISPER_TURBO: InlineAsrOptions = InlineAsrNativeWhisperOptions(
repo_id="turbo",
inference_framework=InferenceAsrFramework.WHISPER,
verbose=True,

View File

@ -26,8 +26,6 @@ class LayoutModelConfig(BaseModel):
return self.repo_id.replace("/", "--")
# HuggingFace Layout Models
# Default Docling Layout Model
DOCLING_LAYOUT_V2 = LayoutModelConfig(
name="docling_layout_v2",

View File

@ -12,10 +12,16 @@ from pydantic import (
)
from typing_extensions import deprecated
from docling.datamodel import asr_model_specs
# Import the following for backwards compatibility
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.asr_model_specs import (
WHISPER_BASE,
WHISPER_LARGE,
WHISPER_MEDIUM,
WHISPER_SMALL,
WHISPER_TINY,
WHISPER_TURBO,
)
from docling.datamodel.layout_model_specs import (
DOCLING_LAYOUT_EGRET_LARGE,
DOCLING_LAYOUT_EGRET_MEDIUM,
@ -33,6 +39,7 @@ from docling.datamodel.pipeline_options_vlm_model import (
InferenceFramework,
InlineVlmOptions,
ResponseFormat,
TwoStageVlmOptions,
)
from docling.datamodel.vlm_model_specs import (
GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
@ -270,8 +277,9 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
False # (To be used with vlms, or other generative models)
)
# If True, text from backend will be used instead of generated text
vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
vlm_options: Union[InlineVlmOptions, ApiVlmOptions, TwoStageVlmOptions] = (
smoldocling_vlm_conversion_options
# SMOLDOCLING_TRANSFORMERS
)
@ -286,7 +294,7 @@ class LayoutOptions(BaseModel):
class AsrPipelineOptions(PipelineOptions):
asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
asr_options: Union[InlineAsrOptions] = WHISPER_TINY
artifacts_path: Optional[Union[Path, str]] = None

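A short sketch of how the widened option unions in this file are meant to be used; the model presets come from the spec modules this file already imports, the variable names are illustrative:

```python
from docling.datamodel.asr_model_specs import WHISPER_TURBO
from docling.datamodel.pipeline_options import AsrPipelineOptions, VlmPipelineOptions
from docling.datamodel.vlm_model_specs import VLM2STAGE

# vlm_options now accepts an inline model, an API model, or the new two-stage combo.
vlm_opts = VlmPipelineOptions(vlm_options=VLM2STAGE)

# asr_options keeps WHISPER_TINY as its default; any of the presets can be swapped in.
asr_opts = AsrPipelineOptions(asr_options=WHISPER_TURBO)
```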
View File

@ -5,10 +5,11 @@ from pydantic import AnyUrl, BaseModel
from typing_extensions import deprecated
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.pipeline_options_vlm_model import (
# InferenceFramework,
TransformersModelType,
)
# from docling.datamodel.pipeline_options_vlm_model import (
# InferenceFramework,
# TransformersModelType,
# )
class BaseAsrOptions(BaseModel):

View File

@ -6,6 +6,9 @@ from pydantic import AnyUrl, BaseModel
from typing_extensions import deprecated
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.layout_model_specs import (
LayoutModelConfig,
)
class BaseVlmOptions(BaseModel):
@ -87,3 +90,12 @@ class ApiVlmOptions(BaseVlmOptions):
timeout: float = 60
concurrency: int = 1
response_format: ResponseFormat
class TwoStageVlmOptions(BaseModel):
kind: Literal["inline_two_stage_model_options"] = "inline_two_stage_model_options"
response_format: ResponseFormat # final response of the VLM
layout_options: LayoutModelConfig # = DOCLING_LAYOUT_V2
vlm_options: Union[InlineVlmOptions, ApiVlmOptions] # = SMOLDOCLING_TRANSFORMERS

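A sketch of composing a custom TwoStageVlmOptions, mirroring the VLM2STAGE preset added later in this commit; the particular layout/VLM pairing is illustrative:

```python
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_HERON
from docling.datamodel.pipeline_options_vlm_model import TwoStageVlmOptions
from docling.datamodel.vlm_model_specs import SMOLDOCLING_MLX

two_stage = TwoStageVlmOptions(
    layout_options=DOCLING_LAYOUT_HERON,               # stage 1: layout detection
    vlm_options=SMOLDOCLING_MLX,                        # stage 2: VLM transcription
    response_format=SMOLDOCLING_MLX.response_format,    # final output format follows the VLM
)
```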
View File

@ -6,12 +6,17 @@ from pydantic import (
)
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.layout_model_specs import (
DOCLING_LAYOUT_HERON,
DOCLING_LAYOUT_V2,
)
from docling.datamodel.pipeline_options_vlm_model import (
ApiVlmOptions,
InferenceFramework,
InlineVlmOptions,
ResponseFormat,
TransformersModelType,
TwoStageVlmOptions,
)
_log = logging.getLogger(__name__)
@ -137,8 +142,15 @@ GEMMA3_27B_MLX = InlineVlmOptions(
temperature=0.0,
)
VLM2STAGE = TwoStageVlmOptions(
vlm_options=SMOLDOCLING_MLX,
layout_options=DOCLING_LAYOUT_HERON,
response_format=SMOLDOCLING_MLX.response_format,
)
class VlmModelType(str, Enum):
SMOLDOCLING = "smoldocling"
GRANITE_VISION = "granite_vision"
GRANITE_VISION_OLLAMA = "granite_vision_ollama"
VLM2STAGE = "vlm2stage"

View File

@ -3,9 +3,16 @@ from collections.abc import Iterable
from typing import Generic, Optional, Protocol, Type
from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
from PIL import Image
from typing_extensions import TypeVar
from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
from docling.datamodel.base_models import (
Cluster,
ItemAndImageEnrichmentElement,
Page,
TextCell,
VlmPredictionToken,
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import BaseOptions
from docling.datamodel.settings import settings
@ -19,6 +26,9 @@ class BaseModelWithOptions(Protocol):
class BasePageModel(ABC):
scale: float # scale with which the page-image needs to be created (dpi = 72*scale)
max_size: int # max size of width/height of page-image
@abstractmethod
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
@ -26,6 +36,30 @@ class BasePageModel(ABC):
pass
class BaseLayoutModel(BasePageModel):
@abstractmethod
def predict_on_page_image(self, *, page_image: Image.Image) -> list[Cluster]:
pass
@abstractmethod
def postprocess_on_page_image(
self, *, page: Page, clusters: list[Cluster]
) -> tuple[Page, list[Cluster], list[TextCell]]:
pass
class BaseVlmModel(BasePageModel):
@abstractmethod
def get_user_prompt(self, page: Optional[Page]) -> str:
pass
@abstractmethod
def predict_on_page_image(
self, *, page_image: Image.Image, prompt: str, output_tokens: bool = False
) -> tuple[str, Optional[list[VlmPredictionToken]]]:
pass
EnrichElementT = TypeVar("EnrichElementT", default=NodeItem)

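To make the new contracts concrete, a minimal sketch of a custom backend implementing BaseVlmModel; EchoVlmModel and its return values are hypothetical and only show which members a subclass must provide:

```python
from collections.abc import Iterable
from typing import Optional

from PIL import Image

from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToken
from docling.datamodel.document import ConversionResult
from docling.models.base_model import BaseVlmModel


class EchoVlmModel(BaseVlmModel):
    scale: float = 2.0   # render page images at 144 dpi
    max_size: int = 512  # cap on page-image width/height

    def get_user_prompt(self, page: Optional[Page]) -> str:
        return "Convert this page to DocTags."

    def predict_on_page_image(
        self, *, page_image: Image.Image, prompt: str, output_tokens: bool = False
    ) -> tuple[str, Optional[list[VlmPredictionToken]]]:
        # A real backend would run its model here.
        return "<doctags></doctags>", []

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        for page in page_batch:
            image = page.get_image(scale=self.scale, max_size=self.max_size)
            if image is not None:
                text, _ = self.predict_on_page_image(
                    page_image=image, prompt=self.get_user_prompt(page)
                )
                page.predictions.vlm_response = VlmPrediction(
                    text=text, generation_time=0.0
                )
            yield page
```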
View File

@ -7,6 +7,7 @@ from typing import Optional
import numpy as np
from docling_core.types.doc import DocItemLabel
from docling_core.types.doc.page import TextCell
from PIL import Image
from docling.datamodel.accelerator_options import AcceleratorOptions
@ -15,7 +16,7 @@ from docling.datamodel.document import ConversionResult
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2, LayoutModelConfig
from docling.datamodel.pipeline_options import LayoutOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.models.base_model import BaseLayoutModel, BasePageModel
from docling.models.utils.hf_model_download import download_hf_model
from docling.utils.accelerator_utils import decide_device
from docling.utils.layout_postprocessor import LayoutPostprocessor
@ -25,7 +26,7 @@ from docling.utils.visualization import draw_clusters
_log = logging.getLogger(__name__)
class LayoutModel(BasePageModel):
class LayoutModel(BaseLayoutModel):
TEXT_ELEM_LABELS = [
DocItemLabel.TEXT,
DocItemLabel.FOOTNOTE,
@ -158,6 +159,7 @@ class LayoutModel(BasePageModel):
page_image = page.get_image(scale=1.0)
assert page_image is not None
"""
clusters = []
for ix, pred_item in enumerate(
self.layout_predictor.predict(page_image)
@ -176,14 +178,18 @@ class LayoutModel(BasePageModel):
cells=[],
)
clusters.append(cluster)
"""
predicted_clusters = self.predict_on_page_image(
page_image=page_image
)
if settings.debug.visualize_raw_layout:
self.draw_clusters_and_cells_side_by_side(
conv_res, page, clusters, mode_prefix="raw"
conv_res, page, predicted_clusters, mode_prefix="raw"
)
# Apply postprocessing
"""
processed_clusters, processed_cells = LayoutPostprocessor(
page, clusters, self.options
).postprocess()
@ -210,6 +216,30 @@ class LayoutModel(BasePageModel):
page.predictions.layout = LayoutPrediction(
clusters=processed_clusters
)
"""
page, processed_clusters, processed_cells = (
self.postprocess_on_page_image(
page=page, clusters=predicted_clusters
)
)
with warnings.catch_warnings():
warnings.filterwarnings(
"ignore",
"Mean of empty slice|invalid value encountered in scalar divide",
RuntimeWarning,
"numpy",
)
conv_res.confidence.pages[page.page_no].layout_score = float(
np.mean([c.confidence for c in processed_clusters])
)
conv_res.confidence.pages[page.page_no].ocr_score = float(
np.mean(
[c.confidence for c in processed_cells if c.from_ocr]
)
)
if settings.debug.visualize_layout:
self.draw_clusters_and_cells_side_by_side(
@ -217,3 +247,34 @@ class LayoutModel(BasePageModel):
)
yield page
def predict_on_page_image(self, *, page_image: Image.Image) -> list[Cluster]:
pred_items = self.layout_predictor.predict(page_image)
clusters = []
for ix, pred_item in enumerate(pred_items):
label = DocItemLabel(
pred_item["label"].lower().replace(" ", "_").replace("-", "_")
) # Temporary, until docling-ibm-model uses docling-core types
cluster = Cluster(
id=ix,
label=label,
confidence=pred_item["confidence"],
bbox=BoundingBox.model_validate(pred_item),
cells=[],
)
clusters.append(cluster)
return clusters
def postprocess_on_page_image(
self, *, page: Page, clusters: list[Cluster]
) -> tuple[Page, list[Cluster], list[TextCell]]:
processed_clusters, processed_cells = LayoutPostprocessor(
page, clusters, self.options
).postprocess()
# Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
page.predictions.layout = LayoutPrediction(clusters=processed_clusters)
return page, processed_clusters, processed_cells

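The refactor splits layout prediction into two reusable steps; a sketch of the call pattern, assuming `layout_model` is an initialized LayoutModel and `page` a populated Page:

```python
page_image = page.get_image(scale=1.0)
assert page_image is not None

# Step 1: raw clusters straight from the layout predictor.
clusters = layout_model.predict_on_page_image(page_image=page_image)

# Step 2: postprocess against the page cells; this also sets page.predictions.layout.
page, processed_clusters, processed_cells = layout_model.postprocess_on_page_image(
    page=page, clusters=clusters
)
```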
View File

@ -5,17 +5,19 @@ from collections.abc import Iterable
from pathlib import Path
from typing import Any, Optional
from PIL import Image
from docling.datamodel.accelerator_options import (
AcceleratorOptions,
)
from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToken
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options_vlm_model import (
InlineVlmOptions,
TransformersModelType,
TransformersPromptStyle,
)
from docling.models.base_model import BasePageModel
from docling.models.base_model import BasePageModel, BaseVlmModel
from docling.models.utils.hf_model_download import (
HuggingFaceModelDownloadMixin,
)
@ -25,7 +27,7 @@ from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
class HuggingFaceTransformersVlmModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
def __init__(
self,
enabled: bool,
@ -37,6 +39,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
self.vlm_options = vlm_options
self.scale = self.vlm_options.scale
self.max_size = 512
if isinstance(self.vlm_options.max_size, int):
self.max_size = self.vlm_options.max_size
if self.enabled:
import torch
from transformers import (
@ -119,6 +126,43 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
# Load generation config
self.generation_config = GenerationConfig.from_pretrained(artifacts_path)
def get_user_prompt(self, page: Optional[Page]) -> str:
# Define prompt structure
user_prompt = ""
if callable(self.vlm_options.prompt) and page is not None:
user_prompt = self.vlm_options.prompt(page.parsed_page)
elif isinstance(self.vlm_options.prompt, str):
user_prompt = self.vlm_options.prompt
prompt = self.formulate_prompt(user_prompt)
return prompt
def predict_on_page_image(
self, *, page_image: Image.Image, prompt: str, output_tokens: bool = False
) -> tuple[str, Optional[list[VlmPredictionToken]]]:
output = ""
inputs = self.processor(
text=prompt, images=[page_image], return_tensors="pt"
).to(self.device)
# Call model to generate:
generated_ids = self.vlm_model.generate(
**inputs,
max_new_tokens=self.max_new_tokens,
use_cache=self.use_cache,
temperature=self.temperature,
generation_config=self.generation_config,
**self.vlm_options.extra_generation_config,
)
output = self.processor.batch_decode(
generated_ids[:, inputs["input_ids"].shape[1] :],
skip_special_tokens=False,
)[0]
return output, []
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
@ -130,22 +174,29 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
with TimeRecorder(conv_res, "vlm"):
assert page.size is not None
hi_res_image = page.get_image(
page_image = page.get_image(
scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
)
assert page_image is not None
# Define prompt structure
"""
if callable(self.vlm_options.prompt):
user_prompt = self.vlm_options.prompt(page.parsed_page)
else:
user_prompt = self.vlm_options.prompt
prompt = self.formulate_prompt(user_prompt)
inputs = self.processor(
text=prompt, images=[hi_res_image], return_tensors="pt"
).to(self.device)
"""
prompt = self.get_user_prompt(page=page)
start_time = time.time()
"""
inputs = self.processor(
text=prompt, images=[page_image], return_tensors="pt"
).to(self.device)
# Call model to generate:
generated_ids = self.vlm_model.generate(
**inputs,
@ -166,9 +217,14 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
_log.debug(
f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
)
"""
generated_text, _ = self.predict_on_page_image(  # unpack (text, tokens); tokens unused here
page_image=page_image, prompt=prompt, output_tokens=False
)
page.predictions.vlm_response = VlmPrediction(
text=generated_texts,
generation_time=generation_time,
text=generated_text,
generation_time=time.time() - start_time,
)
yield page

View File

@ -4,13 +4,15 @@ from collections.abc import Iterable
from pathlib import Path
from typing import Optional
from PIL import Image
from docling.datamodel.accelerator_options import (
AcceleratorOptions,
)
from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToken
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
from docling.models.base_model import BasePageModel
from docling.models.base_model import BasePageModel, BaseVlmModel
from docling.models.utils.hf_model_download import (
HuggingFaceModelDownloadMixin,
)
@ -19,7 +21,7 @@ from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
class HuggingFaceMlxModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
def __init__(
self,
enabled: bool,
@ -28,10 +30,15 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
vlm_options: InlineVlmOptions,
):
self.enabled = enabled
self.vlm_options = vlm_options
self.max_tokens = vlm_options.max_new_tokens
self.temperature = vlm_options.temperature
self.scale = self.vlm_options.scale
self.max_size = 512
if isinstance(self.vlm_options.max_size, int):
self.max_size = self.vlm_options.max_size
if self.enabled:
try:
@ -60,6 +67,55 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
self.vlm_model, self.processor = load(artifacts_path)
self.config = load_config(artifacts_path)
def get_user_prompt(self, page: Optional[Page]) -> str:
if callable(self.vlm_options.prompt) and page is not None:
return self.vlm_options.prompt(page.parsed_page)
else:
user_prompt = self.vlm_options.prompt
prompt = self.apply_chat_template(
self.processor, self.config, user_prompt, num_images=1
)
return prompt
def predict_on_page_image(
self, *, page_image: Image.Image, prompt: str, output_tokens: bool = False
) -> tuple[str, Optional[list[VlmPredictionToken]]]:
tokens = []
output = ""
for token in self.stream_generate(
self.vlm_model,
self.processor,
prompt,
[page_image],
max_tokens=self.max_tokens,
verbose=False,
temp=self.temperature,
):
if len(token.logprobs.shape) == 1:
tokens.append(
VlmPredictionToken(
text=token.text,
token=token.token,
logprob=token.logprobs[token.token],
)
)
elif len(token.logprobs.shape) == 2 and token.logprobs.shape[0] == 1:
tokens.append(
VlmPredictionToken(
text=token.text,
token=token.token,
logprob=token.logprobs[0, token.token],
)
)
else:
_log.warning(f"incompatible shape for logprobs: {token.logprobs.shape}")
output += token.text
if "</doctag>" in token.text:
break
return output, tokens
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
@ -71,19 +127,23 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
assert page.size is not None
hi_res_image = page.get_image(
page_image = page.get_image(
scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
)
if hi_res_image is not None:
im_width, im_height = hi_res_image.size
"""
if page_image is not None:
im_width, im_height = page_image.size
"""
assert page_image is not None
# populate page_tags with predicted doc tags
page_tags = ""
if hi_res_image:
if hi_res_image.mode != "RGB":
hi_res_image = hi_res_image.convert("RGB")
if page_image:
if page_image.mode != "RGB":
page_image = page_image.convert("RGB")
"""
if callable(self.vlm_options.prompt):
user_prompt = self.vlm_options.prompt(page.parsed_page)
else:
@ -91,11 +151,12 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
prompt = self.apply_chat_template(
self.processor, self.config, user_prompt, num_images=1
)
start_time = time.time()
_log.debug("start generating ...")
"""
prompt = self.get_user_prompt(page)
# Call model to generate:
start_time = time.time()
"""
tokens: list[VlmPredictionToken] = []
output = ""
@ -103,7 +164,7 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
self.vlm_model,
self.processor,
prompt,
[hi_res_image],
[page_image],
max_tokens=self.max_tokens,
verbose=False,
temp=self.temperature,
@ -135,13 +196,20 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
output += token.text
if "</doctag>" in token.text:
break
"""
output, tokens = self.predict_on_page_image(
page_image=page_image, prompt=prompt, output_tokens=True
)
generation_time = time.time() - start_time
page_tags = output
"""
_log.debug(
f"{generation_time:.2f} seconds for {len(tokens)} tokens ({len(tokens) / generation_time} tokens/sec)."
)
"""
page.predictions.vlm_response = VlmPrediction(
text=page_tags,
generation_time=generation_time,

View File

@ -0,0 +1,119 @@
import importlib.metadata
import logging
import time
from collections.abc import Iterable
from pathlib import Path
from typing import Any, Optional
from docling.datamodel.accelerator_options import (
AcceleratorOptions,
)
from docling.datamodel.base_models import Cluster, Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options_vlm_model import (
InlineVlmOptions,
TransformersModelType,
TransformersPromptStyle,
)
from docling.models.base_model import BaseLayoutModel, BasePageModel, BaseVlmModel
from docling.models.layout_model import LayoutModel
from docling.models.utils.hf_model_download import (
HuggingFaceModelDownloadMixin,
)
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
class TwoStageVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
def __init__(
self,
*,
layout_model: BaseLayoutModel,
vlm_model: BaseVlmModel,
):
self.layout_model = layout_model
self.vlm_model = vlm_model
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
for page in page_batch:
assert page._backend is not None
if not page._backend.is_valid():
yield page
else:
with TimeRecorder(conv_res, "two-staged-vlm"):
assert page.size is not None
page_image = page.get_image(
scale=self.vlm_model.scale, max_size=self.vlm_model.max_size
)
assert page_image is not None
pred_clusters = self.layout_model.predict_on_page_image(
page_image=page_image
)
page, processed_clusters, processed_cells = (
self.layout_model.postprocess_on_page_image(
page=page, clusters=pred_clusters
)
)
user_prompt = self.vlm_model.get_user_prompt(page=page)
prompt = self.formulate_prompt(
user_prompt=user_prompt,
clusters=processed_clusters,
image_width=page_image.width,
image_height=page_image.height,
)
start_time = time.time()
generated_text, generated_tokens = (
self.vlm_model.predict_on_page_image(
page_image=page_image, prompt=prompt
)
)
print("generated-text: \n", generated_text, "\n")
page.predictions.vlm_response = VlmPrediction(
text=generated_text,
generation_time=time.time() - start_time,
generated_tokens=generated_tokens,
)
yield page
def formulate_prompt(
self,
*,
user_prompt: str,
clusters: list[Cluster],
image_width: int,
image_height: int,
vlm_width: int = 512,
vlm_height: int = 512,
) -> str:
"""Formulate a prompt for the VLM."""
known_clusters = ["here is a list of unsorted text-blocks:", "<doctags>"]
for cluster in clusters:
print(" => ", cluster)
loc_l = f"<loc_{int(vlm_width * cluster.bbox.l / image_width)}>"
loc_b = f"<loc_{int(vlm_height * cluster.bbox.b / image_height)}>"
loc_r = f"<loc_{int(vlm_width * cluster.bbox.r / image_width)}>"
loc_t = f"<loc_{int(vlm_height * cluster.bbox.t / image_height)}>"
known_clusters.append(
f"<{cluster.label}>{loc_l}{loc_b}{loc_r}{loc_t}</{cluster.label}>"
)
known_clusters.append("</doctags>")
user_prompt = "\n".join(known_clusters) + f"\n\n{user_prompt}"
print("user-prompt: ", user_prompt, "\n")
return user_prompt

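A worked example of the <loc_..> scaling used in formulate_prompt; the page-image size and the bbox values are illustrative, and the 512x512 grid matches the vlm_width/vlm_height defaults above:

```python
image_width, image_height = 612, 792  # page-image size in pixels (assumed)
vlm_width = vlm_height = 512          # location-token grid of the VLM

# A cluster bbox in page-image coordinates (l, t, r, b), e.g. a table region.
l, t, r, b = 139.7, 337.5, 475.0, 469.5

loc_l = f"<loc_{int(vlm_width * l / image_width)}>"    # <loc_116>
loc_b = f"<loc_{int(vlm_height * b / image_height)}>"  # <loc_303>
loc_r = f"<loc_{int(vlm_width * r / image_width)}>"    # <loc_397>
loc_t = f"<loc_{int(vlm_height * t / image_height)}>"  # <loc_218>

print(f"<table>{loc_l}{loc_b}{loc_r}{loc_t}</table>")
```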
View File

@ -26,21 +26,24 @@ from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import InputFormat, Page
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import (
VlmPipelineOptions,
)
from docling.datamodel.pipeline_options import LayoutOptions, VlmPipelineOptions
from docling.datamodel.pipeline_options_vlm_model import (
ApiVlmOptions,
InferenceFramework,
InlineVlmOptions,
ResponseFormat,
TwoStageVlmOptions,
)
from docling.datamodel.settings import settings
from docling.models.api_vlm_model import ApiVlmModel
from docling.models.layout_model import LayoutModel
from docling.models.vlm_models_inline.hf_transformers_model import (
HuggingFaceTransformersVlmModel,
)
from docling.models.vlm_models_inline.mlx_model import HuggingFaceMlxModel
from docling.models.vlm_models_inline.two_stage_vlm_model import (
TwoStageVlmModel,
)
from docling.pipeline.base_pipeline import PaginatedPipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder
@ -107,6 +110,53 @@ class VlmPipeline(PaginatedPipeline):
raise ValueError(
f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}"
)
elif isinstance(self.pipeline_options.vlm_options, TwoStageVlmOptions):
twostagevlm_options = cast(
TwoStageVlmOptions, self.pipeline_options.vlm_options
)
stage_1_options = twostagevlm_options.layout_options
stage_2_options = twostagevlm_options.vlm_options
layout_model = LayoutModel(
artifacts_path=artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
options=LayoutOptions(
create_orphan_clusters=False, model_spec=stage_1_options
),
)
if (
isinstance(stage_2_options, InlineVlmOptions)
and stage_2_options.inference_framework == InferenceFramework.MLX
):
vlm_model_mlx = HuggingFaceMlxModel(
enabled=True, # must be always enabled for this pipeline to make sense.
artifacts_path=artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
vlm_options=stage_2_options,
)
self.build_pipe = [
TwoStageVlmModel(layout_model=layout_model, vlm_model=vlm_model_mlx)
]
elif (
isinstance(stage_2_options, InlineVlmOptions)
and stage_2_options.inference_framework
== InferenceFramework.TRANSFORMERS
):
vlm_model_hf = HuggingFaceTransformersVlmModel(
enabled=True, # must be always enabled for this pipeline to make sense.
artifacts_path=artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
vlm_options=stage_2_options,
)
self.build_pipe = [
TwoStageVlmModel(layout_model=layout_model, vlm_model=vlm_model_hf)
]
else:
raise ValueError(
f"Could not instantiate the right type of VLM pipeline: {stage_2_options}"
)
self.enrichment_pipe = [
# Other models working on `NodeItem` elements in the DoclingDocument

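For completeness, a sketch of a two-stage configuration that would take the TRANSFORMERS branch above rather than the MLX one (VLM2STAGE itself wraps SMOLDOCLING_MLX); the pairing is illustrative:

```python
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_HERON
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.datamodel.pipeline_options_vlm_model import TwoStageVlmOptions
from docling.datamodel.vlm_model_specs import SMOLDOCLING_TRANSFORMERS

two_stage_hf = TwoStageVlmOptions(
    layout_options=DOCLING_LAYOUT_HERON,
    vlm_options=SMOLDOCLING_TRANSFORMERS,
    response_format=SMOLDOCLING_TRANSFORMERS.response_format,
)

pipeline_options = VlmPipelineOptions(vlm_options=two_stage_hf)
# Handing these options to DocumentConverter (as in the earlier CLI sketch) makes
# build_pipe a single TwoStageVlmModel wrapping a LayoutModel and a
# HuggingFaceTransformersVlmModel.
```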
View File

@ -213,10 +213,10 @@
"prov": [
{
"bbox": [
139.66741943359375,
139.6674041748047,
322.5054626464844,
475.00927734375,
454.45458984375
454.4546203613281
],
"page": 1,
"span": [

View File

@ -2705,7 +2705,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9373534917831421,
"confidence": 0.9373533129692078,
"cells": [
{
"index": 0,
@ -2745,7 +2745,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.8858680725097656,
"confidence": 0.8858679533004761,
"cells": [
{
"index": 1,
@ -2785,7 +2785,7 @@
"b": 152.90697999999998,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9806433916091919,
"confidence": 0.9806435108184814,
"cells": [
{
"index": 2,
@ -2940,7 +2940,7 @@
"b": 255.42400999999995,
"coord_origin": "TOPLEFT"
},
"confidence": 0.98504239320755,
"confidence": 0.9850425124168396,
"cells": [
{
"index": 7,
@ -3155,7 +3155,7 @@
"b": 327.98218,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9591909050941467,
"confidence": 0.9591907262802124,
"cells": [
{
"index": 15,
@ -3339,8 +3339,8 @@
"id": 0,
"label": "table",
"bbox": {
"l": 139.66741943359375,
"t": 337.54541015625,
"l": 139.6674041748047,
"t": 337.5453796386719,
"r": 475.00927734375,
"b": 469.4945373535156,
"coord_origin": "TOPLEFT"
@ -7846,7 +7846,7 @@
"b": 518.17419,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9589294195175171,
"confidence": 0.9589295387268066,
"cells": [
{
"index": 91,
@ -7911,7 +7911,7 @@
"b": 618.3,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9849975109100342,
"confidence": 0.9849976301193237,
"cells": [
{
"index": 93,
@ -8243,8 +8243,8 @@
"id": 0,
"label": "table",
"bbox": {
"l": 139.66741943359375,
"t": 337.54541015625,
"l": 139.6674041748047,
"t": 337.5453796386719,
"r": 475.00927734375,
"b": 469.4945373535156,
"coord_origin": "TOPLEFT"
@ -13641,7 +13641,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9373534917831421,
"confidence": 0.9373533129692078,
"cells": [
{
"index": 0,
@ -13687,7 +13687,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.8858680725097656,
"confidence": 0.8858679533004761,
"cells": [
{
"index": 1,
@ -13733,7 +13733,7 @@
"b": 152.90697999999998,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9806433916091919,
"confidence": 0.9806435108184814,
"cells": [
{
"index": 2,
@ -13900,7 +13900,7 @@
"b": 255.42400999999995,
"coord_origin": "TOPLEFT"
},
"confidence": 0.98504239320755,
"confidence": 0.9850425124168396,
"cells": [
{
"index": 7,
@ -14121,7 +14121,7 @@
"b": 327.98218,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9591909050941467,
"confidence": 0.9591907262802124,
"cells": [
{
"index": 15,
@ -14311,8 +14311,8 @@
"id": 0,
"label": "table",
"bbox": {
"l": 139.66741943359375,
"t": 337.54541015625,
"l": 139.6674041748047,
"t": 337.5453796386719,
"r": 475.00927734375,
"b": 469.4945373535156,
"coord_origin": "TOPLEFT"
@ -19701,7 +19701,7 @@
"b": 518.17419,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9589294195175171,
"confidence": 0.9589295387268066,
"cells": [
{
"index": 91,
@ -19772,7 +19772,7 @@
"b": 618.3,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9849975109100342,
"confidence": 0.9849976301193237,
"cells": [
{
"index": 93,
@ -20116,7 +20116,7 @@
"b": 152.90697999999998,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9806433916091919,
"confidence": 0.9806435108184814,
"cells": [
{
"index": 2,
@ -20283,7 +20283,7 @@
"b": 255.42400999999995,
"coord_origin": "TOPLEFT"
},
"confidence": 0.98504239320755,
"confidence": 0.9850425124168396,
"cells": [
{
"index": 7,
@ -20504,7 +20504,7 @@
"b": 327.98218,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9591909050941467,
"confidence": 0.9591907262802124,
"cells": [
{
"index": 15,
@ -20694,8 +20694,8 @@
"id": 0,
"label": "table",
"bbox": {
"l": 139.66741943359375,
"t": 337.54541015625,
"l": 139.6674041748047,
"t": 337.5453796386719,
"r": 475.00927734375,
"b": 469.4945373535156,
"coord_origin": "TOPLEFT"
@ -26084,7 +26084,7 @@
"b": 518.17419,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9589294195175171,
"confidence": 0.9589295387268066,
"cells": [
{
"index": 91,
@ -26155,7 +26155,7 @@
"b": 618.3,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9849975109100342,
"confidence": 0.9849976301193237,
"cells": [
{
"index": 93,
@ -26499,7 +26499,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.9373534917831421,
"confidence": 0.9373533129692078,
"cells": [
{
"index": 0,
@ -26545,7 +26545,7 @@
"b": 102.78223000000003,
"coord_origin": "TOPLEFT"
},
"confidence": 0.8858680725097656,
"confidence": 0.8858679533004761,
"cells": [
{
"index": 1,