diff --git a/docling/datamodel/pipeline_options_vlm_model.py b/docling/datamodel/pipeline_options_vlm_model.py
index 5e60bcc2..fd672b1b 100644
--- a/docling/datamodel/pipeline_options_vlm_model.py
+++ b/docling/datamodel/pipeline_options_vlm_model.py
@@ -1,16 +1,16 @@
 from enum import Enum
 from typing import Any, Callable, Dict, List, Literal, Optional, Union
 
+from docling_core.types.doc.page import SegmentedPage
 from pydantic import AnyUrl, BaseModel
 from typing_extensions import deprecated
 
 from docling.datamodel.accelerator_options import AcceleratorDevice
-from docling.datamodel.base_models import Page
 
 
 class BaseVlmOptions(BaseModel):
     kind: str
-    prompt: Union[str, Callable[[Page], str]]
+    prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
     scale: float = 2.0
     max_size: Optional[int] = None
     temperature: float = 0.0
diff --git a/docling/models/api_vlm_model.py b/docling/models/api_vlm_model.py
index 646c2cef..164ac285 100644
--- a/docling/models/api_vlm_model.py
+++ b/docling/models/api_vlm_model.py
@@ -54,7 +54,7 @@ class ApiVlmModel(BasePageModel):
                         hi_res_image = hi_res_image.convert("RGB")
 
                 if callable(self.vlm_options.prompt):
-                    prompt = self.vlm_options.prompt(page)
+                    prompt = self.vlm_options.prompt(page.parsed_page)
                 else:
                     prompt = self.vlm_options.prompt
 
diff --git a/docling/models/vlm_models_inline/hf_transformers_model.py b/docling/models/vlm_models_inline/hf_transformers_model.py
index ac58ba87..4e2d80b8 100644
--- a/docling/models/vlm_models_inline/hf_transformers_model.py
+++ b/docling/models/vlm_models_inline/hf_transformers_model.py
@@ -129,7 +129,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
 
                     # Define prompt structure
                     if callable(self.vlm_options.prompt):
-                        user_prompt = self.vlm_options.prompt(page)
+                        user_prompt = self.vlm_options.prompt(page.parsed_page)
                     else:
                         user_prompt = self.vlm_options.prompt
                     prompt = self.formulate_prompt(user_prompt)
diff --git a/docling/models/vlm_models_inline/mlx_model.py b/docling/models/vlm_models_inline/mlx_model.py
index cf403069..647ce531 100644
--- a/docling/models/vlm_models_inline/mlx_model.py
+++ b/docling/models/vlm_models_inline/mlx_model.py
@@ -85,7 +85,7 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                             hi_res_image = hi_res_image.convert("RGB")
 
                     if callable(self.vlm_options.prompt):
-                        user_prompt = self.vlm_options.prompt(page)
+                        user_prompt = self.vlm_options.prompt(page.parsed_page)
                     else:
                         user_prompt = self.vlm_options.prompt
                     prompt = self.apply_chat_template(
diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index 2ecfe55a..ab474fab 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -117,6 +117,7 @@ class VlmPipeline(PaginatedPipeline):
             page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
             if page._backend is not None and page._backend.is_valid():
                 page.size = page._backend.get_size()
+                page.parsed_page = page._backend.get_segmented_page()
 
         return page
 
diff --git a/docs/examples/vlm_pipeline_api_model.py b/docs/examples/vlm_pipeline_api_model.py
index ff377808..01bbbb14 100644
--- a/docs/examples/vlm_pipeline_api_model.py
+++ b/docs/examples/vlm_pipeline_api_model.py
@@ -1,11 +1,13 @@
 import logging
 import os
 from pathlib import Path
+from typing import Optional
 
 import requests
+from docling_core.types.doc.page import SegmentedPage
 from dotenv import load_dotenv
 
-from docling.datamodel.base_models import InputFormat, Page
+from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     VlmPipelineOptions,
 )
@@ -53,26 +55,41 @@ def ollama_vlm_options(model: str, prompt: str):
 
 
 def ollama_olmocr_vlm_options(model: str):
-    def _dynamic_olmocr_prompt(page: Page):
-        anchor = [f"Page dimensions: {int(page.size.width)}x{int(page.size.height)}"]
+    def _dynamic_olmocr_prompt(page: Optional[SegmentedPage]):
+        if page is None:
+            return (
+                "Below is the image of one page of a document. Just return the plain text"
+                " representation of this document as if you were reading it naturally.\n"
+                "Do not hallucinate.\n"
+            )
 
-        for cell in page._backend.get_text_cells():
-            if not cell.text.strip():
+        anchor = [
+            f"Page dimensions: {int(page.dimension.width)}x{int(page.dimension.height)}"
+        ]
+
+        for text_cell in page.textline_cells:
+            if not text_cell.text.strip():
                 continue
-            bbox = cell.to_bounding_box().to_bottom_left_origin(page.size.height)
-            anchor.append(f"[{int(bbox.l)}x{int(bbox.b)}] {cell.text}")
+            bbox = text_cell.rect.to_bounding_box().to_bottom_left_origin(
+                page.dimension.height
+            )
+            anchor.append(f"[{int(bbox.l)}x{int(bbox.b)}] {text_cell.text}")
 
-        for rect in page._backend.get_bitmap_rects():
-            bbox = rect.to_bottom_left_origin(page.size.height)
+        for image_cell in page.bitmap_resources:
+            bbox = image_cell.rect.to_bounding_box().to_bottom_left_origin(
+                page.dimension.height
+            )
             anchor.append(
                 f"[Image {int(bbox.l)}x{int(bbox.b)} to {int(bbox.r)}x{int(bbox.t)}]"
             )
 
         if len(anchor) == 1:
             anchor.append(
-                f"[Image 0x0 to {int(page.size.width)}x{int(page.size.height)}]"
+                f"[Image 0x0 to {int(page.dimension.width)}x{int(page.dimension.height)}]"
             )
 
+        # Original prompt uses cells sorting. We are skipping it in this demo.
+
         base_text = "\n".join(anchor)
 
         return (
@@ -181,7 +198,7 @@ def main():
     # Example using the OlmOcr (dynamic prompt) model with Ollama:
     # (uncomment the following lines)
     # pipeline_options.vlm_options = ollama_olmocr_vlm_options(
-    #     model="hf.co/mradermacher/olmOCR-7B-0225-preview-GGUF:Q8_0",
+    #     model="hf.co/allenai/olmOCR-7B-0225-preview-GGUF:Q8_0",
    # )
 
     # Another possibility is using online services, e.g. watsonx.ai.
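Note for callers of this API: a prompt callable now receives the page's SegmentedPage (or None when the backend yields no parsed page) instead of the internal Page object, so user code no longer needs to reach into page._backend. Below is a minimal sketch of the new contract; the function name count_aware_prompt and the prompt wording are illustrative only, while the attribute accesses (dimension, textline_cells) follow the usage in the updated olmOCR example above.

from typing import Optional

from docling_core.types.doc.page import SegmentedPage


def count_aware_prompt(page: Optional[SegmentedPage]) -> str:
    # The pipelines now invoke prompt(page.parsed_page); parsed_page may be
    # None (e.g. if the backend cannot produce a segmented page), so a
    # callable must handle that case itself.
    if page is None:
        return "Convert this page to markdown."
    return (
        f"This page is {int(page.dimension.width)}x{int(page.dimension.height)} "
        f"and has {len(page.textline_cells)} text lines. Convert it to markdown."
    )

Passing prompt=count_aware_prompt to a BaseVlmOptions subclass (e.g. the ApiVlmOptions used in docs/examples/vlm_pipeline_api_model.py) should then produce a per-page prompt, exactly as the _dynamic_olmocr_prompt example does.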