Replace Page with SegmentedPage

2025-12-08 12:48:28 +00:00 · 2025-07-07 12:41:22 +03:00
parent 3829e9d9ce
commit 1a162066dd
6 changed files with 34 additions and 16 deletions
--- a/docling/datamodel/pipeline_options_vlm_model.py
+++ b/docling/datamodel/pipeline_options_vlm_model.py
@@ -1,16 +1,16 @@
 from enum import Enum
 from typing import Any, Callable, Dict, List, Literal, Optional, Union

+from docling_core.types.doc.page import SegmentedPage
 from pydantic import AnyUrl, BaseModel
 from typing_extensions import deprecated

 from docling.datamodel.accelerator_options import AcceleratorDevice
-from docling.datamodel.base_models import Page


 class BaseVlmOptions(BaseModel):
    kind: str
-    prompt: Union[str, Callable[[Page], str]]
+    prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
    scale: float = 2.0
    max_size: Optional[int] = None
    temperature: float = 0.0
--- a/docling/models/api_vlm_model.py
+++ b/docling/models/api_vlm_model.py
@@ -54,7 +54,7 @@ class ApiVlmModel(BasePageModel):
                            hi_res_image = hi_res_image.convert("RGB")

                    if callable(self.vlm_options.prompt):
-                        prompt = self.vlm_options.prompt(page)
+                        prompt = self.vlm_options.prompt(page.parsed_page)
                    else:
                        prompt = self.vlm_options.prompt

--- a/docling/models/vlm_models_inline/hf_transformers_model.py
+++ b/docling/models/vlm_models_inline/hf_transformers_model.py
@@ -129,7 +129,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix

                    # Define prompt structure
                    if callable(self.vlm_options.prompt):
-                        user_prompt = self.vlm_options.prompt(page)
+                        user_prompt = self.vlm_options.prompt(page.parsed_page)
                    else:
                        user_prompt = self.vlm_options.prompt
                    prompt = self.formulate_prompt(user_prompt)
--- a/docling/models/vlm_models_inline/mlx_model.py
+++ b/docling/models/vlm_models_inline/mlx_model.py
@@ -85,7 +85,7 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                            hi_res_image = hi_res_image.convert("RGB")

                    if callable(self.vlm_options.prompt):
-                        user_prompt = self.vlm_options.prompt(page)
+                        user_prompt = self.vlm_options.prompt(page.parsed_page)
                    else:
                        user_prompt = self.vlm_options.prompt
                    prompt = self.apply_chat_template(
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -117,6 +117,7 @@ class VlmPipeline(PaginatedPipeline):
            page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
            if page._backend is not None and page._backend.is_valid():
                page.size = page._backend.get_size()
+                page.parsed_page = page._backend.get_segmented_page()

        return page

--- a/docs/examples/vlm_pipeline_api_model.py
+++ b/docs/examples/vlm_pipeline_api_model.py
@@ -1,11 +1,13 @@
 import logging
 import os
 from pathlib import Path
+from typing import Optional

 import requests
+from docling_core.types.doc.page import SegmentedPage
 from dotenv import load_dotenv

-from docling.datamodel.base_models import InputFormat, Page
+from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
 )
@@ -53,26 +55,41 @@ def ollama_vlm_options(model: str, prompt: str):


 def ollama_olmocr_vlm_options(model: str):
-    def _dynamic_olmocr_prompt(page: Page):
-        anchor = [f"Page dimensions: {int(page.size.width)}x{int(page.size.height)}"]
+    def _dynamic_olmocr_prompt(page: Optional[SegmentedPage]):
+        if page is None:
+            return (
+                "Below is the image of one page of a document. Just return the plain text"
+                " representation of this document as if you were reading it naturally.\n"
+                "Do not hallucinate.\n"
+            )

-        for cell in page._backend.get_text_cells():
-            if not cell.text.strip():
+        anchor = [
+            f"Page dimensions: {int(page.dimension.width)}x{int(page.dimension.height)}"
+        ]
+
+        for text_cell in page.textline_cells:
+            if not text_cell.text.strip():
                continue
-            bbox = cell.to_bounding_box().to_bottom_left_origin(page.size.height)
-            anchor.append(f"[{int(bbox.l)}x{int(bbox.b)}] {cell.text}")
+            bbox = text_cell.rect.to_bounding_box().to_bottom_left_origin(
+                page.dimension.height
+            )
+            anchor.append(f"[{int(bbox.l)}x{int(bbox.b)}] {text_cell.text}")

-        for rect in page._backend.get_bitmap_rects():
-            bbox = rect.to_bottom_left_origin(page.size.height)
+        for image_cell in page.bitmap_resources:
+            bbox = image_cell.rect.to_bounding_box().to_bottom_left_origin(
+                page.dimension.height
+            )
            anchor.append(
                f"[Image {int(bbox.l)}x{int(bbox.b)} to {int(bbox.r)}x{int(bbox.t)}]"
            )

        if len(anchor) == 1:
            anchor.append(
-                f"[Image 0x0 to {int(page.size.width)}x{int(page.size.height)}]"
+                f"[Image 0x0 to {int(page.dimension.width)}x{int(page.dimension.height)}]"
            )

+        # The original prompt sorts the cells; this demo skips that step.
+
        base_text = "\n".join(anchor)

        return (
@@ -181,7 +198,7 @@ def main():
    # Example using the OlmOcr (dynamic prompt) model with Ollama:
    # (uncomment the following lines)
    # pipeline_options.vlm_options = ollama_olmocr_vlm_options(
-    #     model="hf.co/mradermacher/olmOCR-7B-0225-preview-GGUF:Q8_0",
+    #     model="hf.co/allenai/olmOCR-7B-0225-preview-GGUF:Q8_0",
    # )

    # Another possibility is using online services, e.g. watsonx.ai.