Replace Page with SegmentedPage

This commit is contained in:
Shkarupa Alex 2025-07-07 12:41:22 +03:00
parent 3829e9d9ce
commit 1a162066dd
6 changed files with 34 additions and 16 deletions

View File

@ -1,16 +1,16 @@
from enum import Enum
from typing import Any, Callable, Dict, List, Literal, Optional, Union
from docling_core.types.doc.page import SegmentedPage
from pydantic import AnyUrl, BaseModel
from typing_extensions import deprecated
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.base_models import Page
class BaseVlmOptions(BaseModel):
kind: str
prompt: Union[str, Callable[[Page], str]]
prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
scale: float = 2.0
max_size: Optional[int] = None
temperature: float = 0.0

View File

@ -54,7 +54,7 @@ class ApiVlmModel(BasePageModel):
hi_res_image = hi_res_image.convert("RGB")
if callable(self.vlm_options.prompt):
prompt = self.vlm_options.prompt(page)
prompt = self.vlm_options.prompt(page.parsed_page)
else:
prompt = self.vlm_options.prompt

View File

@ -129,7 +129,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
# Define prompt structure
if callable(self.vlm_options.prompt):
user_prompt = self.vlm_options.prompt(page)
user_prompt = self.vlm_options.prompt(page.parsed_page)
else:
user_prompt = self.vlm_options.prompt
prompt = self.formulate_prompt(user_prompt)

View File

@ -85,7 +85,7 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
hi_res_image = hi_res_image.convert("RGB")
if callable(self.vlm_options.prompt):
user_prompt = self.vlm_options.prompt(page)
user_prompt = self.vlm_options.prompt(page.parsed_page)
else:
user_prompt = self.vlm_options.prompt
prompt = self.apply_chat_template(

View File

@ -117,6 +117,7 @@ class VlmPipeline(PaginatedPipeline):
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
if page._backend is not None and page._backend.is_valid():
page.size = page._backend.get_size()
page.parsed_page = page._backend.get_segmented_page()
return page

View File

@ -1,11 +1,13 @@
import logging
import os
from pathlib import Path
from typing import Optional
import requests
from docling_core.types.doc.page import SegmentedPage
from dotenv import load_dotenv
from docling.datamodel.base_models import InputFormat, Page
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
VlmPipelineOptions,
)
@ -53,26 +55,41 @@ def ollama_vlm_options(model: str, prompt: str):
def ollama_olmocr_vlm_options(model: str):
def _dynamic_olmocr_prompt(page: Page):
anchor = [f"Page dimensions: {int(page.size.width)}x{int(page.size.height)}"]
def _dynamic_olmocr_prompt(page: Optional[SegmentedPage]):
if page is None:
return (
"Below is the image of one page of a document. Just return the plain text"
" representation of this document as if you were reading it naturally.\n"
"Do not hallucinate.\n"
)
for cell in page._backend.get_text_cells():
if not cell.text.strip():
anchor = [
f"Page dimensions: {int(page.dimension.width)}x{int(page.dimension.height)}"
]
for text_cell in page.textline_cells:
if not text_cell.text.strip():
continue
bbox = cell.to_bounding_box().to_bottom_left_origin(page.size.height)
anchor.append(f"[{int(bbox.l)}x{int(bbox.b)}] {cell.text}")
bbox = text_cell.rect.to_bounding_box().to_bottom_left_origin(
page.dimension.height
)
anchor.append(f"[{int(bbox.l)}x{int(bbox.b)}] {text_cell.text}")
for rect in page._backend.get_bitmap_rects():
bbox = rect.to_bottom_left_origin(page.size.height)
for image_cell in page.bitmap_resources:
bbox = image_cell.rect.to_bounding_box().to_bottom_left_origin(
page.dimension.height
)
anchor.append(
f"[Image {int(bbox.l)}x{int(bbox.b)} to {int(bbox.r)}x{int(bbox.t)}]"
)
if len(anchor) == 1:
anchor.append(
f"[Image 0x0 to {int(page.size.width)}x{int(page.size.height)}]"
f"[Image 0x0 to {int(page.dimension.width)}x{int(page.dimension.height)}]"
)
# The original prompt sorts the cells; this demo skips that step.
base_text = "\n".join(anchor)
return (
@ -181,7 +198,7 @@ def main():
# Example using the OlmOcr (dynamic prompt) model with Ollama:
# (uncomment the following lines)
# pipeline_options.vlm_options = ollama_olmocr_vlm_options(
# model="hf.co/mradermacher/olmOCR-7B-0225-preview-GGUF:Q8_0",
# model="hf.co/allenai/olmOCR-7B-0225-preview-GGUF:Q8_0",
# )
# Another possibility is using online services, e.g. watsonx.ai.