Move to pipeline_options.layout_options.model

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Christoph Auer 2025-07-08 11:24:06 +02:00
commit af0461e5b1
19 changed files with 327 additions and 63 deletions

View File

@@ -1,3 +1,20 @@
## [v2.40.0](https://github.com/docling-project/docling/releases/tag/v2.40.0) - 2025-07-04
### Feature
* Introduce LayoutOptions to control layout postprocessing behaviour ([#1870](https://github.com/docling-project/docling/issues/1870)) ([`ec6cf6f`](https://github.com/docling-project/docling/commit/ec6cf6f7e8050db30c14f0625d6d5c6bbfeb6aeb))
* Integrate ListItemMarkerProcessor into document assembly ([#1825](https://github.com/docling-project/docling/issues/1825)) ([`56a0e10`](https://github.com/docling-project/docling/commit/56a0e104f76c5ba30ac0fcd247be61f911b560c1))
### Fix
* Secure torch model inits with global locks ([#1884](https://github.com/docling-project/docling/issues/1884)) ([`598c9c5`](https://github.com/docling-project/docling/commit/598c9c53d401de6aac89b7c51bccd57160dace1e))
* Ensure that TesseractOcrModel does not crash in case OSD is not installed ([#1866](https://github.com/docling-project/docling/issues/1866)) ([`ae39a94`](https://github.com/docling-project/docling/commit/ae39a9411a09b2165ac745af358dea644f868e26))
### Performance
* **msexcel:** _find_table_bounds use iter_rows/iter_cols instead of Worksheet.cell ([#1875](https://github.com/docling-project/docling/issues/1875)) ([`13865c0`](https://github.com/docling-project/docling/commit/13865c06f5c564b9e57f3dbb60d26e60c75258b6))
* Move expensive imports closer to usage ([#1863](https://github.com/docling-project/docling/issues/1863)) ([`3089cf2`](https://github.com/docling-project/docling/commit/3089cf2d26918eed4007398a528f53971c19f839))
## [v2.39.0](https://github.com/docling-project/docling/releases/tag/v2.39.0) - 2025-06-27
### Feature

View File

@@ -22,7 +22,7 @@ WHISPER_TINY = InlineAsrNativeWhisperOptions(
verbose=True,
timestamps=True,
word_timestamps=True,
temperatue=0.0,
temperature=0.0,
max_new_tokens=256,
max_time_chunk=30.0,
)
@@ -33,7 +33,7 @@ WHISPER_SMALL = InlineAsrNativeWhisperOptions(
verbose=True,
timestamps=True,
word_timestamps=True,
temperatue=0.0,
temperature=0.0,
max_new_tokens=256,
max_time_chunk=30.0,
)
@@ -44,7 +44,7 @@ WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
verbose=True,
timestamps=True,
word_timestamps=True,
temperatue=0.0,
temperature=0.0,
max_new_tokens=256,
max_time_chunk=30.0,
)
@@ -55,7 +55,7 @@ WHISPER_BASE = InlineAsrNativeWhisperOptions(
verbose=True,
timestamps=True,
word_timestamps=True,
temperatue=0.0,
temperature=0.0,
max_new_tokens=256,
max_time_chunk=30.0,
)
@@ -66,7 +66,7 @@ WHISPER_LARGE = InlineAsrNativeWhisperOptions(
verbose=True,
timestamps=True,
word_timestamps=True,
temperatue=0.0,
temperature=0.0,
max_new_tokens=256,
max_time_chunk=30.0,
)
@@ -77,7 +77,7 @@ WHISPER_TURBO = InlineAsrNativeWhisperOptions(
verbose=True,
timestamps=True,
word_timestamps=True,
temperatue=0.0,
temperature=0.0,
max_new_tokens=256,
max_time_chunk=30.0,
)

View File

@@ -0,0 +1,91 @@
import logging
from enum import Enum
from pathlib import Path
from typing import Optional
from pydantic import BaseModel
from docling.datamodel.accelerator_options import AcceleratorDevice
_log = logging.getLogger(__name__)
class LayoutModelConfig(BaseModel):
name: str
repo_id: str
revision: str
model_path: str
supported_devices: list[AcceleratorDevice] = [
AcceleratorDevice.CPU,
AcceleratorDevice.CUDA,
AcceleratorDevice.MPS,
]
@property
def model_repo_folder(self) -> str:
return self.repo_id.replace("/", "--")
# HuggingFace Layout Models
# Default Docling Layout Model
DOCLING_LAYOUT_V2 = LayoutModelConfig(
name="docling_layout_old",
repo_id="ds4sd/docling-layout-old",
revision="main",
model_path="",
)
DOCLING_LAYOUT_HERON = LayoutModelConfig(
name="docling_layout_heron",
repo_id="ds4sd/docling-layout-heron",
revision="main",
model_path="",
)
DOCLING_LAYOUT_HERON_101 = LayoutModelConfig(
name="docling_layout_heron_101",
repo_id="ds4sd/docling-layout-heron-101",
revision="main",
model_path="",
)
DOCLING_LAYOUT_EGRET_MEDIUM = LayoutModelConfig(
name="docling_layout_egret_medium",
repo_id="ds4sd/docling-layout-egret-medium",
revision="main",
model_path="",
)
DOCLING_LAYOUT_EGRET_LARGE = LayoutModelConfig(
name="docling_layout_egret_large",
repo_id="ds4sd/docling-layout-egret-large",
revision="main",
model_path="",
)
DOCLING_LAYOUT_EGRET_XLARGE = LayoutModelConfig(
name="docling_layout_egret_xlarge",
repo_id="ds4sd/docling-layout-egret-xlarge",
revision="main",
model_path="",
)
# Example for a hypothetical alternative model
# ALTERNATIVE_LAYOUT = LayoutModelConfig(
# name="alternative_layout",
# repo_id="someorg/alternative-layout",
# revision="main",
# model_path="model_artifacts/layout_alt",
# )
class LayoutModelType(str, Enum):
DOCLING_LAYOUT_V2 = "docling_layout_v2"
DOCLING_LAYOUT_OLD = "docling_layout_old"
DOCLING_LAYOUT_HERON = "docling_layout_heron"
DOCLING_LAYOUT_HERON_101 = "docling_layout_heron_101"
DOCLING_LAYOUT_EGRET_MEDIUM = "docling_layout_egret_medium"
DOCLING_LAYOUT_EGRET_LARGE = "docling_layout_egret_large"
DOCLING_LAYOUT_EGRET_XLARGE = "docling_layout_egret_xlarge"
# ALTERNATIVE_LAYOUT = "alternative_layout"
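The configs above can be used directly; a minimal sketch of how a config resolves its Hugging Face Hub location (imports match the `docling.datamodel.layout_model_specs` path used by the layout model hunk below):

```python
from docling.datamodel.layout_model_specs import (
    DOCLING_LAYOUT_HERON,
    DOCLING_LAYOUT_V2,
)

# Each config carries the Hub coordinates of its weights;
# model_repo_folder mirrors the Hub cache naming ("/" -> "--").
print(DOCLING_LAYOUT_HERON.repo_id)            # ds4sd/docling-layout-heron
print(DOCLING_LAYOUT_HERON.model_repo_folder)  # ds4sd--docling-layout-heron
print(DOCLING_LAYOUT_V2.repo_id)               # ds4sd/docling-layout-old
```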

View File

@@ -1,4 +1,5 @@
import logging
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
@@ -274,6 +275,13 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
)
class LayoutOptions(BaseModel):
"""Options for layout processing."""
create_orphan_clusters: bool = True # Whether to create clusters for orphaned cells
model: LayoutModelConfig = DOCLING_LAYOUT_V2
class AsrPipelineOptions(PipelineOptions):
asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
artifacts_path: Optional[Union[Path, str]] = None
@@ -298,6 +306,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
picture_description_options: PictureDescriptionBaseOptions = (
smolvlm_picture_description
)
layout_options: LayoutOptions = LayoutOptions()
images_scale: float = 1.0
generate_page_images: bool = False
@@ -315,8 +324,6 @@
True # Always True since parsed_page is now mandatory
)
layout_model_config: LayoutModelConfig = DOCLING_LAYOUT_V2
class ProcessingPipeline(str, Enum):
STANDARD = "standard"
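Together with the new `layout_model_specs` module, this gives the configuration surface named in the commit title. A hedged sketch of the migration, assuming the import locations shown in the surrounding hunks:

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_HERON
from docling.datamodel.pipeline_options import LayoutOptions, PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
# Replaces the removed top-level layout_model_config field:
pipeline_options.layout_options = LayoutOptions(
    model=DOCLING_LAYOUT_HERON,    # which layout checkpoint to run
    create_orphan_clusters=False,  # skip clusters for unassigned cells
)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)
```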

View File

@@ -1,6 +1,7 @@
from enum import Enum
from typing import Any, Dict, List, Literal, Optional, Union
from typing import Any, Callable, Dict, List, Literal, Optional, Union
from docling_core.types.doc.page import SegmentedPage
from pydantic import AnyUrl, BaseModel
from typing_extensions import deprecated
@@ -9,9 +10,10 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
class BaseVlmOptions(BaseModel):
kind: str
prompt: str
prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
scale: float = 2.0
max_size: Optional[int] = None
temperature: float = 0.0
class ResponseFormat(str, Enum):
@@ -29,6 +31,12 @@ class TransformersModelType(str, Enum):
AUTOMODEL = "automodel"
AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
AUTOMODEL_CAUSALLM = "automodel-causallm"
AUTOMODEL_IMAGETEXTTOTEXT = "automodel-imagetexttotext"
class TransformersPromptStyle(str, Enum):
CHAT = "chat"
RAW = "raw"
class InlineVlmOptions(BaseVlmOptions):
@@ -42,6 +50,7 @@
inference_framework: InferenceFramework
transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
transformers_prompt_style: TransformersPromptStyle = TransformersPromptStyle.CHAT
response_format: ResponseFormat
torch_dtype: Optional[str] = None
@@ -51,7 +60,6 @@
AcceleratorDevice.MPS,
]
temperature: float = 0.0
stop_strings: List[str] = []
extra_generation_config: Dict[str, Any] = {}
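Since `prompt` now accepts a callable over `Optional[SegmentedPage]`, a dynamic prompt can be as small as this sketch (the function name and prompt text are illustrative):

```python
from typing import Optional

from docling_core.types.doc.page import SegmentedPage


def dynamic_prompt(page: Optional[SegmentedPage]) -> str:
    # parsed_page may be missing for a page, so handle None explicitly.
    if page is None:
        return "Convert this page to markdown."
    width = int(page.dimension.width)
    height = int(page.dimension.height)
    return f"Convert this {width}x{height} page to markdown."
```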

View File

@@ -29,12 +29,9 @@ class ApiVlmModel(BasePageModel):
self.timeout = self.vlm_options.timeout
self.concurrency = self.vlm_options.concurrency
self.prompt_content = (
f"This is a page from a document.\n{self.vlm_options.prompt}"
)
self.params = {
**self.vlm_options.params,
"temperature": 0,
"temperature": self.vlm_options.temperature,
}
def __call__(
@@ -56,9 +53,14 @@
if hi_res_image.mode != "RGB":
hi_res_image = hi_res_image.convert("RGB")
if callable(self.vlm_options.prompt):
prompt = self.vlm_options.prompt(page.parsed_page)
else:
prompt = self.vlm_options.prompt
page_tags = api_image_request(
image=hi_res_image,
prompt=self.prompt_content,
prompt=prompt,
url=self.vlm_options.url,
timeout=self.timeout,
headers=self.vlm_options.headers,

View File

@@ -14,7 +14,8 @@ from PIL import Image
from pydantic import BaseModel
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.models.base_model import BaseEnrichmentModel
from docling.datamodel.base_models import ItemAndImageEnrichmentElement
from docling.models.base_model import BaseItemAndImageEnrichmentModel
from docling.models.utils.hf_model_download import download_hf_model
from docling.utils.accelerator_utils import decide_device
@@ -32,7 +33,7 @@ class DocumentPictureClassifierOptions(BaseModel):
kind: Literal["document_picture_classifier"] = "document_picture_classifier"
class DocumentPictureClassifier(BaseEnrichmentModel):
class DocumentPictureClassifier(BaseItemAndImageEnrichmentModel):
"""
A model for classifying pictures in documents.
@@ -135,7 +136,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
def __call__(
self,
doc: DoclingDocument,
element_batch: Iterable[NodeItem],
element_batch: Iterable[ItemAndImageEnrichmentElement],
) -> Iterable[NodeItem]:
"""
Processes a batch of elements and enriches them with classification predictions.
@@ -144,7 +145,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
----------
doc : DoclingDocument
The document containing the elements to be processed.
element_batch : Iterable[NodeItem]
element_batch : Iterable[ItemAndImageEnrichmentElement]
A batch of pictures to classify.
Returns
@@ -155,22 +156,20 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
"""
if not self.enabled:
for element in element_batch:
yield element
yield element.item
return
images: List[Union[Image.Image, np.ndarray]] = []
elements: List[PictureItem] = []
for el in element_batch:
assert isinstance(el, PictureItem)
elements.append(el)
img = el.get_image(doc)
assert img is not None
images.append(img)
assert isinstance(el.item, PictureItem)
elements.append(el.item)
images.append(el.image)
outputs = self.document_picture_classifier.predict(images)
for element, output in zip(elements, outputs):
element.annotations.append(
for item, output in zip(elements, outputs):
item.annotations.append(
PictureClassificationData(
provenance="DocumentPictureClassifier",
predicted_classes=[
@@ -183,4 +182,4 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
)
)
yield element
yield item
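With the classifier now consuming pre-cropped item images, enabling it follows the pattern used in the updated test at the bottom of this commit:

```python
from docling.datamodel.pipeline_options import PdfPipelineOptions

pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_classification = True
# The classifier works on picture crops, so picture images must be generated.
pipeline_options.generate_picture_images = True
pipeline_options.images_scale = 2
```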

View File

@@ -13,6 +13,7 @@ from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
from docling.datamodel.document import ConversionResult
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2, LayoutModelConfig
from docling.datamodel.pipeline_options import LayoutOptions
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.models.utils.hf_model_download import download_hf_model
@@ -49,12 +50,14 @@ class LayoutModel(BasePageModel):
self,
artifacts_path: Optional[Path],
accelerator_options: AcceleratorOptions,
layout_model_config: LayoutModelConfig,
options: LayoutOptions,
):
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
self.options = options
device = decide_device(accelerator_options.device)
self.layout_model_config = layout_model_config
layout_model_config = options.model
model_repo_folder = layout_model_config.model_repo_folder
model_path = layout_model_config.model_path
@@ -182,7 +185,7 @@
# Apply postprocessing
processed_clusters, processed_cells = LayoutPostprocessor(
page, clusters
page, clusters, self.options
).postprocess()
# Note: LayoutPostprocessor updates page.cells and page.parsed_page internally

View File

@@ -13,6 +13,7 @@ from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options_vlm_model import (
InlineVlmOptions,
TransformersModelType,
TransformersPromptStyle,
)
from docling.models.base_model import BasePageModel
from docling.models.utils.hf_model_download import (
@@ -41,6 +42,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
from transformers import (
AutoModel,
AutoModelForCausalLM,
AutoModelForImageTextToText,
AutoModelForVision2Seq,
AutoProcessor,
BitsAndBytesConfig,
@@ -91,6 +93,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
== TransformersModelType.AUTOMODEL_VISION2SEQ
):
model_cls = AutoModelForVision2Seq
elif (
self.vlm_options.transformers_model_type
== TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT
):
model_cls = AutoModelForImageTextToText
self.processor = AutoProcessor.from_pretrained(
artifacts_path,
@@ -128,7 +135,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
)
# Define prompt structure
prompt = self.formulate_prompt()
if callable(self.vlm_options.prompt):
user_prompt = self.vlm_options.prompt(page.parsed_page)
else:
user_prompt = self.vlm_options.prompt
prompt = self.formulate_prompt(user_prompt)
inputs = self.processor(
text=prompt, images=[hi_res_image], return_tensors="pt"
@@ -162,10 +173,13 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
yield page
def formulate_prompt(self) -> str:
def formulate_prompt(self, user_prompt: str) -> str:
"""Formulate a prompt for the VLM."""
if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.RAW:
return user_prompt
elif self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
_log.debug("Using specialized prompt for Phi-4")
# more info here: https://huggingface.co/microsoft/Phi-4-multimodal-instruct#loading-the-model-locally
@@ -173,25 +187,30 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
assistant_prompt = "<|assistant|>"
prompt_suffix = "<|end|>"
prompt = f"{user_prompt}<|image_1|>{self.vlm_options.prompt}{prompt_suffix}{assistant_prompt}"
prompt = f"{user_prompt}<|image_1|>{user_prompt}{prompt_suffix}{assistant_prompt}"
_log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
return prompt
messages = [
{
"role": "user",
"content": [
{
"type": "text",
"text": "This is a page from a document.",
},
{"type": "image"},
{"type": "text", "text": self.vlm_options.prompt},
],
}
]
prompt = self.processor.apply_chat_template(
messages, add_generation_prompt=False
elif self.vlm_options.transformers_prompt_style == TransformersPromptStyle.CHAT:
messages = [
{
"role": "user",
"content": [
{
"type": "text",
"text": "This is a page from a document.",
},
{"type": "image"},
{"type": "text", "text": user_prompt},
],
}
]
prompt = self.processor.apply_chat_template(
messages, add_generation_prompt=False
)
return prompt
raise RuntimeError(
f"Uknown prompt style `{self.vlm_options.transformers_prompt_style}`. Valid values are {', '.join(s.value for s in TransformersPromptStyle)}."
)
return prompt
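(The `dolphin_oneshot` options in the example file further down exercise `TransformersPromptStyle.RAW` end to end.)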

View File

@@ -56,8 +56,6 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
elif (artifacts_path / repo_cache_folder).exists():
artifacts_path = artifacts_path / repo_cache_folder
self.param_question = vlm_options.prompt
## Load the model
self.vlm_model, self.processor = load(artifacts_path)
self.config = load_config(artifacts_path)
@@ -86,8 +84,12 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
if hi_res_image.mode != "RGB":
hi_res_image = hi_res_image.convert("RGB")
if callable(self.vlm_options.prompt):
user_prompt = self.vlm_options.prompt(page.parsed_page)
else:
user_prompt = self.vlm_options.prompt
prompt = self.apply_chat_template(
self.processor, self.config, self.param_question, num_images=1
self.processor, self.config, user_prompt, num_images=1
)
start_time = time.time()

View File

@@ -81,7 +81,7 @@ class StandardPdfPipeline(PaginatedPipeline):
LayoutModel(
artifacts_path=artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
layout_model_config=pipeline_options.layout_model_config,
options=pipeline_options.layout_options,
),
# Table structure model
TableStructureModel(
@@ -130,6 +130,7 @@ class StandardPdfPipeline(PaginatedPipeline):
if (
self.pipeline_options.do_formula_enrichment
or self.pipeline_options.do_code_enrichment
or self.pipeline_options.do_picture_classification
or self.pipeline_options.do_picture_description
):
self.keep_backend = True

View File

@@ -117,6 +117,7 @@ class VlmPipeline(PaginatedPipeline):
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
if page._backend is not None and page._backend.is_valid():
page.size = page._backend.get_size()
page.parsed_page = page._backend.get_segmented_page()
return page
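With `parsed_page` populated from the backend, callable prompts (see `BaseVlmOptions` above) now receive the actual `SegmentedPage` during VLM inference instead of `None`.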

View File

@@ -9,6 +9,7 @@ from docling_core.types.doc.page import TextCell
from rtree import index
from docling.datamodel.base_models import BoundingBox, Cluster, Page
from docling.datamodel.pipeline_options import LayoutOptions
_log = logging.getLogger(__name__)
@@ -194,12 +195,16 @@ class LayoutPostprocessor:
DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
}
def __init__(self, page: Page, clusters: List[Cluster]) -> None:
def __init__(
self, page: Page, clusters: List[Cluster], options: LayoutOptions
) -> None:
"""Initialize processor with page and clusters."""
self.cells = page.cells
self.page = page
self.page_size = page.size
self.all_clusters = clusters
self.options = options
self.regular_clusters = [
c for c in clusters if c.label not in self.SPECIAL_TYPES
]
@@ -267,7 +272,7 @@
# Handle orphaned cells
unassigned = self._find_unassigned_cells(clusters)
if unassigned:
if unassigned and self.options.create_orphan_clusters:
next_id = max((c.id for c in self.all_clusters), default=0) + 1
orphan_clusters = []
for i, cell in enumerate(unassigned):

View File

@@ -14,11 +14,18 @@ from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
from tabulate import tabulate
from docling.datamodel import vlm_model_specs
from docling.datamodel.accelerator_options import AcceleratorDevice
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
VlmPipelineOptions,
)
from docling.datamodel.pipeline_options_vlm_model import InferenceFramework
from docling.datamodel.pipeline_options_vlm_model import (
InferenceFramework,
InlineVlmOptions,
ResponseFormat,
TransformersModelType,
TransformersPromptStyle,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
@@ -101,6 +108,33 @@ if __name__ == "__main__":
out_path = Path("scratch")
out_path.mkdir(parents=True, exist_ok=True)
## Definition of more inline models
llava_qwen = InlineVlmOptions(
repo_id="llava-hf/llava-interleave-qwen-0.5b-hf",
# prompt="Read text in the image.",
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
# prompt="Parse the reading order of this document.",
response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.TRANSFORMERS,
transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
supported_devices=[AcceleratorDevice.CUDA, AcceleratorDevice.CPU],
scale=2.0,
temperature=0.0,
)
# Note that this is not the expected way of using the Dolphin model, but it shows the usage of a raw prompt.
dolphin_oneshot = InlineVlmOptions(
repo_id="ByteDance/Dolphin",
prompt="<s>Read text in the image. <Answer/>",
response_format=ResponseFormat.MARKDOWN,
inference_framework=InferenceFramework.TRANSFORMERS,
transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
transformers_prompt_style=TransformersPromptStyle.RAW,
supported_devices=[AcceleratorDevice.CUDA, AcceleratorDevice.CPU],
scale=2.0,
temperature=0.0,
)
## Use VlmPipeline
pipeline_options = VlmPipelineOptions()
pipeline_options.generate_page_images = True
@@ -121,6 +155,9 @@ if __name__ == "__main__":
vlm_model_specs.GRANITE_VISION_TRANSFORMERS,
vlm_model_specs.PHI4_TRANSFORMERS,
vlm_model_specs.PIXTRAL_12B_TRANSFORMERS,
## More inline models
dolphin_oneshot,
llava_qwen,
]
# Remove MLX models if not on Mac

View File

@@ -1,8 +1,10 @@
import logging
import os
from pathlib import Path
from typing import Optional
import requests
from docling_core.types.doc.page import SegmentedPage
from dotenv import load_dotenv
from docling.datamodel.base_models import InputFormat
@@ -32,6 +34,69 @@ def lms_vlm_options(model: str, prompt: str, format: ResponseFormat):
return options
#### Using LM Studio with OlmOcr model
def lms_olmocr_vlm_options(model: str):
def _dynamic_olmocr_prompt(page: Optional[SegmentedPage]):
if page is None:
return (
"Below is the image of one page of a document. Just return the plain text"
" representation of this document as if you were reading it naturally.\n"
"Do not hallucinate.\n"
)
anchor = [
f"Page dimensions: {int(page.dimension.width)}x{int(page.dimension.height)}"
]
for text_cell in page.textline_cells:
if not text_cell.text.strip():
continue
bbox = text_cell.rect.to_bounding_box().to_bottom_left_origin(
page.dimension.height
)
anchor.append(f"[{int(bbox.l)}x{int(bbox.b)}] {text_cell.text}")
for image_cell in page.bitmap_resources:
bbox = image_cell.rect.to_bounding_box().to_bottom_left_origin(
page.dimension.height
)
anchor.append(
f"[Image {int(bbox.l)}x{int(bbox.b)} to {int(bbox.r)}x{int(bbox.t)}]"
)
if len(anchor) == 1:
anchor.append(
f"[Image 0x0 to {int(page.dimension.width)}x{int(page.dimension.height)}]"
)
# The original OlmOcr prompt sorts the cells; we skip that in this demo.
base_text = "\n".join(anchor)
return (
f"Below is the image of one page of a document, as well as some raw textual"
f" content that was previously extracted for it. Just return the plain text"
f" representation of this document as if you were reading it naturally.\n"
f"Do not hallucinate.\n"
f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
)
options = ApiVlmOptions(
url="http://localhost:1234/v1/chat/completions",
params=dict(
model=model,
),
prompt=_dynamic_olmocr_prompt,
timeout=90,
scale=1.0,
max_size=1024, # from OlmOcr pipeline
response_format=ResponseFormat.MARKDOWN,
)
return options
#### Using Ollama
@@ -123,6 +188,12 @@ def main():
# format=ResponseFormat.MARKDOWN,
# )
# Example using the OlmOcr (dynamic prompt) model with LM Studio:
# (uncomment the following lines)
# pipeline_options.vlm_options = lms_olmocr_vlm_options(
# model="hf.co/lmstudio-community/olmOCR-7B-0225-preview-GGUF",
# )
# Example using the Granite Vision model with Ollama:
# (uncomment the following lines)
# pipeline_options.vlm_options = ollama_vlm_options(

View File

@@ -77,7 +77,7 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi
=== "RHEL"
```console
dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel
dnf install tesseract tesseract-devel tesseract-langpack-eng tesseract-osd leptonica-devel
TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
```
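A quick sanity check that the OSD data is picked up (not part of the official docs, just a common Tesseract idiom): `osd` should appear in the listed languages.

```console
tesseract --list-langs
```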

View File

@@ -1,6 +1,6 @@
[project]
name = "docling"
version = "2.39.0" # DO NOT EDIT, updated automatically
version = "2.40.0" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
license = "MIT"
keywords = [

View File

@@ -17,8 +17,9 @@ def get_converter():
pipeline_options.do_table_structure = False
pipeline_options.do_code_enrichment = False
pipeline_options.do_formula_enrichment = False
pipeline_options.generate_picture_images = False
pipeline_options.generate_page_images = False
pipeline_options.do_picture_classification = True
pipeline_options.generate_picture_images = True
pipeline_options.images_scale = 2
converter = DocumentConverter(

uv.lock (generated)
View File

@@ -805,7 +805,7 @@ wheels = [
[[package]]
name = "docling"
version = "2.39.0"
version = "2.40.0"
source = { editable = "." }
dependencies = [
{ name = "beautifulsoup4" },