Move to pipeline_options.layout_options.model

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Christoph Auer 2025-07-08 11:24:06 +02:00
commit af0461e5b1
19 changed files with 327 additions and 63 deletions

View File

@@ -1,3 +1,20 @@
+## [v2.40.0](https://github.com/docling-project/docling/releases/tag/v2.40.0) - 2025-07-04
+
+### Feature
+
+* Introduce LayoutOptions to control layout postprocessing behaviour ([#1870](https://github.com/docling-project/docling/issues/1870)) ([`ec6cf6f`](https://github.com/docling-project/docling/commit/ec6cf6f7e8050db30c14f0625d6d5c6bbfeb6aeb))
+* Integrate ListItemMarkerProcessor into document assembly ([#1825](https://github.com/docling-project/docling/issues/1825)) ([`56a0e10`](https://github.com/docling-project/docling/commit/56a0e104f76c5ba30ac0fcd247be61f911b560c1))
+
+### Fix
+
+* Secure torch model inits with global locks ([#1884](https://github.com/docling-project/docling/issues/1884)) ([`598c9c5`](https://github.com/docling-project/docling/commit/598c9c53d401de6aac89b7c51bccd57160dace1e))
+* Ensure that TesseractOcrModel does not crash in case OSD is not installed ([#1866](https://github.com/docling-project/docling/issues/1866)) ([`ae39a94`](https://github.com/docling-project/docling/commit/ae39a9411a09b2165ac745af358dea644f868e26))
+
+### Performance
+
+* **msexcel:** _find_table_bounds use iter_rows/iter_cols instead of Worksheet.cell ([#1875](https://github.com/docling-project/docling/issues/1875)) ([`13865c0`](https://github.com/docling-project/docling/commit/13865c06f5c564b9e57f3dbb60d26e60c75258b6))
+* Move expensive imports closer to usage ([#1863](https://github.com/docling-project/docling/issues/1863)) ([`3089cf2`](https://github.com/docling-project/docling/commit/3089cf2d26918eed4007398a528f53971c19f839))
+
 ## [v2.39.0](https://github.com/docling-project/docling/releases/tag/v2.39.0) - 2025-06-27
 
 ### Feature

View File

@@ -22,7 +22,7 @@ WHISPER_TINY = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -33,7 +33,7 @@ WHISPER_SMALL = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -44,7 +44,7 @@ WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -55,7 +55,7 @@ WHISPER_BASE = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -66,7 +66,7 @@ WHISPER_LARGE = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -77,7 +77,7 @@ WHISPER_TURBO = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
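
The six hunks above are a pure typo fix: the misspelled `temperatue` field becomes `temperature` in every predefined Whisper spec. A minimal usage sketch, assuming only the names visible elsewhere in this diff:

```python
# Sketch only; imports and field names are taken from other hunks in this commit.
from docling.datamodel import asr_model_specs
from docling.datamodel.pipeline_options import AsrPipelineOptions

options = AsrPipelineOptions()
options.asr_options = asr_model_specs.WHISPER_TURBO  # temperature=0.0, now spelled correctly
```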

View File

@@ -0,0 +1,91 @@
+import logging
+from enum import Enum
+from pathlib import Path
+from typing import Optional
+
+from pydantic import BaseModel
+
+from docling.datamodel.accelerator_options import AcceleratorDevice
+
+_log = logging.getLogger(__name__)
+
+
+class LayoutModelConfig(BaseModel):
+    name: str
+    repo_id: str
+    revision: str
+    model_path: str
+    supported_devices: list[AcceleratorDevice] = [
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.MPS,
+    ]
+
+    @property
+    def model_repo_folder(self) -> str:
+        return self.repo_id.replace("/", "--")
+
+
+# HuggingFace Layout Models
+
+# Default Docling Layout Model
+DOCLING_LAYOUT_V2 = LayoutModelConfig(
+    name="docling_layout_old",
+    repo_id="ds4sd/docling-layout-old",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_HERON = LayoutModelConfig(
+    name="docling_layout_heron",
+    repo_id="ds4sd/docling-layout-heron",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_HERON_101 = LayoutModelConfig(
+    name="docling_layout_heron_101",
+    repo_id="ds4sd/docling-layout-heron-101",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_EGRET_MEDIUM = LayoutModelConfig(
+    name="docling_layout_egret_medium",
+    repo_id="ds4sd/docling-layout-egret-medium",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_EGRET_LARGE = LayoutModelConfig(
+    name="docling_layout_egret_large",
+    repo_id="ds4sd/docling-layout-egret-large",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_EGRET_XLARGE = LayoutModelConfig(
+    name="docling_layout_egret_xlarge",
+    repo_id="ds4sd/docling-layout-egret-xlarge",
+    revision="main",
+    model_path="",
+)
+
+# Example for a hypothetical alternative model
+# ALTERNATIVE_LAYOUT = LayoutModelConfig(
+#     name="alternative_layout",
+#     repo_id="someorg/alternative-layout",
+#     revision="main",
+#     model_path="model_artifacts/layout_alt",
+# )
+
+
+class LayoutModelType(str, Enum):
+    DOCLING_LAYOUT_V2 = "docling_layout_v2"
+    DOCLING_LAYOUT_OLD = "docling_layout_old"
+    DOCLING_LAYOUT_HERON = "docling_layout_heron"
+    DOCLING_LAYOUT_HERON_101 = "docling_layout_heron_101"
+    DOCLING_LAYOUT_EGRET_MEDIUM = "docling_layout_egret_medium"
+    DOCLING_LAYOUT_EGRET_LARGE = "docling_layout_egret_large"
+    DOCLING_LAYOUT_EGRET_XLARGE = "docling_layout_egret_xlarge"
+    # ALTERNATIVE_LAYOUT = "alternative_layout"
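
This new module is the registry the commit builds on: each `LayoutModelConfig` names a Hugging Face repo, and `model_repo_folder` derives the on-disk cache folder from the repo id. A small sketch of that mapping:

```python
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_EGRET_MEDIUM

config = DOCLING_LAYOUT_EGRET_MEDIUM
print(config.repo_id)            # "ds4sd/docling-layout-egret-medium"
print(config.model_repo_folder)  # "ds4sd--docling-layout-egret-medium"
```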

View File

@@ -1,4 +1,5 @@
 import logging
+from datetime import datetime
 from enum import Enum
 from pathlib import Path
 from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
@@ -274,6 +275,13 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
     )
 
+
+class LayoutOptions(BaseModel):
+    """Options for layout processing."""
+
+    create_orphan_clusters: bool = True  # Whether to create clusters for orphaned cells
+    model: LayoutModelConfig = DOCLING_LAYOUT_V2
+
+
 class AsrPipelineOptions(PipelineOptions):
     asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
     artifacts_path: Optional[Union[Path, str]] = None
@@ -298,6 +306,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     picture_description_options: PictureDescriptionBaseOptions = (
         smolvlm_picture_description
     )
+    layout_options: LayoutOptions = LayoutOptions()
 
     images_scale: float = 1.0
     generate_page_images: bool = False
@@ -315,8 +324,6 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
         True  # Always True since parsed_page is now mandatory
     )
 
-    layout_model_config: LayoutModelConfig = DOCLING_LAYOUT_V2
-
 
 class ProcessingPipeline(str, Enum):
     STANDARD = "standard"
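
This is the change the commit title refers to: `PdfPipelineOptions.layout_model_config` is removed, and the model now lives under `pipeline_options.layout_options.model`. A migration sketch, using only names shown in this diff:

```python
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_HERON
from docling.datamodel.pipeline_options import LayoutOptions, PdfPipelineOptions

pipeline_options = PdfPipelineOptions()

# Before this commit (field removed above):
# pipeline_options.layout_model_config = DOCLING_LAYOUT_HERON

# After this commit:
pipeline_options.layout_options = LayoutOptions(model=DOCLING_LAYOUT_HERON)
```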

View File

@@ -1,6 +1,7 @@
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, Union
 
+from docling_core.types.doc.page import SegmentedPage
 from pydantic import AnyUrl, BaseModel
 from typing_extensions import deprecated
@@ -9,9 +10,10 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
 
 class BaseVlmOptions(BaseModel):
     kind: str
-    prompt: str
+    prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
     scale: float = 2.0
     max_size: Optional[int] = None
+    temperature: float = 0.0
 
 
 class ResponseFormat(str, Enum):
@@ -29,6 +31,12 @@ class TransformersModelType(str, Enum):
     AUTOMODEL = "automodel"
     AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
     AUTOMODEL_CAUSALLM = "automodel-causallm"
+    AUTOMODEL_IMAGETEXTTOTEXT = "automodel-imagetexttotext"
+
+
+class TransformersPromptStyle(str, Enum):
+    CHAT = "chat"
+    RAW = "raw"
 
 
 class InlineVlmOptions(BaseVlmOptions):
@@ -42,6 +50,7 @@ class InlineVlmOptions(BaseVlmOptions):
     inference_framework: InferenceFramework
     transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
+    transformers_prompt_style: TransformersPromptStyle = TransformersPromptStyle.CHAT
     response_format: ResponseFormat
 
     torch_dtype: Optional[str] = None
@@ -51,7 +60,6 @@ class InlineVlmOptions(BaseVlmOptions):
         AcceleratorDevice.MPS,
     ]
 
-    temperature: float = 0.0
     stop_strings: List[str] = []
     extra_generation_config: Dict[str, Any] = {}
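
Two behavioural changes here: `prompt` may now be a plain string or a callable receiving the page's `SegmentedPage` (or `None`), and `temperature` moves from `InlineVlmOptions` up to `BaseVlmOptions`, so API-based options inherit it too. A hedged sketch of a callable prompt, assuming `ApiVlmOptions` lives alongside these classes; the endpoint URL and model name are placeholders:

```python
from typing import Optional

from docling_core.types.doc.page import SegmentedPage

from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, ResponseFormat


def page_aware_prompt(page: Optional[SegmentedPage]) -> str:
    # Fall back to a generic prompt when no parsed page is available.
    if page is None:
        return "Convert this page to markdown."
    width = int(page.dimension.width)
    height = int(page.dimension.height)
    return f"Convert this {width}x{height} page to markdown."


options = ApiVlmOptions(
    url="http://localhost:1234/v1/chat/completions",  # placeholder endpoint
    params=dict(model="some-local-model"),            # placeholder model name
    prompt=page_aware_prompt,   # callable prompt, enabled by this commit
    temperature=0.0,            # now defined on BaseVlmOptions
    response_format=ResponseFormat.MARKDOWN,
)
```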

View File

@@ -29,12 +29,9 @@ class ApiVlmModel(BasePageModel):
         self.timeout = self.vlm_options.timeout
         self.concurrency = self.vlm_options.concurrency
-        self.prompt_content = (
-            f"This is a page from a document.\n{self.vlm_options.prompt}"
-        )
         self.params = {
             **self.vlm_options.params,
-            "temperature": 0,
+            "temperature": self.vlm_options.temperature,
         }
 
     def __call__(
@@ -56,9 +53,14 @@ class ApiVlmModel(BasePageModel):
                 if hi_res_image.mode != "RGB":
                     hi_res_image = hi_res_image.convert("RGB")
 
+                if callable(self.vlm_options.prompt):
+                    prompt = self.vlm_options.prompt(page.parsed_page)
+                else:
+                    prompt = self.vlm_options.prompt
+
                 page_tags = api_image_request(
                     image=hi_res_image,
-                    prompt=self.prompt_content,
+                    prompt=prompt,
                     url=self.vlm_options.url,
                     timeout=self.timeout,
                     headers=self.vlm_options.headers,

View File

@@ -14,7 +14,8 @@ from PIL import Image
 from pydantic import BaseModel
 
 from docling.datamodel.accelerator_options import AcceleratorOptions
-from docling.models.base_model import BaseEnrichmentModel
+from docling.datamodel.base_models import ItemAndImageEnrichmentElement
+from docling.models.base_model import BaseItemAndImageEnrichmentModel
 from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
@@ -32,7 +33,7 @@ class DocumentPictureClassifierOptions(BaseModel):
     kind: Literal["document_picture_classifier"] = "document_picture_classifier"
 
 
-class DocumentPictureClassifier(BaseEnrichmentModel):
+class DocumentPictureClassifier(BaseItemAndImageEnrichmentModel):
     """
     A model for classifying pictures in documents.
@@ -135,7 +136,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
     def __call__(
         self,
         doc: DoclingDocument,
-        element_batch: Iterable[NodeItem],
+        element_batch: Iterable[ItemAndImageEnrichmentElement],
     ) -> Iterable[NodeItem]:
         """
         Processes a batch of elements and enriches them with classification predictions.
@@ -144,7 +145,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
         ----------
         doc : DoclingDocument
             The document containing the elements to be processed.
-        element_batch : Iterable[NodeItem]
+        element_batch : Iterable[ItemAndImageEnrichmentElement]
             A batch of pictures to classify.
 
         Returns
@@ -155,22 +156,20 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
         """
         if not self.enabled:
             for element in element_batch:
-                yield element
+                yield element.item
             return
 
         images: List[Union[Image.Image, np.ndarray]] = []
         elements: List[PictureItem] = []
         for el in element_batch:
-            assert isinstance(el, PictureItem)
-            elements.append(el)
-            img = el.get_image(doc)
-            assert img is not None
-            images.append(img)
+            assert isinstance(el.item, PictureItem)
+            elements.append(el.item)
+            images.append(el.image)
 
         outputs = self.document_picture_classifier.predict(images)
 
-        for element, output in zip(elements, outputs):
-            element.annotations.append(
+        for item, output in zip(elements, outputs):
+            item.annotations.append(
                 PictureClassificationData(
                     provenance="DocumentPictureClassifier",
                     predicted_classes=[
@@ -183,4 +182,4 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
                 )
             )
 
-            yield element
+            yield item

View File

@@ -13,6 +13,7 @@ from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2, LayoutModelConfig
+from docling.datamodel.pipeline_options import LayoutOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
 from docling.models.utils.hf_model_download import download_hf_model
@@ -49,12 +50,14 @@ class LayoutModel(BasePageModel):
         self,
         artifacts_path: Optional[Path],
         accelerator_options: AcceleratorOptions,
-        layout_model_config: LayoutModelConfig,
+        options: LayoutOptions,
     ):
         from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
 
+        self.options = options
+
         device = decide_device(accelerator_options.device)
-        self.layout_model_config = layout_model_config
+        layout_model_config = options.model
         model_repo_folder = layout_model_config.model_repo_folder
         model_path = layout_model_config.model_path
@@ -182,7 +185,7 @@ class LayoutModel(BasePageModel):
                     # Apply postprocessing
 
                     processed_clusters, processed_cells = LayoutPostprocessor(
-                        page, clusters
+                        page, clusters, self.options
                     ).postprocess()
                     # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally

View File

@@ -13,6 +13,7 @@ from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import (
     InlineVlmOptions,
     TransformersModelType,
+    TransformersPromptStyle,
 )
 from docling.models.base_model import BasePageModel
 from docling.models.utils.hf_model_download import (
@@ -41,6 +42,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
         from transformers import (
             AutoModel,
             AutoModelForCausalLM,
+            AutoModelForImageTextToText,
             AutoModelForVision2Seq,
             AutoProcessor,
             BitsAndBytesConfig,
@@ -91,6 +93,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
                 == TransformersModelType.AUTOMODEL_VISION2SEQ
             ):
                 model_cls = AutoModelForVision2Seq
+            elif (
+                self.vlm_options.transformers_model_type
+                == TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT
+            ):
+                model_cls = AutoModelForImageTextToText
 
             self.processor = AutoProcessor.from_pretrained(
                 artifacts_path,
@@ -128,7 +135,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
                     )
 
                     # Define prompt structure
-                    prompt = self.formulate_prompt()
+                    if callable(self.vlm_options.prompt):
+                        user_prompt = self.vlm_options.prompt(page.parsed_page)
+                    else:
+                        user_prompt = self.vlm_options.prompt
+                    prompt = self.formulate_prompt(user_prompt)
 
                     inputs = self.processor(
                         text=prompt, images=[hi_res_image], return_tensors="pt"
@@ -162,10 +173,13 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
 
                     yield page
 
-    def formulate_prompt(self) -> str:
+    def formulate_prompt(self, user_prompt: str) -> str:
         """Formulate a prompt for the VLM."""
 
-        if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
+        if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.RAW:
+            return user_prompt
+
+        elif self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
             _log.debug("Using specialized prompt for Phi-4")
             # more info here: https://huggingface.co/microsoft/Phi-4-multimodal-instruct#loading-the-model-locally
@@ -173,11 +187,12 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
             assistant_prompt = "<|assistant|>"
             prompt_suffix = "<|end|>"
 
-            prompt = f"{user_prompt}<|image_1|>{self.vlm_options.prompt}{prompt_suffix}{assistant_prompt}"
+            prompt = f"{user_prompt}<|image_1|>{user_prompt}{prompt_suffix}{assistant_prompt}"
             _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
 
             return prompt
 
+        elif self.vlm_options.transformers_prompt_style == TransformersPromptStyle.CHAT:
             messages = [
                 {
                     "role": "user",
@@ -187,7 +202,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
                         "text": "This is a page from a document.",
                     },
                     {"type": "image"},
-                    {"type": "text", "text": self.vlm_options.prompt},
+                    {"type": "text", "text": user_prompt},
                 ],
             }
         ]
@@ -195,3 +210,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
             messages, add_generation_prompt=False
         )
         return prompt
+
+        raise RuntimeError(
+            f"Unknown prompt style `{self.vlm_options.transformers_prompt_style}`. Valid values are {', '.join(s.value for s in TransformersPromptStyle)}."
+        )

View File

@@ -56,8 +56,6 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
         elif (artifacts_path / repo_cache_folder).exists():
             artifacts_path = artifacts_path / repo_cache_folder
 
-        self.param_question = vlm_options.prompt
-
         ## Load the model
         self.vlm_model, self.processor = load(artifacts_path)
         self.config = load_config(artifacts_path)
@@ -86,8 +84,12 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                     if hi_res_image.mode != "RGB":
                         hi_res_image = hi_res_image.convert("RGB")
 
+                    if callable(self.vlm_options.prompt):
+                        user_prompt = self.vlm_options.prompt(page.parsed_page)
+                    else:
+                        user_prompt = self.vlm_options.prompt
                     prompt = self.apply_chat_template(
-                        self.processor, self.config, self.param_question, num_images=1
+                        self.processor, self.config, user_prompt, num_images=1
                     )
 
                     start_time = time.time()
View File

@@ -81,7 +81,7 @@ class StandardPdfPipeline(PaginatedPipeline):
             LayoutModel(
                 artifacts_path=artifacts_path,
                 accelerator_options=pipeline_options.accelerator_options,
-                layout_model_config=pipeline_options.layout_model_config,
+                options=pipeline_options.layout_options,
             ),
             # Table structure model
             TableStructureModel(
@@ -130,6 +130,7 @@ class StandardPdfPipeline(PaginatedPipeline):
         if (
             self.pipeline_options.do_formula_enrichment
             or self.pipeline_options.do_code_enrichment
+            or self.pipeline_options.do_picture_classification
             or self.pipeline_options.do_picture_description
         ):
             self.keep_backend = True
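
The second hunk matters for enrichment: the backend is now also kept alive when picture classification is enabled, since the reworked classifier consumes pre-cropped page images. A configuration sketch that exercises this path, mirroring the test change near the end of this diff:

```python
from docling.datamodel.pipeline_options import PdfPipelineOptions

pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_classification = True
pipeline_options.generate_picture_images = True  # classifier needs cropped picture images
pipeline_options.images_scale = 2
```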

View File

@@ -117,6 +117,7 @@ class VlmPipeline(PaginatedPipeline):
                 page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
             if page._backend is not None and page._backend.is_valid():
                 page.size = page._backend.get_size()
+                page.parsed_page = page._backend.get_segmented_page()
 
         return page

View File

@@ -9,6 +9,7 @@ from docling_core.types.doc.page import TextCell
 from rtree import index
 
 from docling.datamodel.base_models import BoundingBox, Cluster, Page
+from docling.datamodel.pipeline_options import LayoutOptions
 
 _log = logging.getLogger(__name__)
@@ -194,12 +195,16 @@ class LayoutPostprocessor:
         DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
     }
 
-    def __init__(self, page: Page, clusters: List[Cluster]) -> None:
+    def __init__(
+        self, page: Page, clusters: List[Cluster], options: LayoutOptions
+    ) -> None:
         """Initialize processor with page and clusters."""
         self.cells = page.cells
         self.page = page
         self.page_size = page.size
         self.all_clusters = clusters
+        self.options = options
+
         self.regular_clusters = [
             c for c in clusters if c.label not in self.SPECIAL_TYPES
         ]
@@ -267,7 +272,7 @@ class LayoutPostprocessor:
         # Handle orphaned cells
         unassigned = self._find_unassigned_cells(clusters)
-        if unassigned:
+        if unassigned and self.options.create_orphan_clusters:
             next_id = max((c.id for c in self.all_clusters), default=0) + 1
             orphan_clusters = []
             for i, cell in enumerate(unassigned):
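
`LayoutPostprocessor` now receives the full `LayoutOptions`, and setting `create_orphan_clusters=False` skips wrapping unassigned cells in new clusters. A sketch of opting out:

```python
from docling.datamodel.pipeline_options import LayoutOptions, PdfPipelineOptions

pipeline_options = PdfPipelineOptions()
# Drop cells that layout postprocessing could not assign to any cluster,
# instead of creating new "orphan" clusters for them:
pipeline_options.layout_options = LayoutOptions(create_orphan_clusters=False)
```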

View File

@@ -14,11 +14,18 @@ from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
 from tabulate import tabulate
 
 from docling.datamodel import vlm_model_specs
+from docling.datamodel.accelerator_options import AcceleratorDevice
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     VlmPipelineOptions,
 )
-from docling.datamodel.pipeline_options_vlm_model import InferenceFramework
+from docling.datamodel.pipeline_options_vlm_model import (
+    InferenceFramework,
+    InlineVlmOptions,
+    ResponseFormat,
+    TransformersModelType,
+    TransformersPromptStyle,
+)
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
@@ -101,6 +108,33 @@ if __name__ == "__main__":
     out_path = Path("scratch")
     out_path.mkdir(parents=True, exist_ok=True)
 
+    ## Definition of more inline models
+    llava_qwen = InlineVlmOptions(
+        repo_id="llava-hf/llava-interleave-qwen-0.5b-hf",
+        # prompt="Read text in the image.",
+        prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+        # prompt="Parse the reading order of this document.",
+        response_format=ResponseFormat.MARKDOWN,
+        inference_framework=InferenceFramework.TRANSFORMERS,
+        transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+        supported_devices=[AcceleratorDevice.CUDA, AcceleratorDevice.CPU],
+        scale=2.0,
+        temperature=0.0,
+    )
+
+    # Note that this is not the expected way of using the Dolphin model, but it shows the usage of a raw prompt.
+    dolphin_oneshot = InlineVlmOptions(
+        repo_id="ByteDance/Dolphin",
+        prompt="<s>Read text in the image. <Answer/>",
+        response_format=ResponseFormat.MARKDOWN,
+        inference_framework=InferenceFramework.TRANSFORMERS,
+        transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+        transformers_prompt_style=TransformersPromptStyle.RAW,
+        supported_devices=[AcceleratorDevice.CUDA, AcceleratorDevice.CPU],
+        scale=2.0,
+        temperature=0.0,
+    )
+
     ## Use VlmPipeline
     pipeline_options = VlmPipelineOptions()
     pipeline_options.generate_page_images = True
@@ -121,6 +155,9 @@ if __name__ == "__main__":
         vlm_model_specs.GRANITE_VISION_TRANSFORMERS,
         vlm_model_specs.PHI4_TRANSFORMERS,
         vlm_model_specs.PIXTRAL_12B_TRANSFORMERS,
+        ## More inline models
+        dolphin_oneshot,
+        llava_qwen,
     ]
 
     # Remove MLX models if not on Mac
# Remove MLX models if not on Mac # Remove MLX models if not on Mac

View File

@@ -1,8 +1,10 @@
 import logging
 import os
 from pathlib import Path
+from typing import Optional
 
 import requests
+from docling_core.types.doc.page import SegmentedPage
 from dotenv import load_dotenv
 
 from docling.datamodel.base_models import InputFormat
@@ -32,6 +34,69 @@ def lms_vlm_options(model: str, prompt: str, format: ResponseFormat):
     return options
 
 
+#### Using LM Studio with OlmOcr model
+
+
+def lms_olmocr_vlm_options(model: str):
+    def _dynamic_olmocr_prompt(page: Optional[SegmentedPage]):
+        if page is None:
+            return (
+                "Below is the image of one page of a document. Just return the plain text"
+                " representation of this document as if you were reading it naturally.\n"
+                "Do not hallucinate.\n"
+            )
+
+        anchor = [
+            f"Page dimensions: {int(page.dimension.width)}x{int(page.dimension.height)}"
+        ]
+
+        for text_cell in page.textline_cells:
+            if not text_cell.text.strip():
+                continue
+            bbox = text_cell.rect.to_bounding_box().to_bottom_left_origin(
+                page.dimension.height
+            )
+            anchor.append(f"[{int(bbox.l)}x{int(bbox.b)}] {text_cell.text}")
+
+        for image_cell in page.bitmap_resources:
+            bbox = image_cell.rect.to_bounding_box().to_bottom_left_origin(
+                page.dimension.height
+            )
+            anchor.append(
+                f"[Image {int(bbox.l)}x{int(bbox.b)} to {int(bbox.r)}x{int(bbox.t)}]"
+            )
+
+        if len(anchor) == 1:
+            anchor.append(
+                f"[Image 0x0 to {int(page.dimension.width)}x{int(page.dimension.height)}]"
+            )
+
+        # Original prompt uses cells sorting. We are skipping it in this demo.
+        base_text = "\n".join(anchor)
+
+        return (
+            f"Below is the image of one page of a document, as well as some raw textual"
+            f" content that was previously extracted for it. Just return the plain text"
+            f" representation of this document as if you were reading it naturally.\n"
+            f"Do not hallucinate.\n"
+            f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
+        )
+
+    options = ApiVlmOptions(
+        url="http://localhost:1234/v1/chat/completions",
+        params=dict(
+            model=model,
+        ),
+        prompt=_dynamic_olmocr_prompt,
+        timeout=90,
+        scale=1.0,
+        max_size=1024,  # from OlmOcr pipeline
+        response_format=ResponseFormat.MARKDOWN,
+    )
+    return options
+
+
 #### Using Ollama
@@ -123,6 +188,12 @@ def main():
     #     format=ResponseFormat.MARKDOWN,
     # )
 
+    # Example using the OlmOcr (dynamic prompt) model with LM Studio:
+    # (uncomment the following lines)
+    # pipeline_options.vlm_options = lms_olmocr_vlm_options(
+    #     model="hf.co/lmstudio-community/olmOCR-7B-0225-preview-GGUF",
+    # )
+
     # Example using the Granite Vision model with Ollama:
     # (uncomment the following lines)
     # pipeline_options.vlm_options = ollama_vlm_options(

View File

@@ -77,7 +77,7 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 architectures.
 === "RHEL"
 
     ```console
-    dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel
+    dnf install tesseract tesseract-devel tesseract-langpack-eng tesseract-osd leptonica-devel
     TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
     echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
    ```

View File

@@ -1,6 +1,6 @@
 [project]
 name = "docling"
-version = "2.39.0"  # DO NOT EDIT, updated automatically
+version = "2.40.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 license = "MIT"
 keywords = [

View File

@@ -17,8 +17,9 @@ def get_converter():
     pipeline_options.do_table_structure = False
     pipeline_options.do_code_enrichment = False
     pipeline_options.do_formula_enrichment = False
-    pipeline_options.generate_picture_images = False
-    pipeline_options.generate_page_images = False
     pipeline_options.do_picture_classification = True
+    pipeline_options.generate_picture_images = True
     pipeline_options.images_scale = 2
 
     converter = DocumentConverter(
converter = DocumentConverter( converter = DocumentConverter(

uv.lock (generated)
View File

@@ -805,7 +805,7 @@ wheels = [
 
 [[package]]
 name = "docling"
-version = "2.39.0"
+version = "2.40.0"
 source = { editable = "." }
 dependencies = [
     { name = "beautifulsoup4" },