commit af0461e5b1

Move to pipeline_options.layout_options.model

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>


CHANGELOG.md (17 changed lines)
@@ -1,3 +1,20 @@
+## [v2.40.0](https://github.com/docling-project/docling/releases/tag/v2.40.0) - 2025-07-04
+
+### Feature
+
+* Introduce LayoutOptions to control layout postprocessing behaviour ([#1870](https://github.com/docling-project/docling/issues/1870)) ([`ec6cf6f`](https://github.com/docling-project/docling/commit/ec6cf6f7e8050db30c14f0625d6d5c6bbfeb6aeb))
+* Integrate ListItemMarkerProcessor into document assembly ([#1825](https://github.com/docling-project/docling/issues/1825)) ([`56a0e10`](https://github.com/docling-project/docling/commit/56a0e104f76c5ba30ac0fcd247be61f911b560c1))
+
+### Fix
+
+* Secure torch model inits with global locks ([#1884](https://github.com/docling-project/docling/issues/1884)) ([`598c9c5`](https://github.com/docling-project/docling/commit/598c9c53d401de6aac89b7c51bccd57160dace1e))
+* Ensure that TesseractOcrModel does not crash in case OSD is not installed ([#1866](https://github.com/docling-project/docling/issues/1866)) ([`ae39a94`](https://github.com/docling-project/docling/commit/ae39a9411a09b2165ac745af358dea644f868e26))
+
+### Performance
+
+* **msexcel:** _find_table_bounds use iter_rows/iter_cols instead of Worksheet.cell ([#1875](https://github.com/docling-project/docling/issues/1875)) ([`13865c0`](https://github.com/docling-project/docling/commit/13865c06f5c564b9e57f3dbb60d26e60c75258b6))
+* Move expensive imports closer to usage ([#1863](https://github.com/docling-project/docling/issues/1863)) ([`3089cf2`](https://github.com/docling-project/docling/commit/3089cf2d26918eed4007398a528f53971c19f839))
+
 ## [v2.39.0](https://github.com/docling-project/docling/releases/tag/v2.39.0) - 2025-06-27
 
 ### Feature
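
The last performance entry refers to the lazy-import pattern: a heavy dependency is imported inside the function that first needs it, so importing the package itself stays cheap. A minimal illustrative sketch of the idea, not the actual docling code:

```python
def pick_device(preferred: str = "auto") -> str:
    # The expensive import runs on first call rather than at module import
    # time, so `import mypackage` does not pay the torch startup cost.
    import torch

    if preferred == "auto":
        return "cuda" if torch.cuda.is_available() else "cpu"
    return preferred
```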

docling/datamodel/asr_model_specs.py
@@ -22,7 +22,7 @@ WHISPER_TINY = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -33,7 +33,7 @@ WHISPER_SMALL = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -44,7 +44,7 @@ WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -55,7 +55,7 @@ WHISPER_BASE = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -66,7 +66,7 @@ WHISPER_LARGE = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -77,7 +77,7 @@ WHISPER_TURBO = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
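
All six hunks correct the same misspelled keyword, `temperatue`. The bug is easy to miss because pydantic models ignore unknown fields by default, so the typo never raised an error and `temperature` silently kept its default. A stand-in sketch (`WhisperOpts` is illustrative, not the real `InlineAsrNativeWhisperOptions`):

```python
from pydantic import BaseModel, ConfigDict


class WhisperOpts(BaseModel):
    temperature: float = 0.0


# With pydantic's default config the unknown keyword is dropped silently:
print(WhisperOpts(temperatue=0.7).temperature)  # prints 0.0


class StrictWhisperOpts(BaseModel):
    model_config = ConfigDict(extra="forbid")  # reject unknown fields

    temperature: float = 0.0


# StrictWhisperOpts(temperatue=0.7) raises a ValidationError instead.
```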

docling/datamodel/layout_model_specs.py (new file, 91 lines)
@@ -0,0 +1,91 @@
+import logging
+from enum import Enum
+from pathlib import Path
+from typing import Optional
+
+from pydantic import BaseModel
+
+from docling.datamodel.accelerator_options import AcceleratorDevice
+
+_log = logging.getLogger(__name__)
+
+
+class LayoutModelConfig(BaseModel):
+    name: str
+    repo_id: str
+    revision: str
+    model_path: str
+    supported_devices: list[AcceleratorDevice] = [
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.MPS,
+    ]
+
+    @property
+    def model_repo_folder(self) -> str:
+        return self.repo_id.replace("/", "--")
+
+
+# HuggingFace Layout Models
+
+# Default Docling Layout Model
+DOCLING_LAYOUT_V2 = LayoutModelConfig(
+    name="docling_layout_old",
+    repo_id="ds4sd/docling-layout-old",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_HERON = LayoutModelConfig(
+    name="docling_layout_heron",
+    repo_id="ds4sd/docling-layout-heron",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_HERON_101 = LayoutModelConfig(
+    name="docling_layout_heron_101",
+    repo_id="ds4sd/docling-layout-heron-101",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_EGRET_MEDIUM = LayoutModelConfig(
+    name="docling_layout_egret_medium",
+    repo_id="ds4sd/docling-layout-egret-medium",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_EGRET_LARGE = LayoutModelConfig(
+    name="docling_layout_egret_large",
+    repo_id="ds4sd/docling-layout-egret-large",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_EGRET_XLARGE = LayoutModelConfig(
+    name="docling_layout_egret_xlarge",
+    repo_id="ds4sd/docling-layout-egret-xlarge",
+    revision="main",
+    model_path="",
+)
+
+# Example for a hypothetical alternative model
+# ALTERNATIVE_LAYOUT = LayoutModelConfig(
+#     name="alternative_layout",
+#     repo_id="someorg/alternative-layout",
+#     revision="main",
+#     model_path="model_artifacts/layout_alt",
+# )
+
+
+class LayoutModelType(str, Enum):
+    DOCLING_LAYOUT_V2 = "docling_layout_v2"
+    DOCLING_LAYOUT_OLD = "docling_layout_old"
+    DOCLING_LAYOUT_HERON = "docling_layout_heron"
+    DOCLING_LAYOUT_HERON_101 = "docling_layout_heron_101"
+    DOCLING_LAYOUT_EGRET_MEDIUM = "docling_layout_egret_medium"
+    DOCLING_LAYOUT_EGRET_LARGE = "docling_layout_egret_large"
+    DOCLING_LAYOUT_EGRET_XLARGE = "docling_layout_egret_xlarge"
+    # ALTERNATIVE_LAYOUT = "alternative_layout"
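
Beyond the bundled checkpoints, `LayoutModelConfig` is a plain pydantic model, so a custom checkpoint can be described the same way. A sketch (the repo id below is a placeholder, not a real repository):

```python
from docling.datamodel.layout_model_specs import (
    DOCLING_LAYOUT_EGRET_MEDIUM,
    LayoutModelConfig,
)

custom_layout = LayoutModelConfig(
    name="custom_layout",
    repo_id="my-org/my-layout-model",  # placeholder repo id
    revision="main",
    model_path="",
)

# model_repo_folder mirrors the cache folder naming used for downloaded weights:
print(custom_layout.model_repo_folder)        # my-org--my-layout-model
print(DOCLING_LAYOUT_EGRET_MEDIUM.repo_id)    # ds4sd/docling-layout-egret-medium
```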
docling/datamodel/pipeline_options.py

@@ -1,4 +1,5 @@
 import logging
+from datetime import datetime
 from enum import Enum
 from pathlib import Path
 from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
@@ -274,6 +275,13 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
     )
 
 
+class LayoutOptions(BaseModel):
+    """Options for layout processing."""
+
+    create_orphan_clusters: bool = True  # Whether to create clusters for orphaned cells
+    model: LayoutModelConfig = DOCLING_LAYOUT_V2
+
+
 class AsrPipelineOptions(PipelineOptions):
     asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
     artifacts_path: Optional[Union[Path, str]] = None
@@ -298,6 +306,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     picture_description_options: PictureDescriptionBaseOptions = (
         smolvlm_picture_description
     )
+    layout_options: LayoutOptions = LayoutOptions()
 
     images_scale: float = 1.0
     generate_page_images: bool = False
@@ -315,8 +324,6 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
         True  # Always True since parsed_page is now mandatory
     )
 
-    layout_model_config: LayoutModelConfig = DOCLING_LAYOUT_V2
-
 
 class ProcessingPipeline(str, Enum):
     STANDARD = "standard"
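
Together these hunks implement the commit title: the layout model choice moves from `PdfPipelineOptions.layout_model_config` to `pipeline_options.layout_options.model`, next to the other layout postprocessing knobs. A hedged usage sketch based on the options shown above; the converter wiring follows docling's usual pattern:

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_HERON
from docling.datamodel.pipeline_options import LayoutOptions, PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
# Before: pipeline_options.layout_model_config = DOCLING_LAYOUT_HERON
# After this commit the model is one field of LayoutOptions:
pipeline_options.layout_options = LayoutOptions(
    model=DOCLING_LAYOUT_HERON,
    create_orphan_clusters=False,  # optionally skip clusters for unassigned cells
)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
```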

docling/datamodel/pipeline_options_vlm_model.py

@@ -1,6 +1,7 @@
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, Union
 
+from docling_core.types.doc.page import SegmentedPage
 from pydantic import AnyUrl, BaseModel
 from typing_extensions import deprecated
@@ -9,9 +10,10 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
 
 class BaseVlmOptions(BaseModel):
     kind: str
-    prompt: str
+    prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
     scale: float = 2.0
     max_size: Optional[int] = None
+    temperature: float = 0.0
 
 
 class ResponseFormat(str, Enum):
@@ -29,6 +31,12 @@ class TransformersModelType(str, Enum):
     AUTOMODEL = "automodel"
     AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
     AUTOMODEL_CAUSALLM = "automodel-causallm"
+    AUTOMODEL_IMAGETEXTTOTEXT = "automodel-imagetexttotext"
+
+
+class TransformersPromptStyle(str, Enum):
+    CHAT = "chat"
+    RAW = "raw"
 
 
 class InlineVlmOptions(BaseVlmOptions):
@@ -42,6 +50,7 @@ class InlineVlmOptions(BaseVlmOptions):
 
     inference_framework: InferenceFramework
     transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
+    transformers_prompt_style: TransformersPromptStyle = TransformersPromptStyle.CHAT
     response_format: ResponseFormat
 
     torch_dtype: Optional[str] = None
@@ -51,7 +60,6 @@ class InlineVlmOptions(BaseVlmOptions):
         AcceleratorDevice.MPS,
     ]
 
-    temperature: float = 0.0
     stop_strings: List[str] = []
     extra_generation_config: Dict[str, Any] = {}
 
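
Widening `prompt` from `str` to `Union[str, Callable[[Optional[SegmentedPage]], str]]` is what enables page-aware prompting: a callable receives the parsed page (or `None`) and returns the instruction text. A small sketch; the full OlmOcr-style version appears in `docs/examples/vlm_pipeline_api_model.py` further down:

```python
from typing import Optional

from docling_core.types.doc.page import SegmentedPage


def page_aware_prompt(page: Optional[SegmentedPage]) -> str:
    # Fall back to a generic instruction when no parsed page is available.
    if page is None:
        return "Convert this page to markdown."
    width = int(page.dimension.width)
    height = int(page.dimension.height)
    return f"Convert this {width}x{height} page to markdown."
```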
@@ -29,12 +29,9 @@ class ApiVlmModel(BasePageModel):
 
         self.timeout = self.vlm_options.timeout
         self.concurrency = self.vlm_options.concurrency
-        self.prompt_content = (
-            f"This is a page from a document.\n{self.vlm_options.prompt}"
-        )
         self.params = {
             **self.vlm_options.params,
-            "temperature": 0,
+            "temperature": self.vlm_options.temperature,
         }
 
     def __call__(
@@ -56,9 +53,14 @@ class ApiVlmModel(BasePageModel):
                 if hi_res_image.mode != "RGB":
                     hi_res_image = hi_res_image.convert("RGB")
 
+                if callable(self.vlm_options.prompt):
+                    prompt = self.vlm_options.prompt(page.parsed_page)
+                else:
+                    prompt = self.vlm_options.prompt
+
                 page_tags = api_image_request(
                     image=hi_res_image,
-                    prompt=self.prompt_content,
+                    prompt=prompt,
                     url=self.vlm_options.url,
                     timeout=self.timeout,
                     headers=self.vlm_options.headers,
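
Two hard-coded values disappear here: the fixed "This is a page from a document." preamble (the prompt now comes from the options, possibly as a callable) and `"temperature": 0` (now read from `vlm_options.temperature`, which `BaseVlmOptions` defaults to 0.0). A sketch; the `ApiVlmOptions` import path is an assumption, it is not shown in this diff:

```python
from docling.datamodel.pipeline_options_vlm_model import (  # import path assumed
    ApiVlmOptions,
    ResponseFormat,
)

options = ApiVlmOptions(
    url="http://localhost:1234/v1/chat/completions",
    params=dict(model="some-local-model"),  # placeholder model name
    prompt="Convert this page to markdown.",
    temperature=0.2,  # forwarded into the request params by ApiVlmModel
    timeout=90,
    response_format=ResponseFormat.MARKDOWN,
)
```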
@@ -14,7 +14,8 @@ from PIL import Image
 from pydantic import BaseModel
 
 from docling.datamodel.accelerator_options import AcceleratorOptions
-from docling.models.base_model import BaseEnrichmentModel
+from docling.datamodel.base_models import ItemAndImageEnrichmentElement
+from docling.models.base_model import BaseItemAndImageEnrichmentModel
 from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
@@ -32,7 +33,7 @@ class DocumentPictureClassifierOptions(BaseModel):
     kind: Literal["document_picture_classifier"] = "document_picture_classifier"
 
 
-class DocumentPictureClassifier(BaseEnrichmentModel):
+class DocumentPictureClassifier(BaseItemAndImageEnrichmentModel):
     """
     A model for classifying pictures in documents.
 
@@ -135,7 +136,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
     def __call__(
         self,
         doc: DoclingDocument,
-        element_batch: Iterable[NodeItem],
+        element_batch: Iterable[ItemAndImageEnrichmentElement],
     ) -> Iterable[NodeItem]:
         """
         Processes a batch of elements and enriches them with classification predictions.
@@ -144,7 +145,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
         ----------
         doc : DoclingDocument
             The document containing the elements to be processed.
-        element_batch : Iterable[NodeItem]
+        element_batch : Iterable[ItemAndImageEnrichmentElement]
             A batch of pictures to classify.
 
         Returns
@@ -155,22 +156,20 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
         """
         if not self.enabled:
             for element in element_batch:
-                yield element
+                yield element.item
             return
 
         images: List[Union[Image.Image, np.ndarray]] = []
         elements: List[PictureItem] = []
         for el in element_batch:
-            assert isinstance(el, PictureItem)
-            elements.append(el)
-            img = el.get_image(doc)
-            assert img is not None
-            images.append(img)
+            assert isinstance(el.item, PictureItem)
+            elements.append(el.item)
+            images.append(el.image)
 
         outputs = self.document_picture_classifier.predict(images)
 
-        for element, output in zip(elements, outputs):
-            element.annotations.append(
+        for item, output in zip(elements, outputs):
+            item.annotations.append(
                 PictureClassificationData(
                     provenance="DocumentPictureClassifier",
                     predicted_classes=[
@@ -183,4 +182,4 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
             )
         )
 
-        yield element
+        yield item
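
The base-class switch changes the calling contract: the pipeline now hands the classifier each picture together with a pre-cropped image, instead of the model pulling images from the document itself via `el.get_image(doc)`. This is also why the standard PDF pipeline below starts keeping the page backend alive when picture classification is enabled. A toy consumer of that contract, assuming only the `.item`/`.image` fields visible in this diff:

```python
from typing import Iterable

from docling_core.types.doc import DoclingDocument, NodeItem, PictureItem

from docling.datamodel.base_models import ItemAndImageEnrichmentElement


def classify_batch(
    doc: DoclingDocument, element_batch: Iterable[ItemAndImageEnrichmentElement]
) -> Iterable[NodeItem]:
    # Each element already carries its cropped image; no backend access needed.
    for el in element_batch:
        assert isinstance(el.item, PictureItem)
        _image = el.image  # pre-cropped image supplied by the pipeline
        yield el.item
```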
docling/models/layout_model.py hunks follow.

@@ -13,6 +13,7 @@ from docling.datamodel.accelerator_options import AcceleratorOptions
 from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2, LayoutModelConfig
+from docling.datamodel.pipeline_options import LayoutOptions
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
 from docling.models.utils.hf_model_download import download_hf_model
@@ -49,12 +50,14 @@ class LayoutModel(BasePageModel):
         self,
         artifacts_path: Optional[Path],
         accelerator_options: AcceleratorOptions,
-        layout_model_config: LayoutModelConfig,
+        options: LayoutOptions,
     ):
         from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
 
+        self.options = options
+
         device = decide_device(accelerator_options.device)
-        self.layout_model_config = layout_model_config
+        layout_model_config = options.model
         model_repo_folder = layout_model_config.model_repo_folder
         model_path = layout_model_config.model_path
 
@@ -182,7 +185,7 @@ class LayoutModel(BasePageModel):
                 # Apply postprocessing
 
                 processed_clusters, processed_cells = LayoutPostprocessor(
-                    page, clusters
+                    page, clusters, self.options
                 ).postprocess()
                 # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally
 
@@ -13,6 +13,7 @@ from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import (
     InlineVlmOptions,
     TransformersModelType,
+    TransformersPromptStyle,
 )
 from docling.models.base_model import BasePageModel
 from docling.models.utils.hf_model_download import (
@@ -41,6 +42,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
         from transformers import (
             AutoModel,
             AutoModelForCausalLM,
+            AutoModelForImageTextToText,
             AutoModelForVision2Seq,
             AutoProcessor,
             BitsAndBytesConfig,
@@ -91,6 +93,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
                 == TransformersModelType.AUTOMODEL_VISION2SEQ
             ):
                 model_cls = AutoModelForVision2Seq
+            elif (
+                self.vlm_options.transformers_model_type
+                == TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT
+            ):
+                model_cls = AutoModelForImageTextToText
 
         self.processor = AutoProcessor.from_pretrained(
             artifacts_path,
@@ -128,7 +135,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
                 )
 
                 # Define prompt structure
-                prompt = self.formulate_prompt()
+                if callable(self.vlm_options.prompt):
+                    user_prompt = self.vlm_options.prompt(page.parsed_page)
+                else:
+                    user_prompt = self.vlm_options.prompt
+                prompt = self.formulate_prompt(user_prompt)
 
                 inputs = self.processor(
                     text=prompt, images=[hi_res_image], return_tensors="pt"
@@ -162,10 +173,13 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
 
                 yield page
 
-    def formulate_prompt(self) -> str:
+    def formulate_prompt(self, user_prompt: str) -> str:
        """Formulate a prompt for the VLM."""
 
-        if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
+        if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.RAW:
+            return user_prompt
+
+        elif self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
             _log.debug("Using specialized prompt for Phi-4")
             # more info here: https://huggingface.co/microsoft/Phi-4-multimodal-instruct#loading-the-model-locally
 
@@ -173,25 +187,30 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
             assistant_prompt = "<|assistant|>"
             prompt_suffix = "<|end|>"
 
-            prompt = f"{user_prompt}<|image_1|>{self.vlm_options.prompt}{prompt_suffix}{assistant_prompt}"
+            prompt = f"{user_prompt}<|image_1|>{user_prompt}{prompt_suffix}{assistant_prompt}"
             _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
 
             return prompt
 
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": "This is a page from a document.",
-                    },
-                    {"type": "image"},
-                    {"type": "text", "text": self.vlm_options.prompt},
-                ],
-            }
-        ]
-        prompt = self.processor.apply_chat_template(
-            messages, add_generation_prompt=False
-        )
-        return prompt
+        elif self.vlm_options.transformers_prompt_style == TransformersPromptStyle.CHAT:
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "This is a page from a document.",
+                        },
+                        {"type": "image"},
+                        {"type": "text", "text": user_prompt},
+                    ],
+                }
+            ]
+            prompt = self.processor.apply_chat_template(
+                messages, add_generation_prompt=False
+            )
+            return prompt
+
+        raise RuntimeError(
+            f"Uknown prompt style `{self.vlm_options.transformers_prompt_style}`. Valid values are {', '.join(s.value for s in TransformersPromptStyle)}."
        )
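
The new `TransformersPromptStyle.RAW` short-circuits `formulate_prompt()` and hands the user prompt to the model verbatim, skipping the processor's chat template. This suits models with their own prompt markup; the `compare_vlm_models.py` example further down uses it for ByteDance/Dolphin. A minimal sketch assembled from options that appear in this diff:

```python
from docling.datamodel.pipeline_options_vlm_model import (
    InferenceFramework,
    InlineVlmOptions,
    ResponseFormat,
    TransformersPromptStyle,
)

# RAW: the prompt string below is sent as-is, chat template not applied.
dolphin_opts = InlineVlmOptions(
    repo_id="ByteDance/Dolphin",
    prompt="<s>Read text in the image. <Answer/>",
    response_format=ResponseFormat.MARKDOWN,
    inference_framework=InferenceFramework.TRANSFORMERS,
    transformers_prompt_style=TransformersPromptStyle.RAW,
)
```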
@@ -56,8 +56,6 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
         elif (artifacts_path / repo_cache_folder).exists():
             artifacts_path = artifacts_path / repo_cache_folder
 
-        self.param_question = vlm_options.prompt
-
         ## Load the model
         self.vlm_model, self.processor = load(artifacts_path)
         self.config = load_config(artifacts_path)
@@ -86,8 +84,12 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                 if hi_res_image.mode != "RGB":
                     hi_res_image = hi_res_image.convert("RGB")
 
+                if callable(self.vlm_options.prompt):
+                    user_prompt = self.vlm_options.prompt(page.parsed_page)
+                else:
+                    user_prompt = self.vlm_options.prompt
                 prompt = self.apply_chat_template(
-                    self.processor, self.config, self.param_question, num_images=1
+                    self.processor, self.config, user_prompt, num_images=1
                 )
 
                 start_time = time.time()
@@ -81,7 +81,7 @@ class StandardPdfPipeline(PaginatedPipeline):
             LayoutModel(
                 artifacts_path=artifacts_path,
                 accelerator_options=pipeline_options.accelerator_options,
-                layout_model_config=pipeline_options.layout_model_config,
+                options=pipeline_options.layout_options,
             ),
             # Table structure model
             TableStructureModel(
@@ -130,6 +130,7 @@ class StandardPdfPipeline(PaginatedPipeline):
         if (
             self.pipeline_options.do_formula_enrichment
             or self.pipeline_options.do_code_enrichment
+            or self.pipeline_options.do_picture_classification
             or self.pipeline_options.do_picture_description
         ):
             self.keep_backend = True
@@ -117,6 +117,7 @@ class VlmPipeline(PaginatedPipeline):
             page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
             if page._backend is not None and page._backend.is_valid():
                 page.size = page._backend.get_size()
+                page.parsed_page = page._backend.get_segmented_page()
 
         return page
 
@@ -9,6 +9,7 @@ from docling_core.types.doc.page import TextCell
 from rtree import index
 
 from docling.datamodel.base_models import BoundingBox, Cluster, Page
+from docling.datamodel.pipeline_options import LayoutOptions
 
 _log = logging.getLogger(__name__)
 
@@ -194,12 +195,16 @@ class LayoutPostprocessor:
         DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
     }
 
-    def __init__(self, page: Page, clusters: List[Cluster]) -> None:
+    def __init__(
+        self, page: Page, clusters: List[Cluster], options: LayoutOptions
+    ) -> None:
         """Initialize processor with page and clusters."""
+
         self.cells = page.cells
         self.page = page
         self.page_size = page.size
         self.all_clusters = clusters
+        self.options = options
         self.regular_clusters = [
             c for c in clusters if c.label not in self.SPECIAL_TYPES
         ]
@@ -267,7 +272,7 @@ class LayoutPostprocessor:
 
         # Handle orphaned cells
         unassigned = self._find_unassigned_cells(clusters)
-        if unassigned:
+        if unassigned and self.options.create_orphan_clusters:
             next_id = max((c.id for c in self.all_clusters), default=0) + 1
             orphan_clusters = []
             for i, cell in enumerate(unassigned):

docs/examples/compare_vlm_models.py (39 changed lines)
@@ -14,11 +14,18 @@ from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
 from tabulate import tabulate
 
 from docling.datamodel import vlm_model_specs
+from docling.datamodel.accelerator_options import AcceleratorDevice
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     VlmPipelineOptions,
 )
-from docling.datamodel.pipeline_options_vlm_model import InferenceFramework
+from docling.datamodel.pipeline_options_vlm_model import (
+    InferenceFramework,
+    InlineVlmOptions,
+    ResponseFormat,
+    TransformersModelType,
+    TransformersPromptStyle,
+)
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
 
@@ -101,6 +108,33 @@ if __name__ == "__main__":
     out_path = Path("scratch")
     out_path.mkdir(parents=True, exist_ok=True)
 
+    ## Definition of more inline models
+    llava_qwen = InlineVlmOptions(
+        repo_id="llava-hf/llava-interleave-qwen-0.5b-hf",
+        # prompt="Read text in the image.",
+        prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+        # prompt="Parse the reading order of this document.",
+        response_format=ResponseFormat.MARKDOWN,
+        inference_framework=InferenceFramework.TRANSFORMERS,
+        transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+        supported_devices=[AcceleratorDevice.CUDA, AcceleratorDevice.CPU],
+        scale=2.0,
+        temperature=0.0,
+    )
+
+    # Note that this is not the expected way of using the Dolphin model, but it shows the usage of a raw prompt.
+    dolphin_oneshot = InlineVlmOptions(
+        repo_id="ByteDance/Dolphin",
+        prompt="<s>Read text in the image. <Answer/>",
+        response_format=ResponseFormat.MARKDOWN,
+        inference_framework=InferenceFramework.TRANSFORMERS,
+        transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+        transformers_prompt_style=TransformersPromptStyle.RAW,
+        supported_devices=[AcceleratorDevice.CUDA, AcceleratorDevice.CPU],
+        scale=2.0,
+        temperature=0.0,
+    )
+
     ## Use VlmPipeline
     pipeline_options = VlmPipelineOptions()
     pipeline_options.generate_page_images = True
@@ -121,6 +155,9 @@ if __name__ == "__main__":
         vlm_model_specs.GRANITE_VISION_TRANSFORMERS,
         vlm_model_specs.PHI4_TRANSFORMERS,
         vlm_model_specs.PIXTRAL_12B_TRANSFORMERS,
+        ## More inline models
+        dolphin_oneshot,
+        llava_qwen,
     ]
 
     # Remove MLX models if not on Mac

docs/examples/vlm_pipeline_api_model.py (71 changed lines)
@@ -1,8 +1,10 @@
 import logging
 import os
 from pathlib import Path
+from typing import Optional
 
 import requests
+from docling_core.types.doc.page import SegmentedPage
 from dotenv import load_dotenv
 
 from docling.datamodel.base_models import InputFormat
@@ -32,6 +34,69 @@ def lms_vlm_options(model: str, prompt: str, format: ResponseFormat):
     return options
 
 
+#### Using LM Studio with OlmOcr model
+
+
+def lms_olmocr_vlm_options(model: str):
+    def _dynamic_olmocr_prompt(page: Optional[SegmentedPage]):
+        if page is None:
+            return (
+                "Below is the image of one page of a document. Just return the plain text"
+                " representation of this document as if you were reading it naturally.\n"
+                "Do not hallucinate.\n"
+            )
+
+        anchor = [
+            f"Page dimensions: {int(page.dimension.width)}x{int(page.dimension.height)}"
+        ]
+
+        for text_cell in page.textline_cells:
+            if not text_cell.text.strip():
+                continue
+            bbox = text_cell.rect.to_bounding_box().to_bottom_left_origin(
+                page.dimension.height
+            )
+            anchor.append(f"[{int(bbox.l)}x{int(bbox.b)}] {text_cell.text}")
+
+        for image_cell in page.bitmap_resources:
+            bbox = image_cell.rect.to_bounding_box().to_bottom_left_origin(
+                page.dimension.height
+            )
+            anchor.append(
+                f"[Image {int(bbox.l)}x{int(bbox.b)} to {int(bbox.r)}x{int(bbox.t)}]"
+            )
+
+        if len(anchor) == 1:
+            anchor.append(
+                f"[Image 0x0 to {int(page.dimension.width)}x{int(page.dimension.height)}]"
+            )
+
+        # Original prompt uses cells sorting. We are skipping it in this demo.
+
+        base_text = "\n".join(anchor)
+
+        return (
+            f"Below is the image of one page of a document, as well as some raw textual"
+            f" content that was previously extracted for it. Just return the plain text"
+            f" representation of this document as if you were reading it naturally.\n"
+            f"Do not hallucinate.\n"
+            f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
+        )
+
+    options = ApiVlmOptions(
+        url="http://localhost:1234/v1/chat/completions",
+        params=dict(
+            model=model,
+        ),
+        prompt=_dynamic_olmocr_prompt,
+        timeout=90,
+        scale=1.0,
+        max_size=1024,  # from OlmOcr pipeline
+        response_format=ResponseFormat.MARKDOWN,
+    )
+    return options
+
+
 #### Using Ollama
@@ -123,6 +188,12 @@ def main():
     # format=ResponseFormat.MARKDOWN,
     # )
 
+    # Example using the OlmOcr (dynamic prompt) model with LM Studio:
+    # (uncomment the following lines)
+    # pipeline_options.vlm_options = lms_olmocr_vlm_options(
+    #     model="hf.co/lmstudio-community/olmOCR-7B-0225-preview-GGUF",
+    # )
+
     # Example using the Granite Vision model with Ollama:
     # (uncomment the following lines)
     # pipeline_options.vlm_options = ollama_vlm_options(

docs/installation/index.md (2 changed lines)
@@ -77,7 +77,7 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 architectures.
 === "RHEL"
 
     ```console
-    dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel
+    dnf install tesseract tesseract-devel tesseract-langpack-eng tesseract-osd leptonica-devel
     TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
     echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
     ```
pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "docling"
-version = "2.39.0"  # DO NOT EDIT, updated automatically
+version = "2.40.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 license = "MIT"
 keywords = [
@@ -17,8 +17,9 @@ def get_converter():
     pipeline_options.do_table_structure = False
     pipeline_options.do_code_enrichment = False
     pipeline_options.do_formula_enrichment = False
-    pipeline_options.generate_picture_images = False
     pipeline_options.generate_page_images = False
+    pipeline_options.do_picture_classification = True
+    pipeline_options.generate_picture_images = True
     pipeline_options.images_scale = 2
 
     converter = DocumentConverter(