diff --git a/CHANGELOG.md b/CHANGELOG.md
index 3e04a4dd..d49f5a3b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,20 @@
+## [v2.40.0](https://github.com/docling-project/docling/releases/tag/v2.40.0) - 2025-07-04
+
+### Feature
+
+* Introduce LayoutOptions to control layout postprocessing behaviour ([#1870](https://github.com/docling-project/docling/issues/1870)) ([`ec6cf6f`](https://github.com/docling-project/docling/commit/ec6cf6f7e8050db30c14f0625d6d5c6bbfeb6aeb))
+* Integrate ListItemMarkerProcessor into document assembly ([#1825](https://github.com/docling-project/docling/issues/1825)) ([`56a0e10`](https://github.com/docling-project/docling/commit/56a0e104f76c5ba30ac0fcd247be61f911b560c1))
+
+### Fix
+
+* Secure torch model inits with global locks ([#1884](https://github.com/docling-project/docling/issues/1884)) ([`598c9c5`](https://github.com/docling-project/docling/commit/598c9c53d401de6aac89b7c51bccd57160dace1e))
+* Ensure that TesseractOcrModel does not crash in case OSD is not installed ([#1866](https://github.com/docling-project/docling/issues/1866)) ([`ae39a94`](https://github.com/docling-project/docling/commit/ae39a9411a09b2165ac745af358dea644f868e26))
+
+### Performance
+
+* **msexcel:** _find_table_bounds use iter_rows/iter_cols instead of Worksheet.cell ([#1875](https://github.com/docling-project/docling/issues/1875)) ([`13865c0`](https://github.com/docling-project/docling/commit/13865c06f5c564b9e57f3dbb60d26e60c75258b6))
+* Move expensive imports closer to usage ([#1863](https://github.com/docling-project/docling/issues/1863)) ([`3089cf2`](https://github.com/docling-project/docling/commit/3089cf2d26918eed4007398a528f53971c19f839))
+
 ## [v2.39.0](https://github.com/docling-project/docling/releases/tag/v2.39.0) - 2025-06-27
 
 ### Feature
diff --git a/docling/datamodel/asr_model_specs.py b/docling/datamodel/asr_model_specs.py
index 95287ad2..426b5851 100644
--- a/docling/datamodel/asr_model_specs.py
+++ b/docling/datamodel/asr_model_specs.py
@@ -22,7 +22,7 @@ WHISPER_TINY = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -33,7 +33,7 @@ WHISPER_SMALL = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -44,7 +44,7 @@ WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -55,7 +55,7 @@ WHISPER_BASE = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -66,7 +66,7 @@ WHISPER_LARGE = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
@@ -77,7 +77,7 @@ WHISPER_TURBO = InlineAsrNativeWhisperOptions(
     verbose=True,
     timestamps=True,
     word_timestamps=True,
-    temperatue=0.0,
+    temperature=0.0,
     max_new_tokens=256,
     max_time_chunk=30.0,
 )
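The Whisper preset fixes above are more consequential than a spelling nit: these presets are pydantic models, so a misspelled keyword is (under pydantic's default `extra="ignore"` behaviour) dropped silently rather than rejected. A hedged stand-in illustrating the failure mode — `WhisperOptions` and its default value are hypothetical, not the real `InlineAsrNativeWhisperOptions`:

```python
from pydantic import BaseModel


class WhisperOptions(BaseModel):  # hypothetical stand-in for InlineAsrNativeWhisperOptions
    temperature: float = 1.0  # assumed default, for illustration only


# With pydantic's default extra="ignore", the misspelled keyword is discarded
# during validation, so the intended temperature=0.0 override never lands.
opts = WhisperOptions(temperatue=0.0)
print(opts.temperature)  # -> 1.0
```
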
diff --git a/docling/datamodel/layout_model_specs.py b/docling/datamodel/layout_model_specs.py
new file mode 100644
index 00000000..08d5cd50
--- /dev/null
+++ b/docling/datamodel/layout_model_specs.py
@@ -0,0 +1,91 @@
+import logging
+from enum import Enum
+from pathlib import Path
+from typing import Optional
+
+from pydantic import BaseModel
+
+from docling.datamodel.accelerator_options import AcceleratorDevice
+
+_log = logging.getLogger(__name__)
+
+
+class LayoutModelConfig(BaseModel):
+    name: str
+    repo_id: str
+    revision: str
+    model_path: str
+    supported_devices: list[AcceleratorDevice] = [
+        AcceleratorDevice.CPU,
+        AcceleratorDevice.CUDA,
+        AcceleratorDevice.MPS,
+    ]
+
+    @property
+    def model_repo_folder(self) -> str:
+        return self.repo_id.replace("/", "--")
+
+
+# HuggingFace Layout Models
+
+# Default Docling Layout Model
+DOCLING_LAYOUT_V2 = LayoutModelConfig(
+    name="docling_layout_v2",
+    repo_id="ds4sd/docling-layout-old",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_HERON = LayoutModelConfig(
+    name="docling_layout_heron",
+    repo_id="ds4sd/docling-layout-heron",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_HERON_101 = LayoutModelConfig(
+    name="docling_layout_heron_101",
+    repo_id="ds4sd/docling-layout-heron-101",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_EGRET_MEDIUM = LayoutModelConfig(
+    name="docling_layout_egret_medium",
+    repo_id="ds4sd/docling-layout-egret-medium",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_EGRET_LARGE = LayoutModelConfig(
+    name="docling_layout_egret_large",
+    repo_id="ds4sd/docling-layout-egret-large",
+    revision="main",
+    model_path="",
+)
+
+DOCLING_LAYOUT_EGRET_XLARGE = LayoutModelConfig(
+    name="docling_layout_egret_xlarge",
+    repo_id="ds4sd/docling-layout-egret-xlarge",
+    revision="main",
+    model_path="",
+)
+
+# Example for a hypothetical alternative model
+# ALTERNATIVE_LAYOUT = LayoutModelConfig(
+#     name="alternative_layout",
+#     repo_id="someorg/alternative-layout",
+#     revision="main",
+#     model_path="model_artifacts/layout_alt",
+# )
+
+
+class LayoutModelType(str, Enum):
+    DOCLING_LAYOUT_V2 = "docling_layout_v2"
+    DOCLING_LAYOUT_OLD = "docling_layout_old"
+    DOCLING_LAYOUT_HERON = "docling_layout_heron"
+    DOCLING_LAYOUT_HERON_101 = "docling_layout_heron_101"
+    DOCLING_LAYOUT_EGRET_MEDIUM = "docling_layout_egret_medium"
+    DOCLING_LAYOUT_EGRET_LARGE = "docling_layout_egret_large"
+    DOCLING_LAYOUT_EGRET_XLARGE = "docling_layout_egret_xlarge"
+    # ALTERNATIVE_LAYOUT = "alternative_layout"
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 42a4b21a..b4573384 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -1,4 +1,5 @@
 import logging
+from datetime import datetime
 from enum import Enum
 from pathlib import Path
 from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
@@ -274,6 +275,13 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
     )
 
 
+class LayoutOptions(BaseModel):
+    """Options for layout processing."""
+
+    create_orphan_clusters: bool = True  # Whether to create clusters for orphaned cells
+    model: LayoutModelConfig = DOCLING_LAYOUT_V2
+
+
 class AsrPipelineOptions(PipelineOptions):
     asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY
     artifacts_path: Optional[Union[Path, str]] = None
@@ -298,6 +306,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
     picture_description_options: PictureDescriptionBaseOptions = (
         smolvlm_picture_description
    )
+    layout_options: LayoutOptions = LayoutOptions()
 
     images_scale: float = 1.0
     generate_page_images: bool = False
@@ -315,8 +324,6 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
         True  # Always True since parsed_page is now mandatory
     )
 
-    layout_model_config: LayoutModelConfig = DOCLING_LAYOUT_V2
-
 
 class ProcessingPipeline(str, Enum):
     STANDARD = "standard"
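Taken together, the two files above define the new `LayoutOptions` surface: a `create_orphan_clusters` toggle plus a swappable `model` checkpoint, hung off `PdfPipelineOptions.layout_options`. A minimal usage sketch follows — it only exercises fields visible in this diff; the input path and the choice of `DOCLING_LAYOUT_HERON` are illustrative assumptions, not defaults:

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_HERON
from docling.datamodel.pipeline_options import LayoutOptions, PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Tune the layout stage: skip orphan-cell clusters and try an alternative checkpoint.
pipeline_options = PdfPipelineOptions(
    layout_options=LayoutOptions(
        create_orphan_clusters=False,  # do not wrap unassigned cells in new clusters
        model=DOCLING_LAYOUT_HERON,    # any LayoutModelConfig from layout_model_specs
    )
)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = converter.convert("input.pdf")  # placeholder path
```
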
diff --git a/docling/datamodel/pipeline_options_vlm_model.py b/docling/datamodel/pipeline_options_vlm_model.py
index 90ab6685..bcea2493 100644
--- a/docling/datamodel/pipeline_options_vlm_model.py
+++ b/docling/datamodel/pipeline_options_vlm_model.py
@@ -1,6 +1,7 @@
 from enum import Enum
-from typing import Any, Dict, List, Literal, Optional, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, Union
 
+from docling_core.types.doc.page import SegmentedPage
 from pydantic import AnyUrl, BaseModel
 from typing_extensions import deprecated
 
@@ -9,9 +10,10 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
 
 class BaseVlmOptions(BaseModel):
     kind: str
-    prompt: str
+    prompt: Union[str, Callable[[Optional[SegmentedPage]], str]]
     scale: float = 2.0
     max_size: Optional[int] = None
+    temperature: float = 0.0
 
 
 class ResponseFormat(str, Enum):
@@ -29,6 +31,12 @@ class TransformersModelType(str, Enum):
     AUTOMODEL = "automodel"
     AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
     AUTOMODEL_CAUSALLM = "automodel-causallm"
+    AUTOMODEL_IMAGETEXTTOTEXT = "automodel-imagetexttotext"
+
+
+class TransformersPromptStyle(str, Enum):
+    CHAT = "chat"
+    RAW = "raw"
 
 
 class InlineVlmOptions(BaseVlmOptions):
@@ -42,6 +50,7 @@ class InlineVlmOptions(BaseVlmOptions):
 
     inference_framework: InferenceFramework
     transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
+    transformers_prompt_style: TransformersPromptStyle = TransformersPromptStyle.CHAT
     response_format: ResponseFormat
 
     torch_dtype: Optional[str] = None
@@ -51,7 +60,6 @@ class InlineVlmOptions(BaseVlmOptions):
         AcceleratorDevice.MPS,
     ]
 
-    temperature: float = 0.0
     stop_strings: List[str] = []
     extra_generation_config: Dict[str, Any] = {}
 
diff --git a/docling/models/api_vlm_model.py b/docling/models/api_vlm_model.py
index bfd00003..164ac285 100644
--- a/docling/models/api_vlm_model.py
+++ b/docling/models/api_vlm_model.py
@@ -29,12 +29,9 @@ class ApiVlmModel(BasePageModel):
         self.timeout = self.vlm_options.timeout
         self.concurrency = self.vlm_options.concurrency
-        self.prompt_content = (
-            f"This is a page from a document.\n{self.vlm_options.prompt}"
-        )
 
         self.params = {
             **self.vlm_options.params,
-            "temperature": 0,
+            "temperature": self.vlm_options.temperature,
         }
 
     def __call__(
@@ -56,9 +53,14 @@
                 if hi_res_image.mode != "RGB":
                     hi_res_image = hi_res_image.convert("RGB")
 
+                if callable(self.vlm_options.prompt):
+                    prompt = self.vlm_options.prompt(page.parsed_page)
+                else:
+                    prompt = self.vlm_options.prompt
+
                 page_tags = api_image_request(
                     image=hi_res_image,
-                    prompt=self.prompt_content,
+                    prompt=prompt,
                     url=self.vlm_options.url,
                     timeout=self.timeout,
                     headers=self.vlm_options.headers,
diff --git a/docling/models/document_picture_classifier.py b/docling/models/document_picture_classifier.py
index 73a30203..24e45078 100644
--- a/docling/models/document_picture_classifier.py
+++ b/docling/models/document_picture_classifier.py
@@ -14,7 +14,8 @@ from PIL import Image
 from pydantic import BaseModel
 
 from docling.datamodel.accelerator_options import AcceleratorOptions
-from docling.models.base_model import BaseEnrichmentModel
+from docling.datamodel.base_models import ItemAndImageEnrichmentElement
+from docling.models.base_model import BaseItemAndImageEnrichmentModel
 from docling.models.utils.hf_model_download import download_hf_model
 from docling.utils.accelerator_utils import decide_device
 
@@ -32,7 +33,7 @@ class DocumentPictureClassifierOptions(BaseModel):
     kind: Literal["document_picture_classifier"] = "document_picture_classifier"
"document_picture_classifier" -class DocumentPictureClassifier(BaseEnrichmentModel): +class DocumentPictureClassifier(BaseItemAndImageEnrichmentModel): """ A model for classifying pictures in documents. @@ -135,7 +136,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel): def __call__( self, doc: DoclingDocument, - element_batch: Iterable[NodeItem], + element_batch: Iterable[ItemAndImageEnrichmentElement], ) -> Iterable[NodeItem]: """ Processes a batch of elements and enriches them with classification predictions. @@ -144,7 +145,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel): ---------- doc : DoclingDocument The document containing the elements to be processed. - element_batch : Iterable[NodeItem] + element_batch : Iterable[ItemAndImageEnrichmentElement] A batch of pictures to classify. Returns @@ -155,22 +156,20 @@ class DocumentPictureClassifier(BaseEnrichmentModel): """ if not self.enabled: for element in element_batch: - yield element + yield element.item return images: List[Union[Image.Image, np.ndarray]] = [] elements: List[PictureItem] = [] for el in element_batch: - assert isinstance(el, PictureItem) - elements.append(el) - img = el.get_image(doc) - assert img is not None - images.append(img) + assert isinstance(el.item, PictureItem) + elements.append(el.item) + images.append(el.image) outputs = self.document_picture_classifier.predict(images) - for element, output in zip(elements, outputs): - element.annotations.append( + for item, output in zip(elements, outputs): + item.annotations.append( PictureClassificationData( provenance="DocumentPictureClassifier", predicted_classes=[ @@ -183,4 +182,4 @@ class DocumentPictureClassifier(BaseEnrichmentModel): ) ) - yield element + yield item diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index 5d2748e5..fdd5701f 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -13,6 +13,7 @@ from docling.datamodel.accelerator_options import AcceleratorOptions from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page from docling.datamodel.document import ConversionResult from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2, LayoutModelConfig +from docling.datamodel.pipeline_options import LayoutOptions from docling.datamodel.settings import settings from docling.models.base_model import BasePageModel from docling.models.utils.hf_model_download import download_hf_model @@ -49,12 +50,14 @@ class LayoutModel(BasePageModel): self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions, - layout_model_config: LayoutModelConfig, + options: LayoutOptions, ): from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor + self.options = options + device = decide_device(accelerator_options.device) - self.layout_model_config = layout_model_config + layout_model_config = options.model model_repo_folder = layout_model_config.model_repo_folder model_path = layout_model_config.model_path @@ -182,7 +185,7 @@ class LayoutModel(BasePageModel): # Apply postprocessing processed_clusters, processed_cells = LayoutPostprocessor( - page, clusters + page, clusters, self.options ).postprocess() # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally diff --git a/docling/models/vlm_models_inline/hf_transformers_model.py b/docling/models/vlm_models_inline/hf_transformers_model.py index bd35888d..d84925dd 100644 --- a/docling/models/vlm_models_inline/hf_transformers_model.py +++ 
@@ -13,6 +13,7 @@ from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options_vlm_model import (
     InlineVlmOptions,
     TransformersModelType,
+    TransformersPromptStyle,
 )
 from docling.models.base_model import BasePageModel
 from docling.models.utils.hf_model_download import (
@@ -41,6 +42,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
         from transformers import (
             AutoModel,
             AutoModelForCausalLM,
+            AutoModelForImageTextToText,
             AutoModelForVision2Seq,
             AutoProcessor,
             BitsAndBytesConfig,
@@ -91,6 +93,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                 == TransformersModelType.AUTOMODEL_VISION2SEQ
             ):
                 model_cls = AutoModelForVision2Seq
+            elif (
+                self.vlm_options.transformers_model_type
+                == TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT
+            ):
+                model_cls = AutoModelForImageTextToText
 
             self.processor = AutoProcessor.from_pretrained(
                 artifacts_path,
@@ -128,7 +135,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                     )
 
                     # Define prompt structure
-                    prompt = self.formulate_prompt()
+                    if callable(self.vlm_options.prompt):
+                        user_prompt = self.vlm_options.prompt(page.parsed_page)
+                    else:
+                        user_prompt = self.vlm_options.prompt
+                    prompt = self.formulate_prompt(user_prompt)
 
                     inputs = self.processor(
                         text=prompt, images=[hi_res_image], return_tensors="pt"
@@ -162,10 +173,13 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
 
                 yield page
 
-    def formulate_prompt(self) -> str:
+    def formulate_prompt(self, user_prompt: str) -> str:
         """Formulate a prompt for the VLM."""
 
-        if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
+        if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.RAW:
+            return user_prompt
+
+        elif self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
             _log.debug("Using specialized prompt for Phi-4")
             # more info here: https://huggingface.co/microsoft/Phi-4-multimodal-instruct#loading-the-model-locally
@@ -173,25 +187,30 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
             assistant_prompt = "<|assistant|>"
             prompt_suffix = "<|end|>"
 
-            prompt = f"{user_prompt}<|image_1|>{self.vlm_options.prompt}{prompt_suffix}{assistant_prompt}"
+            prompt = f"<|user|><|image_1|>{user_prompt}{prompt_suffix}{assistant_prompt}"
             _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
 
             return prompt
 
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": "This is a page from a document.",
-                    },
-                    {"type": "image"},
-                    {"type": "text", "text": self.vlm_options.prompt},
-                ],
-            }
-        ]
-        prompt = self.processor.apply_chat_template(
-            messages, add_generation_prompt=False
+        elif self.vlm_options.transformers_prompt_style == TransformersPromptStyle.CHAT:
+            messages = [
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "This is a page from a document.",
+                        },
+                        {"type": "image"},
+                        {"type": "text", "text": user_prompt},
+                    ],
+                }
+            ]
+            prompt = self.processor.apply_chat_template(
+                messages, add_generation_prompt=False
+            )
+            return prompt
+
+        raise RuntimeError(
+            f"Unknown prompt style `{self.vlm_options.transformers_prompt_style}`. Valid values are {', '.join(s.value for s in TransformersPromptStyle)}."
+        )
-        return prompt
diff --git a/docling/models/vlm_models_inline/mlx_model.py b/docling/models/vlm_models_inline/mlx_model.py
index 58f037fc..647ce531 100644
--- a/docling/models/vlm_models_inline/mlx_model.py
+++ b/docling/models/vlm_models_inline/mlx_model.py
@@ -56,8 +56,6 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
         elif (artifacts_path / repo_cache_folder).exists():
             artifacts_path = artifacts_path / repo_cache_folder
 
-        self.param_question = vlm_options.prompt
-
         ## Load the model
         self.vlm_model, self.processor = load(artifacts_path)
         self.config = load_config(artifacts_path)
@@ -86,8 +84,12 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                     if hi_res_image.mode != "RGB":
                         hi_res_image = hi_res_image.convert("RGB")
 
+                    if callable(self.vlm_options.prompt):
+                        user_prompt = self.vlm_options.prompt(page.parsed_page)
+                    else:
+                        user_prompt = self.vlm_options.prompt
                     prompt = self.apply_chat_template(
-                        self.processor, self.config, self.param_question, num_images=1
+                        self.processor, self.config, user_prompt, num_images=1
                     )
 
                     start_time = time.time()
diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py
index bd0c9924..b00a9ad7 100644
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@@ -81,7 +81,7 @@ class StandardPdfPipeline(PaginatedPipeline):
             LayoutModel(
                 artifacts_path=artifacts_path,
                 accelerator_options=pipeline_options.accelerator_options,
-                layout_model_config=pipeline_options.layout_model_config,
+                options=pipeline_options.layout_options,
             ),
             # Table structure model
             TableStructureModel(
@@ -130,6 +130,7 @@ class StandardPdfPipeline(PaginatedPipeline):
         if (
             self.pipeline_options.do_formula_enrichment
             or self.pipeline_options.do_code_enrichment
+            or self.pipeline_options.do_picture_classification
             or self.pipeline_options.do_picture_description
         ):
             self.keep_backend = True
diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index 2ecfe55a..ab474fab 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -117,6 +117,7 @@ class VlmPipeline(PaginatedPipeline):
             page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
             if page._backend is not None and page._backend.is_valid():
                 page.size = page._backend.get_size()
+                page.parsed_page = page._backend.get_segmented_page()
 
         return page
 
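The `standard_pdf_pipeline.py` hunk above adds `do_picture_classification` to the conditions that keep the PDF backend alive, which lets the classifier crop pictures from the backend on demand instead of relying on pre-generated picture images (compare the test change at the end of this diff). A hedged sketch of the resulting configuration — the file path is a placeholder:

```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_classification = True
# Pre-generated images are no longer required: the pipeline keeps the backend
# open and renders each picture crop when the enrichment model asks for it.
pipeline_options.generate_picture_images = False
pipeline_options.generate_page_images = False

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
doc = converter.convert("report.pdf").document  # placeholder path
for picture in doc.pictures:
    print(picture.annotations)  # classification results land here
```
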
diff --git a/docling/utils/layout_postprocessor.py b/docling/utils/layout_postprocessor.py
index 3db1cf8d..a98b3aab 100644
--- a/docling/utils/layout_postprocessor.py
+++ b/docling/utils/layout_postprocessor.py
@@ -9,6 +9,7 @@ from docling_core.types.doc.page import TextCell
 from rtree import index
 
 from docling.datamodel.base_models import BoundingBox, Cluster, Page
+from docling.datamodel.pipeline_options import LayoutOptions
 
 _log = logging.getLogger(__name__)
 
@@ -194,12 +195,16 @@ class LayoutPostprocessor:
         DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
     }
 
-    def __init__(self, page: Page, clusters: List[Cluster]) -> None:
+    def __init__(
+        self, page: Page, clusters: List[Cluster], options: LayoutOptions
+    ) -> None:
         """Initialize processor with page and clusters."""
+        self.cells = page.cells
         self.page = page
         self.page_size = page.size
         self.all_clusters = clusters
+        self.options = options
         self.regular_clusters = [
             c for c in clusters if c.label not in self.SPECIAL_TYPES
         ]
@@ -267,7 +272,7 @@ class LayoutPostprocessor:
 
         # Handle orphaned cells
         unassigned = self._find_unassigned_cells(clusters)
-        if unassigned:
+        if unassigned and self.options.create_orphan_clusters:
             next_id = max((c.id for c in self.all_clusters), default=0) + 1
             orphan_clusters = []
             for i, cell in enumerate(unassigned):
diff --git a/docs/examples/compare_vlm_models.py b/docs/examples/compare_vlm_models.py
index f9bd2dcd..49c34387 100644
--- a/docs/examples/compare_vlm_models.py
+++ b/docs/examples/compare_vlm_models.py
@@ -14,11 +14,18 @@ from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
 from tabulate import tabulate
 
 from docling.datamodel import vlm_model_specs
+from docling.datamodel.accelerator_options import AcceleratorDevice
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.pipeline_options import (
     VlmPipelineOptions,
 )
-from docling.datamodel.pipeline_options_vlm_model import InferenceFramework
+from docling.datamodel.pipeline_options_vlm_model import (
+    InferenceFramework,
+    InlineVlmOptions,
+    ResponseFormat,
+    TransformersModelType,
+    TransformersPromptStyle,
+)
 from docling.document_converter import DocumentConverter, PdfFormatOption
 from docling.pipeline.vlm_pipeline import VlmPipeline
 
@@ -101,6 +108,33 @@ if __name__ == "__main__":
     out_path = Path("scratch")
     out_path.mkdir(parents=True, exist_ok=True)
 
+    ## Definition of more inline models
+    llava_qwen = InlineVlmOptions(
+        repo_id="llava-hf/llava-interleave-qwen-0.5b-hf",
+        # prompt="Read text in the image.",
+        prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
+        # prompt="Parse the reading order of this document.",
+        response_format=ResponseFormat.MARKDOWN,
+        inference_framework=InferenceFramework.TRANSFORMERS,
+        transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT,
+        supported_devices=[AcceleratorDevice.CUDA, AcceleratorDevice.CPU],
+        scale=2.0,
+        temperature=0.0,
+    )
+
+    # Note that this is not the expected way of using the Dolphin model, but it shows the usage of a raw prompt.
", + response_format=ResponseFormat.MARKDOWN, + inference_framework=InferenceFramework.TRANSFORMERS, + transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT, + transformers_prompt_style=TransformersPromptStyle.RAW, + supported_devices=[AcceleratorDevice.CUDA, AcceleratorDevice.CPU], + scale=2.0, + temperature=0.0, + ) + ## Use VlmPipeline pipeline_options = VlmPipelineOptions() pipeline_options.generate_page_images = True @@ -121,6 +155,9 @@ if __name__ == "__main__": vlm_model_specs.GRANITE_VISION_TRANSFORMERS, vlm_model_specs.PHI4_TRANSFORMERS, vlm_model_specs.PIXTRAL_12B_TRANSFORMERS, + ## More inline models + dolphin_oneshot, + llava_qwen, ] # Remove MLX models if not on Mac diff --git a/docs/examples/vlm_pipeline_api_model.py b/docs/examples/vlm_pipeline_api_model.py index 679f7bd7..a809b926 100644 --- a/docs/examples/vlm_pipeline_api_model.py +++ b/docs/examples/vlm_pipeline_api_model.py @@ -1,8 +1,10 @@ import logging import os from pathlib import Path +from typing import Optional import requests +from docling_core.types.doc.page import SegmentedPage from dotenv import load_dotenv from docling.datamodel.base_models import InputFormat @@ -32,6 +34,69 @@ def lms_vlm_options(model: str, prompt: str, format: ResponseFormat): return options +#### Using LM Studio with OlmOcr model + + +def lms_olmocr_vlm_options(model: str): + def _dynamic_olmocr_prompt(page: Optional[SegmentedPage]): + if page is None: + return ( + "Below is the image of one page of a document. Just return the plain text" + " representation of this document as if you were reading it naturally.\n" + "Do not hallucinate.\n" + ) + + anchor = [ + f"Page dimensions: {int(page.dimension.width)}x{int(page.dimension.height)}" + ] + + for text_cell in page.textline_cells: + if not text_cell.text.strip(): + continue + bbox = text_cell.rect.to_bounding_box().to_bottom_left_origin( + page.dimension.height + ) + anchor.append(f"[{int(bbox.l)}x{int(bbox.b)}] {text_cell.text}") + + for image_cell in page.bitmap_resources: + bbox = image_cell.rect.to_bounding_box().to_bottom_left_origin( + page.dimension.height + ) + anchor.append( + f"[Image {int(bbox.l)}x{int(bbox.b)} to {int(bbox.r)}x{int(bbox.t)}]" + ) + + if len(anchor) == 1: + anchor.append( + f"[Image 0x0 to {int(page.dimension.width)}x{int(page.dimension.height)}]" + ) + + # Original prompt uses cells sorting. We are skipping it in this demo. + + base_text = "\n".join(anchor) + + return ( + f"Below is the image of one page of a document, as well as some raw textual" + f" content that was previously extracted for it. 
+            f" representation of this document as if you were reading it naturally.\n"
+            f"Do not hallucinate.\n"
+            f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END"
+        )
+
+    options = ApiVlmOptions(
+        url="http://localhost:1234/v1/chat/completions",
+        params=dict(
+            model=model,
+        ),
+        prompt=_dynamic_olmocr_prompt,
+        timeout=90,
+        scale=1.0,
+        max_size=1024,  # from OlmOcr pipeline
+        response_format=ResponseFormat.MARKDOWN,
+    )
+    return options
+
+
 #### Using Ollama
 
 
@@ -123,6 +188,12 @@ def main():
     #     format=ResponseFormat.MARKDOWN,
     # )
 
+    # Example using the OlmOcr (dynamic prompt) model with LM Studio:
+    # (uncomment the following lines)
+    # pipeline_options.vlm_options = lms_olmocr_vlm_options(
+    #     model="hf.co/lmstudio-community/olmOCR-7B-0225-preview-GGUF",
+    # )
+
     # Example using the Granite Vision model with Ollama:
     # (uncomment the following lines)
     # pipeline_options.vlm_options = ollama_vlm_options(
diff --git a/docs/installation/index.md b/docs/installation/index.md
index 5930525c..38fba4c8 100644
--- a/docs/installation/index.md
+++ b/docs/installation/index.md
@@ -77,7 +77,7 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi
 === "RHEL"
 
     ```console
-    dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel
+    dnf install tesseract tesseract-devel tesseract-langpack-eng tesseract-osd leptonica-devel
     TESSDATA_PREFIX=/usr/share/tesseract/tessdata/
     echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}"
     ```
diff --git a/pyproject.toml b/pyproject.toml
index ee9ea944..391d64f4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "docling"
-version = "2.39.0"  # DO NOT EDIT, updated automatically
+version = "2.40.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 license = "MIT"
 keywords = [
diff --git a/tests/test_document_picture_classifier.py b/tests/test_document_picture_classifier.py
index 5dc5e926..3a43a61a 100644
--- a/tests/test_document_picture_classifier.py
+++ b/tests/test_document_picture_classifier.py
@@ -17,8 +17,9 @@ def get_converter():
     pipeline_options.do_table_structure = False
     pipeline_options.do_code_enrichment = False
     pipeline_options.do_formula_enrichment = False
+    pipeline_options.generate_picture_images = False
+    pipeline_options.generate_page_images = False
     pipeline_options.do_picture_classification = True
-    pipeline_options.generate_picture_images = True
     pipeline_options.images_scale = 2
 
     converter = DocumentConverter(
diff --git a/uv.lock b/uv.lock
index f72c2832..81326046 100644
--- a/uv.lock
+++ b/uv.lock
@@ -805,7 +805,7 @@ wheels = [
 
 [[package]]
 name = "docling"
-version = "2.39.0"
+version = "2.40.0"
 source = { editable = "." }
 dependencies = [
     { name = "beautifulsoup4" },
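As a closing note on the prompt plumbing above: `BaseVlmOptions.prompt` now accepts either a plain string or a callable over the (optional) `SegmentedPage` that `VlmPipeline` now stores via `page._backend.get_segmented_page()`. A hedged sketch of that contract, in the spirit of `lms_olmocr_vlm_options` — the endpoint and the line-count heuristic are illustrative assumptions:

```python
from typing import Optional

from docling_core.types.doc.page import SegmentedPage

from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, ResponseFormat


def dynamic_prompt(page: Optional[SegmentedPage]) -> str:
    # Called once per page; `page` is None when no parsed page is available.
    if page is None or not page.textline_cells:
        return "Convert this page to markdown."
    return (
        "Convert this page to markdown. The page contains roughly "
        f"{len(page.textline_cells)} text lines; do not drop any of them."
    )


options = ApiVlmOptions(
    url="http://localhost:1234/v1/chat/completions",  # assumed local server
    prompt=dynamic_prompt,  # callable instead of a static string
    temperature=0.0,  # honored for API models since this change
    response_format=ResponseFormat.MARKDOWN,
)
```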