From ec6cf6f7e8050db30c14f0625d6d5c6bbfeb6aeb Mon Sep 17 00:00:00 2001 From: Christoph Auer <60343111+cau-git@users.noreply.github.com> Date: Fri, 4 Jul 2025 15:36:13 +0200 Subject: [PATCH 1/7] feat: Introduce LayoutOptions to control layout postprocessing behaviour (#1870) Signed-off-by: Christoph Auer --- docling/datamodel/pipeline_options.py | 8 ++++++++ docling/models/layout_model.py | 10 ++++++++-- docling/pipeline/standard_pdf_pipeline.py | 1 + docling/utils/layout_postprocessor.py | 9 +++++++-- 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 11e085b7..fcf091ef 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -1,4 +1,5 @@ import logging +from datetime import datetime from enum import Enum from pathlib import Path from typing import Any, ClassVar, Dict, List, Literal, Optional, Union @@ -265,6 +266,12 @@ class VlmPipelineOptions(PaginatedPipelineOptions): ) +class LayoutOptions(BaseModel): + """Options for layout processing.""" + + create_orphan_clusters: bool = True # Whether to create clusters for orphaned cells + + class AsrPipelineOptions(PipelineOptions): asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY artifacts_path: Optional[Union[Path, str]] = None @@ -289,6 +296,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions): picture_description_options: PictureDescriptionBaseOptions = ( smolvlm_picture_description ) + layout_options: LayoutOptions = LayoutOptions() images_scale: float = 1.0 generate_page_images: bool = False diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index da75bb8f..44e7286d 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -12,6 +12,7 @@ from PIL import Image from docling.datamodel.accelerator_options import AcceleratorOptions from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page from docling.datamodel.document import ConversionResult +from docling.datamodel.pipeline_options import LayoutOptions from docling.datamodel.settings import settings from docling.models.base_model import BasePageModel from docling.models.utils.hf_model_download import download_hf_model @@ -48,10 +49,15 @@ class LayoutModel(BasePageModel): CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION] def __init__( - self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions + self, + artifacts_path: Optional[Path], + accelerator_options: AcceleratorOptions, + options: LayoutOptions, ): from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor + self.options = options + device = decide_device(accelerator_options.device) if artifacts_path is None: @@ -177,7 +183,7 @@ class LayoutModel(BasePageModel): # Apply postprocessing processed_clusters, processed_cells = LayoutPostprocessor( - page, clusters + page, clusters, self.options ).postprocess() # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index ad4f36da..8861174a 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -80,6 +80,7 @@ class StandardPdfPipeline(PaginatedPipeline): LayoutModel( artifacts_path=artifacts_path, accelerator_options=pipeline_options.accelerator_options, + options=pipeline_options.layout_options, ), # Table structure model TableStructureModel( diff --git a/docling/utils/layout_postprocessor.py b/docling/utils/layout_postprocessor.py index 3db1cf8d..a98b3aab 100644 --- a/docling/utils/layout_postprocessor.py +++ b/docling/utils/layout_postprocessor.py @@ -9,6 +9,7 @@ from docling_core.types.doc.page import TextCell from rtree import index from docling.datamodel.base_models import BoundingBox, Cluster, Page +from docling.datamodel.pipeline_options import LayoutOptions _log = logging.getLogger(__name__) @@ -194,12 +195,16 @@ class LayoutPostprocessor: DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER, } - def __init__(self, page: Page, clusters: List[Cluster]) -> None: + def __init__( + self, page: Page, clusters: List[Cluster], options: LayoutOptions + ) -> None: """Initialize processor with page and clusters.""" + self.cells = page.cells self.page = page self.page_size = page.size self.all_clusters = clusters + self.options = options self.regular_clusters = [ c for c in clusters if c.label not in self.SPECIAL_TYPES ] @@ -267,7 +272,7 @@ class LayoutPostprocessor: # Handle orphaned cells unassigned = self._find_unassigned_cells(clusters) - if unassigned: + if unassigned and self.options.create_orphan_clusters: next_id = max((c.id for c in self.all_clusters), default=0) + 1 orphan_clusters = [] for i, cell in enumerate(unassigned): From f4a1c06937f5356758296cd9994ab3a8092bbe01 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 4 Jul 2025 15:31:36 +0000 Subject: [PATCH 2/7] chore: bump version to 2.40.0 [skip ci] --- CHANGELOG.md | 17 ++++++++++++++ pyproject.toml | 2 +- uv.lock | 60 +++++++++++++++++++++++++------------------------- 3 files changed, 48 insertions(+), 31 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e04a4dd..d49f5a3b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,20 @@ +## [v2.40.0](https://github.com/docling-project/docling/releases/tag/v2.40.0) - 2025-07-04 + +### Feature + +* Introduce LayoutOptions to control layout postprocessing behaviour ([#1870](https://github.com/docling-project/docling/issues/1870)) ([`ec6cf6f`](https://github.com/docling-project/docling/commit/ec6cf6f7e8050db30c14f0625d6d5c6bbfeb6aeb)) +* Integrate ListItemMarkerProcessor into document assembly ([#1825](https://github.com/docling-project/docling/issues/1825)) ([`56a0e10`](https://github.com/docling-project/docling/commit/56a0e104f76c5ba30ac0fcd247be61f911b560c1)) + +### Fix + +* Secure torch model inits with global locks ([#1884](https://github.com/docling-project/docling/issues/1884)) ([`598c9c5`](https://github.com/docling-project/docling/commit/598c9c53d401de6aac89b7c51bccd57160dace1e)) +* Ensure that TesseractOcrModel does not crash in case OSD is not installed ([#1866](https://github.com/docling-project/docling/issues/1866)) ([`ae39a94`](https://github.com/docling-project/docling/commit/ae39a9411a09b2165ac745af358dea644f868e26)) + +### Performance + +* **msexcel:** _find_table_bounds use iter_rows/iter_cols instead of Worksheet.cell ([#1875](https://github.com/docling-project/docling/issues/1875)) ([`13865c0`](https://github.com/docling-project/docling/commit/13865c06f5c564b9e57f3dbb60d26e60c75258b6)) +* Move expensive imports closer to usage ([#1863](https://github.com/docling-project/docling/issues/1863)) ([`3089cf2`](https://github.com/docling-project/docling/commit/3089cf2d26918eed4007398a528f53971c19f839)) + ## [v2.39.0](https://github.com/docling-project/docling/releases/tag/v2.39.0) - 2025-06-27 ### Feature diff --git a/pyproject.toml b/pyproject.toml index 7139c031..1bd0f3d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "docling" -version = "2.39.0" # DO NOT EDIT, updated automatically +version = "2.40.0" # DO NOT EDIT, updated automatically description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications." license = "MIT" keywords = [ diff --git a/uv.lock b/uv.lock index c98b69b6..b14c8f3d 100644 --- a/uv.lock +++ b/uv.lock @@ -805,7 +805,7 @@ wheels = [ [[package]] name = "docling" -version = "2.39.0" +version = "2.40.0" source = { editable = "." } dependencies = [ { name = "beautifulsoup4" }, @@ -3367,7 +3367,7 @@ name = "nvidia-cudnn-cu12" version = "9.5.1.17" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12", marker = "python_full_version < '3.10' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, + { name = "nvidia-cublas-cu12", marker = "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/2a/78/4535c9c7f859a64781e43c969a3a7e84c54634e319a996d43ef32ce46f83/nvidia_cudnn_cu12-9.5.1.17-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:30ac3869f6db17d170e0e556dd6cc5eee02647abc31ca856634d5a40f82c15b2", size = 570988386, upload-time = "2024-10-25T19:54:26.39Z" }, @@ -3378,7 +3378,7 @@ name = "nvidia-cufft-cu12" version = "11.3.0.4" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12", marker = "python_full_version < '3.10' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, + { name = "nvidia-nvjitlink-cu12", marker = "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/8f/16/73727675941ab8e6ffd86ca3a4b7b47065edcca7a997920b831f8147c99d/nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ccba62eb9cef5559abd5e0d54ceed2d9934030f51163df018532142a8ec533e5", size = 200221632, upload-time = "2024-11-20T17:41:32.357Z" }, @@ -3407,9 +3407,9 @@ name = "nvidia-cusolver-cu12" version = "11.7.1.2" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12", marker = "python_full_version < '3.10' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, - { name = "nvidia-cusparse-cu12", marker = "python_full_version < '3.10' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, - { name = "nvidia-nvjitlink-cu12", marker = "python_full_version < '3.10' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, + { name = "nvidia-cublas-cu12", marker = "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "nvidia-cusparse-cu12", marker = "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "nvidia-nvjitlink-cu12", marker = "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/f0/6e/c2cf12c9ff8b872e92b4a5740701e51ff17689c4d726fca91875b07f655d/nvidia_cusolver_cu12-11.7.1.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e9e49843a7707e42022babb9bcfa33c29857a93b88020c4e4434656a655b698c", size = 158229790, upload-time = "2024-11-20T17:43:43.211Z" }, @@ -3421,7 +3421,7 @@ name = "nvidia-cusparse-cu12" version = "12.5.4.2" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12", marker = "python_full_version < '3.10' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, + { name = "nvidia-nvjitlink-cu12", marker = "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/06/1e/b8b7c2f4099a37b96af5c9bb158632ea9e5d9d27d7391d7eb8fc45236674/nvidia_cusparse_cu12-12.5.4.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7556d9eca156e18184b94947ade0fba5bb47d69cec46bf8660fd2c71a4b48b73", size = 216561367, upload-time = "2024-11-20T17:44:54.824Z" }, @@ -3466,10 +3466,10 @@ name = "ocrmac" version = "1.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "click", version = "8.1.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "click", version = "8.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "pillow" }, - { name = "pyobjc-framework-vision" }, + { name = "click", version = "8.1.8", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux')" }, + { name = "click", version = "8.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' and sys_platform == 'darwin'" }, + { name = "pillow", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, + { name = "pyobjc-framework-vision", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/dd/dc/de3e9635774b97d9766f6815bbb3f5ec9bce347115f10d9abbf2733a9316/ocrmac-1.0.0.tar.gz", hash = "sha256:5b299e9030c973d1f60f82db000d6c2e5ff271601878c7db0885e850597d1d2e", size = 1463997, upload-time = "2024-11-07T12:00:00.197Z" } wheels = [ @@ -4496,7 +4496,7 @@ name = "pyobjc-framework-cocoa" version = "11.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pyobjc-core" }, + { name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/4b/c5/7a866d24bc026f79239b74d05e2cf3088b03263da66d53d1b4cf5207f5ae/pyobjc_framework_cocoa-11.1.tar.gz", hash = "sha256:87df76b9b73e7ca699a828ff112564b59251bb9bbe72e610e670a4dc9940d038", size = 5565335, upload-time = "2025-06-14T20:56:59.683Z" } wheels = [ @@ -4515,8 +4515,8 @@ name = "pyobjc-framework-coreml" version = "11.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pyobjc-core" }, - { name = "pyobjc-framework-cocoa" }, + { name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, + { name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/0d/5d/4309f220981d769b1a2f0dcb2c5c104490d31389a8ebea67e5595ce1cb74/pyobjc_framework_coreml-11.1.tar.gz", hash = "sha256:775923eefb9eac2e389c0821b10564372de8057cea89f1ea1cdaf04996c970a7", size = 82005, upload-time = "2025-06-14T20:57:12.004Z" } wheels = [ @@ -4535,8 +4535,8 @@ name = "pyobjc-framework-quartz" version = "11.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pyobjc-core" }, - { name = "pyobjc-framework-cocoa" }, + { name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, + { name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/c7/ac/6308fec6c9ffeda9942fef72724f4094c6df4933560f512e63eac37ebd30/pyobjc_framework_quartz-11.1.tar.gz", hash = "sha256:a57f35ccfc22ad48c87c5932818e583777ff7276605fef6afad0ac0741169f75", size = 3953275, upload-time = "2025-06-14T20:58:17.924Z" } wheels = [ @@ -4555,10 +4555,10 @@ name = "pyobjc-framework-vision" version = "11.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pyobjc-core" }, - { name = "pyobjc-framework-cocoa" }, - { name = "pyobjc-framework-coreml" }, - { name = "pyobjc-framework-quartz" }, + { name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, + { name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, + { name = "pyobjc-framework-coreml", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, + { name = "pyobjc-framework-quartz", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/40/a8/7128da4d0a0103cabe58910a7233e2f98d18c590b1d36d4b3efaaedba6b9/pyobjc_framework_vision-11.1.tar.gz", hash = "sha256:26590512ee7758da3056499062a344b8a351b178be66d4b719327884dde4216b", size = 133721, upload-time = "2025-06-14T20:58:46.095Z" } wheels = [ @@ -5038,17 +5038,17 @@ version = "1.4.4" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and python_full_version < '3.13') or (python_full_version >= '3.10' and platform_machine != 'arm64') or (python_full_version >= '3.10' and sys_platform != 'darwin')" }, { name = "onnxruntime", version = "1.19.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "onnxruntime", version = "1.22.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "opencv-python" }, - { name = "pillow" }, - { name = "pyclipper" }, - { name = "pyyaml" }, + { name = "onnxruntime", version = "1.22.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and python_full_version < '3.13') or (python_full_version >= '3.10' and platform_machine != 'arm64') or (python_full_version >= '3.10' and sys_platform != 'darwin')" }, + { name = "opencv-python", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, + { name = "pillow", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, + { name = "pyclipper", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, + { name = "pyyaml", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, { name = "shapely", version = "2.0.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "shapely", version = "2.1.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "six" }, - { name = "tqdm" }, + { name = "shapely", version = "2.1.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and python_full_version < '3.13') or (python_full_version >= '3.10' and platform_machine != 'arm64') or (python_full_version >= '3.10' and sys_platform != 'darwin')" }, + { name = "six", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, + { name = "tqdm", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/ba/12/1e5497183bdbe782dbb91bad1d0d2297dba4d2831b2652657f7517bfc6df/rapidocr_onnxruntime-1.4.4-py3-none-any.whl", hash = "sha256:971d7d5f223a7a808662229df1ef69893809d8457d834e6373d3854bc1782cbf", size = 14915192, upload-time = "2025-01-17T01:48:25.104Z" }, @@ -6344,7 +6344,7 @@ name = "triton" version = "3.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "setuptools", marker = "python_full_version < '3.10' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, + { name = "setuptools", marker = "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/8d/a9/549e51e9b1b2c9b854fd761a1d23df0ba2fbc60bd0c13b489ffa518cfcb7/triton-3.3.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b74db445b1c562844d3cfad6e9679c72e93fdfb1a90a24052b03bb5c49d1242e", size = 155600257, upload-time = "2025-05-29T23:39:36.085Z" }, From dd8fde7f19ecd9695d6bc6cf94896a2cf87a0e7c Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Mon, 7 Jul 2025 08:59:14 +0200 Subject: [PATCH 3/7] fix: typo in asr options (#1902) fix typo Signed-off-by: Michele Dolfi --- docling/datamodel/asr_model_specs.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docling/datamodel/asr_model_specs.py b/docling/datamodel/asr_model_specs.py index 95287ad2..426b5851 100644 --- a/docling/datamodel/asr_model_specs.py +++ b/docling/datamodel/asr_model_specs.py @@ -22,7 +22,7 @@ WHISPER_TINY = InlineAsrNativeWhisperOptions( verbose=True, timestamps=True, word_timestamps=True, - temperatue=0.0, + temperature=0.0, max_new_tokens=256, max_time_chunk=30.0, ) @@ -33,7 +33,7 @@ WHISPER_SMALL = InlineAsrNativeWhisperOptions( verbose=True, timestamps=True, word_timestamps=True, - temperatue=0.0, + temperature=0.0, max_new_tokens=256, max_time_chunk=30.0, ) @@ -44,7 +44,7 @@ WHISPER_MEDIUM = InlineAsrNativeWhisperOptions( verbose=True, timestamps=True, word_timestamps=True, - temperatue=0.0, + temperature=0.0, max_new_tokens=256, max_time_chunk=30.0, ) @@ -55,7 +55,7 @@ WHISPER_BASE = InlineAsrNativeWhisperOptions( verbose=True, timestamps=True, word_timestamps=True, - temperatue=0.0, + temperature=0.0, max_new_tokens=256, max_time_chunk=30.0, ) @@ -66,7 +66,7 @@ WHISPER_LARGE = InlineAsrNativeWhisperOptions( verbose=True, timestamps=True, word_timestamps=True, - temperatue=0.0, + temperature=0.0, max_new_tokens=256, max_time_chunk=30.0, ) @@ -77,7 +77,7 @@ WHISPER_TURBO = InlineAsrNativeWhisperOptions( verbose=True, timestamps=True, word_timestamps=True, - temperatue=0.0, + temperature=0.0, max_new_tokens=256, max_time_chunk=30.0, ) From edd4356aac25b62c30cae6d2e8c69095f63bd442 Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Mon, 7 Jul 2025 16:23:16 +0200 Subject: [PATCH 4/7] fix: use only backend for picture classifier (#1904) use backend for picture classifier Signed-off-by: Michele Dolfi --- docling/models/document_picture_classifier.py | 25 +++++++++---------- docling/pipeline/standard_pdf_pipeline.py | 1 + tests/test_document_picture_classifier.py | 3 ++- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/docling/models/document_picture_classifier.py b/docling/models/document_picture_classifier.py index 73a30203..24e45078 100644 --- a/docling/models/document_picture_classifier.py +++ b/docling/models/document_picture_classifier.py @@ -14,7 +14,8 @@ from PIL import Image from pydantic import BaseModel from docling.datamodel.accelerator_options import AcceleratorOptions -from docling.models.base_model import BaseEnrichmentModel +from docling.datamodel.base_models import ItemAndImageEnrichmentElement +from docling.models.base_model import BaseItemAndImageEnrichmentModel from docling.models.utils.hf_model_download import download_hf_model from docling.utils.accelerator_utils import decide_device @@ -32,7 +33,7 @@ class DocumentPictureClassifierOptions(BaseModel): kind: Literal["document_picture_classifier"] = "document_picture_classifier" -class DocumentPictureClassifier(BaseEnrichmentModel): +class DocumentPictureClassifier(BaseItemAndImageEnrichmentModel): """ A model for classifying pictures in documents. @@ -135,7 +136,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel): def __call__( self, doc: DoclingDocument, - element_batch: Iterable[NodeItem], + element_batch: Iterable[ItemAndImageEnrichmentElement], ) -> Iterable[NodeItem]: """ Processes a batch of elements and enriches them with classification predictions. @@ -144,7 +145,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel): ---------- doc : DoclingDocument The document containing the elements to be processed. - element_batch : Iterable[NodeItem] + element_batch : Iterable[ItemAndImageEnrichmentElement] A batch of pictures to classify. Returns @@ -155,22 +156,20 @@ class DocumentPictureClassifier(BaseEnrichmentModel): """ if not self.enabled: for element in element_batch: - yield element + yield element.item return images: List[Union[Image.Image, np.ndarray]] = [] elements: List[PictureItem] = [] for el in element_batch: - assert isinstance(el, PictureItem) - elements.append(el) - img = el.get_image(doc) - assert img is not None - images.append(img) + assert isinstance(el.item, PictureItem) + elements.append(el.item) + images.append(el.image) outputs = self.document_picture_classifier.predict(images) - for element, output in zip(elements, outputs): - element.annotations.append( + for item, output in zip(elements, outputs): + item.annotations.append( PictureClassificationData( provenance="DocumentPictureClassifier", predicted_classes=[ @@ -183,4 +182,4 @@ class DocumentPictureClassifier(BaseEnrichmentModel): ) ) - yield element + yield item diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index 8861174a..de76ef24 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -129,6 +129,7 @@ class StandardPdfPipeline(PaginatedPipeline): if ( self.pipeline_options.do_formula_enrichment or self.pipeline_options.do_code_enrichment + or self.pipeline_options.do_picture_classification or self.pipeline_options.do_picture_description ): self.keep_backend = True diff --git a/tests/test_document_picture_classifier.py b/tests/test_document_picture_classifier.py index 5dc5e926..3a43a61a 100644 --- a/tests/test_document_picture_classifier.py +++ b/tests/test_document_picture_classifier.py @@ -17,8 +17,9 @@ def get_converter(): pipeline_options.do_table_structure = False pipeline_options.do_code_enrichment = False pipeline_options.do_formula_enrichment = False + pipeline_options.generate_picture_images = False + pipeline_options.generate_page_images = False pipeline_options.do_picture_classification = True - pipeline_options.generate_picture_images = True pipeline_options.images_scale = 2 converter = DocumentConverter( From b8813eea806a33f3bcc4f865d7e6ceba8b2fffa5 Mon Sep 17 00:00:00 2001 From: Shkarupa Alex Date: Mon, 7 Jul 2025 17:58:42 +0300 Subject: [PATCH 5/7] feat(vlm): Dynamic prompts (#1808) * Unify temperature options for Vlm models * Dynamic prompt support with example * DCO Remediation Commit for Shkarupa Alex I, Shkarupa Alex , hereby add my Signed-off-by to this commit: 34d446cb9829835cf6b8f8fdb4abd9fef3455c3a I, Shkarupa Alex , hereby add my Signed-off-by to this commit: 9c595d574fce5e3e139f5af780f8223496735ff1 Signed-off-by: Shkarupa Alex * Replace Page with SegmentedPage * Fix example HF repo link Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> * Sign-off Signed-off-by: Shkarupa Alex * DCO Remediation Commit for Shkarupa Alex I, Shkarupa Alex , hereby add my Signed-off-by to this commit: 1a162066dd3e4ee240d272d9d503d549a0856590 Signed-off-by: Shkarupa Alex Signed-off-by: Shkarupa Alex * Use lmstudio-community model Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> * Swap inference engine to LM Studio Signed-off-by: Shkarupa Alex --------- Signed-off-by: Shkarupa Alex Signed-off-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> Co-authored-by: Christoph Auer <60343111+cau-git@users.noreply.github.com> --- .../datamodel/pipeline_options_vlm_model.py | 7 +- docling/models/api_vlm_model.py | 12 ++-- .../hf_transformers_model.py | 12 ++-- docling/models/vlm_models_inline/mlx_model.py | 8 ++- docling/pipeline/vlm_pipeline.py | 1 + docs/examples/vlm_pipeline_api_model.py | 71 +++++++++++++++++++ 6 files changed, 96 insertions(+), 15 deletions(-) diff --git a/docling/datamodel/pipeline_options_vlm_model.py b/docling/datamodel/pipeline_options_vlm_model.py index 90ab6685..fd672b1b 100644 --- a/docling/datamodel/pipeline_options_vlm_model.py +++ b/docling/datamodel/pipeline_options_vlm_model.py @@ -1,6 +1,7 @@ from enum import Enum -from typing import Any, Dict, List, Literal, Optional, Union +from typing import Any, Callable, Dict, List, Literal, Optional, Union +from docling_core.types.doc.page import SegmentedPage from pydantic import AnyUrl, BaseModel from typing_extensions import deprecated @@ -9,9 +10,10 @@ from docling.datamodel.accelerator_options import AcceleratorDevice class BaseVlmOptions(BaseModel): kind: str - prompt: str + prompt: Union[str, Callable[[Optional[SegmentedPage]], str]] scale: float = 2.0 max_size: Optional[int] = None + temperature: float = 0.0 class ResponseFormat(str, Enum): @@ -51,7 +53,6 @@ class InlineVlmOptions(BaseVlmOptions): AcceleratorDevice.MPS, ] - temperature: float = 0.0 stop_strings: List[str] = [] extra_generation_config: Dict[str, Any] = {} diff --git a/docling/models/api_vlm_model.py b/docling/models/api_vlm_model.py index bfd00003..164ac285 100644 --- a/docling/models/api_vlm_model.py +++ b/docling/models/api_vlm_model.py @@ -29,12 +29,9 @@ class ApiVlmModel(BasePageModel): self.timeout = self.vlm_options.timeout self.concurrency = self.vlm_options.concurrency - self.prompt_content = ( - f"This is a page from a document.\n{self.vlm_options.prompt}" - ) self.params = { **self.vlm_options.params, - "temperature": 0, + "temperature": self.vlm_options.temperature, } def __call__( @@ -56,9 +53,14 @@ class ApiVlmModel(BasePageModel): if hi_res_image.mode != "RGB": hi_res_image = hi_res_image.convert("RGB") + if callable(self.vlm_options.prompt): + prompt = self.vlm_options.prompt(page.parsed_page) + else: + prompt = self.vlm_options.prompt + page_tags = api_image_request( image=hi_res_image, - prompt=self.prompt_content, + prompt=prompt, url=self.vlm_options.url, timeout=self.timeout, headers=self.vlm_options.headers, diff --git a/docling/models/vlm_models_inline/hf_transformers_model.py b/docling/models/vlm_models_inline/hf_transformers_model.py index bd35888d..4e2d80b8 100644 --- a/docling/models/vlm_models_inline/hf_transformers_model.py +++ b/docling/models/vlm_models_inline/hf_transformers_model.py @@ -128,7 +128,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix ) # Define prompt structure - prompt = self.formulate_prompt() + if callable(self.vlm_options.prompt): + user_prompt = self.vlm_options.prompt(page.parsed_page) + else: + user_prompt = self.vlm_options.prompt + prompt = self.formulate_prompt(user_prompt) inputs = self.processor( text=prompt, images=[hi_res_image], return_tensors="pt" @@ -162,7 +166,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix yield page - def formulate_prompt(self) -> str: + def formulate_prompt(self, user_prompt: str) -> str: """Formulate a prompt for the VLM.""" if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct": @@ -173,7 +177,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix assistant_prompt = "<|assistant|>" prompt_suffix = "<|end|>" - prompt = f"{user_prompt}<|image_1|>{self.vlm_options.prompt}{prompt_suffix}{assistant_prompt}" + prompt = f"{user_prompt}<|image_1|>{user_prompt}{prompt_suffix}{assistant_prompt}" _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}") return prompt @@ -187,7 +191,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix "text": "This is a page from a document.", }, {"type": "image"}, - {"type": "text", "text": self.vlm_options.prompt}, + {"type": "text", "text": user_prompt}, ], } ] diff --git a/docling/models/vlm_models_inline/mlx_model.py b/docling/models/vlm_models_inline/mlx_model.py index 58f037fc..647ce531 100644 --- a/docling/models/vlm_models_inline/mlx_model.py +++ b/docling/models/vlm_models_inline/mlx_model.py @@ -56,8 +56,6 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin): elif (artifacts_path / repo_cache_folder).exists(): artifacts_path = artifacts_path / repo_cache_folder - self.param_question = vlm_options.prompt - ## Load the model self.vlm_model, self.processor = load(artifacts_path) self.config = load_config(artifacts_path) @@ -86,8 +84,12 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin): if hi_res_image.mode != "RGB": hi_res_image = hi_res_image.convert("RGB") + if callable(self.vlm_options.prompt): + user_prompt = self.vlm_options.prompt(page.parsed_page) + else: + user_prompt = self.vlm_options.prompt prompt = self.apply_chat_template( - self.processor, self.config, self.param_question, num_images=1 + self.processor, self.config, user_prompt, num_images=1 ) start_time = time.time() diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index 2ecfe55a..ab474fab 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -117,6 +117,7 @@ class VlmPipeline(PaginatedPipeline): page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore if page._backend is not None and page._backend.is_valid(): page.size = page._backend.get_size() + page.parsed_page = page._backend.get_segmented_page() return page diff --git a/docs/examples/vlm_pipeline_api_model.py b/docs/examples/vlm_pipeline_api_model.py index 679f7bd7..a809b926 100644 --- a/docs/examples/vlm_pipeline_api_model.py +++ b/docs/examples/vlm_pipeline_api_model.py @@ -1,8 +1,10 @@ import logging import os from pathlib import Path +from typing import Optional import requests +from docling_core.types.doc.page import SegmentedPage from dotenv import load_dotenv from docling.datamodel.base_models import InputFormat @@ -32,6 +34,69 @@ def lms_vlm_options(model: str, prompt: str, format: ResponseFormat): return options +#### Using LM Studio with OlmOcr model + + +def lms_olmocr_vlm_options(model: str): + def _dynamic_olmocr_prompt(page: Optional[SegmentedPage]): + if page is None: + return ( + "Below is the image of one page of a document. Just return the plain text" + " representation of this document as if you were reading it naturally.\n" + "Do not hallucinate.\n" + ) + + anchor = [ + f"Page dimensions: {int(page.dimension.width)}x{int(page.dimension.height)}" + ] + + for text_cell in page.textline_cells: + if not text_cell.text.strip(): + continue + bbox = text_cell.rect.to_bounding_box().to_bottom_left_origin( + page.dimension.height + ) + anchor.append(f"[{int(bbox.l)}x{int(bbox.b)}] {text_cell.text}") + + for image_cell in page.bitmap_resources: + bbox = image_cell.rect.to_bounding_box().to_bottom_left_origin( + page.dimension.height + ) + anchor.append( + f"[Image {int(bbox.l)}x{int(bbox.b)} to {int(bbox.r)}x{int(bbox.t)}]" + ) + + if len(anchor) == 1: + anchor.append( + f"[Image 0x0 to {int(page.dimension.width)}x{int(page.dimension.height)}]" + ) + + # Original prompt uses cells sorting. We are skipping it in this demo. + + base_text = "\n".join(anchor) + + return ( + f"Below is the image of one page of a document, as well as some raw textual" + f" content that was previously extracted for it. Just return the plain text" + f" representation of this document as if you were reading it naturally.\n" + f"Do not hallucinate.\n" + f"RAW_TEXT_START\n{base_text}\nRAW_TEXT_END" + ) + + options = ApiVlmOptions( + url="http://localhost:1234/v1/chat/completions", + params=dict( + model=model, + ), + prompt=_dynamic_olmocr_prompt, + timeout=90, + scale=1.0, + max_size=1024, # from OlmOcr pipeline + response_format=ResponseFormat.MARKDOWN, + ) + return options + + #### Using Ollama @@ -123,6 +188,12 @@ def main(): # format=ResponseFormat.MARKDOWN, # ) + # Example using the OlmOcr (dynamic prompt) model with LM Studio: + # (uncomment the following lines) + # pipeline_options.vlm_options = lms_olmocr_vlm_options( + # model="hf.co/lmstudio-community/olmOCR-7B-0225-preview-GGUF", + # ) + # Example using the Granite Vision model with Ollama: # (uncomment the following lines) # pipeline_options.vlm_options = ollama_vlm_options( From e25873d55766761741ad5781efd18bc3bfea5e3d Mon Sep 17 00:00:00 2001 From: VIktor Kuropiantnyk <103574791+vku-ibm@users.noreply.github.com> Date: Mon, 7 Jul 2025 17:06:26 +0200 Subject: [PATCH 6/7] fix: docs are missing osd packages for tesseract on RHEL (#1905) Fixed missing packages in the docs on tesseract Signed-off-by: Viktor Kuropiatnyk --- docs/installation/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/installation/index.md b/docs/installation/index.md index 5930525c..38fba4c8 100644 --- a/docs/installation/index.md +++ b/docs/installation/index.md @@ -77,7 +77,7 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi === "RHEL" ```console - dnf install tesseract tesseract-devel tesseract-langpack-eng leptonica-devel + dnf install tesseract tesseract-devel tesseract-langpack-eng tesseract-osd leptonica-devel TESSDATA_PREFIX=/usr/share/tesseract/tessdata/ echo "Set TESSDATA_PREFIX=${TESSDATA_PREFIX}" ``` From a07ba863c4c3dacfecaca159faa5653097662755 Mon Sep 17 00:00:00 2001 From: geoHeil <1694964+geoHeil@users.noreply.github.com> Date: Tue, 8 Jul 2025 05:54:57 +0200 Subject: [PATCH 7/7] feat: add image-text-to-text models in transformers (#1772) * feat(dolphin): add dolphin support Signed-off-by: Georg Heiler * rename Signed-off-by: Georg Heiler * reformat Signed-off-by: Georg Heiler * fix mypy Signed-off-by: Georg Heiler * add prompt style and examples Signed-off-by: Michele Dolfi --------- Signed-off-by: Georg Heiler Signed-off-by: Michele Dolfi Co-authored-by: Michele Dolfi --- .../datamodel/pipeline_options_vlm_model.py | 7 +++ .../hf_transformers_model.py | 49 ++++++++++++------- docs/examples/compare_vlm_models.py | 39 ++++++++++++++- 3 files changed, 77 insertions(+), 18 deletions(-) diff --git a/docling/datamodel/pipeline_options_vlm_model.py b/docling/datamodel/pipeline_options_vlm_model.py index fd672b1b..bcea2493 100644 --- a/docling/datamodel/pipeline_options_vlm_model.py +++ b/docling/datamodel/pipeline_options_vlm_model.py @@ -31,6 +31,12 @@ class TransformersModelType(str, Enum): AUTOMODEL = "automodel" AUTOMODEL_VISION2SEQ = "automodel-vision2seq" AUTOMODEL_CAUSALLM = "automodel-causallm" + AUTOMODEL_IMAGETEXTTOTEXT = "automodel-imagetexttotext" + + +class TransformersPromptStyle(str, Enum): + CHAT = "chat" + RAW = "raw" class InlineVlmOptions(BaseVlmOptions): @@ -44,6 +50,7 @@ class InlineVlmOptions(BaseVlmOptions): inference_framework: InferenceFramework transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL + transformers_prompt_style: TransformersPromptStyle = TransformersPromptStyle.CHAT response_format: ResponseFormat torch_dtype: Optional[str] = None diff --git a/docling/models/vlm_models_inline/hf_transformers_model.py b/docling/models/vlm_models_inline/hf_transformers_model.py index 4e2d80b8..d84925dd 100644 --- a/docling/models/vlm_models_inline/hf_transformers_model.py +++ b/docling/models/vlm_models_inline/hf_transformers_model.py @@ -13,6 +13,7 @@ from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options_vlm_model import ( InlineVlmOptions, TransformersModelType, + TransformersPromptStyle, ) from docling.models.base_model import BasePageModel from docling.models.utils.hf_model_download import ( @@ -41,6 +42,7 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix from transformers import ( AutoModel, AutoModelForCausalLM, + AutoModelForImageTextToText, AutoModelForVision2Seq, AutoProcessor, BitsAndBytesConfig, @@ -91,6 +93,11 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix == TransformersModelType.AUTOMODEL_VISION2SEQ ): model_cls = AutoModelForVision2Seq + elif ( + self.vlm_options.transformers_model_type + == TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT + ): + model_cls = AutoModelForImageTextToText self.processor = AutoProcessor.from_pretrained( artifacts_path, @@ -169,7 +176,10 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix def formulate_prompt(self, user_prompt: str) -> str: """Formulate a prompt for the VLM.""" - if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct": + if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.RAW: + return user_prompt + + elif self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct": _log.debug("Using specialized prompt for Phi-4") # more info here: https://huggingface.co/microsoft/Phi-4-multimodal-instruct#loading-the-model-locally @@ -182,20 +192,25 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix return prompt - messages = [ - { - "role": "user", - "content": [ - { - "type": "text", - "text": "This is a page from a document.", - }, - {"type": "image"}, - {"type": "text", "text": user_prompt}, - ], - } - ] - prompt = self.processor.apply_chat_template( - messages, add_generation_prompt=False + elif self.vlm_options.transformers_prompt_style == TransformersPromptStyle.CHAT: + messages = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "This is a page from a document.", + }, + {"type": "image"}, + {"type": "text", "text": user_prompt}, + ], + } + ] + prompt = self.processor.apply_chat_template( + messages, add_generation_prompt=False + ) + return prompt + + raise RuntimeError( + f"Uknown prompt style `{self.vlm_options.transformers_prompt_style}`. Valid values are {', '.join(s.value for s in TransformersPromptStyle)}." ) - return prompt diff --git a/docs/examples/compare_vlm_models.py b/docs/examples/compare_vlm_models.py index f9bd2dcd..49c34387 100644 --- a/docs/examples/compare_vlm_models.py +++ b/docs/examples/compare_vlm_models.py @@ -14,11 +14,18 @@ from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS from tabulate import tabulate from docling.datamodel import vlm_model_specs +from docling.datamodel.accelerator_options import AcceleratorDevice from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import ( VlmPipelineOptions, ) -from docling.datamodel.pipeline_options_vlm_model import InferenceFramework +from docling.datamodel.pipeline_options_vlm_model import ( + InferenceFramework, + InlineVlmOptions, + ResponseFormat, + TransformersModelType, + TransformersPromptStyle, +) from docling.document_converter import DocumentConverter, PdfFormatOption from docling.pipeline.vlm_pipeline import VlmPipeline @@ -101,6 +108,33 @@ if __name__ == "__main__": out_path = Path("scratch") out_path.mkdir(parents=True, exist_ok=True) + ## Definiton of more inline models + llava_qwen = InlineVlmOptions( + repo_id="llava-hf/llava-interleave-qwen-0.5b-hf", + # prompt="Read text in the image.", + prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!", + # prompt="Parse the reading order of this document.", + response_format=ResponseFormat.MARKDOWN, + inference_framework=InferenceFramework.TRANSFORMERS, + transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT, + supported_devices=[AcceleratorDevice.CUDA, AcceleratorDevice.CPU], + scale=2.0, + temperature=0.0, + ) + + # Note that this is not the expected way of using the Dolphin model, but it shows the usage of a raw prompt. + dolphin_oneshot = InlineVlmOptions( + repo_id="ByteDance/Dolphin", + prompt="Read text in the image. ", + response_format=ResponseFormat.MARKDOWN, + inference_framework=InferenceFramework.TRANSFORMERS, + transformers_model_type=TransformersModelType.AUTOMODEL_IMAGETEXTTOTEXT, + transformers_prompt_style=TransformersPromptStyle.RAW, + supported_devices=[AcceleratorDevice.CUDA, AcceleratorDevice.CPU], + scale=2.0, + temperature=0.0, + ) + ## Use VlmPipeline pipeline_options = VlmPipelineOptions() pipeline_options.generate_page_images = True @@ -121,6 +155,9 @@ if __name__ == "__main__": vlm_model_specs.GRANITE_VISION_TRANSFORMERS, vlm_model_specs.PHI4_TRANSFORMERS, vlm_model_specs.PIXTRAL_12B_TRANSFORMERS, + ## More inline models + dolphin_oneshot, + llava_qwen, ] # Remove MLX models if not on Mac