From 90b766e2ae1695a759191df37c272efc09be5ee3 Mon Sep 17 00:00:00 2001
From: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Date: Fri, 7 Feb 2025 12:55:12 +0100
Subject: [PATCH 1/6] fix(markdown): handle nested lists (#910)
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
---
docling/backend/md_backend.py | 108 ++++++++++--------
.../data/groundtruth/docling_v2/nested.md.md | 31 +++++
tests/data/md/nested.md | 66 +++++++++++
tests/test_backend_markdown.py | 12 +-
tests/test_data_gen_flag.py | 9 ++
5 files changed, 177 insertions(+), 49 deletions(-)
create mode 100644 tests/data/groundtruth/docling_v2/nested.md.md
create mode 100644 tests/data/md/nested.md
create mode 100644 tests/test_data_gen_flag.py
diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py
index eaf47537..19a21c19 100644
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@@ -36,7 +36,7 @@ _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
- def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
+ def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
# This regex will match any sequence of underscores
pattern = r"_+"
@@ -81,7 +81,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# very long sequences of underscores will lead to unnecessary long processing times.
# In any proper Markdown files, underscores have to be escaped,
# otherwise they represent emphasis (bold or italic)
- self.markdown = self.shorten_underscore_sequences(text_stream)
+ self.markdown = self._shorten_underscore_sequences(text_stream)
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r", encoding="utf-8") as f:
md_content = f.read()
@@ -89,7 +89,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# very long sequences of underscores will lead to unnecessary long processing times.
# In any proper Markdown files, underscores have to be escaped,
# otherwise they represent emphasis (bold or italic)
- self.markdown = self.shorten_underscore_sequences(md_content)
+ self.markdown = self._shorten_underscore_sequences(md_content)
self.valid = True
_log.debug(self.markdown)
@@ -99,7 +99,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
) from e
return
- def close_table(self, doc: DoclingDocument):
+ def _close_table(self, doc: DoclingDocument):
if self.in_table:
_log.debug("=== TABLE START ===")
for md_table_row in self.md_table_buffer:
@@ -156,30 +156,35 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
doc.add_table(data=table_data)
return
- def process_inline_text(
- self, parent_element: Optional[NodeItem], doc: DoclingDocument
+ def _process_inline_text(
+ self, parent_item: Optional[NodeItem], doc: DoclingDocument
):
txt = " ".join(self.inline_texts)
if len(txt) > 0:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
- parent=parent_element,
+ parent=parent_item,
text=txt,
)
self.inline_texts = []
- def iterate_elements(
+ def _iterate_elements(
self,
element: marko.element.Element,
depth: int,
doc: DoclingDocument,
- parent_element: Optional[NodeItem] = None,
+ visited: Set[marko.element.Element],
+ parent_item: Optional[NodeItem] = None,
):
+
+ if element in visited:
+ return
+
# Iterates over all elements in the AST
# Check for different element types and process relevant details
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
- self.close_table(doc)
- self.process_inline_text(parent_element, doc)
+ self._close_table(doc)
+ self._process_inline_text(parent_item, doc)
_log.debug(
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
)
@@ -207,8 +212,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
traverse(element)
snippet_text = "".join(strings)
if len(snippet_text) > 0:
- parent_element = doc.add_text(
- label=doc_label, parent=parent_element, text=snippet_text
+ parent_item = doc.add_text(
+ label=doc_label, parent=parent_item, text=snippet_text
)
elif isinstance(element, marko.block.List):
@@ -218,35 +223,37 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
has_non_empty_list_items = True
break
- self.close_table(doc)
- self.process_inline_text(parent_element, doc)
+ self._close_table(doc)
+ self._process_inline_text(parent_item, doc)
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
if has_non_empty_list_items:
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
- parent_element = doc.add_group(
- label=label, name=f"list", parent=parent_element
+ parent_item = doc.add_group(
+ label=label, name=f"list", parent=parent_item
)
elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
- self.close_table(doc)
- self.process_inline_text(parent_element, doc)
+ self._close_table(doc)
+ self._process_inline_text(parent_item, doc)
_log.debug(" - List item")
- snippet_text = str(element.children[0].children[0].children) # type: ignore
+ first_child = element.children[0]
+ snippet_text = str(first_child.children[0].children) # type: ignore
is_numbered = False
if (
- parent_element is not None
- and isinstance(parent_element, DocItem)
- and parent_element.label == GroupLabel.ORDERED_LIST
+ parent_item is not None
+ and isinstance(parent_item, DocItem)
+ and parent_item.label == GroupLabel.ORDERED_LIST
):
is_numbered = True
doc.add_list_item(
- enumerated=is_numbered, parent=parent_element, text=snippet_text
+ enumerated=is_numbered, parent=parent_item, text=snippet_text
)
+ visited.add(first_child)
elif isinstance(element, marko.inline.Image):
- self.close_table(doc)
- self.process_inline_text(parent_element, doc)
+ self._close_table(doc)
+ self._process_inline_text(parent_item, doc)
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
fig_caption: Optional[TextItem] = None
@@ -255,10 +262,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
label=DocItemLabel.CAPTION, text=element.title
)
- doc.add_picture(parent=parent_element, caption=fig_caption)
+ doc.add_picture(parent=parent_item, caption=fig_caption)
elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
- self.process_inline_text(parent_element, doc)
+ self._process_inline_text(parent_item, doc)
elif isinstance(element, marko.inline.RawText):
_log.debug(f" - Paragraph (raw text): {element.children}")
@@ -272,17 +279,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
else:
self.md_table_buffer.append(snippet_text)
else:
- self.close_table(doc)
- self.in_table = False
+ self._close_table(doc)
# most likely just inline text
self.inline_texts.append(str(element.children))
elif isinstance(element, marko.inline.CodeSpan):
- self.close_table(doc)
- self.process_inline_text(parent_element, doc)
+ self._close_table(doc)
+ self._process_inline_text(parent_item, doc)
_log.debug(f" - Code Span: {element.children}")
snippet_text = str(element.children).strip()
- doc.add_code(parent=parent_element, text=snippet_text)
+ doc.add_code(parent=parent_item, text=snippet_text)
elif (
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
@@ -290,10 +296,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
and isinstance((first_child := element.children[0]), marko.inline.RawText)
and len(snippet_text := (first_child.children.strip())) > 0
):
- self.close_table(doc)
- self.process_inline_text(parent_element, doc)
+ self._close_table(doc)
+ self._process_inline_text(parent_item, doc)
_log.debug(f" - Code Block: {element.children}")
- doc.add_code(parent=parent_element, text=snippet_text)
+ doc.add_code(parent=parent_item, text=snippet_text)
elif isinstance(element, marko.inline.LineBreak):
if self.in_table:
@@ -302,8 +308,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
elif isinstance(element, marko.block.HTMLBlock):
self._html_blocks += 1
- self.process_inline_text(parent_element, doc)
- self.close_table(doc)
+ self._process_inline_text(parent_item, doc)
+ self._close_table(doc)
_log.debug("HTML Block: {}".format(element))
if (
len(element.body) > 0
@@ -312,18 +318,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# wrap in markers to enable post-processing in convert()
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
- doc.add_code(parent=parent_element, text=text_to_add)
+ doc.add_code(parent=parent_item, text=text_to_add)
else:
if not isinstance(element, str):
- self.close_table(doc)
+ self._close_table(doc)
_log.debug("Some other element: {}".format(element))
processed_block_types = (
- marko.block.ListItem,
marko.block.Heading,
marko.block.CodeBlock,
marko.block.FencedCode,
- # marko.block.Paragraph,
marko.inline.RawText,
)
@@ -332,7 +336,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
element, processed_block_types
):
for child in element.children:
- self.iterate_elements(child, depth + 1, doc, parent_element)
+ self._iterate_elements(
+ element=child,
+ depth=depth + 1,
+ doc=doc,
+ visited=visited,
+ parent_item=parent_item,
+ )
def is_valid(self) -> bool:
return self.valid
@@ -366,9 +376,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
marko_parser = Markdown()
parsed_ast = marko_parser.parse(self.markdown)
# Start iterating from the root of the AST
- self.iterate_elements(parsed_ast, 0, doc, None)
- self.process_inline_text(None, doc) # handle last hanging inline text
- self.close_table(doc=doc) # handle any last hanging table
+ self._iterate_elements(
+ element=parsed_ast,
+ depth=0,
+ doc=doc,
+ parent_item=None,
+ visited=set(),
+ )
+ self._process_inline_text(None, doc) # handle last hanging inline text
+ self._close_table(doc=doc) # handle any last hanging table
# if HTML blocks were detected, export to HTML and delegate to HTML backend
if self._html_blocks > 0:
diff --git a/tests/data/groundtruth/docling_v2/nested.md.md b/tests/data/groundtruth/docling_v2/nested.md.md
new file mode 100644
index 00000000..6e430e0c
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/nested.md.md
@@ -0,0 +1,31 @@
+# Nesting
+
+A list featuring nesting:
+
+- abc
+ - abc123
+ - abc1234
+ - abc12345
+ - a.
+ - b.
+ - abcd1234:
+ - abcd12345:
+ - a.
+ - b.
+- def:
+ - def1234:
+ - def12345。
+- after one empty line
+ - foo
+- afer two empty lines
+ - bar
+
+- changing symbol
+
+A nested HTML list:
+
+- First item
+- Second item with subitems:
+ - Subitem 1
+ - Subitem 2
+- Last list item
diff --git a/tests/data/md/nested.md b/tests/data/md/nested.md
new file mode 100644
index 00000000..4e203eec
--- /dev/null
+++ b/tests/data/md/nested.md
@@ -0,0 +1,66 @@
+# Nesting
+
+A list featuring nesting:
+
+- abc
+ - abc123
+ - abc1234
+ - abc12345
+ - a.
+ - b.
+ - abcd1234:
+ - abcd12345:
+ - a.
+ - b.
+- def:
+ - def1234:
+ - def12345。
+
+- after one empty line
+ - foo
+
+
+- afer two empty lines
+ - bar
+* changing symbol
+
+A nested HTML list:
+
+
+ - First item
+ - Second item with subitems:
+
+ - Subitem 1
+ - Subitem 2
+
+
+ - Last list item
+
+
+
diff --git a/tests/test_backend_markdown.py b/tests/test_backend_markdown.py
index caa94d9f..5a201ab2 100644
--- a/tests/test_backend_markdown.py
+++ b/tests/test_backend_markdown.py
@@ -4,6 +4,8 @@ from docling.backend.md_backend import MarkdownDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
+from .test_data_gen_flag import GEN_TEST_DATA
+
def test_convert_valid():
fmt = InputFormat.MD
@@ -30,6 +32,10 @@ def test_convert_valid():
act_doc = backend.convert()
act_data = act_doc.export_to_markdown()
- with open(gt_path, "r", encoding="utf-8") as f:
- exp_data = f.read().rstrip()
- assert act_data == exp_data
+ if GEN_TEST_DATA:
+ with open(gt_path, mode="w", encoding="utf-8") as f:
+ f.write(f"{act_data}\n")
+ else:
+ with open(gt_path, encoding="utf-8") as f:
+ exp_data = f.read().rstrip()
+ assert exp_data == act_data
diff --git a/tests/test_data_gen_flag.py b/tests/test_data_gen_flag.py
new file mode 100644
index 00000000..a4baff66
--- /dev/null
+++ b/tests/test_data_gen_flag.py
@@ -0,0 +1,9 @@
+import os
+
+from pydantic import TypeAdapter
+
+GEN_TEST_DATA = TypeAdapter(bool).validate_python(os.getenv("DOCLING_GEN_TEST_DATA", 0))
+
+
+def test_gen_test_data_flag():
+ assert not GEN_TEST_DATA
From 02faf5376b22e174a6aa90dc7bd95feb14a94754 Mon Sep 17 00:00:00 2001
From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Date: Fri, 7 Feb 2025 13:58:05 +0100
Subject: [PATCH 2/6] refactor: use org--name in artifacts-path (#912)
use org--name in artifacts-path
Signed-off-by: Michele Dolfi
---
docling/models/code_formula_model.py | 2 +-
docling/models/document_picture_classifier.py | 2 +-
docling/models/layout_model.py | 2 +-
docling/models/table_structure_model.py | 2 +-
4 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/docling/models/code_formula_model.py b/docling/models/code_formula_model.py
index 8bb29af2..1a0f0bf0 100644
--- a/docling/models/code_formula_model.py
+++ b/docling/models/code_formula_model.py
@@ -62,7 +62,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
Processes the given batch of elements and enriches them with predictions.
"""
- _model_repo_folder = "CodeFormula"
+ _model_repo_folder = "ds4sd--CodeFormula"
elements_batch_size = 5
images_scale = 1.66 # = 120 dpi, aligned with training data resolution
expansion_factor = 0.03
diff --git a/docling/models/document_picture_classifier.py b/docling/models/document_picture_classifier.py
index 302d18cb..6e71246b 100644
--- a/docling/models/document_picture_classifier.py
+++ b/docling/models/document_picture_classifier.py
@@ -56,7 +56,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
Processes a batch of elements and adds classification annotations.
"""
- _model_repo_folder = "DocumentFigureClassifier"
+ _model_repo_folder = "ds4sd--DocumentFigureClassifier"
images_scale = 2
def __init__(
diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py
index c88f91cb..b3cbd954 100644
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@@ -22,7 +22,7 @@ _log = logging.getLogger(__name__)
class LayoutModel(BasePageModel):
- _model_repo_folder = "docling-models"
+ _model_repo_folder = "ds4sd--docling-models"
_model_path = "model_artifacts/layout"
TEXT_ELEM_LABELS = [
diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py
index b5ab5a2a..64979157 100644
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@@ -23,7 +23,7 @@ from docling.utils.profiling import TimeRecorder
class TableStructureModel(BasePageModel):
- _model_repo_folder = "docling-models"
+ _model_repo_folder = "ds4sd--docling-models"
_model_path = "model_artifacts/tableformer"
def __init__(
From fba3cf9be75e239896c353a1c0f3bdd0fa4a92fa Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Fri, 7 Feb 2025 13:36:54 +0000
Subject: [PATCH 3/6] chore: bump version to 2.19.0 [skip ci]
---
CHANGELOG.md | 17 +++++++++++++++++
pyproject.toml | 2 +-
2 files changed, 18 insertions(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4ad3b47d..8dc85cf5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,20 @@
+## [v2.19.0](https://github.com/DS4SD/docling/releases/tag/v2.19.0) - 2025-02-07
+
+### Feature
+
+* New artifacts path and CLI utility ([#876](https://github.com/DS4SD/docling/issues/876)) ([`ed74fe2`](https://github.com/DS4SD/docling/commit/ed74fe2ec0a702834f0deacfdb5717c8c587dab1))
+
+### Fix
+
+* **markdown:** Handle nested lists ([#910](https://github.com/DS4SD/docling/issues/910)) ([`90b766e`](https://github.com/DS4SD/docling/commit/90b766e2ae1695a759191df37c272efc09be5ee3))
+* Test cases for RTL programmatic PDFs and fixes for the formula model ([#903](https://github.com/DS4SD/docling/issues/903)) ([`9114ada`](https://github.com/DS4SD/docling/commit/9114ada7bc4dd45ce0046de2f9d00a80ccb25c79))
+* **msword_backend:** Handle conversion error in label parsing ([#896](https://github.com/DS4SD/docling/issues/896)) ([`722a6eb`](https://github.com/DS4SD/docling/commit/722a6eb7b994a0261312a356df80b2fced121812))
+* Enrichment models batch size and expose picture classifier ([#878](https://github.com/DS4SD/docling/issues/878)) ([`5ad6de0`](https://github.com/DS4SD/docling/commit/5ad6de05600315617b574bd12af553e00b4d316e))
+
+### Documentation
+
+* Introduce example with custom models for RapidOCR ([#874](https://github.com/DS4SD/docling/issues/874)) ([`6d3fea0`](https://github.com/DS4SD/docling/commit/6d3fea019635bd6ca94bd36c3928b28c245d638d))
+
## [v2.18.0](https://github.com/DS4SD/docling/releases/tag/v2.18.0) - 2025-02-03
### Feature
diff --git a/pyproject.toml b/pyproject.toml
index e1c30a3c..3bc88b05 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "docling"
-version = "2.18.0" # DO NOT EDIT, updated automatically
+version = "2.19.0" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Panos Vagenas ", "Peter Staar "]
license = "MIT"
From 4cc6e3ea5e858b367136acc729b723ea0552d22a Mon Sep 17 00:00:00 2001
From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Date: Fri, 7 Feb 2025 16:30:42 +0100
Subject: [PATCH 4/6] feat: Describe pictures using vision models (#259)
* draft for picture description models
Signed-off-by: Michele Dolfi
* vlm description using AutoModelForVision2Seq
Signed-off-by: Michele Dolfi
* add generation options
Signed-off-by: Michele Dolfi
* update vlm API
Signed-off-by: Michele Dolfi
* allow only localhost traffic
Signed-off-by: Michele Dolfi
* rename model
Signed-off-by: Michele Dolfi
* do not run with vlm api
Signed-off-by: Michele Dolfi
* more renaming
Signed-off-by: Michele Dolfi
* fix examples path
Signed-off-by: Michele Dolfi
* apply CLI download logic
Signed-off-by: Michele Dolfi
* fix name of cli argument
Signed-off-by: Michele Dolfi
* use with_smolvlm in models download
Signed-off-by: Michele Dolfi
---------
Signed-off-by: Michele Dolfi
---
.github/workflows/checks.yml | 2 +-
docling/cli/main.py | 5 +
docling/cli/models.py | 2 +
docling/datamodel/pipeline_options.py | 54 ++++++++-
docling/models/base_model.py | 4 +-
.../models/picture_description_api_model.py | 105 +++++++++++++++++
.../models/picture_description_base_model.py | 64 ++++++++++
.../models/picture_description_vlm_model.py | 109 ++++++++++++++++++
docling/pipeline/standard_pdf_pipeline.py | 42 ++++++-
docling/utils/model_downloader.py | 12 ++
docs/examples/pictures_description.py | 48 ++++++++
docs/examples/pictures_description_api.py | 55 +++++++++
poetry.lock | 9 +-
pyproject.toml | 8 +-
14 files changed, 508 insertions(+), 11 deletions(-)
create mode 100644 docling/models/picture_description_api_model.py
create mode 100644 docling/models/picture_description_base_model.py
create mode 100644 docling/models/picture_description_vlm_model.py
create mode 100644 docs/examples/pictures_description.py
create mode 100644 docs/examples/pictures_description_api.py
diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml
index 75ea5970..89bcfd79 100644
--- a/.github/workflows/checks.yml
+++ b/.github/workflows/checks.yml
@@ -28,7 +28,7 @@ jobs:
run: |
for file in docs/examples/*.py; do
# Skip batch_convert.py
- if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert).py ]]; then
+ if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api).py ]]; then
echo "Skipping $file"
continue
fi
diff --git a/docling/cli/main.py b/docling/cli/main.py
index 19f77e4e..e2bc0dd6 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -226,6 +226,10 @@ def convert(
help="Enable the picture classification enrichment model in the pipeline.",
),
] = False,
+ enrich_picture_description: Annotated[
+ bool,
+ typer.Option(..., help="Enable the picture description model in the pipeline."),
+ ] = False,
artifacts_path: Annotated[
Optional[Path],
typer.Option(..., help="If provided, the location of the model artifacts."),
@@ -382,6 +386,7 @@ def convert(
do_table_structure=True,
do_code_enrichment=enrich_code,
do_formula_enrichment=enrich_formula,
+ do_picture_description=enrich_picture_description,
do_picture_classification=enrich_picture_classes,
document_timeout=document_timeout,
)
diff --git a/docling/cli/models.py b/docling/cli/models.py
index aea498c5..3b62ad6b 100644
--- a/docling/cli/models.py
+++ b/docling/cli/models.py
@@ -31,6 +31,7 @@ class _AvailableModels(str, Enum):
TABLEFORMER = "tableformer"
CODE_FORMULA = "code_formula"
PICTURE_CLASSIFIER = "picture_classifier"
+ SMOLVLM = "smolvlm"
EASYOCR = "easyocr"
@@ -81,6 +82,7 @@ def download(
with_tableformer=_AvailableModels.TABLEFORMER in to_download,
with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
+ with_smolvlm=_AvailableModels.SMOLVLM in to_download,
with_easyocr=_AvailableModels.EASYOCR in to_download,
)
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 14ca75bf..3b6401b6 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -2,9 +2,9 @@ import logging
import os
from enum import Enum
from pathlib import Path
-from typing import Any, List, Literal, Optional, Union
+from typing import Annotated, Any, Dict, List, Literal, Optional, Union
-from pydantic import BaseModel, ConfigDict, Field, model_validator
+from pydantic import AnyUrl, BaseModel, ConfigDict, Field, model_validator
from pydantic_settings import BaseSettings, SettingsConfigDict
_log = logging.getLogger(__name__)
@@ -184,6 +184,51 @@ class OcrMacOptions(OcrOptions):
)
+class PictureDescriptionBaseOptions(BaseModel):
+ kind: str
+ batch_size: int = 8
+ scale: float = 2
+
+ bitmap_area_threshold: float = (
+ 0.2 # percentage of the area for a bitmap to processed with the models
+ )
+
+
+class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
+ kind: Literal["api"] = "api"
+
+ url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
+ headers: Dict[str, str] = {}
+ params: Dict[str, Any] = {}
+ timeout: float = 20
+
+ prompt: str = "Describe this image in a few sentences."
+ provenance: str = ""
+
+
+class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
+ kind: Literal["vlm"] = "vlm"
+
+ repo_id: str
+ prompt: str = "Describe this image in a few sentences."
+ # Config from here https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig
+ generation_config: Dict[str, Any] = dict(max_new_tokens=200, do_sample=False)
+
+ @property
+ def repo_cache_folder(self) -> str:
+ return self.repo_id.replace("/", "--")
+
+
+smolvlm_picture_description = PictureDescriptionVlmOptions(
+ repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
+)
+# phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")
+granite_picture_description = PictureDescriptionVlmOptions(
+ repo_id="ibm-granite/granite-vision-3.1-2b-preview",
+ prompt="What is shown in this image?",
+)
+
+
# Define an enum for the backend options
class PdfBackend(str, Enum):
"""Enum of valid PDF backends."""
@@ -223,6 +268,7 @@ class PdfPipelineOptions(PipelineOptions):
do_code_enrichment: bool = False # True: perform code OCR
do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
do_picture_classification: bool = False # True: classify pictures in documents
+ do_picture_description: bool = False # True: run describe pictures in documents
table_structure_options: TableStructureOptions = TableStructureOptions()
ocr_options: Union[
@@ -232,6 +278,10 @@ class PdfPipelineOptions(PipelineOptions):
OcrMacOptions,
RapidOcrOptions,
] = Field(EasyOcrOptions(), discriminator="kind")
+ picture_description_options: Annotated[
+ Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions],
+ Field(discriminator="kind"),
+ ] = smolvlm_picture_description
images_scale: float = 1.0
generate_page_images: bool = False
diff --git a/docling/models/base_model.py b/docling/models/base_model.py
index a2bc776e..9cdc0ecb 100644
--- a/docling/models/base_model.py
+++ b/docling/models/base_model.py
@@ -1,7 +1,7 @@
from abc import ABC, abstractmethod
from typing import Any, Generic, Iterable, Optional
-from docling_core.types.doc import BoundingBox, DoclingDocument, NodeItem, TextItem
+from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
from typing_extensions import TypeVar
from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
@@ -64,7 +64,7 @@ class BaseItemAndImageEnrichmentModel(
if not self.is_processable(doc=conv_res.document, element=element):
return None
- assert isinstance(element, TextItem)
+ assert isinstance(element, DocItem)
element_prov = element.prov[0]
bbox = element_prov.bbox
diff --git a/docling/models/picture_description_api_model.py b/docling/models/picture_description_api_model.py
new file mode 100644
index 00000000..6c7e02fc
--- /dev/null
+++ b/docling/models/picture_description_api_model.py
@@ -0,0 +1,105 @@
+import base64
+import io
+import logging
+from typing import Iterable, List, Optional
+
+import httpx
+from docling_core.types.doc import PictureItem
+from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc
+ PictureDescriptionData,
+)
+from PIL import Image
+from pydantic import BaseModel, ConfigDict
+
+from docling.datamodel.pipeline_options import PictureDescriptionApiOptions
+from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+
+_log = logging.getLogger(__name__)
+
+
+class ChatMessage(BaseModel):
+ role: str
+ content: str
+
+
+class ResponseChoice(BaseModel):
+ index: int
+ message: ChatMessage
+ finish_reason: str
+
+
+class ResponseUsage(BaseModel):
+ prompt_tokens: int
+ completion_tokens: int
+ total_tokens: int
+
+
+class ApiResponse(BaseModel):
+ model_config = ConfigDict(
+ protected_namespaces=(),
+ )
+
+ id: str
+ model: Optional[str] = None # returned by openai
+ choices: List[ResponseChoice]
+ created: int
+ usage: ResponseUsage
+
+
+class PictureDescriptionApiModel(PictureDescriptionBaseModel):
+ # elements_batch_size = 4
+
+ def __init__(self, enabled: bool, options: PictureDescriptionApiOptions):
+ super().__init__(enabled=enabled, options=options)
+ self.options: PictureDescriptionApiOptions
+
+ if self.enabled:
+ if options.url.host != "localhost":
+ raise NotImplementedError(
+ "The options try to connect to remote APIs which are not yet allowed."
+ )
+
+ def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
+ # Note: technically we could make a batch request here,
+ # but not all APIs will allow for it. For example, vllm won't allow more than 1.
+ for image in images:
+ img_io = io.BytesIO()
+ image.save(img_io, "PNG")
+ image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
+
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "text",
+ "text": self.options.prompt,
+ },
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/png;base64,{image_base64}"
+ },
+ },
+ ],
+ }
+ ]
+
+ payload = {
+ "messages": messages,
+ **self.options.params,
+ }
+
+ r = httpx.post(
+ str(self.options.url),
+ headers=self.options.headers,
+ json=payload,
+ timeout=self.options.timeout,
+ )
+ if not r.is_success:
+ _log.error(f"Error calling the API. Reponse was {r.text}")
+ r.raise_for_status()
+
+ api_resp = ApiResponse.model_validate_json(r.text)
+ generated_text = api_resp.choices[0].message.content.strip()
+ yield generated_text
diff --git a/docling/models/picture_description_base_model.py b/docling/models/picture_description_base_model.py
new file mode 100644
index 00000000..b653e0e3
--- /dev/null
+++ b/docling/models/picture_description_base_model.py
@@ -0,0 +1,64 @@
+import logging
+from pathlib import Path
+from typing import Any, Iterable, List, Optional, Union
+
+from docling_core.types.doc import (
+ DoclingDocument,
+ NodeItem,
+ PictureClassificationClass,
+ PictureItem,
+)
+from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc
+ PictureDescriptionData,
+)
+from PIL import Image
+
+from docling.datamodel.pipeline_options import PictureDescriptionBaseOptions
+from docling.models.base_model import (
+ BaseItemAndImageEnrichmentModel,
+ ItemAndImageEnrichmentElement,
+)
+
+
+class PictureDescriptionBaseModel(BaseItemAndImageEnrichmentModel):
+ images_scale: float = 2.0
+
+ def __init__(
+ self,
+ enabled: bool,
+ options: PictureDescriptionBaseOptions,
+ ):
+ self.enabled = enabled
+ self.options = options
+ self.provenance = "not-implemented"
+
+ def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
+ return self.enabled and isinstance(element, PictureItem)
+
+ def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
+ raise NotImplementedError
+
+ def __call__(
+ self,
+ doc: DoclingDocument,
+ element_batch: Iterable[ItemAndImageEnrichmentElement],
+ ) -> Iterable[NodeItem]:
+ if not self.enabled:
+ for element in element_batch:
+ yield element.item
+ return
+
+ images: List[Image.Image] = []
+ elements: List[PictureItem] = []
+ for el in element_batch:
+ assert isinstance(el.item, PictureItem)
+ elements.append(el.item)
+ images.append(el.image)
+
+ outputs = self._annotate_images(images)
+
+ for item, output in zip(elements, outputs):
+ item.annotations.append(
+ PictureDescriptionData(text=output, provenance=self.provenance)
+ )
+ yield item
diff --git a/docling/models/picture_description_vlm_model.py b/docling/models/picture_description_vlm_model.py
new file mode 100644
index 00000000..9fa4826d
--- /dev/null
+++ b/docling/models/picture_description_vlm_model.py
@@ -0,0 +1,109 @@
+from pathlib import Path
+from typing import Iterable, Optional, Union
+
+from PIL import Image
+
+from docling.datamodel.pipeline_options import (
+ AcceleratorOptions,
+ PictureDescriptionVlmOptions,
+)
+from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+from docling.utils.accelerator_utils import decide_device
+
+
+class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
+
+ def __init__(
+ self,
+ enabled: bool,
+ artifacts_path: Optional[Union[Path, str]],
+ options: PictureDescriptionVlmOptions,
+ accelerator_options: AcceleratorOptions,
+ ):
+ super().__init__(enabled=enabled, options=options)
+ self.options: PictureDescriptionVlmOptions
+
+ if self.enabled:
+
+ if artifacts_path is None:
+ artifacts_path = self.download_models(repo_id=self.options.repo_id)
+ else:
+ artifacts_path = Path(artifacts_path) / self.options.repo_cache_folder
+
+ self.device = decide_device(accelerator_options.device)
+
+ try:
+ import torch
+ from transformers import AutoModelForVision2Seq, AutoProcessor
+ except ImportError:
+ raise ImportError(
+ "transformers >=4.46 is not installed. Please install Docling with the required extras `pip install docling[vlm]`."
+ )
+
+ # Initialize processor and model
+ self.processor = AutoProcessor.from_pretrained(self.options.repo_id)
+ self.model = AutoModelForVision2Seq.from_pretrained(
+ self.options.repo_id,
+ torch_dtype=torch.bfloat16,
+ _attn_implementation=(
+ "flash_attention_2" if self.device.startswith("cuda") else "eager"
+ ),
+ ).to(self.device)
+
+ self.provenance = f"{self.options.repo_id}"
+
+ @staticmethod
+ def download_models(
+ repo_id: str,
+ local_dir: Optional[Path] = None,
+ force: bool = False,
+ progress: bool = False,
+ ) -> Path:
+ from huggingface_hub import snapshot_download
+ from huggingface_hub.utils import disable_progress_bars
+
+ if not progress:
+ disable_progress_bars()
+ download_path = snapshot_download(
+ repo_id=repo_id,
+ force_download=force,
+ local_dir=local_dir,
+ )
+
+ return Path(download_path)
+
+ def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
+ from transformers import GenerationConfig
+
+ # Create input messages
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {"type": "image"},
+ {"type": "text", "text": self.options.prompt},
+ ],
+ },
+ ]
+
+ # TODO: do batch generation
+
+ for image in images:
+ # Prepare inputs
+ prompt = self.processor.apply_chat_template(
+ messages, add_generation_prompt=True
+ )
+ inputs = self.processor(text=prompt, images=[image], return_tensors="pt")
+ inputs = inputs.to(self.device)
+
+ # Generate outputs
+ generated_ids = self.model.generate(
+ **inputs,
+ generation_config=GenerationConfig(**self.options.generation_config),
+ )
+ generated_texts = self.processor.batch_decode(
+ generated_ids[:, inputs["input_ids"].shape[1] :],
+ skip_special_tokens=True,
+ )
+
+ yield generated_texts[0].strip()
diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py
index 4e66415f..13e435f9 100644
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@@ -14,6 +14,8 @@ from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrMacOptions,
PdfPipelineOptions,
+ PictureDescriptionApiOptions,
+ PictureDescriptionVlmOptions,
RapidOcrOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
@@ -34,6 +36,9 @@ from docling.models.page_preprocessing_model import (
PagePreprocessingModel,
PagePreprocessingOptions,
)
+from docling.models.picture_description_api_model import PictureDescriptionApiModel
+from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
from docling.models.rapid_ocr_model import RapidOcrModel
from docling.models.table_structure_model import TableStructureModel
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
@@ -95,8 +100,17 @@ class StandardPdfPipeline(PaginatedPipeline):
PageAssembleModel(options=PageAssembleOptions()),
]
+ # Picture description model
+ if (
+ picture_description_model := self.get_picture_description_model(
+ artifacts_path=artifacts_path
+ )
+ ) is None:
+ raise RuntimeError(
+ f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
+ )
+
self.enrichment_pipe = [
- # Other models working on `NodeItem` elements in the DoclingDocument
# Code Formula Enrichment Model
CodeFormulaModel(
enabled=pipeline_options.do_code_enrichment
@@ -115,11 +129,14 @@ class StandardPdfPipeline(PaginatedPipeline):
options=DocumentPictureClassifierOptions(),
accelerator_options=pipeline_options.accelerator_options,
),
+ # Document Picture description
+ picture_description_model,
]
if (
self.pipeline_options.do_formula_enrichment
or self.pipeline_options.do_code_enrichment
+ or self.pipeline_options.do_picture_description
):
self.keep_backend = True
@@ -175,6 +192,29 @@ class StandardPdfPipeline(PaginatedPipeline):
)
return None
+ def get_picture_description_model(
+ self, artifacts_path: Optional[Path] = None
+ ) -> Optional[PictureDescriptionBaseModel]:
+ if isinstance(
+ self.pipeline_options.picture_description_options,
+ PictureDescriptionApiOptions,
+ ):
+ return PictureDescriptionApiModel(
+ enabled=self.pipeline_options.do_picture_description,
+ options=self.pipeline_options.picture_description_options,
+ )
+ elif isinstance(
+ self.pipeline_options.picture_description_options,
+ PictureDescriptionVlmOptions,
+ ):
+ return PictureDescriptionVlmModel(
+ enabled=self.pipeline_options.do_picture_description,
+ artifacts_path=artifacts_path,
+ options=self.pipeline_options.picture_description_options,
+ accelerator_options=self.pipeline_options.accelerator_options,
+ )
+ return None
+
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
with TimeRecorder(conv_res, "page_init"):
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
diff --git a/docling/utils/model_downloader.py b/docling/utils/model_downloader.py
index 504618ec..7d22b77b 100644
--- a/docling/utils/model_downloader.py
+++ b/docling/utils/model_downloader.py
@@ -2,11 +2,13 @@ import logging
from pathlib import Path
from typing import Optional
+from docling.datamodel.pipeline_options import smolvlm_picture_description
from docling.datamodel.settings import settings
from docling.models.code_formula_model import CodeFormulaModel
from docling.models.document_picture_classifier import DocumentPictureClassifier
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
+from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
from docling.models.table_structure_model import TableStructureModel
_log = logging.getLogger(__name__)
@@ -21,6 +23,7 @@ def download_models(
with_tableformer: bool = True,
with_code_formula: bool = True,
with_picture_classifier: bool = True,
+ with_smolvlm: bool = True,
with_easyocr: bool = True,
):
if output_dir is None:
@@ -61,6 +64,15 @@ def download_models(
progress=progress,
)
+ if with_smolvlm:
+ _log.info(f"Downloading SmolVlm model...")
+ PictureDescriptionVlmModel.download_models(
+ repo_id=smolvlm_picture_description.repo_id,
+ local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
+ force=force,
+ progress=progress,
+ )
+
if with_easyocr:
_log.info(f"Downloading easyocr models...")
EasyOcrModel.download_models(
diff --git a/docs/examples/pictures_description.py b/docs/examples/pictures_description.py
new file mode 100644
index 00000000..f60ac29d
--- /dev/null
+++ b/docs/examples/pictures_description.py
@@ -0,0 +1,48 @@
+import logging
+from pathlib import Path
+
+from docling_core.types.doc import PictureItem
+
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import (
+ PdfPipelineOptions,
+ granite_picture_description,
+ smolvlm_picture_description,
+)
+from docling.document_converter import DocumentConverter, PdfFormatOption
+
+
+def main():
+ logging.basicConfig(level=logging.INFO)
+
+ input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+
+ pipeline_options = PdfPipelineOptions()
+ pipeline_options.do_picture_description = True
+ pipeline_options.picture_description_options = smolvlm_picture_description
+ # pipeline_options.picture_description_options = granite_picture_description
+
+    pipeline_options.picture_description_options.prompt = (
+        "Describe the image in three sentences. Be concise and accurate."
+    )
+
+ doc_converter = DocumentConverter(
+ format_options={
+ InputFormat.PDF: PdfFormatOption(
+ pipeline_options=pipeline_options,
+ )
+ }
+ )
+ result = doc_converter.convert(input_doc_path)
+
+ for element, _level in result.document.iterate_items():
+ if isinstance(element, PictureItem):
+ print(
+ f"Picture {element.self_ref}\n"
+ f"Caption: {element.caption_text(doc=result.document)}\n"
+ f"Annotations: {element.annotations}"
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/docs/examples/pictures_description_api.py b/docs/examples/pictures_description_api.py
new file mode 100644
index 00000000..3da37edf
--- /dev/null
+++ b/docs/examples/pictures_description_api.py
@@ -0,0 +1,55 @@
+import logging
+from pathlib import Path
+
+from docling_core.types.doc import PictureItem
+
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import (
+ PdfPipelineOptions,
+ PictureDescriptionApiOptions,
+)
+from docling.document_converter import DocumentConverter, PdfFormatOption
+
+
+def main():
+ logging.basicConfig(level=logging.INFO)
+
+ input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+
+ # This is using a local API server to do picture description.
+ # For example, you can launch it locally with:
+ # $ vllm serve "HuggingFaceTB/SmolVLM-256M-Instruct"
+
+ pipeline_options = PdfPipelineOptions()
+ pipeline_options.do_picture_description = True
+ pipeline_options.picture_description_options = PictureDescriptionApiOptions(
+ url="http://localhost:8000/v1/chat/completions",
+ params=dict(
+ model="HuggingFaceTB/SmolVLM-256M-Instruct",
+ seed=42,
+ max_completion_tokens=200,
+ ),
+        prompt="Describe the image in three sentences. Be concise and accurate.",
+ timeout=90,
+ )
+
+ doc_converter = DocumentConverter(
+ format_options={
+ InputFormat.PDF: PdfFormatOption(
+ pipeline_options=pipeline_options,
+ )
+ }
+ )
+ result = doc_converter.convert(input_doc_path)
+
+ for element, _level in result.document.iterate_items():
+ if isinstance(element, PictureItem):
+ print(
+ f"Picture {element.self_ref}\n"
+ f"Caption: {element.caption_text(doc=result.document)}\n"
+ f"Annotations: {element.annotations}"
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/poetry.lock b/poetry.lock
index b261db4b..691dd844 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2727,13 +2727,13 @@ pygments = ">2.12.0"
[[package]]
name = "mkdocs-material"
-version = "9.6.2"
+version = "9.6.3"
description = "Documentation that simply works"
optional = false
python-versions = ">=3.8"
files = [
- {file = "mkdocs_material-9.6.2-py3-none-any.whl", hash = "sha256:71d90dbd63b393ad11a4d90151dfe3dcbfcd802c0f29ce80bebd9bbac6abc753"},
- {file = "mkdocs_material-9.6.2.tar.gz", hash = "sha256:a3de1c5d4c745f10afa78b1a02f917b9dce0808fb206adc0f5bb48b58c1ca21f"},
+ {file = "mkdocs_material-9.6.3-py3-none-any.whl", hash = "sha256:1125622067e26940806701219303b27c0933e04533560725d97ec26fd16a39cf"},
+ {file = "mkdocs_material-9.6.3.tar.gz", hash = "sha256:c87f7d1c39ce6326da5e10e232aed51bae46252e646755900f4b0fc9192fa832"},
]
[package.dependencies]
@@ -7846,8 +7846,9 @@ type = ["pytest-mypy"]
ocrmac = ["ocrmac"]
rapidocr = ["onnxruntime", "onnxruntime", "rapidocr-onnxruntime"]
tesserocr = ["tesserocr"]
+vlm = ["transformers", "transformers"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
-content-hash = "ca0464df452664834ae9bccc59f89240e2f5e8f3b179761de615548c799680e7"
+content-hash = "86d266adc6272f3db65ab07f5cce35cbe9626368dc0e09ab374c861f0809f693"
diff --git a/pyproject.toml b/pyproject.toml
index 3bc88b05..9b1b5e9b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -59,6 +59,10 @@ onnxruntime = [
{ version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
]
+transformers = [
+ {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true },
+ {markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true }
+]
pillow = "^10.0.0"
tqdm = "^4.65.0"
@@ -121,6 +125,7 @@ torchvision = [
[tool.poetry.extras]
tesserocr = ["tesserocr"]
ocrmac = ["ocrmac"]
+vlm = ["transformers"]
rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
[tool.poetry.scripts]
@@ -162,7 +167,8 @@ module = [
"deepsearch_glm.*",
"lxml.*",
"bs4.*",
- "huggingface_hub.*"
+ "huggingface_hub.*",
+ "transformers.*",
]
ignore_missing_imports = true
From c18f47c5c032c49bf3175aecd2236df37c0e9ae1 Mon Sep 17 00:00:00 2001
From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Date: Fri, 7 Feb 2025 17:51:31 +0100
Subject: [PATCH 5/6] fix: remove unused httpx (#919)
* remove unused httpx
Signed-off-by: Michele Dolfi
* use requests instead of httpx
Signed-off-by: Michele Dolfi
* remove more usage of httpx
Signed-off-by: Michele Dolfi
---------
Signed-off-by: Michele Dolfi
---
docling/models/easyocr_model.py | 2 --
docling/models/picture_description_api_model.py | 10 +++-------
2 files changed, 3 insertions(+), 9 deletions(-)
diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py
index 9b1b2a02..0eccb988 100644
--- a/docling/models/easyocr_model.py
+++ b/docling/models/easyocr_model.py
@@ -4,9 +4,7 @@ import zipfile
from pathlib import Path
from typing import Iterable, List, Optional
-import httpx
import numpy
-import torch
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import Cell, OcrCell, Page
diff --git a/docling/models/picture_description_api_model.py b/docling/models/picture_description_api_model.py
index 6c7e02fc..86b76944 100644
--- a/docling/models/picture_description_api_model.py
+++ b/docling/models/picture_description_api_model.py
@@ -3,11 +3,7 @@ import io
import logging
from typing import Iterable, List, Optional
-import httpx
-from docling_core.types.doc import PictureItem
-from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc
- PictureDescriptionData,
-)
+import requests
from PIL import Image
from pydantic import BaseModel, ConfigDict
@@ -90,13 +86,13 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
**self.options.params,
}
- r = httpx.post(
+ r = requests.post(
str(self.options.url),
headers=self.options.headers,
json=payload,
timeout=self.options.timeout,
)
- if not r.is_success:
+ if not r.ok:
_log.error(f"Error calling the API. Reponse was {r.text}")
r.raise_for_status()
From 3e26597995f236fe81ccd7f1a247d05b8a8420cb Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Fri, 7 Feb 2025 17:46:36 +0000
Subject: [PATCH 6/6] chore: bump version to 2.20.0 [skip ci]
---
CHANGELOG.md | 10 ++++++++++
pyproject.toml | 2 +-
2 files changed, 11 insertions(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8dc85cf5..030c6954 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,13 @@
+## [v2.20.0](https://github.com/DS4SD/docling/releases/tag/v2.20.0) - 2025-02-07
+
+### Feature
+
+* Describe pictures using vision models ([#259](https://github.com/DS4SD/docling/issues/259)) ([`4cc6e3e`](https://github.com/DS4SD/docling/commit/4cc6e3ea5e858b367136acc729b723ea0552d22a))
+
+### Fix
+
+* Remove unused httpx ([#919](https://github.com/DS4SD/docling/issues/919)) ([`c18f47c`](https://github.com/DS4SD/docling/commit/c18f47c5c032c49bf3175aecd2236df37c0e9ae1))
+
## [v2.19.0](https://github.com/DS4SD/docling/releases/tag/v2.19.0) - 2025-02-07
### Feature
diff --git a/pyproject.toml b/pyproject.toml
index 9b1b5e9b..e4425ffe 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "docling"
-version = "2.19.0" # DO NOT EDIT, updated automatically
+version = "2.20.0" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Panos Vagenas ", "Peter Staar "]
license = "MIT"