From 90b766e2ae1695a759191df37c272efc09be5ee3 Mon Sep 17 00:00:00 2001
From: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Date: Fri, 7 Feb 2025 12:55:12 +0100
Subject: [PATCH 1/6] fix(markdown): handle nested lists (#910)
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
---
docling/backend/md_backend.py | 108 ++++++++++--------
.../data/groundtruth/docling_v2/nested.md.md | 31 +++++
tests/data/md/nested.md | 66 +++++++++++
tests/test_backend_markdown.py | 12 +-
tests/test_data_gen_flag.py | 9 ++
5 files changed, 177 insertions(+), 49 deletions(-)
create mode 100644 tests/data/groundtruth/docling_v2/nested.md.md
create mode 100644 tests/data/md/nested.md
create mode 100644 tests/test_data_gen_flag.py
diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py
index eaf47537..19a21c19 100644
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@@ -36,7 +36,7 @@ _STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
- def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
+ def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
# This regex will match any sequence of underscores
pattern = r"_+"
@@ -81,7 +81,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# very long sequences of underscores will lead to unnecessary long processing times.
# In any proper Markdown files, underscores have to be escaped,
# otherwise they represent emphasis (bold or italic)
- self.markdown = self.shorten_underscore_sequences(text_stream)
+ self.markdown = self._shorten_underscore_sequences(text_stream)
if isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, "r", encoding="utf-8") as f:
md_content = f.read()
@@ -89,7 +89,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# very long sequences of underscores will lead to unnecessary long processing times.
# In any proper Markdown files, underscores have to be escaped,
# otherwise they represent emphasis (bold or italic)
- self.markdown = self.shorten_underscore_sequences(md_content)
+ self.markdown = self._shorten_underscore_sequences(md_content)
self.valid = True
_log.debug(self.markdown)
@@ -99,7 +99,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
) from e
return
- def close_table(self, doc: DoclingDocument):
+ def _close_table(self, doc: DoclingDocument):
if self.in_table:
_log.debug("=== TABLE START ===")
for md_table_row in self.md_table_buffer:
@@ -156,30 +156,35 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
doc.add_table(data=table_data)
return
- def process_inline_text(
- self, parent_element: Optional[NodeItem], doc: DoclingDocument
+ def _process_inline_text(
+ self, parent_item: Optional[NodeItem], doc: DoclingDocument
):
txt = " ".join(self.inline_texts)
if len(txt) > 0:
doc.add_text(
label=DocItemLabel.PARAGRAPH,
- parent=parent_element,
+ parent=parent_item,
text=txt,
)
self.inline_texts = []
- def iterate_elements(
+ def _iterate_elements(
self,
element: marko.element.Element,
depth: int,
doc: DoclingDocument,
- parent_element: Optional[NodeItem] = None,
+ visited: Set[marko.element.Element],
+ parent_item: Optional[NodeItem] = None,
):
+
+ if element in visited:
+ return
+
# Iterates over all elements in the AST
# Check for different element types and process relevant details
if isinstance(element, marko.block.Heading) and len(element.children) > 0:
- self.close_table(doc)
- self.process_inline_text(parent_element, doc)
+ self._close_table(doc)
+ self._process_inline_text(parent_item, doc)
_log.debug(
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
)
@@ -207,8 +212,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
traverse(element)
snippet_text = "".join(strings)
if len(snippet_text) > 0:
- parent_element = doc.add_text(
- label=doc_label, parent=parent_element, text=snippet_text
+ parent_item = doc.add_text(
+ label=doc_label, parent=parent_item, text=snippet_text
)
elif isinstance(element, marko.block.List):
@@ -218,35 +223,37 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
has_non_empty_list_items = True
break
- self.close_table(doc)
- self.process_inline_text(parent_element, doc)
+ self._close_table(doc)
+ self._process_inline_text(parent_item, doc)
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
if has_non_empty_list_items:
label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
- parent_element = doc.add_group(
- label=label, name=f"list", parent=parent_element
+ parent_item = doc.add_group(
+ label=label, name=f"list", parent=parent_item
)
elif isinstance(element, marko.block.ListItem) and len(element.children) > 0:
- self.close_table(doc)
- self.process_inline_text(parent_element, doc)
+ self._close_table(doc)
+ self._process_inline_text(parent_item, doc)
_log.debug(" - List item")
- snippet_text = str(element.children[0].children[0].children) # type: ignore
+ first_child = element.children[0]
+ snippet_text = str(first_child.children[0].children) # type: ignore
is_numbered = False
if (
- parent_element is not None
- and isinstance(parent_element, DocItem)
- and parent_element.label == GroupLabel.ORDERED_LIST
+ parent_item is not None
+ and isinstance(parent_item, DocItem)
+ and parent_item.label == GroupLabel.ORDERED_LIST
):
is_numbered = True
doc.add_list_item(
- enumerated=is_numbered, parent=parent_element, text=snippet_text
+ enumerated=is_numbered, parent=parent_item, text=snippet_text
)
+ visited.add(first_child)
elif isinstance(element, marko.inline.Image):
- self.close_table(doc)
- self.process_inline_text(parent_element, doc)
+ self._close_table(doc)
+ self._process_inline_text(parent_item, doc)
_log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
fig_caption: Optional[TextItem] = None
@@ -255,10 +262,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
label=DocItemLabel.CAPTION, text=element.title
)
- doc.add_picture(parent=parent_element, caption=fig_caption)
+ doc.add_picture(parent=parent_item, caption=fig_caption)
elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
- self.process_inline_text(parent_element, doc)
+ self._process_inline_text(parent_item, doc)
elif isinstance(element, marko.inline.RawText):
_log.debug(f" - Paragraph (raw text): {element.children}")
@@ -272,17 +279,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
else:
self.md_table_buffer.append(snippet_text)
else:
- self.close_table(doc)
- self.in_table = False
+ self._close_table(doc)
# most likely just inline text
self.inline_texts.append(str(element.children))
elif isinstance(element, marko.inline.CodeSpan):
- self.close_table(doc)
- self.process_inline_text(parent_element, doc)
+ self._close_table(doc)
+ self._process_inline_text(parent_item, doc)
_log.debug(f" - Code Span: {element.children}")
snippet_text = str(element.children).strip()
- doc.add_code(parent=parent_element, text=snippet_text)
+ doc.add_code(parent=parent_item, text=snippet_text)
elif (
isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
@@ -290,10 +296,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
and isinstance((first_child := element.children[0]), marko.inline.RawText)
and len(snippet_text := (first_child.children.strip())) > 0
):
- self.close_table(doc)
- self.process_inline_text(parent_element, doc)
+ self._close_table(doc)
+ self._process_inline_text(parent_item, doc)
_log.debug(f" - Code Block: {element.children}")
- doc.add_code(parent=parent_element, text=snippet_text)
+ doc.add_code(parent=parent_item, text=snippet_text)
elif isinstance(element, marko.inline.LineBreak):
if self.in_table:
@@ -302,8 +308,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
elif isinstance(element, marko.block.HTMLBlock):
self._html_blocks += 1
- self.process_inline_text(parent_element, doc)
- self.close_table(doc)
+ self._process_inline_text(parent_item, doc)
+ self._close_table(doc)
_log.debug("HTML Block: {}".format(element))
if (
len(element.body) > 0
@@ -312,18 +318,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# wrap in markers to enable post-processing in convert()
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
- doc.add_code(parent=parent_element, text=text_to_add)
+ doc.add_code(parent=parent_item, text=text_to_add)
else:
if not isinstance(element, str):
- self.close_table(doc)
+ self._close_table(doc)
_log.debug("Some other element: {}".format(element))
processed_block_types = (
- marko.block.ListItem,
marko.block.Heading,
marko.block.CodeBlock,
marko.block.FencedCode,
- # marko.block.Paragraph,
marko.inline.RawText,
)
@@ -332,7 +336,13 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
element, processed_block_types
):
for child in element.children:
- self.iterate_elements(child, depth + 1, doc, parent_element)
+ self._iterate_elements(
+ element=child,
+ depth=depth + 1,
+ doc=doc,
+ visited=visited,
+ parent_item=parent_item,
+ )
def is_valid(self) -> bool:
return self.valid
@@ -366,9 +376,15 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
marko_parser = Markdown()
parsed_ast = marko_parser.parse(self.markdown)
# Start iterating from the root of the AST
- self.iterate_elements(parsed_ast, 0, doc, None)
- self.process_inline_text(None, doc) # handle last hanging inline text
- self.close_table(doc=doc) # handle any last hanging table
+ self._iterate_elements(
+ element=parsed_ast,
+ depth=0,
+ doc=doc,
+ parent_item=None,
+ visited=set(),
+ )
+ self._process_inline_text(None, doc) # handle last hanging inline text
+ self._close_table(doc=doc) # handle any last hanging table
# if HTML blocks were detected, export to HTML and delegate to HTML backend
if self._html_blocks > 0:
diff --git a/tests/data/groundtruth/docling_v2/nested.md.md b/tests/data/groundtruth/docling_v2/nested.md.md
new file mode 100644
index 00000000..6e430e0c
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/nested.md.md
@@ -0,0 +1,31 @@
+# Nesting
+
+A list featuring nesting:
+
+- abc
+ - abc123
+ - abc1234
+ - abc12345
+ - a.
+ - b.
+ - abcd1234:
+ - abcd12345:
+ - a.
+ - b.
+- def:
+ - def1234:
+ - def12345。
+- after one empty line
+ - foo
+- afer two empty lines
+ - bar
+
+- changing symbol
+
+A nested HTML list:
+
+- First item
+- Second item with subitems:
+ - Subitem 1
+ - Subitem 2
+- Last list item
diff --git a/tests/data/md/nested.md b/tests/data/md/nested.md
new file mode 100644
index 00000000..4e203eec
--- /dev/null
+++ b/tests/data/md/nested.md
@@ -0,0 +1,66 @@
+# Nesting
+
+A list featuring nesting:
+
+- abc
+ - abc123
+ - abc1234
+ - abc12345
+ - a.
+ - b.
+ - abcd1234:
+ - abcd12345:
+ - a.
+ - b.
+- def:
+ - def1234:
+ - def12345。
+
+- after one empty line
+ - foo
+
+
+- afer two empty lines
+ - bar
+* changing symbol
+
+A nested HTML list:
+
+
+ - First item
+ - Second item with subitems:
+
+ - Subitem 1
+ - Subitem 2
+
+
+ - Last list item
+
+
+
diff --git a/tests/test_backend_markdown.py b/tests/test_backend_markdown.py
index caa94d9f..5a201ab2 100644
--- a/tests/test_backend_markdown.py
+++ b/tests/test_backend_markdown.py
@@ -4,6 +4,8 @@ from docling.backend.md_backend import MarkdownDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
+from .test_data_gen_flag import GEN_TEST_DATA
+
def test_convert_valid():
fmt = InputFormat.MD
@@ -30,6 +32,10 @@ def test_convert_valid():
act_doc = backend.convert()
act_data = act_doc.export_to_markdown()
- with open(gt_path, "r", encoding="utf-8") as f:
- exp_data = f.read().rstrip()
- assert act_data == exp_data
+ if GEN_TEST_DATA:
+ with open(gt_path, mode="w", encoding="utf-8") as f:
+ f.write(f"{act_data}\n")
+ else:
+ with open(gt_path, encoding="utf-8") as f:
+ exp_data = f.read().rstrip()
+ assert exp_data == act_data
diff --git a/tests/test_data_gen_flag.py b/tests/test_data_gen_flag.py
new file mode 100644
index 00000000..a4baff66
--- /dev/null
+++ b/tests/test_data_gen_flag.py
@@ -0,0 +1,9 @@
+import os
+
+from pydantic import TypeAdapter
+
+GEN_TEST_DATA = TypeAdapter(bool).validate_python(os.getenv("DOCLING_GEN_TEST_DATA", 0))
+
+
+def test_gen_test_data_flag():
+ assert not GEN_TEST_DATA
From 02faf5376b22e174a6aa90dc7bd95feb14a94754 Mon Sep 17 00:00:00 2001
From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Date: Fri, 7 Feb 2025 13:58:05 +0100
Subject: [PATCH 2/6] refactor: use org--name in artifacts-path (#912)
use org--name in artifacts-path
Signed-off-by: Michele Dolfi
---
docling/models/code_formula_model.py | 2 +-
docling/models/document_picture_classifier.py | 2 +-
docling/models/layout_model.py | 2 +-
docling/models/table_structure_model.py | 2 +-
4 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/docling/models/code_formula_model.py b/docling/models/code_formula_model.py
index 8bb29af2..1a0f0bf0 100644
--- a/docling/models/code_formula_model.py
+++ b/docling/models/code_formula_model.py
@@ -62,7 +62,7 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
Processes the given batch of elements and enriches them with predictions.
"""
- _model_repo_folder = "CodeFormula"
+ _model_repo_folder = "ds4sd--CodeFormula"
elements_batch_size = 5
images_scale = 1.66 # = 120 dpi, aligned with training data resolution
expansion_factor = 0.03
diff --git a/docling/models/document_picture_classifier.py b/docling/models/document_picture_classifier.py
index 302d18cb..6e71246b 100644
--- a/docling/models/document_picture_classifier.py
+++ b/docling/models/document_picture_classifier.py
@@ -56,7 +56,7 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
Processes a batch of elements and adds classification annotations.
"""
- _model_repo_folder = "DocumentFigureClassifier"
+ _model_repo_folder = "ds4sd--DocumentFigureClassifier"
images_scale = 2
def __init__(
diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py
index c88f91cb..b3cbd954 100644
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@@ -22,7 +22,7 @@ _log = logging.getLogger(__name__)
class LayoutModel(BasePageModel):
- _model_repo_folder = "docling-models"
+ _model_repo_folder = "ds4sd--docling-models"
_model_path = "model_artifacts/layout"
TEXT_ELEM_LABELS = [
diff --git a/docling/models/table_structure_model.py b/docling/models/table_structure_model.py
index b5ab5a2a..64979157 100644
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@@ -23,7 +23,7 @@ from docling.utils.profiling import TimeRecorder
class TableStructureModel(BasePageModel):
- _model_repo_folder = "docling-models"
+ _model_repo_folder = "ds4sd--docling-models"
_model_path = "model_artifacts/tableformer"
def __init__(
From fba3cf9be75e239896c353a1c0f3bdd0fa4a92fa Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Fri, 7 Feb 2025 13:36:54 +0000
Subject: [PATCH 3/6] chore: bump version to 2.19.0 [skip ci]
---
CHANGELOG.md | 17 +++++++++++++++++
pyproject.toml | 2 +-
2 files changed, 18 insertions(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4ad3b47d..8dc85cf5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,20 @@
+## [v2.19.0](https://github.com/DS4SD/docling/releases/tag/v2.19.0) - 2025-02-07
+
+### Feature
+
+* New artifacts path and CLI utility ([#876](https://github.com/DS4SD/docling/issues/876)) ([`ed74fe2`](https://github.com/DS4SD/docling/commit/ed74fe2ec0a702834f0deacfdb5717c8c587dab1))
+
+### Fix
+
+* **markdown:** Handle nested lists ([#910](https://github.com/DS4SD/docling/issues/910)) ([`90b766e`](https://github.com/DS4SD/docling/commit/90b766e2ae1695a759191df37c272efc09be5ee3))
+* Test cases for RTL programmatic PDFs and fixes for the formula model ([#903](https://github.com/DS4SD/docling/issues/903)) ([`9114ada`](https://github.com/DS4SD/docling/commit/9114ada7bc4dd45ce0046de2f9d00a80ccb25c79))
+* **msword_backend:** Handle conversion error in label parsing ([#896](https://github.com/DS4SD/docling/issues/896)) ([`722a6eb`](https://github.com/DS4SD/docling/commit/722a6eb7b994a0261312a356df80b2fced121812))
+* Enrichment models batch size and expose picture classifier ([#878](https://github.com/DS4SD/docling/issues/878)) ([`5ad6de0`](https://github.com/DS4SD/docling/commit/5ad6de05600315617b574bd12af553e00b4d316e))
+
+### Documentation
+
+* Introduce example with custom models for RapidOCR ([#874](https://github.com/DS4SD/docling/issues/874)) ([`6d3fea0`](https://github.com/DS4SD/docling/commit/6d3fea019635bd6ca94bd36c3928b28c245d638d))
+
## [v2.18.0](https://github.com/DS4SD/docling/releases/tag/v2.18.0) - 2025-02-03
### Feature
diff --git a/pyproject.toml b/pyproject.toml
index e1c30a3c..3bc88b05 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "docling"
-version = "2.18.0" # DO NOT EDIT, updated automatically
+version = "2.19.0" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Panos Vagenas ", "Peter Staar "]
license = "MIT"
From 4cc6e3ea5e858b367136acc729b723ea0552d22a Mon Sep 17 00:00:00 2001
From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Date: Fri, 7 Feb 2025 16:30:42 +0100
Subject: [PATCH 4/6] feat: Describe pictures using vision models (#259)
* draft for picture description models
Signed-off-by: Michele Dolfi
* vlm description using AutoModelForVision2Seq
Signed-off-by: Michele Dolfi
* add generation options
Signed-off-by: Michele Dolfi
* update vlm API
Signed-off-by: Michele Dolfi
* allow only localhost traffic
Signed-off-by: Michele Dolfi
* rename model
Signed-off-by: Michele Dolfi
* do not run with vlm api
Signed-off-by: Michele Dolfi
* more renaming
Signed-off-by: Michele Dolfi
* fix examples path
Signed-off-by: Michele Dolfi
* apply CLI download logic
Signed-off-by: Michele Dolfi
* fix name of cli argument
Signed-off-by: Michele Dolfi
* use with_smolvlm in models download
Signed-off-by: Michele Dolfi
---------
Signed-off-by: Michele Dolfi
---
.github/workflows/checks.yml | 2 +-
docling/cli/main.py | 5 +
docling/cli/models.py | 2 +
docling/datamodel/pipeline_options.py | 54 ++++++++-
docling/models/base_model.py | 4 +-
.../models/picture_description_api_model.py | 105 +++++++++++++++++
.../models/picture_description_base_model.py | 64 ++++++++++
.../models/picture_description_vlm_model.py | 109 ++++++++++++++++++
docling/pipeline/standard_pdf_pipeline.py | 42 ++++++-
docling/utils/model_downloader.py | 12 ++
docs/examples/pictures_description.py | 48 ++++++++
docs/examples/pictures_description_api.py | 55 +++++++++
poetry.lock | 9 +-
pyproject.toml | 8 +-
14 files changed, 508 insertions(+), 11 deletions(-)
create mode 100644 docling/models/picture_description_api_model.py
create mode 100644 docling/models/picture_description_base_model.py
create mode 100644 docling/models/picture_description_vlm_model.py
create mode 100644 docs/examples/pictures_description.py
create mode 100644 docs/examples/pictures_description_api.py
diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml
index 75ea5970..89bcfd79 100644
--- a/.github/workflows/checks.yml
+++ b/.github/workflows/checks.yml
@@ -28,7 +28,7 @@ jobs:
run: |
for file in docs/examples/*.py; do
# Skip batch_convert.py
- if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert).py ]]; then
+ if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api).py ]]; then
echo "Skipping $file"
continue
fi
diff --git a/docling/cli/main.py b/docling/cli/main.py
index 19f77e4e..e2bc0dd6 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -226,6 +226,10 @@ def convert(
help="Enable the picture classification enrichment model in the pipeline.",
),
] = False,
+ enrich_picture_description: Annotated[
+ bool,
+ typer.Option(..., help="Enable the picture description model in the pipeline."),
+ ] = False,
artifacts_path: Annotated[
Optional[Path],
typer.Option(..., help="If provided, the location of the model artifacts."),
@@ -382,6 +386,7 @@ def convert(
do_table_structure=True,
do_code_enrichment=enrich_code,
do_formula_enrichment=enrich_formula,
+ do_picture_description=enrich_picture_description,
do_picture_classification=enrich_picture_classes,
document_timeout=document_timeout,
)
diff --git a/docling/cli/models.py b/docling/cli/models.py
index aea498c5..3b62ad6b 100644
--- a/docling/cli/models.py
+++ b/docling/cli/models.py
@@ -31,6 +31,7 @@ class _AvailableModels(str, Enum):
TABLEFORMER = "tableformer"
CODE_FORMULA = "code_formula"
PICTURE_CLASSIFIER = "picture_classifier"
+ SMOLVLM = "smolvlm"
EASYOCR = "easyocr"
@@ -81,6 +82,7 @@ def download(
with_tableformer=_AvailableModels.TABLEFORMER in to_download,
with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
+ with_smolvlm=_AvailableModels.SMOLVLM in to_download,
with_easyocr=_AvailableModels.EASYOCR in to_download,
)
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 14ca75bf..3b6401b6 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -2,9 +2,9 @@ import logging
import os
from enum import Enum
from pathlib import Path
-from typing import Any, List, Literal, Optional, Union
+from typing import Annotated, Any, Dict, List, Literal, Optional, Union
-from pydantic import BaseModel, ConfigDict, Field, model_validator
+from pydantic import AnyUrl, BaseModel, ConfigDict, Field, model_validator
from pydantic_settings import BaseSettings, SettingsConfigDict
_log = logging.getLogger(__name__)
@@ -184,6 +184,51 @@ class OcrMacOptions(OcrOptions):
)
+class PictureDescriptionBaseOptions(BaseModel):
+ kind: str
+ batch_size: int = 8
+ scale: float = 2
+
+ bitmap_area_threshold: float = (
+ 0.2 # percentage of the area for a bitmap to processed with the models
+ )
+
+
+class PictureDescriptionApiOptions(PictureDescriptionBaseOptions):
+ kind: Literal["api"] = "api"
+
+ url: AnyUrl = AnyUrl("http://localhost:8000/v1/chat/completions")
+ headers: Dict[str, str] = {}
+ params: Dict[str, Any] = {}
+ timeout: float = 20
+
+ prompt: str = "Describe this image in a few sentences."
+ provenance: str = ""
+
+
+class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
+ kind: Literal["vlm"] = "vlm"
+
+ repo_id: str
+ prompt: str = "Describe this image in a few sentences."
+ # Config from here https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationConfig
+ generation_config: Dict[str, Any] = dict(max_new_tokens=200, do_sample=False)
+
+ @property
+ def repo_cache_folder(self) -> str:
+ return self.repo_id.replace("/", "--")
+
+
+smolvlm_picture_description = PictureDescriptionVlmOptions(
+ repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
+)
+# phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")
+granite_picture_description = PictureDescriptionVlmOptions(
+ repo_id="ibm-granite/granite-vision-3.1-2b-preview",
+ prompt="What is shown in this image?",
+)
+
+
# Define an enum for the backend options
class PdfBackend(str, Enum):
"""Enum of valid PDF backends."""
@@ -223,6 +268,7 @@ class PdfPipelineOptions(PipelineOptions):
do_code_enrichment: bool = False # True: perform code OCR
do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
do_picture_classification: bool = False # True: classify pictures in documents
+ do_picture_description: bool = False # True: run describe pictures in documents
table_structure_options: TableStructureOptions = TableStructureOptions()
ocr_options: Union[
@@ -232,6 +278,10 @@ class PdfPipelineOptions(PipelineOptions):
OcrMacOptions,
RapidOcrOptions,
] = Field(EasyOcrOptions(), discriminator="kind")
+ picture_description_options: Annotated[
+ Union[PictureDescriptionApiOptions, PictureDescriptionVlmOptions],
+ Field(discriminator="kind"),
+ ] = smolvlm_picture_description
images_scale: float = 1.0
generate_page_images: bool = False
diff --git a/docling/models/base_model.py b/docling/models/base_model.py
index a2bc776e..9cdc0ecb 100644
--- a/docling/models/base_model.py
+++ b/docling/models/base_model.py
@@ -1,7 +1,7 @@
from abc import ABC, abstractmethod
from typing import Any, Generic, Iterable, Optional
-from docling_core.types.doc import BoundingBox, DoclingDocument, NodeItem, TextItem
+from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem
from typing_extensions import TypeVar
from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page
@@ -64,7 +64,7 @@ class BaseItemAndImageEnrichmentModel(
if not self.is_processable(doc=conv_res.document, element=element):
return None
- assert isinstance(element, TextItem)
+ assert isinstance(element, DocItem)
element_prov = element.prov[0]
bbox = element_prov.bbox
diff --git a/docling/models/picture_description_api_model.py b/docling/models/picture_description_api_model.py
new file mode 100644
index 00000000..6c7e02fc
--- /dev/null
+++ b/docling/models/picture_description_api_model.py
@@ -0,0 +1,105 @@
+import base64
+import io
+import logging
+from typing import Iterable, List, Optional
+
+import httpx
+from docling_core.types.doc import PictureItem
+from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc
+ PictureDescriptionData,
+)
+from PIL import Image
+from pydantic import BaseModel, ConfigDict
+
+from docling.datamodel.pipeline_options import PictureDescriptionApiOptions
+from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+
+_log = logging.getLogger(__name__)
+
+
+class ChatMessage(BaseModel):
+ role: str
+ content: str
+
+
+class ResponseChoice(BaseModel):
+ index: int
+ message: ChatMessage
+ finish_reason: str
+
+
+class ResponseUsage(BaseModel):
+ prompt_tokens: int
+ completion_tokens: int
+ total_tokens: int
+
+
+class ApiResponse(BaseModel):
+ model_config = ConfigDict(
+ protected_namespaces=(),
+ )
+
+ id: str
+ model: Optional[str] = None # returned by openai
+ choices: List[ResponseChoice]
+ created: int
+ usage: ResponseUsage
+
+
+class PictureDescriptionApiModel(PictureDescriptionBaseModel):
+ # elements_batch_size = 4
+
+ def __init__(self, enabled: bool, options: PictureDescriptionApiOptions):
+ super().__init__(enabled=enabled, options=options)
+ self.options: PictureDescriptionApiOptions
+
+ if self.enabled:
+ if options.url.host != "localhost":
+ raise NotImplementedError(
+ "The options try to connect to remote APIs which are not yet allowed."
+ )
+
+ def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
+ # Note: technically we could make a batch request here,
+ # but not all APIs will allow for it. For example, vllm won't allow more than 1.
+ for image in images:
+ img_io = io.BytesIO()
+ image.save(img_io, "PNG")
+ image_base64 = base64.b64encode(img_io.getvalue()).decode("utf-8")
+
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "text",
+ "text": self.options.prompt,
+ },
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": f"data:image/png;base64,{image_base64}"
+ },
+ },
+ ],
+ }
+ ]
+
+ payload = {
+ "messages": messages,
+ **self.options.params,
+ }
+
+ r = httpx.post(
+ str(self.options.url),
+ headers=self.options.headers,
+ json=payload,
+ timeout=self.options.timeout,
+ )
+ if not r.is_success:
+ _log.error(f"Error calling the API. Reponse was {r.text}")
+ r.raise_for_status()
+
+ api_resp = ApiResponse.model_validate_json(r.text)
+ generated_text = api_resp.choices[0].message.content.strip()
+ yield generated_text
diff --git a/docling/models/picture_description_base_model.py b/docling/models/picture_description_base_model.py
new file mode 100644
index 00000000..b653e0e3
--- /dev/null
+++ b/docling/models/picture_description_base_model.py
@@ -0,0 +1,64 @@
+import logging
+from pathlib import Path
+from typing import Any, Iterable, List, Optional, Union
+
+from docling_core.types.doc import (
+ DoclingDocument,
+ NodeItem,
+ PictureClassificationClass,
+ PictureItem,
+)
+from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc
+ PictureDescriptionData,
+)
+from PIL import Image
+
+from docling.datamodel.pipeline_options import PictureDescriptionBaseOptions
+from docling.models.base_model import (
+ BaseItemAndImageEnrichmentModel,
+ ItemAndImageEnrichmentElement,
+)
+
+
+class PictureDescriptionBaseModel(BaseItemAndImageEnrichmentModel):
+ images_scale: float = 2.0
+
+ def __init__(
+ self,
+ enabled: bool,
+ options: PictureDescriptionBaseOptions,
+ ):
+ self.enabled = enabled
+ self.options = options
+ self.provenance = "not-implemented"
+
+ def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
+ return self.enabled and isinstance(element, PictureItem)
+
+ def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
+ raise NotImplementedError
+
+ def __call__(
+ self,
+ doc: DoclingDocument,
+ element_batch: Iterable[ItemAndImageEnrichmentElement],
+ ) -> Iterable[NodeItem]:
+ if not self.enabled:
+ for element in element_batch:
+ yield element.item
+ return
+
+ images: List[Image.Image] = []
+ elements: List[PictureItem] = []
+ for el in element_batch:
+ assert isinstance(el.item, PictureItem)
+ elements.append(el.item)
+ images.append(el.image)
+
+ outputs = self._annotate_images(images)
+
+ for item, output in zip(elements, outputs):
+ item.annotations.append(
+ PictureDescriptionData(text=output, provenance=self.provenance)
+ )
+ yield item
diff --git a/docling/models/picture_description_vlm_model.py b/docling/models/picture_description_vlm_model.py
new file mode 100644
index 00000000..9fa4826d
--- /dev/null
+++ b/docling/models/picture_description_vlm_model.py
@@ -0,0 +1,109 @@
+from pathlib import Path
+from typing import Iterable, Optional, Union
+
+from PIL import Image
+
+from docling.datamodel.pipeline_options import (
+ AcceleratorOptions,
+ PictureDescriptionVlmOptions,
+)
+from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+from docling.utils.accelerator_utils import decide_device
+
+
+class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
+
+ def __init__(
+ self,
+ enabled: bool,
+ artifacts_path: Optional[Union[Path, str]],
+ options: PictureDescriptionVlmOptions,
+ accelerator_options: AcceleratorOptions,
+ ):
+ super().__init__(enabled=enabled, options=options)
+ self.options: PictureDescriptionVlmOptions
+
+ if self.enabled:
+
+ if artifacts_path is None:
+ artifacts_path = self.download_models(repo_id=self.options.repo_id)
+ else:
+ artifacts_path = Path(artifacts_path) / self.options.repo_cache_folder
+
+ self.device = decide_device(accelerator_options.device)
+
+ try:
+ import torch
+ from transformers import AutoModelForVision2Seq, AutoProcessor
+ except ImportError:
+ raise ImportError(
+ "transformers >=4.46 is not installed. Please install Docling with the required extras `pip install docling[vlm]`."
+ )
+
+ # Initialize processor and model
+ self.processor = AutoProcessor.from_pretrained(self.options.repo_id)
+ self.model = AutoModelForVision2Seq.from_pretrained(
+ self.options.repo_id,
+ torch_dtype=torch.bfloat16,
+ _attn_implementation=(
+ "flash_attention_2" if self.device.startswith("cuda") else "eager"
+ ),
+ ).to(self.device)
+
+ self.provenance = f"{self.options.repo_id}"
+
+ @staticmethod
+ def download_models(
+ repo_id: str,
+ local_dir: Optional[Path] = None,
+ force: bool = False,
+ progress: bool = False,
+ ) -> Path:
+ from huggingface_hub import snapshot_download
+ from huggingface_hub.utils import disable_progress_bars
+
+ if not progress:
+ disable_progress_bars()
+ download_path = snapshot_download(
+ repo_id=repo_id,
+ force_download=force,
+ local_dir=local_dir,
+ )
+
+ return Path(download_path)
+
+ def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
+ from transformers import GenerationConfig
+
+ # Create input messages
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {"type": "image"},
+ {"type": "text", "text": self.options.prompt},
+ ],
+ },
+ ]
+
+ # TODO: do batch generation
+
+ for image in images:
+ # Prepare inputs
+ prompt = self.processor.apply_chat_template(
+ messages, add_generation_prompt=True
+ )
+ inputs = self.processor(text=prompt, images=[image], return_tensors="pt")
+ inputs = inputs.to(self.device)
+
+ # Generate outputs
+ generated_ids = self.model.generate(
+ **inputs,
+ generation_config=GenerationConfig(**self.options.generation_config),
+ )
+ generated_texts = self.processor.batch_decode(
+ generated_ids[:, inputs["input_ids"].shape[1] :],
+ skip_special_tokens=True,
+ )
+
+ yield generated_texts[0].strip()
diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py
index 4e66415f..13e435f9 100644
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@@ -14,6 +14,8 @@ from docling.datamodel.pipeline_options import (
EasyOcrOptions,
OcrMacOptions,
PdfPipelineOptions,
+ PictureDescriptionApiOptions,
+ PictureDescriptionVlmOptions,
RapidOcrOptions,
TesseractCliOcrOptions,
TesseractOcrOptions,
@@ -34,6 +36,9 @@ from docling.models.page_preprocessing_model import (
PagePreprocessingModel,
PagePreprocessingOptions,
)
+from docling.models.picture_description_api_model import PictureDescriptionApiModel
+from docling.models.picture_description_base_model import PictureDescriptionBaseModel
+from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
from docling.models.rapid_ocr_model import RapidOcrModel
from docling.models.table_structure_model import TableStructureModel
from docling.models.tesseract_ocr_cli_model import TesseractOcrCliModel
@@ -95,8 +100,17 @@ class StandardPdfPipeline(PaginatedPipeline):
PageAssembleModel(options=PageAssembleOptions()),
]
+ # Picture description model
+ if (
+ picture_description_model := self.get_picture_description_model(
+ artifacts_path=artifacts_path
+ )
+ ) is None:
+ raise RuntimeError(
+ f"The specified picture description kind is not supported: {pipeline_options.picture_description_options.kind}."
+ )
+
self.enrichment_pipe = [
- # Other models working on `NodeItem` elements in the DoclingDocument
# Code Formula Enrichment Model
CodeFormulaModel(
enabled=pipeline_options.do_code_enrichment
@@ -115,11 +129,14 @@ class StandardPdfPipeline(PaginatedPipeline):
options=DocumentPictureClassifierOptions(),
accelerator_options=pipeline_options.accelerator_options,
),
+ # Document Picture description
+ picture_description_model,
]
if (
self.pipeline_options.do_formula_enrichment
or self.pipeline_options.do_code_enrichment
+ or self.pipeline_options.do_picture_description
):
self.keep_backend = True
@@ -175,6 +192,29 @@ class StandardPdfPipeline(PaginatedPipeline):
)
return None
+ def get_picture_description_model(
+ self, artifacts_path: Optional[Path] = None
+ ) -> Optional[PictureDescriptionBaseModel]:
+ if isinstance(
+ self.pipeline_options.picture_description_options,
+ PictureDescriptionApiOptions,
+ ):
+ return PictureDescriptionApiModel(
+ enabled=self.pipeline_options.do_picture_description,
+ options=self.pipeline_options.picture_description_options,
+ )
+ elif isinstance(
+ self.pipeline_options.picture_description_options,
+ PictureDescriptionVlmOptions,
+ ):
+ return PictureDescriptionVlmModel(
+ enabled=self.pipeline_options.do_picture_description,
+ artifacts_path=artifacts_path,
+ options=self.pipeline_options.picture_description_options,
+ accelerator_options=self.pipeline_options.accelerator_options,
+ )
+ return None
+
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
with TimeRecorder(conv_res, "page_init"):
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
diff --git a/docling/utils/model_downloader.py b/docling/utils/model_downloader.py
index 504618ec..7d22b77b 100644
--- a/docling/utils/model_downloader.py
+++ b/docling/utils/model_downloader.py
@@ -2,11 +2,13 @@ import logging
from pathlib import Path
from typing import Optional
+from docling.datamodel.pipeline_options import smolvlm_picture_description
from docling.datamodel.settings import settings
from docling.models.code_formula_model import CodeFormulaModel
from docling.models.document_picture_classifier import DocumentPictureClassifier
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
+from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
from docling.models.table_structure_model import TableStructureModel
_log = logging.getLogger(__name__)
@@ -21,6 +23,7 @@ def download_models(
with_tableformer: bool = True,
with_code_formula: bool = True,
with_picture_classifier: bool = True,
+ with_smolvlm: bool = True,
with_easyocr: bool = True,
):
if output_dir is None:
@@ -61,6 +64,15 @@ def download_models(
progress=progress,
)
+ if with_smolvlm:
+ _log.info(f"Downloading SmolVlm model...")
+ PictureDescriptionVlmModel.download_models(
+ repo_id=smolvlm_picture_description.repo_id,
+ local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
+ force=force,
+ progress=progress,
+ )
+
if with_easyocr:
_log.info(f"Downloading easyocr models...")
EasyOcrModel.download_models(
diff --git a/docs/examples/pictures_description.py b/docs/examples/pictures_description.py
new file mode 100644
index 00000000..f60ac29d
--- /dev/null
+++ b/docs/examples/pictures_description.py
@@ -0,0 +1,48 @@
+import logging
+from pathlib import Path
+
+from docling_core.types.doc import PictureItem
+
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import (
+ PdfPipelineOptions,
+ granite_picture_description,
+ smolvlm_picture_description,
+)
+from docling.document_converter import DocumentConverter, PdfFormatOption
+
+
+def main():
+ logging.basicConfig(level=logging.INFO)
+
+ input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+
+ pipeline_options = PdfPipelineOptions()
+ pipeline_options.do_picture_description = True
+ pipeline_options.picture_description_options = smolvlm_picture_description
+ # pipeline_options.picture_description_options = granite_picture_description
+
+    pipeline_options.picture_description_options.prompt = (
+        "Describe the image in three sentences. Be concise and accurate."
+    )
+
+ doc_converter = DocumentConverter(
+ format_options={
+ InputFormat.PDF: PdfFormatOption(
+ pipeline_options=pipeline_options,
+ )
+ }
+ )
+ result = doc_converter.convert(input_doc_path)
+
+ for element, _level in result.document.iterate_items():
+ if isinstance(element, PictureItem):
+ print(
+ f"Picture {element.self_ref}\n"
+ f"Caption: {element.caption_text(doc=result.document)}\n"
+ f"Annotations: {element.annotations}"
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/docs/examples/pictures_description_api.py b/docs/examples/pictures_description_api.py
new file mode 100644
index 00000000..3da37edf
--- /dev/null
+++ b/docs/examples/pictures_description_api.py
@@ -0,0 +1,55 @@
+import logging
+from pathlib import Path
+
+from docling_core.types.doc import PictureItem
+
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import (
+ PdfPipelineOptions,
+ PictureDescriptionApiOptions,
+)
+from docling.document_converter import DocumentConverter, PdfFormatOption
+
+
+def main():
+ logging.basicConfig(level=logging.INFO)
+
+ input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+
+ # This is using a local API server to do picture description.
+ # For example, you can launch it locally with:
+ # $ vllm serve "HuggingFaceTB/SmolVLM-256M-Instruct"
+
+ pipeline_options = PdfPipelineOptions()
+ pipeline_options.do_picture_description = True
+ pipeline_options.picture_description_options = PictureDescriptionApiOptions(
+ url="http://localhost:8000/v1/chat/completions",
+ params=dict(
+ model="HuggingFaceTB/SmolVLM-256M-Instruct",
+ seed=42,
+ max_completion_tokens=200,
+ ),
+        prompt="Describe the image in three sentences. Be concise and accurate.",
+ timeout=90,
+ )
+
+ doc_converter = DocumentConverter(
+ format_options={
+ InputFormat.PDF: PdfFormatOption(
+ pipeline_options=pipeline_options,
+ )
+ }
+ )
+ result = doc_converter.convert(input_doc_path)
+
+ for element, _level in result.document.iterate_items():
+ if isinstance(element, PictureItem):
+ print(
+ f"Picture {element.self_ref}\n"
+ f"Caption: {element.caption_text(doc=result.document)}\n"
+ f"Annotations: {element.annotations}"
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/poetry.lock b/poetry.lock
index b261db4b..691dd844 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2727,13 +2727,13 @@ pygments = ">2.12.0"
[[package]]
name = "mkdocs-material"
-version = "9.6.2"
+version = "9.6.3"
description = "Documentation that simply works"
optional = false
python-versions = ">=3.8"
files = [
- {file = "mkdocs_material-9.6.2-py3-none-any.whl", hash = "sha256:71d90dbd63b393ad11a4d90151dfe3dcbfcd802c0f29ce80bebd9bbac6abc753"},
- {file = "mkdocs_material-9.6.2.tar.gz", hash = "sha256:a3de1c5d4c745f10afa78b1a02f917b9dce0808fb206adc0f5bb48b58c1ca21f"},
+ {file = "mkdocs_material-9.6.3-py3-none-any.whl", hash = "sha256:1125622067e26940806701219303b27c0933e04533560725d97ec26fd16a39cf"},
+ {file = "mkdocs_material-9.6.3.tar.gz", hash = "sha256:c87f7d1c39ce6326da5e10e232aed51bae46252e646755900f4b0fc9192fa832"},
]
[package.dependencies]
@@ -7846,8 +7846,9 @@ type = ["pytest-mypy"]
ocrmac = ["ocrmac"]
rapidocr = ["onnxruntime", "onnxruntime", "rapidocr-onnxruntime"]
tesserocr = ["tesserocr"]
+vlm = ["transformers", "transformers"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
-content-hash = "ca0464df452664834ae9bccc59f89240e2f5e8f3b179761de615548c799680e7"
+content-hash = "86d266adc6272f3db65ab07f5cce35cbe9626368dc0e09ab374c861f0809f693"
diff --git a/pyproject.toml b/pyproject.toml
index 3bc88b05..9b1b5e9b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -59,6 +59,10 @@ onnxruntime = [
{ version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
]
+transformers = [
+ {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true },
+ {markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true }
+]
pillow = "^10.0.0"
tqdm = "^4.65.0"
@@ -121,6 +125,7 @@ torchvision = [
[tool.poetry.extras]
tesserocr = ["tesserocr"]
ocrmac = ["ocrmac"]
+vlm = ["transformers"]
rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
[tool.poetry.scripts]
@@ -162,7 +167,8 @@ module = [
"deepsearch_glm.*",
"lxml.*",
"bs4.*",
- "huggingface_hub.*"
+ "huggingface_hub.*",
+ "transformers.*",
]
ignore_missing_imports = true
From c18f47c5c032c49bf3175aecd2236df37c0e9ae1 Mon Sep 17 00:00:00 2001
From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Date: Fri, 7 Feb 2025 17:51:31 +0100
Subject: [PATCH 5/6] fix: remove unused httpx (#919)
* remove unused httpx
Signed-off-by: Michele Dolfi
* use requests instead of httpx
Signed-off-by: Michele Dolfi
* remove more usage of httpx
Signed-off-by: Michele Dolfi
---------
Signed-off-by: Michele Dolfi
---
docling/models/easyocr_model.py | 2 --
docling/models/picture_description_api_model.py | 10 +++-------
2 files changed, 3 insertions(+), 9 deletions(-)
diff --git a/docling/models/easyocr_model.py b/docling/models/easyocr_model.py
index 9b1b2a02..0eccb988 100644
--- a/docling/models/easyocr_model.py
+++ b/docling/models/easyocr_model.py
@@ -4,9 +4,7 @@ import zipfile
from pathlib import Path
from typing import Iterable, List, Optional
-import httpx
import numpy
-import torch
from docling_core.types.doc import BoundingBox, CoordOrigin
from docling.datamodel.base_models import Cell, OcrCell, Page
diff --git a/docling/models/picture_description_api_model.py b/docling/models/picture_description_api_model.py
index 6c7e02fc..86b76944 100644
--- a/docling/models/picture_description_api_model.py
+++ b/docling/models/picture_description_api_model.py
@@ -3,11 +3,7 @@ import io
import logging
from typing import Iterable, List, Optional
-import httpx
-from docling_core.types.doc import PictureItem
-from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc
- PictureDescriptionData,
-)
+import requests
from PIL import Image
from pydantic import BaseModel, ConfigDict
@@ -90,13 +86,13 @@ class PictureDescriptionApiModel(PictureDescriptionBaseModel):
**self.options.params,
}
- r = httpx.post(
+ r = requests.post(
str(self.options.url),
headers=self.options.headers,
json=payload,
timeout=self.options.timeout,
)
- if not r.is_success:
+ if not r.ok:
_log.error(f"Error calling the API. Reponse was {r.text}")
r.raise_for_status()
From 3e26597995f236fe81ccd7f1a247d05b8a8420cb Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Fri, 7 Feb 2025 17:46:36 +0000
Subject: [PATCH 6/6] chore: bump version to 2.20.0 [skip ci]
---
CHANGELOG.md | 10 ++++++++++
pyproject.toml | 2 +-
2 files changed, 11 insertions(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8dc85cf5..030c6954 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,13 @@
+## [v2.20.0](https://github.com/DS4SD/docling/releases/tag/v2.20.0) - 2025-02-07
+
+### Feature
+
+* Describe pictures using vision models ([#259](https://github.com/DS4SD/docling/issues/259)) ([`4cc6e3e`](https://github.com/DS4SD/docling/commit/4cc6e3ea5e858b367136acc729b723ea0552d22a))
+
+### Fix
+
+* Remove unused httpx ([#919](https://github.com/DS4SD/docling/issues/919)) ([`c18f47c`](https://github.com/DS4SD/docling/commit/c18f47c5c032c49bf3175aecd2236df37c0e9ae1))
+
## [v2.19.0](https://github.com/DS4SD/docling/releases/tag/v2.19.0) - 2025-02-07
### Feature
diff --git a/pyproject.toml b/pyproject.toml
index 9b1b5e9b..e4425ffe 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "docling"
-version = "2.19.0" # DO NOT EDIT, updated automatically
+version = "2.20.0" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Panos Vagenas ", "Peter Staar "]
license = "MIT"