diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml
index f18fc88c..ddc06f28 100644
--- a/.github/workflows/checks.yml
+++ b/.github/workflows/checks.yml
@@ -22,8 +22,8 @@ jobs:
python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
steps:
- uses: actions/checkout@v4
- - name: Install tesseract
- run: sudo apt-get update && sudo apt-get install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
+ - name: Install tesseract and ffmpeg
+ run: sudo apt-get update && sudo apt-get install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config
- name: Set TESSDATA_PREFIX
run: |
echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
@@ -60,7 +60,7 @@ jobs:
run: |
for file in docs/examples/*.py; do
# Skip batch_convert.py
- if [[ "$(basename "$file")" =~ ^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model).py ]]; then
+ if [[ "$(basename "$file")" =~ ^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|minimal_asr_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model).py ]]; then
echo "Skipping $file"
continue
fi
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4f7d4475..3e04a4dd 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,42 @@
+## [v2.39.0](https://github.com/docling-project/docling/releases/tag/v2.39.0) - 2025-06-27
+
+### Feature
+
+* Leverage new list modeling, capture default markers ([#1856](https://github.com/docling-project/docling/issues/1856)) ([`0533da1`](https://github.com/docling-project/docling/commit/0533da1923598e4a2d6392283f6de0f9c7002b01))
+
+### Fix
+
+* **markdown:** Make parsing of rich table cells valid ([#1821](https://github.com/docling-project/docling/issues/1821)) ([`e79e4f0`](https://github.com/docling-project/docling/commit/e79e4f0ab6c5b8276316e423b14c9821165049f2))
+
+## [v2.38.1](https://github.com/docling-project/docling/releases/tag/v2.38.1) - 2025-06-25
+
+### Fix
+
+* Updated granite vision model version for picture description ([#1852](https://github.com/docling-project/docling/issues/1852)) ([`d337825`](https://github.com/docling-project/docling/commit/d337825b8ef9ab3ec00c1496c340041e406bd271))
+* **markdown:** Fix single-formatted headings & list items ([#1820](https://github.com/docling-project/docling/issues/1820)) ([`7c5614a`](https://github.com/docling-project/docling/commit/7c5614a37a316950c9a1d123e4fd94e0e831aca0))
+* Fix response type of ollama ([#1850](https://github.com/docling-project/docling/issues/1850)) ([`41e8cae`](https://github.com/docling-project/docling/commit/41e8cae26b625b95ffab021fb4dc337249e8caad))
+* Handle missing runs to avoid out of range exception ([#1844](https://github.com/docling-project/docling/issues/1844)) ([`4002de1`](https://github.com/docling-project/docling/commit/4002de1f9220a6568ed87ba726254cde3ab1168a))
+
+## [v2.38.0](https://github.com/docling-project/docling/releases/tag/v2.38.0) - 2025-06-23
+
+### Feature
+
+* Support audio input ([#1763](https://github.com/docling-project/docling/issues/1763)) ([`1557e7c`](https://github.com/docling-project/docling/commit/1557e7ce3e036fb51eb118296f5cbff3b6dfbfa7))
+* **markdown:** Add formatting & improve inline support ([#1804](https://github.com/docling-project/docling/issues/1804)) ([`861abcd`](https://github.com/docling-project/docling/commit/861abcdcb0d406342b9566f81203b87cf32b7ad0))
+* Maximum image size for Vlm models ([#1802](https://github.com/docling-project/docling/issues/1802)) ([`215b540`](https://github.com/docling-project/docling/commit/215b540f6c078a72464310ef22975ebb6cde4f0a))
+
+### Fix
+
+* **docx:** Ensure list items have a list parent ([#1827](https://github.com/docling-project/docling/issues/1827)) ([`d26dac6`](https://github.com/docling-project/docling/commit/d26dac61a86b0af5b16686f78956ba047bcbddba))
+* **msword_backend:** Identify text in the same line after an image #1425 ([#1610](https://github.com/docling-project/docling/issues/1610)) ([`1350a8d`](https://github.com/docling-project/docling/commit/1350a8d3e5ea3c4b4d506757758880c8f78efd8c))
+* Ensure uninitialized pages are removed before assembling document ([#1812](https://github.com/docling-project/docling/issues/1812)) ([`dd7f64f`](https://github.com/docling-project/docling/commit/dd7f64ff28226cd9964fc4d8ba807b2c8a6358ef))
+* Formula conversion with page_range param set ([#1791](https://github.com/docling-project/docling/issues/1791)) ([`dbab30e`](https://github.com/docling-project/docling/commit/dbab30e92cc1d130ce7f9335ab9c46aa7a30930d))
+
+### Documentation
+
+* Update readme and add ASR example ([#1836](https://github.com/docling-project/docling/issues/1836)) ([`f3ae302`](https://github.com/docling-project/docling/commit/f3ae3029b8a6d6f0109383fbc82ebf9da3942afd))
+* Support running examples from root or subfolder ([#1816](https://github.com/docling-project/docling/issues/1816)) ([`64ac043`](https://github.com/docling-project/docling/commit/64ac043786efdece0c61827051a5b41dddf6c5d7))
+
## [v2.37.0](https://github.com/docling-project/docling/releases/tag/v2.37.0) - 2025-06-16
### Feature
diff --git a/README.md b/README.md
index 309e1030..c53e7b79 100644
--- a/README.md
+++ b/README.md
@@ -28,14 +28,15 @@ Docling simplifies document processing, parsing diverse formats — including ad
## Features
-* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more
+* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
-* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON
+* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
* 🔍 Extensive OCR support for scanned PDFs and images
-* 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
+* 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
+* 🎙️ Support for audio with Automatic Speech Recognition (ASR) models
* 💻 Simple and convenient CLI
### Coming soon
diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index 7c716908..3b9a55a5 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -17,6 +17,7 @@ from docling_core.types.doc import (
TableData,
)
from docling_core.types.doc.document import ContentLayer
+from pydantic import BaseModel
from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
@@ -48,6 +49,11 @@ TAGS_FOR_NODE_ITEMS: Final = [
]
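+# Parsing context for a single document: for each list group (keyed by its
+# self_ref), track whether the list is ordered and any explicit start value.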
+class _Context(BaseModel):
+ list_ordered_flag_by_ref: dict[str, bool] = {}
+ list_start_by_ref: dict[str, int] = {}
+
+
class HTMLDocumentBackend(DeclarativeDocumentBackend):
@override
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
@@ -59,6 +65,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.max_levels = 10
self.level = 0
self.parents: dict[int, Optional[Union[DocItem, GroupItem]]] = {}
+ self.ctx = _Context()
for i in range(self.max_levels):
self.parents[i] = None
@@ -121,6 +128,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.content_layer = (
ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
)
+ self.ctx = _Context() # reset context
self.walk(content, doc)
else:
raise RuntimeError(
@@ -294,28 +302,25 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles list tags (ul, ol) and their list items."""
- if element.name == "ul":
- # create a list group
- self.parents[self.level + 1] = doc.add_group(
- parent=self.parents[self.level],
- name="list",
- label=GroupLabel.LIST,
- content_layer=self.content_layer,
- )
- elif element.name == "ol":
+ start: Optional[int] = None
+ if is_ordered := element.name == "ol":
start_attr = element.get("start")
- start: int = (
- int(start_attr)
- if isinstance(start_attr, str) and start_attr.isnumeric()
- else 1
- )
- # create a list group
- self.parents[self.level + 1] = doc.add_group(
- parent=self.parents[self.level],
- name="ordered list" + (f" start {start}" if start != 1 else ""),
- label=GroupLabel.ORDERED_LIST,
- content_layer=self.content_layer,
- )
+ if isinstance(start_attr, str) and start_attr.isnumeric():
+ start = int(start_attr)
+ name = "ordered list" + (f" start {start}" if start is not None else "")
+ else:
+ name = "list"
+ # create a list group
+ list_group = doc.add_list_group(
+ name=name,
+ parent=self.parents[self.level],
+ content_layer=self.content_layer,
+ )
+ self.parents[self.level + 1] = list_group
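+ # remember the list kind (and start value, if ordered) so that
+ # handle_list_item can later derive each item's marker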
+ self.ctx.list_ordered_flag_by_ref[list_group.self_ref] = is_ordered
+ if is_ordered and start is not None:
+ self.ctx.list_start_by_ref[list_group.self_ref] = start
+
self.level += 1
self.walk(element, doc)
@@ -331,16 +336,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if parent is None:
_log.debug(f"list-item has no parent in DoclingDocument: {element}")
return
- parent_label: str = parent.label
- index_in_list = len(parent.children) + 1
- if (
- parent_label == GroupLabel.ORDERED_LIST
- and isinstance(parent, GroupItem)
- and parent.name
- ):
- start_in_list: str = parent.name.split(" ")[-1]
- start: int = int(start_in_list) if start_in_list.isnumeric() else 1
- index_in_list += start - 1
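+ # with an explicit start value, the marker is start + number of items added so far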
+ enumerated = self.ctx.list_ordered_flag_by_ref.get(parent.self_ref, False)
+ if enumerated and (start := self.ctx.list_start_by_ref.get(parent.self_ref)):
+ marker = f"{start + len(parent.children)}."
+ else:
+ marker = ""
if nested_list:
# Text in list item can be hidden within hierarchy, hence
@@ -350,12 +350,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
text = text.replace("\n", "").replace("\r", "")
text = " ".join(text.split()).strip()
- marker = ""
- enumerated = False
- if parent_label == GroupLabel.ORDERED_LIST:
- marker = str(index_in_list)
- enumerated = True
-
if len(text) > 0:
# create a list-item
self.parents[self.level + 1] = doc.add_list_item(
@@ -375,11 +369,6 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
elif element.text.strip():
text = element.text.strip()
- marker = ""
- enumerated = False
- if parent_label == GroupLabel.ORDERED_LIST:
- marker = f"{index_in_list!s}."
- enumerated = True
doc.add_list_item(
text=text,
enumerated=enumerated,
diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py
index b8b0e6d0..fb42547e 100644
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@@ -2,9 +2,10 @@ import logging
import re
import warnings
from copy import deepcopy
+from enum import Enum
from io import BytesIO
from pathlib import Path
-from typing import List, Optional, Set, Union
+from typing import List, Literal, Optional, Set, Union
import marko
import marko.element
@@ -13,15 +14,15 @@ from docling_core.types.doc import (
DocItemLabel,
DoclingDocument,
DocumentOrigin,
- GroupLabel,
NodeItem,
TableCell,
TableData,
TextItem,
)
-from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
+from docling_core.types.doc.document import Formatting
from marko import Markdown
-from pydantic import AnyUrl, TypeAdapter
+from pydantic import AnyUrl, BaseModel, Field, TypeAdapter
+from typing_extensions import Annotated
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
@@ -35,6 +36,32 @@ _START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"
+class _PendingCreationType(str, Enum):
+ """CoordOrigin."""
+
+ HEADING = "heading"
+ LIST_ITEM = "list_item"
+
+
+class _HeadingCreationPayload(BaseModel):
+ kind: Literal["heading"] = "heading"
+ level: int
+
+
+class _ListItemCreationPayload(BaseModel):
+ kind: Literal["list_item"] = "list_item"
+ enumerated: bool
+
+
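+# Discriminated union of payloads for items whose creation is deferred until
+# their text (a RawText node) is reached while walking marko's AST.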
+_CreationPayload = Annotated[
+ Union[
+ _HeadingCreationPayload,
+ _ListItemCreationPayload,
+ ],
+ Field(discriminator="kind"),
+]
+
+
class MarkdownDocumentBackend(DeclarativeDocumentBackend):
def _shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
# This regex will match any sequence of underscores
@@ -155,6 +182,50 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
doc.add_table(data=table_data)
return
+ def _create_list_item(
+ self,
+ doc: DoclingDocument,
+ parent_item: Optional[NodeItem],
+ text: str,
+ enumerated: bool,
+ formatting: Optional[Formatting] = None,
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
+ ):
+ item = doc.add_list_item(
+ text=text,
+ enumerated=enumerated,
+ parent=parent_item,
+ formatting=formatting,
+ hyperlink=hyperlink,
+ )
+ return item
+
+ def _create_heading_item(
+ self,
+ doc: DoclingDocument,
+ parent_item: Optional[NodeItem],
+ text: str,
+ level: int,
+ formatting: Optional[Formatting] = None,
+ hyperlink: Optional[Union[AnyUrl, Path]] = None,
+ ):
+ if level == 1:
+ item = doc.add_title(
+ text=text,
+ parent=parent_item,
+ formatting=formatting,
+ hyperlink=hyperlink,
+ )
+ else:
+ item = doc.add_heading(
+ text=text,
+ level=level - 1,
+ parent=parent_item,
+ formatting=formatting,
+ hyperlink=hyperlink,
+ )
+ return item
+
def _iterate_elements( # noqa: C901
self,
*,
@@ -162,6 +233,10 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
depth: int,
doc: DoclingDocument,
visited: Set[marko.element.Element],
+ creation_stack: list[
+ _CreationPayload
+ ], # stack for lazy item creation triggered deep in marko's AST (on RawText)
+ list_ordered_flag_by_ref: dict[str, bool],
parent_item: Optional[NodeItem] = None,
formatting: Optional[Formatting] = None,
hyperlink: Optional[Union[AnyUrl, Path]] = None,
@@ -177,28 +252,17 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore
)
- if len(element.children) == 1:
- child = element.children[0]
- snippet_text = str(child.children) # type: ignore
- visited.add(child)
- else:
- snippet_text = "" # inline group will be created
-
- if element.level == 1:
- parent_item = doc.add_title(
- text=snippet_text,
- parent=parent_item,
+ if len(element.children) > 1: # inline group will be created further down
+ parent_item = self._create_heading_item(
+ doc=doc,
+ parent_item=parent_item,
+ text="",
+ level=element.level,
formatting=formatting,
hyperlink=hyperlink,
)
else:
- parent_item = doc.add_heading(
- text=snippet_text,
- level=element.level - 1,
- parent=parent_item,
- formatting=formatting,
- hyperlink=hyperlink,
- )
+ creation_stack.append(_HeadingCreationPayload(level=element.level))
elif isinstance(element, marko.block.List):
has_non_empty_list_items = False
@@ -210,10 +274,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self._close_table(doc)
_log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
if has_non_empty_list_items:
- label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
- parent_item = doc.add_group(
- label=label, name="list", parent=parent_item
- )
+ parent_item = doc.add_list_group(name="list", parent=parent_item)
+ list_ordered_flag_by_ref[parent_item.self_ref] = element.ordered
elif (
isinstance(element, marko.block.ListItem)
@@ -224,22 +286,22 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self._close_table(doc)
_log.debug(" - List item")
- if len(child.children) == 1:
- snippet_text = str(child.children[0].children) # type: ignore
- visited.add(child)
- else:
- snippet_text = "" # inline group will be created
- is_numbered = isinstance(parent_item, OrderedList)
- if not isinstance(parent_item, (OrderedList, UnorderedList)):
- _log.warning("ListItem would have not had a list parent, adding one.")
- parent_item = doc.add_unordered_list(parent=parent_item)
- parent_item = doc.add_list_item(
- enumerated=is_numbered,
- parent=parent_item,
- text=snippet_text,
- formatting=formatting,
- hyperlink=hyperlink,
+ enumerated = (
+ list_ordered_flag_by_ref.get(parent_item.self_ref, False)
+ if parent_item
+ else False
)
+ if len(child.children) > 1: # inline group will be created further down
+ parent_item = self._create_list_item(
+ doc=doc,
+ parent_item=parent_item,
+ text="",
+ enumerated=enumerated,
+ formatting=formatting,
+ hyperlink=hyperlink,
+ )
+ else:
+ creation_stack.append(_ListItemCreationPayload(enumerated=enumerated))
elif isinstance(element, marko.inline.Image):
self._close_table(doc)
@@ -276,7 +338,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
_log.debug(f" - Paragraph (raw text): {element.children}")
snippet_text = element.children.strip()
# Detect start of the table:
- if "|" in snippet_text:
+ if "|" in snippet_text or self.in_table:
# most likely part of the markdown table
self.in_table = True
if len(self.md_table_buffer) > 0:
@@ -285,13 +347,46 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
self.md_table_buffer.append(snippet_text)
elif snippet_text:
self._close_table(doc)
- doc.add_text(
- label=DocItemLabel.TEXT,
- parent=parent_item,
- text=snippet_text,
- formatting=formatting,
- hyperlink=hyperlink,
- )
+
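+ # flush deferred heading / list-item creations now that their text
+ # snippet is known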
+ if creation_stack:
+ while creation_stack:
+ to_create = creation_stack.pop()
+ if isinstance(to_create, _ListItemCreationPayload):
+ enumerated = (
+ list_ordered_flag_by_ref.get(
+ parent_item.self_ref, False
+ )
+ if parent_item
+ else False
+ )
+ parent_item = self._create_list_item(
+ doc=doc,
+ parent_item=parent_item,
+ text=snippet_text,
+ enumerated=enumerated,
+ formatting=formatting,
+ hyperlink=hyperlink,
+ )
+ elif isinstance(to_create, _HeadingCreationPayload):
+ # not kept as parent_item: the logic for tracking section
+ # components as heading children is not implemented yet
+ # (marko does not capture them as children of the heading)
+ self._create_heading_item(
+ doc=doc,
+ parent_item=parent_item,
+ text=snippet_text,
+ level=to_create.level,
+ formatting=formatting,
+ hyperlink=hyperlink,
+ )
+ else:
+ doc.add_text(
+ label=DocItemLabel.TEXT,
+ parent=parent_item,
+ text=snippet_text,
+ formatting=formatting,
+ hyperlink=hyperlink,
+ )
elif isinstance(element, marko.inline.CodeSpan):
self._close_table(doc)
@@ -353,7 +448,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
parent_item = doc.add_inline_group(parent=parent_item)
processed_block_types = (
- # marko.block.Heading,
marko.block.CodeBlock,
marko.block.FencedCode,
marko.inline.RawText,
@@ -369,6 +463,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
depth=depth + 1,
doc=doc,
visited=visited,
+ creation_stack=creation_stack,
+ list_ordered_flag_by_ref=list_ordered_flag_by_ref,
parent_item=parent_item,
formatting=formatting,
hyperlink=hyperlink,
@@ -412,6 +508,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
doc=doc,
parent_item=None,
visited=set(),
+ creation_stack=[],
+ list_ordered_flag_by_ref={},
)
self._close_table(doc=doc) # handle any last hanging table
diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py
index 63aa9e93..f512fb7e 100644
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -121,7 +121,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
return prov
- def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
+ def handle_text_elements(
+ self, shape, parent_slide, slide_ind, doc: DoclingDocument, slide_size
+ ):
is_list_group_created = False
enum_list_item_value = 0
new_list = None
@@ -165,10 +167,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
enumerated = bullet_type == "Numbered"
if not is_list_group_created:
- new_list = doc.add_group(
- label=GroupLabel.ORDERED_LIST
- if enumerated
- else GroupLabel.LIST,
+ new_list = doc.add_list_group(
name="list",
parent=parent_slide,
)
diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py
index 8386082a..abbcc6f6 100644
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -10,11 +10,12 @@ from docling_core.types.doc import (
DocumentOrigin,
GroupLabel,
ImageRef,
+ ListGroup,
NodeItem,
TableCell,
TableData,
)
-from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
+from docling_core.types.doc.document import Formatting
from docx import Document
from docx.document import Document as DocxDocument
from docx.oxml.table import CT_Tc
@@ -397,7 +398,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if isinstance(c, Hyperlink):
text = c.text
hyperlink = Path(c.address)
- format = self._get_format_from_run(c.runs[0])
+ format = (
+ self._get_format_from_run(c.runs[0])
+ if c.runs and len(c.runs) > 0
+ else None
+ )
elif isinstance(c, Run):
text = c.text
hyperlink = None
@@ -684,7 +689,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
paragraph_elements: list,
) -> Optional[NodeItem]:
return (
- doc.add_group(label=GroupLabel.INLINE, parent=prev_parent)
+ doc.add_inline_group(parent=prev_parent)
if len(paragraph_elements) > 1
else prev_parent
)
@@ -777,9 +782,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
else:
# Inline equation
level = self._get_level()
- inline_equation = doc.add_group(
- label=GroupLabel.INLINE, parent=self.parents[level - 1]
- )
+ inline_equation = doc.add_inline_group(parent=self.parents[level - 1])
text_tmp = text
for eq in equations:
if len(text_tmp) == 0:
@@ -927,18 +930,22 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
level: int,
) -> None:
# This should not happen by construction
- if not isinstance(self.parents[level], (OrderedList, UnorderedList)):
+ if not isinstance(self.parents[level], ListGroup):
return
+ if not elements:
+ return
+
if len(elements) == 1:
text, format, hyperlink = elements[0]
- doc.add_list_item(
- marker=marker,
- enumerated=enumerated,
- parent=self.parents[level],
- text=text,
- formatting=format,
- hyperlink=hyperlink,
- )
+ if text:
+ doc.add_list_item(
+ marker=marker,
+ enumerated=enumerated,
+ parent=self.parents[level],
+ text=text,
+ formatting=format,
+ hyperlink=hyperlink,
+ )
else:
new_item = doc.add_list_item(
marker=marker,
@@ -946,15 +953,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
parent=self.parents[level],
text="",
)
- new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
+ new_parent = doc.add_inline_group(parent=new_item)
for text, format, hyperlink in elements:
- doc.add_text(
- label=DocItemLabel.TEXT,
- parent=new_parent,
- text=text,
- formatting=format,
- hyperlink=hyperlink,
- )
+ if text:
+ doc.add_text(
+ label=DocItemLabel.TEXT,
+ parent=new_parent,
+ text=text,
+ formatting=format,
+ hyperlink=hyperlink,
+ )
def _add_list_item(
self,
@@ -975,8 +983,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if self._prev_numid() is None: # Open new list
self.level_at_new_list = level
- self.parents[level] = doc.add_group(
- label=GroupLabel.LIST, name="list", parent=self.parents[level - 1]
+ self.parents[level] = doc.add_list_group(
+ name="list", parent=self.parents[level - 1]
)
# Set marker and enumerated arguments if this is an enumeration element.
@@ -997,19 +1005,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.level_at_new_list + prev_indent + 1,
self.level_at_new_list + ilevel + 1,
):
- # Determine if this is an unordered list or an ordered list.
- # Set GroupLabel.ORDERED_LIST when it fits.
self.listIter = 0
- if is_numbered:
- self.parents[i] = doc.add_group(
- label=GroupLabel.ORDERED_LIST,
- name="list",
- parent=self.parents[i - 1],
- )
- else:
- self.parents[i] = doc.add_group(
- label=GroupLabel.LIST, name="list", parent=self.parents[i - 1]
- )
+ self.parents[i] = doc.add_list_group(
+ name="list", parent=self.parents[i - 1]
+ )
# TODO: Set marker and enumerated arguments if this is an enumeration element.
self.listIter += 1
diff --git a/docling/backend/noop_backend.py b/docling/backend/noop_backend.py
new file mode 100644
index 00000000..4974ba07
--- /dev/null
+++ b/docling/backend/noop_backend.py
@@ -0,0 +1,51 @@
+import logging
+from io import BytesIO
+from pathlib import Path
+from typing import Set, Union
+
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class NoOpBackend(AbstractDocumentBackend):
+ """
+ A no-op backend that only validates input existence.
+ Used e.g. for audio files where actual processing is handled by the ASR pipeline.
+ """
+
+ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+ super().__init__(in_doc, path_or_stream)
+
+ _log.debug(f"NoOpBackend initialized for: {path_or_stream}")
+
+ # Validate input
+ try:
+ if isinstance(self.path_or_stream, BytesIO):
+ # Check if stream has content
+ self.valid = len(self.path_or_stream.getvalue()) > 0
+ _log.debug(
+ f"BytesIO stream length: {len(self.path_or_stream.getvalue())}"
+ )
+ elif isinstance(self.path_or_stream, Path):
+ # Check if file exists
+ self.valid = self.path_or_stream.exists()
+ _log.debug(f"File exists: {self.valid}")
+ else:
+ self.valid = False
+ except Exception as e:
+ _log.error(f"NoOpBackend validation failed: {e}")
+ self.valid = False
+
+ def is_valid(self) -> bool:
+ return self.valid
+
+ @classmethod
+ def supports_pagination(cls) -> bool:
+ return False
+
+ @classmethod
+ def supported_formats(cls) -> Set[InputFormat]:
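+ # deliberately permissive: the selected pipeline decides what it can
+ # handle, this backend only validates that the input exists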
+ return set(InputFormat)
diff --git a/docling/cli/main.py b/docling/cli/main.py
index 083f53b2..ae275ea9 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -29,6 +29,15 @@ from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBacke
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.asr_model_specs import (
+ WHISPER_BASE,
+ WHISPER_LARGE,
+ WHISPER_MEDIUM,
+ WHISPER_SMALL,
+ WHISPER_TINY,
+ WHISPER_TURBO,
+ AsrModelType,
+)
from docling.datamodel.base_models import (
ConversionStatus,
FormatToExtensions,
@@ -37,12 +46,14 @@ from docling.datamodel.base_models import (
)
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
+ AsrPipelineOptions,
EasyOcrOptions,
OcrOptions,
PaginatedPipelineOptions,
PdfBackend,
- PdfPipeline,
PdfPipelineOptions,
+ PipelineOptions,
+ ProcessingPipeline,
TableFormerMode,
VlmPipelineOptions,
)
@@ -54,8 +65,14 @@ from docling.datamodel.vlm_model_specs import (
SMOLDOCLING_TRANSFORMERS,
VlmModelType,
)
-from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
+from docling.document_converter import (
+ AudioFormatOption,
+ DocumentConverter,
+ FormatOption,
+ PdfFormatOption,
+)
from docling.models.factories import get_ocr_factory
+from docling.pipeline.asr_pipeline import AsrPipeline
from docling.pipeline.vlm_pipeline import VlmPipeline
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
@@ -296,13 +313,17 @@ def convert( # noqa: C901
),
] = ImageRefMode.EMBEDDED,
pipeline: Annotated[
- PdfPipeline,
+ ProcessingPipeline,
typer.Option(..., help="Choose the pipeline to process PDF or image files."),
- ] = PdfPipeline.STANDARD,
+ ] = ProcessingPipeline.STANDARD,
vlm_model: Annotated[
VlmModelType,
typer.Option(..., help="Choose the VLM model to use with PDF or image files."),
] = VlmModelType.SMOLDOCLING,
+ asr_model: Annotated[
+ AsrModelType,
+ typer.Option(..., help="Choose the ASR model to use with audio/video files."),
+ ] = AsrModelType.WHISPER_TINY,
ocr: Annotated[
bool,
typer.Option(
@@ -450,12 +471,14 @@ def convert( # noqa: C901
),
] = None,
):
+ log_format = "%(asctime)s\t%(levelname)s\t%(name)s: %(message)s"
+
if verbose == 0:
- logging.basicConfig(level=logging.WARNING)
+ logging.basicConfig(level=logging.WARNING, format=log_format)
elif verbose == 1:
- logging.basicConfig(level=logging.INFO)
+ logging.basicConfig(level=logging.INFO, format=log_format)
else:
- logging.basicConfig(level=logging.DEBUG)
+ logging.basicConfig(level=logging.DEBUG, format=log_format)
settings.debug.visualize_cells = debug_visualize_cells
settings.debug.visualize_layout = debug_visualize_layout
@@ -530,9 +553,12 @@ def convert( # noqa: C901
ocr_options.lang = ocr_lang_list
accelerator_options = AcceleratorOptions(num_threads=num_threads, device=device)
- pipeline_options: PaginatedPipelineOptions
+ pipeline_options: PipelineOptions
- if pipeline == PdfPipeline.STANDARD:
+ format_options: Dict[InputFormat, FormatOption] = {}
+
+ if pipeline == ProcessingPipeline.STANDARD:
pipeline_options = PdfPipelineOptions(
allow_external_plugins=allow_external_plugins,
enable_remote_services=enable_remote_services,
@@ -574,7 +600,13 @@ def convert( # noqa: C901
pipeline_options=pipeline_options,
backend=backend, # pdf_backend
)
- elif pipeline == PdfPipeline.VLM:
+
+ format_options = {
+ InputFormat.PDF: pdf_format_option,
+ InputFormat.IMAGE: pdf_format_option,
+ }
+
+ elif pipeline == ProcessingPipeline.VLM:
pipeline_options = VlmPipelineOptions(
enable_remote_services=enable_remote_services,
)
@@ -600,13 +632,48 @@ def convert( # noqa: C901
pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
)
+ format_options = {
+ InputFormat.PDF: pdf_format_option,
+ InputFormat.IMAGE: pdf_format_option,
+ }
+
+ elif pipeline == ProcessingPipeline.ASR:
+ pipeline_options = AsrPipelineOptions()
+
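+ # map the CLI enum to the corresponding predefined Whisper configuration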
+ if asr_model == AsrModelType.WHISPER_TINY:
+ pipeline_options.asr_options = WHISPER_TINY
+ elif asr_model == AsrModelType.WHISPER_SMALL:
+ pipeline_options.asr_options = WHISPER_SMALL
+ elif asr_model == AsrModelType.WHISPER_MEDIUM:
+ pipeline_options.asr_options = WHISPER_MEDIUM
+ elif asr_model == AsrModelType.WHISPER_BASE:
+ pipeline_options.asr_options = WHISPER_BASE
+ elif asr_model == AsrModelType.WHISPER_LARGE:
+ pipeline_options.asr_options = WHISPER_LARGE
+ elif asr_model == AsrModelType.WHISPER_TURBO:
+ pipeline_options.asr_options = WHISPER_TURBO
+ else:
+ _log.error(f"{asr_model} is not known")
+ raise ValueError(f"{asr_model} is not known")
+
+ _log.info(f"pipeline_options: {pipeline_options}")
+
+ audio_format_option = AudioFormatOption(
+ pipeline_cls=AsrPipeline,
+ pipeline_options=pipeline_options,
+ )
+
+ format_options = {
+ InputFormat.AUDIO: audio_format_option,
+ }
+
if artifacts_path is not None:
pipeline_options.artifacts_path = artifacts_path
- format_options: Dict[InputFormat, FormatOption] = {
- InputFormat.PDF: pdf_format_option,
- InputFormat.IMAGE: pdf_format_option,
- }
doc_converter = DocumentConverter(
allowed_formats=from_formats,
format_options=format_options,
@@ -614,6 +681,7 @@ def convert( # noqa: C901
start_time = time.time()
+ _log.info(f"paths: {input_doc_paths}")
conv_results = doc_converter.convert_all(
input_doc_paths, headers=parsed_headers, raises_on_error=abort_on_error
)
diff --git a/docling/datamodel/asr_model_specs.py b/docling/datamodel/asr_model_specs.py
new file mode 100644
index 00000000..95287ad2
--- /dev/null
+++ b/docling/datamodel/asr_model_specs.py
@@ -0,0 +1,92 @@
+import logging
+from enum import Enum
+
+from pydantic import (
+ AnyUrl,
+)
+
+from docling.datamodel.accelerator_options import AcceleratorDevice
+from docling.datamodel.pipeline_options_asr_model import (
+ # AsrResponseFormat,
+ # ApiAsrOptions,
+ InferenceAsrFramework,
+ InlineAsrNativeWhisperOptions,
+ TransformersModelType,
+)
+
+_log = logging.getLogger(__name__)
+
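+# Predefined native Whisper configurations: all variants share the same
+# decoding defaults and differ only in the model size selected via repo_id.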
+WHISPER_TINY = InlineAsrNativeWhisperOptions(
+ repo_id="tiny",
+ inference_framework=InferenceAsrFramework.WHISPER,
+ verbose=True,
+ timestamps=True,
+ word_timestamps=True,
+ temperature=0.0,
+ max_new_tokens=256,
+ max_time_chunk=30.0,
+)
+
+WHISPER_SMALL = InlineAsrNativeWhisperOptions(
+ repo_id="small",
+ inference_framework=InferenceAsrFramework.WHISPER,
+ verbose=True,
+ timestamps=True,
+ word_timestamps=True,
+ temperature=0.0,
+ max_new_tokens=256,
+ max_time_chunk=30.0,
+)
+
+WHISPER_MEDIUM = InlineAsrNativeWhisperOptions(
+ repo_id="medium",
+ inference_framework=InferenceAsrFramework.WHISPER,
+ verbose=True,
+ timestamps=True,
+ word_timestamps=True,
+ temperature=0.0,
+ max_new_tokens=256,
+ max_time_chunk=30.0,
+)
+
+WHISPER_BASE = InlineAsrNativeWhisperOptions(
+ repo_id="base",
+ inference_framework=InferenceAsrFramework.WHISPER,
+ verbose=True,
+ timestamps=True,
+ word_timestamps=True,
+ temperature=0.0,
+ max_new_tokens=256,
+ max_time_chunk=30.0,
+)
+
+WHISPER_LARGE = InlineAsrNativeWhisperOptions(
+ repo_id="large",
+ inference_framework=InferenceAsrFramework.WHISPER,
+ verbose=True,
+ timestamps=True,
+ word_timestamps=True,
+ temperature=0.0,
+ max_new_tokens=256,
+ max_time_chunk=30.0,
+)
+
+WHISPER_TURBO = InlineAsrNativeWhisperOptions(
+ repo_id="turbo",
+ inference_framework=InferenceAsrFramework.WHISPER,
+ verbose=True,
+ timestamps=True,
+ word_timestamps=True,
+ temperature=0.0,
+ max_new_tokens=256,
+ max_time_chunk=30.0,
+)
+
+
+class AsrModelType(str, Enum):
+ WHISPER_TINY = "whisper_tiny"
+ WHISPER_SMALL = "whisper_small"
+ WHISPER_MEDIUM = "whisper_medium"
+ WHISPER_BASE = "whisper_base"
+ WHISPER_LARGE = "whisper_large"
+ WHISPER_TURBO = "whisper_turbo"
diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
index c1fdb033..d9a829ed 100644
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -49,6 +49,7 @@ class InputFormat(str, Enum):
XML_USPTO = "xml_uspto"
XML_JATS = "xml_jats"
JSON_DOCLING = "json_docling"
+ AUDIO = "audio"
class OutputFormat(str, Enum):
@@ -73,6 +74,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.XLSX: ["xlsx", "xlsm"],
InputFormat.XML_USPTO: ["xml", "txt"],
InputFormat.JSON_DOCLING: ["json"],
+ InputFormat.AUDIO: ["wav", "mp3"],
}
FormatToMimeType: Dict[InputFormat, List[str]] = {
@@ -104,6 +106,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
],
InputFormat.XML_USPTO: ["application/xml", "text/plain"],
InputFormat.JSON_DOCLING: ["application/json"],
+ InputFormat.AUDIO: ["audio/x-wav", "audio/mpeg", "audio/wav", "audio/mp3"],
}
MimeTypeToFormat: dict[str, list[InputFormat]] = {
@@ -298,7 +301,7 @@ class OpenAiChatMessage(BaseModel):
class OpenAiResponseChoice(BaseModel):
index: int
message: OpenAiChatMessage
- finish_reason: str
+ finish_reason: Optional[str]
class OpenAiResponseUsage(BaseModel):
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index 4c71f5c8..9f5cf82c 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -249,7 +249,7 @@ class _DocumentConversionInput(BaseModel):
backend: Type[AbstractDocumentBackend]
if format not in format_options.keys():
_log.error(
- f"Input document {obj.name} does not match any allowed format."
+ f"Input document {obj.name} with format {format} does not match any allowed format: ({format_options.keys()})"
)
backend = _DummyBackend
else:
@@ -318,6 +318,8 @@ class _DocumentConversionInput(BaseModel):
mime = mime or _DocumentConversionInput._detect_csv(content)
mime = mime or "text/plain"
formats = MimeTypeToFormat.get(mime, [])
+ _log.info(f"detected formats: {formats}")
+
if formats:
if len(formats) == 1 and mime not in ("text/plain"):
return formats[0]
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index 5995f453..11e085b7 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -11,8 +11,13 @@ from pydantic import (
)
from typing_extensions import deprecated
+from docling.datamodel import asr_model_specs
+
# Import the following for backwards compatibility
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
+from docling.datamodel.pipeline_options_asr_model import (
+ InlineAsrOptions,
+)
from docling.datamodel.pipeline_options_vlm_model import (
ApiVlmOptions,
InferenceFramework,
@@ -202,7 +207,7 @@ smolvlm_picture_description = PictureDescriptionVlmOptions(
# GraniteVision
granite_picture_description = PictureDescriptionVlmOptions(
- repo_id="ibm-granite/granite-vision-3.1-2b-preview",
+ repo_id="ibm-granite/granite-vision-3.2-2b-preview",
prompt="What is shown in this image?",
)
@@ -260,6 +265,11 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
)
+class AsrPipelineOptions(PipelineOptions):
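+ # only inline (locally executed) ASR models are supported for now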
+ asr_options: InlineAsrOptions = asr_model_specs.WHISPER_TINY
+ artifacts_path: Optional[Union[Path, str]] = None
+
+
class PdfPipelineOptions(PaginatedPipelineOptions):
"""Options for the PDF pipeline."""
@@ -297,6 +307,7 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
)
-class PdfPipeline(str, Enum):
+class ProcessingPipeline(str, Enum):
STANDARD = "standard"
VLM = "vlm"
+ ASR = "asr"
diff --git a/docling/datamodel/pipeline_options_asr_model.py b/docling/datamodel/pipeline_options_asr_model.py
new file mode 100644
index 00000000..20e2e453
--- /dev/null
+++ b/docling/datamodel/pipeline_options_asr_model.py
@@ -0,0 +1,57 @@
+from enum import Enum
+from typing import Any, Dict, List, Literal, Optional, Union
+
+from pydantic import AnyUrl, BaseModel
+from typing_extensions import deprecated
+
+from docling.datamodel.accelerator_options import AcceleratorDevice
+from docling.datamodel.pipeline_options_vlm_model import (
+ # InferenceFramework,
+ TransformersModelType,
+)
+
+
+class BaseAsrOptions(BaseModel):
+ kind: str
+ # prompt: str
+
+
+class InferenceAsrFramework(str, Enum):
+ # MLX = "mlx" # disabled for now
+ # TRANSFORMERS = "transformers" # disabled for now
+ WHISPER = "whisper"
+
+
+class InlineAsrOptions(BaseAsrOptions):
+ kind: Literal["inline_model_options"] = "inline_model_options"
+
+ repo_id: str
+
+ verbose: bool = False
+ timestamps: bool = True
+
+ temperature: float = 0.0
+ max_new_tokens: int = 256
+ max_time_chunk: float = 30.0
+
+ torch_dtype: Optional[str] = None
+ supported_devices: List[AcceleratorDevice] = [
+ AcceleratorDevice.CPU,
+ AcceleratorDevice.CUDA,
+ AcceleratorDevice.MPS,
+ ]
+
+ @property
+ def repo_cache_folder(self) -> str:
+ return self.repo_id.replace("/", "--")
+
+
+class InlineAsrNativeWhisperOptions(InlineAsrOptions):
+ inference_framework: InferenceAsrFramework = InferenceAsrFramework.WHISPER
+
+ language: str = "en"
+ supported_devices: List[AcceleratorDevice] = [
+ AcceleratorDevice.CPU,
+ AcceleratorDevice.CUDA,
+ ]
+ word_timestamps: bool = True
diff --git a/docling/document_converter.py b/docling/document_converter.py
index e553c083..1a0a9d75 100644
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -19,6 +19,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
+from docling.backend.noop_backend import NoOpBackend
from docling.backend.xml.jats_backend import JatsDocumentBackend
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend
from docling.datamodel.base_models import (
@@ -41,6 +42,7 @@ from docling.datamodel.settings import (
settings,
)
from docling.exceptions import ConversionError
+from docling.pipeline.asr_pipeline import AsrPipeline
from docling.pipeline.base_pipeline import BasePipeline
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
@@ -118,6 +120,11 @@ class PdfFormatOption(FormatOption):
backend: Type[AbstractDocumentBackend] = DoclingParseV4DocumentBackend
+class AudioFormatOption(FormatOption):
+ pipeline_cls: Type = AsrPipeline
+ backend: Type[AbstractDocumentBackend] = NoOpBackend
+
+
def _get_default_option(format: InputFormat) -> FormatOption:
format_to_default_options = {
InputFormat.CSV: FormatOption(
@@ -156,6 +163,7 @@ def _get_default_option(format: InputFormat) -> FormatOption:
InputFormat.JSON_DOCLING: FormatOption(
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
),
+ InputFormat.AUDIO: FormatOption(pipeline_cls=AsrPipeline, backend=NoOpBackend),
}
if (options := format_to_default_options.get(format)) is not None:
return options
diff --git a/docling/pipeline/asr_pipeline.py b/docling/pipeline/asr_pipeline.py
new file mode 100644
index 00000000..94fa6341
--- /dev/null
+++ b/docling/pipeline/asr_pipeline.py
@@ -0,0 +1,253 @@
+import logging
+import os
+import re
+from io import BytesIO
+from pathlib import Path
+from typing import List, Optional, Union, cast
+
+from docling_core.types.doc import DoclingDocument, DocumentOrigin
+
+from docling_core.types.doc.labels import DocItemLabel
+from pydantic import BaseModel, Field
+
+from docling.backend.abstract_backend import AbstractDocumentBackend
+from docling.backend.noop_backend import NoOpBackend
+
+from docling.datamodel.accelerator_options import (
+ AcceleratorOptions,
+)
+from docling.datamodel.base_models import (
+ ConversionStatus,
+ FormatToMimeType,
+)
+from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.pipeline_options import (
+ AsrPipelineOptions,
+)
+from docling.datamodel.pipeline_options_asr_model import (
+ InlineAsrNativeWhisperOptions,
+ # AsrResponseFormat,
+ InlineAsrOptions,
+)
+from docling.datamodel.pipeline_options_vlm_model import (
+ InferenceFramework,
+)
+from docling.datamodel.settings import settings
+from docling.pipeline.base_pipeline import BasePipeline
+from docling.utils.accelerator_utils import decide_device
+from docling.utils.profiling import ProfilingScope, TimeRecorder
+
+_log = logging.getLogger(__name__)
+
+
+class _ConversationWord(BaseModel):
+ text: str
+ start_time: Optional[float] = Field(
+ None, description="Start time in seconds from video start"
+ )
+ end_time: Optional[float] = Field(
+ None, ge=0, description="End time in seconds from video start"
+ )
+
+
+class _ConversationItem(BaseModel):
+ text: str
+ start_time: Optional[float] = Field(
+ None, description="Start time in seconds from video start"
+ )
+ end_time: Optional[float] = Field(
+ None, ge=0, description="End time in seconds from video start"
+ )
+ speaker_id: Optional[int] = Field(None, description="Numeric speaker identifier")
+ speaker: Optional[str] = Field(
+ None, description="Speaker name, defaults to speaker-{speaker_id}"
+ )
+ words: Optional[list[_ConversationWord]] = Field(
+ None, description="Individual words with time-stamps"
+ )
+
+ def __lt__(self, other):
+ if not isinstance(other, _ConversationItem):
+ return NotImplemented
+ return self.start_time < other.start_time
+
+ def __eq__(self, other):
+ if not isinstance(other, _ConversationItem):
+ return NotImplemented
+ return self.start_time == other.start_time
+
+ def to_string(self) -> str:
+ """Format the conversation entry as a string"""
+ result = ""
+ if (self.start_time is not None) and (self.end_time is not None):
+ result += f"[time: {self.start_time}-{self.end_time}] "
+
+ if self.speaker is not None:
+ result += f"[speaker:{self.speaker}] "
+
+ result += self.text
+ return result
+
+
+class _NativeWhisperModel:
+ def __init__(
+ self,
+ enabled: bool,
+ artifacts_path: Optional[Path],
+ accelerator_options: AcceleratorOptions,
+ asr_options: InlineAsrNativeWhisperOptions,
+ ):
+ """
+ Transcriber using native Whisper.
+ """
+ self.enabled = enabled
+
+ _log.info(f"artifacts-path: {artifacts_path}")
+ _log.info(f"accelerator_options: {accelerator_options}")
+
+ if self.enabled:
+ try:
+ import whisper # type: ignore
+ except ImportError:
+ raise ImportError(
+ "whisper is not installed. Please install it via `pip install openai-whisper` or do `uv sync --extra asr`."
+ )
+ self.asr_options = asr_options
+ self.max_tokens = asr_options.max_new_tokens
+ self.temperature = asr_options.temperature
+
+ self.device = decide_device(
+ accelerator_options.device,
+ supported_devices=asr_options.supported_devices,
+ )
+ _log.info(f"Available device for Whisper: {self.device}")
+
+ self.model_name = asr_options.repo_id
+ _log.info(f"loading _NativeWhisperModel({self.model_name})")
+ if artifacts_path is not None:
+ _log.info(f"loading {self.model_name} from {artifacts_path}")
+ self.model = whisper.load_model(
+ name=self.model_name,
+ device=self.device,
+ download_root=str(artifacts_path),
+ )
+ else:
+ self.model = whisper.load_model(
+ name=self.model_name, device=self.device
+ )
+
+ self.verbose = asr_options.verbose
+ self.timestamps = asr_options.timestamps
+ self.word_timestamps = asr_options.word_timestamps
+
+ def run(self, conv_res: ConversionResult) -> ConversionResult:
+ audio_path: Path = Path(conv_res.input.file).resolve()
+
+ try:
+ conversation = self.transcribe(audio_path)
+
+ # Ensure we have a proper DoclingDocument
+ origin = DocumentOrigin(
+ filename=conv_res.input.file.name or "audio.wav",
+ mimetype="audio/x-wav",
+ binary_hash=conv_res.input.document_hash,
+ )
+ conv_res.document = DoclingDocument(
+ name=conv_res.input.file.stem or "audio.wav", origin=origin
+ )
+
+ for citem in conversation:
+ conv_res.document.add_text(
+ label=DocItemLabel.TEXT, text=citem.to_string()
+ )
+
+ conv_res.status = ConversionStatus.SUCCESS
+ return conv_res
+
+ except Exception as exc:
+ _log.error(f"Audio tranciption has an error: {exc}")
+
+ conv_res.status = ConversionStatus.FAILURE
+ return conv_res
+
+ def transcribe(self, fpath: Path) -> list[_ConversationItem]:
+ result = self.model.transcribe(
+ str(fpath), verbose=self.verbose, word_timestamps=self.word_timestamps
+ )
+
+ convo: list[_ConversationItem] = []
+ for _ in result["segments"]:
+ item = _ConversationItem(
+ start_time=_["start"], end_time=_["end"], text=_["text"], words=[]
+ )
+ if "words" in _ and self.word_timestamps:
+ item.words = []
+ for __ in _["words"]:
+ item.words.append(
+ _ConversationWord(
+ start_time=__["start"],
+ end_time=__["end"],
+ text=__["word"],
+ )
+ )
+ convo.append(item)
+
+ return convo
+
+
+class AsrPipeline(BasePipeline):
+ def __init__(self, pipeline_options: AsrPipelineOptions):
+ super().__init__(pipeline_options)
+ self.keep_backend = True
+
+ self.pipeline_options: AsrPipelineOptions = pipeline_options
+
+ artifacts_path: Optional[Path] = None
+ if pipeline_options.artifacts_path is not None:
+ artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
+ elif settings.artifacts_path is not None:
+ artifacts_path = Path(settings.artifacts_path).expanduser()
+
+ if artifacts_path is not None and not artifacts_path.is_dir():
+ raise RuntimeError(
+ f"The value of {artifacts_path=} is not valid. "
+ "When defined, it must point to a folder containing all models required by the pipeline."
+ )
+
+ if isinstance(self.pipeline_options.asr_options, InlineAsrNativeWhisperOptions):
+ asr_options: InlineAsrNativeWhisperOptions = (
+ self.pipeline_options.asr_options
+ )
+ self._model = _NativeWhisperModel(
+ enabled=True, # must always be enabled for this pipeline to make sense.
+ artifacts_path=artifacts_path,
+ accelerator_options=pipeline_options.accelerator_options,
+ asr_options=asr_options,
+ )
+ else:
+ _log.error(f"No model support for {self.pipeline_options.asr_options}")
+ raise ValueError(
+ f"No model support for ASR options type {type(self.pipeline_options.asr_options).__name__}"
+ )
+
+ def _determine_status(self, conv_res: ConversionResult) -> ConversionStatus:
+ # keep the status set during _build_document (SUCCESS or FAILURE)
+ return conv_res.status
+
+ @classmethod
+ def get_default_options(cls) -> AsrPipelineOptions:
+ return AsrPipelineOptions()
+
+ def _build_document(self, conv_res: ConversionResult) -> ConversionResult:
+ _log.info(f"start _build_document in AsrPipeline: {conv_res.input.file}")
+ with TimeRecorder(conv_res, "doc_build", scope=ProfilingScope.DOCUMENT):
+ self._model.run(conv_res=conv_res)
+
+ return conv_res
+
+ @classmethod
+ def is_backend_supported(cls, backend: AbstractDocumentBackend):
+ return isinstance(backend, NoOpBackend)
diff --git a/docs/examples/minimal_asr_pipeline.py b/docs/examples/minimal_asr_pipeline.py
new file mode 100644
index 00000000..72c12769
--- /dev/null
+++ b/docs/examples/minimal_asr_pipeline.py
@@ -0,0 +1,56 @@
+from pathlib import Path
+
+from docling_core.types.doc import DoclingDocument
+
+from docling.datamodel import asr_model_specs
+from docling.datamodel.base_models import ConversionStatus, InputFormat
+from docling.datamodel.document import ConversionResult
+from docling.datamodel.pipeline_options import AsrPipelineOptions
+from docling.document_converter import AudioFormatOption, DocumentConverter
+from docling.pipeline.asr_pipeline import AsrPipeline
+
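+# Note: Whisper relies on ffmpeg to decode WAV/MP3 input, so ffmpeg must be
+# available on the system (the CI workflow in this change installs it).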
+
+def get_asr_converter():
+ """Create a DocumentConverter configured for ASR with whisper_turbo model."""
+ pipeline_options = AsrPipelineOptions()
+ pipeline_options.asr_options = asr_model_specs.WHISPER_TURBO
+
+ converter = DocumentConverter(
+ format_options={
+ InputFormat.AUDIO: AudioFormatOption(
+ pipeline_cls=AsrPipeline,
+ pipeline_options=pipeline_options,
+ )
+ }
+ )
+ return converter
+
+
+def asr_pipeline_conversion(audio_path: Path) -> DoclingDocument:
+ """ASR pipeline conversion using whisper_turbo"""
+ # Check if the test audio file exists
+ assert audio_path.exists(), f"Test audio file not found: {audio_path}"
+
+ converter = get_asr_converter()
+
+ # Convert the audio file
+ result: ConversionResult = converter.convert(audio_path)
+
+ # Verify conversion was successful
+ assert result.status == ConversionStatus.SUCCESS, (
+ f"Conversion failed with status: {result.status}"
+ )
+ return result.document
+
+
+if __name__ == "__main__":
+ audio_path = Path("tests/data/audio/sample_10s.mp3")
+
+ doc = asr_pipeline_conversion(audio_path=audio_path)
+ print(doc.export_to_markdown())
+
+ # Expected output:
+ #
+ # [time: 0.0-4.0] Shakespeare on Scenery by Oscar Wilde
+ #
+ # [time: 5.28-9.96] This is a LibriVox recording. All LibriVox recordings are in the public domain.
diff --git a/docs/index.md b/docs/index.md
index ad9ac80e..7ec40bfa 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -20,14 +20,15 @@ Docling simplifies document processing, parsing diverse formats — including ad
## Features
-* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more
+* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, PPTX, XLSX, HTML, WAV, MP3, images (PNG, TIFF, JPEG, ...), and more
* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more
* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format
-* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON
+* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, [DocTags](https://arxiv.org/abs/2503.11576) and lossless JSON
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
* 🔍 Extensive OCR support for scanned PDFs and images
-* 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🔥
+* 👓 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
+* 🎙️ Support for audio with Automatic Speech Recognition (ASR) models
* 💻 Simple and convenient CLI
### Coming soon
diff --git a/mkdocs.yml b/mkdocs.yml
index 4f82c19e..d1c67532 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -80,6 +80,7 @@ nav:
- "VLM pipeline with SmolDocling": examples/minimal_vlm_pipeline.py
- "VLM pipeline with remote model": examples/vlm_pipeline_api_model.py
- "VLM comparison": examples/compare_vlm_models.py
+ - "ASR pipeline with Whisper": examples/minimal_asr_pipeline.py
- "Figure export": examples/export_figures.py
- "Table export": examples/export_tables.py
- "Multimodal export": examples/export_multimodal.py
diff --git a/pyproject.toml b/pyproject.toml
index aade0d86..74d9e568 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "docling"
-version = "2.37.0" # DO NOT EDIT, updated automatically
+version = "2.39.0" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
license = "MIT"
keywords = [
@@ -44,7 +44,8 @@ authors = [
requires-python = '>=3.9,<4.0'
dependencies = [
'pydantic (>=2.0.0,<3.0.0)',
- 'docling-core[chunking] (>=2.29.0,<3.0.0)',
+ 'docling-core[chunking] (>=2.39.0,<3.0.0)',
'docling-parse (>=4.0.0,<5.0.0)',
'docling-ibm-models (>=3.6.0,<4)',
'filetype (>=1.2.0,<2.0.0)',
@@ -99,6 +100,9 @@ rapidocr = [
# 'onnxruntime (>=1.7.0,<2.0.0) ; python_version >= "3.10"',
# 'onnxruntime (>=1.7.0,<1.20.0) ; python_version < "3.10"',
]
+asr = [
+ "openai-whisper>=20240930",
+]
[dependency-groups]
dev = [
@@ -145,6 +149,9 @@ constraints = [
package = true
default-groups = "all"
+[tool.uv.sources]
+openai-whisper = { git = "https://github.com/openai/whisper.git", rev = "dd985ac4b90cafeef8712f2998d62c59c3e62d22" }
+
[tool.setuptools.packages.find]
include = ["docling*"]
diff --git a/tests/data/audio/sample_10s.mp3 b/tests/data/audio/sample_10s.mp3
new file mode 100644
index 00000000..93a7ec73
Binary files /dev/null and b/tests/data/audio/sample_10s.mp3 differ
diff --git a/tests/data/groundtruth/docling_v1/2203.01017v2.doctags.txt b/tests/data/groundtruth/docling_v1/2203.01017v2.doctags.txt
index 1214cdf6..d047d938 100644
--- a/tests/data/groundtruth/docling_v1/2203.01017v2.doctags.txt
+++ b/tests/data/groundtruth/docling_v1/2203.01017v2.doctags.txt
@@ -160,8 +160,8 @@