Merge remote-tracking branch 'origin/main' into feat-factory-plugins

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Michele Dolfi 2025-03-04 14:07:56 -05:00
commit 71d438df84
43 changed files with 4233 additions and 2431 deletions

@@ -1,6 +1,10 @@
on:
workflow_call:
env:
HF_HUB_DOWNLOAD_TIMEOUT: "60"
HF_HUB_ETAG_TIMEOUT: "60"
jobs:
run-checks:
runs-on: ubuntu-latest
@@ -14,6 +18,11 @@ jobs:
- name: Set TESSDATA_PREFIX
run: |
echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
- name: Cache Hugging Face models
uses: actions/cache@v4
with:
path: ~/.cache/huggingface
key: huggingface-cache-py${{ matrix.python-version }}
- uses: ./.github/actions/setup-poetry
with:
python-version: ${{ matrix.python-version }}
@@ -28,7 +37,7 @@ jobs:
run: |
for file in docs/examples/*.py; do
# Skip batch_convert.py
if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api).py ]]; then
if [[ "$(basename "$file")" =~ ^(batch_convert|minimal_vlm_pipeline|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api).py ]]; then
echo "Skipping $file"
continue
fi

@@ -1,3 +1,30 @@
## [v2.25.1](https://github.com/DS4SD/docling/releases/tag/v2.25.1) - 2025-03-03
### Fix
* Enable locks for threadsafe pdfium ([#1052](https://github.com/DS4SD/docling/issues/1052)) ([`8dc0562`](https://github.com/DS4SD/docling/commit/8dc0562542299cf972d14eeeb4393e50b589c8ad))
* **html:** Use 'start' attribute when parsing ordered lists from HTML docs ([#1062](https://github.com/DS4SD/docling/issues/1062)) ([`de7b963`](https://github.com/DS4SD/docling/commit/de7b963b09a34916f0a8d99649269aeb37db1408))
### Documentation
* Improve docs on token limit warning triggered by HybridChunker ([#1077](https://github.com/DS4SD/docling/issues/1077)) ([`db3ceef`](https://github.com/DS4SD/docling/commit/db3ceefd4ae6251a97e333bcb03051698b3fa71a))
## [v2.25.0](https://github.com/DS4SD/docling/releases/tag/v2.25.0) - 2025-02-26
### Feature
* [Experimental] Introduce VLM pipeline using HF AutoModelForVision2Seq, featuring SmolDocling model ([#1054](https://github.com/DS4SD/docling/issues/1054)) ([`3c9fe76`](https://github.com/DS4SD/docling/commit/3c9fe76b706b7714b25d49cb09050c42e3b8c849))
* **cli:** Add option for downloading all models, refine help messages ([#1061](https://github.com/DS4SD/docling/issues/1061)) ([`ab683e4`](https://github.com/DS4SD/docling/commit/ab683e4fb6df4973d2efda04f00c269a2dc95f5b))
### Fix
* Vlm using artifacts path ([#1057](https://github.com/DS4SD/docling/issues/1057)) ([`e197225`](https://github.com/DS4SD/docling/commit/e1972257399151503d60b4806976c8b9b6911aa8))
* **html:** Parse text in div elements as TextItem ([#1041](https://github.com/DS4SD/docling/issues/1041)) ([`1b0ead6`](https://github.com/DS4SD/docling/commit/1b0ead69078030a0e4d25b51450ef2aa4a2e79fc))
### Documentation
* Extend chunking docs, add FAQ on token limit ([#1053](https://github.com/DS4SD/docling/issues/1053)) ([`c84b973`](https://github.com/DS4SD/docling/commit/c84b973959a254db22ac9a7dc8810628e4808a2d))
## [v2.24.0](https://github.com/DS4SD/docling/releases/tag/v2.24.0) - 2025-02-20
### Feature

@@ -123,6 +123,6 @@ For individual model usage, please refer to the model licenses found in the orig
Docling has been brought to you by IBM.
[supported_formats]: https://ds4sd.github.io/docling/supported_formats/
[supported_formats]: https://ds4sd.github.io/docling/usage/supported_formats/
[docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/
[integrations]: https://ds4sd.github.io/docling/integrations/

@@ -12,6 +12,7 @@ from pypdfium2 import PdfPage
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell, Size
from docling.utils.locks import pypdfium2_lock
if TYPE_CHECKING:
from docling.datamodel.document import InputDocument
@@ -182,20 +183,24 @@ class DoclingParseV2PageBackend(PdfPageBackend):
padbox.r = page_size.width - padbox.r
padbox.t = page_size.height - padbox.t
image = (
self._ppage.render(
scale=scale * 1.5,
rotation=0, # no additional rotation
crop=padbox.as_tuple(),
)
.to_pil()
.resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
) # We resize the image from 1.5x the given scale to make it sharper.
with pypdfium2_lock:
image = (
self._ppage.render(
scale=scale * 1.5,
rotation=0, # no additional rotation
crop=padbox.as_tuple(),
)
.to_pil()
.resize(
size=(round(cropbox.width * scale), round(cropbox.height * scale))
)
) # We resize the image from 1.5x the given scale to make it sharper.
return image
def get_size(self) -> Size:
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
with pypdfium2_lock:
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
def unload(self):
self._ppage = None
@@ -206,23 +211,24 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
self.parser = pdf_parser_v2("fatal")
with pypdfium2_lock:
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
self.parser = pdf_parser_v2("fatal")
success = False
if isinstance(self.path_or_stream, BytesIO):
success = self.parser.load_document_from_bytesio(
self.document_hash, self.path_or_stream
)
elif isinstance(self.path_or_stream, Path):
success = self.parser.load_document(
self.document_hash, str(self.path_or_stream)
)
success = False
if isinstance(self.path_or_stream, BytesIO):
success = self.parser.load_document_from_bytesio(
self.document_hash, self.path_or_stream
)
elif isinstance(self.path_or_stream, Path):
success = self.parser.load_document(
self.document_hash, str(self.path_or_stream)
)
if not success:
raise RuntimeError(
f"docling-parse v2 could not load document {self.document_hash}."
)
if not success:
raise RuntimeError(
f"docling-parse v2 could not load document {self.document_hash}."
)
def page_count(self) -> int:
# return len(self._pdoc) # To be replaced with docling-parse API
@@ -236,9 +242,10 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
return len_2
def load_page(self, page_no: int) -> DoclingParseV2PageBackend:
return DoclingParseV2PageBackend(
self.parser, self.document_hash, page_no, self._pdoc[page_no]
)
with pypdfium2_lock:
return DoclingParseV2PageBackend(
self.parser, self.document_hash, page_no, self._pdoc[page_no]
)
def is_valid(self) -> bool:
return self.page_count() > 0
@@ -246,5 +253,6 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
def unload(self):
super().unload()
self.parser.unload_document(self.document_hash)
self._pdoc.close()
self._pdoc = None
with pypdfium2_lock:
self._pdoc.close()
self._pdoc = None

@@ -1,9 +1,10 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Optional, Union, cast
from typing import Final, Optional, Union, cast
from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
from bs4.element import PreformattedString
from docling_core.types.doc import (
DocItem,
DocItemLabel,
@@ -14,6 +15,7 @@ from docling_core.types.doc import (
TableCell,
TableData,
)
from docling_core.types.doc.document import ContentLayer
from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
@@ -22,12 +24,29 @@ from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
# tags that generate NodeItem elements
TAGS_FOR_NODE_ITEMS: Final = [
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"p",
"pre",
"ul",
"ol",
"li",
"table",
"figure",
"img",
]
class HTMLDocumentBackend(DeclarativeDocumentBackend):
@override
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
_log.debug("About to init HTML backend...")
self.soup: Optional[Tag] = None
# HTML file:
self.path_or_stream = path_or_stream
@@ -48,7 +67,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.soup = BeautifulSoup(html_content, "html.parser")
except Exception as e:
raise RuntimeError(
f"Could not initialize HTML backend for file with hash {self.document_hash}."
"Could not initialize HTML backend for file with "
f"hash {self.document_hash}."
) from e
@override
@@ -88,17 +108,26 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
assert self.soup is not None
content = self.soup.body or self.soup
# Replace <br> tags with newline characters
# TODO: remove style to avoid losing text from tags like i, b, span, ...
for br in content("br"):
br.replace_with(NavigableString("\n"))
headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
self.content_layer = (
ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
)
self.walk(content, doc)
else:
raise RuntimeError(
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
f"Cannot convert doc with {self.document_hash} because the backend "
"failed to init."
)
return doc
def walk(self, tag: Tag, doc: DoclingDocument) -> None:
# Iterate over elements in the body of the document
text: str = ""
for element in tag.children:
if isinstance(element, Tag):
try:
@@ -108,6 +137,26 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
f"Error processing child from tag{tag.name}: {exc_child}"
)
raise exc_child
elif isinstance(element, NavigableString) and not isinstance(
element, PreformattedString
):
# Floating text outside paragraphs or analyzed tags
text += element
siblings: list[Tag] = [
item for item in element.next_siblings if isinstance(item, Tag)
]
if element.next_sibling is None or any(
[item.name in TAGS_FOR_NODE_ITEMS for item in siblings]
):
text = text.strip()
if text and tag.name in ["div"]:
doc.add_text(
parent=self.parents[self.level],
label=DocItemLabel.TEXT,
text=text,
content_layer=self.content_layer,
)
text = ""
return
@@ -127,7 +176,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
elif tag.name == "figure":
self.handle_figure(tag, doc)
elif tag.name == "img":
self.handle_image(doc)
self.handle_image(tag, doc)
else:
self.walk(tag, doc)
@@ -158,12 +207,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
text = element.text.strip()
if hlevel == 1:
for key, val in self.parents.items():
self.content_layer = ContentLayer.BODY
for key in self.parents.keys():
self.parents[key] = None
self.level = 1
self.parents[self.level] = doc.add_text(
parent=self.parents[0], label=DocItemLabel.TITLE, text=text
parent=self.parents[0],
label=DocItemLabel.TITLE,
text=text,
content_layer=self.content_layer,
)
else:
if hlevel > self.level:
@@ -174,6 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
name=f"header-{i}",
label=GroupLabel.SECTION,
parent=self.parents[i - 1],
content_layer=self.content_layer,
)
self.level = hlevel
@@ -189,6 +244,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
parent=self.parents[hlevel - 1],
text=text,
level=hlevel,
content_layer=self.content_layer,
)
def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
@@ -197,16 +253,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
return
text = element.text.strip()
if text:
doc.add_code(parent=self.parents[self.level], text=text)
doc.add_code(
parent=self.parents[self.level],
text=text,
content_layer=self.content_layer,
)
def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles paragraph tags (p)."""
if element.text is None:
return
text = element.text.strip()
label = DocItemLabel.PARAGRAPH
if text:
doc.add_text(parent=self.parents[self.level], label=label, text=text)
doc.add_text(
parent=self.parents[self.level],
label=DocItemLabel.TEXT,
text=text,
content_layer=self.content_layer,
)
def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles list tags (ul, ol) and their list items."""
@@ -214,14 +278,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if element.name == "ul":
# create a list group
self.parents[self.level + 1] = doc.add_group(
parent=self.parents[self.level], name="list", label=GroupLabel.LIST
parent=self.parents[self.level],
name="list",
label=GroupLabel.LIST,
content_layer=self.content_layer,
)
elif element.name == "ol":
start_attr = element.get("start")
start: int = (
int(start_attr)
if isinstance(start_attr, str) and start_attr.isnumeric()
else 1
)
# create a list group
self.parents[self.level + 1] = doc.add_group(
parent=self.parents[self.level],
name="ordered list",
name="ordered list" + (f" start {start}" if start != 1 else ""),
label=GroupLabel.ORDERED_LIST,
content_layer=self.content_layer,
)
self.level += 1
@@ -231,15 +305,23 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self.level -= 1
def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles listitem tags (li)."""
"""Handles list item tags (li)."""
nested_list = element.find(["ul", "ol"])
parent = self.parents[self.level]
if parent is None:
_log.warning(f"list-item has no parent in DoclingDocument: {element}")
_log.debug(f"list-item has no parent in DoclingDocument: {element}")
return
parent_label: str = parent.label
index_in_list = len(parent.children) + 1
if (
parent_label == GroupLabel.ORDERED_LIST
and isinstance(parent, GroupItem)
and parent.name
):
start_in_list: str = parent.name.split(" ")[-1]
start: int = int(start_in_list) if start_in_list.isnumeric() else 1
index_in_list += start - 1
if nested_list:
# Text in list item can be hidden within hierarchy, hence
@@ -262,6 +344,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
enumerated=enumerated,
marker=marker,
parent=parent,
content_layer=self.content_layer,
)
self.level += 1
@@ -283,15 +366,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
enumerated=enumerated,
marker=marker,
parent=parent,
content_layer=self.content_layer,
)
else:
_log.warning(f"list-item has no text: {element}")
_log.debug(f"list-item has no text: {element}")
@staticmethod
def parse_table_data(element: Tag) -> Optional[TableData]:
nested_tables = element.find("table")
if nested_tables is not None:
_log.warning("Skipping nested table.")
_log.debug("Skipping nested table.")
return None
# Count the number of rows (number of <tr> elements)
@@ -386,7 +470,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
table_data = HTMLDocumentBackend.parse_table_data(element)
if table_data is not None:
doc.add_table(data=table_data, parent=self.parents[self.level])
doc.add_table(
data=table_data,
parent=self.parents[self.level],
content_layer=self.content_layer,
)
def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
"""Recursively extract text from <ul> or <ol> with proper indentation."""
@@ -426,20 +514,33 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
contains_captions = element.find(["figcaption"])
if not isinstance(contains_captions, Tag):
doc.add_picture(parent=self.parents[self.level], caption=None)
doc.add_picture(
parent=self.parents[self.level],
caption=None,
content_layer=self.content_layer,
)
else:
texts = []
for item in contains_captions:
texts.append(item.text)
fig_caption = doc.add_text(
label=DocItemLabel.CAPTION, text=("".join(texts)).strip()
label=DocItemLabel.CAPTION,
text=("".join(texts)).strip(),
content_layer=self.content_layer,
)
doc.add_picture(
parent=self.parents[self.level],
caption=fig_caption,
content_layer=self.content_layer,
)
def handle_image(self, doc: DoclingDocument) -> None:
def handle_image(self, element: Tag, doc: DoclingDocument) -> None:
"""Handles image tags (img)."""
doc.add_picture(parent=self.parents[self.level], caption=None)
_log.debug(f"ignoring <img> tags at the moment: {element}")
doc.add_picture(
parent=self.parents[self.level],
caption=None,
content_layer=self.content_layer,
)

@@ -13,6 +13,7 @@ from pypdfium2._helpers.misc import PdfiumError
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.datamodel.base_models import Cell
from docling.utils.locks import pypdfium2_lock
if TYPE_CHECKING:
from docling.datamodel.document import InputDocument
@@ -24,6 +25,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
def __init__(
self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
):
# Note: lock applied by the caller
self.valid = True # No better way to tell from pypdfium.
try:
self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
@@ -40,51 +42,57 @@ class PyPdfiumPageBackend(PdfPageBackend):
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
AREA_THRESHOLD = 0 # 32 * 32
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
pos = obj.get_pos()
cropbox = BoundingBox.from_tuple(
pos, origin=CoordOrigin.BOTTOMLEFT
).to_top_left_origin(page_height=self.get_size().height)
page_size = self.get_size()
with pypdfium2_lock:
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
pos = obj.get_pos()
cropbox = BoundingBox.from_tuple(
pos, origin=CoordOrigin.BOTTOMLEFT
).to_top_left_origin(page_height=page_size.height)
if cropbox.area() > AREA_THRESHOLD:
cropbox = cropbox.scaled(scale=scale)
if cropbox.area() > AREA_THRESHOLD:
cropbox = cropbox.scaled(scale=scale)
yield cropbox
yield cropbox
def get_text_in_rect(self, bbox: BoundingBox) -> str:
if not self.text_page:
self.text_page = self._ppage.get_textpage()
with pypdfium2_lock:
if not self.text_page:
self.text_page = self._ppage.get_textpage()
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
bbox = bbox.to_bottom_left_origin(self.get_size().height)
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
with pypdfium2_lock:
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
return text_piece
def get_text_cells(self) -> Iterable[Cell]:
if not self.text_page:
self.text_page = self._ppage.get_textpage()
with pypdfium2_lock:
if not self.text_page:
self.text_page = self._ppage.get_textpage()
cells = []
cell_counter = 0
page_size = self.get_size()
for i in range(self.text_page.count_rects()):
rect = self.text_page.get_rect(i)
text_piece = self.text_page.get_text_bounded(*rect)
x0, y0, x1, y1 = rect
cells.append(
Cell(
id=cell_counter,
text=text_piece,
bbox=BoundingBox(
l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
).to_top_left_origin(page_size.height),
with pypdfium2_lock:
for i in range(self.text_page.count_rects()):
rect = self.text_page.get_rect(i)
text_piece = self.text_page.get_text_bounded(*rect)
x0, y0, x1, y1 = rect
cells.append(
Cell(
id=cell_counter,
text=text_piece,
bbox=BoundingBox(
l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
).to_top_left_origin(page_size.height),
)
)
)
cell_counter += 1
cell_counter += 1
# PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
# The cell merging code below is to clean this up.
@@ -214,20 +222,24 @@ class PyPdfiumPageBackend(PdfPageBackend):
padbox.r = page_size.width - padbox.r
padbox.t = page_size.height - padbox.t
image = (
self._ppage.render(
scale=scale * 1.5,
rotation=0, # no additional rotation
crop=padbox.as_tuple(),
)
.to_pil()
.resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
) # We resize the image from 1.5x the given scale to make it sharper.
with pypdfium2_lock:
image = (
self._ppage.render(
scale=scale * 1.5,
rotation=0, # no additional rotation
crop=padbox.as_tuple(),
)
.to_pil()
.resize(
size=(round(cropbox.width * scale), round(cropbox.height * scale))
)
) # We resize the image from 1.5x the given scale to make it sharper.
return image
def get_size(self) -> Size:
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
with pypdfium2_lock:
return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
def unload(self):
self._ppage = None
@@ -239,22 +251,26 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
super().__init__(in_doc, path_or_stream)
try:
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
with pypdfium2_lock:
self._pdoc = pdfium.PdfDocument(self.path_or_stream)
except PdfiumError as e:
raise RuntimeError(
f"pypdfium could not load document with hash {self.document_hash}"
) from e
def page_count(self) -> int:
return len(self._pdoc)
with pypdfium2_lock:
return len(self._pdoc)
def load_page(self, page_no: int) -> PyPdfiumPageBackend:
return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
with pypdfium2_lock:
return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
def is_valid(self) -> bool:
return self.page_count() > 0
def unload(self):
super().unload()
self._pdoc.close()
self._pdoc = None
with pypdfium2_lock:
self._pdoc.close()
self._pdoc = None

@@ -32,9 +32,19 @@ class _AvailableModels(str, Enum):
CODE_FORMULA = "code_formula"
PICTURE_CLASSIFIER = "picture_classifier"
SMOLVLM = "smolvlm"
GRANITE_VISION = "granite_vision"
EASYOCR = "easyocr"
_default_models = [
_AvailableModels.LAYOUT,
_AvailableModels.TABLEFORMER,
_AvailableModels.CODE_FORMULA,
_AvailableModels.PICTURE_CLASSIFIER,
_AvailableModels.EASYOCR,
]
@app.command("download")
def download(
output_dir: Annotated[
@@ -43,18 +53,27 @@ def download(
...,
"-o",
"--output-dir",
help="The directory where all the models are downloaded.",
help="The directory where to download the models.",
),
] = (settings.cache_dir / "models"),
force: Annotated[
bool, typer.Option(..., help="If true, the download will be forced")
bool, typer.Option(..., help="If true, the download will be forced.")
] = False,
models: Annotated[
Optional[list[_AvailableModels]],
typer.Argument(
help=f"Models to download (default behavior: all will be downloaded)",
help=f"Models to download (default behavior: a predefined set of models will be downloaded).",
),
] = None,
all: Annotated[
bool,
typer.Option(
...,
"--all",
help="If true, all available models will be downloaded (mutually exclusive with passing specific models).",
show_default=True,
),
] = False,
quiet: Annotated[
bool,
typer.Option(
@@ -65,6 +84,10 @@ def download(
),
] = False,
):
if models and all:
raise typer.BadParameter(
"Cannot simultaneously set 'all' parameter and specify models to download."
)
if not quiet:
FORMAT = "%(message)s"
logging.basicConfig(
@@ -73,7 +96,7 @@ def download(
datefmt="[%X]",
handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
)
to_download = models or [m for m in _AvailableModels]
to_download = models or ([m for m in _AvailableModels] if all else _default_models)
output_dir = download_models(
output_dir=output_dir,
force=force,
@@ -83,6 +106,7 @@
with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
with_smolvlm=_AvailableModels.SMOLVLM in to_download,
with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
with_easyocr=_AvailableModels.EASYOCR in to_download,
)

@@ -154,6 +154,10 @@ class LayoutPrediction(BaseModel):
clusters: List[Cluster] = []
class VlmPrediction(BaseModel):
text: str = ""
class ContainerElement(
BasePageElement
): # Used for Form and Key-Value-Regions, only for typing.
@@ -197,6 +201,7 @@ class PagePredictions(BaseModel):
tablestructure: Optional[TableStructurePrediction] = None
figures_classification: Optional[FigureClassificationPrediction] = None
equations_prediction: Optional[EquationPrediction] = None
vlm_response: Optional[VlmPrediction] = None
PageElement = Union[TextElement, Table, FigureElement, ContainerElement]

@@ -35,6 +35,7 @@ class AcceleratorOptions(BaseSettings):
num_threads: int = 4
device: Union[str, AcceleratorDevice] = "auto"
cuda_use_flash_attention2: bool = False
@field_validator("device")
def validate_device(cls, value):
@@ -252,6 +253,45 @@ granite_picture_description = PictureDescriptionVlmOptions(
)
class BaseVlmOptions(BaseModel):
kind: str
prompt: str
class ResponseFormat(str, Enum):
DOCTAGS = "doctags"
MARKDOWN = "markdown"
class HuggingFaceVlmOptions(BaseVlmOptions):
kind: Literal["hf_model_options"] = "hf_model_options"
repo_id: str
load_in_8bit: bool = True
llm_int8_threshold: float = 6.0
quantized: bool = False
response_format: ResponseFormat
@property
def repo_cache_folder(self) -> str:
return self.repo_id.replace("/", "--")
smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
repo_id="ds4sd/SmolDocling-256M-preview",
prompt="Convert this page to docling.",
response_format=ResponseFormat.DOCTAGS,
)
granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
repo_id="ibm-granite/granite-vision-3.1-2b-preview",
# prompt="OCR the full page to markdown.",
prompt="OCR this image.",
response_format=ResponseFormat.MARKDOWN,
)
# Define an enum for the backend options
class PdfBackend(str, Enum):
"""Enum of valid PDF backends."""
@@ -284,7 +324,24 @@ class PipelineOptions(BaseModel):
enable_remote_services: bool = False
class PdfPipelineOptions(PipelineOptions):
class PaginatedPipelineOptions(PipelineOptions):
images_scale: float = 1.0
generate_page_images: bool = False
generate_picture_images: bool = False
class VlmPipelineOptions(PaginatedPipelineOptions):
artifacts_path: Optional[Union[Path, str]] = None
generate_page_images: bool = True
force_backend_text: bool = (
False # (To be used with vlms, or other generative models)
)
# If True, text from backend will be used instead of generated text
vlm_options: Union[HuggingFaceVlmOptions] = smoldocling_vlm_conversion_options
class PdfPipelineOptions(PaginatedPipelineOptions):
"""Options for the PDF pipeline."""
artifacts_path: Optional[Union[Path, str]] = None
@@ -294,6 +351,10 @@ class PdfPipelineOptions(PipelineOptions):
do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code
do_picture_classification: bool = False # True: classify pictures in documents
do_picture_description: bool = False # True: run describe pictures in documents
force_backend_text: bool = (
False # (To be used with vlms, or other generative models)
)
# If True, text from backend will be used instead of generated text
table_structure_options: TableStructureOptions = TableStructureOptions()
ocr_options: OcrOptions = EasyOcrOptions()

@@ -0,0 +1,180 @@
import logging
import time
from pathlib import Path
from typing import Iterable, List, Optional
from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
AcceleratorOptions,
HuggingFaceVlmOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder
_log = logging.getLogger(__name__)
class HuggingFaceVlmModel(BasePageModel):
def __init__(
self,
enabled: bool,
artifacts_path: Optional[Path],
accelerator_options: AcceleratorOptions,
vlm_options: HuggingFaceVlmOptions,
):
self.enabled = enabled
self.vlm_options = vlm_options
if self.enabled:
import torch
from transformers import ( # type: ignore
AutoModelForVision2Seq,
AutoProcessor,
BitsAndBytesConfig,
)
device = decide_device(accelerator_options.device)
self.device = device
_log.debug("Available device for HuggingFace VLM: {}".format(device))
repo_cache_folder = vlm_options.repo_id.replace("/", "--")
# PARAMETERS:
if artifacts_path is None:
artifacts_path = self.download_models(self.vlm_options.repo_id)
elif (artifacts_path / repo_cache_folder).exists():
artifacts_path = artifacts_path / repo_cache_folder
self.param_question = vlm_options.prompt # "Perform Layout Analysis."
self.param_quantization_config = BitsAndBytesConfig(
load_in_8bit=vlm_options.load_in_8bit, # True,
llm_int8_threshold=vlm_options.llm_int8_threshold, # 6.0
)
self.param_quantized = vlm_options.quantized # False
self.processor = AutoProcessor.from_pretrained(artifacts_path)
if not self.param_quantized:
self.vlm_model = AutoModelForVision2Seq.from_pretrained(
artifacts_path,
device_map=device,
torch_dtype=torch.bfloat16,
_attn_implementation=(
"flash_attention_2"
if self.device.startswith("cuda")
and accelerator_options.cuda_use_flash_attention2
else "eager"
),
) # .to(self.device)
else:
self.vlm_model = AutoModelForVision2Seq.from_pretrained(
artifacts_path,
device_map=device,
torch_dtype="auto",
quantization_config=self.param_quantization_config,
_attn_implementation=(
"flash_attention_2"
if self.device.startswith("cuda")
and accelerator_options.cuda_use_flash_attention2
else "eager"
),
) # .to(self.device)
@staticmethod
def download_models(
repo_id: str,
local_dir: Optional[Path] = None,
force: bool = False,
progress: bool = False,
) -> Path:
from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars
if not progress:
disable_progress_bars()
download_path = snapshot_download(
repo_id=repo_id,
force_download=force,
local_dir=local_dir,
# revision="v0.0.1",
)
return Path(download_path)
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:
for page in page_batch:
assert page._backend is not None
if not page._backend.is_valid():
yield page
else:
with TimeRecorder(conv_res, "vlm"):
assert page.size is not None
hi_res_image = page.get_image(scale=2.0) # 144dpi
# hi_res_image = page.get_image(scale=1.0) # 72dpi
if hi_res_image is not None:
im_width, im_height = hi_res_image.size
# populate page_tags with predicted doc tags
page_tags = ""
if hi_res_image:
if hi_res_image.mode != "RGB":
hi_res_image = hi_res_image.convert("RGB")
messages = [
{
"role": "user",
"content": [
{
"type": "text",
"text": "This is a page from a document.",
},
{"type": "image"},
{"type": "text", "text": self.param_question},
],
}
]
prompt = self.processor.apply_chat_template(
messages, add_generation_prompt=False
)
inputs = self.processor(
text=prompt, images=[hi_res_image], return_tensors="pt"
)
inputs = {k: v.to(self.device) for k, v in inputs.items()}
start_time = time.time()
# Call model to generate:
generated_ids = self.vlm_model.generate(
**inputs, max_new_tokens=4096, use_cache=True
)
generation_time = time.time() - start_time
generated_texts = self.processor.batch_decode(
generated_ids[:, inputs["input_ids"].shape[1] :],
skip_special_tokens=False,
)[0]
num_tokens = len(generated_ids[0])
page_tags = generated_texts
# inference_time = time.time() - start_time
# tokens_per_second = num_tokens / generation_time
# print("")
# print(f"Page Inference Time: {inference_time:.2f} seconds")
# print(f"Total tokens on page: {num_tokens:.2f}")
# print(f"Tokens/sec: {tokens_per_second:.2f}")
# print("")
page.predictions.vlm_response = VlmPrediction(text=page_tags)
yield page

@@ -53,9 +53,9 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
)
# Initialize processor and model
self.processor = AutoProcessor.from_pretrained(self.options.repo_id)
self.processor = AutoProcessor.from_pretrained(artifacts_path)
self.model = AutoModelForVision2Seq.from_pretrained(
self.options.repo_id,
artifacts_path,
torch_dtype=torch.bfloat16,
_attn_implementation=(
"flash_attention_2" if self.device.startswith("cuda") else "eager"

@@ -0,0 +1,534 @@
import itertools
import logging
import re
import warnings
from io import BytesIO
# from io import BytesIO
from pathlib import Path
from typing import Optional
from docling_core.types import DoclingDocument
from docling_core.types.doc import (
BoundingBox,
DocItem,
DocItemLabel,
DoclingDocument,
GroupLabel,
ImageRef,
ImageRefMode,
PictureItem,
ProvenanceItem,
Size,
TableCell,
TableData,
TableItem,
)
from docling_core.types.doc.tokens import DocumentToken, TableToken
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import InputFormat, Page
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import (
PdfPipelineOptions,
ResponseFormat,
VlmPipelineOptions,
)
from docling.datamodel.settings import settings
from docling.models.hf_vlm_model import HuggingFaceVlmModel
from docling.pipeline.base_pipeline import PaginatedPipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder
_log = logging.getLogger(__name__)
class VlmPipeline(PaginatedPipeline):
def __init__(self, pipeline_options: VlmPipelineOptions):
super().__init__(pipeline_options)
self.keep_backend = True
warnings.warn(
"The VlmPipeline is currently experimental and may change in upcoming versions without notice.",
category=UserWarning,
stacklevel=2,
)
self.pipeline_options: VlmPipelineOptions
artifacts_path: Optional[Path] = None
if pipeline_options.artifacts_path is not None:
artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
elif settings.artifacts_path is not None:
artifacts_path = Path(settings.artifacts_path).expanduser()
if artifacts_path is not None and not artifacts_path.is_dir():
raise RuntimeError(
f"The value of {artifacts_path=} is not valid. "
"When defined, it must point to a folder containing all models required by the pipeline."
)
# force_backend_text = False - use text that is coming from VLM response
# force_backend_text = True - get text from backend using bounding boxes predicted by SmolDocling doctags
self.force_backend_text = (
pipeline_options.force_backend_text
and pipeline_options.vlm_options.response_format == ResponseFormat.DOCTAGS
)
self.keep_images = self.pipeline_options.generate_page_images
self.build_pipe = [
HuggingFaceVlmModel(
enabled=True, # must be always enabled for this pipeline to make sense.
artifacts_path=artifacts_path,
accelerator_options=pipeline_options.accelerator_options,
vlm_options=self.pipeline_options.vlm_options,
),
]
self.enrichment_pipe = [
# Other models working on `NodeItem` elements in the DoclingDocument
]
def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
with TimeRecorder(conv_res, "page_init"):
page._backend = conv_res.input._backend.load_page(page.page_no) # type: ignore
if page._backend is not None and page._backend.is_valid():
page.size = page._backend.get_size()
return page
def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):
if (
self.pipeline_options.vlm_options.response_format
== ResponseFormat.DOCTAGS
):
conv_res.document = self._turn_tags_into_doc(conv_res.pages)
elif (
self.pipeline_options.vlm_options.response_format
== ResponseFormat.MARKDOWN
):
conv_res.document = self._turn_md_into_doc(conv_res)
else:
raise RuntimeError(
f"Unsupported VLM response format {self.pipeline_options.vlm_options.response_format}"
)
# Generate images of the requested element types
if self.pipeline_options.generate_picture_images:
scale = self.pipeline_options.images_scale
for element, _level in conv_res.document.iterate_items():
if not isinstance(element, DocItem) or len(element.prov) == 0:
continue
if (
isinstance(element, PictureItem)
and self.pipeline_options.generate_picture_images
):
page_ix = element.prov[0].page_no - 1
page = conv_res.pages[page_ix]
assert page.size is not None
assert page.image is not None
crop_bbox = (
element.prov[0]
.bbox.scaled(scale=scale)
.to_top_left_origin(page_height=page.size.height * scale)
)
cropped_im = page.image.crop(crop_bbox.as_tuple())
element.image = ImageRef.from_pil(
cropped_im, dpi=int(72 * scale)
)
return conv_res
def _turn_md_into_doc(self, conv_res):
predicted_text = ""
for pg_idx, page in enumerate(conv_res.pages):
if page.predictions.vlm_response:
predicted_text += page.predictions.vlm_response.text + "\n\n"
response_bytes = BytesIO(predicted_text.encode("utf8"))
out_doc = InputDocument(
path_or_stream=response_bytes,
filename=conv_res.input.file.name,
format=InputFormat.MD,
backend=MarkdownDocumentBackend,
)
backend = MarkdownDocumentBackend(
in_doc=out_doc,
path_or_stream=response_bytes,
)
return backend.convert()
def _turn_tags_into_doc(self, pages: list[Page]) -> DoclingDocument:
###############################################
# Tag definitions and color mappings
###############################################
# Maps the recognized tag to a Docling label.
# Code items will be given DocItemLabel.CODE
tag_to_doclabel = {
"title": DocItemLabel.TITLE,
"document_index": DocItemLabel.DOCUMENT_INDEX,
"otsl": DocItemLabel.TABLE,
"section_header_level_1": DocItemLabel.SECTION_HEADER,
"checkbox_selected": DocItemLabel.CHECKBOX_SELECTED,
"checkbox_unselected": DocItemLabel.CHECKBOX_UNSELECTED,
"text": DocItemLabel.TEXT,
"page_header": DocItemLabel.PAGE_HEADER,
"page_footer": DocItemLabel.PAGE_FOOTER,
"formula": DocItemLabel.FORMULA,
"caption": DocItemLabel.CAPTION,
"picture": DocItemLabel.PICTURE,
"list_item": DocItemLabel.LIST_ITEM,
"footnote": DocItemLabel.FOOTNOTE,
"code": DocItemLabel.CODE,
}
# Maps each tag to an associated bounding box color.
tag_to_color = {
"title": "blue",
"document_index": "darkblue",
"otsl": "green",
"section_header_level_1": "purple",
"checkbox_selected": "black",
"checkbox_unselected": "gray",
"text": "red",
"page_header": "orange",
"page_footer": "cyan",
"formula": "pink",
"caption": "magenta",
"picture": "yellow",
"list_item": "brown",
"footnote": "darkred",
"code": "lightblue",
}
def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
"""Extracts <loc_...> bounding box coords from the chunk, normalized by / 500."""
coords = re.findall(r"<loc_(\d+)>", text_chunk)
if len(coords) == 4:
l, t, r, b = map(float, coords)
return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
return None
def extract_inner_text(text_chunk: str) -> str:
"""Strips all <...> tags inside the chunk to get the raw text content."""
return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()
def extract_text_from_backend(page: Page, bbox: BoundingBox | None) -> str:
# Convert bounding box normalized to 0-1 into page coordinates for cropping
text = ""
if bbox:
if page.size:
bbox.l = bbox.l * page.size.width
bbox.t = bbox.t * page.size.height
bbox.r = bbox.r * page.size.width
bbox.b = bbox.b * page.size.height
if page._backend:
text = page._backend.get_text_in_rect(bbox)
return text
def otsl_parse_texts(texts, tokens):
split_word = TableToken.OTSL_NL.value
split_row_tokens = [
list(y)
for x, y in itertools.groupby(tokens, lambda z: z == split_word)
if not x
]
table_cells = []
r_idx = 0
c_idx = 0
def count_right(tokens, c_idx, r_idx, which_tokens):
span = 0
c_idx_iter = c_idx
while tokens[r_idx][c_idx_iter] in which_tokens:
c_idx_iter += 1
span += 1
if c_idx_iter >= len(tokens[r_idx]):
return span
return span
def count_down(tokens, c_idx, r_idx, which_tokens):
span = 0
r_idx_iter = r_idx
while tokens[r_idx_iter][c_idx] in which_tokens:
r_idx_iter += 1
span += 1
if r_idx_iter >= len(tokens):
return span
return span
for i, text in enumerate(texts):
cell_text = ""
if text in [
TableToken.OTSL_FCEL.value,
TableToken.OTSL_ECEL.value,
TableToken.OTSL_CHED.value,
TableToken.OTSL_RHED.value,
TableToken.OTSL_SROW.value,
]:
row_span = 1
col_span = 1
right_offset = 1
if text != TableToken.OTSL_ECEL.value:
cell_text = texts[i + 1]
right_offset = 2
# Check next element(s) for lcel / ucel / xcel, set properly row_span, col_span
next_right_cell = ""
if i + right_offset < len(texts):
next_right_cell = texts[i + right_offset]
next_bottom_cell = ""
if r_idx + 1 < len(split_row_tokens):
if c_idx < len(split_row_tokens[r_idx + 1]):
next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
if next_right_cell in [
TableToken.OTSL_LCEL.value,
TableToken.OTSL_XCEL.value,
]:
# we have a horizontal spanning cell or 2d spanning cell
col_span += count_right(
split_row_tokens,
c_idx + 1,
r_idx,
[TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
)
if next_bottom_cell in [
TableToken.OTSL_UCEL.value,
TableToken.OTSL_XCEL.value,
]:
# we have a vertical spanning cell or 2d spanning cell
row_span += count_down(
split_row_tokens,
c_idx,
r_idx + 1,
[TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
)
table_cells.append(
TableCell(
text=cell_text.strip(),
row_span=row_span,
col_span=col_span,
start_row_offset_idx=r_idx,
end_row_offset_idx=r_idx + row_span,
start_col_offset_idx=c_idx,
end_col_offset_idx=c_idx + col_span,
)
)
if text in [
TableToken.OTSL_FCEL.value,
TableToken.OTSL_ECEL.value,
TableToken.OTSL_CHED.value,
TableToken.OTSL_RHED.value,
TableToken.OTSL_SROW.value,
TableToken.OTSL_LCEL.value,
TableToken.OTSL_UCEL.value,
TableToken.OTSL_XCEL.value,
]:
c_idx += 1
if text == TableToken.OTSL_NL.value:
r_idx += 1
c_idx = 0
return table_cells, split_row_tokens
def otsl_extract_tokens_and_text(s: str):
# Pattern to match anything enclosed by < > (including the angle brackets themselves)
pattern = r"(<[^>]+>)"
# Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
tokens = re.findall(pattern, s)
# Remove any tokens that start with "<loc_"
tokens = [
token
for token in tokens
if not (
token.startswith(rf"<{DocumentToken.LOC.value}")
or token
in [
rf"<{DocumentToken.OTSL.value}>",
rf"</{DocumentToken.OTSL.value}>",
]
)
]
# Split the string by those tokens to get the in-between text
text_parts = re.split(pattern, s)
text_parts = [
token
for token in text_parts
if not (
token.startswith(rf"<{DocumentToken.LOC.value}")
or token
in [
rf"<{DocumentToken.OTSL.value}>",
rf"</{DocumentToken.OTSL.value}>",
]
)
]
# Remove any empty or purely whitespace strings from text_parts
text_parts = [part for part in text_parts if part.strip()]
return tokens, text_parts
def parse_table_content(otsl_content: str) -> TableData:
tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
return TableData(
num_rows=len(split_row_tokens),
num_cols=(
max(len(row) for row in split_row_tokens) if split_row_tokens else 0
),
table_cells=table_cells,
)
doc = DoclingDocument(name="Document")
for pg_idx, page in enumerate(pages):
xml_content = ""
predicted_text = ""
if page.predictions.vlm_response:
predicted_text = page.predictions.vlm_response.text
image = page.image
page_no = pg_idx + 1
bounding_boxes = []
if page.size:
pg_width = page.size.width
pg_height = page.size.height
size = Size(width=pg_width, height=pg_height)
parent_page = doc.add_page(page_no=page_no, size=size)
"""
1. Finds all <tag>...</tag> blocks in the entire string (multi-line friendly) in the order they appear.
2. For each chunk, extracts bounding box (if any) and inner text.
3. Adds the item to a DoclingDocument structure with the right label.
4. Tracks bounding boxes + color in a separate list for later visualization.
"""
# Regex for all recognized tags
tag_pattern = (
rf"<(?P<tag>{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|"
rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|"
rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|"
rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
rf"{DocItemLabel.LIST_ITEM}|{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
rf"{DocItemLabel.SECTION_HEADER}_level_1|{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
)
# DocumentToken.OTSL
pattern = re.compile(tag_pattern, re.DOTALL)
# Go through each match in order
for match in pattern.finditer(predicted_text):
full_chunk = match.group(0)
tag_name = match.group("tag")
bbox = extract_bounding_box(full_chunk)
doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
color = tag_to_color.get(tag_name, "white")
# Store bounding box + color
if bbox:
bounding_boxes.append((bbox, color))
if tag_name == DocumentToken.OTSL.value:
table_data = parse_table_content(full_chunk)
bbox = extract_bounding_box(full_chunk)
if bbox:
prov = ProvenanceItem(
bbox=bbox.resize_by_scale(pg_width, pg_height),
charspan=(0, 0),
page_no=page_no,
)
doc.add_table(data=table_data, prov=prov)
else:
doc.add_table(data=table_data)
elif tag_name == DocItemLabel.PICTURE:
text_caption_content = extract_inner_text(full_chunk)
if image:
if bbox:
im_width, im_height = image.size
crop_box = (
int(bbox.l * im_width),
int(bbox.t * im_height),
int(bbox.r * im_width),
int(bbox.b * im_height),
)
cropped_image = image.crop(crop_box)
pic = doc.add_picture(
parent=None,
image=ImageRef.from_pil(image=cropped_image, dpi=72),
prov=(
ProvenanceItem(
bbox=bbox.resize_by_scale(pg_width, pg_height),
charspan=(0, 0),
page_no=page_no,
)
),
)
# If there is a caption to an image, add it as well
if len(text_caption_content) > 0:
caption_item = doc.add_text(
label=DocItemLabel.CAPTION,
text=text_caption_content,
parent=None,
)
pic.captions.append(caption_item.get_ref())
else:
if bbox:
# In case we don't have access to the binary of the image
pic = doc.add_picture(
parent=None,
prov=ProvenanceItem(
bbox=bbox, charspan=(0, 0), page_no=page_no
),
)
# If there is a caption to an image, add it as well
if len(text_caption_content) > 0:
caption_item = doc.add_text(
label=DocItemLabel.CAPTION,
text=text_caption_content,
parent=None,
)
pic.captions.append(caption_item.get_ref())
else:
# For everything else, treat as text
if self.force_backend_text:
text_content = extract_text_from_backend(page, bbox)
else:
text_content = extract_inner_text(full_chunk)
doc.add_text(
label=doc_label,
text=text_content,
prov=(
ProvenanceItem(
bbox=bbox.resize_by_scale(pg_width, pg_height),
charspan=(0, len(text_content)),
page_no=page_no,
)
if bbox
else None
),
)
return doc
@classmethod
def get_default_options(cls) -> VlmPipelineOptions:
return VlmPipelineOptions()
@classmethod
def is_backend_supported(cls, backend: AbstractDocumentBackend):
return isinstance(backend, PdfDocumentBackend)

docling/utils/locks.py (new file)

@@ -0,0 +1,3 @@
import threading
pypdfium2_lock = threading.Lock()
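For context, the backend changes in this commit wrap every pypdfium2 call in this shared lock, since pdfium itself is not thread-safe (see the changelog entry "Enable locks for threadsafe pdfium" above). A minimal sketch of the pattern, with a hypothetical `render_page` helper that is not part of the commit:

```python
import threading

pypdfium2_lock = threading.Lock()


def render_page(ppage, scale: float):
    # pdfium is not thread-safe: funnel every pypdfium2 call through
    # the single module-level lock so concurrent threads never overlap.
    with pypdfium2_lock:
        return ppage.render(scale=scale).to_pil()
```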

@@ -2,7 +2,10 @@ import logging
from pathlib import Path
from typing import Optional
from docling.datamodel.pipeline_options import smolvlm_picture_description
from docling.datamodel.pipeline_options import (
granite_picture_description,
smolvlm_picture_description,
)
from docling.datamodel.settings import settings
from docling.models.code_formula_model import CodeFormulaModel
from docling.models.document_picture_classifier import DocumentPictureClassifier
@@ -23,7 +26,8 @@ def download_models(
with_tableformer: bool = True,
with_code_formula: bool = True,
with_picture_classifier: bool = True,
with_smolvlm: bool = True,
with_smolvlm: bool = False,
with_granite_vision: bool = False,
with_easyocr: bool = True,
):
if output_dir is None:
@@ -73,6 +77,15 @@ def download_models(
progress=progress,
)
if with_granite_vision:
_log.info(f"Downloading Granite Vision model...")
PictureDescriptionVlmModel.download_models(
repo_id=granite_picture_description.repo_id,
local_dir=output_dir / granite_picture_description.repo_cache_folder,
force=force,
progress=progress,
)
if with_easyocr:
_log.info(f"Downloading easyocr models...")
EasyOcrModel.download_models(

@@ -43,6 +43,11 @@ def draw_clusters(
y0 *= scale_x
y1 *= scale_y
if y1 <= y0:
y1, y0 = y0, y1
if x1 <= x0:
x1, x0 = x0, x1
cluster_fill_color = (*list(DocItemLabel.get_color(c.label)), 70)
cluster_outline_color = (
*list(DocItemLabel.get_color(c.label)),

@@ -1,5 +1,18 @@
## Introduction
!!! note "Chunking approaches"
Starting from a `DoclingDocument`, there are in principle two possible chunking
approaches:
1. exporting the `DoclingDocument` to Markdown (or similar format) and then
performing user-defined chunking as a post-processing step, or
2. using native Docling chunkers, i.e. operating directly on the `DoclingDocument`
This page is about the latter, i.e. using native Docling chunkers.
For an example of using approach (1) check out e.g.
[this recipe](../examples/rag_langchain.ipynb) looking at the Markdown export mode.
A *chunker* is a Docling abstraction that, given a
[`DoclingDocument`](./docling_document.md), returns a stream of chunks, each of which
captures some part of the document as a string accompanied by respective metadata.
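As a concrete sketch of approach (2), a native chunker can be driven as follows (the converter call and source URL are illustrative assumptions, not part of this page):

```python
from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter

# Convert a source document, then chunk the resulting DoclingDocument natively.
doc = DocumentConverter().convert("https://arxiv.org/pdf/2501.17887").document
chunker = HybridChunker()
for chunk in chunker.chunk(dl_doc=doc):
    # Each chunk pairs a text string with metadata; serialize it for embedding.
    print(chunker.serialize(chunk=chunk))
```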

@@ -1,3 +1,7 @@
# WARNING
# This example demonstrates only how to develop a new enrichment model.
# It does not run the actual formula understanding model.
import logging
from pathlib import Path
from typing import Iterable

@@ -1,3 +1,7 @@
# WARNING
# This example demonstrates only how to develop a new enrichment model.
# It does not run the actual picture classifier model.
import logging
from pathlib import Path
from typing import Any, Iterable

@@ -83,7 +83,15 @@
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors\n"
]
}
],
"source": [
"from docling.chunking import HybridChunker\n",
"\n",
@@ -91,6 +99,13 @@
"chunk_iter = chunker.chunk(dl_doc=doc)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"> 👉 **NOTE**: As you see above, using the `HybridChunker` can sometimes lead to a warning from the transformers library, however this is a \"false alarm\" — for details check [here](https://ds4sd.github.io/docling/faq/#hybridchunker-triggers-warning-token-indices-sequence-length-is-longer-than-the-specified-maximum-sequence-length-for-this-model)."
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -337,11 +352,11 @@
"source": [
"for i, chunk in enumerate(chunks):\n",
" print(f\"=== {i} ===\")\n",
" txt_tokens = len(tokenizer.tokenize(chunk.text, max_length=None))\n",
" txt_tokens = len(tokenizer.tokenize(chunk.text))\n",
" print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n",
"\n",
" ser_txt = chunker.serialize(chunk=chunk)\n",
" ser_tokens = len(tokenizer.tokenize(ser_txt, max_length=None))\n",
" ser_tokens = len(tokenizer.tokenize(ser_txt))\n",
" print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n",
"\n",
" print()"

@@ -0,0 +1,96 @@
import json
import time
from pathlib import Path
import yaml
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
AcceleratorDevice,
VlmPipelineOptions,
granite_vision_vlm_conversion_options,
smoldocling_vlm_conversion_options,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline
sources = [
"tests/data/2305.03393v1-pg9-img.png",
]
## Use experimental VlmPipeline
pipeline_options = VlmPipelineOptions()
# If force_backend_text = True, text from backend will be used instead of generated text
pipeline_options.force_backend_text = False
## On GPU systems, enable flash_attention_2 with CUDA:
# pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
# pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
## Pick a VLM model. We choose SmolDocling-256M by default
pipeline_options.vlm_options = smoldocling_vlm_conversion_options
## Alternative VLM models:
# pipeline_options.vlm_options = granite_vision_vlm_conversion_options
from docling_core.types.doc import DocItemLabel, ImageRefMode
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
## Set up pipeline for PDF or image inputs
converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(
pipeline_cls=VlmPipeline,
pipeline_options=pipeline_options,
),
InputFormat.IMAGE: PdfFormatOption(
pipeline_cls=VlmPipeline,
pipeline_options=pipeline_options,
),
}
)
out_path = Path("scratch")
out_path.mkdir(parents=True, exist_ok=True)
for source in sources:
start_time = time.time()
print("================================================")
print("Processing... {}".format(source))
print("================================================")
print("")
res = converter.convert(source)
print("------------------------------------------------")
print("MD:")
print("------------------------------------------------")
print("")
print(res.document.export_to_markdown())
for page in res.pages:
print("")
print("Predicted page in DOCTAGS:")
print(page.predictions.vlm_response.text)
res.document.save_as_html(
filename=Path("{}/{}.html".format(out_path, res.input.file.stem)),
image_mode=ImageRefMode.REFERENCED,
labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
)
with (out_path / f"{res.input.file.stem}.json").open("w") as fp:
fp.write(json.dumps(res.document.export_to_dict()))
pg_num = res.document.num_pages()
print("")
inference_time = time.time() - start_time
print(
f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}"
)
print("================================================")
print("done!")
print("================================================")

@@ -132,9 +132,48 @@ This is a collection of FAQ collected from the user questions on <https://github
```
??? Some images are missing from MS Word and Powerpoint"
??? question "Some images are missing from MS Word and Powerpoint"
### Some images are missing from MS Word and Powerpoint
The image processing library used by Docling is able to handle embedded WMF images only on Windows platform.
If you are on other operating systems, these images will be ignored.
??? question "`HybridChunker` triggers warning: 'Token indices sequence length is longer than the specified maximum sequence length for this model'"
### `HybridChunker` triggers warning: 'Token indices sequence length is longer than the specified maximum sequence length for this model'
**TLDR**:
In the context of the `HybridChunker`, this is a known & anticipated "false alarm".
**Details**:
Using the [`HybridChunker`](../concepts/chunking.md#hybrid-chunker) often triggers a warning like this:
> Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors
This is a warning that is emitted by transformers, saying that actually *running this sequence through the model* will result in indexing errors, i.e. the problematic case is only if one indeed passes the particular sequence through the (embedding) model.
In our case though, this occurs as a "false alarm", since what happens is the following:
- the chunker invokes the tokenizer on a potentially long sequence (e.g. 530 tokens as mentioned in the warning) in order to count its tokens, i.e. to assess if it is short enough. At this point transformers already emits the warning above!
- whenever the sequence at hand is oversized, the chunker proceeds to split it (but the transformers warning has already been shown nonetheless)
What is important is the actual token length of the produced chunks.
The snippet below can be used for getting the actual maximum chunk size (for users wanting to confirm that this does not exceed the model limit):
```python
chunk_max_len = 0
for i, chunk in enumerate(chunks):
ser_txt = chunker.serialize(chunk=chunk)
ser_tokens = len(tokenizer.tokenize(ser_txt))
if ser_tokens > chunk_max_len:
chunk_max_len = ser_tokens
print(f"{i}\t{ser_tokens}\t{repr(ser_txt[:100])}...")
print(f"Longest chunk yielded: {chunk_max_len} tokens")
print(f"Model max length: {tokenizer.model_max_length}")
```
Also see [docling#725](https://github.com/DS4SD/docling/issues/725).
Source: Issue [docling-core#119](https://github.com/DS4SD/docling-core/issues/119)

@@ -47,6 +47,6 @@ Docling simplifies document processing, parsing diverse formats — including ad
Docling has been brought to you by IBM.
[supported_formats]: ./supported_formats.md
[supported_formats]: ./usage/supported_formats.md
[docling_document]: ./concepts/docling_document.md
[integrations]: ./integrations/index.md

docs/usage/enrichments.md (new file)

@@ -0,0 +1,216 @@
Docling allows enriching the conversion pipeline with additional steps which process specific document components,
e.g. code blocks, pictures, etc. The extra steps usually require extra model executions which may increase
the processing time considerably. For this reason most enrichment models are disabled by default.
The following table provides an overview of the default enrichment models available in Docling.
| Feature | Parameter | Processed item | Description |
| ------- | --------- | ---------------| ----------- |
| Code understanding | `do_code_enrichment` | `CodeItem` | See [docs below](#code-understanding). |
| Formula understanding | `do_formula_enrichment` | `TextItem` with label `FORMULA` | See [docs below](#formula-understanding). |
| Picture classification | `do_picture_classification` | `PictureItem` | See [docs below](#picture-classification). |
| Picture description | `do_picture_description` | `PictureItem` | See [docs below](#picture-description). |
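Multiple enrichments can be enabled on the same pipeline. Below is a minimal sketch combining two of the options from the table above; each enabled feature adds model executions and hence processing time (see the sections below for per-feature details):
```py
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat

pipeline_options = PdfPipelineOptions()
# Enable several enrichments at once; all of them are off by default.
pipeline_options.do_code_enrichment = True
pipeline_options.do_formula_enrichment = True

converter = DocumentConverter(format_options={
    InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
})
result = converter.convert("https://arxiv.org/pdf/2501.17887")
doc = result.document
```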
## Enrichments details
### Code understanding
The code understanding step enables advanced parsing for code blocks found in the document.
This enrichment model also sets the `code_language` property of the `CodeItem`.
Model specs: see the [`CodeFormula` model card](https://huggingface.co/ds4sd/CodeFormula).
Example command line:
```sh
docling --enrich-code FILE
```
Example code:
```py
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
pipeline_options = PdfPipelineOptions()
pipeline_options.do_code_enrichment = True
converter = DocumentConverter(format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
})
result = converter.convert("https://arxiv.org/pdf/2501.17887")
doc = result.document
```
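Once converted, the detected language can be read back from the document items. A small sketch for inspecting it, assuming the `CodeItem` type from `docling-core` and the `iterate_items` traversal of the `DoclingDocument`:
```py
from docling_core.types.doc import CodeItem

# Walk the document tree and print the language detected for each code block.
for item, _level in doc.iterate_items():
    if isinstance(item, CodeItem):
        print(f"{item.code_language}: {item.text[:60]!r}")
```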
### Formula understanding
The formula understanding step analyzes the equation formulas in documents and extracts their LaTeX representation.
The HTML export functions of the DoclingDocument leverage these representations and visualize the formulas using MathML syntax.
Model specs: see the [`CodeFormula` model card](https://huggingface.co/ds4sd/CodeFormula).
Example command line:
```sh
docling --enrich-formula FILE
```
Example code:
```py
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
pipeline_options = PdfPipelineOptions()
pipeline_options.do_formula_enrichment = True
converter = DocumentConverter(format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
})
result = converter.convert("https://arxiv.org/pdf/2501.17887")
doc = result.document
```
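To visualize the extracted formulas, the converted document can then be exported to HTML, where the LaTeX representations are rendered as MathML. A short sketch, assuming the `save_as_html` export of the `DoclingDocument`:
```py
from pathlib import Path

# The HTML export renders the enriched formulas using MathML.
doc.save_as_html(Path("formulas.html"))
```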
### Picture classification
The picture classification step classifies the `PictureItem` elements in the document with the `DocumentFigureClassifier` model.
This model is specialized to understand the classes of pictures found in documents, e.g. different chart types, flow diagrams,
logos, signatures, etc.
Model specs: see the [`DocumentFigureClassifier` model card](https://huggingface.co/ds4sd/DocumentFigureClassifier).
Example command line:
```sh
docling --enrich-picture-classes FILE
```
Example code:
```py
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
pipeline_options = PdfPipelineOptions()
pipeline_options.generate_picture_images = True
pipeline_options.images_scale = 2
pipeline_options.do_picture_classification = True
converter = DocumentConverter(format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
})
result = converter.convert("https://arxiv.org/pdf/2501.17887")
doc = result.document
```
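The predicted classes are stored as annotations on each `PictureItem`. A sketch for reading them back, assuming the `PictureClassificationData` annotation type from `docling-core`:
```py
from docling_core.types.doc.document import PictureClassificationData

for picture in doc.pictures:
    for annotation in picture.annotations:
        if isinstance(annotation, PictureClassificationData):
            # Classes are reported with a confidence score; take the top one.
            top = max(annotation.predicted_classes, key=lambda c: c.confidence)
            print(f"{picture.self_ref}: {top.class_name} ({top.confidence:.2f})")
```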
### Picture description
The picture description step allows annotating pictures with a vision model. This is also known as a "captioning" task.
The Docling pipeline allows loading and running models completely locally, as well as connecting to remote APIs which support the chat template.
Below are a few examples of how to use some common vision models and remote services.
```py
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat
pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
converter = DocumentConverter(format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
})
result = converter.convert("https://arxiv.org/pdf/2501.17887")
doc = result.document
```
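The generated captions are attached to each `PictureItem` as annotations. A sketch for printing them, assuming the `PictureDescriptionData` annotation type from `docling-core`:
```py
from docling_core.types.doc.document import PictureDescriptionData

for picture in doc.pictures:
    for annotation in picture.annotations:
        if isinstance(annotation, PictureDescriptionData):
            print(f"{picture.self_ref}: {annotation.text}")
```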
#### Granite Vision model
Model specs: see the [`ibm-granite/granite-vision-3.1-2b-preview` model card](https://huggingface.co/ibm-granite/granite-vision-3.1-2b-preview).
Usage in Docling:
```py
from docling.datamodel.pipeline_options import granite_picture_description
pipeline_options.picture_description_options = granite_picture_description
```
#### SmolVLM model
Model specs: see the [`HuggingFaceTB/SmolVLM-256M-Instruct` model card](https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct).
Usage in Docling:
```py
from docling.datamodel.pipeline_options import smolvlm_picture_description
pipeline_options.picture_description_options = smolvlm_picture_description
```
#### Other vision models
The option class `PictureDescriptionVlmOptions` allows using any other model from the Hugging Face Hub.
```py
from docling.datamodel.pipeline_options import PictureDescriptionVlmOptions
pipeline_options.picture_description_options = PictureDescriptionVlmOptions(
repo_id="", # <-- add here the Hugging Face repo_id of your favorite VLM
prompt="Describe the image in three sentences. Be consise and accurate.",
)
```
#### Remote vision model
The option class `PictureDescriptionApiOptions` allows using models hosted on remote platforms, e.g.
on local endpoints served by [VLLM](https://docs.vllm.ai), [Ollama](https://ollama.com/) and others,
or cloud providers like [IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai), etc.
_Note: in most cases this option will send your data to the remote service provider._
Usage in Docling:
```py
from docling.datamodel.pipeline_options import PictureDescriptionApiOptions
# Enable connections to remote services
pipeline_options.enable_remote_services = True  # <-- this is required!
# Example using a model running locally, e.g. via VLLM
# $ vllm serve MODEL_NAME
pipeline_options.picture_description_options = PictureDescriptionApiOptions(
url="http://localhost:8000/v1/chat/completions",
params=dict(
model="MODEL NAME",
seed=42,
max_completion_tokens=200,
),
prompt="Describe the image in three sentences. Be consise and accurate.",
timeout=90,
)
```
End-to-end code snippets for cloud providers are available in the examples section:
- [IBM watsonx.ai](../examples/pictures_description_api.py)
## Develop new enrichment models
Besides looking at the implementation of all the models listed above, the Docling documentation has a few examples
dedicated to the development of new enrichment models.
- [Develop picture enrichment](../examples/develop_picture_enrichment.py)
- [Develop formula enrichment](../examples/develop_formula_understanding.py)

View File

@ -22,7 +22,7 @@ A simple example would look like this:
docling https://arxiv.org/pdf/2206.01062
```
To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./reference/cli.md).
To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](../reference/cli.md).
### Advanced options
@ -104,7 +104,7 @@ The options in this list require the explicit `enable_remote_services=True` when
#### Adjust pipeline features
The example file [custom_convert.py](./examples/custom_convert.py) contains multiple ways
The example file [custom_convert.py](../examples/custom_convert.py) contains multiple ways
one can adjust the conversion pipeline and features.
##### Control PDF table extraction options
@ -183,13 +183,13 @@ You can limit the CPU threads used by Docling by setting the environment variabl
!!! note
This section discusses directly invoking a [backend](./concepts/architecture.md),
This section discusses directly invoking a [backend](../concepts/architecture.md),
i.e. using a low-level API. This should only be done when necessary. For most cases,
using a `DocumentConverter` (high-level API) as discussed in the sections above
should suffice, and is the recommended way.
By default, Docling will try to identify the document format to apply the appropriate conversion backend (see the list of [supported formats](./supported_formats.md)).
You can restrict the `DocumentConverter` to a set of allowed document formats, as shown in the [Multi-format conversion](./examples/run_with_formats.py) example.
By default, Docling will try to identify the document format to apply the appropriate conversion backend (see the list of [supported formats](../supported_formats.md)).
You can restrict the `DocumentConverter` to a set of allowed document formats, as shown in the [Multi-format conversion](../examples/run_with_formats.py) example.
Alternatively, you can also use the specific backend that matches your document content. For instance, you can use `HTMLDocumentBackend` for HTML pages:
```python
@ -214,9 +214,9 @@ print(dl_doc.export_to_markdown())
## Chunking
You can chunk a Docling document using a [chunker](concepts/chunking.md), such as a
You can chunk a Docling document using a [chunker](../concepts/chunking.md), such as a
`HybridChunker`, as shown below (for more details check out
[this example](examples/hybrid_chunking.ipynb)):
[this example](../examples/hybrid_chunking.ipynb)):
```python
from docling.document_converter import DocumentConverter

View File

@ -1,6 +1,6 @@
Docling can parse various documents formats into a unified representation (Docling
Document), which it can export to different formats too — check out
[Architecture](./concepts/architecture.md) for more details.
[Architecture](../concepts/architecture.md) for more details.
Below you can find a listing of all supported input and output formats.
@ -22,7 +22,7 @@ Schema-specific support:
|--------|-------------|
| USPTO XML | XML format followed by [USPTO](https://www.uspto.gov/patents) patents |
| JATS XML | XML format followed by [JATS](https://jats.nlm.nih.gov/) articles |
| Docling JSON | JSON-serialized [Docling Document](./concepts/docling_document.md) |
| Docling JSON | JSON-serialized [Docling Document](../concepts/docling_document.md) |
## Supported output formats

View File

@ -54,11 +54,14 @@ theme:
nav:
- Home:
- "Docling": index.md
- Installation: installation.md
- Usage: usage.md
- Supported formats: supported_formats.md
- FAQ: faq.md
- Docling v2: v2.md
- Installation:
- Installation: installation/index.md
- Usage:
- Usage: usage/index.md
- Supported formats: usage/supported_formats.md
- Enrichment features: usage/enrichments.md
- FAQ:
- FAQ: faq/index.md
- Concepts:
- Concepts: concepts/index.md
- Architecture: concepts/architecture.md
@ -72,11 +75,8 @@ nav:
- "Batch conversion": examples/batch_convert.py
- "Multi-format conversion": examples/run_with_formats.py
- "Figure export": examples/export_figures.py
- "Figure enrichment": examples/develop_picture_enrichment.py
- "Table export": examples/export_tables.py
- "Multimodal export": examples/export_multimodal.py
- "Annotate picture with local vlm": examples/pictures_description.ipynb
- "Annotate picture with remote vlm": examples/pictures_description_api.py
- "Force full page OCR": examples/full_page_ocr.py
- "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py
- "RapidOCR with custom OCR models": examples/rapidocr_with_custom_models.py
@ -90,6 +90,12 @@ nav:
- examples/rag_haystack.ipynb
- examples/rag_langchain.ipynb
- examples/rag_llamaindex.ipynb
- 🖼️ Picture annotation:
- "Annotate picture with local VLM": examples/pictures_description.ipynb
- "Annotate picture with remote VLM": examples/pictures_description_api.py
- ✨ Enrichment development:
- "Figure enrichment": examples/develop_picture_enrichment.py
- "Formula enrichment": examples/develop_formula_understanding.py
- 🗂️ More examples:
- examples/rag_weaviate.ipynb
- RAG with Granite [↗]: https://github.com/ibm-granite-community/granite-snack-cookbook/blob/main/recipes/RAG/Granite_Docling_RAG.ipynb

391
poetry.lock generated
View File

@ -1,104 +1,135 @@
# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.
[[package]]
name = "accelerate"
version = "1.4.0"
description = "Accelerate"
optional = true
python-versions = ">=3.9.0"
files = [
{file = "accelerate-1.4.0-py3-none-any.whl", hash = "sha256:f6e1e7dfaf9d799a20a1dc45efbf4b1546163eac133faa5acd0d89177c896e55"},
{file = "accelerate-1.4.0.tar.gz", hash = "sha256:37d413e1b64cb8681ccd2908ae211cf73e13e6e636a2f598a96eccaa538773a5"},
]
[package.dependencies]
huggingface-hub = ">=0.21.0"
numpy = ">=1.17,<3.0.0"
packaging = ">=20.0"
psutil = "*"
pyyaml = "*"
safetensors = ">=0.4.3"
torch = ">=2.0.0"
[package.extras]
deepspeed = ["deepspeed"]
dev = ["bitsandbytes", "black (>=23.1,<24.0)", "datasets", "diffusers", "evaluate", "hf-doc-builder (>=0.3.0)", "parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist", "rich", "ruff (>=0.6.4,<0.7.0)", "scikit-learn", "scipy", "timm", "torchdata (>=0.8.0)", "torchpippy (>=0.2.0)", "tqdm", "transformers"]
quality = ["black (>=23.1,<24.0)", "hf-doc-builder (>=0.3.0)", "ruff (>=0.6.4,<0.7.0)"]
rich = ["rich"]
sagemaker = ["sagemaker"]
test-dev = ["bitsandbytes", "datasets", "diffusers", "evaluate", "scikit-learn", "scipy", "timm", "torchdata (>=0.8.0)", "torchpippy (>=0.2.0)", "tqdm", "transformers"]
test-prod = ["parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist"]
test-trackers = ["comet-ml", "dvclive", "tensorboard", "wandb"]
testing = ["bitsandbytes", "datasets", "diffusers", "evaluate", "parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist", "scikit-learn", "scipy", "timm", "torchdata (>=0.8.0)", "torchpippy (>=0.2.0)", "tqdm", "transformers"]
[[package]]
name = "aiohappyeyeballs"
version = "2.4.6"
version = "2.4.8"
description = "Happy Eyeballs for asyncio"
optional = false
python-versions = ">=3.9"
files = [
{file = "aiohappyeyeballs-2.4.6-py3-none-any.whl", hash = "sha256:147ec992cf873d74f5062644332c539fcd42956dc69453fe5204195e560517e1"},
{file = "aiohappyeyeballs-2.4.6.tar.gz", hash = "sha256:9b05052f9042985d32ecbe4b59a77ae19c006a78f1344d7fdad69d28ded3d0b0"},
{file = "aiohappyeyeballs-2.4.8-py3-none-any.whl", hash = "sha256:6cac4f5dd6e34a9644e69cf9021ef679e4394f54e58a183056d12009e42ea9e3"},
{file = "aiohappyeyeballs-2.4.8.tar.gz", hash = "sha256:19728772cb12263077982d2f55453babd8bec6a052a926cd5c0c42796da8bf62"},
]
[[package]]
name = "aiohttp"
version = "3.11.12"
version = "3.11.13"
description = "Async http client/server framework (asyncio)"
optional = false
python-versions = ">=3.9"
files = [
{file = "aiohttp-3.11.12-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:aa8a8caca81c0a3e765f19c6953416c58e2f4cc1b84829af01dd1c771bb2f91f"},
{file = "aiohttp-3.11.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:84ede78acde96ca57f6cf8ccb8a13fbaf569f6011b9a52f870c662d4dc8cd854"},
{file = "aiohttp-3.11.12-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:584096938a001378484aa4ee54e05dc79c7b9dd933e271c744a97b3b6f644957"},
{file = "aiohttp-3.11.12-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:392432a2dde22b86f70dd4a0e9671a349446c93965f261dbaecfaf28813e5c42"},
{file = "aiohttp-3.11.12-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:88d385b8e7f3a870146bf5ea31786ef7463e99eb59e31db56e2315535d811f55"},
{file = "aiohttp-3.11.12-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b10a47e5390c4b30a0d58ee12581003be52eedd506862ab7f97da7a66805befb"},
{file = "aiohttp-3.11.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b5263dcede17b6b0c41ef0c3ccce847d82a7da98709e75cf7efde3e9e3b5cae"},
{file = "aiohttp-3.11.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:50c5c7b8aa5443304c55c262c5693b108c35a3b61ef961f1e782dd52a2f559c7"},
{file = "aiohttp-3.11.12-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d1c031a7572f62f66f1257db37ddab4cb98bfaf9b9434a3b4840bf3560f5e788"},
{file = "aiohttp-3.11.12-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:7e44eba534381dd2687be50cbd5f2daded21575242ecfdaf86bbeecbc38dae8e"},
{file = "aiohttp-3.11.12-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:145a73850926018ec1681e734cedcf2716d6a8697d90da11284043b745c286d5"},
{file = "aiohttp-3.11.12-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:2c311e2f63e42c1bf86361d11e2c4a59f25d9e7aabdbdf53dc38b885c5435cdb"},
{file = "aiohttp-3.11.12-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:ea756b5a7bac046d202a9a3889b9a92219f885481d78cd318db85b15cc0b7bcf"},
{file = "aiohttp-3.11.12-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:526c900397f3bbc2db9cb360ce9c35134c908961cdd0ac25b1ae6ffcaa2507ff"},
{file = "aiohttp-3.11.12-cp310-cp310-win32.whl", hash = "sha256:b8d3bb96c147b39c02d3db086899679f31958c5d81c494ef0fc9ef5bb1359b3d"},
{file = "aiohttp-3.11.12-cp310-cp310-win_amd64.whl", hash = "sha256:7fe3d65279bfbee8de0fb4f8c17fc4e893eed2dba21b2f680e930cc2b09075c5"},
{file = "aiohttp-3.11.12-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:87a2e00bf17da098d90d4145375f1d985a81605267e7f9377ff94e55c5d769eb"},
{file = "aiohttp-3.11.12-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b34508f1cd928ce915ed09682d11307ba4b37d0708d1f28e5774c07a7674cac9"},
{file = "aiohttp-3.11.12-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:936d8a4f0f7081327014742cd51d320296b56aa6d324461a13724ab05f4b2933"},
{file = "aiohttp-3.11.12-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2de1378f72def7dfb5dbd73d86c19eda0ea7b0a6873910cc37d57e80f10d64e1"},
{file = "aiohttp-3.11.12-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b9d45dbb3aaec05cf01525ee1a7ac72de46a8c425cb75c003acd29f76b1ffe94"},
{file = "aiohttp-3.11.12-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:930ffa1925393381e1e0a9b82137fa7b34c92a019b521cf9f41263976666a0d6"},
{file = "aiohttp-3.11.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8340def6737118f5429a5df4e88f440746b791f8f1c4ce4ad8a595f42c980bd5"},
{file = "aiohttp-3.11.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4016e383f91f2814e48ed61e6bda7d24c4d7f2402c75dd28f7e1027ae44ea204"},
{file = "aiohttp-3.11.12-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3c0600bcc1adfaaac321422d615939ef300df81e165f6522ad096b73439c0f58"},
{file = "aiohttp-3.11.12-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:0450ada317a65383b7cce9576096150fdb97396dcfe559109b403c7242faffef"},
{file = "aiohttp-3.11.12-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:850ff6155371fd802a280f8d369d4e15d69434651b844bde566ce97ee2277420"},
{file = "aiohttp-3.11.12-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:8fd12d0f989c6099e7b0f30dc6e0d1e05499f3337461f0b2b0dadea6c64b89df"},
{file = "aiohttp-3.11.12-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:76719dd521c20a58a6c256d058547b3a9595d1d885b830013366e27011ffe804"},
{file = "aiohttp-3.11.12-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:97fe431f2ed646a3b56142fc81d238abcbaff08548d6912acb0b19a0cadc146b"},
{file = "aiohttp-3.11.12-cp311-cp311-win32.whl", hash = "sha256:e10c440d142fa8b32cfdb194caf60ceeceb3e49807072e0dc3a8887ea80e8c16"},
{file = "aiohttp-3.11.12-cp311-cp311-win_amd64.whl", hash = "sha256:246067ba0cf5560cf42e775069c5d80a8989d14a7ded21af529a4e10e3e0f0e6"},
{file = "aiohttp-3.11.12-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e392804a38353900c3fd8b7cacbea5132888f7129f8e241915e90b85f00e3250"},
{file = "aiohttp-3.11.12-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:8fa1510b96c08aaad49303ab11f8803787c99222288f310a62f493faf883ede1"},
{file = "aiohttp-3.11.12-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dc065a4285307607df3f3686363e7f8bdd0d8ab35f12226362a847731516e42c"},
{file = "aiohttp-3.11.12-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cddb31f8474695cd61fc9455c644fc1606c164b93bff2490390d90464b4655df"},
{file = "aiohttp-3.11.12-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9dec0000d2d8621d8015c293e24589d46fa218637d820894cb7356c77eca3259"},
{file = "aiohttp-3.11.12-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e3552fe98e90fdf5918c04769f338a87fa4f00f3b28830ea9b78b1bdc6140e0d"},
{file = "aiohttp-3.11.12-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6dfe7f984f28a8ae94ff3a7953cd9678550dbd2a1f9bda5dd9c5ae627744c78e"},
{file = "aiohttp-3.11.12-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a481a574af914b6e84624412666cbfbe531a05667ca197804ecc19c97b8ab1b0"},
{file = "aiohttp-3.11.12-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1987770fb4887560363b0e1a9b75aa303e447433c41284d3af2840a2f226d6e0"},
{file = "aiohttp-3.11.12-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:a4ac6a0f0f6402854adca4e3259a623f5c82ec3f0c049374133bcb243132baf9"},
{file = "aiohttp-3.11.12-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c96a43822f1f9f69cc5c3706af33239489a6294be486a0447fb71380070d4d5f"},
{file = "aiohttp-3.11.12-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:a5e69046f83c0d3cb8f0d5bd9b8838271b1bc898e01562a04398e160953e8eb9"},
{file = "aiohttp-3.11.12-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:68d54234c8d76d8ef74744f9f9fc6324f1508129e23da8883771cdbb5818cbef"},
{file = "aiohttp-3.11.12-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c9fd9dcf9c91affe71654ef77426f5cf8489305e1c66ed4816f5a21874b094b9"},
{file = "aiohttp-3.11.12-cp312-cp312-win32.whl", hash = "sha256:0ed49efcd0dc1611378beadbd97beb5d9ca8fe48579fc04a6ed0844072261b6a"},
{file = "aiohttp-3.11.12-cp312-cp312-win_amd64.whl", hash = "sha256:54775858c7f2f214476773ce785a19ee81d1294a6bedc5cc17225355aab74802"},
{file = "aiohttp-3.11.12-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:413ad794dccb19453e2b97c2375f2ca3cdf34dc50d18cc2693bd5aed7d16f4b9"},
{file = "aiohttp-3.11.12-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4a93d28ed4b4b39e6f46fd240896c29b686b75e39cc6992692e3922ff6982b4c"},
{file = "aiohttp-3.11.12-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d589264dbba3b16e8951b6f145d1e6b883094075283dafcab4cdd564a9e353a0"},
{file = "aiohttp-3.11.12-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5148ca8955affdfeb864aca158ecae11030e952b25b3ae15d4e2b5ba299bad2"},
{file = "aiohttp-3.11.12-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:525410e0790aab036492eeea913858989c4cb070ff373ec3bc322d700bdf47c1"},
{file = "aiohttp-3.11.12-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9bd8695be2c80b665ae3f05cb584093a1e59c35ecb7d794d1edd96e8cc9201d7"},
{file = "aiohttp-3.11.12-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f0203433121484b32646a5f5ea93ae86f3d9559d7243f07e8c0eab5ff8e3f70e"},
{file = "aiohttp-3.11.12-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40cd36749a1035c34ba8d8aaf221b91ca3d111532e5ccb5fa8c3703ab1b967ed"},
{file = "aiohttp-3.11.12-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a7442662afebbf7b4c6d28cb7aab9e9ce3a5df055fc4116cc7228192ad6cb484"},
{file = "aiohttp-3.11.12-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:8a2fb742ef378284a50766e985804bd6adb5adb5aa781100b09befdbfa757b65"},
{file = "aiohttp-3.11.12-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2cee3b117a8d13ab98b38d5b6bdcd040cfb4181068d05ce0c474ec9db5f3c5bb"},
{file = "aiohttp-3.11.12-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f6a19bcab7fbd8f8649d6595624856635159a6527861b9cdc3447af288a00c00"},
{file = "aiohttp-3.11.12-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:e4cecdb52aaa9994fbed6b81d4568427b6002f0a91c322697a4bfcc2b2363f5a"},
{file = "aiohttp-3.11.12-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:30f546358dfa0953db92ba620101fefc81574f87b2346556b90b5f3ef16e55ce"},
{file = "aiohttp-3.11.12-cp313-cp313-win32.whl", hash = "sha256:ce1bb21fc7d753b5f8a5d5a4bae99566386b15e716ebdb410154c16c91494d7f"},
{file = "aiohttp-3.11.12-cp313-cp313-win_amd64.whl", hash = "sha256:f7914ab70d2ee8ab91c13e5402122edbc77821c66d2758abb53aabe87f013287"},
{file = "aiohttp-3.11.12-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7c3623053b85b4296cd3925eeb725e386644fd5bc67250b3bb08b0f144803e7b"},
{file = "aiohttp-3.11.12-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:67453e603cea8e85ed566b2700efa1f6916aefbc0c9fcb2e86aaffc08ec38e78"},
{file = "aiohttp-3.11.12-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6130459189e61baac5a88c10019b21e1f0c6d00ebc770e9ce269475650ff7f73"},
{file = "aiohttp-3.11.12-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9060addfa4ff753b09392efe41e6af06ea5dd257829199747b9f15bfad819460"},
{file = "aiohttp-3.11.12-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:34245498eeb9ae54c687a07ad7f160053911b5745e186afe2d0c0f2898a1ab8a"},
{file = "aiohttp-3.11.12-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8dc0fba9a74b471c45ca1a3cb6e6913ebfae416678d90529d188886278e7f3f6"},
{file = "aiohttp-3.11.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a478aa11b328983c4444dacb947d4513cb371cd323f3845e53caeda6be5589d5"},
{file = "aiohttp-3.11.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c160a04283c8c6f55b5bf6d4cad59bb9c5b9c9cd08903841b25f1f7109ef1259"},
{file = "aiohttp-3.11.12-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:edb69b9589324bdc40961cdf0657815df674f1743a8d5ad9ab56a99e4833cfdd"},
{file = "aiohttp-3.11.12-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:4ee84c2a22a809c4f868153b178fe59e71423e1f3d6a8cd416134bb231fbf6d3"},
{file = "aiohttp-3.11.12-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:bf4480a5438f80e0f1539e15a7eb8b5f97a26fe087e9828e2c0ec2be119a9f72"},
{file = "aiohttp-3.11.12-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:e6b2732ef3bafc759f653a98881b5b9cdef0716d98f013d376ee8dfd7285abf1"},
{file = "aiohttp-3.11.12-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:f752e80606b132140883bb262a457c475d219d7163d996dc9072434ffb0784c4"},
{file = "aiohttp-3.11.12-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:ab3247d58b393bda5b1c8f31c9edece7162fc13265334217785518dd770792b8"},
{file = "aiohttp-3.11.12-cp39-cp39-win32.whl", hash = "sha256:0d5176f310a7fe6f65608213cc74f4228e4f4ce9fd10bcb2bb6da8fc66991462"},
{file = "aiohttp-3.11.12-cp39-cp39-win_amd64.whl", hash = "sha256:74bd573dde27e58c760d9ca8615c41a57e719bff315c9adb6f2a4281a28e8798"},
{file = "aiohttp-3.11.12.tar.gz", hash = "sha256:7603ca26d75b1b86160ce1bbe2787a0b706e592af5b2504e12caa88a217767b0"},
{file = "aiohttp-3.11.13-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a4fe27dbbeec445e6e1291e61d61eb212ee9fed6e47998b27de71d70d3e8777d"},
{file = "aiohttp-3.11.13-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9e64ca2dbea28807f8484c13f684a2f761e69ba2640ec49dacd342763cc265ef"},
{file = "aiohttp-3.11.13-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9840be675de208d1f68f84d578eaa4d1a36eee70b16ae31ab933520c49ba1325"},
{file = "aiohttp-3.11.13-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28a772757c9067e2aee8a6b2b425d0efaa628c264d6416d283694c3d86da7689"},
{file = "aiohttp-3.11.13-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b88aca5adbf4625e11118df45acac29616b425833c3be7a05ef63a6a4017bfdb"},
{file = "aiohttp-3.11.13-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ce10ddfbe26ed5856d6902162f71b8fe08545380570a885b4ab56aecfdcb07f4"},
{file = "aiohttp-3.11.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa48dac27f41b36735c807d1ab093a8386701bbf00eb6b89a0f69d9fa26b3671"},
{file = "aiohttp-3.11.13-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:89ce611b1eac93ce2ade68f1470889e0173d606de20c85a012bfa24be96cf867"},
{file = "aiohttp-3.11.13-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:78e4dd9c34ec7b8b121854eb5342bac8b02aa03075ae8618b6210a06bbb8a115"},
{file = "aiohttp-3.11.13-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:66047eacbc73e6fe2462b77ce39fc170ab51235caf331e735eae91c95e6a11e4"},
{file = "aiohttp-3.11.13-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5ad8f1c19fe277eeb8bc45741c6d60ddd11d705c12a4d8ee17546acff98e0802"},
{file = "aiohttp-3.11.13-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:64815c6f02e8506b10113ddbc6b196f58dbef135751cc7c32136df27b736db09"},
{file = "aiohttp-3.11.13-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:967b93f21b426f23ca37329230d5bd122f25516ae2f24a9cea95a30023ff8283"},
{file = "aiohttp-3.11.13-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:cf1f31f83d16ec344136359001c5e871915c6ab685a3d8dee38e2961b4c81730"},
{file = "aiohttp-3.11.13-cp310-cp310-win32.whl", hash = "sha256:00c8ac69e259c60976aa2edae3f13d9991cf079aaa4d3cd5a49168ae3748dee3"},
{file = "aiohttp-3.11.13-cp310-cp310-win_amd64.whl", hash = "sha256:90d571c98d19a8b6e793b34aa4df4cee1e8fe2862d65cc49185a3a3d0a1a3996"},
{file = "aiohttp-3.11.13-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6b35aab22419ba45f8fc290d0010898de7a6ad131e468ffa3922b1b0b24e9d2e"},
{file = "aiohttp-3.11.13-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f81cba651db8795f688c589dd11a4fbb834f2e59bbf9bb50908be36e416dc760"},
{file = "aiohttp-3.11.13-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f55d0f242c2d1fcdf802c8fabcff25a9d85550a4cf3a9cf5f2a6b5742c992839"},
{file = "aiohttp-3.11.13-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c4bea08a6aad9195ac9b1be6b0c7e8a702a9cec57ce6b713698b4a5afa9c2e33"},
{file = "aiohttp-3.11.13-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c6070bcf2173a7146bb9e4735b3c62b2accba459a6eae44deea0eb23e0035a23"},
{file = "aiohttp-3.11.13-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:718d5deb678bc4b9d575bfe83a59270861417da071ab44542d0fcb6faa686636"},
{file = "aiohttp-3.11.13-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f6b2c5b4a4d22b8fb2c92ac98e0747f5f195e8e9448bfb7404cd77e7bfa243f"},
{file = "aiohttp-3.11.13-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:747ec46290107a490d21fe1ff4183bef8022b848cf9516970cb31de6d9460088"},
{file = "aiohttp-3.11.13-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:01816f07c9cc9d80f858615b1365f8319d6a5fd079cd668cc58e15aafbc76a54"},
{file = "aiohttp-3.11.13-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:a08ad95fcbd595803e0c4280671d808eb170a64ca3f2980dd38e7a72ed8d1fea"},
{file = "aiohttp-3.11.13-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:c97be90d70f7db3aa041d720bfb95f4869d6063fcdf2bb8333764d97e319b7d0"},
{file = "aiohttp-3.11.13-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:ab915a57c65f7a29353c8014ac4be685c8e4a19e792a79fe133a8e101111438e"},
{file = "aiohttp-3.11.13-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:35cda4e07f5e058a723436c4d2b7ba2124ab4e0aa49e6325aed5896507a8a42e"},
{file = "aiohttp-3.11.13-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:af55314407714fe77a68a9ccaab90fdb5deb57342585fd4a3a8102b6d4370080"},
{file = "aiohttp-3.11.13-cp311-cp311-win32.whl", hash = "sha256:42d689a5c0a0c357018993e471893e939f555e302313d5c61dfc566c2cad6185"},
{file = "aiohttp-3.11.13-cp311-cp311-win_amd64.whl", hash = "sha256:b73a2b139782a07658fbf170fe4bcdf70fc597fae5ffe75e5b67674c27434a9f"},
{file = "aiohttp-3.11.13-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:2eabb269dc3852537d57589b36d7f7362e57d1ece308842ef44d9830d2dc3c90"},
{file = "aiohttp-3.11.13-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7b77ee42addbb1c36d35aca55e8cc6d0958f8419e458bb70888d8c69a4ca833d"},
{file = "aiohttp-3.11.13-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55789e93c5ed71832e7fac868167276beadf9877b85697020c46e9a75471f55f"},
{file = "aiohttp-3.11.13-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c929f9a7249a11e4aa5c157091cfad7f49cc6b13f4eecf9b747104befd9f56f2"},
{file = "aiohttp-3.11.13-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d33851d85537bbf0f6291ddc97926a754c8f041af759e0aa0230fe939168852b"},
{file = "aiohttp-3.11.13-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9229d8613bd8401182868fe95688f7581673e1c18ff78855671a4b8284f47bcb"},
{file = "aiohttp-3.11.13-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:669dd33f028e54fe4c96576f406ebb242ba534dd3a981ce009961bf49960f117"},
{file = "aiohttp-3.11.13-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c1b20a1ace54af7db1f95af85da530fe97407d9063b7aaf9ce6a32f44730778"},
{file = "aiohttp-3.11.13-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5724cc77f4e648362ebbb49bdecb9e2b86d9b172c68a295263fa072e679ee69d"},
{file = "aiohttp-3.11.13-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:aa36c35e94ecdb478246dd60db12aba57cfcd0abcad43c927a8876f25734d496"},
{file = "aiohttp-3.11.13-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:9b5b37c863ad5b0892cc7a4ceb1e435e5e6acd3f2f8d3e11fa56f08d3c67b820"},
{file = "aiohttp-3.11.13-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:e06cf4852ce8c4442a59bae5a3ea01162b8fcb49ab438d8548b8dc79375dad8a"},
{file = "aiohttp-3.11.13-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:5194143927e494616e335d074e77a5dac7cd353a04755330c9adc984ac5a628e"},
{file = "aiohttp-3.11.13-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:afcb6b275c2d2ba5d8418bf30a9654fa978b4f819c2e8db6311b3525c86fe637"},
{file = "aiohttp-3.11.13-cp312-cp312-win32.whl", hash = "sha256:7104d5b3943c6351d1ad7027d90bdd0ea002903e9f610735ac99df3b81f102ee"},
{file = "aiohttp-3.11.13-cp312-cp312-win_amd64.whl", hash = "sha256:47dc018b1b220c48089b5b9382fbab94db35bef2fa192995be22cbad3c5730c8"},
{file = "aiohttp-3.11.13-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:9862d077b9ffa015dbe3ce6c081bdf35135948cb89116e26667dd183550833d1"},
{file = "aiohttp-3.11.13-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:fbfef0666ae9e07abfa2c54c212ac18a1f63e13e0760a769f70b5717742f3ece"},
{file = "aiohttp-3.11.13-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:93a1f7d857c4fcf7cabb1178058182c789b30d85de379e04f64c15b7e88d66fb"},
{file = "aiohttp-3.11.13-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ba40b7ae0f81c7029583a338853f6607b6d83a341a3dcde8bed1ea58a3af1df9"},
{file = "aiohttp-3.11.13-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b5b95787335c483cd5f29577f42bbe027a412c5431f2f80a749c80d040f7ca9f"},
{file = "aiohttp-3.11.13-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7d474c5c1f0b9405c1565fafdc4429fa7d986ccbec7ce55bc6a330f36409cad"},
{file = "aiohttp-3.11.13-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e83fb1991e9d8982b3b36aea1e7ad27ea0ce18c14d054c7a404d68b0319eebb"},
{file = "aiohttp-3.11.13-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4586a68730bd2f2b04a83e83f79d271d8ed13763f64b75920f18a3a677b9a7f0"},
{file = "aiohttp-3.11.13-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9fe4eb0e7f50cdb99b26250d9328faef30b1175a5dbcfd6d0578d18456bac567"},
{file = "aiohttp-3.11.13-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:2a8a6bc19818ac3e5596310ace5aa50d918e1ebdcc204dc96e2f4d505d51740c"},
{file = "aiohttp-3.11.13-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:7f27eec42f6c3c1df09cfc1f6786308f8b525b8efaaf6d6bd76c1f52c6511f6a"},
{file = "aiohttp-3.11.13-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:2a4a13dfbb23977a51853b419141cd0a9b9573ab8d3a1455c6e63561387b52ff"},
{file = "aiohttp-3.11.13-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:02876bf2f69b062584965507b07bc06903c2dc93c57a554b64e012d636952654"},
{file = "aiohttp-3.11.13-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b992778d95b60a21c4d8d4a5f15aaab2bd3c3e16466a72d7f9bfd86e8cea0d4b"},
{file = "aiohttp-3.11.13-cp313-cp313-win32.whl", hash = "sha256:507ab05d90586dacb4f26a001c3abf912eb719d05635cbfad930bdbeb469b36c"},
{file = "aiohttp-3.11.13-cp313-cp313-win_amd64.whl", hash = "sha256:5ceb81a4db2decdfa087381b5fc5847aa448244f973e5da232610304e199e7b2"},
{file = "aiohttp-3.11.13-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:51c3ff9c7a25f3cad5c09d9aacbc5aefb9267167c4652c1eb737989b554fe278"},
{file = "aiohttp-3.11.13-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e271beb2b1dabec5cd84eb488bdabf9758d22ad13471e9c356be07ad139b3012"},
{file = "aiohttp-3.11.13-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0e9eb7e5764abcb49f0e2bd8f5731849b8728efbf26d0cac8e81384c95acec3f"},
{file = "aiohttp-3.11.13-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:baae005092e3f200de02699314ac8933ec20abf998ec0be39448f6605bce93df"},
{file = "aiohttp-3.11.13-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1982c98ac62c132d2b773d50e2fcc941eb0b8bad3ec078ce7e7877c4d5a2dce7"},
{file = "aiohttp-3.11.13-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d2b25b2eeb35707113b2d570cadc7c612a57f1c5d3e7bb2b13870fe284e08fc0"},
{file = "aiohttp-3.11.13-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b27961d65639128336b7a7c3f0046dcc62a9443d5ef962e3c84170ac620cec47"},
{file = "aiohttp-3.11.13-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a01fe9f1e05025eacdd97590895e2737b9f851d0eb2e017ae9574d9a4f0b6252"},
{file = "aiohttp-3.11.13-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:fa1fb1b61881c8405829c50e9cc5c875bfdbf685edf57a76817dfb50643e4a1a"},
{file = "aiohttp-3.11.13-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:25de43bb3cf83ad83efc8295af7310219af6dbe4c543c2e74988d8e9c8a2a917"},
{file = "aiohttp-3.11.13-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:fe7065e2215e4bba63dc00db9ae654c1ba3950a5fff691475a32f511142fcddb"},
{file = "aiohttp-3.11.13-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:7836587eef675a17d835ec3d98a8c9acdbeb2c1d72b0556f0edf4e855a25e9c1"},
{file = "aiohttp-3.11.13-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:85fa0b18558eb1427090912bd456a01f71edab0872f4e0f9e4285571941e4090"},
{file = "aiohttp-3.11.13-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a86dc177eb4c286c19d1823ac296299f59ed8106c9536d2b559f65836e0fb2c6"},
{file = "aiohttp-3.11.13-cp39-cp39-win32.whl", hash = "sha256:684eea71ab6e8ade86b9021bb62af4bf0881f6be4e926b6b5455de74e420783a"},
{file = "aiohttp-3.11.13-cp39-cp39-win_amd64.whl", hash = "sha256:82c249f2bfa5ecbe4a1a7902c81c0fba52ed9ebd0176ab3047395d02ad96cfcb"},
{file = "aiohttp-3.11.13.tar.gz", hash = "sha256:8ce789231404ca8fff7f693cdce398abf6d90fd5dae2b1847477196c243b1fbb"},
]
[package.dependencies]
@ -187,8 +218,8 @@ files = [
lazy-object-proxy = ">=1.4.0"
typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""}
wrapt = [
{version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
{version = ">=1.11,<2", markers = "python_version < \"3.11\""},
{version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
]
[[package]]
@ -280,6 +311,24 @@ files = [
docs = ["furo", "jaraco.packaging (>=9.3)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
testing = ["jaraco.test", "pytest (!=8.0.*)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)"]
[[package]]
name = "backrefs"
version = "5.8"
description = "A wrapper around re and regex that adds additional back references."
optional = false
python-versions = ">=3.9"
files = [
{file = "backrefs-5.8-py310-none-any.whl", hash = "sha256:c67f6638a34a5b8730812f5101376f9d41dc38c43f1fdc35cb54700f6ed4465d"},
{file = "backrefs-5.8-py311-none-any.whl", hash = "sha256:2e1c15e4af0e12e45c8701bd5da0902d326b2e200cafcd25e49d9f06d44bb61b"},
{file = "backrefs-5.8-py312-none-any.whl", hash = "sha256:bbef7169a33811080d67cdf1538c8289f76f0942ff971222a16034da88a73486"},
{file = "backrefs-5.8-py313-none-any.whl", hash = "sha256:e3a63b073867dbefd0536425f43db618578528e3896fb77be7141328642a1585"},
{file = "backrefs-5.8-py39-none-any.whl", hash = "sha256:a66851e4533fb5b371aa0628e1fee1af05135616b86140c9d787a2ffdf4b8fdc"},
{file = "backrefs-5.8.tar.gz", hash = "sha256:2cab642a205ce966af3dd4b38ee36009b31fa9502a35fd61d59ccc116e40a6bd"},
]
[package.extras]
extras = ["regex"]
[[package]]
name = "beautifulsoup4"
version = "4.13.3"
@ -821,13 +870,13 @@ files = [
[[package]]
name = "docling-core"
version = "2.20.0"
version = "2.21.1"
description = "A python library to define and validate data types in Docling."
optional = false
python-versions = "<4.0,>=3.9"
files = [
{file = "docling_core-2.20.0-py3-none-any.whl", hash = "sha256:72f50fce277b7bb51f4134f443240c041582184305c3bcaabdea13fc5550f160"},
{file = "docling_core-2.20.0.tar.gz", hash = "sha256:9733581c15f5a9b5e3a6cb74fa995cc4078ff16668007f86c5f75d1ea9180d7f"},
{file = "docling_core-2.21.1-py3-none-any.whl", hash = "sha256:b8112915728cdc14f328f636f6c0ed36e6bbcc02ff940cc0bf85e303738671c3"},
{file = "docling_core-2.21.1.tar.gz", hash = "sha256:3ccc50197d24a3156cfc6c22c8404c58757749646d876a1c1c69fd800f664a4f"},
]
[package.dependencies]
@ -849,13 +898,13 @@ chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"]
[[package]]
name = "docling-ibm-models"
version = "3.4.0"
version = "3.4.1"
description = "This package contains the AI models used by the Docling PDF conversion package"
optional = false
python-versions = "<4.0,>=3.9"
files = [
{file = "docling_ibm_models-3.4.0-py3-none-any.whl", hash = "sha256:186517ff1f76e76113600fa1e5a699927325081a8013fdd5d0551121c2e34190"},
{file = "docling_ibm_models-3.4.0.tar.gz", hash = "sha256:fb79beeb07d1bb9bc8acf9d0a44643cd7ce1910aa418cd685e2e477b13eeafee"},
{file = "docling_ibm_models-3.4.1-py3-none-any.whl", hash = "sha256:c3582c99dddfa3f0eafcf80cf1267fd8efa39c4a74cc7a88f9dd49684fac2986"},
{file = "docling_ibm_models-3.4.1.tar.gz", hash = "sha256:093b4dff2ea284a4953c3aa009e29945208b8d389b94fb14940a03a93f673e96"},
]
[package.dependencies]
@ -1300,13 +1349,13 @@ test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit",
[[package]]
name = "griffe"
version = "1.5.7"
version = "1.6.0"
description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API."
optional = false
python-versions = ">=3.9"
files = [
{file = "griffe-1.5.7-py3-none-any.whl", hash = "sha256:4af8ec834b64de954d447c7b6672426bb145e71605c74a4e22d510cc79fe7d8b"},
{file = "griffe-1.5.7.tar.gz", hash = "sha256:465238c86deaf1137761f700fb343edd8ffc846d72f6de43c3c345ccdfbebe92"},
{file = "griffe-1.6.0-py3-none-any.whl", hash = "sha256:9f1dfe035d4715a244ed2050dfbceb05b1f470809ed4f6bb10ece5a7302f8dd1"},
{file = "griffe-1.6.0.tar.gz", hash = "sha256:eb5758088b9c73ad61c7ac014f3cdfb4c57b5c2fcbfca69996584b702aefa354"},
]
[package.dependencies]
@ -1787,18 +1836,18 @@ testing = ["Django", "attrs", "colorama", "docopt", "pytest (<9.0.0)"]
[[package]]
name = "jeepney"
version = "0.8.0"
version = "0.9.0"
description = "Low-level, pure Python DBus protocol wrapper."
optional = false
python-versions = ">=3.7"
files = [
{file = "jeepney-0.8.0-py3-none-any.whl", hash = "sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755"},
{file = "jeepney-0.8.0.tar.gz", hash = "sha256:5efe48d255973902f6badc3ce55e2aa6c5c3b3bc642059ef3a91247bcfcc5806"},
{file = "jeepney-0.9.0-py3-none-any.whl", hash = "sha256:97e5714520c16fc0a45695e5365a2e11b81ea79bba796e26f9f1d178cb182683"},
{file = "jeepney-0.9.0.tar.gz", hash = "sha256:cf0e9e845622b81e4a28df94c40345400256ec608d0e55bb8a3feaa9163f5732"},
]
[package.extras]
test = ["async-timeout", "pytest", "pytest-asyncio (>=0.17)", "pytest-trio", "testpath", "trio"]
trio = ["async_generator", "trio"]
trio = ["trio"]
[[package]]
name = "jinja2"
@ -2617,13 +2666,13 @@ min-versions = ["babel (==2.9.0)", "click (==7.0)", "colorama (==0.4)", "ghp-imp
[[package]]
name = "mkdocs-autorefs"
version = "1.3.1"
version = "1.4.0"
description = "Automatically link across pages in MkDocs."
optional = false
python-versions = ">=3.9"
files = [
{file = "mkdocs_autorefs-1.3.1-py3-none-any.whl", hash = "sha256:18c504ae4d3ee7f344369bb26cb31d4105569ee252aab7d75ec2734c2c8b0474"},
{file = "mkdocs_autorefs-1.3.1.tar.gz", hash = "sha256:a6d30cbcccae336d622a66c2418a3c92a8196b69782774529ad441abb23c0902"},
{file = "mkdocs_autorefs-1.4.0-py3-none-any.whl", hash = "sha256:bad19f69655878d20194acd0162e29a89c3f7e6365ffe54e72aa3fd1072f240d"},
{file = "mkdocs_autorefs-1.4.0.tar.gz", hash = "sha256:a9c0aa9c90edbce302c09d050a3c4cb7c76f8b7b2c98f84a7a05f53d00392156"},
]
[package.dependencies]
@ -2684,17 +2733,18 @@ pygments = ">2.12.0"
[[package]]
name = "mkdocs-material"
version = "9.6.5"
version = "9.6.7"
description = "Documentation that simply works"
optional = false
python-versions = ">=3.8"
files = [
{file = "mkdocs_material-9.6.5-py3-none-any.whl", hash = "sha256:aad3e6fb860c20870f75fb2a69ef901f1be727891e41adb60b753efcae19453b"},
{file = "mkdocs_material-9.6.5.tar.gz", hash = "sha256:b714679a8c91b0ffe2188e11ed58c44d2523e9c2ae26a29cc652fa7478faa21f"},
{file = "mkdocs_material-9.6.7-py3-none-any.whl", hash = "sha256:8a159e45e80fcaadd9fbeef62cbf928569b93df954d4dc5ba76d46820caf7b47"},
{file = "mkdocs_material-9.6.7.tar.gz", hash = "sha256:3e2c1fceb9410056c2d91f334a00cdea3215c28750e00c691c1e46b2a33309b4"},
]
[package.dependencies]
babel = ">=2.10,<3.0"
backrefs = ">=5.7.post1,<6.0"
colorama = ">=0.4,<1.0"
jinja2 = ">=3.0,<4.0"
markdown = ">=3.2,<4.0"
@ -2703,7 +2753,6 @@ mkdocs-material-extensions = ">=1.3,<2.0"
paginate = ">=0.5,<1.0"
pygments = ">=2.16,<3.0"
pymdown-extensions = ">=10.2,<11.0"
regex = ">=2022.4"
requests = ">=2.26,<3.0"
[package.extras]
@ -2791,8 +2840,8 @@ files = [
[package.dependencies]
multiprocess = [
{version = ">=0.70.15", optional = true, markers = "python_version >= \"3.11\" and extra == \"dill\""},
{version = "*", optional = true, markers = "python_version < \"3.11\" and extra == \"dill\""},
{version = ">=0.70.15", optional = true, markers = "python_version >= \"3.11\" and extra == \"dill\""},
]
pygments = ">=2.0"
pywin32 = {version = ">=301", markers = "platform_system == \"Windows\""}
@ -3144,35 +3193,35 @@ test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"]
[[package]]
name = "nh3"
version = "0.2.20"
version = "0.2.21"
description = "Python binding to Ammonia HTML sanitizer Rust crate"
optional = false
python-versions = ">=3.8"
files = [
{file = "nh3-0.2.20-cp313-cp313t-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:e1061a4ab6681f6bdf72b110eea0c4e1379d57c9de937db3be4202f7ad6043db"},
{file = "nh3-0.2.20-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb4254b1dac4a1ee49919a5b3f1caf9803ea8dada1816d9e8289e63d3cd0dd9a"},
{file = "nh3-0.2.20-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0ae9cbd713524cdb81e64663d0d6aae26f678db9f2cd9db0bf162606f1f9f20c"},
{file = "nh3-0.2.20-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:e1f7370b4e14cc03f5ae141ef30a1caf81fa5787711f80be9081418dd9eb79d2"},
{file = "nh3-0.2.20-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:ac4d27dc836a476efffc6eb661994426b8b805c951b29c9cf2ff36bc9ad58bc5"},
{file = "nh3-0.2.20-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:4fd2e9248725ebcedac3997a8d3da0d90a12a28c9179c6ba51f1658938ac30d0"},
{file = "nh3-0.2.20-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f7d564871833ddbe54df3aa59053b1110729d3a800cb7628ae8f42adb3d75208"},
{file = "nh3-0.2.20-cp313-cp313t-win32.whl", hash = "sha256:d2a176fd4306b6f0f178a3f67fac91bd97a3a8d8fafb771c9b9ef675ba5c8886"},
{file = "nh3-0.2.20-cp313-cp313t-win_amd64.whl", hash = "sha256:6ed834c68452a600f517dd3e1534dbfaff1f67f98899fecf139a055a25d99150"},
{file = "nh3-0.2.20-cp38-abi3-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:76e2f603b30c02ff6456b233a83fc377dedab6a50947b04e960a6b905637b776"},
{file = "nh3-0.2.20-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:181063c581defe683bd4bb78188ac9936d208aebbc74c7f7c16b6a32ae2ebb38"},
{file = "nh3-0.2.20-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:231addb7643c952cd6d71f1c8702d703f8fe34afcb20becb3efb319a501a12d7"},
{file = "nh3-0.2.20-cp38-abi3-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:1b9a8340a0aab991c68a5ca938d35ef4a8a3f4bf1b455da8855a40bee1fa0ace"},
{file = "nh3-0.2.20-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10317cd96fe4bbd4eb6b95f3920b71c902157ad44fed103fdcde43e3b8ee8be6"},
{file = "nh3-0.2.20-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8698db4c04b140800d1a1cd3067fda399e36e1e2b8fc1fe04292a907350a3e9b"},
{file = "nh3-0.2.20-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3eb04b9c3deb13c3a375ea39fd4a3c00d1f92e8fb2349f25f1e3e4506751774b"},
{file = "nh3-0.2.20-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:92f3f1c4f47a2c6f3ca7317b1d5ced05bd29556a75d3a4e2715652ae9d15c05d"},
{file = "nh3-0.2.20-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ddefa9fd6794a87e37d05827d299d4b53a3ec6f23258101907b96029bfef138a"},
{file = "nh3-0.2.20-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:ce3731c8f217685d33d9268362e5b4f770914e922bba94d368ab244a59a6c397"},
{file = "nh3-0.2.20-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:09f037c02fc2c43b211ff1523de32801dcfb0918648d8e651c36ef890f1731ec"},
{file = "nh3-0.2.20-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:813f1c8012dd64c990514b795508abb90789334f76a561fa0fd4ca32d2275330"},
{file = "nh3-0.2.20-cp38-abi3-win32.whl", hash = "sha256:47b2946c0e13057855209daeffb45dc910bd0c55daf10190bb0b4b60e2999784"},
{file = "nh3-0.2.20-cp38-abi3-win_amd64.whl", hash = "sha256:da87573f03084edae8eb87cfe811ec338606288f81d333c07d2a9a0b9b976c0b"},
{file = "nh3-0.2.20.tar.gz", hash = "sha256:9705c42d7ff88a0bea546c82d7fe5e59135e3d3f057e485394f491248a1f8ed5"},
{file = "nh3-0.2.21-cp313-cp313t-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:fcff321bd60c6c5c9cb4ddf2554e22772bb41ebd93ad88171bbbb6f271255286"},
{file = "nh3-0.2.21-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31eedcd7d08b0eae28ba47f43fd33a653b4cdb271d64f1aeda47001618348fde"},
{file = "nh3-0.2.21-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d426d7be1a2f3d896950fe263332ed1662f6c78525b4520c8e9861f8d7f0d243"},
{file = "nh3-0.2.21-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9d67709bc0d7d1f5797b21db26e7a8b3d15d21c9c5f58ccfe48b5328483b685b"},
{file = "nh3-0.2.21-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:55823c5ea1f6b267a4fad5de39bc0524d49a47783e1fe094bcf9c537a37df251"},
{file = "nh3-0.2.21-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:818f2b6df3763e058efa9e69677b5a92f9bc0acff3295af5ed013da544250d5b"},
{file = "nh3-0.2.21-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:b3b5c58161e08549904ac4abd450dacd94ff648916f7c376ae4b2c0652b98ff9"},
{file = "nh3-0.2.21-cp313-cp313t-win32.whl", hash = "sha256:637d4a10c834e1b7d9548592c7aad760611415fcd5bd346f77fd8a064309ae6d"},
{file = "nh3-0.2.21-cp313-cp313t-win_amd64.whl", hash = "sha256:713d16686596e556b65e7f8c58328c2df63f1a7abe1277d87625dcbbc012ef82"},
{file = "nh3-0.2.21-cp38-abi3-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:a772dec5b7b7325780922dd904709f0f5f3a79fbf756de5291c01370f6df0967"},
{file = "nh3-0.2.21-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d002b648592bf3033adfd875a48f09b8ecc000abd7f6a8769ed86b6ccc70c759"},
{file = "nh3-0.2.21-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2a5174551f95f2836f2ad6a8074560f261cf9740a48437d6151fd2d4d7d617ab"},
{file = "nh3-0.2.21-cp38-abi3-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:b8d55ea1fc7ae3633d758a92aafa3505cd3cc5a6e40470c9164d54dff6f96d42"},
{file = "nh3-0.2.21-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6ae319f17cd8960d0612f0f0ddff5a90700fa71926ca800e9028e7851ce44a6f"},
{file = "nh3-0.2.21-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:63ca02ac6f27fc80f9894409eb61de2cb20ef0a23740c7e29f9ec827139fa578"},
{file = "nh3-0.2.21-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5f77e62aed5c4acad635239ac1290404c7e940c81abe561fd2af011ff59f585"},
{file = "nh3-0.2.21-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:087ffadfdcd497658c3adc797258ce0f06be8a537786a7217649fc1c0c60c293"},
{file = "nh3-0.2.21-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ac7006c3abd097790e611fe4646ecb19a8d7f2184b882f6093293b8d9b887431"},
{file = "nh3-0.2.21-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:6141caabe00bbddc869665b35fc56a478eb774a8c1dfd6fba9fe1dfdf29e6efa"},
{file = "nh3-0.2.21-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:20979783526641c81d2f5bfa6ca5ccca3d1e4472474b162c6256745fbfe31cd1"},
{file = "nh3-0.2.21-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a7ea28cd49293749d67e4fcf326c554c83ec912cd09cd94aa7ec3ab1921c8283"},
{file = "nh3-0.2.21-cp38-abi3-win32.whl", hash = "sha256:6c9c30b8b0d291a7c5ab0967ab200598ba33208f754f2f4920e9343bdd88f79a"},
{file = "nh3-0.2.21-cp38-abi3-win_amd64.whl", hash = "sha256:bb0014948f04d7976aabae43fcd4cb7f551f9f8ce785a4c9ef66e6c2590f8629"},
{file = "nh3-0.2.21.tar.gz", hash = "sha256:4990e7ee6a55490dbf00d61a6f476c9a3258e31e711e13713b2ea7d6616f670e"},
]
[[package]]
@ -3801,10 +3850,10 @@ files = [
[package.dependencies]
numpy = [
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
{version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
]
@ -3827,10 +3876,10 @@ files = [
[package.dependencies]
numpy = [
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
{version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
]
@ -4016,9 +4065,9 @@ files = [
[package.dependencies]
numpy = [
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
{version = ">=1.22.4", markers = "python_version < \"3.11\""},
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
]
python-dateutil = ">=2.8.2"
pytz = ">=2020.1"
@ -4724,13 +4773,13 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"
[[package]]
name = "pydantic-settings"
version = "2.8.0"
version = "2.8.1"
description = "Settings management using Pydantic"
optional = false
python-versions = ">=3.8"
files = [
{file = "pydantic_settings-2.8.0-py3-none-any.whl", hash = "sha256:c782c7dc3fb40e97b238e713c25d26f64314aece2e91abcff592fcac15f71820"},
{file = "pydantic_settings-2.8.0.tar.gz", hash = "sha256:88e2ca28f6e68ea102c99c3c401d6c9078e68a5df600e97b43891c34e089500a"},
{file = "pydantic_settings-2.8.1-py3-none-any.whl", hash = "sha256:81942d5ac3d905f7f3ee1a70df5dfb62d5569c12f51a5a647defc1c3d9ee2e9c"},
{file = "pydantic_settings-2.8.1.tar.gz", hash = "sha256:d5c663dfbe9db9d5e1c646b2e161da12f0d734d422ee56f567d0ea2cee4e8585"},
]
[package.dependencies]
@ -4782,8 +4831,8 @@ files = [
astroid = ">=2.15.8,<=2.17.0-dev0"
colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
dill = [
{version = ">=0.3.6", markers = "python_version >= \"3.11\""},
{version = ">=0.2", markers = "python_version < \"3.11\""},
{version = ">=0.3.6", markers = "python_version >= \"3.11\""},
]
isort = ">=4.2.5,<6"
mccabe = ">=0.6,<0.8"
@ -5866,26 +5915,26 @@ files = [
[[package]]
name = "safetensors"
version = "0.5.2"
version = "0.5.3"
description = ""
optional = false
python-versions = ">=3.7"
files = [
{file = "safetensors-0.5.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:45b6092997ceb8aa3801693781a71a99909ab9cc776fbc3fa9322d29b1d3bef2"},
{file = "safetensors-0.5.2-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:6d0d6a8ee2215a440e1296b843edf44fd377b055ba350eaba74655a2fe2c4bae"},
{file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86016d40bcaa3bcc9a56cd74d97e654b5f4f4abe42b038c71e4f00a089c4526c"},
{file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:990833f70a5f9c7d3fc82c94507f03179930ff7d00941c287f73b6fcbf67f19e"},
{file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3dfa7c2f3fe55db34eba90c29df94bcdac4821043fc391cb5d082d9922013869"},
{file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:46ff2116150ae70a4e9c490d2ab6b6e1b1b93f25e520e540abe1b81b48560c3a"},
{file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ab696dfdc060caffb61dbe4066b86419107a24c804a4e373ba59be699ebd8d5"},
{file = "safetensors-0.5.2-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:03c937100f38c9ff4c1507abea9928a6a9b02c9c1c9c3609ed4fb2bf413d4975"},
{file = "safetensors-0.5.2-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:a00e737948791b94dad83cf0eafc09a02c4d8c2171a239e8c8572fe04e25960e"},
{file = "safetensors-0.5.2-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:d3a06fae62418ec8e5c635b61a8086032c9e281f16c63c3af46a6efbab33156f"},
{file = "safetensors-0.5.2-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:1506e4c2eda1431099cebe9abf6c76853e95d0b7a95addceaa74c6019c65d8cf"},
{file = "safetensors-0.5.2-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5c5b5d9da594f638a259fca766046f44c97244cc7ab8bef161b3e80d04becc76"},
{file = "safetensors-0.5.2-cp38-abi3-win32.whl", hash = "sha256:fe55c039d97090d1f85277d402954dd6ad27f63034fa81985a9cc59655ac3ee2"},
{file = "safetensors-0.5.2-cp38-abi3-win_amd64.whl", hash = "sha256:78abdddd03a406646107f973c7843276e7b64e5e32623529dc17f3d94a20f589"},
{file = "safetensors-0.5.2.tar.gz", hash = "sha256:cb4a8d98ba12fa016f4241932b1fc5e702e5143f5374bba0bbcf7ddc1c4cf2b8"},
{file = "safetensors-0.5.3-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:bd20eb133db8ed15b40110b7c00c6df51655a2998132193de2f75f72d99c7073"},
{file = "safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:21d01c14ff6c415c485616b8b0bf961c46b3b343ca59110d38d744e577f9cce7"},
{file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:11bce6164887cd491ca75c2326a113ba934be596e22b28b1742ce27b1d076467"},
{file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4a243be3590bc3301c821da7a18d87224ef35cbd3e5f5727e4e0728b8172411e"},
{file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8bd84b12b1670a6f8e50f01e28156422a2bc07fb16fc4e98bded13039d688a0d"},
{file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:391ac8cab7c829452175f871fcaf414aa1e292b5448bd02620f675a7f3e7abb9"},
{file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cead1fa41fc54b1e61089fa57452e8834f798cb1dc7a09ba3524f1eb08e0317a"},
{file = "safetensors-0.5.3-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1077f3e94182d72618357b04b5ced540ceb71c8a813d3319f1aba448e68a770d"},
{file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:799021e78287bac619c7b3f3606730a22da4cda27759ddf55d37c8db7511c74b"},
{file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:df26da01aaac504334644e1b7642fa000bfec820e7cef83aeac4e355e03195ff"},
{file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:32c3ef2d7af8b9f52ff685ed0bc43913cdcde135089ae322ee576de93eae5135"},
{file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:37f1521be045e56fc2b54c606d4455573e717b2d887c579ee1dbba5f868ece04"},
{file = "safetensors-0.5.3-cp38-abi3-win32.whl", hash = "sha256:cfc0ec0846dcf6763b0ed3d1846ff36008c6e7290683b61616c4b040f6a54ace"},
{file = "safetensors-0.5.3-cp38-abi3-win_amd64.whl", hash = "sha256:836cbbc320b47e80acd40e44c8682db0e8ad7123209f69b093def21ec7cafd11"},
{file = "safetensors-0.5.3.tar.gz", hash = "sha256:b6b0d6ecacec39a4fdd99cc19f4576f5219ce858e6fd8dbe7609df0b8dc56965"},
]
[package.dependencies]
@ -6182,13 +6231,13 @@ train = ["accelerate (>=0.20.3)", "datasets"]
[[package]]
name = "setuptools"
version = "75.8.0"
version = "75.8.2"
description = "Easily download, build, install, upgrade, and uninstall Python packages"
optional = false
python-versions = ">=3.9"
files = [
{file = "setuptools-75.8.0-py3-none-any.whl", hash = "sha256:e3982f444617239225d675215d51f6ba05f845d4eec313da4418fdbb56fb27e3"},
{file = "setuptools-75.8.0.tar.gz", hash = "sha256:c5afc8f407c626b8313a86e10311dd3f661c6cd9c09d4bf8c15c0e11f9f2b0e6"},
{file = "setuptools-75.8.2-py3-none-any.whl", hash = "sha256:558e47c15f1811c1fa7adbd0096669bf76c1d3f433f58324df69f3f5ecac4e8f"},
{file = "setuptools-75.8.2.tar.gz", hash = "sha256:4880473a969e5f23f2a2be3646b2dfd84af9028716d398e46192f84bc36900d2"},
]
[package.extras]
@ -7186,13 +7235,13 @@ files = [
[[package]]
name = "types-requests"
version = "2.32.0.20241016"
version = "2.32.0.20250301"
description = "Typing stubs for requests"
optional = false
python-versions = ">=3.8"
python-versions = ">=3.9"
files = [
{file = "types-requests-2.32.0.20241016.tar.gz", hash = "sha256:0d9cad2f27515d0e3e3da7134a1b6f28fb97129d86b867f24d9c726452634d95"},
{file = "types_requests-2.32.0.20241016-py3-none-any.whl", hash = "sha256:4195d62d6d3e043a4eaaf08ff8a62184584d2e8684e9d2aa178c7915a7da3747"},
{file = "types_requests-2.32.0.20250301-py3-none-any.whl", hash = "sha256:0003e0124e2cbefefb88222ff822b48616af40c74df83350f599a650c8de483b"},
{file = "types_requests-2.32.0.20250301.tar.gz", hash = "sha256:3d909dc4eaab159c0d964ebe8bfa326a7afb4578d8706408d417e17d61b0c500"},
]
[package.dependencies]
@ -7200,13 +7249,13 @@ urllib3 = ">=2"
[[package]]
name = "types-tqdm"
version = "4.67.0.20241221"
version = "4.67.0.20250301"
description = "Typing stubs for tqdm"
optional = false
python-versions = ">=3.8"
python-versions = ">=3.9"
files = [
{file = "types_tqdm-4.67.0.20241221-py3-none-any.whl", hash = "sha256:a1f1c9cda5c2d8482d2c73957a5398bfdedda10f6bc7b3b4e812d5c910486d29"},
{file = "types_tqdm-4.67.0.20241221.tar.gz", hash = "sha256:e56046631056922385abe89aeb18af5611f471eadd7918a0ad7f34d84cd4c8cc"},
{file = "types_tqdm-4.67.0.20250301-py3-none-any.whl", hash = "sha256:8af97deb8e6874af833555dc1fe0fcd456b1a789470bf6cd8813d4e7ee4f6c5b"},
{file = "types_tqdm-4.67.0.20250301.tar.gz", hash = "sha256:5e89a38ad89b867823368eb97d9f90d2fc69806bb055dde62716a05da62b5e0d"},
]
[package.dependencies]
@ -7797,9 +7846,9 @@ type = ["pytest-mypy"]
ocrmac = ["ocrmac"]
rapidocr = ["onnxruntime", "onnxruntime", "rapidocr-onnxruntime"]
tesserocr = ["tesserocr"]
vlm = ["transformers", "transformers"]
vlm = ["accelerate", "transformers", "transformers"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "d2f454a5f88192eeda1fd17a0e69c9fbd3590c46af548c5203a3dfff4e135978"
content-hash = "59424e63947e6c22fceab0a3fe6f1e9ebb72abfe369708a1f55d4daf2593f433"

View File

@ -1,6 +1,6 @@
[tool.poetry]
name = "docling"
version = "2.24.0" # DO NOT EDIT, updated automatically
version = "2.25.1" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
license = "MIT"
@ -58,10 +58,14 @@ onnxruntime = [
{ version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
]
transformers = [
{markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true },
{markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true }
]
accelerate = [
{markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^1.2.1", optional = true },
]
pillow = ">=10.0.0,<12.0.0"
tqdm = "^4.65.0"
pluggy = "^1.0.0"
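Note: the transformers and accelerate entries above are gated on PEP 508 environment markers, so the older ~4.42.0 transformers pin only applies on Intel macOS and accelerate is skipped there. A minimal sketch of how such markers evaluate, using the packaging library; the marker string is copied verbatim from the entries above:

    from packaging.markers import Marker

    # Marker from the transformers/accelerate entries above: true everywhere
    # except Intel macOS (sys_platform == 'darwin' and platform_machine == 'x86_64').
    m = Marker("sys_platform != 'darwin' or platform_machine != 'x86_64'")

    print(m.evaluate())  # evaluated against the current interpreter's environment
    # Override the environment to simulate Intel macOS -> False, so the
    # ^4.46.0 pin (and accelerate) would not apply there.
    print(m.evaluate({"sys_platform": "darwin", "platform_machine": "x86_64"}))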
@ -125,7 +129,7 @@ torchvision = [
[tool.poetry.extras]
tesserocr = ["tesserocr"]
ocrmac = ["ocrmac"]
vlm = ["transformers"]
vlm = ["transformers", "accelerate"]
rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
[tool.poetry.scripts]

View File

@ -1,8 +1,8 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Introduction
item-2 at level 2: paragraph: This is the first paragraph of the introduction.
item-2 at level 2: text: This is the first paragraph of the introduction.
item-3 at level 2: section_header: Background
item-4 at level 3: paragraph: Some background information here.
item-4 at level 3: text: Some background information here.
item-5 at level 3: picture
item-6 at level 3: list: group list
item-7 at level 4: list_item: First item in unordered list

View File

@ -88,7 +88,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "This is the first paragraph of the introduction.",
"text": "This is the first paragraph of the introduction."
@ -126,7 +126,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Some background information here.",
"text": "Some background information here."

View File

@ -1,8 +1,8 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Introduction
item-2 at level 2: paragraph: This is the first paragraph of the introduction.
item-2 at level 2: text: This is the first paragraph of the introduction.
item-3 at level 2: section_header: Background
item-4 at level 3: paragraph: Some background information here.
item-4 at level 3: text: Some background information here.
item-5 at level 3: list: group list
item-6 at level 4: list_item: First item in unordered list
item-7 at level 4: list_item: Second item in unordered list

View File

@ -88,7 +88,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "This is the first paragraph of the introduction.",
"text": "This is the first paragraph of the introduction."
@ -123,7 +123,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Some background information here.",
"text": "Some background information here."

View File

@ -1,9 +1,9 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: Example Document
item-2 at level 2: section_header: Introduction
item-3 at level 3: paragraph: This is the first paragraph of the introduction.
item-3 at level 3: text: This is the first paragraph of the introduction.
item-4 at level 2: section_header: Background
item-5 at level 3: paragraph: Some background information here.
item-5 at level 3: text: Some background information here.
item-6 at level 3: list: group list
item-7 at level 4: list_item: First item in unordered list
item-8 at level 5: list: group list

View File

@ -142,7 +142,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "This is the first paragraph of the introduction.",
"text": "This is the first paragraph of the introduction."
@ -177,7 +177,7 @@
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"label": "text",
"prov": [],
"orig": "Some background information here.",
"text": "Some background information here."

View File

@ -0,0 +1,7 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: text: This is a div with text.
item-2 at level 1: text: This is another div with text.
item-3 at level 1: text: This is a regular paragraph.
item-4 at level 1: text: This is a third div
with a new line.
item-5 at level 1: text: This is a fourth div with a bold paragraph.

View File

@ -0,0 +1,108 @@
{
"schema_name": "DoclingDocument",
"version": "1.1.0",
"name": "example_06",
"origin": {
"mimetype": "text/html",
"binary_hash": 14574683870626799530,
"filename": "example_06.html"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/1"
},
{
"$ref": "#/texts/2"
},
{
"$ref": "#/texts/3"
},
{
"$ref": "#/texts/4"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "This is a div with text.",
"text": "This is a div with text."
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "This is another div with text.",
"text": "This is another div with text."
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "This is a regular paragraph.",
"text": "This is a regular paragraph."
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "This is a third div\nwith a new line.",
"text": "This is a third div\nwith a new line."
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "This is a fourth div with a bold paragraph.",
"text": "This is a fourth div with a bold paragraph."
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}
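The groundtruth JSON above is a serialized DoclingDocument, so it round-trips through the docling-core pydantic model. A minimal sketch, assuming docling-core is installed and the payload is saved as example_06.json (the filename is an assumption taken from the "name" field):

    from docling_core.types.doc import DoclingDocument

    # Load the groundtruth payload back into the typed document model.
    with open("example_06.json") as f:  # filename assumed from the JSON "name" field
        doc = DoclingDocument.model_validate_json(f.read())

    # Each div now surfaces as a plain text item carrying the "text" label.
    for item in doc.texts:
        print(item.label, "->", item.text)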

View File

@ -0,0 +1,10 @@
This is a div with text.
This is another div with text.
This is a regular paragraph.
This is a third div
with a new line.
This is a fourth div with a bold paragraph.
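The plain-text groundtruth above is what the converter's text export produces for the div test case. A rough sketch of regenerating it, assuming docling is installed; the input path follows the repository's test-data layout and is an assumption:

    from docling.document_converter import DocumentConverter

    converter = DocumentConverter()
    # Path assumed; the HTML fixture matching this groundtruth.
    result = converter.convert("tests/data/html/example_06.html")
    print(result.document.export_to_text())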

View File

@ -1,474 +1,416 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: list: group list
item-2 at level 2: list_item: Main page
item-3 at level 2: list_item: Contents
item-4 at level 2: list_item: Current events
item-5 at level 2: list_item: Random article
item-6 at level 2: list_item: About Wikipedia
item-7 at level 2: list_item: Contact us
item-8 at level 1: list: group list
item-9 at level 2: list_item: Help
item-10 at level 2: list_item: Learn to edit
item-11 at level 2: list_item: Community portal
item-12 at level 2: list_item: Recent changes
item-13 at level 2: list_item: Upload file
item-14 at level 1: picture
item-15 at level 1: picture
item-16 at level 1: picture
item-17 at level 1: list: group list
item-18 at level 1: list: group list
item-19 at level 2: list_item: Donate
item-20 at level 1: list: group list
item-21 at level 1: list: group list
item-22 at level 2: list_item: Create account
item-23 at level 2: list_item: Log in
item-24 at level 1: list: group list
item-25 at level 2: list_item: Create account
item-26 at level 2: list_item: Log in
item-27 at level 1: list: group list
item-28 at level 2: list_item: Contributions
item-29 at level 2: list_item: Talk
item-30 at level 1: section: group header-1
item-31 at level 2: section_header: Contents
item-32 at level 3: list: group list
item-33 at level 4: list_item: (Top)
item-34 at level 4: list_item: 1 Etymology
item-35 at level 5: list: group list
item-36 at level 4: list_item: 2 Taxonomy
item-37 at level 5: list: group list
item-38 at level 4: list_item: 3 Morphology
item-39 at level 5: list: group list
item-40 at level 4: list_item: 4 Distribution and habitat
item-41 at level 5: list: group list
item-42 at level 4: list_item: 5 Behaviour Toggle Behaviour subsection
item-43 at level 5: list: group list
item-44 at level 6: list_item: 5.1 Feeding
item-45 at level 7: list: group list
item-46 at level 6: list_item: 5.2 Breeding
item-47 at level 7: list: group list
item-48 at level 6: list_item: 5.3 Communication
item-49 at level 7: list: group list
item-50 at level 6: list_item: 5.4 Predators
item-51 at level 7: list: group list
item-52 at level 4: list_item: 6 Relationship with humans Toggle Relationship with humans subsection
item-53 at level 5: list: group list
item-54 at level 6: list_item: 6.1 Hunting
item-55 at level 7: list: group list
item-56 at level 6: list_item: 6.2 Domestication
item-57 at level 7: list: group list
item-58 at level 6: list_item: 6.3 Heraldry
item-59 at level 7: list: group list
item-60 at level 6: list_item: 6.4 Cultural references
item-61 at level 7: list: group list
item-62 at level 4: list_item: 7 See also
item-63 at level 5: list: group list
item-64 at level 4: list_item: 8 Notes Toggle Notes subsection
item-65 at level 5: list: group list
item-66 at level 6: list_item: 8.1 Citations
item-67 at level 7: list: group list
item-68 at level 6: list_item: 8.2 Sources
item-69 at level 7: list: group list
item-70 at level 4: list_item: 9 External links
item-71 at level 5: list: group list
item-72 at level 1: title: Duck
item-73 at level 2: list: group list
item-74 at level 3: list_item: Acèh
item-75 at level 3: list_item: Afrikaans
item-76 at level 3: list_item: Alemannisch
item-77 at level 3: list_item: አማርኛ
item-78 at level 3: list_item: Ænglisc
item-79 at level 3: list_item: العربية
item-80 at level 3: list_item: Aragonés
item-81 at level 3: list_item: ܐܪܡܝܐ
item-82 at level 3: list_item: Armãneashti
item-83 at level 3: list_item: Asturianu
item-84 at level 3: list_item: Atikamekw
item-85 at level 3: list_item: Авар
item-86 at level 3: list_item: Aymar aru
item-87 at level 3: list_item: تۆرکجه
item-88 at level 3: list_item: Basa Bali
item-89 at level 3: list_item: বাংলা
item-90 at level 3: list_item: 閩南語 / Bân-lâm-gú
item-91 at level 3: list_item: Беларуская
item-92 at level 3: list_item: Беларуская (тарашкевіца)
item-93 at level 3: list_item: Bikol Central
item-94 at level 3: list_item: Български
item-95 at level 3: list_item: Brezhoneg
item-96 at level 3: list_item: Буряад
item-97 at level 3: list_item: Català
item-98 at level 3: list_item: Чӑвашла
item-99 at level 3: list_item: Čeština
item-100 at level 3: list_item: ChiShona
item-101 at level 3: list_item: Cymraeg
item-102 at level 3: list_item: Dagbanli
item-103 at level 3: list_item: Dansk
item-104 at level 3: list_item: Deitsch
item-105 at level 3: list_item: Deutsch
item-106 at level 3: list_item: डोटेली
item-107 at level 3: list_item: Ελληνικά
item-108 at level 3: list_item: Emiliàn e rumagnòl
item-109 at level 3: list_item: Español
item-110 at level 3: list_item: Esperanto
item-111 at level 3: list_item: Euskara
item-112 at level 3: list_item: فارسی
item-113 at level 3: list_item: Français
item-114 at level 3: list_item: Gaeilge
item-115 at level 3: list_item: Galego
item-116 at level 3: list_item: ГӀалгӀай
item-117 at level 3: list_item: 贛語
item-118 at level 3: list_item: گیلکی
item-119 at level 3: list_item: 𐌲𐌿𐍄𐌹𐍃𐌺
item-120 at level 3: list_item: गोंयची कोंकणी / Gõychi Konknni
item-121 at level 3: list_item: 客家語 / Hak-kâ-ngî
item-122 at level 3: list_item: 한국어
item-123 at level 3: list_item: Hausa
item-124 at level 3: list_item: Հայերեն
item-125 at level 3: list_item: हिन्दी
item-126 at level 3: list_item: Hrvatski
item-127 at level 3: list_item: Ido
item-128 at level 3: list_item: Bahasa Indonesia
item-129 at level 3: list_item: Iñupiatun
item-130 at level 3: list_item: Íslenska
item-131 at level 3: list_item: Italiano
item-132 at level 3: list_item: עברית
item-133 at level 3: list_item: Jawa
item-134 at level 3: list_item: ಕನ್ನಡ
item-135 at level 3: list_item: Kapampangan
item-136 at level 3: list_item: ქართული
item-137 at level 3: list_item: कॉशुर / کٲشُر
item-138 at level 3: list_item: Қазақша
item-139 at level 3: list_item: Ikirundi
item-140 at level 3: list_item: Kongo
item-141 at level 3: list_item: Kreyòl ayisyen
item-142 at level 3: list_item: Кырык мары
item-143 at level 3: list_item: ລາວ
item-144 at level 3: list_item: Latina
item-145 at level 3: list_item: Latviešu
item-146 at level 3: list_item: Lietuvių
item-147 at level 3: list_item: Li Niha
item-148 at level 3: list_item: Ligure
item-149 at level 3: list_item: Limburgs
item-150 at level 3: list_item: Lingála
item-151 at level 3: list_item: Malagasy
item-152 at level 3: list_item: മലയാളം
item-153 at level 3: list_item: मराठी
item-154 at level 3: list_item: مازِرونی
item-155 at level 3: list_item: Bahasa Melayu
item-156 at level 3: list_item: ꯃꯤꯇꯩ ꯂꯣꯟ
item-157 at level 3: list_item: 閩東語 / Mìng-dĕ̤ng-ngṳ̄
item-158 at level 3: list_item: Мокшень
item-159 at level 3: list_item: Монгол
item-160 at level 3: list_item: မြန်မာဘာသာ
item-161 at level 3: list_item: Nederlands
item-162 at level 3: list_item: Nedersaksies
item-163 at level 3: list_item: नेपाली
item-164 at level 3: list_item: नेपाल भाषा
item-165 at level 3: list_item: 日本語
item-166 at level 3: list_item: Нохчийн
item-167 at level 3: list_item: Norsk nynorsk
item-168 at level 3: list_item: Occitan
item-169 at level 3: list_item: Oromoo
item-170 at level 3: list_item: ਪੰਜਾਬੀ
item-171 at level 3: list_item: Picard
item-172 at level 3: list_item: Plattdüütsch
item-173 at level 3: list_item: Polski
item-174 at level 3: list_item: Português
item-175 at level 3: list_item: Qırımtatarca
item-176 at level 3: list_item: Română
item-177 at level 3: list_item: Русский
item-178 at level 3: list_item: Саха тыла
item-179 at level 3: list_item: ᱥᱟᱱᱛᱟᱲᱤ
item-180 at level 3: list_item: Sardu
item-181 at level 3: list_item: Scots
item-182 at level 3: list_item: Seeltersk
item-183 at level 3: list_item: Shqip
item-184 at level 3: list_item: Sicilianu
item-185 at level 3: list_item: සිංහල
item-186 at level 3: list_item: Simple English
item-187 at level 3: list_item: سنڌي
item-188 at level 3: list_item: کوردی
item-189 at level 3: list_item: Српски / srpski
item-190 at level 3: list_item: Srpskohrvatski / српскохрватски
item-191 at level 3: list_item: Sunda
item-192 at level 3: list_item: Svenska
item-193 at level 3: list_item: Tagalog
item-194 at level 3: list_item: தமிழ்
item-195 at level 3: list_item: Taqbaylit
item-196 at level 3: list_item: Татарча / tatarça
item-197 at level 3: list_item: ไทย
item-198 at level 3: list_item: Türkçe
item-199 at level 3: list_item: Українська
item-200 at level 3: list_item: ئۇيغۇرچە / Uyghurche
item-201 at level 3: list_item: Vahcuengh
item-202 at level 3: list_item: Tiếng Việt
item-203 at level 3: list_item: Walon
item-204 at level 3: list_item: 文言
item-205 at level 3: list_item: Winaray
item-206 at level 3: list_item: 吴语
item-207 at level 3: list_item: 粵語
item-208 at level 3: list_item: Žemaitėška
item-209 at level 3: list_item: 中文
item-210 at level 2: list: group list
item-211 at level 3: list_item: Article
item-212 at level 3: list_item: Talk
item-213 at level 2: list: group list
item-214 at level 2: list: group list
item-215 at level 3: list_item: Read
item-216 at level 3: list_item: View source
item-217 at level 3: list_item: View history
item-218 at level 2: list: group list
item-219 at level 3: list_item: Read
item-220 at level 3: list_item: View source
item-221 at level 3: list_item: View history
item-222 at level 2: list: group list
item-223 at level 3: list_item: What links here
item-224 at level 3: list_item: Related changes
item-225 at level 3: list_item: Upload file
item-226 at level 3: list_item: Special pages
item-227 at level 3: list_item: Permanent link
item-228 at level 3: list_item: Page information
item-229 at level 3: list_item: Cite this page
item-230 at level 3: list_item: Get shortened URL
item-231 at level 3: list_item: Download QR code
item-232 at level 3: list_item: Wikidata item
item-233 at level 2: list: group list
item-234 at level 3: list_item: Download as PDF
item-235 at level 3: list_item: Printable version
item-236 at level 2: list: group list
item-237 at level 3: list_item: Wikimedia Commons
item-238 at level 3: list_item: Wikiquote
item-239 at level 2: picture
item-240 at level 2: table with [13x2]
item-241 at level 2: paragraph: Duck is the common name for nume ... und in both fresh water and sea water.
item-242 at level 2: paragraph: Ducks are sometimes confused wit ... divers, grebes, gallinules and coots.
item-243 at level 2: section_header: Etymology
item-244 at level 3: paragraph: The word duck comes from Old Eng ... h duiken and German tauchen 'to dive'.
item-245 at level 3: picture
item-245 at level 4: caption: Pacific black duck displaying the characteristic upending "duck"
item-246 at level 3: paragraph: This word replaced Old English e ... nskrit ātí 'water bird', among others.
item-247 at level 3: paragraph: A duckling is a young duck in do ... , is sometimes labelled as a duckling.
item-248 at level 3: paragraph: A male is called a drake and the ... a duck, or in ornithology a hen.[3][4]
item-249 at level 3: picture
item-249 at level 4: caption: Male mallard.
item-250 at level 3: picture
item-250 at level 4: caption: Wood ducks.
item-251 at level 2: section_header: Taxonomy
item-252 at level 3: paragraph: All ducks belong to the biologic ... ationships between various species.[9]
item-253 at level 3: picture
item-253 at level 4: caption: Mallard landing in approach
item-254 at level 3: paragraph: In most modern classifications, ... all size and stiff, upright tails.[14]
item-255 at level 3: paragraph: A number of other species called ... shelducks in the tribe Tadornini.[15]
item-256 at level 2: section_header: Morphology
item-257 at level 3: picture
item-257 at level 4: caption: Male Mandarin duck
item-258 at level 3: paragraph: The overall body plan of ducks i ... is moult typically precedes migration.
item-259 at level 3: paragraph: The drakes of northern species o ... rkscrew shaped vagina to prevent rape.
item-260 at level 2: section_header: Distribution and habitat
item-261 at level 3: picture
item-261 at level 4: caption: Flying steamer ducks in Ushuaia, Argentina
item-262 at level 3: paragraph: Ducks have a cosmopolitan distri ... endemic to such far-flung islands.[21]
item-263 at level 3: picture
item-263 at level 4: caption: Female mallard in Cornwall, England
item-264 at level 3: paragraph: Some duck species, mainly those ... t form after localised heavy rain.[23]
item-265 at level 2: section_header: Behaviour
item-266 at level 3: section_header: Feeding
item-267 at level 4: picture
item-267 at level 5: caption: Pecten along the bill
item-268 at level 4: picture
item-268 at level 5: caption: Mallard duckling preening
item-269 at level 4: paragraph: Ducks eat food sources such as g ... amphibians, worms, and small molluscs.
item-270 at level 4: paragraph: Dabbling ducks feed on the surfa ... thers and to hold slippery food items.
item-271 at level 4: paragraph: Diving ducks and sea ducks forag ... ave more difficulty taking off to fly.
item-272 at level 4: paragraph: A few specialized species such a ... apted to catch and swallow large fish.
item-273 at level 4: paragraph: The others have the characterist ... e nostrils come out through hard horn.
item-274 at level 4: paragraph: The Guardian published an articl ... the ducks and pollutes waterways.[25]
item-275 at level 3: section_header: Breeding
item-276 at level 4: picture
item-276 at level 5: caption: A Muscovy duckling
item-277 at level 4: paragraph: Ducks generally only have one pa ... st and led her ducklings to water.[28]
item-278 at level 3: section_header: Communication
item-279 at level 4: paragraph: Female mallard ducks (as well as ... laying calls or quieter contact calls.
item-280 at level 4: paragraph: A common urban legend claims tha ... annel television show MythBusters.[32]
item-281 at level 3: section_header: Predators
item-282 at level 4: picture
item-282 at level 5: caption: Ringed teal
item-283 at level 4: paragraph: Ducks have many predators. Duckl ... or large birds, such as hawks or owls.
item-284 at level 4: paragraph: Adult ducks are fast fliers, but ... its speed and strength to catch ducks.
item-285 at level 2: section_header: Relationship with humans
item-286 at level 3: section_header: Hunting
item-287 at level 4: paragraph: Humans have hunted ducks since p ... evidence of this is uncommon.[35][42]
item-288 at level 4: paragraph: In many areas, wild ducks (inclu ... inated by pollutants such as PCBs.[44]
item-289 at level 3: section_header: Domestication
item-290 at level 4: picture
item-290 at level 5: caption: Indian Runner ducks, a common breed of domestic ducks
item-291 at level 4: paragraph: Ducks have many economic uses, b ... it weighs less than 1 kg (2.2 lb).[48]
item-292 at level 3: section_header: Heraldry
item-293 at level 4: picture
item-293 at level 5: caption: Three black-colored ducks in the coat of arms of Maaninka[49]
item-294 at level 4: paragraph: Ducks appear on several coats of ... the coat of arms of Föglö (Åland).[51]
item-295 at level 3: section_header: Cultural references
item-296 at level 4: paragraph: In 2002, psychologist Richard Wi ... 54] and was made into a movie in 1986.
item-297 at level 4: paragraph: The 1992 Disney film The Mighty ... Ducks minor league baseball team.[55]
item-298 at level 2: section_header: See also
item-299 at level 3: list: group list
item-300 at level 4: list_item: Birds portal
item-301 at level 3: list: group list
item-302 at level 4: list_item: Domestic duck
item-303 at level 4: list_item: Duck as food
item-304 at level 4: list_item: Duck test
item-305 at level 4: list_item: Duck breeds
item-306 at level 4: list_item: Fictional ducks
item-307 at level 4: list_item: Rubber duck
item-308 at level 2: section_header: Notes
item-309 at level 3: section_header: Citations
item-310 at level 4: ordered_list: group ordered list
item-311 at level 5: list_item: ^ "Duckling". The American Herit ... n Company. 2006. Retrieved 2015-05-22.
item-312 at level 5: list_item: ^ "Duckling". Kernerman English ... Ltd. 2000–2006. Retrieved 2015-05-22.
item-313 at level 5: list_item: ^ Dohner, Janet Vorwald (2001). ... University Press. ISBN 978-0300138139.
item-314 at level 5: list_item: ^ Visca, Curt; Visca, Kelley (20 ... Publishing Group. ISBN 9780823961566.
item-315 at level 5: list_item: ^ a b c d Carboneras 1992, p. 536.
item-316 at level 5: list_item: ^ Livezey 1986, pp. 737–738.
item-317 at level 5: list_item: ^ Madsen, McHugh & de Kloet 1988, p. 452.
item-318 at level 5: list_item: ^ Donne-Goussé, Laudet & Hänni 2002, pp. 353–354.
item-319 at level 5: list_item: ^ a b c d e f Carboneras 1992, p. 540.
item-320 at level 5: list_item: ^ Elphick, Dunning & Sibley 2001, p. 191.
item-321 at level 5: list_item: ^ Kear 2005, p. 448.
item-322 at level 5: list_item: ^ Kear 2005, p. 622–623.
item-323 at level 5: list_item: ^ Kear 2005, p. 686.
item-324 at level 5: list_item: ^ Elphick, Dunning & Sibley 2001, p. 193.
item-325 at level 5: list_item: ^ a b c d e f g Carboneras 1992, p. 537.
item-326 at level 5: list_item: ^ American Ornithologists' Union 1998, p. xix.
item-327 at level 5: list_item: ^ American Ornithologists' Union 1998.
item-328 at level 5: list_item: ^ Carboneras 1992, p. 538.
item-329 at level 5: list_item: ^ Christidis & Boles 2008, p. 62.
item-330 at level 5: list_item: ^ Shirihai 2008, pp. 239, 245.
item-331 at level 5: list_item: ^ a b Pratt, Bruner & Berrett 1987, pp. 98–107.
item-332 at level 5: list_item: ^ Fitter, Fitter & Hosking 2000, pp. 523.
item-333 at level 5: list_item: ^ "Pacific Black Duck". www.wiresnr.org. Retrieved 2018-04-27.
item-334 at level 5: list_item: ^ Ogden, Evans. "Dabbling Ducks". CWE. Retrieved 2006-11-02.
item-335 at level 5: list_item: ^ Karl Mathiesen (16 March 2015) ... Guardian. Retrieved 13 November 2016.
item-336 at level 5: list_item: ^ Rohwer, Frank C.; Anderson, Mi ... 4615-6787-5_4. ISBN 978-1-4615-6789-9.
item-337 at level 5: list_item: ^ Smith, Cyndi M.; Cooke, Fred; ... 093/condor/102.1.201. hdl:10315/13797.
item-338 at level 5: list_item: ^ "If You Find An Orphaned Duckl ... l on 2018-09-23. Retrieved 2018-12-22.
item-339 at level 5: list_item: ^ Carver, Heather (2011). The Du ...  9780557901562.[self-published source]
item-340 at level 5: list_item: ^ Titlow, Budd (2013-09-03). Bir ... man & Littlefield. ISBN 9780762797707.
item-341 at level 5: list_item: ^ Amos, Jonathan (2003-09-08). " ... kers". BBC News. Retrieved 2006-11-02.
item-342 at level 5: list_item: ^ "Mythbusters Episode 8". 12 December 2003.
item-343 at level 5: list_item: ^ Erlandson 1994, p. 171.
item-344 at level 5: list_item: ^ Jeffries 2008, pp. 168, 243.
item-345 at level 5: list_item: ^ a b Sued-Badillo 2003, p. 65.
item-346 at level 5: list_item: ^ Thorpe 1996, p. 68.
item-347 at level 5: list_item: ^ Maisels 1999, p. 42.
item-348 at level 5: list_item: ^ Rau 1876, p. 133.
item-349 at level 5: list_item: ^ Higman 2012, p. 23.
item-350 at level 5: list_item: ^ Hume 2012, p. 53.
item-351 at level 5: list_item: ^ Hume 2012, p. 52.
item-352 at level 5: list_item: ^ Fieldhouse 2002, p. 167.
item-353 at level 5: list_item: ^ Livingston, A. D. (1998-01-01) ... Editions, Limited. ISBN 9781853263774.
item-354 at level 5: list_item: ^ "Study plan for waterfowl inju ... on 2022-10-09. Retrieved 2 July 2019.
item-355 at level 5: list_item: ^ "FAOSTAT". www.fao.org. Retrieved 2019-10-25.
item-356 at level 5: list_item: ^ "Anas platyrhynchos, Domestic ... . Digimorph.org. Retrieved 2012-12-23.
item-357 at level 5: list_item: ^ Sy Montgomery. "Mallard; Encyc ... Britannica.com. Retrieved 2012-12-23.
item-358 at level 5: list_item: ^ Glenday, Craig (2014). Guinnes ... ited. pp. 135. ISBN 978-1-908843-15-9.
item-359 at level 5: list_item: ^ Suomen kunnallisvaakunat (in F ... tto. 1982. p. 147. ISBN 951-773-085-3.
item-360 at level 5: list_item: ^ "Lubānas simbolika" (in Latvian). Retrieved September 9, 2021.
item-361 at level 5: list_item: ^ "Föglö" (in Swedish). Retrieved September 9, 2021.
item-362 at level 5: list_item: ^ Young, Emma. "World's funniest ... w Scientist. Retrieved 7 January 2019.
item-363 at level 5: list_item: ^ "Howard the Duck (character)". Grand Comics Database.
item-364 at level 5: list_item: ^ Sanderson, Peter; Gilbert, Lau ... luding this bad-tempered talking duck.
item-365 at level 5: list_item: ^ "The Duck". University of Oregon Athletics. Retrieved 2022-01-20.
item-366 at level 3: section_header: Sources
item-367 at level 4: list: group list
item-368 at level 5: list_item: American Ornithologists' Union ( ... (PDF) from the original on 2022-10-09.
item-369 at level 5: list_item: Carboneras, Carlos (1992). del H ... Lynx Edicions. ISBN 978-84-87334-10-8.
item-370 at level 5: list_item: Christidis, Les; Boles, Walter E ... ro Publishing. ISBN 978-0-643-06511-6.
item-371 at level 5: list_item: Donne-Goussé, Carole; Laudet, Vi ... /S1055-7903(02)00019-2. PMID 12099792.
item-372 at level 5: list_item: Elphick, Chris; Dunning, John B. ... istopher Helm. ISBN 978-0-7136-6250-4.
item-373 at level 5: list_item: Erlandson, Jon M. (1994). Early ... usiness Media. ISBN 978-1-4419-3231-0.
item-374 at level 5: list_item: Fieldhouse, Paul (2002). Food, F ... ara: ABC-CLIO. ISBN 978-1-61069-412-4.
item-375 at level 5: list_item: Fitter, Julian; Fitter, Daniel; ... versity Press. ISBN 978-0-691-10295-5.
item-376 at level 5: list_item: Higman, B. W. (2012). How Food M ... Wiley & Sons. ISBN 978-1-4051-8947-7.
item-377 at level 5: list_item: Hume, Julian H. (2012). Extinct ... istopher Helm. ISBN 978-1-4729-3744-5.
item-378 at level 5: list_item: Jeffries, Richard (2008). Holoce ... Alabama Press. ISBN 978-0-8173-1658-7.
item-379 at level 5: list_item: Kear, Janet, ed. (2005). Ducks, ... versity Press. ISBN 978-0-19-861009-0.
item-380 at level 5: list_item: Livezey, Bradley C. (October 198 ... (PDF) from the original on 2022-10-09.
item-381 at level 5: list_item: Madsen, Cort S.; McHugh, Kevin P ... (PDF) from the original on 2022-10-09.
item-382 at level 5: list_item: Maisels, Charles Keith (1999). E ... on: Routledge. ISBN 978-0-415-10975-8.
item-383 at level 5: list_item: Pratt, H. Douglas; Bruner, Phill ... University Press. ISBN 0-691-02399-9.
item-384 at level 5: list_item: Rau, Charles (1876). Early Man i ... ork: Harper & Brothers. LCCN 05040168.
item-385 at level 5: list_item: Shirihai, Hadoram (2008). A Comp ... versity Press. ISBN 978-0-691-13666-0.
item-386 at level 5: list_item: Sued-Badillo, Jalil (2003). Auto ... Paris: UNESCO. ISBN 978-92-3-103832-7.
item-387 at level 5: list_item: Thorpe, I. J. (1996). The Origin ... rk: Routledge. ISBN 978-0-415-08009-5.
item-388 at level 2: section_header: External links
item-1 at level 1: title: Duck
item-2 at level 2: list: group list
item-3 at level 3: list_item: Acèh
item-4 at level 3: list_item: Afrikaans
item-5 at level 3: list_item: Alemannisch
item-6 at level 3: list_item: አማርኛ
item-7 at level 3: list_item: Ænglisc
item-8 at level 3: list_item: العربية
item-9 at level 3: list_item: Aragonés
item-10 at level 3: list_item: ܐܪܡܝܐ
item-11 at level 3: list_item: Armãneashti
item-12 at level 3: list_item: Asturianu
item-13 at level 3: list_item: Atikamekw
item-14 at level 3: list_item: Авар
item-15 at level 3: list_item: Aymar aru
item-16 at level 3: list_item: تۆرکجه
item-17 at level 3: list_item: Basa Bali
item-18 at level 3: list_item: বাংলা
item-19 at level 3: list_item: 閩南語 / Bân-lâm-gú
item-20 at level 3: list_item: Беларуская
item-21 at level 3: list_item: Беларуская (тарашкевіца)
item-22 at level 3: list_item: Bikol Central
item-23 at level 3: list_item: Български
item-24 at level 3: list_item: Brezhoneg
item-25 at level 3: list_item: Буряад
item-26 at level 3: list_item: Català
item-27 at level 3: list_item: Чӑвашла
item-28 at level 3: list_item: Čeština
item-29 at level 3: list_item: ChiShona
item-30 at level 3: list_item: Cymraeg
item-31 at level 3: list_item: Dagbanli
item-32 at level 3: list_item: Dansk
item-33 at level 3: list_item: Deitsch
item-34 at level 3: list_item: Deutsch
item-35 at level 3: list_item: डोटेली
item-36 at level 3: list_item: Ελληνικά
item-37 at level 3: list_item: Emiliàn e rumagnòl
item-38 at level 3: list_item: Español
item-39 at level 3: list_item: Esperanto
item-40 at level 3: list_item: Euskara
item-41 at level 3: list_item: فارسی
item-42 at level 3: list_item: Français
item-43 at level 3: list_item: Gaeilge
item-44 at level 3: list_item: Galego
item-45 at level 3: list_item: ГӀалгӀай
item-46 at level 3: list_item: 贛語
item-47 at level 3: list_item: گیلکی
item-48 at level 3: list_item: 𐌲𐌿𐍄𐌹𐍃𐌺
item-49 at level 3: list_item: गोंयची कोंकणी / Gõychi Konknni
item-50 at level 3: list_item: 客家語 / Hak-kâ-ngî
item-51 at level 3: list_item: 한국어
item-52 at level 3: list_item: Hausa
item-53 at level 3: list_item: Հայերեն
item-54 at level 3: list_item: हिन्दी
item-55 at level 3: list_item: Hrvatski
item-56 at level 3: list_item: Ido
item-57 at level 3: list_item: Bahasa Indonesia
item-58 at level 3: list_item: Iñupiatun
item-59 at level 3: list_item: Íslenska
item-60 at level 3: list_item: Italiano
item-61 at level 3: list_item: עברית
item-62 at level 3: list_item: Jawa
item-63 at level 3: list_item: ಕನ್ನಡ
item-64 at level 3: list_item: Kapampangan
item-65 at level 3: list_item: ქართული
item-66 at level 3: list_item: कॉशुर / کٲشُر
item-67 at level 3: list_item: Қазақша
item-68 at level 3: list_item: Ikirundi
item-69 at level 3: list_item: Kongo
item-70 at level 3: list_item: Kreyòl ayisyen
item-71 at level 3: list_item: Кырык мары
item-72 at level 3: list_item: ລາວ
item-73 at level 3: list_item: Latina
item-74 at level 3: list_item: Latviešu
item-75 at level 3: list_item: Lietuvių
item-76 at level 3: list_item: Li Niha
item-77 at level 3: list_item: Ligure
item-78 at level 3: list_item: Limburgs
item-79 at level 3: list_item: Lingála
item-80 at level 3: list_item: Malagasy
item-81 at level 3: list_item: മലയാളം
item-82 at level 3: list_item: मराठी
item-83 at level 3: list_item: مازِرونی
item-84 at level 3: list_item: Bahasa Melayu
item-85 at level 3: list_item: ꯃꯤꯇꯩ ꯂꯣꯟ
item-86 at level 3: list_item: 閩東語 / Mìng-dĕ̤ng-ngṳ̄
item-87 at level 3: list_item: Мокшень
item-88 at level 3: list_item: Монгол
item-89 at level 3: list_item: မြန်မာဘာသာ
item-90 at level 3: list_item: Nederlands
item-91 at level 3: list_item: Nedersaksies
item-92 at level 3: list_item: नेपाली
item-93 at level 3: list_item: नेपाल भाषा
item-94 at level 3: list_item: 日本語
item-95 at level 3: list_item: Нохчийн
item-96 at level 3: list_item: Norsk nynorsk
item-97 at level 3: list_item: Occitan
item-98 at level 3: list_item: Oromoo
item-99 at level 3: list_item: ਪੰਜਾਬੀ
item-100 at level 3: list_item: Picard
item-101 at level 3: list_item: Plattdüütsch
item-102 at level 3: list_item: Polski
item-103 at level 3: list_item: Português
item-104 at level 3: list_item: Qırımtatarca
item-105 at level 3: list_item: Română
item-106 at level 3: list_item: Русский
item-107 at level 3: list_item: Саха тыла
item-108 at level 3: list_item: ᱥᱟᱱᱛᱟᱲᱤ
item-109 at level 3: list_item: Sardu
item-110 at level 3: list_item: Scots
item-111 at level 3: list_item: Seeltersk
item-112 at level 3: list_item: Shqip
item-113 at level 3: list_item: Sicilianu
item-114 at level 3: list_item: සිංහල
item-115 at level 3: list_item: Simple English
item-116 at level 3: list_item: سنڌي
item-117 at level 3: list_item: کوردی
item-118 at level 3: list_item: Српски / srpski
item-119 at level 3: list_item: Srpskohrvatski / српскохрватски
item-120 at level 3: list_item: Sunda
item-121 at level 3: list_item: Svenska
item-122 at level 3: list_item: Tagalog
item-123 at level 3: list_item: தமிழ்
item-124 at level 3: list_item: Taqbaylit
item-125 at level 3: list_item: Татарча / tatarça
item-126 at level 3: list_item: ไทย
item-127 at level 3: list_item: Türkçe
item-128 at level 3: list_item: Українська
item-129 at level 3: list_item: ئۇيغۇرچە / Uyghurche
item-130 at level 3: list_item: Vahcuengh
item-131 at level 3: list_item: Tiếng Việt
item-132 at level 3: list_item: Walon
item-133 at level 3: list_item: 文言
item-134 at level 3: list_item: Winaray
item-135 at level 3: list_item: 吴语
item-136 at level 3: list_item: 粵語
item-137 at level 3: list_item: Žemaitėška
item-138 at level 3: list_item: 中文
item-139 at level 2: list: group list
item-140 at level 3: list_item: Article
item-141 at level 3: list_item: Talk
item-142 at level 2: list: group list
item-143 at level 2: list: group list
item-144 at level 3: list_item: Read
item-145 at level 3: list_item: View source
item-146 at level 3: list_item: View history
item-147 at level 2: text: Tools
item-148 at level 2: text: Actions
item-149 at level 2: list: group list
item-150 at level 3: list_item: Read
item-151 at level 3: list_item: View source
item-152 at level 3: list_item: View history
item-153 at level 2: text: General
item-154 at level 2: list: group list
item-155 at level 3: list_item: What links here
item-156 at level 3: list_item: Related changes
item-157 at level 3: list_item: Upload file
item-158 at level 3: list_item: Special pages
item-159 at level 3: list_item: Permanent link
item-160 at level 3: list_item: Page information
item-161 at level 3: list_item: Cite this page
item-162 at level 3: list_item: Get shortened URL
item-163 at level 3: list_item: Download QR code
item-164 at level 3: list_item: Wikidata item
item-165 at level 2: text: Print/export
item-166 at level 2: list: group list
item-167 at level 3: list_item: Download as PDF
item-168 at level 3: list_item: Printable version
item-169 at level 2: text: In other projects
item-170 at level 2: list: group list
item-171 at level 3: list_item: Wikimedia Commons
item-172 at level 3: list_item: Wikiquote
item-173 at level 2: text: Appearance
item-174 at level 2: picture
item-175 at level 2: text: From Wikipedia, the free encyclopedia
item-176 at level 2: text: Common name for many species of bird
item-177 at level 2: text: This article is about the bird. ... as a food, see . For other uses, see .
item-178 at level 2: text: "Duckling" redirects here. For other uses, see .
item-179 at level 2: table with [13x2]
item-180 at level 2: text: Duck is the common name for nume ... und in both fresh water and sea water.
item-181 at level 2: text: Ducks are sometimes confused wit ... divers, grebes, gallinules and coots.
item-182 at level 2: section_header: Etymology
item-183 at level 3: text: The word duck comes from Old Eng ... h duiken and German tauchen 'to dive'.
item-184 at level 3: picture
item-184 at level 4: caption: Pacific black duck displaying the characteristic upending "duck"
item-185 at level 3: text: This word replaced Old English e ... nskrit ātí 'water bird', among others.
item-186 at level 3: text: A duckling is a young duck in do ... , is sometimes labelled as a duckling.
item-187 at level 3: text: A male is called a drake and the ... a duck, or in ornithology a hen.[3][4]
item-188 at level 3: picture
item-188 at level 4: caption: Male mallard.
item-189 at level 3: picture
item-189 at level 4: caption: Wood ducks.
item-190 at level 2: section_header: Taxonomy
item-191 at level 3: text: All ducks belong to the biologic ... ationships between various species.[9]
item-192 at level 3: picture
item-192 at level 4: caption: Mallard landing in approach
item-193 at level 3: text: In most modern classifications, ... all size and stiff, upright tails.[14]
item-194 at level 3: text: A number of other species called ... shelducks in the tribe Tadornini.[15]
item-195 at level 2: section_header: Morphology
item-196 at level 3: picture
item-196 at level 4: caption: Male Mandarin duck
item-197 at level 3: text: The overall body plan of ducks i ... is moult typically precedes migration.
item-198 at level 3: text: The drakes of northern species o ... rkscrew shaped vagina to prevent rape.
item-199 at level 2: section_header: Distribution and habitat
item-200 at level 3: picture
item-200 at level 4: caption: Flying steamer ducks in Ushuaia, Argentina
item-201 at level 3: text: Ducks have a cosmopolitan distri ... endemic to such far-flung islands.[21]
item-202 at level 3: picture
item-202 at level 4: caption: Female mallard in Cornwall, England
item-203 at level 3: text: Some duck species, mainly those ... t form after localised heavy rain.[23]
item-204 at level 2: section_header: Behaviour
item-205 at level 3: section_header: Feeding
item-206 at level 4: picture
item-206 at level 5: caption: Pecten along the bill
item-207 at level 4: picture
item-207 at level 5: caption: Mallard duckling preening
item-208 at level 4: text: Ducks eat food sources such as g ... amphibians, worms, and small molluscs.
item-209 at level 4: text: Dabbling ducks feed on the surfa ... thers and to hold slippery food items.
item-210 at level 4: text: Diving ducks and sea ducks forag ... ave more difficulty taking off to fly.
item-211 at level 4: text: A few specialized species such a ... apted to catch and swallow large fish.
item-212 at level 4: text: The others have the characterist ... e nostrils come out through hard horn.
item-213 at level 4: text: The Guardian published an articl ... the ducks and pollutes waterways.[25]
item-214 at level 3: section_header: Breeding
item-215 at level 4: picture
item-215 at level 5: caption: A Muscovy duckling
item-216 at level 4: text: Ducks generally only have one pa ... st and led her ducklings to water.[28]
item-217 at level 3: section_header: Communication
item-218 at level 4: text: Female mallard ducks (as well as ... laying calls or quieter contact calls.
item-219 at level 4: text: A common urban legend claims tha ... annel television show MythBusters.[32]
item-220 at level 3: section_header: Predators
item-221 at level 4: picture
item-221 at level 5: caption: Ringed teal
item-222 at level 4: text: Ducks have many predators. Duckl ... or large birds, such as hawks or owls.
item-223 at level 4: text: Adult ducks are fast fliers, but ... its speed and strength to catch ducks.
item-224 at level 2: section_header: Relationship with humans
item-225 at level 3: section_header: Hunting
item-226 at level 4: text: Humans have hunted ducks since p ... evidence of this is uncommon.[35][42]
item-227 at level 4: text: In many areas, wild ducks (inclu ... inated by pollutants such as PCBs.[44]
item-228 at level 3: section_header: Domestication
item-229 at level 4: picture
item-229 at level 5: caption: Indian Runner ducks, a common breed of domestic ducks
item-230 at level 4: text: Ducks have many economic uses, b ... it weighs less than 1 kg (2.2 lb).[48]
item-231 at level 3: section_header: Heraldry
item-232 at level 4: picture
item-232 at level 5: caption: Three black-colored ducks in the coat of arms of Maaninka[49]
item-233 at level 4: text: Ducks appear on several coats of ... the coat of arms of Föglö (Åland).[51]
item-234 at level 3: section_header: Cultural references
item-235 at level 4: text: In 2002, psychologist Richard Wi ... 54] and was made into a movie in 1986.
item-236 at level 4: text: The 1992 Disney film The Mighty ... Ducks minor league baseball team.[55]
item-237 at level 2: section_header: See also
item-238 at level 3: list: group list
item-239 at level 4: list_item: Birds portal
item-240 at level 3: list: group list
item-241 at level 4: list_item: Domestic duck
item-242 at level 4: list_item: Duck as food
item-243 at level 4: list_item: Duck test
item-244 at level 4: list_item: Duck breeds
item-245 at level 4: list_item: Fictional ducks
item-246 at level 4: list_item: Rubber duck
item-247 at level 2: section_header: Notes
item-248 at level 3: section_header: Citations
item-249 at level 4: ordered_list: group ordered list
item-250 at level 5: list_item: ^ "Duckling". The American Herit ... n Company. 2006. Retrieved 2015-05-22.
item-251 at level 5: list_item: ^ "Duckling". Kernerman English ... Ltd. 2000–2006. Retrieved 2015-05-22.
item-252 at level 5: list_item: ^ Dohner, Janet Vorwald (2001). ... University Press. ISBN 978-0300138139.
item-253 at level 5: list_item: ^ Visca, Curt; Visca, Kelley (20 ... Publishing Group. ISBN 9780823961566.
item-254 at level 5: list_item: ^ a b c d Carboneras 1992, p. 536.
item-255 at level 5: list_item: ^ Livezey 1986, pp. 737–738.
item-256 at level 5: list_item: ^ Madsen, McHugh & de Kloet 1988, p. 452.
item-257 at level 5: list_item: ^ Donne-Goussé, Laudet & Hänni 2002, pp. 353–354.
item-258 at level 5: list_item: ^ a b c d e f Carboneras 1992, p. 540.
item-259 at level 5: list_item: ^ Elphick, Dunning & Sibley 2001, p. 191.
item-260 at level 5: list_item: ^ Kear 2005, p. 448.
item-261 at level 5: list_item: ^ Kear 2005, p. 622–623.
item-262 at level 5: list_item: ^ Kear 2005, p. 686.
item-263 at level 5: list_item: ^ Elphick, Dunning & Sibley 2001, p. 193.
item-264 at level 5: list_item: ^ a b c d e f g Carboneras 1992, p. 537.
item-265 at level 5: list_item: ^ American Ornithologists' Union 1998, p. xix.
item-266 at level 5: list_item: ^ American Ornithologists' Union 1998.
item-267 at level 5: list_item: ^ Carboneras 1992, p. 538.
item-268 at level 5: list_item: ^ Christidis & Boles 2008, p. 62.
item-269 at level 5: list_item: ^ Shirihai 2008, pp. 239, 245.
item-270 at level 5: list_item: ^ a b Pratt, Bruner & Berrett 1987, pp. 98–107.
item-271 at level 5: list_item: ^ Fitter, Fitter & Hosking 2000, pp. 523.
item-272 at level 5: list_item: ^ "Pacific Black Duck". www.wiresnr.org. Retrieved 2018-04-27.
item-273 at level 5: list_item: ^ Ogden, Evans. "Dabbling Ducks". CWE. Retrieved 2006-11-02.
item-274 at level 5: list_item: ^ Karl Mathiesen (16 March 2015) ... Guardian. Retrieved 13 November 2016.
item-275 at level 5: list_item: ^ Rohwer, Frank C.; Anderson, Mi ... 4615-6787-5_4. ISBN 978-1-4615-6789-9.
item-276 at level 5: list_item: ^ Smith, Cyndi M.; Cooke, Fred; ... 093/condor/102.1.201. hdl:10315/13797.
item-277 at level 5: list_item: ^ "If You Find An Orphaned Duckl ... l on 2018-09-23. Retrieved 2018-12-22.
item-278 at level 5: list_item: ^ Carver, Heather (2011). The Du ...  9780557901562.[self-published source]
item-279 at level 5: list_item: ^ Titlow, Budd (2013-09-03). Bir ... man & Littlefield. ISBN 9780762797707.
item-280 at level 5: list_item: ^ Amos, Jonathan (2003-09-08). " ... kers". BBC News. Retrieved 2006-11-02.
item-281 at level 5: list_item: ^ "Mythbusters Episode 8". 12 December 2003.
item-282 at level 5: list_item: ^ Erlandson 1994, p. 171.
item-283 at level 5: list_item: ^ Jeffries 2008, pp. 168, 243.
item-284 at level 5: list_item: ^ a b Sued-Badillo 2003, p. 65.
item-285 at level 5: list_item: ^ Thorpe 1996, p. 68.
item-286 at level 5: list_item: ^ Maisels 1999, p. 42.
item-287 at level 5: list_item: ^ Rau 1876, p. 133.
item-288 at level 5: list_item: ^ Higman 2012, p. 23.
item-289 at level 5: list_item: ^ Hume 2012, p. 53.
item-290 at level 5: list_item: ^ Hume 2012, p. 52.
item-291 at level 5: list_item: ^ Fieldhouse 2002, p. 167.
item-292 at level 5: list_item: ^ Livingston, A. D. (1998-01-01) ... Editions, Limited. ISBN 9781853263774.
item-293 at level 5: list_item: ^ "Study plan for waterfowl inju ... on 2022-10-09. Retrieved 2 July 2019.
item-294 at level 5: list_item: ^ "FAOSTAT". www.fao.org. Retrieved 2019-10-25.
item-295 at level 5: list_item: ^ "Anas platyrhynchos, Domestic ... . Digimorph.org. Retrieved 2012-12-23.
item-296 at level 5: list_item: ^ Sy Montgomery. "Mallard; Encyc ... Britannica.com. Retrieved 2012-12-23.
item-297 at level 5: list_item: ^ Glenday, Craig (2014). Guinnes ... ited. pp. 135. ISBN 978-1-908843-15-9.
item-298 at level 5: list_item: ^ Suomen kunnallisvaakunat (in F ... tto. 1982. p. 147. ISBN 951-773-085-3.
item-299 at level 5: list_item: ^ "Lubānas simbolika" (in Latvian). Retrieved September 9, 2021.
item-300 at level 5: list_item: ^ "Föglö" (in Swedish). Retrieved September 9, 2021.
item-301 at level 5: list_item: ^ Young, Emma. "World's funniest ... w Scientist. Retrieved 7 January 2019.
item-302 at level 5: list_item: ^ "Howard the Duck (character)". Grand Comics Database.
item-303 at level 5: list_item: ^ Sanderson, Peter; Gilbert, Lau ... luding this bad-tempered talking duck.
item-304 at level 5: list_item: ^ "The Duck". University of Oregon Athletics. Retrieved 2022-01-20.
item-305 at level 3: section_header: Sources
item-306 at level 4: list: group list
item-307 at level 5: list_item: American Ornithologists' Union ( ... (PDF) from the original on 2022-10-09.
item-308 at level 5: list_item: Carboneras, Carlos (1992). del H ... Lynx Edicions. ISBN 978-84-87334-10-8.
item-309 at level 5: list_item: Christidis, Les; Boles, Walter E ... ro Publishing. ISBN 978-0-643-06511-6.
item-310 at level 5: list_item: Donne-Goussé, Carole; Laudet, Vi ... /S1055-7903(02)00019-2. PMID 12099792.
item-311 at level 5: list_item: Elphick, Chris; Dunning, John B. ... istopher Helm. ISBN 978-0-7136-6250-4.
item-312 at level 5: list_item: Erlandson, Jon M. (1994). Early ... usiness Media. ISBN 978-1-4419-3231-0.
item-313 at level 5: list_item: Fieldhouse, Paul (2002). Food, F ... ara: ABC-CLIO. ISBN 978-1-61069-412-4.
item-314 at level 5: list_item: Fitter, Julian; Fitter, Daniel; ... versity Press. ISBN 978-0-691-10295-5.
item-315 at level 5: list_item: Higman, B. W. (2012). How Food M ... Wiley & Sons. ISBN 978-1-4051-8947-7.
item-316 at level 5: list_item: Hume, Julian H. (2012). Extinct ... istopher Helm. ISBN 978-1-4729-3744-5.
item-317 at level 5: list_item: Jeffries, Richard (2008). Holoce ... Alabama Press. ISBN 978-0-8173-1658-7.
item-318 at level 5: list_item: Kear, Janet, ed. (2005). Ducks, ... versity Press. ISBN 978-0-19-861009-0.
item-319 at level 5: list_item: Livezey, Bradley C. (October 198 ... (PDF) from the original on 2022-10-09.
item-320 at level 5: list_item: Madsen, Cort S.; McHugh, Kevin P ... (PDF) from the original on 2022-10-09.
item-321 at level 5: list_item: Maisels, Charles Keith (1999). E ... on: Routledge. ISBN 978-0-415-10975-8.
item-322 at level 5: list_item: Pratt, H. Douglas; Bruner, Phill ... University Press. ISBN 0-691-02399-9.
item-323 at level 5: list_item: Rau, Charles (1876). Early Man i ... ork: Harper & Brothers. LCCN 05040168.
item-324 at level 5: list_item: Shirihai, Hadoram (2008). A Comp ... versity Press. ISBN 978-0-691-13666-0.
item-325 at level 5: list_item: Sued-Badillo, Jalil (2003). Auto ... Paris: UNESCO. ISBN 978-92-3-103832-7.
item-326 at level 5: list_item: Thorpe, I. J. (1996). The Origin ... rk: Routledge. ISBN 978-0-415-08009-5.
item-327 at level 2: section_header: External links
item-328 at level 3: list: group list
item-329 at level 4: list_item: Definitions from Wiktionary
item-330 at level 4: list_item: Media from Commons
item-331 at level 4: list_item: Quotations from Wikiquote
item-332 at level 4: list_item: Recipes from Wikibooks
item-333 at level 4: list_item: Taxa from Wikispecies
item-334 at level 4: list_item: Data from Wikidata
item-335 at level 3: list: group list
item-336 at level 4: list_item: list of books (useful looking abstracts)
item-337 at level 4: list_item: Ducks on postage stamps Archived 2013-05-13 at the Wayback Machine
item-338 at level 4: list_item: Ducks at a Distance, by Rob Hine ... uide to identification of US waterfowl
item-339 at level 3: table with [3x2]
item-340 at level 3: picture
item-341 at level 3: text: Retrieved from ""
item-342 at level 3: text: :
item-343 at level 3: list: group list
item-344 at level 4: list_item: Ducks
item-345 at level 4: list_item: Game birds
item-346 at level 4: list_item: Bird common names
item-347 at level 3: text: Hidden categories:
item-348 at level 3: list: group list
item-349 at level 4: list_item: All accuracy disputes
item-350 at level 4: list_item: Accuracy disputes from February 2020
item-351 at level 4: list_item: CS1 Finnish-language sources (fi)
item-352 at level 4: list_item: CS1 Latvian-language sources (lv)
item-353 at level 4: list_item: CS1 Swedish-language sources (sv)
item-354 at level 4: list_item: Articles with short description
item-355 at level 4: list_item: Short description is different from Wikidata
item-356 at level 4: list_item: Wikipedia indefinitely move-protected pages
item-357 at level 4: list_item: Wikipedia indefinitely semi-protected pages
item-358 at level 4: list_item: Articles with 'species' microformats
item-359 at level 4: list_item: Articles containing Old English (ca. 450-1100)-language text
item-360 at level 4: list_item: Articles containing Dutch-language text
item-361 at level 4: list_item: Articles containing German-language text
item-362 at level 4: list_item: Articles containing Norwegian-language text
item-363 at level 4: list_item: Articles containing Lithuanian-language text
item-364 at level 4: list_item: Articles containing Ancient Greek (to 1453)-language text
item-365 at level 4: list_item: All articles with self-published sources
item-366 at level 4: list_item: Articles with self-published sources from February 2020
item-367 at level 4: list_item: All articles with unsourced statements
item-368 at level 4: list_item: Articles with unsourced statements from January 2022
item-369 at level 4: list_item: CS1: long volume value
item-370 at level 4: list_item: Pages using Sister project links with wikidata mismatch
item-371 at level 4: list_item: Pages using Sister project links with hidden wikidata
item-372 at level 4: list_item: Webarchive template wayback links
item-373 at level 4: list_item: Articles with Project Gutenberg links
item-374 at level 4: list_item: Articles containing video clips
item-375 at level 3: list: group list
item-376 at level 4: list_item: This page was last edited on 21 September 2024, at 12:11 (UTC).
item-377 at level 4: list_item: Text is available under the Crea ... tion, Inc., a non-profit organization.
item-378 at level 3: list: group list
item-379 at level 4: list_item: Privacy policy
item-380 at level 4: list_item: About Wikipedia
item-381 at level 4: list_item: Disclaimers
item-382 at level 4: list_item: Contact Wikipedia
item-383 at level 4: list_item: Code of Conduct
item-384 at level 4: list_item: Developers
item-385 at level 4: list_item: Statistics
item-386 at level 4: list_item: Cookie statement
item-387 at level 4: list_item: Mobile view
item-388 at level 3: list: group list
item-389 at level 3: list: group list
item-390 at level 4: list_item: Definitions from Wiktionary
item-391 at level 4: list_item: Media from Commons
item-392 at level 4: list_item: Quotations from Wikiquote
item-393 at level 4: list_item: Recipes from Wikibooks
item-394 at level 4: list_item: Taxa from Wikispecies
item-395 at level 4: list_item: Data from Wikidata
item-396 at level 3: list: group list
item-397 at level 4: list_item: list of books (useful looking abstracts)
item-398 at level 4: list_item: Ducks on postage stamps Archived 2013-05-13 at the Wayback Machine
item-399 at level 4: list_item: Ducks at a Distance, by Rob Hine ... uide to identification of US waterfowl
item-400 at level 3: table with [3x2]
item-401 at level 3: picture
item-402 at level 3: list: group list
item-403 at level 4: list_item: Ducks
item-404 at level 4: list_item: Game birds
item-405 at level 4: list_item: Bird common names
item-406 at level 3: list: group list
item-407 at level 4: list_item: All accuracy disputes
item-408 at level 4: list_item: Accuracy disputes from February 2020
item-409 at level 4: list_item: CS1 Finnish-language sources (fi)
item-410 at level 4: list_item: CS1 Latvian-language sources (lv)
item-411 at level 4: list_item: CS1 Swedish-language sources (sv)
item-412 at level 4: list_item: Articles with short description
item-413 at level 4: list_item: Short description is different from Wikidata
item-414 at level 4: list_item: Wikipedia indefinitely move-protected pages
item-415 at level 4: list_item: Wikipedia indefinitely semi-protected pages
item-416 at level 4: list_item: Articles with 'species' microformats
item-417 at level 4: list_item: Articles containing Old English (ca. 450-1100)-language text
item-418 at level 4: list_item: Articles containing Dutch-language text
item-419 at level 4: list_item: Articles containing German-language text
item-420 at level 4: list_item: Articles containing Norwegian-language text
item-421 at level 4: list_item: Articles containing Lithuanian-language text
item-422 at level 4: list_item: Articles containing Ancient Greek (to 1453)-language text
item-423 at level 4: list_item: All articles with self-published sources
item-424 at level 4: list_item: Articles with self-published sources from February 2020
item-425 at level 4: list_item: All articles with unsourced statements
item-426 at level 4: list_item: Articles with unsourced statements from January 2022
item-427 at level 4: list_item: CS1: long volume value
item-428 at level 4: list_item: Pages using Sister project links with wikidata mismatch
item-429 at level 4: list_item: Pages using Sister project links with hidden wikidata
item-430 at level 4: list_item: Webarchive template wayback links
item-431 at level 4: list_item: Articles with Project Gutenberg links
item-432 at level 4: list_item: Articles containing video clips
item-433 at level 3: list: group list
item-434 at level 4: list_item: This page was last edited on 21 September 2024, at 12:11 (UTC).
item-435 at level 4: list_item: Text is available under the Crea ... tion, Inc., a non-profit organization.
item-436 at level 3: list: group list
item-437 at level 4: list_item: Privacy policy
item-438 at level 4: list_item: About Wikipedia
item-439 at level 4: list_item: Disclaimers
item-440 at level 4: list_item: Contact Wikipedia
item-441 at level 4: list_item: Code of Conduct
item-442 at level 4: list_item: Developers
item-443 at level 4: list_item: Statistics
item-444 at level 4: list_item: Cookie statement
item-445 at level 4: list_item: Mobile view
item-446 at level 3: list: group list
item-447 at level 3: list: group list
item-448 at level 1: caption: Pacific black duck displaying the characteristic upending "duck"
item-449 at level 1: caption: Male mallard.
item-450 at level 1: caption: Wood ducks.
item-451 at level 1: caption: Mallard landing in approach
item-452 at level 1: caption: Male Mandarin duck
item-453 at level 1: caption: Flying steamer ducks in Ushuaia, Argentina
item-454 at level 1: caption: Female mallard in Cornwall, England
item-455 at level 1: caption: Pecten along the bill
item-456 at level 1: caption: Mallard duckling preening
item-457 at level 1: caption: A Muscovy duckling
item-458 at level 1: caption: Ringed teal
item-459 at level 1: caption: Indian Runner ducks, a common breed of domestic ducks
item-460 at level 1: caption: Three black-colored ducks in the coat of arms of Maaninka[49]
item-390 at level 1: caption: Pacific black duck displaying the characteristic upending "duck"
item-391 at level 1: caption: Male mallard.
item-392 at level 1: caption: Wood ducks.
item-393 at level 1: caption: Mallard landing in approach
item-394 at level 1: caption: Male Mandarin duck
item-395 at level 1: caption: Flying steamer ducks in Ushuaia, Argentina
item-396 at level 1: caption: Female mallard in Cornwall, England
item-397 at level 1: caption: Pecten along the bill
item-398 at level 1: caption: Mallard duckling preening
item-399 at level 1: caption: A Muscovy duckling
item-400 at level 1: caption: Ringed teal
item-401 at level 1: caption: Indian Runner ducks, a common breed of domestic ducks
item-402 at level 1: caption: Three black-colored ducks in the coat of arms of Maaninka[49]
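The listing above is the docling-parsed structure of the Wikipedia "Duck" test page, one line per document item with its hierarchy level and label. A similar dump can be produced from any conversion result via docling-core's iterate_items; a minimal sketch (the source URL and the exact line format of the groundtruth generator are assumptions):

from docling.document_converter import DocumentConverter

# Hypothetical input: the page the groundtruth above was generated from
result = DocumentConverter().convert("https://en.wikipedia.org/wiki/Duck")
doc = result.document

# One line per item with its index, hierarchy level, and label,
# mirroring the "item-N at level L: ..." layout above.
for i, (item, level) in enumerate(doc.iterate_items(with_groups=True)):
    print(f"item-{i} at level {level}: {item.label}")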

File diff suppressed because it is too large

View File

@ -1,53 +1,3 @@
- Main page
- Contents
- Current events
- Random article
- About Wikipedia
- Contact us
- Help
- Learn to edit
- Community portal
- Recent changes
- Upload file
<!-- image -->
<!-- image -->
<!-- image -->
- Donate
- Create account
- Log in
- Create account
- Log in
- Contributions
- Talk
## Contents
- (Top)
- 1 Etymology
- 2 Taxonomy
- 3 Morphology
- 4 Distribution and habitat
- 5 Behaviour Toggle Behaviour subsection
- 5.1 Feeding
- 5.2 Breeding
- 5.3 Communication
- 5.4 Predators
- 6 Relationship with humans Toggle Relationship with humans subsection
- 6.1 Hunting
- 6.2 Domestication
- 6.3 Heraldry
- 6.4 Cultural references
- 7 See also
- 8 Notes Toggle Notes subsection
- 8.1 Citations
- 8.2 Sources
- 9 External links
# Duck
- Acèh
@ -193,9 +143,17 @@
- Read
- View source
- View history
Tools
Actions
- Read
- View source
- View history
General
- What links here
- Related changes
- Upload file
@ -206,13 +164,29 @@
- Get shortened URL
- Download QR code
- Wikidata item
Print/export
- Download as PDF
- Printable version
In other projects
- Wikimedia Commons
- Wikiquote
Appearance
<!-- image -->
From Wikipedia, the free encyclopedia
Common name for many species of bird
This article is about the bird. For duck as a food, see . For other uses, see .
"Duckling" redirects here. For other uses, see .
| Duck | Duck |
|--------------------------------|--------------------------------|
| | |
@ -482,10 +456,16 @@ The 1992 Disney film The Mighty Ducks, starring Emilio Estevez, chose the duck a
<!-- image -->
Retrieved from ""
:
- Ducks
- Game birds
- Bird common names
Hidden categories:
- All accuracy disputes
- Accuracy disputes from February 2020
- CS1 Finnish-language sources (fi)

View File

@ -0,0 +1,12 @@
<html>
<head>
<title>Sample HTML File</title>
</head>
<body>
<div>This is a div with text.</div>
<div>This is another div with text.</div>
<p>This is a regular paragraph.</p>
<div>This is a third div<br/>with a new line.</div>
<div><p>This is a fourth div with a <b>bold</b> paragraph.</p></div>
</body>
</html>
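This new fixture exercises parsing of text that sits directly inside <div> elements, so that it surfaces as regular text items in the converted document. A minimal sketch of converting it with docling's high-level API (the sample.html path is a placeholder for wherever the fixture is saved):

from docling.document_converter import DocumentConverter

# Placeholder path: save the fixture above as sample.html first
result = DocumentConverter().convert("sample.html")

# Each bare <div> should appear as its own paragraph in the export,
# alongside the ordinary <p> content.
print(result.document.export_to_markdown())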

View File

@ -1,4 +1,4 @@
import os
from io import BytesIO
from pathlib import Path
from docling.backend.html_backend import HTMLDocumentBackend
@ -41,6 +41,62 @@ def test_heading_levels():
    assert found_lvl_2 and found_lvl_3


def test_ordered_lists():
    test_set: list[tuple[bytes, str]] = []
    # No start attribute: numbering defaults to 1
    test_set.append(
        (
            b"<html><body><ol><li>1st item</li><li>2nd item</li></ol></body></html>",
            "1. 1st item\n2. 2nd item",
        )
    )
    # An explicit start="1" behaves like the default
    test_set.append(
        (
            b'<html><body><ol start="1"><li>1st item</li><li>2nd item</li></ol></body></html>',
            "1. 1st item\n2. 2nd item",
        )
    )
    # start="2": numbering begins at 2
    test_set.append(
        (
            b'<html><body><ol start="2"><li>1st item</li><li>2nd item</li></ol></body></html>',
            "2. 1st item\n3. 2nd item",
        )
    )
    # start="0" is honoured as-is
    test_set.append(
        (
            b'<html><body><ol start="0"><li>1st item</li><li>2nd item</li></ol></body></html>',
            "0. 1st item\n1. 2nd item",
        )
    )
    # Negative values fall back to 1
    test_set.append(
        (
            b'<html><body><ol start="-5"><li>1st item</li><li>2nd item</li></ol></body></html>',
            "1. 1st item\n2. 2nd item",
        )
    )
    # Non-numeric values fall back to 1
    test_set.append(
        (
            b'<html><body><ol start="foo"><li>1st item</li><li>2nd item</li></ol></body></html>',
            "1. 1st item\n2. 2nd item",
        )
    )

    for idx, pair in enumerate(test_set):
        in_doc = InputDocument(
            path_or_stream=BytesIO(pair[0]),
            format=InputFormat.HTML,
            backend=HTMLDocumentBackend,
            filename="test",
        )
        backend = HTMLDocumentBackend(
            in_doc=in_doc,
            path_or_stream=BytesIO(pair[0]),
        )
        doc: DoclingDocument = backend.convert()
        assert doc
        assert doc.export_to_markdown() == pair[1], f"Error in case {idx}"
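The expectations above pin down the start-attribute semantics: the value is honoured when it is a non-negative integer, otherwise numbering falls back to 1. A standalone sketch of that rule (illustrative only, using BeautifulSoup; not the backend's actual implementation):

from bs4 import BeautifulSoup


def numbered_items(html: bytes) -> list[str]:
    # Illustrative re-statement of the behaviour the test above expects.
    soup = BeautifulSoup(html, "html.parser")
    ol = soup.find("ol")  # assumes the snippet contains an <ol>
    try:
        start = int(ol.get("start", 1))
    except (TypeError, ValueError):
        start = 1  # non-numeric start values fall back to 1
    if start < 0:
        start = 1  # negative values fall back to 1; 0 is kept
    return [f"{start + i}. {li.get_text()}" for i, li in enumerate(ol.find_all("li"))]


# e.g. numbered_items(b'<ol start="2"><li>a</li><li>b</li></ol>')
# -> ['2. a', '3. b']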
def get_html_paths():
    # Define the directory you want to search