commit 71d438df84

Merge remote-tracking branch 'origin/main' into feat-factory-plugins

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
.github/workflows/checks.yml (vendored, 11 changed lines)
@@ -1,6 +1,10 @@
 on:
   workflow_call:
 
+env:
+  HF_HUB_DOWNLOAD_TIMEOUT: "60"
+  HF_HUB_ETAG_TIMEOUT: "60"
+
 jobs:
   run-checks:
     runs-on: ubuntu-latest
@@ -14,6 +18,11 @@ jobs:
       - name: Set TESSDATA_PREFIX
         run: |
           echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
+      - name: Cache Hugging Face models
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/huggingface
+          key: huggingface-cache-py${{ matrix.python-version }}
       - uses: ./.github/actions/setup-poetry
         with:
           python-version: ${{ matrix.python-version }}
@@ -28,7 +37,7 @@ jobs:
         run: |
           for file in docs/examples/*.py; do
             # Skip batch_convert.py
-            if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api).py ]]; then
+            if [[ "$(basename "$file")" =~ ^(batch_convert|minimal_vlm_pipeline|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api).py ]]; then
              echo "Skipping $file"
              continue
            fi
CHANGELOG.md (27 changed lines)
@@ -1,3 +1,30 @@
+## [v2.25.1](https://github.com/DS4SD/docling/releases/tag/v2.25.1) - 2025-03-03
+
+### Fix
+
+* Enable locks for threadsafe pdfium ([#1052](https://github.com/DS4SD/docling/issues/1052)) ([`8dc0562`](https://github.com/DS4SD/docling/commit/8dc0562542299cf972d14eeeb4393e50b589c8ad))
+* **html:** Use 'start' attribute when parsing ordered lists from HTML docs ([#1062](https://github.com/DS4SD/docling/issues/1062)) ([`de7b963`](https://github.com/DS4SD/docling/commit/de7b963b09a34916f0a8d99649269aeb37db1408))
+
+### Documentation
+
+* Improve docs on token limit warning triggered by HybridChunker ([#1077](https://github.com/DS4SD/docling/issues/1077)) ([`db3ceef`](https://github.com/DS4SD/docling/commit/db3ceefd4ae6251a97e333bcb03051698b3fa71a))
+
+## [v2.25.0](https://github.com/DS4SD/docling/releases/tag/v2.25.0) - 2025-02-26
+
+### Feature
+
+* [Experimental] Introduce VLM pipeline using HF AutoModelForVision2Seq, featuring SmolDocling model ([#1054](https://github.com/DS4SD/docling/issues/1054)) ([`3c9fe76`](https://github.com/DS4SD/docling/commit/3c9fe76b706b7714b25d49cb09050c42e3b8c849))
+* **cli:** Add option for downloading all models, refine help messages ([#1061](https://github.com/DS4SD/docling/issues/1061)) ([`ab683e4`](https://github.com/DS4SD/docling/commit/ab683e4fb6df4973d2efda04f00c269a2dc95f5b))
+
+### Fix
+
+* Vlm using artifacts path ([#1057](https://github.com/DS4SD/docling/issues/1057)) ([`e197225`](https://github.com/DS4SD/docling/commit/e1972257399151503d60b4806976c8b9b6911aa8))
+* **html:** Parse text in div elements as TextItem ([#1041](https://github.com/DS4SD/docling/issues/1041)) ([`1b0ead6`](https://github.com/DS4SD/docling/commit/1b0ead69078030a0e4d25b51450ef2aa4a2e79fc))
+
+### Documentation
+
+* Extend chunking docs, add FAQ on token limit ([#1053](https://github.com/DS4SD/docling/issues/1053)) ([`c84b973`](https://github.com/DS4SD/docling/commit/c84b973959a254db22ac9a7dc8810628e4808a2d))
+
 ## [v2.24.0](https://github.com/DS4SD/docling/releases/tag/v2.24.0) - 2025-02-20
 
 ### Feature
README.md (2 changed lines)
@@ -123,6 +123,6 @@ For individual model usage, please refer to the model licenses found in the orig
 
 Docling has been brought to you by IBM.
 
-[supported_formats]: https://ds4sd.github.io/docling/supported_formats/
+[supported_formats]: https://ds4sd.github.io/docling/usage/supported_formats/
 [docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/
 [integrations]: https://ds4sd.github.io/docling/integrations/
docling/backend/docling_parse_v2_backend.py
@@ -12,6 +12,7 @@ from pypdfium2 import PdfPage
 
 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
 from docling.datamodel.base_models import Cell, Size
+from docling.utils.locks import pypdfium2_lock
 
 if TYPE_CHECKING:
     from docling.datamodel.document import InputDocument
@@ -182,20 +183,24 @@ class DoclingParseV2PageBackend(PdfPageBackend):
         padbox.r = page_size.width - padbox.r
         padbox.t = page_size.height - padbox.t
 
-        image = (
-            self._ppage.render(
-                scale=scale * 1.5,
-                rotation=0,  # no additional rotation
-                crop=padbox.as_tuple(),
-            )
-            .to_pil()
-            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
-        )  # We resize the image from 1.5x the given scale to make it sharper.
+        with pypdfium2_lock:
+            image = (
+                self._ppage.render(
+                    scale=scale * 1.5,
+                    rotation=0,  # no additional rotation
+                    crop=padbox.as_tuple(),
+                )
+                .to_pil()
+                .resize(
+                    size=(round(cropbox.width * scale), round(cropbox.height * scale))
+                )
+            )  # We resize the image from 1.5x the given scale to make it sharper.
 
         return image
 
     def get_size(self) -> Size:
-        return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
+        with pypdfium2_lock:
+            return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
 
     def unload(self):
         self._ppage = None
@@ -206,23 +211,24 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
 
-        self._pdoc = pdfium.PdfDocument(self.path_or_stream)
-        self.parser = pdf_parser_v2("fatal")
+        with pypdfium2_lock:
+            self._pdoc = pdfium.PdfDocument(self.path_or_stream)
+            self.parser = pdf_parser_v2("fatal")
 
         success = False
         if isinstance(self.path_or_stream, BytesIO):
             success = self.parser.load_document_from_bytesio(
                 self.document_hash, self.path_or_stream
             )
         elif isinstance(self.path_or_stream, Path):
             success = self.parser.load_document(
                 self.document_hash, str(self.path_or_stream)
             )
 
         if not success:
             raise RuntimeError(
                 f"docling-parse v2 could not load document {self.document_hash}."
             )
 
     def page_count(self) -> int:
         # return len(self._pdoc)  # To be replaced with docling-parse API
@@ -236,9 +242,10 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
         return len_2
 
     def load_page(self, page_no: int) -> DoclingParseV2PageBackend:
-        return DoclingParseV2PageBackend(
-            self.parser, self.document_hash, page_no, self._pdoc[page_no]
-        )
+        with pypdfium2_lock:
+            return DoclingParseV2PageBackend(
+                self.parser, self.document_hash, page_no, self._pdoc[page_no]
+            )
 
     def is_valid(self) -> bool:
         return self.page_count() > 0
@@ -246,5 +253,6 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
     def unload(self):
         super().unload()
         self.parser.unload_document(self.document_hash)
-        self._pdoc.close()
-        self._pdoc = None
+        with pypdfium2_lock:
+            self._pdoc.close()
+            self._pdoc = None
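The pypdfium2_lock imported above comes from a module this commit does not display. A minimal sketch of what docling/utils/locks.py plausibly contains, assuming a single process-wide lock shared by all pdfium call sites:

import threading

# pdfium is not thread-safe; every backend call site serializes on this
# one global lock (sketch only, file not shown in this diff).
pypdfium2_lock = threading.Lock()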
docling/backend/html_backend.py
@@ -1,9 +1,10 @@
 import logging
 from io import BytesIO
 from pathlib import Path
-from typing import Optional, Union, cast
+from typing import Final, Optional, Union, cast
 
 from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
+from bs4.element import PreformattedString
 from docling_core.types.doc import (
     DocItem,
     DocItemLabel,
@@ -14,6 +15,7 @@ from docling_core.types.doc import (
     TableCell,
     TableData,
 )
+from docling_core.types.doc.document import ContentLayer
 from typing_extensions import override
 
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
@@ -22,12 +24,29 @@ from docling.datamodel.document import InputDocument
 
 _log = logging.getLogger(__name__)
 
+# tags that generate NodeItem elements
+TAGS_FOR_NODE_ITEMS: Final = [
+    "h1",
+    "h2",
+    "h3",
+    "h4",
+    "h5",
+    "h6",
+    "p",
+    "pre",
+    "ul",
+    "ol",
+    "li",
+    "table",
+    "figure",
+    "img",
+]
+
 
 class HTMLDocumentBackend(DeclarativeDocumentBackend):
     @override
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
-        _log.debug("About to init HTML backend...")
         self.soup: Optional[Tag] = None
         # HTML file:
         self.path_or_stream = path_or_stream
@@ -48,7 +67,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             self.soup = BeautifulSoup(html_content, "html.parser")
         except Exception as e:
             raise RuntimeError(
-                f"Could not initialize HTML backend for file with hash {self.document_hash}."
+                "Could not initialize HTML backend for file with "
+                f"hash {self.document_hash}."
             ) from e
 
     @override
@@ -88,17 +108,26 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             assert self.soup is not None
             content = self.soup.body or self.soup
             # Replace <br> tags with newline characters
+            # TODO: remove style to avoid losing text from tags like i, b, span, ...
             for br in content("br"):
                 br.replace_with(NavigableString("\n"))
+
+            headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
+            self.content_layer = (
+                ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
+            )
             self.walk(content, doc)
         else:
             raise RuntimeError(
-                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
+                f"Cannot convert doc with {self.document_hash} because the backend "
+                "failed to init."
             )
         return doc
 
     def walk(self, tag: Tag, doc: DoclingDocument) -> None:
 
         # Iterate over elements in the body of the document
+        text: str = ""
         for element in tag.children:
             if isinstance(element, Tag):
                 try:
@@ -108,6 +137,26 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                         f"Error processing child from tag{tag.name}: {exc_child}"
                     )
                     raise exc_child
+            elif isinstance(element, NavigableString) and not isinstance(
+                element, PreformattedString
+            ):
+                # Floating text outside paragraphs or analyzed tags
+                text += element
+                siblings: list[Tag] = [
+                    item for item in element.next_siblings if isinstance(item, Tag)
+                ]
+                if element.next_sibling is None or any(
+                    [item.name in TAGS_FOR_NODE_ITEMS for item in siblings]
+                ):
+                    text = text.strip()
+                    if text and tag.name in ["div"]:
+                        doc.add_text(
+                            parent=self.parents[self.level],
+                            label=DocItemLabel.TEXT,
+                            text=text,
+                            content_layer=self.content_layer,
+                        )
+                    text = ""
 
         return
 
@@ -127,7 +176,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         elif tag.name == "figure":
             self.handle_figure(tag, doc)
         elif tag.name == "img":
-            self.handle_image(doc)
+            self.handle_image(tag, doc)
         else:
             self.walk(tag, doc)
 
@@ -158,12 +207,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         text = element.text.strip()
 
         if hlevel == 1:
-            for key, val in self.parents.items():
+            self.content_layer = ContentLayer.BODY
+
+            for key in self.parents.keys():
                 self.parents[key] = None
+
             self.level = 1
             self.parents[self.level] = doc.add_text(
-                parent=self.parents[0], label=DocItemLabel.TITLE, text=text
+                parent=self.parents[0],
+                label=DocItemLabel.TITLE,
+                text=text,
+                content_layer=self.content_layer,
             )
         else:
             if hlevel > self.level:
@@ -174,6 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                         name=f"header-{i}",
                         label=GroupLabel.SECTION,
                         parent=self.parents[i - 1],
+                        content_layer=self.content_layer,
                     )
                 self.level = hlevel
 
@@ -189,6 +244,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 parent=self.parents[hlevel - 1],
                 text=text,
                 level=hlevel,
+                content_layer=self.content_layer,
             )
 
     def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
@@ -197,16 +253,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             return
         text = element.text.strip()
         if text:
-            doc.add_code(parent=self.parents[self.level], text=text)
+            doc.add_code(
+                parent=self.parents[self.level],
+                text=text,
+                content_layer=self.content_layer,
+            )
 
     def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
         """Handles paragraph tags (p)."""
         if element.text is None:
             return
         text = element.text.strip()
-        label = DocItemLabel.PARAGRAPH
         if text:
-            doc.add_text(parent=self.parents[self.level], label=label, text=text)
+            doc.add_text(
+                parent=self.parents[self.level],
+                label=DocItemLabel.TEXT,
+                text=text,
+                content_layer=self.content_layer,
+            )
 
     def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
         """Handles list tags (ul, ol) and their list items."""
@@ -214,14 +278,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         if element.name == "ul":
             # create a list group
             self.parents[self.level + 1] = doc.add_group(
-                parent=self.parents[self.level], name="list", label=GroupLabel.LIST
+                parent=self.parents[self.level],
+                name="list",
+                label=GroupLabel.LIST,
+                content_layer=self.content_layer,
             )
         elif element.name == "ol":
+            start_attr = element.get("start")
+            start: int = (
+                int(start_attr)
+                if isinstance(start_attr, str) and start_attr.isnumeric()
+                else 1
+            )
             # create a list group
             self.parents[self.level + 1] = doc.add_group(
                 parent=self.parents[self.level],
-                name="ordered list",
+                name="ordered list" + (f" start {start}" if start != 1 else ""),
                 label=GroupLabel.ORDERED_LIST,
+                content_layer=self.content_layer,
             )
         self.level += 1
 
@@ -231,15 +305,23 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         self.level -= 1
 
     def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
-        """Handles listitem tags (li)."""
+        """Handles list item tags (li)."""
         nested_list = element.find(["ul", "ol"])
 
         parent = self.parents[self.level]
         if parent is None:
-            _log.warning(f"list-item has no parent in DoclingDocument: {element}")
+            _log.debug(f"list-item has no parent in DoclingDocument: {element}")
             return
         parent_label: str = parent.label
         index_in_list = len(parent.children) + 1
+        if (
+            parent_label == GroupLabel.ORDERED_LIST
+            and isinstance(parent, GroupItem)
+            and parent.name
+        ):
+            start_in_list: str = parent.name.split(" ")[-1]
+            start: int = int(start_in_list) if start_in_list.isnumeric() else 1
+            index_in_list += start - 1
 
         if nested_list:
             # Text in list item can be hidden within hierarchy, hence
@@ -262,6 +344,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 enumerated=enumerated,
                 marker=marker,
                 parent=parent,
+                content_layer=self.content_layer,
             )
             self.level += 1
 
@@ -283,15 +366,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 enumerated=enumerated,
                 marker=marker,
                 parent=parent,
+                content_layer=self.content_layer,
             )
         else:
-            _log.warning(f"list-item has no text: {element}")
+            _log.debug(f"list-item has no text: {element}")
 
     @staticmethod
     def parse_table_data(element: Tag) -> Optional[TableData]:
         nested_tables = element.find("table")
         if nested_tables is not None:
-            _log.warning("Skipping nested table.")
+            _log.debug("Skipping nested table.")
             return None
 
         # Count the number of rows (number of <tr> elements)
@@ -386,7 +470,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         table_data = HTMLDocumentBackend.parse_table_data(element)
 
         if table_data is not None:
-            doc.add_table(data=table_data, parent=self.parents[self.level])
+            doc.add_table(
+                data=table_data,
+                parent=self.parents[self.level],
+                content_layer=self.content_layer,
+            )
 
     def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
         """Recursively extract text from <ul> or <ol> with proper indentation."""
@@ -426,20 +514,33 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
 
         contains_captions = element.find(["figcaption"])
         if not isinstance(contains_captions, Tag):
-            doc.add_picture(parent=self.parents[self.level], caption=None)
+            doc.add_picture(
+                parent=self.parents[self.level],
+                caption=None,
+                content_layer=self.content_layer,
+            )
         else:
             texts = []
             for item in contains_captions:
                 texts.append(item.text)
 
             fig_caption = doc.add_text(
-                label=DocItemLabel.CAPTION, text=("".join(texts)).strip()
+                label=DocItemLabel.CAPTION,
+                text=("".join(texts)).strip(),
+                content_layer=self.content_layer,
            )
             doc.add_picture(
                 parent=self.parents[self.level],
                 caption=fig_caption,
+                content_layer=self.content_layer,
             )
 
-    def handle_image(self, doc: DoclingDocument) -> None:
+    def handle_image(self, element: Tag, doc: DoclingDocument) -> None:
         """Handles image tags (img)."""
-        doc.add_picture(parent=self.parents[self.level], caption=None)
+        _log.debug(f"ignoring <img> tags at the moment: {element}")
+
+        doc.add_picture(
+            parent=self.parents[self.level],
+            caption=None,
+            content_layer=self.content_layer,
+        )
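To illustrate the two behavior changes above (floating text in div elements becoming a TextItem, and the ordered-list 'start' attribute being honored, per #1041 and #1062), here is a hedged usage sketch; the HTML snippet and file name are illustrative only:

from io import BytesIO

from docling.datamodel.base_models import DocumentStream
from docling.document_converter import DocumentConverter

html = b"""<html><body>
<div>Floating text in a div, now emitted as a TextItem.</div>
<ol start="5">
  <li>fifth item</li>
  <li>sixth item</li>
</ol>
</body></html>"""

result = DocumentConverter().convert(
    DocumentStream(name="sample.html", stream=BytesIO(html))
)
# The ordered list markers should now start at 5 instead of 1.
print(result.document.export_to_markdown())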
docling/backend/pypdfium2_backend.py
@@ -13,6 +13,7 @@ from pypdfium2._helpers.misc import PdfiumError
 
 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
 from docling.datamodel.base_models import Cell
+from docling.utils.locks import pypdfium2_lock
 
 if TYPE_CHECKING:
     from docling.datamodel.document import InputDocument
@@ -24,6 +25,7 @@ class PyPdfiumPageBackend(PdfPageBackend):
     def __init__(
         self, pdfium_doc: pdfium.PdfDocument, document_hash: str, page_no: int
     ):
+        # Note: lock applied by the caller
         self.valid = True  # No better way to tell from pypdfium.
         try:
             self._ppage: pdfium.PdfPage = pdfium_doc[page_no]
@@ -40,51 +42,57 @@ class PyPdfiumPageBackend(PdfPageBackend):
 
     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
         AREA_THRESHOLD = 0  # 32 * 32
-        for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
-            pos = obj.get_pos()
-            cropbox = BoundingBox.from_tuple(
-                pos, origin=CoordOrigin.BOTTOMLEFT
-            ).to_top_left_origin(page_height=self.get_size().height)
+        page_size = self.get_size()
+        with pypdfium2_lock:
+            for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
+                pos = obj.get_pos()
+                cropbox = BoundingBox.from_tuple(
+                    pos, origin=CoordOrigin.BOTTOMLEFT
+                ).to_top_left_origin(page_height=page_size.height)
 
-            if cropbox.area() > AREA_THRESHOLD:
-                cropbox = cropbox.scaled(scale=scale)
+                if cropbox.area() > AREA_THRESHOLD:
+                    cropbox = cropbox.scaled(scale=scale)
 
-                yield cropbox
+                    yield cropbox
 
     def get_text_in_rect(self, bbox: BoundingBox) -> str:
-        if not self.text_page:
-            self.text_page = self._ppage.get_textpage()
+        with pypdfium2_lock:
+            if not self.text_page:
+                self.text_page = self._ppage.get_textpage()
 
         if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
             bbox = bbox.to_bottom_left_origin(self.get_size().height)
 
-        text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
+        with pypdfium2_lock:
+            text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
 
         return text_piece
 
     def get_text_cells(self) -> Iterable[Cell]:
-        if not self.text_page:
-            self.text_page = self._ppage.get_textpage()
+        with pypdfium2_lock:
+            if not self.text_page:
+                self.text_page = self._ppage.get_textpage()
 
         cells = []
         cell_counter = 0
 
         page_size = self.get_size()
 
-        for i in range(self.text_page.count_rects()):
-            rect = self.text_page.get_rect(i)
-            text_piece = self.text_page.get_text_bounded(*rect)
-            x0, y0, x1, y1 = rect
-            cells.append(
-                Cell(
-                    id=cell_counter,
-                    text=text_piece,
-                    bbox=BoundingBox(
-                        l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
-                    ).to_top_left_origin(page_size.height),
-                )
-            )
-            cell_counter += 1
+        with pypdfium2_lock:
+            for i in range(self.text_page.count_rects()):
+                rect = self.text_page.get_rect(i)
+                text_piece = self.text_page.get_text_bounded(*rect)
+                x0, y0, x1, y1 = rect
+                cells.append(
+                    Cell(
+                        id=cell_counter,
+                        text=text_piece,
+                        bbox=BoundingBox(
+                            l=x0, b=y0, r=x1, t=y1, coord_origin=CoordOrigin.BOTTOMLEFT
+                        ).to_top_left_origin(page_size.height),
+                    )
+                )
+                cell_counter += 1
 
         # PyPdfium2 produces very fragmented cells, with sub-word level boundaries, in many PDFs.
         # The cell merging code below is to clean this up.
@@ -214,20 +222,24 @@ class PyPdfiumPageBackend(PdfPageBackend):
         padbox.r = page_size.width - padbox.r
         padbox.t = page_size.height - padbox.t
 
-        image = (
-            self._ppage.render(
-                scale=scale * 1.5,
-                rotation=0,  # no additional rotation
-                crop=padbox.as_tuple(),
-            )
-            .to_pil()
-            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
-        )  # We resize the image from 1.5x the given scale to make it sharper.
+        with pypdfium2_lock:
+            image = (
+                self._ppage.render(
+                    scale=scale * 1.5,
+                    rotation=0,  # no additional rotation
+                    crop=padbox.as_tuple(),
+                )
+                .to_pil()
+                .resize(
+                    size=(round(cropbox.width * scale), round(cropbox.height * scale))
+                )
+            )  # We resize the image from 1.5x the given scale to make it sharper.
 
         return image
 
     def get_size(self) -> Size:
-        return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
+        with pypdfium2_lock:
+            return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
 
     def unload(self):
         self._ppage = None
@@ -239,22 +251,26 @@ class PyPdfiumDocumentBackend(PdfDocumentBackend):
         super().__init__(in_doc, path_or_stream)
 
         try:
-            self._pdoc = pdfium.PdfDocument(self.path_or_stream)
+            with pypdfium2_lock:
+                self._pdoc = pdfium.PdfDocument(self.path_or_stream)
         except PdfiumError as e:
             raise RuntimeError(
                 f"pypdfium could not load document with hash {self.document_hash}"
             ) from e
 
     def page_count(self) -> int:
-        return len(self._pdoc)
+        with pypdfium2_lock:
+            return len(self._pdoc)
 
     def load_page(self, page_no: int) -> PyPdfiumPageBackend:
-        return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
+        with pypdfium2_lock:
+            return PyPdfiumPageBackend(self._pdoc, self.document_hash, page_no)
 
     def is_valid(self) -> bool:
        return self.page_count() > 0
 
     def unload(self):
         super().unload()
-        self._pdoc.close()
-        self._pdoc = None
+        with pypdfium2_lock:
+            self._pdoc.close()
+            self._pdoc = None
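The point of wrapping every pdfium call in pypdfium2_lock (issue #1052 above) is to make concurrent conversions safe, since pdfium itself is not thread-safe. A sketch of the usage pattern this enables; the file names are hypothetical:

from concurrent.futures import ThreadPoolExecutor

from docling.document_converter import DocumentConverter

converter = DocumentConverter()
# Without the locks, concurrent pdfium access from worker threads could
# corrupt state or crash the process.
with ThreadPoolExecutor(max_workers=2) as pool:
    results = list(pool.map(converter.convert, ["a.pdf", "b.pdf"]))
for res in results:
    print(res.input.file.name, res.status)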
docling/cli/models.py
@@ -32,9 +32,19 @@ class _AvailableModels(str, Enum):
     CODE_FORMULA = "code_formula"
     PICTURE_CLASSIFIER = "picture_classifier"
     SMOLVLM = "smolvlm"
+    GRANITE_VISION = "granite_vision"
     EASYOCR = "easyocr"
 
 
+_default_models = [
+    _AvailableModels.LAYOUT,
+    _AvailableModels.TABLEFORMER,
+    _AvailableModels.CODE_FORMULA,
+    _AvailableModels.PICTURE_CLASSIFIER,
+    _AvailableModels.EASYOCR,
+]
+
+
 @app.command("download")
 def download(
     output_dir: Annotated[
@@ -43,18 +53,27 @@ def download(
             ...,
             "-o",
             "--output-dir",
-            help="The directory where all the models are downloaded.",
+            help="The directory where to download the models.",
         ),
     ] = (settings.cache_dir / "models"),
     force: Annotated[
-        bool, typer.Option(..., help="If true, the download will be forced")
+        bool, typer.Option(..., help="If true, the download will be forced.")
     ] = False,
     models: Annotated[
         Optional[list[_AvailableModels]],
         typer.Argument(
-            help=f"Models to download (default behavior: all will be downloaded)",
+            help=f"Models to download (default behavior: a predefined set of models will be downloaded).",
         ),
     ] = None,
+    all: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            "--all",
+            help="If true, all available models will be downloaded (mutually exclusive with passing specific models).",
+            show_default=True,
+        ),
+    ] = False,
     quiet: Annotated[
         bool,
         typer.Option(
@@ -65,6 +84,10 @@ def download(
         ),
     ] = False,
 ):
+    if models and all:
+        raise typer.BadParameter(
+            "Cannot simultaneously set 'all' parameter and specify models to download."
+        )
     if not quiet:
         FORMAT = "%(message)s"
         logging.basicConfig(
@@ -73,7 +96,7 @@ def download(
             datefmt="[%X]",
             handlers=[RichHandler(show_level=False, show_time=False, markup=True)],
         )
-    to_download = models or [m for m in _AvailableModels]
+    to_download = models or ([m for m in _AvailableModels] if all else _default_models)
     output_dir = download_models(
         output_dir=output_dir,
         force=force,
@@ -83,6 +106,7 @@ def download(
         with_code_formula=_AvailableModels.CODE_FORMULA in to_download,
         with_picture_classifier=_AvailableModels.PICTURE_CLASSIFIER in to_download,
         with_smolvlm=_AvailableModels.SMOLVLM in to_download,
+        with_granite_vision=_AvailableModels.GRANITE_VISION in to_download,
         with_easyocr=_AvailableModels.EASYOCR in to_download,
     )
 
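A quick, hedged way to exercise the new mutual exclusion between explicit model arguments and --all, assuming the module exposes the app Typer object that @app.command("download") registers on:

from typer.testing import CliRunner

from docling.cli.models import app

runner = CliRunner()
# Specifying models together with --all should now fail with BadParameter;
# a plain "download" fetches only the predefined _default_models set.
result = runner.invoke(app, ["download", "smolvlm", "--all"])
assert result.exit_code != 0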
docling/datamodel/base_models.py
@@ -154,6 +154,10 @@ class LayoutPrediction(BaseModel):
     clusters: List[Cluster] = []
 
 
+class VlmPrediction(BaseModel):
+    text: str = ""
+
+
 class ContainerElement(
     BasePageElement
 ):  # Used for Form and Key-Value-Regions, only for typing.
@@ -197,6 +201,7 @@ class PagePredictions(BaseModel):
     tablestructure: Optional[TableStructurePrediction] = None
     figures_classification: Optional[FigureClassificationPrediction] = None
     equations_prediction: Optional[EquationPrediction] = None
+    vlm_response: Optional[VlmPrediction] = None
 
 
 PageElement = Union[TextElement, Table, FigureElement, ContainerElement]
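A minimal illustration of where the new field lives; the page number and doctags string are placeholder values:

from docling.datamodel.base_models import Page, VlmPrediction

page = Page(page_no=0)
# The VLM pipeline stores each page's raw model output here.
page.predictions.vlm_response = VlmPrediction(text="<doctag>...</doctag>")
print(page.predictions.vlm_response.text)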
docling/datamodel/pipeline_options.py
@@ -35,6 +35,7 @@ class AcceleratorOptions(BaseSettings):
 
     num_threads: int = 4
     device: Union[str, AcceleratorDevice] = "auto"
+    cuda_use_flash_attention2: bool = False
 
     @field_validator("device")
     def validate_device(cls, value):
@@ -252,6 +253,45 @@ granite_picture_description = PictureDescriptionVlmOptions(
 )
 
 
+class BaseVlmOptions(BaseModel):
+    kind: str
+    prompt: str
+
+
+class ResponseFormat(str, Enum):
+    DOCTAGS = "doctags"
+    MARKDOWN = "markdown"
+
+
+class HuggingFaceVlmOptions(BaseVlmOptions):
+    kind: Literal["hf_model_options"] = "hf_model_options"
+
+    repo_id: str
+    load_in_8bit: bool = True
+    llm_int8_threshold: float = 6.0
+    quantized: bool = False
+
+    response_format: ResponseFormat
+
+    @property
+    def repo_cache_folder(self) -> str:
+        return self.repo_id.replace("/", "--")
+
+
+smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
+    repo_id="ds4sd/SmolDocling-256M-preview",
+    prompt="Convert this page to docling.",
+    response_format=ResponseFormat.DOCTAGS,
+)
+
+granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
+    repo_id="ibm-granite/granite-vision-3.1-2b-preview",
+    # prompt="OCR the full page to markdown.",
+    prompt="OCR this image.",
+    response_format=ResponseFormat.MARKDOWN,
+)
+
+
 # Define an enum for the backend options
 class PdfBackend(str, Enum):
     """Enum of valid PDF backends."""
@@ -284,7 +324,24 @@ class PipelineOptions(BaseModel):
     enable_remote_services: bool = False
 
 
-class PdfPipelineOptions(PipelineOptions):
+class PaginatedPipelineOptions(PipelineOptions):
+    images_scale: float = 1.0
+    generate_page_images: bool = False
+    generate_picture_images: bool = False
+
+
+class VlmPipelineOptions(PaginatedPipelineOptions):
+    artifacts_path: Optional[Union[Path, str]] = None
+
+    generate_page_images: bool = True
+    force_backend_text: bool = (
+        False  # (To be used with vlms, or other generative models)
+    )
+    # If True, text from backend will be used instead of generated text
+    vlm_options: Union[HuggingFaceVlmOptions] = smoldocling_vlm_conversion_options
+
+
+class PdfPipelineOptions(PaginatedPipelineOptions):
     """Options for the PDF pipeline."""
 
     artifacts_path: Optional[Union[Path, str]] = None
@@ -294,6 +351,10 @@ class PdfPipelineOptions(PipelineOptions):
     do_formula_enrichment: bool = False  # True: perform formula OCR, return Latex code
     do_picture_classification: bool = False  # True: classify pictures in documents
     do_picture_description: bool = False  # True: run describe pictures in documents
+    force_backend_text: bool = (
+        False  # (To be used with vlms, or other generative models)
+    )
+    # If True, text from backend will be used instead of generated text
 
     table_structure_options: TableStructureOptions = TableStructureOptions()
     ocr_options: OcrOptions = EasyOcrOptions()
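These options classes are what the minimal_vlm_pipeline example skipped in the CI workflow above wires together. A sketch of driving the experimental VLM pipeline, here with the Granite Vision options swapped in for the SmolDocling default; the input file name is hypothetical:

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    VlmPipelineOptions,
    granite_vision_vlm_conversion_options,
)
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

pipeline_options = VlmPipelineOptions(
    vlm_options=granite_vision_vlm_conversion_options,  # default: SmolDocling
)
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
        )
    }
)
result = converter.convert("report.pdf")
print(result.document.export_to_markdown())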
docling/models/hf_vlm_model.py (new file, 180 lines)

import logging
import time
from pathlib import Path
from typing import Iterable, List, Optional

from docling.datamodel.base_models import Page, VlmPrediction
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    HuggingFaceVlmOptions,
)
from docling.datamodel.settings import settings
from docling.models.base_model import BasePageModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder

_log = logging.getLogger(__name__)


class HuggingFaceVlmModel(BasePageModel):

    def __init__(
        self,
        enabled: bool,
        artifacts_path: Optional[Path],
        accelerator_options: AcceleratorOptions,
        vlm_options: HuggingFaceVlmOptions,
    ):
        self.enabled = enabled

        self.vlm_options = vlm_options

        if self.enabled:
            import torch
            from transformers import (  # type: ignore
                AutoModelForVision2Seq,
                AutoProcessor,
                BitsAndBytesConfig,
            )

            device = decide_device(accelerator_options.device)
            self.device = device

            _log.debug("Available device for HuggingFace VLM: {}".format(device))

            repo_cache_folder = vlm_options.repo_id.replace("/", "--")

            # PARAMETERS:
            if artifacts_path is None:
                artifacts_path = self.download_models(self.vlm_options.repo_id)
            elif (artifacts_path / repo_cache_folder).exists():
                artifacts_path = artifacts_path / repo_cache_folder

            self.param_question = vlm_options.prompt  # "Perform Layout Analysis."
            self.param_quantization_config = BitsAndBytesConfig(
                load_in_8bit=vlm_options.load_in_8bit,  # True,
                llm_int8_threshold=vlm_options.llm_int8_threshold,  # 6.0
            )
            self.param_quantized = vlm_options.quantized  # False

            self.processor = AutoProcessor.from_pretrained(artifacts_path)
            if not self.param_quantized:
                self.vlm_model = AutoModelForVision2Seq.from_pretrained(
                    artifacts_path,
                    device_map=device,
                    torch_dtype=torch.bfloat16,
                    _attn_implementation=(
                        "flash_attention_2"
                        if self.device.startswith("cuda")
                        and accelerator_options.cuda_use_flash_attention2
                        else "eager"
                    ),
                )  # .to(self.device)

            else:
                self.vlm_model = AutoModelForVision2Seq.from_pretrained(
                    artifacts_path,
                    device_map=device,
                    torch_dtype="auto",
                    quantization_config=self.param_quantization_config,
                    _attn_implementation=(
                        "flash_attention_2"
                        if self.device.startswith("cuda")
                        and accelerator_options.cuda_use_flash_attention2
                        else "eager"
                    ),
                )  # .to(self.device)

    @staticmethod
    def download_models(
        repo_id: str,
        local_dir: Optional[Path] = None,
        force: bool = False,
        progress: bool = False,
    ) -> Path:
        from huggingface_hub import snapshot_download
        from huggingface_hub.utils import disable_progress_bars

        if not progress:
            disable_progress_bars()
        download_path = snapshot_download(
            repo_id=repo_id,
            force_download=force,
            local_dir=local_dir,
            # revision="v0.0.1",
        )

        return Path(download_path)

    def __call__(
        self, conv_res: ConversionResult, page_batch: Iterable[Page]
    ) -> Iterable[Page]:
        for page in page_batch:
            assert page._backend is not None
            if not page._backend.is_valid():
                yield page
            else:
                with TimeRecorder(conv_res, "vlm"):
                    assert page.size is not None

                    hi_res_image = page.get_image(scale=2.0)  # 144dpi
                    # hi_res_image = page.get_image(scale=1.0)  # 72dpi

                    if hi_res_image is not None:
                        im_width, im_height = hi_res_image.size

                    # populate page_tags with predicted doc tags
                    page_tags = ""

                    if hi_res_image:
                        if hi_res_image.mode != "RGB":
                            hi_res_image = hi_res_image.convert("RGB")

                    messages = [
                        {
                            "role": "user",
                            "content": [
                                {
                                    "type": "text",
                                    "text": "This is a page from a document.",
                                },
                                {"type": "image"},
                                {"type": "text", "text": self.param_question},
                            ],
                        }
                    ]
                    prompt = self.processor.apply_chat_template(
                        messages, add_generation_prompt=False
                    )
                    inputs = self.processor(
                        text=prompt, images=[hi_res_image], return_tensors="pt"
                    )
                    inputs = {k: v.to(self.device) for k, v in inputs.items()}

                    start_time = time.time()
                    # Call model to generate:
                    generated_ids = self.vlm_model.generate(
                        **inputs, max_new_tokens=4096, use_cache=True
                    )

                    generation_time = time.time() - start_time
                    generated_texts = self.processor.batch_decode(
                        generated_ids[:, inputs["input_ids"].shape[1] :],
                        skip_special_tokens=False,
                    )[0]

                    num_tokens = len(generated_ids[0])
                    page_tags = generated_texts

                    # inference_time = time.time() - start_time
                    # tokens_per_second = num_tokens / generation_time
                    # print("")
                    # print(f"Page Inference Time: {inference_time:.2f} seconds")
                    # print(f"Total tokens on page: {num_tokens:.2f}")
                    # print(f"Tokens/sec: {tokens_per_second:.2f}")
                    # print("")

                    page.predictions.vlm_response = VlmPrediction(text=page_tags)

                yield page
|
@ -53,9 +53,9 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Initialize processor and model
|
# Initialize processor and model
|
||||||
self.processor = AutoProcessor.from_pretrained(self.options.repo_id)
|
self.processor = AutoProcessor.from_pretrained(artifacts_path)
|
||||||
self.model = AutoModelForVision2Seq.from_pretrained(
|
self.model = AutoModelForVision2Seq.from_pretrained(
|
||||||
self.options.repo_id,
|
artifacts_path,
|
||||||
torch_dtype=torch.bfloat16,
|
torch_dtype=torch.bfloat16,
|
||||||
_attn_implementation=(
|
_attn_implementation=(
|
||||||
"flash_attention_2" if self.device.startswith("cuda") else "eager"
|
"flash_attention_2" if self.device.startswith("cuda") else "eager"
|
||||||
|
docling/pipeline/vlm_pipeline.py (new file, 534 lines)

import itertools
import logging
import re
import warnings
from io import BytesIO

# from io import BytesIO
from pathlib import Path
from typing import Optional

from docling_core.types import DoclingDocument
from docling_core.types.doc import (
    BoundingBox,
    DocItem,
    DocItemLabel,
    DoclingDocument,
    GroupLabel,
    ImageRef,
    ImageRefMode,
    PictureItem,
    ProvenanceItem,
    Size,
    TableCell,
    TableData,
    TableItem,
)
from docling_core.types.doc.tokens import DocumentToken, TableToken

from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
from docling.datamodel.base_models import InputFormat, Page
from docling.datamodel.document import ConversionResult, InputDocument
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    ResponseFormat,
    VlmPipelineOptions,
)
from docling.datamodel.settings import settings
from docling.models.hf_vlm_model import HuggingFaceVlmModel
from docling.pipeline.base_pipeline import PaginatedPipeline
from docling.utils.profiling import ProfilingScope, TimeRecorder

_log = logging.getLogger(__name__)


class VlmPipeline(PaginatedPipeline):

    def __init__(self, pipeline_options: VlmPipelineOptions):
        super().__init__(pipeline_options)
        self.keep_backend = True

        warnings.warn(
            "The VlmPipeline is currently experimental and may change in upcoming versions without notice.",
            category=UserWarning,
            stacklevel=2,
        )

        self.pipeline_options: VlmPipelineOptions

        artifacts_path: Optional[Path] = None
        if pipeline_options.artifacts_path is not None:
            artifacts_path = Path(pipeline_options.artifacts_path).expanduser()
        elif settings.artifacts_path is not None:
            artifacts_path = Path(settings.artifacts_path).expanduser()

        if artifacts_path is not None and not artifacts_path.is_dir():
            raise RuntimeError(
                f"The value of {artifacts_path=} is not valid. "
                "When defined, it must point to a folder containing all models required by the pipeline."
            )

        # force_backend_text = False - use text that is coming from VLM response
        # force_backend_text = True - get text from backend using bounding boxes predicted by SmolDocling doctags
        self.force_backend_text = (
            pipeline_options.force_backend_text
            and pipeline_options.vlm_options.response_format == ResponseFormat.DOCTAGS
        )

        self.keep_images = self.pipeline_options.generate_page_images

        self.build_pipe = [
            HuggingFaceVlmModel(
                enabled=True,  # must be always enabled for this pipeline to make sense.
                artifacts_path=artifacts_path,
                accelerator_options=pipeline_options.accelerator_options,
                vlm_options=self.pipeline_options.vlm_options,
            ),
        ]

        self.enrichment_pipe = [
            # Other models working on `NodeItem` elements in the DoclingDocument
        ]

    def initialize_page(self, conv_res: ConversionResult, page: Page) -> Page:
        with TimeRecorder(conv_res, "page_init"):
            page._backend = conv_res.input._backend.load_page(page.page_no)  # type: ignore
            if page._backend is not None and page._backend.is_valid():
                page.size = page._backend.get_size()

        return page

    def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
        with TimeRecorder(conv_res, "doc_assemble", scope=ProfilingScope.DOCUMENT):

            if (
                self.pipeline_options.vlm_options.response_format
                == ResponseFormat.DOCTAGS
            ):
                conv_res.document = self._turn_tags_into_doc(conv_res.pages)
            elif (
                self.pipeline_options.vlm_options.response_format
                == ResponseFormat.MARKDOWN
            ):
                conv_res.document = self._turn_md_into_doc(conv_res)

            else:
                raise RuntimeError(
                    f"Unsupported VLM response format {self.pipeline_options.vlm_options.response_format}"
                )

            # Generate images of the requested element types
            if self.pipeline_options.generate_picture_images:
                scale = self.pipeline_options.images_scale
                for element, _level in conv_res.document.iterate_items():
                    if not isinstance(element, DocItem) or len(element.prov) == 0:
                        continue
                    if (
                        isinstance(element, PictureItem)
                        and self.pipeline_options.generate_picture_images
                    ):
                        page_ix = element.prov[0].page_no - 1
                        page = conv_res.pages[page_ix]
                        assert page.size is not None
                        assert page.image is not None

                        crop_bbox = (
                            element.prov[0]
                            .bbox.scaled(scale=scale)
                            .to_top_left_origin(page_height=page.size.height * scale)
                        )

                        cropped_im = page.image.crop(crop_bbox.as_tuple())
                        element.image = ImageRef.from_pil(
                            cropped_im, dpi=int(72 * scale)
                        )

        return conv_res

    def _turn_md_into_doc(self, conv_res):
        predicted_text = ""
        for pg_idx, page in enumerate(conv_res.pages):
            if page.predictions.vlm_response:
                predicted_text += page.predictions.vlm_response.text + "\n\n"
        response_bytes = BytesIO(predicted_text.encode("utf8"))
        out_doc = InputDocument(
            path_or_stream=response_bytes,
            filename=conv_res.input.file.name,
            format=InputFormat.MD,
            backend=MarkdownDocumentBackend,
        )
        backend = MarkdownDocumentBackend(
            in_doc=out_doc,
            path_or_stream=response_bytes,
        )
        return backend.convert()

    def _turn_tags_into_doc(self, pages: list[Page]) -> DoclingDocument:
        ###############################################
        # Tag definitions and color mappings
        ###############################################

        # Maps the recognized tag to a Docling label.
        # Code items will be given DocItemLabel.CODE
        tag_to_doclabel = {
            "title": DocItemLabel.TITLE,
            "document_index": DocItemLabel.DOCUMENT_INDEX,
            "otsl": DocItemLabel.TABLE,
            "section_header_level_1": DocItemLabel.SECTION_HEADER,
            "checkbox_selected": DocItemLabel.CHECKBOX_SELECTED,
            "checkbox_unselected": DocItemLabel.CHECKBOX_UNSELECTED,
            "text": DocItemLabel.TEXT,
            "page_header": DocItemLabel.PAGE_HEADER,
            "page_footer": DocItemLabel.PAGE_FOOTER,
            "formula": DocItemLabel.FORMULA,
            "caption": DocItemLabel.CAPTION,
            "picture": DocItemLabel.PICTURE,
            "list_item": DocItemLabel.LIST_ITEM,
            "footnote": DocItemLabel.FOOTNOTE,
            "code": DocItemLabel.CODE,
        }

        # Maps each tag to an associated bounding box color.
        tag_to_color = {
            "title": "blue",
            "document_index": "darkblue",
            "otsl": "green",
            "section_header_level_1": "purple",
            "checkbox_selected": "black",
            "checkbox_unselected": "gray",
            "text": "red",
            "page_header": "orange",
            "page_footer": "cyan",
            "formula": "pink",
            "caption": "magenta",
            "picture": "yellow",
            "list_item": "brown",
            "footnote": "darkred",
            "code": "lightblue",
        }

        def extract_bounding_box(text_chunk: str) -> Optional[BoundingBox]:
            """Extracts <loc_...> bounding box coords from the chunk, normalized by / 500."""
            coords = re.findall(r"<loc_(\d+)>", text_chunk)
            if len(coords) == 4:
                l, t, r, b = map(float, coords)
                return BoundingBox(l=l / 500, t=t / 500, r=r / 500, b=b / 500)
            return None

        def extract_inner_text(text_chunk: str) -> str:
            """Strips all <...> tags inside the chunk to get the raw text content."""
            return re.sub(r"<.*?>", "", text_chunk, flags=re.DOTALL).strip()

        def extract_text_from_backend(page: Page, bbox: BoundingBox | None) -> str:
            # Convert bounding box normalized to 0-100 into page coordinates for cropping
            text = ""
            if bbox:
                if page.size:
                    bbox.l = bbox.l * page.size.width
                    bbox.t = bbox.t * page.size.height
                    bbox.r = bbox.r * page.size.width
                    bbox.b = bbox.b * page.size.height

                    if page._backend:
                        text = page._backend.get_text_in_rect(bbox)
            return text

        def otsl_parse_texts(texts, tokens):
            split_word = TableToken.OTSL_NL.value
            split_row_tokens = [
                list(y)
                for x, y in itertools.groupby(tokens, lambda z: z == split_word)
                if not x
            ]
            table_cells = []
            r_idx = 0

(diff truncated; the remaining lines of the file are not shown in this view)
|
||||||
|
c_idx = 0
|
||||||
|
|
||||||
|
def count_right(tokens, c_idx, r_idx, which_tokens):
|
||||||
|
span = 0
|
||||||
|
c_idx_iter = c_idx
|
||||||
|
while tokens[r_idx][c_idx_iter] in which_tokens:
|
||||||
|
c_idx_iter += 1
|
||||||
|
span += 1
|
||||||
|
if c_idx_iter >= len(tokens[r_idx]):
|
||||||
|
return span
|
||||||
|
return span
|
||||||
|
|
||||||
|
def count_down(tokens, c_idx, r_idx, which_tokens):
|
||||||
|
span = 0
|
||||||
|
r_idx_iter = r_idx
|
||||||
|
while tokens[r_idx_iter][c_idx] in which_tokens:
|
||||||
|
r_idx_iter += 1
|
||||||
|
span += 1
|
||||||
|
if r_idx_iter >= len(tokens):
|
||||||
|
return span
|
||||||
|
return span
|
||||||
|
|
||||||
|
for i, text in enumerate(texts):
|
||||||
|
cell_text = ""
|
||||||
|
if text in [
|
||||||
|
TableToken.OTSL_FCEL.value,
|
||||||
|
TableToken.OTSL_ECEL.value,
|
||||||
|
TableToken.OTSL_CHED.value,
|
||||||
|
TableToken.OTSL_RHED.value,
|
||||||
|
TableToken.OTSL_SROW.value,
|
||||||
|
]:
|
||||||
|
row_span = 1
|
||||||
|
col_span = 1
|
||||||
|
right_offset = 1
|
||||||
|
if text != TableToken.OTSL_ECEL.value:
|
||||||
|
cell_text = texts[i + 1]
|
||||||
|
right_offset = 2
|
||||||
|
|
||||||
|
# Check next element(s) for lcel / ucel / xcel, set properly row_span, col_span
|
||||||
|
next_right_cell = ""
|
||||||
|
if i + right_offset < len(texts):
|
||||||
|
next_right_cell = texts[i + right_offset]
|
||||||
|
|
||||||
|
next_bottom_cell = ""
|
||||||
|
if r_idx + 1 < len(split_row_tokens):
|
||||||
|
if c_idx < len(split_row_tokens[r_idx + 1]):
|
||||||
|
next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
|
||||||
|
|
||||||
|
if next_right_cell in [
|
||||||
|
TableToken.OTSL_LCEL.value,
|
||||||
|
TableToken.OTSL_XCEL.value,
|
||||||
|
]:
|
||||||
|
# we have horisontal spanning cell or 2d spanning cell
|
||||||
|
col_span += count_right(
|
||||||
|
split_row_tokens,
|
||||||
|
c_idx + 1,
|
||||||
|
r_idx,
|
||||||
|
[TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
|
||||||
|
)
|
||||||
|
if next_bottom_cell in [
|
||||||
|
TableToken.OTSL_UCEL.value,
|
||||||
|
TableToken.OTSL_XCEL.value,
|
||||||
|
]:
|
||||||
|
# we have a vertical spanning cell or 2d spanning cell
|
||||||
|
row_span += count_down(
|
||||||
|
split_row_tokens,
|
||||||
|
c_idx,
|
||||||
|
r_idx + 1,
|
||||||
|
[TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
|
||||||
|
)
|
||||||
|
|
||||||
|
table_cells.append(
|
||||||
|
TableCell(
|
||||||
|
text=cell_text.strip(),
|
||||||
|
row_span=row_span,
|
||||||
|
col_span=col_span,
|
||||||
|
start_row_offset_idx=r_idx,
|
||||||
|
end_row_offset_idx=r_idx + row_span,
|
||||||
|
start_col_offset_idx=c_idx,
|
||||||
|
end_col_offset_idx=c_idx + col_span,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
if text in [
|
||||||
|
TableToken.OTSL_FCEL.value,
|
||||||
|
TableToken.OTSL_ECEL.value,
|
||||||
|
TableToken.OTSL_CHED.value,
|
||||||
|
TableToken.OTSL_RHED.value,
|
||||||
|
TableToken.OTSL_SROW.value,
|
||||||
|
TableToken.OTSL_LCEL.value,
|
||||||
|
TableToken.OTSL_UCEL.value,
|
||||||
|
TableToken.OTSL_XCEL.value,
|
||||||
|
]:
|
||||||
|
c_idx += 1
|
||||||
|
if text == TableToken.OTSL_NL.value:
|
||||||
|
r_idx += 1
|
||||||
|
c_idx = 0
|
||||||
|
return table_cells, split_row_tokens
|
||||||
|
|
||||||
|
def otsl_extract_tokens_and_text(s: str):
|
||||||
|
# Pattern to match anything enclosed by < > (including the angle brackets themselves)
|
||||||
|
pattern = r"(<[^>]+>)"
|
||||||
|
# Find all tokens (e.g. "<otsl>", "<loc_140>", etc.)
|
||||||
|
tokens = re.findall(pattern, s)
|
||||||
|
# Remove any tokens that start with "<loc_"
|
||||||
|
tokens = [
|
||||||
|
token
|
||||||
|
for token in tokens
|
||||||
|
if not (
|
||||||
|
token.startswith(rf"<{DocumentToken.LOC.value}")
|
||||||
|
or token
|
||||||
|
in [
|
||||||
|
rf"<{DocumentToken.OTSL.value}>",
|
||||||
|
rf"</{DocumentToken.OTSL.value}>",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
]
|
||||||
|
# Split the string by those tokens to get the in-between text
|
||||||
|
text_parts = re.split(pattern, s)
|
||||||
|
text_parts = [
|
||||||
|
token
|
||||||
|
for token in text_parts
|
||||||
|
if not (
|
||||||
|
token.startswith(rf"<{DocumentToken.LOC.value}")
|
||||||
|
or token
|
||||||
|
in [
|
||||||
|
rf"<{DocumentToken.OTSL.value}>",
|
||||||
|
rf"</{DocumentToken.OTSL.value}>",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
]
|
||||||
|
# Remove any empty or purely whitespace strings from text_parts
|
||||||
|
text_parts = [part for part in text_parts if part.strip()]
|
||||||
|
|
||||||
|
return tokens, text_parts
|
||||||
|
|
||||||
|
def parse_table_content(otsl_content: str) -> TableData:
|
||||||
|
tokens, mixed_texts = otsl_extract_tokens_and_text(otsl_content)
|
||||||
|
table_cells, split_row_tokens = otsl_parse_texts(mixed_texts, tokens)
|
||||||
|
|
||||||
|
return TableData(
|
||||||
|
num_rows=len(split_row_tokens),
|
||||||
|
num_cols=(
|
||||||
|
max(len(row) for row in split_row_tokens) if split_row_tokens else 0
|
||||||
|
),
|
||||||
|
table_cells=table_cells,
|
||||||
|
)
|
||||||
|
|
||||||
|
doc = DoclingDocument(name="Document")
|
||||||
|
for pg_idx, page in enumerate(pages):
|
||||||
|
xml_content = ""
|
||||||
|
predicted_text = ""
|
||||||
|
if page.predictions.vlm_response:
|
||||||
|
predicted_text = page.predictions.vlm_response.text
|
||||||
|
image = page.image
|
||||||
|
|
||||||
|
page_no = pg_idx + 1
|
||||||
|
bounding_boxes = []
|
||||||
|
|
||||||
|
if page.size:
|
||||||
|
pg_width = page.size.width
|
||||||
|
pg_height = page.size.height
|
||||||
|
size = Size(width=pg_width, height=pg_height)
|
||||||
|
parent_page = doc.add_page(page_no=page_no, size=size)
|
||||||
|
|
||||||
|
"""
|
||||||
|
1. Finds all <tag>...</tag> blocks in the entire string (multi-line friendly) in the order they appear.
|
||||||
|
2. For each chunk, extracts bounding box (if any) and inner text.
|
||||||
|
3. Adds the item to a DoclingDocument structure with the right label.
|
||||||
|
4. Tracks bounding boxes + color in a separate list for later visualization.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Regex for all recognized tags
|
||||||
|
tag_pattern = (
|
||||||
|
rf"<(?P<tag>{DocItemLabel.TITLE}|{DocItemLabel.DOCUMENT_INDEX}|"
|
||||||
|
rf"{DocItemLabel.CHECKBOX_UNSELECTED}|{DocItemLabel.CHECKBOX_SELECTED}|"
|
||||||
|
rf"{DocItemLabel.TEXT}|{DocItemLabel.PAGE_HEADER}|"
|
||||||
|
rf"{DocItemLabel.PAGE_FOOTER}|{DocItemLabel.FORMULA}|"
|
||||||
|
rf"{DocItemLabel.CAPTION}|{DocItemLabel.PICTURE}|"
|
||||||
|
rf"{DocItemLabel.LIST_ITEM}|{DocItemLabel.FOOTNOTE}|{DocItemLabel.CODE}|"
|
||||||
|
rf"{DocItemLabel.SECTION_HEADER}_level_1|{DocumentToken.OTSL.value})>.*?</(?P=tag)>"
|
||||||
|
)
|
||||||
|
|
||||||
|
# DocumentToken.OTSL
|
||||||
|
pattern = re.compile(tag_pattern, re.DOTALL)
|
||||||
|
|
||||||
|
# Go through each match in order
|
||||||
|
for match in pattern.finditer(predicted_text):
|
||||||
|
full_chunk = match.group(0)
|
||||||
|
tag_name = match.group("tag")
|
||||||
|
|
||||||
|
bbox = extract_bounding_box(full_chunk)
|
||||||
|
doc_label = tag_to_doclabel.get(tag_name, DocItemLabel.PARAGRAPH)
|
||||||
|
color = tag_to_color.get(tag_name, "white")
|
||||||
|
|
||||||
|
# Store bounding box + color
|
||||||
|
if bbox:
|
||||||
|
bounding_boxes.append((bbox, color))
|
||||||
|
|
||||||
|
if tag_name == DocumentToken.OTSL.value:
|
||||||
|
table_data = parse_table_content(full_chunk)
|
||||||
|
bbox = extract_bounding_box(full_chunk)
|
||||||
|
|
||||||
|
if bbox:
|
||||||
|
prov = ProvenanceItem(
|
||||||
|
bbox=bbox.resize_by_scale(pg_width, pg_height),
|
||||||
|
charspan=(0, 0),
|
||||||
|
page_no=page_no,
|
||||||
|
)
|
||||||
|
doc.add_table(data=table_data, prov=prov)
|
||||||
|
else:
|
||||||
|
doc.add_table(data=table_data)
|
||||||
|
|
||||||
|
elif tag_name == DocItemLabel.PICTURE:
|
||||||
|
text_caption_content = extract_inner_text(full_chunk)
|
||||||
|
if image:
|
||||||
|
if bbox:
|
||||||
|
im_width, im_height = image.size
|
||||||
|
|
||||||
|
crop_box = (
|
||||||
|
int(bbox.l * im_width),
|
||||||
|
int(bbox.t * im_height),
|
||||||
|
int(bbox.r * im_width),
|
||||||
|
int(bbox.b * im_height),
|
||||||
|
)
|
||||||
|
cropped_image = image.crop(crop_box)
|
||||||
|
pic = doc.add_picture(
|
||||||
|
parent=None,
|
||||||
|
image=ImageRef.from_pil(image=cropped_image, dpi=72),
|
||||||
|
prov=(
|
||||||
|
ProvenanceItem(
|
||||||
|
bbox=bbox.resize_by_scale(pg_width, pg_height),
|
||||||
|
charspan=(0, 0),
|
||||||
|
page_no=page_no,
|
||||||
|
)
|
||||||
|
),
|
||||||
|
)
|
||||||
|
# If there is a caption to an image, add it as well
|
||||||
|
if len(text_caption_content) > 0:
|
||||||
|
caption_item = doc.add_text(
|
||||||
|
label=DocItemLabel.CAPTION,
|
||||||
|
text=text_caption_content,
|
||||||
|
parent=None,
|
||||||
|
)
|
||||||
|
pic.captions.append(caption_item.get_ref())
|
||||||
|
else:
|
||||||
|
if bbox:
|
||||||
|
# In case we don't have access to an binary of an image
|
||||||
|
doc.add_picture(
|
||||||
|
parent=None,
|
||||||
|
prov=ProvenanceItem(
|
||||||
|
bbox=bbox, charspan=(0, 0), page_no=page_no
|
||||||
|
),
|
||||||
|
)
|
||||||
|
# If there is a caption to an image, add it as well
|
||||||
|
if len(text_caption_content) > 0:
|
||||||
|
caption_item = doc.add_text(
|
||||||
|
label=DocItemLabel.CAPTION,
|
||||||
|
text=text_caption_content,
|
||||||
|
parent=None,
|
||||||
|
)
|
||||||
|
pic.captions.append(caption_item.get_ref())
|
||||||
|
else:
|
||||||
|
# For everything else, treat as text
|
||||||
|
if self.force_backend_text:
|
||||||
|
text_content = extract_text_from_backend(page, bbox)
|
||||||
|
else:
|
||||||
|
text_content = extract_inner_text(full_chunk)
|
||||||
|
doc.add_text(
|
||||||
|
label=doc_label,
|
||||||
|
text=text_content,
|
||||||
|
prov=(
|
||||||
|
ProvenanceItem(
|
||||||
|
bbox=bbox.resize_by_scale(pg_width, pg_height),
|
||||||
|
charspan=(0, len(text_content)),
|
||||||
|
page_no=page_no,
|
||||||
|
)
|
||||||
|
if bbox
|
||||||
|
else None
|
||||||
|
),
|
||||||
|
)
|
||||||
|
return doc
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_default_options(cls) -> VlmPipelineOptions:
|
||||||
|
return VlmPipelineOptions()
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def is_backend_supported(cls, backend: AbstractDocumentBackend):
|
||||||
|
return isinstance(backend, PdfDocumentBackend)

3 docling/utils/locks.py Normal file
@ -0,0 +1,3 @@
import threading

pypdfium2_lock = threading.Lock()
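This new module gives all pdfium callers a single lock to share, since pypdfium2 itself is not threadsafe. A minimal sketch of the intended usage pattern (the exact call sites in the PDF backend are an assumption here):

```python
import pypdfium2 as pdfium

from docling.utils.locks import pypdfium2_lock


def count_pages(path: str) -> int:
    # All pdfium calls happen under the shared lock, so concurrent
    # conversion threads cannot interleave inside the C library.
    with pypdfium2_lock:
        pdf = pdfium.PdfDocument(path)
        try:
            return len(pdf)
        finally:
            pdf.close()
```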
@ -2,7 +2,10 @@ import logging
from pathlib import Path
from typing import Optional

from docling.datamodel.pipeline_options import (
    granite_picture_description,
    smolvlm_picture_description,
)
from docling.datamodel.settings import settings
from docling.models.code_formula_model import CodeFormulaModel
from docling.models.document_picture_classifier import DocumentPictureClassifier
@ -23,7 +26,8 @@ def download_models(
    with_tableformer: bool = True,
    with_code_formula: bool = True,
    with_picture_classifier: bool = True,
    with_smolvlm: bool = False,
    with_granite_vision: bool = False,
    with_easyocr: bool = True,
):
    if output_dir is None:
@ -73,6 +77,15 @@ def download_models(
        progress=progress,
    )

    if with_granite_vision:
        _log.info(f"Downloading Granite Vision model...")
        PictureDescriptionVlmModel.download_models(
            repo_id=granite_picture_description.repo_id,
            local_dir=output_dir / granite_picture_description.repo_cache_folder,
            force=force,
            progress=progress,
        )

    if with_easyocr:
        _log.info(f"Downloading easyocr models...")
        EasyOcrModel.download_models(
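The new flag slots into the same helper as the other model downloads. A sketch of calling it directly (the module path `docling.utils.model_downloader` is assumed from context):

```python
from pathlib import Path

from docling.utils.model_downloader import download_models  # module path assumed

# Prefetch the optional vision models alongside the defaults.
download_models(
    output_dir=Path("./models"),
    with_smolvlm=True,
    with_granite_vision=True,
)
```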
@ -43,6 +43,11 @@ def draw_clusters(
        y0 *= scale_x
        y1 *= scale_y

        if y1 <= y0:
            y1, y0 = y0, y1
        if x1 <= x0:
            x1, x0 = x0, x1

        cluster_fill_color = (*list(DocItemLabel.get_color(c.label)), 70)
        cluster_outline_color = (
            *list(DocItemLabel.get_color(c.label)),
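The added swap normalizes boxes before drawing: Pillow's `ImageDraw.rectangle` requires `x1 >= x0` and `y1 >= y0`, so coordinates arriving with a flipped origin are reordered first. The same guard as a standalone sketch:

```python
def normalize_box(x0: float, y0: float, x1: float, y1: float):
    # Reorder coordinates into top-left / bottom-right form.
    if y1 <= y0:
        y1, y0 = y0, y1
    if x1 <= x0:
        x1, x0 = x0, x1
    return x0, y0, x1, y1


print(normalize_box(10, 200, 110, 100))  # (10, 100, 110, 200)
```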
@ -1,5 +1,18 @@
## Introduction

!!! note "Chunking approaches"

    Starting from a `DoclingDocument`, there are in principle two possible chunking
    approaches:

    1. exporting the `DoclingDocument` to Markdown (or a similar format) and then
       performing user-defined chunking as a post-processing step, or
    2. using native Docling chunkers, i.e. operating directly on the `DoclingDocument`

    This page is about the latter, i.e. using native Docling chunkers.
    For an example of approach (1), check out e.g.
    [this recipe](../examples/rag_langchain.ipynb) looking at the Markdown export mode.

A *chunker* is a Docling abstraction that, given a
[`DoclingDocument`](./docling_document.md), returns a stream of chunks, each of which
captures some part of the document as a string accompanied by respective metadata.
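For orientation, a minimal sketch of approach (2) with the `HybridChunker`, assuming network access for the conversion:

```python
from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter

doc = DocumentConverter().convert("https://arxiv.org/pdf/2206.01062").document

chunker = HybridChunker()
for i, chunk in enumerate(chunker.chunk(dl_doc=doc)):
    # serialize() renders the chunk text together with its metadata context,
    # which is what an embedding model would typically consume.
    print(f"=== {i} ===")
    print(chunker.serialize(chunk=chunk)[:120])
```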
@ -1,3 +1,7 @@
# WARNING
# This example demonstrates only how to develop a new enrichment model.
# It does not run the actual formula understanding model.

import logging
from pathlib import Path
from typing import Iterable

@ -1,3 +1,7 @@
# WARNING
# This example demonstrates only how to develop a new enrichment model.
# It does not run the actual picture classifier model.

import logging
from pathlib import Path
from typing import Any, Iterable
@ -83,7 +83,15 @@
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors\n"
     ]
    }
   ],
   "source": [
    "from docling.chunking import HybridChunker\n",
    "\n",
@ -91,6 +99,13 @@
    "chunk_iter = chunker.chunk(dl_doc=doc)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> 👉 **NOTE**: As you see above, using the `HybridChunker` can sometimes lead to a warning from the transformers library, however this is a \"false alarm\" — for details check [here](https://ds4sd.github.io/docling/faq/#hybridchunker-triggers-warning-token-indices-sequence-length-is-longer-than-the-specified-maximum-sequence-length-for-this-model)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
@ -337,11 +352,11 @@
   "source": [
    "for i, chunk in enumerate(chunks):\n",
    "    print(f\"=== {i} ===\")\n",
    "    txt_tokens = len(tokenizer.tokenize(chunk.text))\n",
    "    print(f\"chunk.text ({txt_tokens} tokens):\\n{repr(chunk.text)}\")\n",
    "\n",
    "    ser_txt = chunker.serialize(chunk=chunk)\n",
    "    ser_tokens = len(tokenizer.tokenize(ser_txt))\n",
    "    print(f\"chunker.serialize(chunk) ({ser_tokens} tokens):\\n{repr(ser_txt)}\")\n",
    "\n",
    "    print()"
96 docs/examples/minimal_vlm_pipeline.py Normal file
@ -0,0 +1,96 @@
import json
import time
from pathlib import Path

import yaml

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    VlmPipelineOptions,
    granite_vision_vlm_conversion_options,
    smoldocling_vlm_conversion_options,
)
from docling.datamodel.settings import settings
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

sources = [
    "tests/data/2305.03393v1-pg9-img.png",
]

## Use experimental VlmPipeline
pipeline_options = VlmPipelineOptions()
# If force_backend_text = True, text from backend will be used instead of generated text
pipeline_options.force_backend_text = False

## On GPU systems, enable flash_attention_2 with CUDA:
# pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
# pipeline_options.accelerator_options.cuda_use_flash_attention2 = True

## Pick a VLM model. We choose SmolDocling-256M by default
pipeline_options.vlm_options = smoldocling_vlm_conversion_options

## Alternative VLM models:
# pipeline_options.vlm_options = granite_vision_vlm_conversion_options

from docling_core.types.doc import DocItemLabel, ImageRefMode
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS

## Set up pipeline for PDF or image inputs
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        ),
        InputFormat.IMAGE: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        ),
    }
)

out_path = Path("scratch")
out_path.mkdir(parents=True, exist_ok=True)

for source in sources:
    start_time = time.time()
    print("================================================")
    print("Processing... {}".format(source))
    print("================================================")
    print("")

    res = converter.convert(source)

    print("------------------------------------------------")
    print("MD:")
    print("------------------------------------------------")
    print("")
    print(res.document.export_to_markdown())

    for page in res.pages:
        print("")
        print("Predicted page in DOCTAGS:")
        print(page.predictions.vlm_response.text)

    res.document.save_as_html(
        filename=Path("{}/{}.html".format(out_path, res.input.file.stem)),
        image_mode=ImageRefMode.REFERENCED,
        labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
    )

    with (out_path / f"{res.input.file.stem}.json").open("w") as fp:
        fp.write(json.dumps(res.document.export_to_dict()))

    pg_num = res.document.num_pages()

    print("")
    inference_time = time.time() - start_time
    print(
        f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}"
    )

print("================================================")
print("done!")
print("================================================")
@ -132,9 +132,48 @@ This is a collection of FAQ collected from the user questions on <https://github
```

??? question "Some images are missing from MS Word and Powerpoint"

    ### Some images are missing from MS Word and Powerpoint

    The image processing library used by Docling is able to handle embedded WMF images only on Windows platform.
    If you are on other operating systems, these images will be ignored.


??? question "`HybridChunker` triggers warning: 'Token indices sequence length is longer than the specified maximum sequence length for this model'"

    ### `HybridChunker` triggers warning: 'Token indices sequence length is longer than the specified maximum sequence length for this model'

    **TLDR**:
    In the context of the `HybridChunker`, this is a known & anticipated "false alarm".

    **Details**:

    Using the [`HybridChunker`](../concepts/chunking.md#hybrid-chunker) often triggers a warning like this:
    > Token indices sequence length is longer than the specified maximum sequence length for this model (531 > 512). Running this sequence through the model will result in indexing errors

    This is a warning that is emitted by transformers, saying that actually *running this sequence through the model* will result in indexing errors, i.e. the problematic case is only if one indeed passes the particular sequence through the (embedding) model.

    In our case though, this occurs as a "false alarm", since what happens is the following:

    - the chunker invokes the tokenizer on a potentially long sequence (e.g. 531 tokens as mentioned in the warning) in order to count its tokens, i.e. to assess if it is short enough. At this point transformers already emits the warning above!
    - whenever the sequence at hand is oversized, the chunker proceeds to split it (but the transformers warning has already been shown nonetheless)

    What is important is the actual token length of the produced chunks.
    The snippet below can be used for getting the actual maximum chunk size (for users wanting to confirm that this does not exceed the model limit):

    ```python
    chunk_max_len = 0
    for i, chunk in enumerate(chunks):
        ser_txt = chunker.serialize(chunk=chunk)
        ser_tokens = len(tokenizer.tokenize(ser_txt))
        if ser_tokens > chunk_max_len:
            chunk_max_len = ser_tokens
            print(f"{i}\t{ser_tokens}\t{repr(ser_txt[:100])}...")
    print(f"Longest chunk yielded: {chunk_max_len} tokens")
    print(f"Model max length: {tokenizer.model_max_length}")
    ```

    Also see [docling#725](https://github.com/DS4SD/docling/issues/725).

    Source: Issue [docling-core#119](https://github.com/DS4SD/docling-core/issues/119)
@ -47,6 +47,6 @@ Docling simplifies document processing, parsing diverse formats — including ad

Docling has been brought to you by IBM.

[supported_formats]: ./usage/supported_formats.md
[docling_document]: ./concepts/docling_document.md
[integrations]: ./integrations/index.md
216 docs/usage/enrichments.md Normal file
@ -0,0 +1,216 @@
Docling allows enriching the conversion pipeline with additional steps which process specific document components,
e.g. code blocks, pictures, etc. The extra steps usually require additional model executions, which may increase
the processing time considerably. For this reason most enrichment models are disabled by default.

The following table provides an overview of the default enrichment models available in Docling.

| Feature | Parameter | Processed item | Description |
| ------- | --------- | ---------------| ----------- |
| Code understanding | `do_code_enrichment` | `CodeItem` | See [docs below](#code-understanding). |
| Formula understanding | `do_formula_enrichment` | `TextItem` with label `FORMULA` | See [docs below](#formula-understanding). |
| Picture classification | `do_picture_classification` | `PictureItem` | See [docs below](#picture-classification). |
| Picture description | `do_picture_description` | `PictureItem` | See [docs below](#picture-description). |


## Enrichments details

### Code understanding

The code understanding step enables advanced parsing for code blocks found in the document.
This enrichment model also sets the `code_language` property of the `CodeItem`.

Model specs: see the [`CodeFormula` model card](https://huggingface.co/ds4sd/CodeFormula).

Example command line:

```sh
docling --enrich-code FILE
```

Example code:

```py
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat

pipeline_options = PdfPipelineOptions()
pipeline_options.do_code_enrichment = True

converter = DocumentConverter(format_options={
    InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
})

result = converter.convert("https://arxiv.org/pdf/2501.17887")
doc = result.document
```
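After the enrichment has run, the detected language can be read back from the enriched items. A small sketch, assuming the `result` from the snippet above:

```py
from docling_core.types.doc import CodeItem

for item, _level in doc.iterate_items():
    if isinstance(item, CodeItem):
        # code_language is filled in by the code understanding step.
        print(item.code_language, repr(item.text[:60]))
```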

### Formula understanding

The formula understanding step will analyze the equation formulas in documents and extract their LaTeX representation.
The HTML export functions in the DoclingDocument will leverage the formula and visualize the result using the MathML HTML syntax.

Model specs: see the [`CodeFormula` model card](https://huggingface.co/ds4sd/CodeFormula).

Example command line:

```sh
docling --enrich-formula FILE
```

Example code:

```py
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat

pipeline_options = PdfPipelineOptions()
pipeline_options.do_formula_enrichment = True

converter = DocumentConverter(format_options={
    InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
})

result = converter.convert("https://arxiv.org/pdf/2501.17887")
doc = result.document
```

### Picture classification

The picture classification step classifies the `PictureItem` elements in the document with the `DocumentFigureClassifier` model.
This model is specialized to understand the classes of pictures found in documents, e.g. different chart types, flow diagrams,
logos, signatures, etc.

Model specs: see the [`DocumentFigureClassifier` model card](https://huggingface.co/ds4sd/DocumentFigureClassifier).

Example command line:

```sh
docling --enrich-picture-classes FILE
```

Example code:

```py
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat

pipeline_options = PdfPipelineOptions()
pipeline_options.generate_picture_images = True
pipeline_options.images_scale = 2
pipeline_options.do_picture_classification = True

converter = DocumentConverter(format_options={
    InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
})

result = converter.convert("https://arxiv.org/pdf/2501.17887")
doc = result.document
```


### Picture description

The picture description step annotates a picture with a vision model. This is also known as a "captioning" task.
The Docling pipeline can load and run models completely locally, as well as connect to a remote API which supports the chat template.
Below are a few examples of how to use some common vision models and remote services.


```py
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.datamodel.base_models import InputFormat

pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True

converter = DocumentConverter(format_options={
    InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
})

result = converter.convert("https://arxiv.org/pdf/2501.17887")
doc = result.document
```
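The generated captions land on the picture items themselves. A sketch for reading them back, assuming the annotations are `PictureDescriptionData` instances as in docling-core:

```py
from docling_core.types.doc.document import PictureDescriptionData

for picture in doc.pictures:
    for annotation in picture.annotations:
        if isinstance(annotation, PictureDescriptionData):
            print(f"{picture.self_ref}: {annotation.text}")
```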

#### Granite Vision model

Model specs: see the [`ibm-granite/granite-vision-3.1-2b-preview` model card](https://huggingface.co/ibm-granite/granite-vision-3.1-2b-preview).

Usage in Docling:

```py
from docling.datamodel.pipeline_options import granite_picture_description

pipeline_options.picture_description_options = granite_picture_description
```

#### SmolVLM model

Model specs: see the [`HuggingFaceTB/SmolVLM-256M-Instruct` model card](https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct).

Usage in Docling:

```py
from docling.datamodel.pipeline_options import smolvlm_picture_description

pipeline_options.picture_description_options = smolvlm_picture_description
```

#### Other vision models

The option class `PictureDescriptionVlmOptions` allows using any other model from the Hugging Face Hub.

```py
from docling.datamodel.pipeline_options import PictureDescriptionVlmOptions

pipeline_options.picture_description_options = PictureDescriptionVlmOptions(
    repo_id="",  # <-- add here the Hugging Face repo_id of your favorite VLM
    prompt="Describe the image in three sentences. Be concise and accurate.",
)
```

#### Remote vision model

The option class `PictureDescriptionApiOptions` allows using models hosted on remote platforms, e.g.
on local endpoints served by [VLLM](https://docs.vllm.ai), [Ollama](https://ollama.com/) and others,
or cloud providers like [IBM watsonx.ai](https://www.ibm.com/products/watsonx-ai), etc.

_Note: in most cases this option will send your data to the remote service provider._

Usage in Docling:

```py
from docling.datamodel.pipeline_options import PictureDescriptionApiOptions

# Enable connections to remote services
pipeline_options.enable_remote_services = True  # <-- this is required!

# Example using a model running locally, e.g. via VLLM
# $ vllm serve MODEL_NAME
pipeline_options.picture_description_options = PictureDescriptionApiOptions(
    url="http://localhost:8000/v1/chat/completions",
    params=dict(
        model="MODEL NAME",
        seed=42,
        max_completion_tokens=200,
    ),
    prompt="Describe the image in three sentences. Be concise and accurate.",
    timeout=90,
)
```

End-to-end code snippets for cloud providers are available in the examples section:

- [IBM watsonx.ai](../examples/pictures_description_api.py)


## Develop new enrichment models

Besides looking at the implementation of all the models listed above, the Docling documentation has a few examples
dedicated to the implementation of enrichment models.

- [Develop picture enrichment](../examples/develop_picture_enrichment.py)
- [Develop formula enrichment](../examples/develop_formula_understanding.py)
@ -22,7 +22,7 @@ A simple example would look like this:
docling https://arxiv.org/pdf/2206.01062
```

To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](../reference/cli.md).

### Advanced options
@ -104,7 +104,7 @@ The options in this list require the explicit `enable_remote_services=True` when

#### Adjust pipeline features

The example file [custom_convert.py](../examples/custom_convert.py) contains multiple ways
one can adjust the conversion pipeline and features.

##### Control PDF table extraction options
@ -183,13 +183,13 @@ You can limit the CPU threads used by Docling by setting the environment variabl

!!! note

    This section discusses directly invoking a [backend](../concepts/architecture.md),
    i.e. using a low-level API. This should only be done when necessary. For most cases,
    using a `DocumentConverter` (high-level API) as discussed in the sections above
    should suffice — and is the recommended way.

By default, Docling will try to identify the document format to apply the appropriate conversion backend (see the list of [supported formats](../supported_formats.md)).
You can restrict the `DocumentConverter` to a set of allowed document formats, as shown in the [Multi-format conversion](../examples/run_with_formats.py) example.
Alternatively, you can also use the specific backend that matches your document content. For instance, you can use `HTMLDocumentBackend` for HTML pages:

```python
@ -214,9 +214,9 @@ print(dl_doc.export_to_markdown())

## Chunking

You can chunk a Docling document using a [chunker](../concepts/chunking.md), such as a
`HybridChunker`, as shown below (for more details check out
[this example](../examples/hybrid_chunking.ipynb)):

```python
from docling.document_converter import DocumentConverter
@ -1,6 +1,6 @@
Docling can parse various document formats into a unified representation (Docling
Document), which it can export to different formats too — check out
[Architecture](../concepts/architecture.md) for more details.

Below you can find a listing of all supported input and output formats.

@ -22,7 +22,7 @@ Schema-specific support:
|--------|-------------|
| USPTO XML | XML format used by [USPTO](https://www.uspto.gov/patents) patents |
| JATS XML | XML format used by [JATS](https://jats.nlm.nih.gov/) articles |
| Docling JSON | JSON-serialized [Docling Document](../concepts/docling_document.md) |

## Supported output formats
22 mkdocs.yml
@ -54,11 +54,14 @@ theme:
nav:
  - Home:
    - "Docling": index.md
  - Installation:
    - Installation: installation/index.md
  - Usage:
    - Usage: usage/index.md
    - Supported formats: usage/supported_formats.md
    - Enrichment features: usage/enrichments.md
  - FAQ:
    - FAQ: faq/index.md
  - Concepts:
    - Concepts: concepts/index.md
    - Architecture: concepts/architecture.md
@ -72,11 +75,8 @@ nav:
    - "Batch conversion": examples/batch_convert.py
    - "Multi-format conversion": examples/run_with_formats.py
    - "Figure export": examples/export_figures.py
    - "Table export": examples/export_tables.py
    - "Multimodal export": examples/export_multimodal.py
    - "Force full page OCR": examples/full_page_ocr.py
    - "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py
    - "RapidOCR with custom OCR models": examples/rapidocr_with_custom_models.py
@ -90,6 +90,12 @@ nav:
    - examples/rag_haystack.ipynb
    - examples/rag_langchain.ipynb
    - examples/rag_llamaindex.ipynb
  - 🖼️ Picture annotation:
    - "Annotate picture with local VLM": examples/pictures_description.ipynb
    - "Annotate picture with remote VLM": examples/pictures_description_api.py
  - ✨ Enrichment development:
    - "Figure enrichment": examples/develop_picture_enrichment.py
    - "Formula enrichment": examples/develop_formula_understanding.py
  - 🗂️ More examples:
    - examples/rag_weaviate.ipynb
    - RAG with Granite [↗]: https://github.com/ibm-granite-community/granite-snack-cookbook/blob/main/recipes/RAG/Granite_Docling_RAG.ipynb
|
391
poetry.lock
generated
391
poetry.lock
generated
@ -1,104 +1,135 @@
|
|||||||
# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.
|
# This file is automatically @generated by Poetry 1.8.5 and should not be changed by hand.
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "accelerate"
|
||||||
|
version = "1.4.0"
|
||||||
|
description = "Accelerate"
|
||||||
|
optional = true
|
||||||
|
python-versions = ">=3.9.0"
|
||||||
|
files = [
|
||||||
|
{file = "accelerate-1.4.0-py3-none-any.whl", hash = "sha256:f6e1e7dfaf9d799a20a1dc45efbf4b1546163eac133faa5acd0d89177c896e55"},
|
||||||
|
{file = "accelerate-1.4.0.tar.gz", hash = "sha256:37d413e1b64cb8681ccd2908ae211cf73e13e6e636a2f598a96eccaa538773a5"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
huggingface-hub = ">=0.21.0"
|
||||||
|
numpy = ">=1.17,<3.0.0"
|
||||||
|
packaging = ">=20.0"
|
||||||
|
psutil = "*"
|
||||||
|
pyyaml = "*"
|
||||||
|
safetensors = ">=0.4.3"
|
||||||
|
torch = ">=2.0.0"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
deepspeed = ["deepspeed"]
|
||||||
|
dev = ["bitsandbytes", "black (>=23.1,<24.0)", "datasets", "diffusers", "evaluate", "hf-doc-builder (>=0.3.0)", "parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist", "rich", "ruff (>=0.6.4,<0.7.0)", "scikit-learn", "scipy", "timm", "torchdata (>=0.8.0)", "torchpippy (>=0.2.0)", "tqdm", "transformers"]
|
||||||
|
quality = ["black (>=23.1,<24.0)", "hf-doc-builder (>=0.3.0)", "ruff (>=0.6.4,<0.7.0)"]
|
||||||
|
rich = ["rich"]
|
||||||
|
sagemaker = ["sagemaker"]
|
||||||
|
test-dev = ["bitsandbytes", "datasets", "diffusers", "evaluate", "scikit-learn", "scipy", "timm", "torchdata (>=0.8.0)", "torchpippy (>=0.2.0)", "tqdm", "transformers"]
|
||||||
|
test-prod = ["parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist"]
|
||||||
|
test-trackers = ["comet-ml", "dvclive", "tensorboard", "wandb"]
|
||||||
|
testing = ["bitsandbytes", "datasets", "diffusers", "evaluate", "parameterized", "pytest (>=7.2.0,<=8.0.0)", "pytest-subtests", "pytest-xdist", "scikit-learn", "scipy", "timm", "torchdata (>=0.8.0)", "torchpippy (>=0.2.0)", "tqdm", "transformers"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aiohappyeyeballs"
|
name = "aiohappyeyeballs"
|
||||||
version = "2.4.6"
|
version = "2.4.8"
|
||||||
description = "Happy Eyeballs for asyncio"
|
description = "Happy Eyeballs for asyncio"
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.9"
|
python-versions = ">=3.9"
|
||||||
files = [
|
files = [
|
||||||
{file = "aiohappyeyeballs-2.4.6-py3-none-any.whl", hash = "sha256:147ec992cf873d74f5062644332c539fcd42956dc69453fe5204195e560517e1"},
|
{file = "aiohappyeyeballs-2.4.8-py3-none-any.whl", hash = "sha256:6cac4f5dd6e34a9644e69cf9021ef679e4394f54e58a183056d12009e42ea9e3"},
|
||||||
{file = "aiohappyeyeballs-2.4.6.tar.gz", hash = "sha256:9b05052f9042985d32ecbe4b59a77ae19c006a78f1344d7fdad69d28ded3d0b0"},
|
{file = "aiohappyeyeballs-2.4.8.tar.gz", hash = "sha256:19728772cb12263077982d2f55453babd8bec6a052a926cd5c0c42796da8bf62"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aiohttp"
|
name = "aiohttp"
|
||||||
version = "3.11.12"
|
version = "3.11.13"
|
||||||
description = "Async http client/server framework (asyncio)"
|
description = "Async http client/server framework (asyncio)"
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = ">=3.9"
|
python-versions = ">=3.9"
|
||||||
files = [
|
files = [
|
||||||
{file = "aiohttp-3.11.12-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:aa8a8caca81c0a3e765f19c6953416c58e2f4cc1b84829af01dd1c771bb2f91f"},
|
{file = "aiohttp-3.11.13-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:a4fe27dbbeec445e6e1291e61d61eb212ee9fed6e47998b27de71d70d3e8777d"},
|
||||||
{file = "aiohttp-3.11.12-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:84ede78acde96ca57f6cf8ccb8a13fbaf569f6011b9a52f870c662d4dc8cd854"},
|
{file = "aiohttp-3.11.13-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9e64ca2dbea28807f8484c13f684a2f761e69ba2640ec49dacd342763cc265ef"},
|
||||||
{file = "aiohttp-3.11.12-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:584096938a001378484aa4ee54e05dc79c7b9dd933e271c744a97b3b6f644957"},
|
{file = "aiohttp-3.11.13-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9840be675de208d1f68f84d578eaa4d1a36eee70b16ae31ab933520c49ba1325"},
|
||||||
{file = "aiohttp-3.11.12-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:392432a2dde22b86f70dd4a0e9671a349446c93965f261dbaecfaf28813e5c42"},
|
{file = "aiohttp-3.11.13-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28a772757c9067e2aee8a6b2b425d0efaa628c264d6416d283694c3d86da7689"},
|
||||||
{file = "aiohttp-3.11.12-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:88d385b8e7f3a870146bf5ea31786ef7463e99eb59e31db56e2315535d811f55"},
|
{file = "aiohttp-3.11.13-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b88aca5adbf4625e11118df45acac29616b425833c3be7a05ef63a6a4017bfdb"},
|
||||||
{file = "aiohttp-3.11.12-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b10a47e5390c4b30a0d58ee12581003be52eedd506862ab7f97da7a66805befb"},
|
{file = "aiohttp-3.11.13-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ce10ddfbe26ed5856d6902162f71b8fe08545380570a885b4ab56aecfdcb07f4"},
|
||||||
{file = "aiohttp-3.11.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b5263dcede17b6b0c41ef0c3ccce847d82a7da98709e75cf7efde3e9e3b5cae"},
|
{file = "aiohttp-3.11.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa48dac27f41b36735c807d1ab093a8386701bbf00eb6b89a0f69d9fa26b3671"},
|
||||||
{file = "aiohttp-3.11.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:50c5c7b8aa5443304c55c262c5693b108c35a3b61ef961f1e782dd52a2f559c7"},
|
{file = "aiohttp-3.11.13-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:89ce611b1eac93ce2ade68f1470889e0173d606de20c85a012bfa24be96cf867"},
{file = "aiohttp-3.11.12-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:d1c031a7572f62f66f1257db37ddab4cb98bfaf9b9434a3b4840bf3560f5e788"},
{file = "aiohttp-3.11.13-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:78e4dd9c34ec7b8b121854eb5342bac8b02aa03075ae8618b6210a06bbb8a115"},
{file = "aiohttp-3.11.12-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:7e44eba534381dd2687be50cbd5f2daded21575242ecfdaf86bbeecbc38dae8e"},
{file = "aiohttp-3.11.13-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:66047eacbc73e6fe2462b77ce39fc170ab51235caf331e735eae91c95e6a11e4"},
{file = "aiohttp-3.11.12-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:145a73850926018ec1681e734cedcf2716d6a8697d90da11284043b745c286d5"},
{file = "aiohttp-3.11.13-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5ad8f1c19fe277eeb8bc45741c6d60ddd11d705c12a4d8ee17546acff98e0802"},
{file = "aiohttp-3.11.12-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:2c311e2f63e42c1bf86361d11e2c4a59f25d9e7aabdbdf53dc38b885c5435cdb"},
{file = "aiohttp-3.11.13-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:64815c6f02e8506b10113ddbc6b196f58dbef135751cc7c32136df27b736db09"},
{file = "aiohttp-3.11.12-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:ea756b5a7bac046d202a9a3889b9a92219f885481d78cd318db85b15cc0b7bcf"},
{file = "aiohttp-3.11.13-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:967b93f21b426f23ca37329230d5bd122f25516ae2f24a9cea95a30023ff8283"},
{file = "aiohttp-3.11.12-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:526c900397f3bbc2db9cb360ce9c35134c908961cdd0ac25b1ae6ffcaa2507ff"},
{file = "aiohttp-3.11.13-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:cf1f31f83d16ec344136359001c5e871915c6ab685a3d8dee38e2961b4c81730"},
{file = "aiohttp-3.11.12-cp310-cp310-win32.whl", hash = "sha256:b8d3bb96c147b39c02d3db086899679f31958c5d81c494ef0fc9ef5bb1359b3d"},
{file = "aiohttp-3.11.13-cp310-cp310-win32.whl", hash = "sha256:00c8ac69e259c60976aa2edae3f13d9991cf079aaa4d3cd5a49168ae3748dee3"},
{file = "aiohttp-3.11.12-cp310-cp310-win_amd64.whl", hash = "sha256:7fe3d65279bfbee8de0fb4f8c17fc4e893eed2dba21b2f680e930cc2b09075c5"},
{file = "aiohttp-3.11.13-cp310-cp310-win_amd64.whl", hash = "sha256:90d571c98d19a8b6e793b34aa4df4cee1e8fe2862d65cc49185a3a3d0a1a3996"},
{file = "aiohttp-3.11.12-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:87a2e00bf17da098d90d4145375f1d985a81605267e7f9377ff94e55c5d769eb"},
{file = "aiohttp-3.11.13-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6b35aab22419ba45f8fc290d0010898de7a6ad131e468ffa3922b1b0b24e9d2e"},
{file = "aiohttp-3.11.12-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b34508f1cd928ce915ed09682d11307ba4b37d0708d1f28e5774c07a7674cac9"},
{file = "aiohttp-3.11.13-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:f81cba651db8795f688c589dd11a4fbb834f2e59bbf9bb50908be36e416dc760"},
{file = "aiohttp-3.11.12-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:936d8a4f0f7081327014742cd51d320296b56aa6d324461a13724ab05f4b2933"},
{file = "aiohttp-3.11.13-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f55d0f242c2d1fcdf802c8fabcff25a9d85550a4cf3a9cf5f2a6b5742c992839"},
{file = "aiohttp-3.11.12-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2de1378f72def7dfb5dbd73d86c19eda0ea7b0a6873910cc37d57e80f10d64e1"},
{file = "aiohttp-3.11.13-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c4bea08a6aad9195ac9b1be6b0c7e8a702a9cec57ce6b713698b4a5afa9c2e33"},
{file = "aiohttp-3.11.12-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b9d45dbb3aaec05cf01525ee1a7ac72de46a8c425cb75c003acd29f76b1ffe94"},
{file = "aiohttp-3.11.13-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c6070bcf2173a7146bb9e4735b3c62b2accba459a6eae44deea0eb23e0035a23"},
{file = "aiohttp-3.11.12-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:930ffa1925393381e1e0a9b82137fa7b34c92a019b521cf9f41263976666a0d6"},
{file = "aiohttp-3.11.13-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:718d5deb678bc4b9d575bfe83a59270861417da071ab44542d0fcb6faa686636"},
{file = "aiohttp-3.11.12-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8340def6737118f5429a5df4e88f440746b791f8f1c4ce4ad8a595f42c980bd5"},
{file = "aiohttp-3.11.13-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0f6b2c5b4a4d22b8fb2c92ac98e0747f5f195e8e9448bfb7404cd77e7bfa243f"},
{file = "aiohttp-3.11.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4016e383f91f2814e48ed61e6bda7d24c4d7f2402c75dd28f7e1027ae44ea204"},
{file = "aiohttp-3.11.13-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:747ec46290107a490d21fe1ff4183bef8022b848cf9516970cb31de6d9460088"},
{file = "aiohttp-3.11.12-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:3c0600bcc1adfaaac321422d615939ef300df81e165f6522ad096b73439c0f58"},
{file = "aiohttp-3.11.13-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:01816f07c9cc9d80f858615b1365f8319d6a5fd079cd668cc58e15aafbc76a54"},
{file = "aiohttp-3.11.12-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:0450ada317a65383b7cce9576096150fdb97396dcfe559109b403c7242faffef"},
{file = "aiohttp-3.11.13-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:a08ad95fcbd595803e0c4280671d808eb170a64ca3f2980dd38e7a72ed8d1fea"},
{file = "aiohttp-3.11.12-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:850ff6155371fd802a280f8d369d4e15d69434651b844bde566ce97ee2277420"},
{file = "aiohttp-3.11.13-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:c97be90d70f7db3aa041d720bfb95f4869d6063fcdf2bb8333764d97e319b7d0"},
{file = "aiohttp-3.11.12-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:8fd12d0f989c6099e7b0f30dc6e0d1e05499f3337461f0b2b0dadea6c64b89df"},
{file = "aiohttp-3.11.13-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:ab915a57c65f7a29353c8014ac4be685c8e4a19e792a79fe133a8e101111438e"},
{file = "aiohttp-3.11.12-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:76719dd521c20a58a6c256d058547b3a9595d1d885b830013366e27011ffe804"},
{file = "aiohttp-3.11.13-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:35cda4e07f5e058a723436c4d2b7ba2124ab4e0aa49e6325aed5896507a8a42e"},
{file = "aiohttp-3.11.12-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:97fe431f2ed646a3b56142fc81d238abcbaff08548d6912acb0b19a0cadc146b"},
{file = "aiohttp-3.11.13-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:af55314407714fe77a68a9ccaab90fdb5deb57342585fd4a3a8102b6d4370080"},
{file = "aiohttp-3.11.12-cp311-cp311-win32.whl", hash = "sha256:e10c440d142fa8b32cfdb194caf60ceeceb3e49807072e0dc3a8887ea80e8c16"},
{file = "aiohttp-3.11.13-cp311-cp311-win32.whl", hash = "sha256:42d689a5c0a0c357018993e471893e939f555e302313d5c61dfc566c2cad6185"},
{file = "aiohttp-3.11.12-cp311-cp311-win_amd64.whl", hash = "sha256:246067ba0cf5560cf42e775069c5d80a8989d14a7ded21af529a4e10e3e0f0e6"},
{file = "aiohttp-3.11.13-cp311-cp311-win_amd64.whl", hash = "sha256:b73a2b139782a07658fbf170fe4bcdf70fc597fae5ffe75e5b67674c27434a9f"},
{file = "aiohttp-3.11.12-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:e392804a38353900c3fd8b7cacbea5132888f7129f8e241915e90b85f00e3250"},
{file = "aiohttp-3.11.13-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:2eabb269dc3852537d57589b36d7f7362e57d1ece308842ef44d9830d2dc3c90"},
{file = "aiohttp-3.11.12-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:8fa1510b96c08aaad49303ab11f8803787c99222288f310a62f493faf883ede1"},
{file = "aiohttp-3.11.13-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7b77ee42addbb1c36d35aca55e8cc6d0958f8419e458bb70888d8c69a4ca833d"},
{file = "aiohttp-3.11.12-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:dc065a4285307607df3f3686363e7f8bdd0d8ab35f12226362a847731516e42c"},
{file = "aiohttp-3.11.13-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:55789e93c5ed71832e7fac868167276beadf9877b85697020c46e9a75471f55f"},
{file = "aiohttp-3.11.12-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cddb31f8474695cd61fc9455c644fc1606c164b93bff2490390d90464b4655df"},
{file = "aiohttp-3.11.13-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c929f9a7249a11e4aa5c157091cfad7f49cc6b13f4eecf9b747104befd9f56f2"},
{file = "aiohttp-3.11.12-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9dec0000d2d8621d8015c293e24589d46fa218637d820894cb7356c77eca3259"},
{file = "aiohttp-3.11.13-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d33851d85537bbf0f6291ddc97926a754c8f041af759e0aa0230fe939168852b"},
{file = "aiohttp-3.11.12-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e3552fe98e90fdf5918c04769f338a87fa4f00f3b28830ea9b78b1bdc6140e0d"},
{file = "aiohttp-3.11.13-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9229d8613bd8401182868fe95688f7581673e1c18ff78855671a4b8284f47bcb"},
{file = "aiohttp-3.11.12-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6dfe7f984f28a8ae94ff3a7953cd9678550dbd2a1f9bda5dd9c5ae627744c78e"},
{file = "aiohttp-3.11.13-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:669dd33f028e54fe4c96576f406ebb242ba534dd3a981ce009961bf49960f117"},
{file = "aiohttp-3.11.12-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a481a574af914b6e84624412666cbfbe531a05667ca197804ecc19c97b8ab1b0"},
{file = "aiohttp-3.11.13-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c1b20a1ace54af7db1f95af85da530fe97407d9063b7aaf9ce6a32f44730778"},
{file = "aiohttp-3.11.12-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1987770fb4887560363b0e1a9b75aa303e447433c41284d3af2840a2f226d6e0"},
{file = "aiohttp-3.11.13-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:5724cc77f4e648362ebbb49bdecb9e2b86d9b172c68a295263fa072e679ee69d"},
{file = "aiohttp-3.11.12-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:a4ac6a0f0f6402854adca4e3259a623f5c82ec3f0c049374133bcb243132baf9"},
{file = "aiohttp-3.11.13-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:aa36c35e94ecdb478246dd60db12aba57cfcd0abcad43c927a8876f25734d496"},
{file = "aiohttp-3.11.12-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:c96a43822f1f9f69cc5c3706af33239489a6294be486a0447fb71380070d4d5f"},
{file = "aiohttp-3.11.13-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:9b5b37c863ad5b0892cc7a4ceb1e435e5e6acd3f2f8d3e11fa56f08d3c67b820"},
{file = "aiohttp-3.11.12-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:a5e69046f83c0d3cb8f0d5bd9b8838271b1bc898e01562a04398e160953e8eb9"},
{file = "aiohttp-3.11.13-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:e06cf4852ce8c4442a59bae5a3ea01162b8fcb49ab438d8548b8dc79375dad8a"},
{file = "aiohttp-3.11.12-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:68d54234c8d76d8ef74744f9f9fc6324f1508129e23da8883771cdbb5818cbef"},
{file = "aiohttp-3.11.13-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:5194143927e494616e335d074e77a5dac7cd353a04755330c9adc984ac5a628e"},
{file = "aiohttp-3.11.12-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c9fd9dcf9c91affe71654ef77426f5cf8489305e1c66ed4816f5a21874b094b9"},
{file = "aiohttp-3.11.13-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:afcb6b275c2d2ba5d8418bf30a9654fa978b4f819c2e8db6311b3525c86fe637"},
{file = "aiohttp-3.11.12-cp312-cp312-win32.whl", hash = "sha256:0ed49efcd0dc1611378beadbd97beb5d9ca8fe48579fc04a6ed0844072261b6a"},
{file = "aiohttp-3.11.13-cp312-cp312-win32.whl", hash = "sha256:7104d5b3943c6351d1ad7027d90bdd0ea002903e9f610735ac99df3b81f102ee"},
{file = "aiohttp-3.11.12-cp312-cp312-win_amd64.whl", hash = "sha256:54775858c7f2f214476773ce785a19ee81d1294a6bedc5cc17225355aab74802"},
{file = "aiohttp-3.11.13-cp312-cp312-win_amd64.whl", hash = "sha256:47dc018b1b220c48089b5b9382fbab94db35bef2fa192995be22cbad3c5730c8"},
{file = "aiohttp-3.11.12-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:413ad794dccb19453e2b97c2375f2ca3cdf34dc50d18cc2693bd5aed7d16f4b9"},
{file = "aiohttp-3.11.13-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:9862d077b9ffa015dbe3ce6c081bdf35135948cb89116e26667dd183550833d1"},
{file = "aiohttp-3.11.12-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4a93d28ed4b4b39e6f46fd240896c29b686b75e39cc6992692e3922ff6982b4c"},
{file = "aiohttp-3.11.13-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:fbfef0666ae9e07abfa2c54c212ac18a1f63e13e0760a769f70b5717742f3ece"},
{file = "aiohttp-3.11.12-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d589264dbba3b16e8951b6f145d1e6b883094075283dafcab4cdd564a9e353a0"},
{file = "aiohttp-3.11.13-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:93a1f7d857c4fcf7cabb1178058182c789b30d85de379e04f64c15b7e88d66fb"},
{file = "aiohttp-3.11.12-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e5148ca8955affdfeb864aca158ecae11030e952b25b3ae15d4e2b5ba299bad2"},
{file = "aiohttp-3.11.13-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ba40b7ae0f81c7029583a338853f6607b6d83a341a3dcde8bed1ea58a3af1df9"},
{file = "aiohttp-3.11.12-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:525410e0790aab036492eeea913858989c4cb070ff373ec3bc322d700bdf47c1"},
{file = "aiohttp-3.11.13-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b5b95787335c483cd5f29577f42bbe027a412c5431f2f80a749c80d040f7ca9f"},
{file = "aiohttp-3.11.12-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9bd8695be2c80b665ae3f05cb584093a1e59c35ecb7d794d1edd96e8cc9201d7"},
{file = "aiohttp-3.11.13-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a7d474c5c1f0b9405c1565fafdc4429fa7d986ccbec7ce55bc6a330f36409cad"},
{file = "aiohttp-3.11.12-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f0203433121484b32646a5f5ea93ae86f3d9559d7243f07e8c0eab5ff8e3f70e"},
{file = "aiohttp-3.11.13-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e83fb1991e9d8982b3b36aea1e7ad27ea0ce18c14d054c7a404d68b0319eebb"},
{file = "aiohttp-3.11.12-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:40cd36749a1035c34ba8d8aaf221b91ca3d111532e5ccb5fa8c3703ab1b967ed"},
{file = "aiohttp-3.11.13-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4586a68730bd2f2b04a83e83f79d271d8ed13763f64b75920f18a3a677b9a7f0"},
{file = "aiohttp-3.11.12-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a7442662afebbf7b4c6d28cb7aab9e9ce3a5df055fc4116cc7228192ad6cb484"},
{file = "aiohttp-3.11.13-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:9fe4eb0e7f50cdb99b26250d9328faef30b1175a5dbcfd6d0578d18456bac567"},
{file = "aiohttp-3.11.12-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:8a2fb742ef378284a50766e985804bd6adb5adb5aa781100b09befdbfa757b65"},
{file = "aiohttp-3.11.13-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:2a8a6bc19818ac3e5596310ace5aa50d918e1ebdcc204dc96e2f4d505d51740c"},
{file = "aiohttp-3.11.12-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:2cee3b117a8d13ab98b38d5b6bdcd040cfb4181068d05ce0c474ec9db5f3c5bb"},
{file = "aiohttp-3.11.13-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:7f27eec42f6c3c1df09cfc1f6786308f8b525b8efaaf6d6bd76c1f52c6511f6a"},
{file = "aiohttp-3.11.12-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f6a19bcab7fbd8f8649d6595624856635159a6527861b9cdc3447af288a00c00"},
{file = "aiohttp-3.11.13-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:2a4a13dfbb23977a51853b419141cd0a9b9573ab8d3a1455c6e63561387b52ff"},
{file = "aiohttp-3.11.12-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:e4cecdb52aaa9994fbed6b81d4568427b6002f0a91c322697a4bfcc2b2363f5a"},
{file = "aiohttp-3.11.13-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:02876bf2f69b062584965507b07bc06903c2dc93c57a554b64e012d636952654"},
{file = "aiohttp-3.11.12-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:30f546358dfa0953db92ba620101fefc81574f87b2346556b90b5f3ef16e55ce"},
{file = "aiohttp-3.11.13-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b992778d95b60a21c4d8d4a5f15aaab2bd3c3e16466a72d7f9bfd86e8cea0d4b"},
{file = "aiohttp-3.11.12-cp313-cp313-win32.whl", hash = "sha256:ce1bb21fc7d753b5f8a5d5a4bae99566386b15e716ebdb410154c16c91494d7f"},
{file = "aiohttp-3.11.13-cp313-cp313-win32.whl", hash = "sha256:507ab05d90586dacb4f26a001c3abf912eb719d05635cbfad930bdbeb469b36c"},
{file = "aiohttp-3.11.12-cp313-cp313-win_amd64.whl", hash = "sha256:f7914ab70d2ee8ab91c13e5402122edbc77821c66d2758abb53aabe87f013287"},
{file = "aiohttp-3.11.13-cp313-cp313-win_amd64.whl", hash = "sha256:5ceb81a4db2decdfa087381b5fc5847aa448244f973e5da232610304e199e7b2"},
{file = "aiohttp-3.11.12-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:7c3623053b85b4296cd3925eeb725e386644fd5bc67250b3bb08b0f144803e7b"},
{file = "aiohttp-3.11.13-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:51c3ff9c7a25f3cad5c09d9aacbc5aefb9267167c4652c1eb737989b554fe278"},
{file = "aiohttp-3.11.12-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:67453e603cea8e85ed566b2700efa1f6916aefbc0c9fcb2e86aaffc08ec38e78"},
{file = "aiohttp-3.11.13-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e271beb2b1dabec5cd84eb488bdabf9758d22ad13471e9c356be07ad139b3012"},
{file = "aiohttp-3.11.12-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6130459189e61baac5a88c10019b21e1f0c6d00ebc770e9ce269475650ff7f73"},
{file = "aiohttp-3.11.13-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0e9eb7e5764abcb49f0e2bd8f5731849b8728efbf26d0cac8e81384c95acec3f"},
{file = "aiohttp-3.11.12-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9060addfa4ff753b09392efe41e6af06ea5dd257829199747b9f15bfad819460"},
{file = "aiohttp-3.11.13-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:baae005092e3f200de02699314ac8933ec20abf998ec0be39448f6605bce93df"},
{file = "aiohttp-3.11.12-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:34245498eeb9ae54c687a07ad7f160053911b5745e186afe2d0c0f2898a1ab8a"},
{file = "aiohttp-3.11.13-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1982c98ac62c132d2b773d50e2fcc941eb0b8bad3ec078ce7e7877c4d5a2dce7"},
{file = "aiohttp-3.11.12-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8dc0fba9a74b471c45ca1a3cb6e6913ebfae416678d90529d188886278e7f3f6"},
{file = "aiohttp-3.11.13-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d2b25b2eeb35707113b2d570cadc7c612a57f1c5d3e7bb2b13870fe284e08fc0"},
{file = "aiohttp-3.11.12-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a478aa11b328983c4444dacb947d4513cb371cd323f3845e53caeda6be5589d5"},
{file = "aiohttp-3.11.13-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b27961d65639128336b7a7c3f0046dcc62a9443d5ef962e3c84170ac620cec47"},
{file = "aiohttp-3.11.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c160a04283c8c6f55b5bf6d4cad59bb9c5b9c9cd08903841b25f1f7109ef1259"},
{file = "aiohttp-3.11.13-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a01fe9f1e05025eacdd97590895e2737b9f851d0eb2e017ae9574d9a4f0b6252"},
{file = "aiohttp-3.11.12-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:edb69b9589324bdc40961cdf0657815df674f1743a8d5ad9ab56a99e4833cfdd"},
{file = "aiohttp-3.11.13-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:fa1fb1b61881c8405829c50e9cc5c875bfdbf685edf57a76817dfb50643e4a1a"},
{file = "aiohttp-3.11.12-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:4ee84c2a22a809c4f868153b178fe59e71423e1f3d6a8cd416134bb231fbf6d3"},
{file = "aiohttp-3.11.13-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:25de43bb3cf83ad83efc8295af7310219af6dbe4c543c2e74988d8e9c8a2a917"},
{file = "aiohttp-3.11.12-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:bf4480a5438f80e0f1539e15a7eb8b5f97a26fe087e9828e2c0ec2be119a9f72"},
{file = "aiohttp-3.11.13-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:fe7065e2215e4bba63dc00db9ae654c1ba3950a5fff691475a32f511142fcddb"},
{file = "aiohttp-3.11.12-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:e6b2732ef3bafc759f653a98881b5b9cdef0716d98f013d376ee8dfd7285abf1"},
{file = "aiohttp-3.11.13-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:7836587eef675a17d835ec3d98a8c9acdbeb2c1d72b0556f0edf4e855a25e9c1"},
{file = "aiohttp-3.11.12-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:f752e80606b132140883bb262a457c475d219d7163d996dc9072434ffb0784c4"},
{file = "aiohttp-3.11.13-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:85fa0b18558eb1427090912bd456a01f71edab0872f4e0f9e4285571941e4090"},
{file = "aiohttp-3.11.12-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:ab3247d58b393bda5b1c8f31c9edece7162fc13265334217785518dd770792b8"},
{file = "aiohttp-3.11.13-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:a86dc177eb4c286c19d1823ac296299f59ed8106c9536d2b559f65836e0fb2c6"},
{file = "aiohttp-3.11.12-cp39-cp39-win32.whl", hash = "sha256:0d5176f310a7fe6f65608213cc74f4228e4f4ce9fd10bcb2bb6da8fc66991462"},
{file = "aiohttp-3.11.13-cp39-cp39-win32.whl", hash = "sha256:684eea71ab6e8ade86b9021bb62af4bf0881f6be4e926b6b5455de74e420783a"},
{file = "aiohttp-3.11.12-cp39-cp39-win_amd64.whl", hash = "sha256:74bd573dde27e58c760d9ca8615c41a57e719bff315c9adb6f2a4281a28e8798"},
{file = "aiohttp-3.11.13-cp39-cp39-win_amd64.whl", hash = "sha256:82c249f2bfa5ecbe4a1a7902c81c0fba52ed9ebd0176ab3047395d02ad96cfcb"},
{file = "aiohttp-3.11.12.tar.gz", hash = "sha256:7603ca26d75b1b86160ce1bbe2787a0b706e592af5b2504e12caa88a217767b0"},
{file = "aiohttp-3.11.13.tar.gz", hash = "sha256:8ce789231404ca8fff7f693cdce398abf6d90fd5dae2b1847477196c243b1fbb"},
]

[package.dependencies]
@ -187,8 +218,8 @@ files = [
lazy-object-proxy = ">=1.4.0"
typing-extensions = {version = ">=4.0.0", markers = "python_version < \"3.11\""}
wrapt = [
{version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
{version = ">=1.11,<2", markers = "python_version < \"3.11\""},
{version = ">=1.14,<2", markers = "python_version >= \"3.11\""},
]

[[package]]
@ -280,6 +311,24 @@ files = [
docs = ["furo", "jaraco.packaging (>=9.3)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"]
testing = ["jaraco.test", "pytest (!=8.0.*)", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)"]

[[package]]
name = "backrefs"
version = "5.8"
description = "A wrapper around re and regex that adds additional back references."
optional = false
python-versions = ">=3.9"
files = [
{file = "backrefs-5.8-py310-none-any.whl", hash = "sha256:c67f6638a34a5b8730812f5101376f9d41dc38c43f1fdc35cb54700f6ed4465d"},
{file = "backrefs-5.8-py311-none-any.whl", hash = "sha256:2e1c15e4af0e12e45c8701bd5da0902d326b2e200cafcd25e49d9f06d44bb61b"},
{file = "backrefs-5.8-py312-none-any.whl", hash = "sha256:bbef7169a33811080d67cdf1538c8289f76f0942ff971222a16034da88a73486"},
{file = "backrefs-5.8-py313-none-any.whl", hash = "sha256:e3a63b073867dbefd0536425f43db618578528e3896fb77be7141328642a1585"},
{file = "backrefs-5.8-py39-none-any.whl", hash = "sha256:a66851e4533fb5b371aa0628e1fee1af05135616b86140c9d787a2ffdf4b8fdc"},
{file = "backrefs-5.8.tar.gz", hash = "sha256:2cab642a205ce966af3dd4b38ee36009b31fa9502a35fd61d59ccc116e40a6bd"},
]

[package.extras]
extras = ["regex"]

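backrefs lands in the lock file here as a new transitive dependency of the mkdocs-material bump further down (pinned there as `backrefs = ">=5.7.post1,<6.0"`). A hedged sketch of what it layers on top of the standard `re` module, assuming the `bre` drop-in wrapper and the `\Q...\E` / `\c` escapes described in the project's documentation:

```python
# Hedged sketch, assuming backrefs' `bre` module mirrors the `re` API:
# \Q...\E quotes a span literally, and \c upper-cases the next character
# in a replacement template -- escapes that plain `re` does not support.
from backrefs import bre

print(bre.search(r"\Qa+b\E", "a+b"))              # matches the literal text "a+b"
print(bre.sub(r"(\w+)", r"\c\1", "hello world"))  # -> "Hello World"
```
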
[[package]]
name = "beautifulsoup4"
version = "4.13.3"
@ -821,13 +870,13 @@ files = [

[[package]]
name = "docling-core"
version = "2.20.0"
version = "2.21.1"
description = "A python library to define and validate data types in Docling."
optional = false
python-versions = "<4.0,>=3.9"
files = [
{file = "docling_core-2.20.0-py3-none-any.whl", hash = "sha256:72f50fce277b7bb51f4134f443240c041582184305c3bcaabdea13fc5550f160"},
{file = "docling_core-2.21.1-py3-none-any.whl", hash = "sha256:b8112915728cdc14f328f636f6c0ed36e6bbcc02ff940cc0bf85e303738671c3"},
{file = "docling_core-2.20.0.tar.gz", hash = "sha256:9733581c15f5a9b5e3a6cb74fa995cc4078ff16668007f86c5f75d1ea9180d7f"},
{file = "docling_core-2.21.1.tar.gz", hash = "sha256:3ccc50197d24a3156cfc6c22c8404c58757749646d876a1c1c69fd800f664a4f"},
]

[package.dependencies]
@ -849,13 +898,13 @@ chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"]

[[package]]
name = "docling-ibm-models"
version = "3.4.0"
version = "3.4.1"
description = "This package contains the AI models used by the Docling PDF conversion package"
optional = false
python-versions = "<4.0,>=3.9"
files = [
{file = "docling_ibm_models-3.4.0-py3-none-any.whl", hash = "sha256:186517ff1f76e76113600fa1e5a699927325081a8013fdd5d0551121c2e34190"},
{file = "docling_ibm_models-3.4.1-py3-none-any.whl", hash = "sha256:c3582c99dddfa3f0eafcf80cf1267fd8efa39c4a74cc7a88f9dd49684fac2986"},
{file = "docling_ibm_models-3.4.0.tar.gz", hash = "sha256:fb79beeb07d1bb9bc8acf9d0a44643cd7ce1910aa418cd685e2e477b13eeafee"},
{file = "docling_ibm_models-3.4.1.tar.gz", hash = "sha256:093b4dff2ea284a4953c3aa009e29945208b8d389b94fb14940a03a93f673e96"},
]

[package.dependencies]
@ -1300,13 +1349,13 @@ test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit",

[[package]]
name = "griffe"
version = "1.5.7"
version = "1.6.0"
description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API."
optional = false
python-versions = ">=3.9"
files = [
{file = "griffe-1.5.7-py3-none-any.whl", hash = "sha256:4af8ec834b64de954d447c7b6672426bb145e71605c74a4e22d510cc79fe7d8b"},
{file = "griffe-1.6.0-py3-none-any.whl", hash = "sha256:9f1dfe035d4715a244ed2050dfbceb05b1f470809ed4f6bb10ece5a7302f8dd1"},
{file = "griffe-1.5.7.tar.gz", hash = "sha256:465238c86deaf1137761f700fb343edd8ffc846d72f6de43c3c345ccdfbebe92"},
{file = "griffe-1.6.0.tar.gz", hash = "sha256:eb5758088b9c73ad61c7ac014f3cdfb4c57b5c2fcbfca69996584b702aefa354"},
]

[package.dependencies]
@ -1787,18 +1836,18 @@ testing = ["Django", "attrs", "colorama", "docopt", "pytest (<9.0.0)"]

[[package]]
name = "jeepney"
version = "0.8.0"
version = "0.9.0"
description = "Low-level, pure Python DBus protocol wrapper."
optional = false
python-versions = ">=3.7"
files = [
{file = "jeepney-0.8.0-py3-none-any.whl", hash = "sha256:c0a454ad016ca575060802ee4d590dd912e35c122fa04e70306de3d076cce755"},
{file = "jeepney-0.9.0-py3-none-any.whl", hash = "sha256:97e5714520c16fc0a45695e5365a2e11b81ea79bba796e26f9f1d178cb182683"},
{file = "jeepney-0.8.0.tar.gz", hash = "sha256:5efe48d255973902f6badc3ce55e2aa6c5c3b3bc642059ef3a91247bcfcc5806"},
{file = "jeepney-0.9.0.tar.gz", hash = "sha256:cf0e9e845622b81e4a28df94c40345400256ec608d0e55bb8a3feaa9163f5732"},
]

[package.extras]
test = ["async-timeout", "pytest", "pytest-asyncio (>=0.17)", "pytest-trio", "testpath", "trio"]
trio = ["async_generator", "trio"]
trio = ["trio"]

[[package]]
name = "jinja2"
@ -2617,13 +2666,13 @@ min-versions = ["babel (==2.9.0)", "click (==7.0)", "colorama (==0.4)", "ghp-imp

[[package]]
name = "mkdocs-autorefs"
version = "1.3.1"
version = "1.4.0"
description = "Automatically link across pages in MkDocs."
optional = false
python-versions = ">=3.9"
files = [
{file = "mkdocs_autorefs-1.3.1-py3-none-any.whl", hash = "sha256:18c504ae4d3ee7f344369bb26cb31d4105569ee252aab7d75ec2734c2c8b0474"},
{file = "mkdocs_autorefs-1.4.0-py3-none-any.whl", hash = "sha256:bad19f69655878d20194acd0162e29a89c3f7e6365ffe54e72aa3fd1072f240d"},
{file = "mkdocs_autorefs-1.3.1.tar.gz", hash = "sha256:a6d30cbcccae336d622a66c2418a3c92a8196b69782774529ad441abb23c0902"},
{file = "mkdocs_autorefs-1.4.0.tar.gz", hash = "sha256:a9c0aa9c90edbce302c09d050a3c4cb7c76f8b7b2c98f84a7a05f53d00392156"},
]

[package.dependencies]
@ -2684,17 +2733,18 @@ pygments = ">2.12.0"

[[package]]
name = "mkdocs-material"
version = "9.6.5"
version = "9.6.7"
description = "Documentation that simply works"
optional = false
python-versions = ">=3.8"
files = [
{file = "mkdocs_material-9.6.5-py3-none-any.whl", hash = "sha256:aad3e6fb860c20870f75fb2a69ef901f1be727891e41adb60b753efcae19453b"},
{file = "mkdocs_material-9.6.7-py3-none-any.whl", hash = "sha256:8a159e45e80fcaadd9fbeef62cbf928569b93df954d4dc5ba76d46820caf7b47"},
{file = "mkdocs_material-9.6.5.tar.gz", hash = "sha256:b714679a8c91b0ffe2188e11ed58c44d2523e9c2ae26a29cc652fa7478faa21f"},
{file = "mkdocs_material-9.6.7.tar.gz", hash = "sha256:3e2c1fceb9410056c2d91f334a00cdea3215c28750e00c691c1e46b2a33309b4"},
]

[package.dependencies]
babel = ">=2.10,<3.0"
backrefs = ">=5.7.post1,<6.0"
colorama = ">=0.4,<1.0"
jinja2 = ">=3.0,<4.0"
markdown = ">=3.2,<4.0"
@ -2703,7 +2753,6 @@ mkdocs-material-extensions = ">=1.3,<2.0"
paginate = ">=0.5,<1.0"
pygments = ">=2.16,<3.0"
pymdown-extensions = ">=10.2,<11.0"
regex = ">=2022.4"
requests = ">=2.26,<3.0"

[package.extras]
@ -2791,8 +2840,8 @@ files = [

[package.dependencies]
multiprocess = [
{version = ">=0.70.15", optional = true, markers = "python_version >= \"3.11\" and extra == \"dill\""},
{version = "*", optional = true, markers = "python_version < \"3.11\" and extra == \"dill\""},
{version = ">=0.70.15", optional = true, markers = "python_version >= \"3.11\" and extra == \"dill\""},
]
pygments = ">=2.0"
pywin32 = {version = ">=301", markers = "platform_system == \"Windows\""}
@ -3144,35 +3193,35 @@ test = ["pytest (>=7.2)", "pytest-cov (>=4.0)"]

[[package]]
name = "nh3"
version = "0.2.20"
version = "0.2.21"
description = "Python binding to Ammonia HTML sanitizer Rust crate"
optional = false
python-versions = ">=3.8"
files = [
{file = "nh3-0.2.20-cp313-cp313t-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:e1061a4ab6681f6bdf72b110eea0c4e1379d57c9de937db3be4202f7ad6043db"},
{file = "nh3-0.2.21-cp313-cp313t-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:fcff321bd60c6c5c9cb4ddf2554e22772bb41ebd93ad88171bbbb6f271255286"},
{file = "nh3-0.2.20-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb4254b1dac4a1ee49919a5b3f1caf9803ea8dada1816d9e8289e63d3cd0dd9a"},
{file = "nh3-0.2.21-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:31eedcd7d08b0eae28ba47f43fd33a653b4cdb271d64f1aeda47001618348fde"},
{file = "nh3-0.2.20-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0ae9cbd713524cdb81e64663d0d6aae26f678db9f2cd9db0bf162606f1f9f20c"},
{file = "nh3-0.2.21-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d426d7be1a2f3d896950fe263332ed1662f6c78525b4520c8e9861f8d7f0d243"},
{file = "nh3-0.2.20-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:e1f7370b4e14cc03f5ae141ef30a1caf81fa5787711f80be9081418dd9eb79d2"},
{file = "nh3-0.2.21-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:9d67709bc0d7d1f5797b21db26e7a8b3d15d21c9c5f58ccfe48b5328483b685b"},
{file = "nh3-0.2.20-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:ac4d27dc836a476efffc6eb661994426b8b805c951b29c9cf2ff36bc9ad58bc5"},
{file = "nh3-0.2.21-cp313-cp313t-musllinux_1_2_armv7l.whl", hash = "sha256:55823c5ea1f6b267a4fad5de39bc0524d49a47783e1fe094bcf9c537a37df251"},
{file = "nh3-0.2.20-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:4fd2e9248725ebcedac3997a8d3da0d90a12a28c9179c6ba51f1658938ac30d0"},
{file = "nh3-0.2.21-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:818f2b6df3763e058efa9e69677b5a92f9bc0acff3295af5ed013da544250d5b"},
{file = "nh3-0.2.20-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:f7d564871833ddbe54df3aa59053b1110729d3a800cb7628ae8f42adb3d75208"},
{file = "nh3-0.2.21-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:b3b5c58161e08549904ac4abd450dacd94ff648916f7c376ae4b2c0652b98ff9"},
{file = "nh3-0.2.20-cp313-cp313t-win32.whl", hash = "sha256:d2a176fd4306b6f0f178a3f67fac91bd97a3a8d8fafb771c9b9ef675ba5c8886"},
{file = "nh3-0.2.21-cp313-cp313t-win32.whl", hash = "sha256:637d4a10c834e1b7d9548592c7aad760611415fcd5bd346f77fd8a064309ae6d"},
{file = "nh3-0.2.20-cp313-cp313t-win_amd64.whl", hash = "sha256:6ed834c68452a600f517dd3e1534dbfaff1f67f98899fecf139a055a25d99150"},
{file = "nh3-0.2.21-cp313-cp313t-win_amd64.whl", hash = "sha256:713d16686596e556b65e7f8c58328c2df63f1a7abe1277d87625dcbbc012ef82"},
{file = "nh3-0.2.20-cp38-abi3-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:76e2f603b30c02ff6456b233a83fc377dedab6a50947b04e960a6b905637b776"},
{file = "nh3-0.2.21-cp38-abi3-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:a772dec5b7b7325780922dd904709f0f5f3a79fbf756de5291c01370f6df0967"},
{file = "nh3-0.2.20-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:181063c581defe683bd4bb78188ac9936d208aebbc74c7f7c16b6a32ae2ebb38"},
{file = "nh3-0.2.21-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d002b648592bf3033adfd875a48f09b8ecc000abd7f6a8769ed86b6ccc70c759"},
{file = "nh3-0.2.20-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:231addb7643c952cd6d71f1c8702d703f8fe34afcb20becb3efb319a501a12d7"},
{file = "nh3-0.2.21-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:2a5174551f95f2836f2ad6a8074560f261cf9740a48437d6151fd2d4d7d617ab"},
{file = "nh3-0.2.20-cp38-abi3-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:1b9a8340a0aab991c68a5ca938d35ef4a8a3f4bf1b455da8855a40bee1fa0ace"},
{file = "nh3-0.2.21-cp38-abi3-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:b8d55ea1fc7ae3633d758a92aafa3505cd3cc5a6e40470c9164d54dff6f96d42"},
{file = "nh3-0.2.20-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:10317cd96fe4bbd4eb6b95f3920b71c902157ad44fed103fdcde43e3b8ee8be6"},
{file = "nh3-0.2.21-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6ae319f17cd8960d0612f0f0ddff5a90700fa71926ca800e9028e7851ce44a6f"},
{file = "nh3-0.2.20-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8698db4c04b140800d1a1cd3067fda399e36e1e2b8fc1fe04292a907350a3e9b"},
{file = "nh3-0.2.21-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:63ca02ac6f27fc80f9894409eb61de2cb20ef0a23740c7e29f9ec827139fa578"},
{file = "nh3-0.2.20-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3eb04b9c3deb13c3a375ea39fd4a3c00d1f92e8fb2349f25f1e3e4506751774b"},
{file = "nh3-0.2.21-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5f77e62aed5c4acad635239ac1290404c7e940c81abe561fd2af011ff59f585"},
{file = "nh3-0.2.20-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:92f3f1c4f47a2c6f3ca7317b1d5ced05bd29556a75d3a4e2715652ae9d15c05d"},
{file = "nh3-0.2.21-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:087ffadfdcd497658c3adc797258ce0f06be8a537786a7217649fc1c0c60c293"},
{file = "nh3-0.2.20-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ddefa9fd6794a87e37d05827d299d4b53a3ec6f23258101907b96029bfef138a"},
{file = "nh3-0.2.21-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ac7006c3abd097790e611fe4646ecb19a8d7f2184b882f6093293b8d9b887431"},
{file = "nh3-0.2.20-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:ce3731c8f217685d33d9268362e5b4f770914e922bba94d368ab244a59a6c397"},
{file = "nh3-0.2.21-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:6141caabe00bbddc869665b35fc56a478eb774a8c1dfd6fba9fe1dfdf29e6efa"},
{file = "nh3-0.2.20-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:09f037c02fc2c43b211ff1523de32801dcfb0918648d8e651c36ef890f1731ec"},
{file = "nh3-0.2.21-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:20979783526641c81d2f5bfa6ca5ccca3d1e4472474b162c6256745fbfe31cd1"},
{file = "nh3-0.2.20-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:813f1c8012dd64c990514b795508abb90789334f76a561fa0fd4ca32d2275330"},
{file = "nh3-0.2.21-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a7ea28cd49293749d67e4fcf326c554c83ec912cd09cd94aa7ec3ab1921c8283"},
{file = "nh3-0.2.20-cp38-abi3-win32.whl", hash = "sha256:47b2946c0e13057855209daeffb45dc910bd0c55daf10190bb0b4b60e2999784"},
{file = "nh3-0.2.21-cp38-abi3-win32.whl", hash = "sha256:6c9c30b8b0d291a7c5ab0967ab200598ba33208f754f2f4920e9343bdd88f79a"},
{file = "nh3-0.2.20-cp38-abi3-win_amd64.whl", hash = "sha256:da87573f03084edae8eb87cfe811ec338606288f81d333c07d2a9a0b9b976c0b"},
{file = "nh3-0.2.21-cp38-abi3-win_amd64.whl", hash = "sha256:bb0014948f04d7976aabae43fcd4cb7f551f9f8ce785a4c9ef66e6c2590f8629"},
{file = "nh3-0.2.20.tar.gz", hash = "sha256:9705c42d7ff88a0bea546c82d7fe5e59135e3d3f057e485394f491248a1f8ed5"},
{file = "nh3-0.2.21.tar.gz", hash = "sha256:4990e7ee6a55490dbf00d61a6f476c9a3258e31e711e13713b2ea7d6616f670e"},
]

[[package]]
@ -3801,10 +3850,10 @@ files = [

[package.dependencies]
numpy = [
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
{version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
]
@ -3827,10 +3876,10 @@ files = [

[package.dependencies]
numpy = [
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
{version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
]
@ -4016,9 +4065,9 @@ files = [

[package.dependencies]
numpy = [
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
{version = ">=1.22.4", markers = "python_version < \"3.11\""},
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
]
python-dateutil = ">=2.8.2"
pytz = ">=2020.1"
@ -4724,13 +4773,13 @@ typing-extensions = ">=4.6.0,<4.7.0 || >4.7.0"

[[package]]
name = "pydantic-settings"
version = "2.8.0"
version = "2.8.1"
description = "Settings management using Pydantic"
optional = false
python-versions = ">=3.8"
files = [
{file = "pydantic_settings-2.8.0-py3-none-any.whl", hash = "sha256:c782c7dc3fb40e97b238e713c25d26f64314aece2e91abcff592fcac15f71820"},
{file = "pydantic_settings-2.8.1-py3-none-any.whl", hash = "sha256:81942d5ac3d905f7f3ee1a70df5dfb62d5569c12f51a5a647defc1c3d9ee2e9c"},
{file = "pydantic_settings-2.8.0.tar.gz", hash = "sha256:88e2ca28f6e68ea102c99c3c401d6c9078e68a5df600e97b43891c34e089500a"},
{file = "pydantic_settings-2.8.1.tar.gz", hash = "sha256:d5c663dfbe9db9d5e1c646b2e161da12f0d734d422ee56f567d0ea2cee4e8585"},
]

[package.dependencies]
@ -4782,8 +4831,8 @@ files = [
astroid = ">=2.15.8,<=2.17.0-dev0"
colorama = {version = ">=0.4.5", markers = "sys_platform == \"win32\""}
dill = [
{version = ">=0.3.6", markers = "python_version >= \"3.11\""},
{version = ">=0.2", markers = "python_version < \"3.11\""},
{version = ">=0.3.6", markers = "python_version >= \"3.11\""},
]
isort = ">=4.2.5,<6"
mccabe = ">=0.6,<0.8"
@ -5866,26 +5915,26 @@ files = [

[[package]]
name = "safetensors"
version = "0.5.2"
version = "0.5.3"
description = ""
optional = false
python-versions = ">=3.7"
files = [
{file = "safetensors-0.5.2-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:45b6092997ceb8aa3801693781a71a99909ab9cc776fbc3fa9322d29b1d3bef2"},
{file = "safetensors-0.5.3-cp38-abi3-macosx_10_12_x86_64.whl", hash = "sha256:bd20eb133db8ed15b40110b7c00c6df51655a2998132193de2f75f72d99c7073"},
{file = "safetensors-0.5.2-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:6d0d6a8ee2215a440e1296b843edf44fd377b055ba350eaba74655a2fe2c4bae"},
{file = "safetensors-0.5.3-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:21d01c14ff6c415c485616b8b0bf961c46b3b343ca59110d38d744e577f9cce7"},
{file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:86016d40bcaa3bcc9a56cd74d97e654b5f4f4abe42b038c71e4f00a089c4526c"},
{file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:11bce6164887cd491ca75c2326a113ba934be596e22b28b1742ce27b1d076467"},
{file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:990833f70a5f9c7d3fc82c94507f03179930ff7d00941c287f73b6fcbf67f19e"},
{file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4a243be3590bc3301c821da7a18d87224ef35cbd3e5f5727e4e0728b8172411e"},
{file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3dfa7c2f3fe55db34eba90c29df94bcdac4821043fc391cb5d082d9922013869"},
{file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8bd84b12b1670a6f8e50f01e28156422a2bc07fb16fc4e98bded13039d688a0d"},
{file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:46ff2116150ae70a4e9c490d2ab6b6e1b1b93f25e520e540abe1b81b48560c3a"},
{file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:391ac8cab7c829452175f871fcaf414aa1e292b5448bd02620f675a7f3e7abb9"},
{file = "safetensors-0.5.2-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ab696dfdc060caffb61dbe4066b86419107a24c804a4e373ba59be699ebd8d5"},
{file = "safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cead1fa41fc54b1e61089fa57452e8834f798cb1dc7a09ba3524f1eb08e0317a"},
{file = "safetensors-0.5.2-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:03c937100f38c9ff4c1507abea9928a6a9b02c9c1c9c3609ed4fb2bf413d4975"},
{file = "safetensors-0.5.3-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:1077f3e94182d72618357b04b5ced540ceb71c8a813d3319f1aba448e68a770d"},
{file = "safetensors-0.5.2-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:a00e737948791b94dad83cf0eafc09a02c4d8c2171a239e8c8572fe04e25960e"},
{file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:799021e78287bac619c7b3f3606730a22da4cda27759ddf55d37c8db7511c74b"},
{file = "safetensors-0.5.2-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:d3a06fae62418ec8e5c635b61a8086032c9e281f16c63c3af46a6efbab33156f"},
{file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:df26da01aaac504334644e1b7642fa000bfec820e7cef83aeac4e355e03195ff"},
{file = "safetensors-0.5.2-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:1506e4c2eda1431099cebe9abf6c76853e95d0b7a95addceaa74c6019c65d8cf"},
{file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:32c3ef2d7af8b9f52ff685ed0bc43913cdcde135089ae322ee576de93eae5135"},
{file = "safetensors-0.5.2-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5c5b5d9da594f638a259fca766046f44c97244cc7ab8bef161b3e80d04becc76"},
{file = "safetensors-0.5.3-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:37f1521be045e56fc2b54c606d4455573e717b2d887c579ee1dbba5f868ece04"},
{file = "safetensors-0.5.2-cp38-abi3-win32.whl", hash = "sha256:fe55c039d97090d1f85277d402954dd6ad27f63034fa81985a9cc59655ac3ee2"},
{file = "safetensors-0.5.3-cp38-abi3-win32.whl", hash = "sha256:cfc0ec0846dcf6763b0ed3d1846ff36008c6e7290683b61616c4b040f6a54ace"},
{file = "safetensors-0.5.2-cp38-abi3-win_amd64.whl", hash = "sha256:78abdddd03a406646107f973c7843276e7b64e5e32623529dc17f3d94a20f589"},
{file = "safetensors-0.5.3-cp38-abi3-win_amd64.whl", hash = "sha256:836cbbc320b47e80acd40e44c8682db0e8ad7123209f69b093def21ec7cafd11"},
{file = "safetensors-0.5.2.tar.gz", hash = "sha256:cb4a8d98ba12fa016f4241932b1fc5e702e5143f5374bba0bbcf7ddc1c4cf2b8"},
{file = "safetensors-0.5.3.tar.gz", hash = "sha256:b6b0d6ecacec39a4fdd99cc19f4576f5219ce858e6fd8dbe7609df0b8dc56965"},
]

[package.dependencies]
@ -6182,13 +6231,13 @@ train = ["accelerate (>=0.20.3)", "datasets"]

[[package]]
name = "setuptools"
version = "75.8.0"
version = "75.8.2"
description = "Easily download, build, install, upgrade, and uninstall Python packages"
optional = false
python-versions = ">=3.9"
files = [
{file = "setuptools-75.8.0-py3-none-any.whl", hash = "sha256:e3982f444617239225d675215d51f6ba05f845d4eec313da4418fdbb56fb27e3"},
{file = "setuptools-75.8.2-py3-none-any.whl", hash = "sha256:558e47c15f1811c1fa7adbd0096669bf76c1d3f433f58324df69f3f5ecac4e8f"},
{file = "setuptools-75.8.0.tar.gz", hash = "sha256:c5afc8f407c626b8313a86e10311dd3f661c6cd9c09d4bf8c15c0e11f9f2b0e6"},
{file = "setuptools-75.8.2.tar.gz", hash = "sha256:4880473a969e5f23f2a2be3646b2dfd84af9028716d398e46192f84bc36900d2"},
]

[package.extras]
@ -7186,13 +7235,13 @@ files = [

[[package]]
name = "types-requests"
version = "2.32.0.20241016"
version = "2.32.0.20250301"
description = "Typing stubs for requests"
optional = false
python-versions = ">=3.8"
python-versions = ">=3.9"
files = [
{file = "types-requests-2.32.0.20241016.tar.gz", hash = "sha256:0d9cad2f27515d0e3e3da7134a1b6f28fb97129d86b867f24d9c726452634d95"},
{file = "types_requests-2.32.0.20250301-py3-none-any.whl", hash = "sha256:0003e0124e2cbefefb88222ff822b48616af40c74df83350f599a650c8de483b"},
{file = "types_requests-2.32.0.20241016-py3-none-any.whl", hash = "sha256:4195d62d6d3e043a4eaaf08ff8a62184584d2e8684e9d2aa178c7915a7da3747"},
{file = "types_requests-2.32.0.20250301.tar.gz", hash = "sha256:3d909dc4eaab159c0d964ebe8bfa326a7afb4578d8706408d417e17d61b0c500"},
]

[package.dependencies]
@ -7200,13 +7249,13 @@ urllib3 = ">=2"

[[package]]
name = "types-tqdm"
version = "4.67.0.20241221"
version = "4.67.0.20250301"
description = "Typing stubs for tqdm"
optional = false
python-versions = ">=3.8"
python-versions = ">=3.9"
files = [
{file = "types_tqdm-4.67.0.20241221-py3-none-any.whl", hash = "sha256:a1f1c9cda5c2d8482d2c73957a5398bfdedda10f6bc7b3b4e812d5c910486d29"},
{file = "types_tqdm-4.67.0.20250301-py3-none-any.whl", hash = "sha256:8af97deb8e6874af833555dc1fe0fcd456b1a789470bf6cd8813d4e7ee4f6c5b"},
{file = "types_tqdm-4.67.0.20241221.tar.gz", hash = "sha256:e56046631056922385abe89aeb18af5611f471eadd7918a0ad7f34d84cd4c8cc"},
{file = "types_tqdm-4.67.0.20250301.tar.gz", hash = "sha256:5e89a38ad89b867823368eb97d9f90d2fc69806bb055dde62716a05da62b5e0d"},
]

[package.dependencies]
@ -7797,9 +7846,9 @@ type = ["pytest-mypy"]
ocrmac = ["ocrmac"]
rapidocr = ["onnxruntime", "onnxruntime", "rapidocr-onnxruntime"]
tesserocr = ["tesserocr"]
vlm = ["transformers", "transformers"]
vlm = ["accelerate", "transformers", "transformers"]

[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "d2f454a5f88192eeda1fd17a0e69c9fbd3590c46af548c5203a3dfff4e135978"
content-hash = "59424e63947e6c22fceab0a3fe6f1e9ebb72abfe369708a1f55d4daf2593f433"
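The `content-hash` pair above changes because the dependency section of pyproject.toml changed (the new accelerate entry in the next hunks); Poetry recomputes this value on every `poetry lock` and uses it to detect a stale lock file. A small sketch for reading the recorded hash, assuming Python 3.11+ for the stdlib `tomllib`:

```python
# Read the lockfile's content-hash (assumes Python >= 3.11 for tomllib;
# poetry.lock is plain TOML). Poetry compares this value against a hash
# derived from pyproject.toml's dependencies to flag inconsistencies.
import tomllib

with open("poetry.lock", "rb") as f:
    lock = tomllib.load(f)
print(lock["metadata"]["content-hash"])
```
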
|
@ -1,6 +1,6 @@
|
|||||||
[tool.poetry]
|
[tool.poetry]
|
||||||
name = "docling"
|
name = "docling"
|
||||||
version = "2.24.0" # DO NOT EDIT, updated automatically
|
version = "2.25.1" # DO NOT EDIT, updated automatically
|
||||||
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
||||||
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
|
||||||
license = "MIT"
|
license = "MIT"
|
||||||
@ -58,10 +58,14 @@ onnxruntime = [
{ version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
]

transformers = [
{markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true },
{markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true }
]
accelerate = [
{markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^1.2.1", optional = true },
]
pillow = ">=10.0.0,<12.0.0"
tqdm = "^4.65.0"
pluggy = "^1.0.0"
@@ -125,7 +129,7 @@ torchvision = [
 [tool.poetry.extras]
 tesserocr = ["tesserocr"]
 ocrmac = ["ocrmac"]
-vlm = ["transformers"]
+vlm = ["transformers", "accelerate"]
 rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]

 [tool.poetry.scripts]
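The dependency changes above mean the `vlm` extra now pulls in `accelerate` alongside `transformers` (except on Intel macOS, where the markers declare no compatible `accelerate` build). A minimal sketch of the experimental VLM pipeline this extra supports, modelled on the `minimal_vlm_pipeline` example referenced in the CI skip list; the `VlmPipeline` import path and the `pipeline_cls` parameter are assumptions based on docling's converter API and may differ in this exact release:

```python
# Hedged sketch: assumes `pip install "docling[vlm]"` (which, after this
# change, also installs accelerate) and the experimental SmolDocling-based
# VLM pipeline introduced in v2.25.0.
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline  # assumed module path

converter = DocumentConverter(
    format_options={
        # Route PDF inputs through the VLM pipeline instead of the default
        # layout-model pipeline.
        InputFormat.PDF: PdfFormatOption(pipeline_cls=VlmPipeline),
    }
)
result = converter.convert("https://arxiv.org/pdf/2408.09869")
print(result.document.export_to_markdown())
```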
@@ -1,8 +1,8 @@
 item-0 at level 0: unspecified: group _root_
 item-1 at level 1: title: Introduction
-item-2 at level 2: paragraph: This is the first paragraph of the introduction.
+item-2 at level 2: text: This is the first paragraph of the introduction.
 item-3 at level 2: section_header: Background
-item-4 at level 3: paragraph: Some background information here.
+item-4 at level 3: text: Some background information here.
 item-5 at level 3: picture
 item-6 at level 3: list: group list
 item-7 at level 4: list_item: First item in unordered list
@@ -88,7 +88,7 @@
 },
 "children": [],
 "content_layer": "body",
-"label": "paragraph",
+"label": "text",
 "prov": [],
 "orig": "This is the first paragraph of the introduction.",
 "text": "This is the first paragraph of the introduction."
@@ -126,7 +126,7 @@
 },
 "children": [],
 "content_layer": "body",
-"label": "paragraph",
+"label": "text",
 "prov": [],
 "orig": "Some background information here.",
 "text": "Some background information here."
@@ -1,8 +1,8 @@
 item-0 at level 0: unspecified: group _root_
 item-1 at level 1: title: Introduction
-item-2 at level 2: paragraph: This is the first paragraph of the introduction.
+item-2 at level 2: text: This is the first paragraph of the introduction.
 item-3 at level 2: section_header: Background
-item-4 at level 3: paragraph: Some background information here.
+item-4 at level 3: text: Some background information here.
 item-5 at level 3: list: group list
 item-6 at level 4: list_item: First item in unordered list
 item-7 at level 4: list_item: Second item in unordered list
@@ -88,7 +88,7 @@
 },
 "children": [],
 "content_layer": "body",
-"label": "paragraph",
+"label": "text",
 "prov": [],
 "orig": "This is the first paragraph of the introduction.",
 "text": "This is the first paragraph of the introduction."
@@ -123,7 +123,7 @@
 },
 "children": [],
 "content_layer": "body",
-"label": "paragraph",
+"label": "text",
 "prov": [],
 "orig": "Some background information here.",
 "text": "Some background information here."
@@ -1,9 +1,9 @@
 item-0 at level 0: unspecified: group _root_
 item-1 at level 1: title: Example Document
 item-2 at level 2: section_header: Introduction
-item-3 at level 3: paragraph: This is the first paragraph of the introduction.
+item-3 at level 3: text: This is the first paragraph of the introduction.
 item-4 at level 2: section_header: Background
-item-5 at level 3: paragraph: Some background information here.
+item-5 at level 3: text: Some background information here.
 item-6 at level 3: list: group list
 item-7 at level 4: list_item: First item in unordered list
 item-8 at level 5: list: group list
@@ -142,7 +142,7 @@
 },
 "children": [],
 "content_layer": "body",
-"label": "paragraph",
+"label": "text",
 "prov": [],
 "orig": "This is the first paragraph of the introduction.",
 "text": "This is the first paragraph of the introduction."
@@ -177,7 +177,7 @@
 },
 "children": [],
 "content_layer": "body",
-"label": "paragraph",
+"label": "text",
 "prov": [],
 "orig": "Some background information here.",
 "text": "Some background information here."
7 tests/data/groundtruth/docling_v2/example_06.html.itxt Normal file
@@ -0,0 +1,7 @@
+item-0 at level 0: unspecified: group _root_
+item-1 at level 1: text: This is a div with text.
+item-2 at level 1: text: This is another div with text.
+item-3 at level 1: text: This is a regular paragraph.
+item-4 at level 1: text: This is a third div
+with a new line.
+item-5 at level 1: text: This is a fourth div with a bold paragraph.
108 tests/data/groundtruth/docling_v2/example_06.html.json Normal file
@@ -0,0 +1,108 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.1.0",
+  "name": "example_06",
+  "origin": {
+    "mimetype": "text/html",
+    "binary_hash": 14574683870626799530,
+    "filename": "example_06.html"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/texts/1"
+      },
+      {
+        "$ref": "#/texts/2"
+      },
+      {
+        "$ref": "#/texts/3"
+      },
+      {
+        "$ref": "#/texts/4"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "This is a div with text.",
+      "text": "This is a div with text."
+    },
+    {
+      "self_ref": "#/texts/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "This is another div with text.",
+      "text": "This is another div with text."
+    },
+    {
+      "self_ref": "#/texts/2",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "This is a regular paragraph.",
+      "text": "This is a regular paragraph."
+    },
+    {
+      "self_ref": "#/texts/3",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "This is a third div\nwith a new line.",
+      "text": "This is a third div\nwith a new line."
+    },
+    {
+      "self_ref": "#/texts/4",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "This is a fourth div with a bold paragraph.",
+      "text": "This is a fourth div with a bold paragraph."
+    }
+  ],
+  "pictures": [],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {}
+}
10 tests/data/groundtruth/docling_v2/example_06.html.md Normal file
@@ -0,0 +1,10 @@
+This is a div with text.
+
+This is another div with text.
+
+This is a regular paragraph.
+
+This is a third div
+with a new line.
+
+This is a fourth div with a bold paragraph.
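All of the groundtruth updates above follow one pattern: content that the HTML backend previously emitted with the `paragraph` label (plain `<p>` text, and now bare `<div>` text as in the new `example_06.html` fixtures) is emitted with the `text` label instead. A quick way to inspect the labels a conversion produces, as a sketch (assumes docling is installed; `example_06.html` stands in for any HTML input):

```python
# Hedged sketch: print label and text for every item of a converted HTML
# document, to confirm that div/p content surfaces as "text" rather than
# "paragraph" in the DoclingDocument tree.
from docling.document_converter import DocumentConverter

result = DocumentConverter().convert("example_06.html")
for item, level in result.document.iterate_items():
    print(level, item.label, repr(getattr(item, "text", "")))
```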
@@ -1,474 +1,416 @@
 item-0 at level 0: unspecified: group _root_
-item-1 at level 1: list: group list
+item-1 at level 1: title: Duck
-item-2 at level 2: list_item: Main page
+item-2 at level 2: list: group list
-item-3 at level 2: list_item: Contents
+item-3 at level 3: list_item: Acèh
-item-4 at level 2: list_item: Current events
+item-4 at level 3: list_item: Afrikaans
-item-5 at level 2: list_item: Random article
+item-5 at level 3: list_item: Alemannisch
-item-6 at level 2: list_item: About Wikipedia
+item-6 at level 3: list_item: አማርኛ
-item-7 at level 2: list_item: Contact us
+item-7 at level 3: list_item: Ænglisc
-item-8 at level 1: list: group list
+item-8 at level 3: list_item: العربية
-item-9 at level 2: list_item: Help
+item-9 at level 3: list_item: Aragonés
-item-10 at level 2: list_item: Learn to edit
+item-10 at level 3: list_item: ܐܪܡܝܐ
-item-11 at level 2: list_item: Community portal
+item-11 at level 3: list_item: Armãneashti
-item-12 at level 2: list_item: Recent changes
+item-12 at level 3: list_item: Asturianu
-item-13 at level 2: list_item: Upload file
+item-13 at level 3: list_item: Atikamekw
-item-14 at level 1: picture
+item-14 at level 3: list_item: Авар
-item-15 at level 1: picture
+item-15 at level 3: list_item: Aymar aru
-item-16 at level 1: picture
+item-16 at level 3: list_item: تۆرکجه
-item-17 at level 1: list: group list
+item-17 at level 3: list_item: Basa Bali
-item-18 at level 1: list: group list
+item-18 at level 3: list_item: বাংলা
-item-19 at level 2: list_item: Donate
+item-19 at level 3: list_item: 閩南語 / Bân-lâm-gú
-item-20 at level 1: list: group list
+item-20 at level 3: list_item: Беларуская
-item-21 at level 1: list: group list
+item-21 at level 3: list_item: Беларуская (тарашкевіца)
-item-22 at level 2: list_item: Create account
+item-22 at level 3: list_item: Bikol Central
-item-23 at level 2: list_item: Log in
+item-23 at level 3: list_item: Български
-item-24 at level 1: list: group list
+item-24 at level 3: list_item: Brezhoneg
-item-25 at level 2: list_item: Create account
+item-25 at level 3: list_item: Буряад
-item-26 at level 2: list_item: Log in
+item-26 at level 3: list_item: Català
-item-27 at level 1: list: group list
+item-27 at level 3: list_item: Чӑвашла
-item-28 at level 2: list_item: Contributions
+item-28 at level 3: list_item: Čeština
-item-29 at level 2: list_item: Talk
+item-29 at level 3: list_item: ChiShona
-item-30 at level 1: section: group header-1
+item-30 at level 3: list_item: Cymraeg
-item-31 at level 2: section_header: Contents
+item-31 at level 3: list_item: Dagbanli
-item-32 at level 3: list: group list
+item-32 at level 3: list_item: Dansk
-item-33 at level 4: list_item: (Top)
+item-33 at level 3: list_item: Deitsch
-item-34 at level 4: list_item: 1 Etymology
+item-34 at level 3: list_item: Deutsch
-item-35 at level 5: list: group list
+item-35 at level 3: list_item: डोटेली
-item-36 at level 4: list_item: 2 Taxonomy
+item-36 at level 3: list_item: Ελληνικά
-item-37 at level 5: list: group list
+item-37 at level 3: list_item: Emiliàn e rumagnòl
-item-38 at level 4: list_item: 3 Morphology
+item-38 at level 3: list_item: Español
-item-39 at level 5: list: group list
+item-39 at level 3: list_item: Esperanto
-item-40 at level 4: list_item: 4 Distribution and habitat
+item-40 at level 3: list_item: Euskara
-item-41 at level 5: list: group list
+item-41 at level 3: list_item: فارسی
-item-42 at level 4: list_item: 5 Behaviour Toggle Behaviour subsection
+item-42 at level 3: list_item: Français
-item-43 at level 5: list: group list
+item-43 at level 3: list_item: Gaeilge
-item-44 at level 6: list_item: 5.1 Feeding
+item-44 at level 3: list_item: Galego
-item-45 at level 7: list: group list
+item-45 at level 3: list_item: ГӀалгӀай
-item-46 at level 6: list_item: 5.2 Breeding
+item-46 at level 3: list_item: 贛語
-item-47 at level 7: list: group list
+item-47 at level 3: list_item: گیلکی
-item-48 at level 6: list_item: 5.3 Communication
+item-48 at level 3: list_item: 𐌲𐌿𐍄𐌹𐍃𐌺
-item-49 at level 7: list: group list
+item-49 at level 3: list_item: गोंयची कोंकणी / Gõychi Konknni
-item-50 at level 6: list_item: 5.4 Predators
+item-50 at level 3: list_item: 客家語 / Hak-kâ-ngî
-item-51 at level 7: list: group list
+item-51 at level 3: list_item: 한국어
-item-52 at level 4: list_item: 6 Relationship with humans Toggle Relationship with humans subsection
+item-52 at level 3: list_item: Hausa
-item-53 at level 5: list: group list
+item-53 at level 3: list_item: Հայերեն
-item-54 at level 6: list_item: 6.1 Hunting
+item-54 at level 3: list_item: हिन्दी
-item-55 at level 7: list: group list
+item-55 at level 3: list_item: Hrvatski
-item-56 at level 6: list_item: 6.2 Domestication
+item-56 at level 3: list_item: Ido
-item-57 at level 7: list: group list
+item-57 at level 3: list_item: Bahasa Indonesia
-item-58 at level 6: list_item: 6.3 Heraldry
+item-58 at level 3: list_item: Iñupiatun
-item-59 at level 7: list: group list
+item-59 at level 3: list_item: Íslenska
-item-60 at level 6: list_item: 6.4 Cultural references
+item-60 at level 3: list_item: Italiano
-item-61 at level 7: list: group list
+item-61 at level 3: list_item: עברית
-item-62 at level 4: list_item: 7 See also
+item-62 at level 3: list_item: Jawa
-item-63 at level 5: list: group list
+item-63 at level 3: list_item: ಕನ್ನಡ
-item-64 at level 4: list_item: 8 Notes Toggle Notes subsection
+item-64 at level 3: list_item: Kapampangan
-item-65 at level 5: list: group list
+item-65 at level 3: list_item: ქართული
-item-66 at level 6: list_item: 8.1 Citations
+item-66 at level 3: list_item: कॉशुर / کٲشُر
-item-67 at level 7: list: group list
+item-67 at level 3: list_item: Қазақша
-item-68 at level 6: list_item: 8.2 Sources
+item-68 at level 3: list_item: Ikirundi
-item-69 at level 7: list: group list
+item-69 at level 3: list_item: Kongo
-item-70 at level 4: list_item: 9 External links
+item-70 at level 3: list_item: Kreyòl ayisyen
-item-71 at level 5: list: group list
+item-71 at level 3: list_item: Кырык мары
-item-72 at level 1: title: Duck
+item-72 at level 3: list_item: ລາວ
-item-73 at level 2: list: group list
+item-73 at level 3: list_item: Latina
-item-74 at level 3: list_item: Acèh
+item-74 at level 3: list_item: Latviešu
-item-75 at level 3: list_item: Afrikaans
+item-75 at level 3: list_item: Lietuvių
-item-76 at level 3: list_item: Alemannisch
+item-76 at level 3: list_item: Li Niha
-item-77 at level 3: list_item: አማርኛ
+item-77 at level 3: list_item: Ligure
-item-78 at level 3: list_item: Ænglisc
+item-78 at level 3: list_item: Limburgs
-item-79 at level 3: list_item: العربية
+item-79 at level 3: list_item: Lingála
-item-80 at level 3: list_item: Aragonés
+item-80 at level 3: list_item: Malagasy
-item-81 at level 3: list_item: ܐܪܡܝܐ
+item-81 at level 3: list_item: മലയാളം
-item-82 at level 3: list_item: Armãneashti
+item-82 at level 3: list_item: मराठी
-item-83 at level 3: list_item: Asturianu
+item-83 at level 3: list_item: مازِرونی
-item-84 at level 3: list_item: Atikamekw
+item-84 at level 3: list_item: Bahasa Melayu
-item-85 at level 3: list_item: Авар
+item-85 at level 3: list_item: ꯃꯤꯇꯩ ꯂꯣꯟ
-item-86 at level 3: list_item: Aymar aru
+item-86 at level 3: list_item: 閩東語 / Mìng-dĕ̤ng-ngṳ̄
-item-87 at level 3: list_item: تۆرکجه
+item-87 at level 3: list_item: Мокшень
-item-88 at level 3: list_item: Basa Bali
+item-88 at level 3: list_item: Монгол
-item-89 at level 3: list_item: বাংলা
+item-89 at level 3: list_item: မြန်မာဘာသာ
-item-90 at level 3: list_item: 閩南語 / Bân-lâm-gú
+item-90 at level 3: list_item: Nederlands
-item-91 at level 3: list_item: Беларуская
+item-91 at level 3: list_item: Nedersaksies
-item-92 at level 3: list_item: Беларуская (тарашкевіца)
+item-92 at level 3: list_item: नेपाली
-item-93 at level 3: list_item: Bikol Central
+item-93 at level 3: list_item: नेपाल भाषा
-item-94 at level 3: list_item: Български
+item-94 at level 3: list_item: 日本語
-item-95 at level 3: list_item: Brezhoneg
+item-95 at level 3: list_item: Нохчийн
-item-96 at level 3: list_item: Буряад
+item-96 at level 3: list_item: Norsk nynorsk
-item-97 at level 3: list_item: Català
+item-97 at level 3: list_item: Occitan
-item-98 at level 3: list_item: Чӑвашла
+item-98 at level 3: list_item: Oromoo
-item-99 at level 3: list_item: Čeština
+item-99 at level 3: list_item: ਪੰਜਾਬੀ
-item-100 at level 3: list_item: ChiShona
+item-100 at level 3: list_item: Picard
-item-101 at level 3: list_item: Cymraeg
+item-101 at level 3: list_item: Plattdüütsch
-item-102 at level 3: list_item: Dagbanli
+item-102 at level 3: list_item: Polski
-item-103 at level 3: list_item: Dansk
+item-103 at level 3: list_item: Português
-item-104 at level 3: list_item: Deitsch
+item-104 at level 3: list_item: Qırımtatarca
-item-105 at level 3: list_item: Deutsch
+item-105 at level 3: list_item: Română
-item-106 at level 3: list_item: डोटेली
+item-106 at level 3: list_item: Русский
-item-107 at level 3: list_item: Ελληνικά
+item-107 at level 3: list_item: Саха тыла
-item-108 at level 3: list_item: Emiliàn e rumagnòl
+item-108 at level 3: list_item: ᱥᱟᱱᱛᱟᱲᱤ
-item-109 at level 3: list_item: Español
+item-109 at level 3: list_item: Sardu
-item-110 at level 3: list_item: Esperanto
+item-110 at level 3: list_item: Scots
-item-111 at level 3: list_item: Euskara
+item-111 at level 3: list_item: Seeltersk
-item-112 at level 3: list_item: فارسی
+item-112 at level 3: list_item: Shqip
-item-113 at level 3: list_item: Français
+item-113 at level 3: list_item: Sicilianu
-item-114 at level 3: list_item: Gaeilge
+item-114 at level 3: list_item: සිංහල
-item-115 at level 3: list_item: Galego
+item-115 at level 3: list_item: Simple English
-item-116 at level 3: list_item: ГӀалгӀай
+item-116 at level 3: list_item: سنڌي
-item-117 at level 3: list_item: 贛語
+item-117 at level 3: list_item: کوردی
-item-118 at level 3: list_item: گیلکی
+item-118 at level 3: list_item: Српски / srpski
-item-119 at level 3: list_item: 𐌲𐌿𐍄𐌹𐍃𐌺
+item-119 at level 3: list_item: Srpskohrvatski / српскохрватски
-item-120 at level 3: list_item: गोंयची कोंकणी / Gõychi Konknni
+item-120 at level 3: list_item: Sunda
-item-121 at level 3: list_item: 客家語 / Hak-kâ-ngî
+item-121 at level 3: list_item: Svenska
-item-122 at level 3: list_item: 한국어
+item-122 at level 3: list_item: Tagalog
-item-123 at level 3: list_item: Hausa
+item-123 at level 3: list_item: தமிழ்
-item-124 at level 3: list_item: Հայերեն
+item-124 at level 3: list_item: Taqbaylit
-item-125 at level 3: list_item: हिन्दी
+item-125 at level 3: list_item: Татарча / tatarça
-item-126 at level 3: list_item: Hrvatski
+item-126 at level 3: list_item: ไทย
-item-127 at level 3: list_item: Ido
+item-127 at level 3: list_item: Türkçe
-item-128 at level 3: list_item: Bahasa Indonesia
+item-128 at level 3: list_item: Українська
-item-129 at level 3: list_item: Iñupiatun
+item-129 at level 3: list_item: ئۇيغۇرچە / Uyghurche
-item-130 at level 3: list_item: Íslenska
+item-130 at level 3: list_item: Vahcuengh
-item-131 at level 3: list_item: Italiano
+item-131 at level 3: list_item: Tiếng Việt
-item-132 at level 3: list_item: עברית
+item-132 at level 3: list_item: Walon
-item-133 at level 3: list_item: Jawa
+item-133 at level 3: list_item: 文言
-item-134 at level 3: list_item: ಕನ್ನಡ
+item-134 at level 3: list_item: Winaray
-item-135 at level 3: list_item: Kapampangan
+item-135 at level 3: list_item: 吴语
-item-136 at level 3: list_item: ქართული
+item-136 at level 3: list_item: 粵語
-item-137 at level 3: list_item: कॉशुर / کٲشُر
+item-137 at level 3: list_item: Žemaitėška
-item-138 at level 3: list_item: Қазақша
+item-138 at level 3: list_item: 中文
-item-139 at level 3: list_item: Ikirundi
+item-139 at level 2: list: group list
-item-140 at level 3: list_item: Kongo
+item-140 at level 3: list_item: Article
-item-141 at level 3: list_item: Kreyòl ayisyen
+item-141 at level 3: list_item: Talk
-item-142 at level 3: list_item: Кырык мары
+item-142 at level 2: list: group list
-item-143 at level 3: list_item: ລາວ
+item-143 at level 2: list: group list
-item-144 at level 3: list_item: Latina
+item-144 at level 3: list_item: Read
-item-145 at level 3: list_item: Latviešu
+item-145 at level 3: list_item: View source
-item-146 at level 3: list_item: Lietuvių
+item-146 at level 3: list_item: View history
-item-147 at level 3: list_item: Li Niha
+item-147 at level 2: text: Tools
-item-148 at level 3: list_item: Ligure
+item-148 at level 2: text: Actions
-item-149 at level 3: list_item: Limburgs
+item-149 at level 2: list: group list
-item-150 at level 3: list_item: Lingála
+item-150 at level 3: list_item: Read
-item-151 at level 3: list_item: Malagasy
+item-151 at level 3: list_item: View source
-item-152 at level 3: list_item: മലയാളം
+item-152 at level 3: list_item: View history
-item-153 at level 3: list_item: मराठी
+item-153 at level 2: text: General
-item-154 at level 3: list_item: مازِرونی
+item-154 at level 2: list: group list
-item-155 at level 3: list_item: Bahasa Melayu
+item-155 at level 3: list_item: What links here
-item-156 at level 3: list_item: ꯃꯤꯇꯩ ꯂꯣꯟ
+item-156 at level 3: list_item: Related changes
-item-157 at level 3: list_item: 閩東語 / Mìng-dĕ̤ng-ngṳ̄
+item-157 at level 3: list_item: Upload file
-item-158 at level 3: list_item: Мокшень
+item-158 at level 3: list_item: Special pages
-item-159 at level 3: list_item: Монгол
+item-159 at level 3: list_item: Permanent link
-item-160 at level 3: list_item: မြန်မာဘာသာ
+item-160 at level 3: list_item: Page information
-item-161 at level 3: list_item: Nederlands
+item-161 at level 3: list_item: Cite this page
-item-162 at level 3: list_item: Nedersaksies
+item-162 at level 3: list_item: Get shortened URL
-item-163 at level 3: list_item: नेपाली
+item-163 at level 3: list_item: Download QR code
-item-164 at level 3: list_item: नेपाल भाषा
+item-164 at level 3: list_item: Wikidata item
-item-165 at level 3: list_item: 日本語
+item-165 at level 2: text: Print/export
-item-166 at level 3: list_item: Нохчийн
+item-166 at level 2: list: group list
-item-167 at level 3: list_item: Norsk nynorsk
+item-167 at level 3: list_item: Download as PDF
-item-168 at level 3: list_item: Occitan
+item-168 at level 3: list_item: Printable version
-item-169 at level 3: list_item: Oromoo
+item-169 at level 2: text: In other projects
-item-170 at level 3: list_item: ਪੰਜਾਬੀ
+item-170 at level 2: list: group list
-item-171 at level 3: list_item: Picard
+item-171 at level 3: list_item: Wikimedia Commons
-item-172 at level 3: list_item: Plattdüütsch
+item-172 at level 3: list_item: Wikiquote
-item-173 at level 3: list_item: Polski
+item-173 at level 2: text: Appearance
-item-174 at level 3: list_item: Português
+item-174 at level 2: picture
-item-175 at level 3: list_item: Qırımtatarca
+item-175 at level 2: text: From Wikipedia, the free encyclopedia
-item-176 at level 3: list_item: Română
+item-176 at level 2: text: Common name for many species of bird
-item-177 at level 3: list_item: Русский
+item-177 at level 2: text: This article is about the bird. ... as a food, see . For other uses, see .
-item-178 at level 3: list_item: Саха тыла
+item-178 at level 2: text: "Duckling" redirects here. For other uses, see .
-item-179 at level 3: list_item: ᱥᱟᱱᱛᱟᱲᱤ
+item-179 at level 2: table with [13x2]
-item-180 at level 3: list_item: Sardu
+item-180 at level 2: text: Duck is the common name for nume ... und in both fresh water and sea water.
-item-181 at level 3: list_item: Scots
+item-181 at level 2: text: Ducks are sometimes confused wit ... divers, grebes, gallinules and coots.
-item-182 at level 3: list_item: Seeltersk
+item-182 at level 2: section_header: Etymology
-item-183 at level 3: list_item: Shqip
+item-183 at level 3: text: The word duck comes from Old Eng ... h duiken and German tauchen 'to dive'.
-item-184 at level 3: list_item: Sicilianu
+item-184 at level 3: picture
-item-185 at level 3: list_item: සිංහල
+item-184 at level 4: caption: Pacific black duck displaying the characteristic upending "duck"
-item-186 at level 3: list_item: Simple English
+item-185 at level 3: text: This word replaced Old English e ... nskrit ātí 'water bird', among others.
-item-187 at level 3: list_item: سنڌي
+item-186 at level 3: text: A duckling is a young duck in do ... , is sometimes labelled as a duckling.
-item-188 at level 3: list_item: کوردی
+item-187 at level 3: text: A male is called a drake and the ... a duck, or in ornithology a hen.[3][4]
-item-189 at level 3: list_item: Српски / srpski
+item-188 at level 3: picture
-item-190 at level 3: list_item: Srpskohrvatski / српскохрватски
+item-188 at level 4: caption: Male mallard.
-item-191 at level 3: list_item: Sunda
+item-189 at level 3: picture
-item-192 at level 3: list_item: Svenska
+item-189 at level 4: caption: Wood ducks.
-item-193 at level 3: list_item: Tagalog
+item-190 at level 2: section_header: Taxonomy
-item-194 at level 3: list_item: தமிழ்
+item-191 at level 3: text: All ducks belong to the biologic ... ationships between various species.[9]
-item-195 at level 3: list_item: Taqbaylit
+item-192 at level 3: picture
-item-196 at level 3: list_item: Татарча / tatarça
+item-192 at level 4: caption: Mallard landing in approach
-item-197 at level 3: list_item: ไทย
+item-193 at level 3: text: In most modern classifications, ... all size and stiff, upright tails.[14]
-item-198 at level 3: list_item: Türkçe
+item-194 at level 3: text: A number of other species called ... shelducks in the tribe Tadornini.[15]
-item-199 at level 3: list_item: Українська
+item-195 at level 2: section_header: Morphology
-item-200 at level 3: list_item: ئۇيغۇرچە / Uyghurche
+item-196 at level 3: picture
-item-201 at level 3: list_item: Vahcuengh
+item-196 at level 4: caption: Male Mandarin duck
-item-202 at level 3: list_item: Tiếng Việt
+item-197 at level 3: text: The overall body plan of ducks i ... is moult typically precedes migration.
-item-203 at level 3: list_item: Walon
+item-198 at level 3: text: The drakes of northern species o ... rkscrew shaped vagina to prevent rape.
-item-204 at level 3: list_item: 文言
+item-199 at level 2: section_header: Distribution and habitat
-item-205 at level 3: list_item: Winaray
+item-200 at level 3: picture
-item-206 at level 3: list_item: 吴语
+item-200 at level 4: caption: Flying steamer ducks in Ushuaia, Argentina
-item-207 at level 3: list_item: 粵語
+item-201 at level 3: text: Ducks have a cosmopolitan distri ... endemic to such far-flung islands.[21]
-item-208 at level 3: list_item: Žemaitėška
+item-202 at level 3: picture
-item-209 at level 3: list_item: 中文
+item-202 at level 4: caption: Female mallard in Cornwall, England
-item-210 at level 2: list: group list
+item-203 at level 3: text: Some duck species, mainly those ... t form after localised heavy rain.[23]
-item-211 at level 3: list_item: Article
+item-204 at level 2: section_header: Behaviour
-item-212 at level 3: list_item: Talk
+item-205 at level 3: section_header: Feeding
-item-213 at level 2: list: group list
+item-206 at level 4: picture
-item-214 at level 2: list: group list
+item-206 at level 5: caption: Pecten along the bill
-item-215 at level 3: list_item: Read
+item-207 at level 4: picture
-item-216 at level 3: list_item: View source
+item-207 at level 5: caption: Mallard duckling preening
-item-217 at level 3: list_item: View history
+item-208 at level 4: text: Ducks eat food sources such as g ... amphibians, worms, and small molluscs.
-item-218 at level 2: list: group list
+item-209 at level 4: text: Dabbling ducks feed on the surfa ... thers and to hold slippery food items.
-item-219 at level 3: list_item: Read
+item-210 at level 4: text: Diving ducks and sea ducks forag ... ave more difficulty taking off to fly.
-item-220 at level 3: list_item: View source
+item-211 at level 4: text: A few specialized species such a ... apted to catch and swallow large fish.
-item-221 at level 3: list_item: View history
+item-212 at level 4: text: The others have the characterist ... e nostrils come out through hard horn.
-item-222 at level 2: list: group list
+item-213 at level 4: text: The Guardian published an articl ... the ducks and pollutes waterways.[25]
-item-223 at level 3: list_item: What links here
+item-214 at level 3: section_header: Breeding
-item-224 at level 3: list_item: Related changes
+item-215 at level 4: picture
-item-225 at level 3: list_item: Upload file
+item-215 at level 5: caption: A Muscovy duckling
-item-226 at level 3: list_item: Special pages
+item-216 at level 4: text: Ducks generally only have one pa ... st and led her ducklings to water.[28]
-item-227 at level 3: list_item: Permanent link
+item-217 at level 3: section_header: Communication
-item-228 at level 3: list_item: Page information
+item-218 at level 4: text: Female mallard ducks (as well as ... laying calls or quieter contact calls.
-item-229 at level 3: list_item: Cite this page
+item-219 at level 4: text: A common urban legend claims tha ... annel television show MythBusters.[32]
-item-230 at level 3: list_item: Get shortened URL
+item-220 at level 3: section_header: Predators
-item-231 at level 3: list_item: Download QR code
+item-221 at level 4: picture
-item-232 at level 3: list_item: Wikidata item
+item-221 at level 5: caption: Ringed teal
-item-233 at level 2: list: group list
+item-222 at level 4: text: Ducks have many predators. Duckl ... or large birds, such as hawks or owls.
-item-234 at level 3: list_item: Download as PDF
+item-223 at level 4: text: Adult ducks are fast fliers, but ... its speed and strength to catch ducks.
-item-235 at level 3: list_item: Printable version
+item-224 at level 2: section_header: Relationship with humans
-item-236 at level 2: list: group list
+item-225 at level 3: section_header: Hunting
-item-237 at level 3: list_item: Wikimedia Commons
+item-226 at level 4: text: Humans have hunted ducks since p ... evidence of this is uncommon.[35][42]
-item-238 at level 3: list_item: Wikiquote
+item-227 at level 4: text: In many areas, wild ducks (inclu ... inated by pollutants such as PCBs.[44]
-item-239 at level 2: picture
+item-228 at level 3: section_header: Domestication
-item-240 at level 2: table with [13x2]
+item-229 at level 4: picture
-item-241 at level 2: paragraph: Duck is the common name for nume ... und in both fresh water and sea water.
+item-229 at level 5: caption: Indian Runner ducks, a common breed of domestic ducks
-item-242 at level 2: paragraph: Ducks are sometimes confused wit ... divers, grebes, gallinules and coots.
+item-230 at level 4: text: Ducks have many economic uses, b ... it weighs less than 1 kg (2.2 lb).[48]
-item-243 at level 2: section_header: Etymology
+item-231 at level 3: section_header: Heraldry
-item-244 at level 3: paragraph: The word duck comes from Old Eng ... h duiken and German tauchen 'to dive'.
+item-232 at level 4: picture
-item-245 at level 3: picture
+item-232 at level 5: caption: Three black-colored ducks in the coat of arms of Maaninka[49]
-item-245 at level 4: caption: Pacific black duck displaying the characteristic upending "duck"
+item-233 at level 4: text: Ducks appear on several coats of ... the coat of arms of Föglö (Åland).[51]
-item-246 at level 3: paragraph: This word replaced Old English e ... nskrit ātí 'water bird', among others.
+item-234 at level 3: section_header: Cultural references
-item-247 at level 3: paragraph: A duckling is a young duck in do ... , is sometimes labelled as a duckling.
+item-235 at level 4: text: In 2002, psychologist Richard Wi ... 54] and was made into a movie in 1986.
-item-248 at level 3: paragraph: A male is called a drake and the ... a duck, or in ornithology a hen.[3][4]
+item-236 at level 4: text: The 1992 Disney film The Mighty ... Ducks minor league baseball team.[55]
-item-249 at level 3: picture
+item-237 at level 2: section_header: See also
-item-249 at level 4: caption: Male mallard.
+item-238 at level 3: list: group list
-item-250 at level 3: picture
+item-239 at level 4: list_item: Birds portal
-item-250 at level 4: caption: Wood ducks.
+item-240 at level 3: list: group list
-item-251 at level 2: section_header: Taxonomy
+item-241 at level 4: list_item: Domestic duck
-item-252 at level 3: paragraph: All ducks belong to the biologic ... ationships between various species.[9]
+item-242 at level 4: list_item: Duck as food
-item-253 at level 3: picture
+item-243 at level 4: list_item: Duck test
-item-253 at level 4: caption: Mallard landing in approach
+item-244 at level 4: list_item: Duck breeds
-item-254 at level 3: paragraph: In most modern classifications, ... all size and stiff, upright tails.[14]
+item-245 at level 4: list_item: Fictional ducks
-item-255 at level 3: paragraph: A number of other species called ... shelducks in the tribe Tadornini.[15]
+item-246 at level 4: list_item: Rubber duck
-item-256 at level 2: section_header: Morphology
+item-247 at level 2: section_header: Notes
-item-257 at level 3: picture
+item-248 at level 3: section_header: Citations
-item-257 at level 4: caption: Male Mandarin duck
+item-249 at level 4: ordered_list: group ordered list
-item-258 at level 3: paragraph: The overall body plan of ducks i ... is moult typically precedes migration.
+item-250 at level 5: list_item: ^ "Duckling". The American Herit ... n Company. 2006. Retrieved 2015-05-22.
-item-259 at level 3: paragraph: The drakes of northern species o ... rkscrew shaped vagina to prevent rape.
+item-251 at level 5: list_item: ^ "Duckling". Kernerman English ... Ltd. 2000–2006. Retrieved 2015-05-22.
-item-260 at level 2: section_header: Distribution and habitat
+item-252 at level 5: list_item: ^ Dohner, Janet Vorwald (2001). ... University Press. ISBN 978-0300138139.
-item-261 at level 3: picture
+item-253 at level 5: list_item: ^ Visca, Curt; Visca, Kelley (20 ... Publishing Group. ISBN 9780823961566.
-item-261 at level 4: caption: Flying steamer ducks in Ushuaia, Argentina
+item-254 at level 5: list_item: ^ a b c d Carboneras 1992, p. 536.
-item-262 at level 3: paragraph: Ducks have a cosmopolitan distri ... endemic to such far-flung islands.[21]
+item-255 at level 5: list_item: ^ Livezey 1986, pp. 737–738.
-item-263 at level 3: picture
+item-256 at level 5: list_item: ^ Madsen, McHugh & de Kloet 1988, p. 452.
-item-263 at level 4: caption: Female mallard in Cornwall, England
+item-257 at level 5: list_item: ^ Donne-Goussé, Laudet & Hänni 2002, pp. 353–354.
-item-264 at level 3: paragraph: Some duck species, mainly those ... t form after localised heavy rain.[23]
+item-258 at level 5: list_item: ^ a b c d e f Carboneras 1992, p. 540.
-item-265 at level 2: section_header: Behaviour
+item-259 at level 5: list_item: ^ Elphick, Dunning & Sibley 2001, p. 191.
-item-266 at level 3: section_header: Feeding
+item-260 at level 5: list_item: ^ Kear 2005, p. 448.
-item-267 at level 4: picture
+item-261 at level 5: list_item: ^ Kear 2005, p. 622–623.
-item-267 at level 5: caption: Pecten along the bill
+item-262 at level 5: list_item: ^ Kear 2005, p. 686.
-item-268 at level 4: picture
+item-263 at level 5: list_item: ^ Elphick, Dunning & Sibley 2001, p. 193.
-item-268 at level 5: caption: Mallard duckling preening
+item-264 at level 5: list_item: ^ a b c d e f g Carboneras 1992, p. 537.
-item-269 at level 4: paragraph: Ducks eat food sources such as g ... amphibians, worms, and small molluscs.
+item-265 at level 5: list_item: ^ American Ornithologists' Union 1998, p. xix.
-item-270 at level 4: paragraph: Dabbling ducks feed on the surfa ... thers and to hold slippery food items.
+item-266 at level 5: list_item: ^ American Ornithologists' Union 1998.
-item-271 at level 4: paragraph: Diving ducks and sea ducks forag ... ave more difficulty taking off to fly.
+item-267 at level 5: list_item: ^ Carboneras 1992, p. 538.
-item-272 at level 4: paragraph: A few specialized species such a ... apted to catch and swallow large fish.
+item-268 at level 5: list_item: ^ Christidis & Boles 2008, p. 62.
-item-273 at level 4: paragraph: The others have the characterist ... e nostrils come out through hard horn.
+item-269 at level 5: list_item: ^ Shirihai 2008, pp. 239, 245.
-item-274 at level 4: paragraph: The Guardian published an articl ... the ducks and pollutes waterways.[25]
+item-270 at level 5: list_item: ^ a b Pratt, Bruner & Berrett 1987, pp. 98–107.
-item-275 at level 3: section_header: Breeding
+item-271 at level 5: list_item: ^ Fitter, Fitter & Hosking 2000, pp. 52–3.
-item-276 at level 4: picture
+item-272 at level 5: list_item: ^ "Pacific Black Duck". www.wiresnr.org. Retrieved 2018-04-27.
-item-276 at level 5: caption: A Muscovy duckling
+item-273 at level 5: list_item: ^ Ogden, Evans. "Dabbling Ducks". CWE. Retrieved 2006-11-02.
-item-277 at level 4: paragraph: Ducks generally only have one pa ... st and led her ducklings to water.[28]
+item-274 at level 5: list_item: ^ Karl Mathiesen (16 March 2015) ... Guardian. Retrieved 13 November 2016.
-item-278 at level 3: section_header: Communication
+item-275 at level 5: list_item: ^ Rohwer, Frank C.; Anderson, Mi ... 4615-6787-5_4. ISBN 978-1-4615-6789-9.
-item-279 at level 4: paragraph: Female mallard ducks (as well as ... laying calls or quieter contact calls.
+item-276 at level 5: list_item: ^ Smith, Cyndi M.; Cooke, Fred; ... 093/condor/102.1.201. hdl:10315/13797.
-item-280 at level 4: paragraph: A common urban legend claims tha ... annel television show MythBusters.[32]
+item-277 at level 5: list_item: ^ "If You Find An Orphaned Duckl ... l on 2018-09-23. Retrieved 2018-12-22.
-item-281 at level 3: section_header: Predators
+item-278 at level 5: list_item: ^ Carver, Heather (2011). The Du ... 9780557901562.[self-published source]
-item-282 at level 4: picture
+item-279 at level 5: list_item: ^ Titlow, Budd (2013-09-03). Bir ... man & Littlefield. ISBN 9780762797707.
-item-282 at level 5: caption: Ringed teal
+item-280 at level 5: list_item: ^ Amos, Jonathan (2003-09-08). " ... kers". BBC News. Retrieved 2006-11-02.
-item-283 at level 4: paragraph: Ducks have many predators. Duckl ... or large birds, such as hawks or owls.
+item-281 at level 5: list_item: ^ "Mythbusters Episode 8". 12 December 2003.
-item-284 at level 4: paragraph: Adult ducks are fast fliers, but ... its speed and strength to catch ducks.
+item-282 at level 5: list_item: ^ Erlandson 1994, p. 171.
-item-285 at level 2: section_header: Relationship with humans
+item-283 at level 5: list_item: ^ Jeffries 2008, pp. 168, 243.
-item-286 at level 3: section_header: Hunting
+item-284 at level 5: list_item: ^ a b Sued-Badillo 2003, p. 65.
-item-287 at level 4: paragraph: Humans have hunted ducks since p ... evidence of this is uncommon.[35][42]
+item-285 at level 5: list_item: ^ Thorpe 1996, p. 68.
-item-288 at level 4: paragraph: In many areas, wild ducks (inclu ... inated by pollutants such as PCBs.[44]
+item-286 at level 5: list_item: ^ Maisels 1999, p. 42.
-item-289 at level 3: section_header: Domestication
+item-287 at level 5: list_item: ^ Rau 1876, p. 133.
-item-290 at level 4: picture
+item-288 at level 5: list_item: ^ Higman 2012, p. 23.
-item-290 at level 5: caption: Indian Runner ducks, a common breed of domestic ducks
+item-289 at level 5: list_item: ^ Hume 2012, p. 53.
-item-291 at level 4: paragraph: Ducks have many economic uses, b ... it weighs less than 1 kg (2.2 lb).[48]
+item-290 at level 5: list_item: ^ Hume 2012, p. 52.
-item-292 at level 3: section_header: Heraldry
+item-291 at level 5: list_item: ^ Fieldhouse 2002, p. 167.
-item-293 at level 4: picture
+item-292 at level 5: list_item: ^ Livingston, A. D. (1998-01-01) ... Editions, Limited. ISBN 9781853263774.
-item-293 at level 5: caption: Three black-colored ducks in the coat of arms of Maaninka[49]
+item-293 at level 5: list_item: ^ "Study plan for waterfowl inju ... on 2022-10-09. Retrieved 2 July 2019.
-item-294 at level 4: paragraph: Ducks appear on several coats of ... the coat of arms of Föglö (Åland).[51]
+item-294 at level 5: list_item: ^ "FAOSTAT". www.fao.org. Retrieved 2019-10-25.
-item-295 at level 3: section_header: Cultural references
+item-295 at level 5: list_item: ^ "Anas platyrhynchos, Domestic ... . Digimorph.org. Retrieved 2012-12-23.
-item-296 at level 4: paragraph: In 2002, psychologist Richard Wi ... 54] and was made into a movie in 1986.
+item-296 at level 5: list_item: ^ Sy Montgomery. "Mallard; Encyc ... Britannica.com. Retrieved 2012-12-23.
-item-297 at level 4: paragraph: The 1992 Disney film The Mighty ... Ducks minor league baseball team.[55]
+item-297 at level 5: list_item: ^ Glenday, Craig (2014). Guinnes ... ited. pp. 135. ISBN 978-1-908843-15-9.
-item-298 at level 2: section_header: See also
+item-298 at level 5: list_item: ^ Suomen kunnallisvaakunat (in F ... tto. 1982. p. 147. ISBN 951-773-085-3.
-item-299 at level 3: list: group list
+item-299 at level 5: list_item: ^ "Lubānas simbolika" (in Latvian). Retrieved September 9, 2021.
-item-300 at level 4: list_item: Birds portal
+item-300 at level 5: list_item: ^ "Föglö" (in Swedish). Retrieved September 9, 2021.
-item-301 at level 3: list: group list
+item-301 at level 5: list_item: ^ Young, Emma. "World's funniest ... w Scientist. Retrieved 7 January 2019.
-item-302 at level 4: list_item: Domestic duck
+item-302 at level 5: list_item: ^ "Howard the Duck (character)". Grand Comics Database.
-item-303 at level 4: list_item: Duck as food
+item-303 at level 5: list_item: ^ Sanderson, Peter; Gilbert, Lau ... luding this bad-tempered talking duck.
-item-304 at level 4: list_item: Duck test
+item-304 at level 5: list_item: ^ "The Duck". University of Oregon Athletics. Retrieved 2022-01-20.
-item-305 at level 4: list_item: Duck breeds
+item-305 at level 3: section_header: Sources
-item-306 at level 4: list_item: Fictional ducks
+item-306 at level 4: list: group list
-item-307 at level 4: list_item: Rubber duck
+item-307 at level 5: list_item: American Ornithologists' Union ( ... (PDF) from the original on 2022-10-09.
-item-308 at level 2: section_header: Notes
+item-308 at level 5: list_item: Carboneras, Carlos (1992). del H ... Lynx Edicions. ISBN 978-84-87334-10-8.
-item-309 at level 3: section_header: Citations
+item-309 at level 5: list_item: Christidis, Les; Boles, Walter E ... ro Publishing. ISBN 978-0-643-06511-6.
-item-310 at level 4: ordered_list: group ordered list
+item-310 at level 5: list_item: Donne-Goussé, Carole; Laudet, Vi ... /S1055-7903(02)00019-2. PMID 12099792.
-item-311 at level 5: list_item: ^ "Duckling". The American Herit ... n Company. 2006. Retrieved 2015-05-22.
+item-311 at level 5: list_item: Elphick, Chris; Dunning, John B. ... istopher Helm. ISBN 978-0-7136-6250-4.
-item-312 at level 5: list_item: ^ "Duckling". Kernerman English ... Ltd. 2000–2006. Retrieved 2015-05-22.
+item-312 at level 5: list_item: Erlandson, Jon M. (1994). Early ... usiness Media. ISBN 978-1-4419-3231-0.
-item-313 at level 5: list_item: ^ Dohner, Janet Vorwald (2001). ... University Press. ISBN 978-0300138139.
+item-313 at level 5: list_item: Fieldhouse, Paul (2002). Food, F ... ara: ABC-CLIO. ISBN 978-1-61069-412-4.
-item-314 at level 5: list_item: ^ Visca, Curt; Visca, Kelley (20 ... Publishing Group. ISBN 9780823961566.
+item-314 at level 5: list_item: Fitter, Julian; Fitter, Daniel; ... versity Press. ISBN 978-0-691-10295-5.
-item-315 at level 5: list_item: ^ a b c d Carboneras 1992, p. 536.
+item-315 at level 5: list_item: Higman, B. W. (2012). How Food M ... Wiley & Sons. ISBN 978-1-4051-8947-7.
-item-316 at level 5: list_item: ^ Livezey 1986, pp. 737–738.
+item-316 at level 5: list_item: Hume, Julian H. (2012). Extinct ... istopher Helm. ISBN 978-1-4729-3744-5.
-item-317 at level 5: list_item: ^ Madsen, McHugh & de Kloet 1988, p. 452.
+item-317 at level 5: list_item: Jeffries, Richard (2008). Holoce ... Alabama Press. ISBN 978-0-8173-1658-7.
-item-318 at level 5: list_item: ^ Donne-Goussé, Laudet & Hänni 2002, pp. 353–354.
+item-318 at level 5: list_item: Kear, Janet, ed. (2005). Ducks, ... versity Press. ISBN 978-0-19-861009-0.
-item-319 at level 5: list_item: ^ a b c d e f Carboneras 1992, p. 540.
+item-319 at level 5: list_item: Livezey, Bradley C. (October 198 ... (PDF) from the original on 2022-10-09.
-item-320 at level 5: list_item: ^ Elphick, Dunning & Sibley 2001, p. 191.
+item-320 at level 5: list_item: Madsen, Cort S.; McHugh, Kevin P ... (PDF) from the original on 2022-10-09.
-item-321 at level 5: list_item: ^ Kear 2005, p. 448.
+item-321 at level 5: list_item: Maisels, Charles Keith (1999). E ... on: Routledge. ISBN 978-0-415-10975-8.
-item-322 at level 5: list_item: ^ Kear 2005, p. 622–623.
+item-322 at level 5: list_item: Pratt, H. Douglas; Bruner, Phill ... University Press. ISBN 0-691-02399-9.
-item-323 at level 5: list_item: ^ Kear 2005, p. 686.
+item-323 at level 5: list_item: Rau, Charles (1876). Early Man i ... ork: Harper & Brothers. LCCN 05040168.
-item-324 at level 5: list_item: ^ Elphick, Dunning & Sibley 2001, p. 193.
+item-324 at level 5: list_item: Shirihai, Hadoram (2008). A Comp ... versity Press. ISBN 978-0-691-13666-0.
-item-325 at level 5: list_item: ^ a b c d e f g Carboneras 1992, p. 537.
+item-325 at level 5: list_item: Sued-Badillo, Jalil (2003). Auto ... Paris: UNESCO. ISBN 978-92-3-103832-7.
-item-326 at level 5: list_item: ^ American Ornithologists' Union 1998, p. xix.
+item-326 at level 5: list_item: Thorpe, I. J. (1996). The Origin ... rk: Routledge. ISBN 978-0-415-08009-5.
-item-327 at level 5: list_item: ^ American Ornithologists' Union 1998.
+item-327 at level 2: section_header: External links
-item-328 at level 5: list_item: ^ Carboneras 1992, p. 538.
+item-328 at level 3: list: group list
-item-329 at level 5: list_item: ^ Christidis & Boles 2008, p. 62.
+item-329 at level 4: list_item: Definitions from Wiktionary
-item-330 at level 5: list_item: ^ Shirihai 2008, pp. 239, 245.
+item-330 at level 4: list_item: Media from Commons
-item-331 at level 5: list_item: ^ a b Pratt, Bruner & Berrett 1987, pp. 98–107.
+item-331 at level 4: list_item: Quotations from Wikiquote
-item-332 at level 5: list_item: ^ Fitter, Fitter & Hosking 2000, pp. 52–3.
+item-332 at level 4: list_item: Recipes from Wikibooks
-item-333 at level 5: list_item: ^ "Pacific Black Duck". www.wiresnr.org. Retrieved 2018-04-27.
+item-333 at level 4: list_item: Taxa from Wikispecies
-item-334 at level 5: list_item: ^ Ogden, Evans. "Dabbling Ducks". CWE. Retrieved 2006-11-02.
+item-334 at level 4: list_item: Data from Wikidata
-item-335 at level 5: list_item: ^ Karl Mathiesen (16 March 2015) ... Guardian. Retrieved 13 November 2016.
+item-335 at level 3: list: group list
-item-336 at level 5: list_item: ^ Rohwer, Frank C.; Anderson, Mi ... 4615-6787-5_4. ISBN 978-1-4615-6789-9.
+item-336 at level 4: list_item: list of books (useful looking abstracts)
-item-337 at level 5: list_item: ^ Smith, Cyndi M.; Cooke, Fred; ... 093/condor/102.1.201. hdl:10315/13797.
+item-337 at level 4: list_item: Ducks on postage stamps Archived 2013-05-13 at the Wayback Machine
-item-338 at level 5: list_item: ^ "If You Find An Orphaned Duckl ... l on 2018-09-23. Retrieved 2018-12-22.
+item-338 at level 4: list_item: Ducks at a Distance, by Rob Hine ... uide to identification of US waterfowl
-item-339 at level 5: list_item: ^ Carver, Heather (2011). The Du ... 9780557901562.[self-published source]
+item-339 at level 3: table with [3x2]
-item-340 at level 5: list_item: ^ Titlow, Budd (2013-09-03). Bir ... man & Littlefield. ISBN 9780762797707.
+item-340 at level 3: picture
-item-341 at level 5: list_item: ^ Amos, Jonathan (2003-09-08). " ... kers". BBC News. Retrieved 2006-11-02.
+item-341 at level 3: text: Retrieved from ""
-item-342 at level 5: list_item: ^ "Mythbusters Episode 8". 12 December 2003.
+item-342 at level 3: text: :
-item-343 at level 5: list_item: ^ Erlandson 1994, p. 171.
+item-343 at level 3: list: group list
-item-344 at level 5: list_item: ^ Jeffries 2008, pp. 168, 243.
+item-344 at level 4: list_item: Ducks
-item-345 at level 5: list_item: ^ a b Sued-Badillo 2003, p. 65.
+item-345 at level 4: list_item: Game birds
-item-346 at level 5: list_item: ^ Thorpe 1996, p. 68.
+item-346 at level 4: list_item: Bird common names
-item-347 at level 5: list_item: ^ Maisels 1999, p. 42.
+item-347 at level 3: text: Hidden categories:
-item-348 at level 5: list_item: ^ Rau 1876, p. 133.
+item-348 at level 3: list: group list
-item-349 at level 5: list_item: ^ Higman 2012, p. 23.
+item-349 at level 4: list_item: All accuracy disputes
-item-350 at level 5: list_item: ^ Hume 2012, p. 53.
+item-350 at level 4: list_item: Accuracy disputes from February 2020
-item-351 at level 5: list_item: ^ Hume 2012, p. 52.
+item-351 at level 4: list_item: CS1 Finnish-language sources (fi)
-item-352 at level 5: list_item: ^ Fieldhouse 2002, p. 167.
+item-352 at level 4: list_item: CS1 Latvian-language sources (lv)
-item-353 at level 5: list_item: ^ Livingston, A. D. (1998-01-01) ... Editions, Limited. ISBN 9781853263774.
+item-353 at level 4: list_item: CS1 Swedish-language sources (sv)
-item-354 at level 5: list_item: ^ "Study plan for waterfowl inju ... on 2022-10-09. Retrieved 2 July 2019.
+item-354 at level 4: list_item: Articles with short description
-item-355 at level 5: list_item: ^ "FAOSTAT". www.fao.org. Retrieved 2019-10-25.
+item-355 at level 4: list_item: Short description is different from Wikidata
-item-356 at level 5: list_item: ^ "Anas platyrhynchos, Domestic ... . Digimorph.org. Retrieved 2012-12-23.
+item-356 at level 4: list_item: Wikipedia indefinitely move-protected pages
-item-357 at level 5: list_item: ^ Sy Montgomery. "Mallard; Encyc ... Britannica.com. Retrieved 2012-12-23.
+item-357 at level 4: list_item: Wikipedia indefinitely semi-protected pages
-item-358 at level 5: list_item: ^ Glenday, Craig (2014). Guinnes ... ited. pp. 135. ISBN 978-1-908843-15-9.
+item-358 at level 4: list_item: Articles with 'species' microformats
-item-359 at level 5: list_item: ^ Suomen kunnallisvaakunat (in F ... tto. 1982. p. 147. ISBN 951-773-085-3.
+item-359 at level 4: list_item: Articles containing Old English (ca. 450-1100)-language text
-item-360 at level 5: list_item: ^ "Lubānas simbolika" (in Latvian). Retrieved September 9, 2021.
+item-360 at level 4: list_item: Articles containing Dutch-language text
-item-361 at level 5: list_item: ^ "Föglö" (in Swedish). Retrieved September 9, 2021.
+item-361 at level 4: list_item: Articles containing German-language text
-item-362 at level 5: list_item: ^ Young, Emma. "World's funniest ... w Scientist. Retrieved 7 January 2019.
+item-362 at level 4: list_item: Articles containing Norwegian-language text
-item-363 at level 5: list_item: ^ "Howard the Duck (character)". Grand Comics Database.
+item-363 at level 4: list_item: Articles containing Lithuanian-language text
-item-364 at level 5: list_item: ^ Sanderson, Peter; Gilbert, Lau ... luding this bad-tempered talking duck.
+item-364 at level 4: list_item: Articles containing Ancient Greek (to 1453)-language text
-item-365 at level 5: list_item: ^ "The Duck". University of Oregon Athletics. Retrieved 2022-01-20.
+item-365 at level 4: list_item: All articles with self-published sources
-item-366 at level 3: section_header: Sources
+item-366 at level 4: list_item: Articles with self-published sources from February 2020
-item-367 at level 4: list: group list
+item-367 at level 4: list_item: All articles with unsourced statements
-item-368 at level 5: list_item: American Ornithologists' Union ( ... (PDF) from the original on 2022-10-09.
+item-368 at level 4: list_item: Articles with unsourced statements from January 2022
-item-369 at level 5: list_item: Carboneras, Carlos (1992). del H ... Lynx Edicions. ISBN 978-84-87334-10-8.
+item-369 at level 4: list_item: CS1: long volume value
-item-370 at level 5: list_item: Christidis, Les; Boles, Walter E ... ro Publishing. ISBN 978-0-643-06511-6.
+item-370 at level 4: list_item: Pages using Sister project links with wikidata mismatch
-item-371 at level 5: list_item: Donne-Goussé, Carole; Laudet, Vi ... /S1055-7903(02)00019-2. PMID 12099792.
+item-371 at level 4: list_item: Pages using Sister project links with hidden wikidata
-item-372 at level 5: list_item: Elphick, Chris; Dunning, John B. ... istopher Helm. ISBN 978-0-7136-6250-4.
+item-372 at level 4: list_item: Webarchive template wayback links
-item-373 at level 5: list_item: Erlandson, Jon M. (1994). Early ... usiness Media. ISBN 978-1-4419-3231-0.
+item-373 at level 4: list_item: Articles with Project Gutenberg links
-item-374 at level 5: list_item: Fieldhouse, Paul (2002). Food, F ... ara: ABC-CLIO. ISBN 978-1-61069-412-4.
+item-374 at level 4: list_item: Articles containing video clips
-item-375 at level 5: list_item: Fitter, Julian; Fitter, Daniel; ... versity Press. ISBN 978-0-691-10295-5.
+item-375 at level 3: list: group list
-item-376 at level 5: list_item: Higman, B. W. (2012). How Food M ... Wiley & Sons. ISBN 978-1-4051-8947-7.
+item-376 at level 4: list_item: This page was last edited on 21 September 2024, at 12:11 (UTC).
-item-377 at level 5: list_item: Hume, Julian H. (2012). Extinct ... istopher Helm. ISBN 978-1-4729-3744-5.
+item-377 at level 4: list_item: Text is available under the Crea ... tion, Inc., a non-profit organization.
-item-378 at level 5: list_item: Jeffries, Richard (2008). Holoce ... Alabama Press. ISBN 978-0-8173-1658-7.
+item-378 at level 3: list: group list
-item-379 at level 5: list_item: Kear, Janet, ed. (2005). Ducks, ... versity Press. ISBN 978-0-19-861009-0.
+item-379 at level 4: list_item: Privacy policy
-item-380 at level 5: list_item: Livezey, Bradley C. (October 198 ... (PDF) from the original on 2022-10-09.
+item-380 at level 4: list_item: About Wikipedia
-item-381 at level 5: list_item: Madsen, Cort S.; McHugh, Kevin P ... (PDF) from the original on 2022-10-09.
+item-381 at level 4: list_item: Disclaimers
-item-382 at level 5: list_item: Maisels, Charles Keith (1999). E ... on: Routledge. ISBN 978-0-415-10975-8.
+item-382 at level 4: list_item: Contact Wikipedia
-item-383 at level 5: list_item: Pratt, H. Douglas; Bruner, Phill ... University Press. ISBN 0-691-02399-9.
|
item-383 at level 4: list_item: Code of Conduct
|
||||||
item-384 at level 5: list_item: Rau, Charles (1876). Early Man i ... ork: Harper & Brothers. LCCN 05040168.
|
item-384 at level 4: list_item: Developers
|
||||||
item-385 at level 5: list_item: Shirihai, Hadoram (2008). A Comp ... versity Press. ISBN 978-0-691-13666-0.
|
item-385 at level 4: list_item: Statistics
|
||||||
item-386 at level 5: list_item: Sued-Badillo, Jalil (2003). Auto ... Paris: UNESCO. ISBN 978-92-3-103832-7.
|
item-386 at level 4: list_item: Cookie statement
|
||||||
item-387 at level 5: list_item: Thorpe, I. J. (1996). The Origin ... rk: Routledge. ISBN 978-0-415-08009-5.
|
item-387 at level 4: list_item: Mobile view
|
||||||
item-388 at level 2: section_header: External links
|
item-388 at level 3: list: group list
|
||||||
item-389 at level 3: list: group list
|
item-389 at level 3: list: group list
|
||||||
item-390 at level 4: list_item: Definitions from Wiktionary
|
item-390 at level 1: caption: Pacific black duck displaying the characteristic upending "duck"
|
||||||
item-391 at level 4: list_item: Media from Commons
|
item-391 at level 1: caption: Male mallard.
|
||||||
item-392 at level 4: list_item: Quotations from Wikiquote
|
item-392 at level 1: caption: Wood ducks.
|
||||||
item-393 at level 4: list_item: Recipes from Wikibooks
|
item-393 at level 1: caption: Mallard landing in approach
|
||||||
item-394 at level 4: list_item: Taxa from Wikispecies
|
item-394 at level 1: caption: Male Mandarin duck
|
||||||
item-395 at level 4: list_item: Data from Wikidata
|
item-395 at level 1: caption: Flying steamer ducks in Ushuaia, Argentina
|
||||||
item-396 at level 3: list: group list
|
item-396 at level 1: caption: Female mallard in Cornwall, England
|
||||||
item-397 at level 4: list_item: list of books (useful looking abstracts)
|
item-397 at level 1: caption: Pecten along the bill
|
||||||
item-398 at level 4: list_item: Ducks on postage stamps Archived 2013-05-13 at the Wayback Machine
|
item-398 at level 1: caption: Mallard duckling preening
|
||||||
item-399 at level 4: list_item: Ducks at a Distance, by Rob Hine ... uide to identification of US waterfowl
|
item-399 at level 1: caption: A Muscovy duckling
|
||||||
item-400 at level 3: table with [3x2]
|
item-400 at level 1: caption: Ringed teal
|
||||||
item-401 at level 3: picture
|
item-401 at level 1: caption: Indian Runner ducks, a common breed of domestic ducks
|
||||||
item-402 at level 3: list: group list
|
item-402 at level 1: caption: Three black-colored ducks in the coat of arms of Maaninka[49]
|
||||||
item-403 at level 4: list_item: Ducks
|
|
||||||
item-404 at level 4: list_item: Game birds
|
|
||||||
item-405 at level 4: list_item: Bird common names
|
|
||||||
item-406 at level 3: list: group list
|
|
||||||
item-407 at level 4: list_item: All accuracy disputes
|
|
||||||
item-408 at level 4: list_item: Accuracy disputes from February 2020
|
|
||||||
item-409 at level 4: list_item: CS1 Finnish-language sources (fi)
|
|
||||||
item-410 at level 4: list_item: CS1 Latvian-language sources (lv)
|
|
||||||
item-411 at level 4: list_item: CS1 Swedish-language sources (sv)
|
|
||||||
item-412 at level 4: list_item: Articles with short description
|
|
||||||
item-413 at level 4: list_item: Short description is different from Wikidata
|
|
||||||
item-414 at level 4: list_item: Wikipedia indefinitely move-protected pages
|
|
||||||
item-415 at level 4: list_item: Wikipedia indefinitely semi-protected pages
|
|
||||||
item-416 at level 4: list_item: Articles with 'species' microformats
|
|
||||||
item-417 at level 4: list_item: Articles containing Old English (ca. 450-1100)-language text
|
|
||||||
item-418 at level 4: list_item: Articles containing Dutch-language text
|
|
||||||
item-419 at level 4: list_item: Articles containing German-language text
|
|
||||||
item-420 at level 4: list_item: Articles containing Norwegian-language text
|
|
||||||
item-421 at level 4: list_item: Articles containing Lithuanian-language text
|
|
||||||
item-422 at level 4: list_item: Articles containing Ancient Greek (to 1453)-language text
|
|
||||||
item-423 at level 4: list_item: All articles with self-published sources
|
|
||||||
item-424 at level 4: list_item: Articles with self-published sources from February 2020
|
|
||||||
item-425 at level 4: list_item: All articles with unsourced statements
|
|
||||||
item-426 at level 4: list_item: Articles with unsourced statements from January 2022
|
|
||||||
item-427 at level 4: list_item: CS1: long volume value
|
|
||||||
item-428 at level 4: list_item: Pages using Sister project links with wikidata mismatch
|
|
||||||
item-429 at level 4: list_item: Pages using Sister project links with hidden wikidata
|
|
||||||
item-430 at level 4: list_item: Webarchive template wayback links
|
|
||||||
item-431 at level 4: list_item: Articles with Project Gutenberg links
|
|
||||||
item-432 at level 4: list_item: Articles containing video clips
|
|
||||||
item-433 at level 3: list: group list
|
|
||||||
item-434 at level 4: list_item: This page was last edited on 21 September 2024, at 12:11 (UTC).
|
|
||||||
item-435 at level 4: list_item: Text is available under the Crea ... tion, Inc., a non-profit organization.
|
|
||||||
item-436 at level 3: list: group list
|
|
||||||
item-437 at level 4: list_item: Privacy policy
|
|
||||||
item-438 at level 4: list_item: About Wikipedia
|
|
||||||
item-439 at level 4: list_item: Disclaimers
|
|
||||||
item-440 at level 4: list_item: Contact Wikipedia
|
|
||||||
item-441 at level 4: list_item: Code of Conduct
|
|
||||||
item-442 at level 4: list_item: Developers
|
|
||||||
item-443 at level 4: list_item: Statistics
|
|
||||||
item-444 at level 4: list_item: Cookie statement
|
|
||||||
item-445 at level 4: list_item: Mobile view
|
|
||||||
item-446 at level 3: list: group list
|
|
||||||
item-447 at level 3: list: group list
|
|
||||||
item-448 at level 1: caption: Pacific black duck displaying the characteristic upending "duck"
|
|
||||||
item-449 at level 1: caption: Male mallard.
|
|
||||||
item-450 at level 1: caption: Wood ducks.
|
|
||||||
item-451 at level 1: caption: Mallard landing in approach
|
|
||||||
item-452 at level 1: caption: Male Mandarin duck
|
|
||||||
item-453 at level 1: caption: Flying steamer ducks in Ushuaia, Argentina
|
|
||||||
item-454 at level 1: caption: Female mallard in Cornwall, England
|
|
||||||
item-455 at level 1: caption: Pecten along the bill
|
|
||||||
item-456 at level 1: caption: Mallard duckling preening
|
|
||||||
item-457 at level 1: caption: A Muscovy duckling
|
|
||||||
item-458 at level 1: caption: Ringed teal
|
|
||||||
item-459 at level 1: caption: Indian Runner ducks, a common breed of domestic ducks
|
|
||||||
item-460 at level 1: caption: Three black-colored ducks in the coat of arms of Maaninka[49]
|
|
File diff suppressed because it is too large
@@ -1,53 +1,3 @@
-- Main page
-- Contents
-- Current events
-- Random article
-- About Wikipedia
-- Contact us
-
-- Help
-- Learn to edit
-- Community portal
-- Recent changes
-- Upload file
-
-<!-- image -->
-
-<!-- image -->
-
-<!-- image -->
-
-- Donate
-- Create account
-- Log in
-- Create account
-- Log in
-- Contributions
-- Talk
-
-## Contents
-
-- (Top)
-- 1 Etymology
-- 2 Taxonomy
-- 3 Morphology
-- 4 Distribution and habitat
-- 5 Behaviour Toggle Behaviour subsection
-- 5.1 Feeding
-- 5.2 Breeding
-- 5.3 Communication
-- 5.4 Predators
-- 6 Relationship with humans Toggle Relationship with humans subsection
-- 6.1 Hunting
-- 6.2 Domestication
-- 6.3 Heraldry
-- 6.4 Cultural references
-- 7 See also
-- 8 Notes Toggle Notes subsection
-- 8.1 Citations
-- 8.2 Sources
-- 9 External links
-
 # Duck
 
 - Acèh
@@ -193,9 +143,17 @@
 - Read
 - View source
 - View history
+
+Tools
+
+Actions
+
 - Read
 - View source
 - View history
+
+General
+
 - What links here
 - Related changes
 - Upload file
@@ -206,13 +164,29 @@
 - Get shortened URL
 - Download QR code
 - Wikidata item
 
+Print/export
+
 - Download as PDF
 - Printable version
 
+In other projects
+
 - Wikimedia Commons
 - Wikiquote
+
+Appearance
+
 <!-- image -->
+
+From Wikipedia, the free encyclopedia
+
+Common name for many species of bird
+
+This article is about the bird. For duck as a food, see . For other uses, see .
+
+"Duckling" redirects here. For other uses, see .
+
 | Duck | Duck |
 |--------------------------------|--------------------------------|
 | | |
@@ -482,10 +456,16 @@ The 1992 Disney film The Mighty Ducks, starring Emilio Estevez, chose the duck a
 
 <!-- image -->
 
+Retrieved from ""
+
+:
+
 - Ducks
 - Game birds
 - Bird common names
 
+Hidden categories:
+
 - All accuracy disputes
 - Accuracy disputes from February 2020
 - CS1 Finnish-language sources (fi)
12
tests/data/html/example_06.html
Normal file
@@ -0,0 +1,12 @@
+<html>
+<head>
+<title>Sample HTML File</title>
+</head>
+<body>
+<div>This is a div with text.</div>
+<div>This is another div with text.</div>
+<p>This is a regular paragraph.</p>
+<div>This is a third div<br/>with a new line.</div>
+<div><p>This is a fourth div with a <b>bold</b> paragraph.</p></div>
+</body>
+</html>
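The fixture above feeds the HTML backend a mix of bare `<div>` blocks, a `<p>`, a `<br/>` line break, and a nested `<div><p>` with inline bold. To inspect how docling serializes it, a minimal sketch using the high-level `DocumentConverter`; the relative path assumes the repository root as working directory:

```python
from docling.document_converter import DocumentConverter

# Convert the new fixture and print its markdown serialization.
converter = DocumentConverter()
result = converter.convert("tests/data/html/example_06.html")
print(result.document.export_to_markdown())
```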
@@ -1,4 +1,4 @@
-import os
+from io import BytesIO
 from pathlib import Path
 
 from docling.backend.html_backend import HTMLDocumentBackend
@@ -41,6 +41,62 @@ def test_heading_levels():
     assert found_lvl_2 and found_lvl_3
 
 
+def test_ordered_lists():
+    test_set: list[tuple[bytes, str]] = []
+
+    test_set.append(
+        (
+            b"<html><body><ol><li>1st item</li><li>2nd item</li></ol></body></html>",
+            "1. 1st item\n2. 2nd item",
+        )
+    )
+    test_set.append(
+        (
+            b'<html><body><ol start="1"><li>1st item</li><li>2nd item</li></ol></body></html>',
+            "1. 1st item\n2. 2nd item",
+        )
+    )
+    test_set.append(
+        (
+            b'<html><body><ol start="2"><li>1st item</li><li>2nd item</li></ol></body></html>',
+            "2. 1st item\n3. 2nd item",
+        )
+    )
+    test_set.append(
+        (
+            b'<html><body><ol start="0"><li>1st item</li><li>2nd item</li></ol></body></html>',
+            "0. 1st item\n1. 2nd item",
+        )
+    )
+    test_set.append(
+        (
+            b'<html><body><ol start="-5"><li>1st item</li><li>2nd item</li></ol></body></html>',
+            "1. 1st item\n2. 2nd item",
+        )
+    )
+    test_set.append(
+        (
+            b'<html><body><ol start="foo"><li>1st item</li><li>2nd item</li></ol></body></html>',
+            "1. 1st item\n2. 2nd item",
+        )
+    )
+
+    for idx, pair in enumerate(test_set):
+        in_doc = InputDocument(
+            path_or_stream=BytesIO(pair[0]),
+            format=InputFormat.HTML,
+            backend=HTMLDocumentBackend,
+            filename="test",
+        )
+        backend = HTMLDocumentBackend(
+            in_doc=in_doc,
+            path_or_stream=BytesIO(pair[0]),
+        )
+        doc: DoclingDocument = backend.convert()
+        assert doc
+        assert doc.export_to_markdown() == pair[1], f"Error in case {idx}"
+
+
 def get_html_paths():
 
     # Define the directory you want to search
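For readers who want to try the new `start`-attribute handling outside pytest, the test above boils down to a standalone snippet. A minimal sketch, assuming the `InputDocument` and `InputFormat` import paths below match your docling version (the HTML snippet and filename are illustrative):

```python
from io import BytesIO

from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument

# Ordered list with start="2": the backend should offset the numbering.
html = b'<html><body><ol start="2"><li>1st item</li><li>2nd item</li></ol></body></html>'

in_doc = InputDocument(
    path_or_stream=BytesIO(html),
    format=InputFormat.HTML,
    backend=HTMLDocumentBackend,
    filename="snippet.html",
)
backend = HTMLDocumentBackend(in_doc=in_doc, path_or_stream=BytesIO(html))
doc = backend.convert()

# Per the test expectations above: "2. 1st item\n3. 2nd item"
print(doc.export_to_markdown())
```

Note how the test cases pin down the edge behavior: non-numeric (`start="foo"`) and negative (`start="-5"`) values fall back to numbering from 1, while `start="0"` is honored as-is.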