diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml
index 89bcfd79..b2a295dc 100644
--- a/.github/workflows/checks.yml
+++ b/.github/workflows/checks.yml
@@ -1,6 +1,10 @@
 on:
   workflow_call:
 
+env:
+  HF_HUB_DOWNLOAD_TIMEOUT: "60"
+  HF_HUB_ETAG_TIMEOUT: "60"
+
 jobs:
   run-checks:
     runs-on: ubuntu-latest
@@ -14,6 +18,11 @@ jobs:
       - name: Set TESSDATA_PREFIX
         run: |
           echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"
+      - name: Cache Hugging Face models
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/huggingface
+          key: huggingface-cache-py${{ matrix.python-version }}
       - uses: ./.github/actions/setup-poetry
         with:
           python-version: ${{ matrix.python-version }}
@@ -28,7 +37,7 @@ jobs:
         run: |
           for file in docs/examples/*.py; do
             # Skip batch_convert.py
-            if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api).py ]]; then
+            if [[ "$(basename "$file")" =~ ^(batch_convert|minimal_vlm_pipeline|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api).py ]]; then
              echo "Skipping $file"
              continue
            fi
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ab946a87..917b3be0 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,30 @@
+## [v2.25.1](https://github.com/DS4SD/docling/releases/tag/v2.25.1) - 2025-03-03
+
+### Fix
+
+* Enable locks for threadsafe pdfium ([#1052](https://github.com/DS4SD/docling/issues/1052)) ([`8dc0562`](https://github.com/DS4SD/docling/commit/8dc0562542299cf972d14eeeb4393e50b589c8ad))
+* **html:** Use 'start' attribute when parsing ordered lists from HTML docs ([#1062](https://github.com/DS4SD/docling/issues/1062)) ([`de7b963`](https://github.com/DS4SD/docling/commit/de7b963b09a34916f0a8d99649269aeb37db1408))
+
+### Documentation
+
+* Improve docs on token limit warning triggered by HybridChunker ([#1077](https://github.com/DS4SD/docling/issues/1077)) ([`db3ceef`](https://github.com/DS4SD/docling/commit/db3ceefd4ae6251a97e333bcb03051698b3fa71a))
+
+## [v2.25.0](https://github.com/DS4SD/docling/releases/tag/v2.25.0) - 2025-02-26
+
+### Feature
+
+* [Experimental] Introduce VLM pipeline using HF AutoModelForVision2Seq, featuring SmolDocling model ([#1054](https://github.com/DS4SD/docling/issues/1054)) ([`3c9fe76`](https://github.com/DS4SD/docling/commit/3c9fe76b706b7714b25d49cb09050c42e3b8c849))
+* **cli:** Add option for downloading all models, refine help messages ([#1061](https://github.com/DS4SD/docling/issues/1061)) ([`ab683e4`](https://github.com/DS4SD/docling/commit/ab683e4fb6df4973d2efda04f00c269a2dc95f5b))
+
+### Fix
+
+* Vlm using artifacts path ([#1057](https://github.com/DS4SD/docling/issues/1057)) ([`e197225`](https://github.com/DS4SD/docling/commit/e1972257399151503d60b4806976c8b9b6911aa8))
+* **html:** Parse text in div elements as TextItem ([#1041](https://github.com/DS4SD/docling/issues/1041)) ([`1b0ead6`](https://github.com/DS4SD/docling/commit/1b0ead69078030a0e4d25b51450ef2aa4a2e79fc))
+
+### Documentation
+
+* Extend chunking docs, add FAQ on token limit ([#1053](https://github.com/DS4SD/docling/issues/1053)) ([`c84b973`](https://github.com/DS4SD/docling/commit/c84b973959a254db22ac9a7dc8810628e4808a2d))
+
 ## [v2.24.0](https://github.com/DS4SD/docling/releases/tag/v2.24.0) - 2025-02-20
 
 ### Feature
diff --git a/README.md b/README.md
index 5a957d60..842253e9 100644
--- a/README.md
+++ b/README.md
@@ -123,6 +123,6 @@ For individual model usage, please refer to the model licenses found in the orig
 
 Docling has been brought to you by IBM.
 
-[supported_formats]: https://ds4sd.github.io/docling/supported_formats/
+[supported_formats]: https://ds4sd.github.io/docling/usage/supported_formats/
 [docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/
 [integrations]: https://ds4sd.github.io/docling/integrations/
diff --git a/docling/backend/docling_parse_v2_backend.py b/docling/backend/docling_parse_v2_backend.py
index 27a368f9..9178883f 100644
--- a/docling/backend/docling_parse_v2_backend.py
+++ b/docling/backend/docling_parse_v2_backend.py
@@ -12,6 +12,7 @@ from pypdfium2 import PdfPage
 
 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
 from docling.datamodel.base_models import Cell, Size
+from docling.utils.locks import pypdfium2_lock
 
 if TYPE_CHECKING:
     from docling.datamodel.document import InputDocument
@@ -182,20 +183,24 @@ class DoclingParseV2PageBackend(PdfPageBackend):
         padbox.r = page_size.width - padbox.r
         padbox.t = page_size.height - padbox.t
 
-        image = (
-            self._ppage.render(
-                scale=scale * 1.5,
-                rotation=0,  # no additional rotation
-                crop=padbox.as_tuple(),
-            )
-            .to_pil()
-            .resize(size=(round(cropbox.width * scale), round(cropbox.height * scale)))
-        )  # We resize the image from 1.5x the given scale to make it sharper.
+        with pypdfium2_lock:
+            image = (
+                self._ppage.render(
+                    scale=scale * 1.5,
+                    rotation=0,  # no additional rotation
+                    crop=padbox.as_tuple(),
+                )
+                .to_pil()
+                .resize(
+                    size=(round(cropbox.width * scale), round(cropbox.height * scale))
+                )
+            )  # We resize the image from 1.5x the given scale to make it sharper.
 
         return image
 
     def get_size(self) -> Size:
-        return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
+        with pypdfium2_lock:
+            return Size(width=self._ppage.get_width(), height=self._ppage.get_height())
 
     def unload(self):
         self._ppage = None
@@ -206,23 +211,24 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
 
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
 
-        self._pdoc = pdfium.PdfDocument(self.path_or_stream)
-        self.parser = pdf_parser_v2("fatal")
+        with pypdfium2_lock:
+            self._pdoc = pdfium.PdfDocument(self.path_or_stream)
+            self.parser = pdf_parser_v2("fatal")
 
-        success = False
-        if isinstance(self.path_or_stream, BytesIO):
-            success = self.parser.load_document_from_bytesio(
-                self.document_hash, self.path_or_stream
-            )
-        elif isinstance(self.path_or_stream, Path):
-            success = self.parser.load_document(
-                self.document_hash, str(self.path_or_stream)
-            )
+            success = False
+            if isinstance(self.path_or_stream, BytesIO):
+                success = self.parser.load_document_from_bytesio(
+                    self.document_hash, self.path_or_stream
+                )
+            elif isinstance(self.path_or_stream, Path):
+                success = self.parser.load_document(
+                    self.document_hash, str(self.path_or_stream)
+                )
 
-        if not success:
-            raise RuntimeError(
-                f"docling-parse v2 could not load document {self.document_hash}."
-            )
+            if not success:
+                raise RuntimeError(
+                    f"docling-parse v2 could not load document {self.document_hash}."
+                )
 
     def page_count(self) -> int:
         # return len(self._pdoc)  # To be replaced with docling-parse API
@@ -236,9 +242,10 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
         return len_2
 
     def load_page(self, page_no: int) -> DoclingParseV2PageBackend:
-        return DoclingParseV2PageBackend(
-            self.parser, self.document_hash, page_no, self._pdoc[page_no]
-        )
+        with pypdfium2_lock:
+            return DoclingParseV2PageBackend(
+                self.parser, self.document_hash, page_no, self._pdoc[page_no]
+            )
 
     def is_valid(self) -> bool:
         return self.page_count() > 0
@@ -246,5 +253,6 @@ class DoclingParseV2DocumentBackend(PdfDocumentBackend):
     def unload(self):
         super().unload()
         self.parser.unload_document(self.document_hash)
-        self._pdoc.close()
-        self._pdoc = None
+        with pypdfium2_lock:
+            self._pdoc.close()
+            self._pdoc = None
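The `pypdfium2_lock` used throughout the changes above is imported from `docling/utils/locks.py`, a module that is not part of this diff. Judging from how it is used, it is presumably just a single module-level `threading.Lock` that serializes every pdfium call (open, render, measure, close), since pdfium itself is not thread-safe. A minimal sketch, assuming that layout:

```python
# docling/utils/locks.py -- illustrative sketch only; the real module is not
# shown in this diff. One process-wide lock shared by every pypdfium2 caller,
# so that no two threads enter pdfium at the same time.
import threading

pypdfium2_lock = threading.Lock()
```

Each backend method then wraps its pdfium access in `with pypdfium2_lock:`, as in the hunks above, so concurrent conversions block each other only for the duration of individual pdfium calls.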
diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index 5b7f5d81..d14b422f 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -1,9 +1,10 @@
 import logging
 from io import BytesIO
 from pathlib import Path
-from typing import Optional, Union, cast
+from typing import Final, Optional, Union, cast
 
 from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
+from bs4.element import PreformattedString
 from docling_core.types.doc import (
     DocItem,
     DocItemLabel,
@@ -14,6 +15,7 @@ from docling_core.types.doc import (
     TableCell,
     TableData,
 )
+from docling_core.types.doc.document import ContentLayer
 from typing_extensions import override
 
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
@@ -22,12 +24,29 @@ from docling.datamodel.document import InputDocument
 
 _log = logging.getLogger(__name__)
 
+# tags that generate NodeItem elements
+TAGS_FOR_NODE_ITEMS: Final = [
+    "h1",
+    "h2",
+    "h3",
+    "h4",
+    "h5",
+    "h6",
+    "p",
+    "pre",
+    "ul",
+    "ol",
+    "li",
+    "table",
+    "figure",
+    "img",
+]
+
 
 class HTMLDocumentBackend(DeclarativeDocumentBackend):
     @override
     def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
         super().__init__(in_doc, path_or_stream)
-        _log.debug("About to init HTML backend...")
         self.soup: Optional[Tag] = None
         # HTML file:
         self.path_or_stream = path_or_stream
@@ -48,7 +67,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             self.soup = BeautifulSoup(html_content, "html.parser")
         except Exception as e:
             raise RuntimeError(
-                f"Could not initialize HTML backend for file with hash {self.document_hash}."
+                "Could not initialize HTML backend for file with "
+                f"hash {self.document_hash}."
             ) from e
 
     @override
@@ -88,17 +108,26 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             assert self.soup is not None
             content = self.soup.body or self.soup
             # Replace <br> tags with newline characters
+            # TODO: remove style to avoid losing text from tags like i, b, span, ...
             for br in content("br"):
                 br.replace_with(NavigableString("\n"))
+
+            headers = content.find(["h1", "h2", "h3", "h4", "h5", "h6"])
+            self.content_layer = (
+                ContentLayer.BODY if headers is None else ContentLayer.FURNITURE
+            )
             self.walk(content, doc)
         else:
             raise RuntimeError(
-                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
+                f"Cannot convert doc with {self.document_hash} because the backend "
+                "failed to init."
             )
 
         return doc
 
     def walk(self, tag: Tag, doc: DoclingDocument) -> None:
+        # Iterate over elements in the body of the document
+        text: str = ""
         for element in tag.children:
             if isinstance(element, Tag):
                 try:
@@ -108,6 +137,26 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                         f"Error processing child from tag{tag.name}: {exc_child}"
                     )
                     raise exc_child
+            elif isinstance(element, NavigableString) and not isinstance(
+                element, PreformattedString
+            ):
+                # Floating text outside paragraphs or analyzed tags
+                text += element
+                siblings: list[Tag] = [
+                    item for item in element.next_siblings if isinstance(item, Tag)
+                ]
+                if element.next_sibling is None or any(
+                    [item.name in TAGS_FOR_NODE_ITEMS for item in siblings]
+                ):
+                    text = text.strip()
+                    if text and tag.name in ["div"]:
+                        doc.add_text(
+                            parent=self.parents[self.level],
+                            label=DocItemLabel.TEXT,
+                            text=text,
+                            content_layer=self.content_layer,
+                        )
+                    text = ""
 
         return
 
@@ -127,7 +176,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         elif tag.name == "figure":
             self.handle_figure(tag, doc)
         elif tag.name == "img":
-            self.handle_image(doc)
+            self.handle_image(tag, doc)
         else:
             self.walk(tag, doc)
 
@@ -158,12 +207,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         text = element.text.strip()
 
         if hlevel == 1:
-            for key, val in self.parents.items():
+            self.content_layer = ContentLayer.BODY
+
+            for key in self.parents.keys():
                 self.parents[key] = None
 
             self.level = 1
             self.parents[self.level] = doc.add_text(
-                parent=self.parents[0], label=DocItemLabel.TITLE, text=text
+                parent=self.parents[0],
+                label=DocItemLabel.TITLE,
+                text=text,
+                content_layer=self.content_layer,
             )
         else:
             if hlevel > self.level:
@@ -174,6 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                         name=f"header-{i}",
                         label=GroupLabel.SECTION,
                         parent=self.parents[i - 1],
+                        content_layer=self.content_layer,
                     )
                 self.level = hlevel
 
@@ -189,6 +244,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 parent=self.parents[hlevel - 1],
                 text=text,
                 level=hlevel,
+                content_layer=self.content_layer,
             )
 
     def handle_code(self, element: Tag, doc: DoclingDocument) -> None:
@@ -197,16 +253,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
             return
         text = element.text.strip()
         if text:
-            doc.add_code(parent=self.parents[self.level], text=text)
+            doc.add_code(
+                parent=self.parents[self.level],
+                text=text,
+                content_layer=self.content_layer,
+            )
 
     def handle_paragraph(self, element: Tag, doc: DoclingDocument) -> None:
         """Handles paragraph tags (p)."""
         if element.text is None:
             return
         text = element.text.strip()
-        label = DocItemLabel.PARAGRAPH
         if text:
-            doc.add_text(parent=self.parents[self.level], label=label, text=text)
+            doc.add_text(
+                parent=self.parents[self.level],
+                label=DocItemLabel.TEXT,
+                text=text,
+                content_layer=self.content_layer,
+            )
 
     def handle_list(self, element: Tag, doc: DoclingDocument) -> None:
         """Handles list tags (ul, ol) and their list items."""
@@ -214,14 +278,24 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         if element.name == "ul":
             # create a list group
             self.parents[self.level + 1] = doc.add_group(
-                parent=self.parents[self.level], name="list", label=GroupLabel.LIST
+                parent=self.parents[self.level],
+                name="list",
+                label=GroupLabel.LIST,
+                content_layer=self.content_layer,
             )
         elif element.name == "ol":
+            start_attr = element.get("start")
+            start: int = (
+                int(start_attr)
+                if isinstance(start_attr, str) and start_attr.isnumeric()
+                else 1
+            )
             # create a list group
             self.parents[self.level + 1] = doc.add_group(
                 parent=self.parents[self.level],
-                name="ordered list",
+                name="ordered list" + (f" start {start}" if start != 1 else ""),
                 label=GroupLabel.ORDERED_LIST,
+                content_layer=self.content_layer,
             )
 
         self.level += 1
@@ -231,15 +305,23 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
         self.level -= 1
 
     def handle_list_item(self, element: Tag, doc: DoclingDocument) -> None:
-        """Handles listitem tags (li)."""
+        """Handles list item tags (li)."""
        nested_list = element.find(["ul", "ol"])
 
         parent = self.parents[self.level]
         if parent is None:
-            _log.warning(f"list-item has no parent in DoclingDocument: {element}")
+            _log.debug(f"list-item has no parent in DoclingDocument: {element}")
             return
         parent_label: str = parent.label
         index_in_list = len(parent.children) + 1
+        if (
+            parent_label == GroupLabel.ORDERED_LIST
+            and isinstance(parent, GroupItem)
+            and parent.name
+        ):
+            start_in_list: str = parent.name.split(" ")[-1]
+            start: int = int(start_in_list) if start_in_list.isnumeric() else 1
+            index_in_list += start - 1
 
         if nested_list:
             # Text in list item can be hidden within hierarchy, hence
@@ -262,6 +344,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 enumerated=enumerated,
                 marker=marker,
                 parent=parent,
+                content_layer=self.content_layer,
             )
             self.level += 1
 
@@ -283,15 +366,16 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
                 enumerated=enumerated,
                 marker=marker,
                 parent=parent,
+                content_layer=self.content_layer,
             )
         else:
-            _log.warning(f"list-item has no text: {element}")
+            _log.debug(f"list-item has no text: {element}")
 
     @staticmethod
     def parse_table_data(element: Tag) -> Optional[TableData]:
         nested_tables = element.find("table")
         if nested_tables is not None:
-            _log.warning("Skipping nested table.")
+            _log.debug("Skipping nested table.")
             return None
 
         # Count the number of rows (number of <tr> elements)
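To make the ordered-list handling above concrete: `handle_list` records a non-default `start` attribute by appending ` start N` to the group name, and `handle_list_item` later parses that suffix back out of `parent.name` to offset `index_in_list`. A small, self-contained sketch of the same round trip (the `DoclingDocument` calls are replaced by a `print`; only the naming convention is taken from the code above):

```python
# Illustrative sketch only (not part of the diff): how an <ol start="3"> list
# is expected to be numbered after this change.
from bs4 import BeautifulSoup

html = '<ol start="3"><li>third item</li><li>fourth item</li></ol>'
ol = BeautifulSoup(html, "html.parser").ol

# handle_list: encode the start value in the group name.
start_attr = ol.get("start")
start = (
    int(start_attr) if isinstance(start_attr, str) and start_attr.isnumeric() else 1
)
group_name = "ordered list" + (f" start {start}" if start != 1 else "")

# handle_list_item: recover the offset from the group name.
suffix = group_name.split(" ")[-1]
offset = int(suffix) if suffix.isnumeric() else 1
for index, li in enumerate(ol.find_all("li"), start=offset):
    print(f"{index}. {li.get_text()}")  # -> "3. third item", "4. fourth item"
```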
@@ -386,7 +470,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
 
         table_data = HTMLDocumentBackend.parse_table_data(element)
         if table_data is not None:
-            doc.add_table(data=table_data, parent=self.parents[self.level])
+            doc.add_table(
+                data=table_data,
+                parent=self.parents[self.level],
+                content_layer=self.content_layer,
+            )
 
     def get_list_text(self, list_element: Tag, level: int = 0) -> list[str]:
         """Recursively extract text from