diff --git a/.github/actions/setup-poetry/action.yml b/.github/actions/setup-poetry/action.yml index 0bdd730c..473326dc 100644 --- a/.github/actions/setup-poetry/action.yml +++ b/.github/actions/setup-poetry/action.yml @@ -8,7 +8,7 @@ runs: using: 'composite' steps: - name: Install poetry - run: pipx install poetry==1.8.3 + run: pipx install poetry==1.8.5 shell: bash - uses: actions/setup-python@v5 with: diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 531ec8dd..19e8c1e1 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -6,7 +6,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.9', '3.10', '3.11', '3.12'] + python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] steps: - uses: actions/checkout@v4 - name: Install tesseract @@ -28,7 +28,7 @@ jobs: run: | for file in docs/examples/*.py; do # Skip batch_convert.py - if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment).py ]]; then + if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models).py ]]; then echo "Skipping $file" continue fi diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 0fc3ac7a..dd976ea3 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -14,10 +14,6 @@ jobs: - uses: ./.github/actions/setup-poetry - name: Build docs run: poetry run mkdocs build --verbose --clean - - name: Make docs LLM ready - if: inputs.deploy - uses: demodrive-ai/llms-txt-action@ad720693843126e6a73910a667d0eba37c1dea4b - name: Build and push docs if: inputs.deploy - run: poetry run mkdocs gh-deploy --force --dirty - + run: poetry run mkdocs gh-deploy --force diff --git a/CHANGELOG.md b/CHANGELOG.md index 597dde6e..4ad3b47d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,29 @@ +## [v2.18.0](https://github.com/DS4SD/docling/releases/tag/v2.18.0) - 
2025-02-03 + +### Feature + +* Expose equation exports ([#869](https://github.com/DS4SD/docling/issues/869)) ([`6a76b49`](https://github.com/DS4SD/docling/commit/6a76b49a4756fd00503d0baec5db8d23be8207e8)) +* Add option to define page range ([#852](https://github.com/DS4SD/docling/issues/852)) ([`70d68b6`](https://github.com/DS4SD/docling/commit/70d68b6164c6c7029b39dd65c5a278278768c381)) +* **docx:** Support of SDTs in docx backend ([#853](https://github.com/DS4SD/docling/issues/853)) ([`d727b04`](https://github.com/DS4SD/docling/commit/d727b04ad080df0b3811902059e0fe0539f7037e)) +* Python 3.13 support ([#841](https://github.com/DS4SD/docling/issues/841)) ([`4df085a`](https://github.com/DS4SD/docling/commit/4df085aa6c6f5cc043f4f7a9f0c1b4af43f95e8f)) + +### Fix + +* **markdown:** Fix parsing if doc ending with table ([#873](https://github.com/DS4SD/docling/issues/873)) ([`5ac2887`](https://github.com/DS4SD/docling/commit/5ac2887e4ad52ed6e7147e3af1e3ee5eb0006a70)) +* **markdown:** Add support for HTML content ([#855](https://github.com/DS4SD/docling/issues/855)) ([`94751a7`](https://github.com/DS4SD/docling/commit/94751a78f4f61b78f64952190717440ec6d84c62)) +* **docx:** Merged table cells not properly converted ([#857](https://github.com/DS4SD/docling/issues/857)) ([`0cd81a8`](https://github.com/DS4SD/docling/commit/0cd81a81226c0d4aa4f20e4e58c3b33e4fe50ce0)) +* Processing of placeholder shapes in pptx that have text but no bbox ([#868](https://github.com/DS4SD/docling/issues/868)) ([`eff16b6`](https://github.com/DS4SD/docling/commit/eff16b62ccdb0eb764eeacee550563898784dd6a)) +* KeyError in tableformer prediction ([#854](https://github.com/DS4SD/docling/issues/854)) ([`b1cf796`](https://github.com/DS4SD/docling/commit/b1cf796730901222ad0882ff44efa0ef43a743ee)) +* Fixed docx import with headers that are also lists ([#842](https://github.com/DS4SD/docling/issues/842)) ([`2c037ae`](https://github.com/DS4SD/docling/commit/2c037ae62e123967eddf065ccb2abbaf78cdcab3)) +* Use new 
add_code in html backend and add more typing hints ([#850](https://github.com/DS4SD/docling/issues/850)) ([`2a1f8af`](https://github.com/DS4SD/docling/commit/2a1f8afe7e8d9d508aebcfd3998ee1625c938933)) +* **markdown:** Fix empty block handling ([#843](https://github.com/DS4SD/docling/issues/843)) ([`bccb022`](https://github.com/DS4SD/docling/commit/bccb022fc82d4d0ef2ed2d8bea5f5d8e6400c1d9)) +* Fix for the crash when encountering WMF images in pptx and docx ([#837](https://github.com/DS4SD/docling/issues/837)) ([`fea0a99`](https://github.com/DS4SD/docling/commit/fea0a99a95d97e72687f48f8174d31102655483e)) + +### Documentation + +* Updated the readme with upcoming features ([#831](https://github.com/DS4SD/docling/issues/831)) ([`d7c0828`](https://github.com/DS4SD/docling/commit/d7c082894e3ef85881665d20167198adcbc1becd)) +* Add example for inspection of picture content ([#624](https://github.com/DS4SD/docling/issues/624)) ([`f9144f2`](https://github.com/DS4SD/docling/commit/f9144f2bb6b322244c9d37683dca1e537ec6d781)) + ## [v2.17.0](https://github.com/DS4SD/docling/releases/tag/v2.17.0) - 2025-01-28 ### Feature diff --git a/README.md b/README.md index 8050365f..5a957d60 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,9 @@ Docling simplifies document processing, parsing diverse formats — including ad ### Coming soon * 📝 Metadata extraction, including title, authors, references & language +* 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling)) +* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc) +* 📝 Complex chemistry understanding (Molecular structures) ## Installation diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 3de333dc..286dfbfa 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -1,9 +1,9 @@ import logging from io import BytesIO from pathlib import Path -from typing import Set, Union +from typing import Optional, Set, Union -from bs4 
import BeautifulSoup +from bs4 import BeautifulSoup, Tag from docling_core.types.doc import ( DocItemLabel, DoclingDocument, @@ -24,7 +24,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): super().__init__(in_doc, path_or_stream) _log.debug("About to init HTML backend...") - self.soup = None + self.soup: Optional[Tag] = None # HTML file: self.path_or_stream = path_or_stream # Initialise the parents for the hierarchy @@ -89,7 +89,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): ) return doc - def walk(self, element, doc): + def walk(self, element: Tag, doc: DoclingDocument): try: # Iterate over elements in the body of the document for idx, element in enumerate(element.children): @@ -106,7 +106,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return doc - def analyse_element(self, element, idx, doc): + def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument): """ if element.name!=None: _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})") @@ -136,7 +136,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): else: self.walk(element, doc) - def get_direct_text(self, item): + def get_direct_text(self, item: Tag): """Get the direct text of the
<li> element (ignoring nested lists).""" text = item.find(string=True, recursive=False) if isinstance(text, str): @@ -145,7 +145,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return "" # Function to recursively extract text from all child nodes - def extract_text_recursively(self, item): + def extract_text_recursively(self, item: Tag): result = [] if isinstance(item, str): @@ -166,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return "".join(result) + " " - def handle_header(self, element, idx, doc): + def handle_header(self, element: Tag, idx: int, doc: DoclingDocument): """Handles header tags (h1, h2, etc.).""" hlevel = int(element.name.replace("h", "")) slevel = hlevel - 1 @@ -208,7 +208,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): level=hlevel, ) - def handle_code(self, element, idx, doc): + def handle_code(self, element: Tag, idx: int, doc: DoclingDocument): """Handles monospace code snippets (pre).""" if element.text is None: return @@ -216,9 +216,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): label = DocItemLabel.CODE if len(text) == 0: return - doc.add_code(parent=self.parents[self.level], label=label, text=text) + doc.add_code(parent=self.parents[self.level], text=text) - def handle_paragraph(self, element, idx, doc): + def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument): """Handles paragraph tags (p).""" if element.text is None: return @@ -228,7 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return doc.add_text(parent=self.parents[self.level], label=label, text=text) - def handle_list(self, element, idx, doc): + def handle_list(self, element: Tag, idx: int, doc: DoclingDocument): """Handles list tags (ul, ol) and their list items.""" if element.name == "ul": @@ -250,7 +250,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.parents[self.level + 1] = None self.level -= 1 - def handle_listitem(self, element, idx, doc): + def handle_listitem(self, 
element: Tag, idx: int, doc: DoclingDocument): """Handles listitem tags (li).""" nested_lists = element.find(["ul", "ol"]) @@ -304,7 +304,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): else: _log.warn("list-item has no text: ", element) - def handle_table(self, element, idx, doc): + def handle_table(self, element: Tag, idx: int, doc: DoclingDocument): """Handles table tags.""" nested_tables = element.find("table") @@ -377,7 +377,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): doc.add_table(data=data, parent=self.parents[self.level]) - def get_list_text(self, list_element, level=0): + def get_list_text(self, list_element: Tag, level=0): """Recursively extract text from