diff --git a/.github/actions/setup-poetry/action.yml b/.github/actions/setup-poetry/action.yml index 0bdd730c..473326dc 100644 --- a/.github/actions/setup-poetry/action.yml +++ b/.github/actions/setup-poetry/action.yml @@ -8,7 +8,7 @@ runs: using: 'composite' steps: - name: Install poetry - run: pipx install poetry==1.8.3 + run: pipx install poetry==1.8.5 shell: bash - uses: actions/setup-python@v5 with: diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 531ec8dd..e04e2803 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -6,7 +6,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.9', '3.10', '3.11', '3.12'] + python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] steps: - uses: actions/checkout@v4 - name: Install tesseract diff --git a/CHANGELOG.md b/CHANGELOG.md index 906e8300..597dde6e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,27 @@ +## [v2.17.0](https://github.com/DS4SD/docling/releases/tag/v2.17.0) - 2025-01-28 + +### Feature + +* **CLI:** Expose code and formula models in the CLI ([#820](https://github.com/DS4SD/docling/issues/820)) ([`6882e6c`](https://github.com/DS4SD/docling/commit/6882e6c38df30e4d4a1b83e01b13900ca7ea001f)) +* Add platform info to CLI version printout ([#816](https://github.com/DS4SD/docling/issues/816)) ([`95b293a`](https://github.com/DS4SD/docling/commit/95b293a72356f94c7076e3649be970c8a51121a3)) +* **ocr:** Expose `rec_keys_path` in RapidOcrOptions to support custom dictionaries ([#786](https://github.com/DS4SD/docling/issues/786)) ([`5332755`](https://github.com/DS4SD/docling/commit/53327552e83ced079ae50d8067ba7a8ce80cd9ad)) +* Introduce automatic language detection in TesseractOcrCliModel ([#800](https://github.com/DS4SD/docling/issues/800)) ([`3be2fb5`](https://github.com/DS4SD/docling/commit/3be2fb581fe5a2ebd5cec9c86bb22eb1dec6fd0f)) + +### Fix + +* Fix single newline handling in MD backend 
([#824](https://github.com/DS4SD/docling/issues/824)) ([`5aed9f8`](https://github.com/DS4SD/docling/commit/5aed9f8aeba1624ba1a721e2ed3ba4aceaa7a482)) +* Use file extension if filetype fails with PDF ([#827](https://github.com/DS4SD/docling/issues/827)) ([`adf6353`](https://github.com/DS4SD/docling/commit/adf635348365f82daa64e3f879076a7baf71edc0)) +* Parse html with omitted body tag ([#818](https://github.com/DS4SD/docling/issues/818)) ([`a112d7a`](https://github.com/DS4SD/docling/commit/a112d7a03512e8a00842a100416426254d6ecfc0)) + +### Documentation + +* Document Docling JSON parsing ([#819](https://github.com/DS4SD/docling/issues/819)) ([`6875913`](https://github.com/DS4SD/docling/commit/6875913e34abacb8d71b5d31543adbf7b5bd5e92)) +* Add SSL verification error mitigation ([#821](https://github.com/DS4SD/docling/issues/821)) ([`5139b48`](https://github.com/DS4SD/docling/commit/5139b48e4e62bb061d956c132958ec2e6d88e40a)) +* **backend XML:** Do not delete temp file in notebook ([#817](https://github.com/DS4SD/docling/issues/817)) ([`4d41db3`](https://github.com/DS4SD/docling/commit/4d41db3f7abb86c8c65386bf94e7eb0bf22bb82b)) +* Typo ([#814](https://github.com/DS4SD/docling/issues/814)) ([`8a4ec77`](https://github.com/DS4SD/docling/commit/8a4ec77576b8a9fd60d0047939665d00cf93b4dd)) +* Added markdown headings to enable TOC in github pages ([#808](https://github.com/DS4SD/docling/issues/808)) ([`b885b2f`](https://github.com/DS4SD/docling/commit/b885b2fa3c2519c399ed4b9a3dd4c2f6f62235d1)) +* Description of supported formats and backends ([#788](https://github.com/DS4SD/docling/issues/788)) ([`c2ae1cc`](https://github.com/DS4SD/docling/commit/c2ae1cc4cab0f9e693c7ca460fe8afa5b515ee94)) + ## [v2.16.0](https://github.com/DS4SD/docling/releases/tag/v2.16.0) - 2025-01-24 ### Feature diff --git a/README.md b/README.md index 78acb592..5a957d60 100644 --- a/README.md +++ b/README.md @@ -22,23 +22,25 @@ [![License 
MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT) [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling) -Docling parses documents and exports them to the desired format with ease and speed. +Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem. ## Features -* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images) -* 📑 Advanced PDF document understanding including page layout, reading order & table structures -* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format -* 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI -* 🔍 OCR support for scanned PDFs +* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more +* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more +* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format +* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON +* 🔒 Local execution capabilities for sensitive data and air-gapped environments +* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI +* 🔍 Extensive OCR support for scanned PDFs and images * 💻 Simple and convenient CLI -Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty examples and unlock the full power of Docling! 
- ### Coming soon -* ♾️ Equation & code extraction * 📝 Metadata extraction, including title, authors, references & language +* 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling)) +* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc) +* 📝 Complex chemistry understanding (Molecular structures) ## Installation @@ -120,3 +122,7 @@ For individual model usage, please refer to the model licenses found in the orig ## IBM ❤️ Open Source AI Docling has been brought to you by IBM. + +[supported_formats]: https://ds4sd.github.io/docling/supported_formats/ +[docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/ +[integrations]: https://ds4sd.github.io/docling/integrations/ diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 3de333dc..286dfbfa 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -1,9 +1,9 @@ import logging from io import BytesIO from pathlib import Path -from typing import Set, Union +from typing import Optional, Set, Union -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag from docling_core.types.doc import ( DocItemLabel, DoclingDocument, @@ -24,7 +24,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): super().__init__(in_doc, path_or_stream) _log.debug("About to init HTML backend...") - self.soup = None + self.soup: Optional[Tag] = None # HTML file: self.path_or_stream = path_or_stream # Initialise the parents for the hierarchy @@ -89,7 +89,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): ) return doc - def walk(self, element, doc): + def walk(self, element: Tag, doc: DoclingDocument): try: # Iterate over elements in the body of the document for idx, element in enumerate(element.children): @@ -106,7 +106,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return doc - 
def analyse_element(self, element, idx, doc): + def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument): """ if element.name!=None: _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})") @@ -136,7 +136,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): else: self.walk(element, doc) - def get_direct_text(self, item): + def get_direct_text(self, item: Tag): """Get the direct text of the
<li> element (ignoring nested lists).""" text = item.find(string=True, recursive=False) if isinstance(text, str): @@ -145,7 +145,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return "" # Function to recursively extract text from all child nodes - def extract_text_recursively(self, item): + def extract_text_recursively(self, item: Tag): result = [] if isinstance(item, str): @@ -166,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return "".join(result) + " " - def handle_header(self, element, idx, doc): + def handle_header(self, element: Tag, idx: int, doc: DoclingDocument): """Handles header tags (h1, h2, etc.).""" hlevel = int(element.name.replace("h", "")) slevel = hlevel - 1 @@ -208,7 +208,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): level=hlevel, ) - def handle_code(self, element, idx, doc): + def handle_code(self, element: Tag, idx: int, doc: DoclingDocument): """Handles monospace code snippets (pre).""" if element.text is None: return @@ -216,9 +216,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): label = DocItemLabel.CODE if len(text) == 0: return - doc.add_code(parent=self.parents[self.level], label=label, text=text) + doc.add_code(parent=self.parents[self.level], text=text) - def handle_paragraph(self, element, idx, doc): + def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument): """Handles paragraph tags (p).""" if element.text is None: return @@ -228,7 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return doc.add_text(parent=self.parents[self.level], label=label, text=text) - def handle_list(self, element, idx, doc): + def handle_list(self, element: Tag, idx: int, doc: DoclingDocument): """Handles list tags (ul, ol) and their list items.""" if element.name == "ul": @@ -250,7 +250,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.parents[self.level + 1] = None self.level -= 1 - def handle_listitem(self, element, idx, doc): + def handle_listitem(self, 
element: Tag, idx: int, doc: DoclingDocument): """Handles listitem tags (li).""" nested_lists = element.find(["ul", "ol"]) @@ -304,7 +304,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): else: _log.warn("list-item has no text: ", element) - def handle_table(self, element, idx, doc): + def handle_table(self, element: Tag, idx: int, doc: DoclingDocument): """Handles table tags.""" nested_tables = element.find("table") @@ -377,7 +377,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): doc.add_table(data=data, parent=self.parents[self.level]) - def get_list_text(self, list_element, level=0): + def get_list_text(self, list_element: Tag, level=0): """Recursively extract text from