diff --git a/.github/actions/setup-poetry/action.yml b/.github/actions/setup-poetry/action.yml index 0bdd730c..473326dc 100644 --- a/.github/actions/setup-poetry/action.yml +++ b/.github/actions/setup-poetry/action.yml @@ -8,7 +8,7 @@ runs: using: 'composite' steps: - name: Install poetry - run: pipx install poetry==1.8.3 + run: pipx install poetry==1.8.5 shell: bash - uses: actions/setup-python@v5 with: diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 1cd08f2c..e04e2803 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -6,11 +6,11 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.9', '3.10', '3.11', '3.12'] + python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] steps: - uses: actions/checkout@v4 - name: Install tesseract - run: sudo apt-get update && sudo apt-get install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa libleptonica-dev libtesseract-dev pkg-config + run: sudo apt-get update && sudo apt-get install -y tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config - name: Set TESSDATA_PREFIX run: | echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV" diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 2733b522..0fc3ac7a 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -14,7 +14,10 @@ jobs: - uses: ./.github/actions/setup-poetry - name: Build docs run: poetry run mkdocs build --verbose --clean + - name: Make docs LLM ready + if: inputs.deploy + uses: demodrive-ai/llms-txt-action@ad720693843126e6a73910a667d0eba37c1dea4b - name: Build and push docs if: inputs.deploy - run: poetry run mkdocs gh-deploy --force - \ No newline at end of file + run: poetry run mkdocs gh-deploy --force --dirty + diff --git a/CHANGELOG.md b/CHANGELOG.md index 21882618..597dde6e 
100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,49 @@ +## [v2.17.0](https://github.com/DS4SD/docling/releases/tag/v2.17.0) - 2025-01-28 + +### Feature + +* **CLI:** Expose code and formula models in the CLI ([#820](https://github.com/DS4SD/docling/issues/820)) ([`6882e6c`](https://github.com/DS4SD/docling/commit/6882e6c38df30e4d4a1b83e01b13900ca7ea001f)) +* Add platform info to CLI version printout ([#816](https://github.com/DS4SD/docling/issues/816)) ([`95b293a`](https://github.com/DS4SD/docling/commit/95b293a72356f94c7076e3649be970c8a51121a3)) +* **ocr:** Expose `rec_keys_path` in RapidOcrOptions to support custom dictionaries ([#786](https://github.com/DS4SD/docling/issues/786)) ([`5332755`](https://github.com/DS4SD/docling/commit/53327552e83ced079ae50d8067ba7a8ce80cd9ad)) +* Introduce automatic language detection in TesseractOcrCliModel ([#800](https://github.com/DS4SD/docling/issues/800)) ([`3be2fb5`](https://github.com/DS4SD/docling/commit/3be2fb581fe5a2ebd5cec9c86bb22eb1dec6fd0f)) + +### Fix + +* Fix single newline handling in MD backend ([#824](https://github.com/DS4SD/docling/issues/824)) ([`5aed9f8`](https://github.com/DS4SD/docling/commit/5aed9f8aeba1624ba1a721e2ed3ba4aceaa7a482)) +* Use file extension if filetype fails with PDF ([#827](https://github.com/DS4SD/docling/issues/827)) ([`adf6353`](https://github.com/DS4SD/docling/commit/adf635348365f82daa64e3f879076a7baf71edc0)) +* Parse html with omitted body tag ([#818](https://github.com/DS4SD/docling/issues/818)) ([`a112d7a`](https://github.com/DS4SD/docling/commit/a112d7a03512e8a00842a100416426254d6ecfc0)) + +### Documentation + +* Document Docling JSON parsing ([#819](https://github.com/DS4SD/docling/issues/819)) ([`6875913`](https://github.com/DS4SD/docling/commit/6875913e34abacb8d71b5d31543adbf7b5bd5e92)) +* Add SSL verification error mitigation ([#821](https://github.com/DS4SD/docling/issues/821)) 
([`5139b48`](https://github.com/DS4SD/docling/commit/5139b48e4e62bb061d956c132958ec2e6d88e40a)) +* **backend XML:** Do not delete temp file in notebook ([#817](https://github.com/DS4SD/docling/issues/817)) ([`4d41db3`](https://github.com/DS4SD/docling/commit/4d41db3f7abb86c8c65386bf94e7eb0bf22bb82b)) +* Typo ([#814](https://github.com/DS4SD/docling/issues/814)) ([`8a4ec77`](https://github.com/DS4SD/docling/commit/8a4ec77576b8a9fd60d0047939665d00cf93b4dd)) +* Added markdown headings to enable TOC in github pages ([#808](https://github.com/DS4SD/docling/issues/808)) ([`b885b2f`](https://github.com/DS4SD/docling/commit/b885b2fa3c2519c399ed4b9a3dd4c2f6f62235d1)) +* Description of supported formats and backends ([#788](https://github.com/DS4SD/docling/issues/788)) ([`c2ae1cc`](https://github.com/DS4SD/docling/commit/c2ae1cc4cab0f9e693c7ca460fe8afa5b515ee94)) + +## [v2.16.0](https://github.com/DS4SD/docling/releases/tag/v2.16.0) - 2025-01-24 + +### Feature + +* New document picture classifier ([#805](https://github.com/DS4SD/docling/issues/805)) ([`16a218d`](https://github.com/DS4SD/docling/commit/16a218d871c48fd9cc636b77f7b597dc40cbeeec)) +* Add Docling JSON ingestion ([#783](https://github.com/DS4SD/docling/issues/783)) ([`88a0e66`](https://github.com/DS4SD/docling/commit/88a0e66adc19238f57a942b0504926cdaeacd8cc)) +* Code and equation model for PDF and code blocks in markdown ([#752](https://github.com/DS4SD/docling/issues/752)) ([`3213b24`](https://github.com/DS4SD/docling/commit/3213b247ad6870ff984271f09f7720be68d9479b)) +* Add "auto" language for TesseractOcr ([#759](https://github.com/DS4SD/docling/issues/759)) ([`8543c22`](https://github.com/DS4SD/docling/commit/8543c22687fee40459d393bf4adcfc059712de02)) + +### Fix + +* Added extraction of byte-images in excel ([#804](https://github.com/DS4SD/docling/issues/804)) ([`a458e29`](https://github.com/DS4SD/docling/commit/a458e298ca64da2c6df29d953e95645525817bed)) +* Update docling-parse-v2 backend version with new 
parsing fixes ([#769](https://github.com/DS4SD/docling/issues/769)) ([`670a08b`](https://github.com/DS4SD/docling/commit/670a08bdedda847ff3b6942bcaa1a2adef79afe2)) + +### Documentation + +* Fix minor typos ([#801](https://github.com/DS4SD/docling/issues/801)) ([`c58f75d`](https://github.com/DS4SD/docling/commit/c58f75d0f75040e32820cc2915ec00755211c02f)) +* Add Azure RAG example ([#675](https://github.com/DS4SD/docling/issues/675)) ([`9020a93`](https://github.com/DS4SD/docling/commit/9020a934be35b0798c972eb77a22fb62ce654ca5)) +* Fix links between docs pages ([#697](https://github.com/DS4SD/docling/issues/697)) ([`c49b352`](https://github.com/DS4SD/docling/commit/c49b3526fb7b72e8007f785b1fcfdf58c2457756)) +* Fix correct Accelerator pipeline options in docs/examples/custom_convert.py ([#733](https://github.com/DS4SD/docling/issues/733)) ([`7686083`](https://github.com/DS4SD/docling/commit/768608351d40376c3504546f52e967195536b3d5)) +* Example to translate documents ([#739](https://github.com/DS4SD/docling/issues/739)) ([`f7e1cbf`](https://github.com/DS4SD/docling/commit/f7e1cbf629ae5f3e279296e72f656b7a453ab7a3)) + ## [v2.15.1](https://github.com/DS4SD/docling/releases/tag/v2.15.1) - 2025-01-10 ### Fix diff --git a/README.md b/README.md index 78acb592..5a957d60 100644 --- a/README.md +++ b/README.md @@ -22,23 +22,25 @@ [![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT) [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling) -Docling parses documents and exports them to the desired format with ease and speed. +Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem. 
## Features -* 🗂️ Reads popular document formats (PDF, DOCX, PPTX, XLSX, Images, HTML, AsciiDoc & Markdown) and exports to HTML, Markdown and JSON (with embedded and referenced images) -* 📑 Advanced PDF document understanding including page layout, reading order & table structures -* 🧩 Unified, expressive [DoclingDocument](https://ds4sd.github.io/docling/concepts/docling_document/) representation format -* 🤖 Plug-and-play [integrations](https://ds4sd.github.io/docling/integrations/) incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI -* 🔍 OCR support for scanned PDFs +* 🗂️ Parsing of [multiple document formats][supported_formats] incl. PDF, DOCX, XLSX, HTML, images, and more +* 📑 Advanced PDF understanding incl. page layout, reading order, table structure, code, formulas, image classification, and more +* 🧬 Unified, expressive [DoclingDocument][docling_document] representation format +* ↪️ Various [export formats][supported_formats] and options, including Markdown, HTML, and lossless JSON +* 🔒 Local execution capabilities for sensitive data and air-gapped environments +* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI +* 🔍 Extensive OCR support for scanned PDFs and images * 💻 Simple and convenient CLI -Explore the [documentation](https://ds4sd.github.io/docling/) to discover plenty examples and unlock the full power of Docling! - ### Coming soon -* ♾️ Equation & code extraction * 📝 Metadata extraction, including title, authors, references & language +* 📝 Inclusion of Visual Language Models ([SmolDocling](https://huggingface.co/blog/smolervlm#smoldocling)) +* 📝 Chart understanding (Barchart, Piechart, LinePlot, etc) +* 📝 Complex chemistry understanding (Molecular structures) ## Installation @@ -120,3 +122,7 @@ For individual model usage, please refer to the model licenses found in the orig ## IBM ❤️ Open Source AI Docling has been brought to you by IBM. 
+ +[supported_formats]: https://ds4sd.github.io/docling/supported_formats/ +[docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/ +[integrations]: https://ds4sd.github.io/docling/integrations/ diff --git a/docling/backend/abstract_backend.py b/docling/backend/abstract_backend.py index b47b11cd..491330b3 100644 --- a/docling/backend/abstract_backend.py +++ b/docling/backend/abstract_backend.py @@ -27,7 +27,6 @@ class AbstractDocumentBackend(ABC): def supports_pagination(cls) -> bool: pass - @abstractmethod def unload(self): if isinstance(self.path_or_stream, BytesIO): self.path_or_stream.close() diff --git a/docling/backend/asciidoc_backend.py b/docling/backend/asciidoc_backend.py index 829419af..397bfc44 100644 --- a/docling/backend/asciidoc_backend.py +++ b/docling/backend/asciidoc_backend.py @@ -24,7 +24,6 @@ _log = logging.getLogger(__name__) class AsciiDocBackend(DeclarativeDocumentBackend): - def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]): super().__init__(in_doc, path_or_stream) diff --git a/docling/backend/docling_parse_backend.py b/docling/backend/docling_parse_backend.py index 89b25ee1..6d22127b 100644 --- a/docling/backend/docling_parse_backend.py +++ b/docling/backend/docling_parse_backend.py @@ -163,7 +163,7 @@ class DoclingParsePageBackend(PdfPageBackend): l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT ) else: - padbox = cropbox.to_bottom_left_origin(page_size.height) + padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy() padbox.r = page_size.width - padbox.r padbox.t = page_size.height - padbox.t diff --git a/docling/backend/docling_parse_v2_backend.py b/docling/backend/docling_parse_v2_backend.py index 366fa6ac..27a368f9 100644 --- a/docling/backend/docling_parse_v2_backend.py +++ b/docling/backend/docling_parse_v2_backend.py @@ -178,7 +178,7 @@ class DoclingParseV2PageBackend(PdfPageBackend): l=0, r=0, t=0, b=0, coord_origin=CoordOrigin.BOTTOMLEFT ) else: - padbox = 
cropbox.to_bottom_left_origin(page_size.height) + padbox = cropbox.to_bottom_left_origin(page_size.height).model_copy() padbox.r = page_size.width - padbox.r padbox.t = page_size.height - padbox.t diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index ae478885..286dfbfa 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -1,9 +1,9 @@ import logging from io import BytesIO from pathlib import Path -from typing import Set, Union +from typing import Optional, Set, Union -from bs4 import BeautifulSoup +from bs4 import BeautifulSoup, Tag from docling_core.types.doc import ( DocItemLabel, DoclingDocument, @@ -24,7 +24,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): super().__init__(in_doc, path_or_stream) _log.debug("About to init HTML backend...") - self.soup = None + self.soup: Optional[Tag] = None # HTML file: self.path_or_stream = path_or_stream # Initialise the parents for the hierarchy @@ -78,17 +78,18 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): if self.is_valid(): assert self.soup is not None + content = self.soup.body or self.soup # Replace
<br> tags with newline characters - for br in self.soup.body.find_all("br"): + for br in content.find_all("br"): br.replace_with("\n") - doc = self.walk(self.soup.body, doc) + doc = self.walk(content, doc) else: raise RuntimeError( f"Cannot convert doc with {self.document_hash} because the backend failed to init." ) return doc - def walk(self, element, doc): + def walk(self, element: Tag, doc: DoclingDocument): try: # Iterate over elements in the body of the document for idx, element in enumerate(element.children): @@ -105,7 +106,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return doc - def analyse_element(self, element, idx, doc): + def analyse_element(self, element: Tag, idx: int, doc: DoclingDocument): """ if element.name!=None: _log.debug("\t"*self.level, idx, "\t", f"{element.name} ({self.level})") @@ -135,7 +136,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): else: self.walk(element, doc) - def get_direct_text(self, item): + def get_direct_text(self, item: Tag): """Get the direct text of the
<li> element (ignoring nested lists).""" text = item.find(string=True, recursive=False) if isinstance(text, str): @@ -144,7 +145,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return "" # Function to recursively extract text from all child nodes - def extract_text_recursively(self, item): + def extract_text_recursively(self, item: Tag): result = [] if isinstance(item, str): @@ -165,7 +166,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return "".join(result) + " " - def handle_header(self, element, idx, doc): + def handle_header(self, element: Tag, idx: int, doc: DoclingDocument): """Handles header tags (h1, h2, etc.).""" hlevel = int(element.name.replace("h", "")) slevel = hlevel - 1 @@ -207,7 +208,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): level=hlevel, ) - def handle_code(self, element, idx, doc): + def handle_code(self, element: Tag, idx: int, doc: DoclingDocument): """Handles monospace code snippets (pre).""" if element.text is None: return @@ -215,9 +216,9 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): label = DocItemLabel.CODE if len(text) == 0: return - doc.add_text(parent=self.parents[self.level], label=label, text=text) + doc.add_code(parent=self.parents[self.level], text=text) - def handle_paragraph(self, element, idx, doc): + def handle_paragraph(self, element: Tag, idx: int, doc: DoclingDocument): """Handles paragraph tags (p).""" if element.text is None: return @@ -227,7 +228,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): return doc.add_text(parent=self.parents[self.level], label=label, text=text) - def handle_list(self, element, idx, doc): + def handle_list(self, element: Tag, idx: int, doc: DoclingDocument): """Handles list tags (ul, ol) and their list items.""" if element.name == "ul": @@ -249,7 +250,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.parents[self.level + 1] = None self.level -= 1 - def handle_listitem(self, element, idx, doc): + def handle_listitem(self,
element: Tag, idx: int, doc: DoclingDocument): """Handles listitem tags (li).""" nested_lists = element.find(["ul", "ol"]) @@ -303,7 +304,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): else: _log.warn("list-item has no text: ", element) - def handle_table(self, element, idx, doc): + def handle_table(self, element: Tag, idx: int, doc: DoclingDocument): """Handles table tags.""" nested_tables = element.find("table") @@ -376,7 +377,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): doc.add_table(data=data, parent=self.parents[self.level]) - def get_list_text(self, list_element, level=0): + def get_list_text(self, list_element: Tag, level=0): """Recursively extract text from