From a40544a54661488378b2ecee1bb0354411efcf37 Mon Sep 17 00:00:00 2001
From: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Date: Mon, 3 Feb 2025 14:10:12 +0100
Subject: [PATCH 1/5] chore: clean up top-level file (#872)
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
---
word_tables.html | 75 ------------------------------------------------
1 file changed, 75 deletions(-)
delete mode 100644 word_tables.html
diff --git a/word_tables.html b/word_tables.html
deleted file mode 100644
index 30f6e8d3..00000000
--- a/word_tables.html
+++ /dev/null
@@ -1,75 +0,0 @@
-
-
-
-
-
-
- Powered by Docling
-
-
-
-Test with tables
-A uniform table
-Header 0.0 | Header 0.1 | Header 0.2 |
Cell 1.0 | Cell 1.1 | Cell 1.2 |
Cell 2.0 | Cell 2.1 | Cell 2.2 |
-
-A non-uniform table with horizontal spans
-Header 0.0 | Header 0.1 | Header 0.2 |
Cell 1.0 | Merged Cell 1.1 1.2 |
Cell 2.0 | Merged Cell 2.1 2.2 |
-
-A non-uniform table with horizontal spans in inner columns
-Header 0.0 | Header 0.1 | Header 0.2 | Header 0.3 |
Cell 1.0 | Merged Cell 1.1 1.2 | Cell 1.3 |
Cell 2.0 | Merged Cell 2.1 2.2 | Cell 2.3 |
-
-A non-uniform table with vertical spans
-Header 0.0 | Header 0.1 | Header 0.2 |
Cell 1.0 | Merged Cell 1.1 2.1 | Cell 1.2 |
Cell 2.0 | Cell 2.2 |
Cell 3.0 | Merged Cell 3.1 4.1 | Cell 3.2 |
Cell 4.0 | Cell 4.2 |
-
-A non-uniform table with all kinds of spans and empty cells
-Header 0.0 | Header 0.1 | Header 0.2 | | |
Cell 1.0 | Merged Cell 1.1 2.1 | Cell 1.2 | | |
Cell 2.0 | Cell 2.2 | | |
Cell 3.0 | Merged Cell 3.1 4.1 | Cell 3.2 | | |
Cell 4.0 | Cell 4.2 | Merged Cell 4.4 5.4 |
| | |
| | | | |
|
| | | | Cell 8.4 |
-
-
-
\ No newline at end of file
From 5ac2887e4ad52ed6e7147e3af1e3ee5eb0006a70 Mon Sep 17 00:00:00 2001
From: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Date: Mon, 3 Feb 2025 14:38:38 +0100
Subject: [PATCH 2/5] fix(markdown): fix parsing if doc ending with table
(#873)
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
---
docling/backend/md_backend.py | 1 +
tests/data/groundtruth/docling_v2/ending_with_table.md.md | 6 ++++++
tests/data/md/ending_with_table.md | 6 ++++++
3 files changed, 13 insertions(+)
create mode 100644 tests/data/groundtruth/docling_v2/ending_with_table.md.md
create mode 100644 tests/data/md/ending_with_table.md
diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py
index 669096eb..eaf47537 100644
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@@ -368,6 +368,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
# Start iterating from the root of the AST
self.iterate_elements(parsed_ast, 0, doc, None)
self.process_inline_text(None, doc) # handle last hanging inline text
+ self.close_table(doc=doc) # handle any last hanging table
# if HTML blocks were detected, export to HTML and delegate to HTML backend
if self._html_blocks > 0:
diff --git a/tests/data/groundtruth/docling_v2/ending_with_table.md.md b/tests/data/groundtruth/docling_v2/ending_with_table.md.md
new file mode 100644
index 00000000..9c179fe0
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/ending_with_table.md.md
@@ -0,0 +1,6 @@
+| Character | Name in German | Name in French | Name in Italian |
+|----------------|------------------|------------------|-------------------|
+| Scrooge McDuck | Dagobert Duck | Balthazar Picsou | Paperone |
+| Huey | Tick | Riri | Qui |
+| Dewey | Trick | Fifi | Quo |
+| Louie | Track | Loulou | Qua |
diff --git a/tests/data/md/ending_with_table.md b/tests/data/md/ending_with_table.md
new file mode 100644
index 00000000..6c491d64
--- /dev/null
+++ b/tests/data/md/ending_with_table.md
@@ -0,0 +1,6 @@
+| Character | Name in German | Name in French | Name in Italian |
+|---|---|---|---|
+| Scrooge McDuck | Dagobert Duck | Balthazar Picsou | Paperone |
+| Huey | Tick | Riri | Qui |
+| Dewey | Trick | Fifi | Quo |
+| Louie | Track | Loulou | Qua |
From b5da4080c9e966f47a82c267410aa59968352bd9 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
Date: Mon, 3 Feb 2025 14:58:50 +0000
Subject: [PATCH 3/5] chore: bump version to 2.18.0 [skip ci]
---
CHANGELOG.md | 26 ++++++++++++++++++++++++++
pyproject.toml | 2 +-
2 files changed, 27 insertions(+), 1 deletion(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 597dde6e..4ad3b47d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,29 @@
+## [v2.18.0](https://github.com/DS4SD/docling/releases/tag/v2.18.0) - 2025-02-03
+
+### Feature
+
+* Expose equation exports ([#869](https://github.com/DS4SD/docling/issues/869)) ([`6a76b49`](https://github.com/DS4SD/docling/commit/6a76b49a4756fd00503d0baec5db8d23be8207e8))
+* Add option to define page range ([#852](https://github.com/DS4SD/docling/issues/852)) ([`70d68b6`](https://github.com/DS4SD/docling/commit/70d68b6164c6c7029b39dd65c5a278278768c381))
+* **docx:** Support of SDTs in docx backend ([#853](https://github.com/DS4SD/docling/issues/853)) ([`d727b04`](https://github.com/DS4SD/docling/commit/d727b04ad080df0b3811902059e0fe0539f7037e))
+* Python 3.13 support ([#841](https://github.com/DS4SD/docling/issues/841)) ([`4df085a`](https://github.com/DS4SD/docling/commit/4df085aa6c6f5cc043f4f7a9f0c1b4af43f95e8f))
+
+### Fix
+
+* **markdown:** Fix parsing if doc ending with table ([#873](https://github.com/DS4SD/docling/issues/873)) ([`5ac2887`](https://github.com/DS4SD/docling/commit/5ac2887e4ad52ed6e7147e3af1e3ee5eb0006a70))
+* **markdown:** Add support for HTML content ([#855](https://github.com/DS4SD/docling/issues/855)) ([`94751a7`](https://github.com/DS4SD/docling/commit/94751a78f4f61b78f64952190717440ec6d84c62))
+* **docx:** Merged table cells not properly converted ([#857](https://github.com/DS4SD/docling/issues/857)) ([`0cd81a8`](https://github.com/DS4SD/docling/commit/0cd81a81226c0d4aa4f20e4e58c3b33e4fe50ce0))
+* Processing of placeholder shapes in pptx that have text but no bbox ([#868](https://github.com/DS4SD/docling/issues/868)) ([`eff16b6`](https://github.com/DS4SD/docling/commit/eff16b62ccdb0eb764eeacee550563898784dd6a))
+* KeyError in tableformer prediction ([#854](https://github.com/DS4SD/docling/issues/854)) ([`b1cf796`](https://github.com/DS4SD/docling/commit/b1cf796730901222ad0882ff44efa0ef43a743ee))
+* Fixed docx import with headers that are also lists ([#842](https://github.com/DS4SD/docling/issues/842)) ([`2c037ae`](https://github.com/DS4SD/docling/commit/2c037ae62e123967eddf065ccb2abbaf78cdcab3))
+* Use new add_code in html backend and add more typing hints ([#850](https://github.com/DS4SD/docling/issues/850)) ([`2a1f8af`](https://github.com/DS4SD/docling/commit/2a1f8afe7e8d9d508aebcfd3998ee1625c938933))
+* **markdown:** Fix empty block handling ([#843](https://github.com/DS4SD/docling/issues/843)) ([`bccb022`](https://github.com/DS4SD/docling/commit/bccb022fc82d4d0ef2ed2d8bea5f5d8e6400c1d9))
+* Fix for the crash when encountering WMF images in pptx and docx ([#837](https://github.com/DS4SD/docling/issues/837)) ([`fea0a99`](https://github.com/DS4SD/docling/commit/fea0a99a95d97e72687f48f8174d31102655483e))
+
+### Documentation
+
+* Updated the readme with upcoming features ([#831](https://github.com/DS4SD/docling/issues/831)) ([`d7c0828`](https://github.com/DS4SD/docling/commit/d7c082894e3ef85881665d20167198adcbc1becd))
+* Add example for inspection of picture content ([#624](https://github.com/DS4SD/docling/issues/624)) ([`f9144f2`](https://github.com/DS4SD/docling/commit/f9144f2bb6b322244c9d37683dca1e537ec6d781))
+
## [v2.17.0](https://github.com/DS4SD/docling/releases/tag/v2.17.0) - 2025-01-28
### Feature
diff --git a/pyproject.toml b/pyproject.toml
index d12b70e2..4baf50a7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "docling"
-version = "2.17.0" # DO NOT EDIT, updated automatically
+version = "2.18.0" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Panos Vagenas ", "Peter Staar "]
license = "MIT"
From 6d3fea019635bd6ca94bd36c3928b28c245d638d Mon Sep 17 00:00:00 2001
From: Nikos Livathinos <100353117+nikos-livathinos@users.noreply.github.com>
Date: Tue, 4 Feb 2025 10:07:00 +0100
Subject: [PATCH 4/5] docs: Introduce example with custom models for RapidOCR
(#874)
* docs: Introduce example with custom models for RapidOCR
Signed-off-by: Nikos Livathinos
* chore: Exclude the example with custom RapidOCR models from the examples to run in github actions
Signed-off-by: Nikos Livathinos
---------
Signed-off-by: Nikos Livathinos
---
.github/workflows/checks.yml | 2 +-
docs/examples/rapidocr_with_custom_models.py | 58 ++++++++++++++++++++
mkdocs.yml | 1 +
3 files changed, 60 insertions(+), 1 deletion(-)
create mode 100644 docs/examples/rapidocr_with_custom_models.py
diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml
index e04e2803..19e8c1e1 100644
--- a/.github/workflows/checks.yml
+++ b/.github/workflows/checks.yml
@@ -28,7 +28,7 @@ jobs:
run: |
for file in docs/examples/*.py; do
# Skip batch_convert.py
- if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment).py ]]; then
+ if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models).py ]]; then
echo "Skipping $file"
continue
fi
diff --git a/docs/examples/rapidocr_with_custom_models.py b/docs/examples/rapidocr_with_custom_models.py
new file mode 100644
index 00000000..e6dd3963
--- /dev/null
+++ b/docs/examples/rapidocr_with_custom_models.py
@@ -0,0 +1,58 @@
+import os
+
+from huggingface_hub import snapshot_download
+
+from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions
+from docling.document_converter import (
+ ConversionResult,
+ DocumentConverter,
+ InputFormat,
+ PdfFormatOption,
+)
+
+
+def main():
+ # Source document to convert
+ source = "https://arxiv.org/pdf/2408.09869v4"
+
+ # Download RapidOCR models from HuggingFace
+ print("Downloading RapidOCR models")
+ download_path = snapshot_download(repo_id="SWHL/RapidOCR")
+
+ # Set up RapidOcrOptions for English detection
+ det_model_path = os.path.join(
+ download_path, "PP-OCRv4", "en_PP-OCRv3_det_infer.onnx"
+ )
+ rec_model_path = os.path.join(
+ download_path, "PP-OCRv4", "ch_PP-OCRv4_rec_server_infer.onnx"
+ )
+ cls_model_path = os.path.join(
+ download_path, "PP-OCRv3", "ch_ppocr_mobile_v2.0_cls_train.onnx"
+ )
+ ocr_options = RapidOcrOptions(
+ det_model_path=det_model_path,
+ rec_model_path=rec_model_path,
+ cls_model_path=cls_model_path,
+ )
+
+ pipeline_options = PdfPipelineOptions(
+ ocr_options=ocr_options,
+ )
+
+ # Convert the document
+ converter = DocumentConverter(
+ format_options={
+ InputFormat.PDF: PdfFormatOption(
+ pipeline_options=pipeline_options,
+ ),
+ },
+ )
+
+ conversion_result: ConversionResult = converter.convert(source=source)
+ doc = conversion_result.document
+ md = doc.export_to_markdown()
+ print(md)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/mkdocs.yml b/mkdocs.yml
index 0fcc2ca4..abb93a27 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -77,6 +77,7 @@ nav:
- "Multimodal export": examples/export_multimodal.py
- "Force full page OCR": examples/full_page_ocr.py
- "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py
+ - "RapidOCR with custom OCR models": examples/rapidocr_with_custom_models.py
- "Accelerator options": examples/run_with_accelerator.py
- "Simple translation": examples/translate.py
- examples/backend_xml_rag.ipynb
From 17448163e7dc64e6607b3be9bcb66f5ca71c011c Mon Sep 17 00:00:00 2001
From: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Date: Tue, 4 Feb 2025 11:35:34 +0100
Subject: [PATCH 5/5] chore: fix docs search (#880)
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
---
.github/workflows/docs.yml | 6 +-----
1 file changed, 1 insertion(+), 5 deletions(-)
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index 0fc3ac7a..dd976ea3 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -14,10 +14,6 @@ jobs:
- uses: ./.github/actions/setup-poetry
- name: Build docs
run: poetry run mkdocs build --verbose --clean
- - name: Make docs LLM ready
- if: inputs.deploy
- uses: demodrive-ai/llms-txt-action@ad720693843126e6a73910a667d0eba37c1dea4b
- name: Build and push docs
if: inputs.deploy
- run: poetry run mkdocs gh-deploy --force --dirty
-
+ run: poetry run mkdocs gh-deploy --force