From a40544a54661488378b2ecee1bb0354411efcf37 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Mon, 3 Feb 2025 14:10:12 +0100 Subject: [PATCH 1/5] chore: clean up top-level file (#872) Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- word_tables.html | 75 ------------------------------------------------ 1 file changed, 75 deletions(-) delete mode 100644 word_tables.html diff --git a/word_tables.html b/word_tables.html deleted file mode 100644 index 30f6e8d3..00000000 --- a/word_tables.html +++ /dev/null @@ -1,75 +0,0 @@ - - - - - - - Powered by Docling - - - -

Test with tables

-

A uniform table

-
Header 0.0Header 0.1Header 0.2
Cell 1.0Cell 1.1Cell 1.2
Cell 2.0Cell 2.1Cell 2.2
-

-

A non-uniform table with horizontal spans

-
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 1.2
Cell 2.0Merged Cell 2.1 2.2
-

-

A non-uniform table with horizontal spans in inner columns

-
Header 0.0Header 0.1Header 0.2Header 0.3
Cell 1.0Merged Cell 1.1 1.2Cell 1.3
Cell 2.0Merged Cell 2.1 2.2Cell 2.3
-

-

A non-uniform table with vertical spans

-
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 2.1Cell 1.2
Cell 2.0Cell 2.2
Cell 3.0Merged Cell 3.1 4.1Cell 3.2
Cell 4.0Cell 4.2
-

-

A non-uniform table with all kinds of spans and empty cells

-
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 2.1Cell 1.2
Cell 2.0Cell 2.2
Cell 3.0Merged Cell 3.1 4.1Cell 3.2
Cell 4.0Cell 4.2Merged Cell 4.4 5.4
Cell 8.4
-

-

- \ No newline at end of file From 5ac2887e4ad52ed6e7147e3af1e3ee5eb0006a70 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Mon, 3 Feb 2025 14:38:38 +0100 Subject: [PATCH 2/5] fix(markdown): fix parsing if doc ending with table (#873) Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docling/backend/md_backend.py | 1 + tests/data/groundtruth/docling_v2/ending_with_table.md.md | 6 ++++++ tests/data/md/ending_with_table.md | 6 ++++++ 3 files changed, 13 insertions(+) create mode 100644 tests/data/groundtruth/docling_v2/ending_with_table.md.md create mode 100644 tests/data/md/ending_with_table.md diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index 669096eb..eaf47537 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -368,6 +368,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): # Start iterating from the root of the AST self.iterate_elements(parsed_ast, 0, doc, None) self.process_inline_text(None, doc) # handle last hanging inline text + self.close_table(doc=doc) # handle any last hanging table # if HTML blocks were detected, export to HTML and delegate to HTML backend if self._html_blocks > 0: diff --git a/tests/data/groundtruth/docling_v2/ending_with_table.md.md b/tests/data/groundtruth/docling_v2/ending_with_table.md.md new file mode 100644 index 00000000..9c179fe0 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/ending_with_table.md.md @@ -0,0 +1,6 @@ +| Character | Name in German | Name in French | Name in Italian | +|----------------|------------------|------------------|-------------------| +| Scrooge McDuck | Dagobert Duck | Balthazar Picsou | Paperone | +| Huey | Tick | Riri | Qui | +| Dewey | Trick | Fifi | Quo | +| Louie | Track | Loulou | Qua | diff --git a/tests/data/md/ending_with_table.md b/tests/data/md/ending_with_table.md new file mode 100644 index 00000000..6c491d64 --- /dev/null +++ 
b/tests/data/md/ending_with_table.md @@ -0,0 +1,6 @@ +| Character | Name in German | Name in French | Name in Italian | +|---|---|---|---| +| Scrooge McDuck | Dagobert Duck | Balthazar Picsou | Paperone | +| Huey | Tick | Riri | Qui | +| Dewey | Trick | Fifi | Quo | +| Louie | Track | Loulou | Qua | From b5da4080c9e966f47a82c267410aa59968352bd9 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Mon, 3 Feb 2025 14:58:50 +0000 Subject: [PATCH 3/5] chore: bump version to 2.18.0 [skip ci] --- CHANGELOG.md | 26 ++++++++++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 597dde6e..4ad3b47d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,29 @@ +## [v2.18.0](https://github.com/DS4SD/docling/releases/tag/v2.18.0) - 2025-02-03 + +### Feature + +* Expose equation exports ([#869](https://github.com/DS4SD/docling/issues/869)) ([`6a76b49`](https://github.com/DS4SD/docling/commit/6a76b49a4756fd00503d0baec5db8d23be8207e8)) +* Add option to define page range ([#852](https://github.com/DS4SD/docling/issues/852)) ([`70d68b6`](https://github.com/DS4SD/docling/commit/70d68b6164c6c7029b39dd65c5a278278768c381)) +* **docx:** Support of SDTs in docx backend ([#853](https://github.com/DS4SD/docling/issues/853)) ([`d727b04`](https://github.com/DS4SD/docling/commit/d727b04ad080df0b3811902059e0fe0539f7037e)) +* Python 3.13 support ([#841](https://github.com/DS4SD/docling/issues/841)) ([`4df085a`](https://github.com/DS4SD/docling/commit/4df085aa6c6f5cc043f4f7a9f0c1b4af43f95e8f)) + +### Fix + +* **markdown:** Fix parsing if doc ending with table ([#873](https://github.com/DS4SD/docling/issues/873)) ([`5ac2887`](https://github.com/DS4SD/docling/commit/5ac2887e4ad52ed6e7147e3af1e3ee5eb0006a70)) +* **markdown:** Add support for HTML content ([#855](https://github.com/DS4SD/docling/issues/855)) ([`94751a7`](https://github.com/DS4SD/docling/commit/94751a78f4f61b78f64952190717440ec6d84c62)) +* 
**docx:** Merged table cells not properly converted ([#857](https://github.com/DS4SD/docling/issues/857)) ([`0cd81a8`](https://github.com/DS4SD/docling/commit/0cd81a81226c0d4aa4f20e4e58c3b33e4fe50ce0)) +* Processing of placeholder shapes in pptx that have text but no bbox ([#868](https://github.com/DS4SD/docling/issues/868)) ([`eff16b6`](https://github.com/DS4SD/docling/commit/eff16b62ccdb0eb764eeacee550563898784dd6a)) +* KeyError in tableformer prediction ([#854](https://github.com/DS4SD/docling/issues/854)) ([`b1cf796`](https://github.com/DS4SD/docling/commit/b1cf796730901222ad0882ff44efa0ef43a743ee)) +* Fixed docx import with headers that are also lists ([#842](https://github.com/DS4SD/docling/issues/842)) ([`2c037ae`](https://github.com/DS4SD/docling/commit/2c037ae62e123967eddf065ccb2abbaf78cdcab3)) +* Use new add_code in html backend and add more typing hints ([#850](https://github.com/DS4SD/docling/issues/850)) ([`2a1f8af`](https://github.com/DS4SD/docling/commit/2a1f8afe7e8d9d508aebcfd3998ee1625c938933)) +* **markdown:** Fix empty block handling ([#843](https://github.com/DS4SD/docling/issues/843)) ([`bccb022`](https://github.com/DS4SD/docling/commit/bccb022fc82d4d0ef2ed2d8bea5f5d8e6400c1d9)) +* Fix for the crash when encountering WMF images in pptx and docx ([#837](https://github.com/DS4SD/docling/issues/837)) ([`fea0a99`](https://github.com/DS4SD/docling/commit/fea0a99a95d97e72687f48f8174d31102655483e)) + +### Documentation + +* Updated the readme with upcoming features ([#831](https://github.com/DS4SD/docling/issues/831)) ([`d7c0828`](https://github.com/DS4SD/docling/commit/d7c082894e3ef85881665d20167198adcbc1becd)) +* Add example for inspection of picture content ([#624](https://github.com/DS4SD/docling/issues/624)) ([`f9144f2`](https://github.com/DS4SD/docling/commit/f9144f2bb6b322244c9d37683dca1e537ec6d781)) + ## [v2.17.0](https://github.com/DS4SD/docling/releases/tag/v2.17.0) - 2025-01-28 ### Feature diff --git a/pyproject.toml b/pyproject.toml index 
d12b70e2..4baf50a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "docling" -version = "2.17.0" # DO NOT EDIT, updated automatically +version = "2.18.0" # DO NOT EDIT, updated automatically description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications." authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Panos Vagenas ", "Peter Staar "] license = "MIT" From 6d3fea019635bd6ca94bd36c3928b28c245d638d Mon Sep 17 00:00:00 2001 From: Nikos Livathinos <100353117+nikos-livathinos@users.noreply.github.com> Date: Tue, 4 Feb 2025 10:07:00 +0100 Subject: [PATCH 4/5] docs: Introduce example with custom models for RapidOCR (#874) * docs: Introduce example with custom models for RapidOCR Signed-off-by: Nikos Livathinos * chore: Exclude the example with custom RapidOCR models from the examples to run in github actions Signed-off-by: Nikos Livathinos --------- Signed-off-by: Nikos Livathinos --- .github/workflows/checks.yml | 2 +- docs/examples/rapidocr_with_custom_models.py | 58 ++++++++++++++++++++ mkdocs.yml | 1 + 3 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 docs/examples/rapidocr_with_custom_models.py diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index e04e2803..19e8c1e1 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -28,7 +28,7 @@ jobs: run: | for file in docs/examples/*.py; do # Skip batch_convert.py - if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment).py ]]; then + if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models).py ]]; then echo "Skipping $file" continue fi diff --git a/docs/examples/rapidocr_with_custom_models.py 
b/docs/examples/rapidocr_with_custom_models.py new file mode 100644 index 00000000..e6dd3963 --- /dev/null +++ b/docs/examples/rapidocr_with_custom_models.py @@ -0,0 +1,58 @@ +import os + +from huggingface_hub import snapshot_download + +from docling.datamodel.pipeline_options import PdfPipelineOptions, RapidOcrOptions +from docling.document_converter import ( + ConversionResult, + DocumentConverter, + InputFormat, + PdfFormatOption, +) + + +def main(): + # Source document to convert + source = "https://arxiv.org/pdf/2408.09869v4" + + # Download RapidOCR models from HuggingFace + print("Downloading RapidOCR models") + download_path = snapshot_download(repo_id="SWHL/RapidOCR") + + # Set up RapidOcrOptions for English detection + det_model_path = os.path.join( + download_path, "PP-OCRv4", "en_PP-OCRv3_det_infer.onnx" + ) + rec_model_path = os.path.join( + download_path, "PP-OCRv4", "ch_PP-OCRv4_rec_server_infer.onnx" + ) + cls_model_path = os.path.join( + download_path, "PP-OCRv3", "ch_ppocr_mobile_v2.0_cls_train.onnx" + ) + ocr_options = RapidOcrOptions( + det_model_path=det_model_path, + rec_model_path=rec_model_path, + cls_model_path=cls_model_path, + ) + + pipeline_options = PdfPipelineOptions( + ocr_options=ocr_options, + ) + + # Convert the document + converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=pipeline_options, + ), + }, + ) + + conversion_result: ConversionResult = converter.convert(source=source) + doc = conversion_result.document + md = doc.export_to_markdown() + print(md) + + +if __name__ == "__main__": + main() diff --git a/mkdocs.yml b/mkdocs.yml index 0fcc2ca4..abb93a27 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -77,6 +77,7 @@ nav: - "Multimodal export": examples/export_multimodal.py - "Force full page OCR": examples/full_page_ocr.py - "Automatic OCR language detection with tesseract": examples/tesseract_lang_detection.py + - "RapidOCR with custom OCR models": 
examples/rapidocr_with_custom_models.py - "Accelerator options": examples/run_with_accelerator.py - "Simple translation": examples/translate.py - examples/backend_xml_rag.ipynb From 17448163e7dc64e6607b3be9bcb66f5ca71c011c Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Tue, 4 Feb 2025 11:35:34 +0100 Subject: [PATCH 5/5] chore: fix docs search (#880) Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- .github/workflows/docs.yml | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 0fc3ac7a..dd976ea3 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -14,10 +14,6 @@ jobs: - uses: ./.github/actions/setup-poetry - name: Build docs run: poetry run mkdocs build --verbose --clean - - name: Make docs LLM ready - if: inputs.deploy - uses: demodrive-ai/llms-txt-action@ad720693843126e6a73910a667d0eba37c1dea4b - name: Build and push docs if: inputs.deploy - run: poetry run mkdocs gh-deploy --force --dirty - + run: poetry run mkdocs gh-deploy --force