diff --git a/.github/SECURITY.md b/.github/SECURITY.md index 419e13db..affc77bf 100644 --- a/.github/SECURITY.md +++ b/.github/SECURITY.md @@ -20,4 +20,4 @@ After the initial reply to your report, the security team will keep you informed ## Security Alerts -We will send announcements of security vulnerabilities and steps to remediate on the [Docling announcements](https://github.com/DS4SD/docling/discussions/categories/announcements). +We will send announcements of security vulnerabilities and steps to remediate on the [Docling announcements](https://github.com/docling-project/docling/discussions/categories/announcements). diff --git a/.github/workflows/ci-docs.yml b/.github/workflows/ci-docs.yml index 6e9134d5..003177ae 100644 --- a/.github/workflows/ci-docs.yml +++ b/.github/workflows/ci-docs.yml @@ -10,7 +10,7 @@ on: jobs: build-docs: - if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'DS4SD/docling' && github.event.pull_request.head.repo.full_name != 'ds4sd/docling') }} + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name != 'docling-project/docling' }} uses: ./.github/workflows/docs.yml with: deploy: false diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index e2b21ed2..0bf45ce3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,5 +15,5 @@ env: jobs: code-checks: - if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'DS4SD/docling' && github.event.pull_request.head.repo.full_name != 'ds4sd/docling') }} + if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name != 'docling-project/docling' }} uses: ./.github/workflows/checks.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index b0e986e8..c4ae5e8b 100644 --- a/CHANGELOG.md +++ 
b/CHANGELOG.md @@ -1,867 +1,867 @@ -## [v2.26.0](https://github.com/DS4SD/docling/releases/tag/v2.26.0) - 2025-03-11 +## [v2.26.0](https://github.com/docling-project/docling/releases/tag/v2.26.0) - 2025-03-11 ### Feature -* Use new TableFormer model weights and default to accurate model version ([#1100](https://github.com/DS4SD/docling/issues/1100)) ([`eb97357`](https://github.com/DS4SD/docling/commit/eb97357b0560b59c14a8be3fb52d6a1362ad0a1d)) +* Use new TableFormer model weights and default to accurate model version ([#1100](https://github.com/docling-project/docling/issues/1100)) ([`eb97357`](https://github.com/docling-project/docling/commit/eb97357b0560b59c14a8be3fb52d6a1362ad0a1d)) ### Fix -* **CLI:** Fix help message for abort options ([#1130](https://github.com/DS4SD/docling/issues/1130)) ([`4d64c4c`](https://github.com/DS4SD/docling/commit/4d64c4c0b67b23f53d4ea21fb754455840fe4556)) +* **CLI:** Fix help message for abort options ([#1130](https://github.com/docling-project/docling/issues/1130)) ([`4d64c4c`](https://github.com/docling-project/docling/commit/4d64c4c0b67b23f53d4ea21fb754455840fe4556)) ### Documentation -* Add description of DOCLING_ARTIFACTS_PATH env var ([#1124](https://github.com/DS4SD/docling/issues/1124)) ([`e1c49ad`](https://github.com/DS4SD/docling/commit/e1c49ad72710ff76c1b0574bb4d2bdab93077902)) +* Add description of DOCLING_ARTIFACTS_PATH env var ([#1124](https://github.com/docling-project/docling/issues/1124)) ([`e1c49ad`](https://github.com/docling-project/docling/commit/e1c49ad72710ff76c1b0574bb4d2bdab93077902)) ### Performance -* New revision code formula model and document picture classifier ([#1140](https://github.com/DS4SD/docling/issues/1140)) ([`5e30381`](https://github.com/DS4SD/docling/commit/5e30381c0dd3b4f9e3b2d8af3863ed51fa51194a)) +* New revision code formula model and document picture classifier ([#1140](https://github.com/docling-project/docling/issues/1140)) 
([`5e30381`](https://github.com/docling-project/docling/commit/5e30381c0dd3b4f9e3b2d8af3863ed51fa51194a)) -## [v2.25.2](https://github.com/DS4SD/docling/releases/tag/v2.25.2) - 2025-03-05 +## [v2.25.2](https://github.com/docling-project/docling/releases/tag/v2.25.2) - 2025-03-05 ### Fix -* Proper handling of orphan IDs in layout postprocessing ([#1118](https://github.com/DS4SD/docling/issues/1118)) ([`c56ab3a`](https://github.com/DS4SD/docling/commit/c56ab3a66b79e0d1b6f4b22880aabb7ee909d9d7)) +* Proper handling of orphan IDs in layout postprocessing ([#1118](https://github.com/docling-project/docling/issues/1118)) ([`c56ab3a`](https://github.com/docling-project/docling/commit/c56ab3a66b79e0d1b6f4b22880aabb7ee909d9d7)) ### Documentation -* Enrichment models ([#1097](https://github.com/DS4SD/docling/issues/1097)) ([`357d41c`](https://github.com/DS4SD/docling/commit/357d41cc47bcf69684643d193ed38f9baaf0d2f1)) +* Enrichment models ([#1097](https://github.com/docling-project/docling/issues/1097)) ([`357d41c`](https://github.com/docling-project/docling/commit/357d41cc47bcf69684643d193ed38f9baaf0d2f1)) -## [v2.25.1](https://github.com/DS4SD/docling/releases/tag/v2.25.1) - 2025-03-03 +## [v2.25.1](https://github.com/docling-project/docling/releases/tag/v2.25.1) - 2025-03-03 ### Fix -* Enable locks for threadsafe pdfium ([#1052](https://github.com/DS4SD/docling/issues/1052)) ([`8dc0562`](https://github.com/DS4SD/docling/commit/8dc0562542299cf972d14eeeb4393e50b589c8ad)) -* **html:** Use 'start' attribute when parsing ordered lists from HTML docs ([#1062](https://github.com/DS4SD/docling/issues/1062)) ([`de7b963`](https://github.com/DS4SD/docling/commit/de7b963b09a34916f0a8d99649269aeb37db1408)) +* Enable locks for threadsafe pdfium ([#1052](https://github.com/docling-project/docling/issues/1052)) ([`8dc0562`](https://github.com/docling-project/docling/commit/8dc0562542299cf972d14eeeb4393e50b589c8ad)) +* **html:** Use 'start' attribute when parsing ordered lists from HTML docs 
([#1062](https://github.com/docling-project/docling/issues/1062)) ([`de7b963`](https://github.com/docling-project/docling/commit/de7b963b09a34916f0a8d99649269aeb37db1408)) ### Documentation -* Improve docs on token limit warning triggered by HybridChunker ([#1077](https://github.com/DS4SD/docling/issues/1077)) ([`db3ceef`](https://github.com/DS4SD/docling/commit/db3ceefd4ae6251a97e333bcb03051698b3fa71a)) +* Improve docs on token limit warning triggered by HybridChunker ([#1077](https://github.com/docling-project/docling/issues/1077)) ([`db3ceef`](https://github.com/docling-project/docling/commit/db3ceefd4ae6251a97e333bcb03051698b3fa71a)) -## [v2.25.0](https://github.com/DS4SD/docling/releases/tag/v2.25.0) - 2025-02-26 +## [v2.25.0](https://github.com/docling-project/docling/releases/tag/v2.25.0) - 2025-02-26 ### Feature -* [Experimental] Introduce VLM pipeline using HF AutoModelForVision2Seq, featuring SmolDocling model ([#1054](https://github.com/DS4SD/docling/issues/1054)) ([`3c9fe76`](https://github.com/DS4SD/docling/commit/3c9fe76b706b7714b25d49cb09050c42e3b8c849)) -* **cli:** Add option for downloading all models, refine help messages ([#1061](https://github.com/DS4SD/docling/issues/1061)) ([`ab683e4`](https://github.com/DS4SD/docling/commit/ab683e4fb6df4973d2efda04f00c269a2dc95f5b)) +* [Experimental] Introduce VLM pipeline using HF AutoModelForVision2Seq, featuring SmolDocling model ([#1054](https://github.com/docling-project/docling/issues/1054)) ([`3c9fe76`](https://github.com/docling-project/docling/commit/3c9fe76b706b7714b25d49cb09050c42e3b8c849)) +* **cli:** Add option for downloading all models, refine help messages ([#1061](https://github.com/docling-project/docling/issues/1061)) ([`ab683e4`](https://github.com/docling-project/docling/commit/ab683e4fb6df4973d2efda04f00c269a2dc95f5b)) ### Fix -* Vlm using artifacts path ([#1057](https://github.com/DS4SD/docling/issues/1057)) 
([`e197225`](https://github.com/DS4SD/docling/commit/e1972257399151503d60b4806976c8b9b6911aa8)) -* **html:** Parse text in div elements as TextItem ([#1041](https://github.com/DS4SD/docling/issues/1041)) ([`1b0ead6`](https://github.com/DS4SD/docling/commit/1b0ead69078030a0e4d25b51450ef2aa4a2e79fc)) +* Vlm using artifacts path ([#1057](https://github.com/docling-project/docling/issues/1057)) ([`e197225`](https://github.com/docling-project/docling/commit/e1972257399151503d60b4806976c8b9b6911aa8)) +* **html:** Parse text in div elements as TextItem ([#1041](https://github.com/docling-project/docling/issues/1041)) ([`1b0ead6`](https://github.com/docling-project/docling/commit/1b0ead69078030a0e4d25b51450ef2aa4a2e79fc)) ### Documentation -* Extend chunking docs, add FAQ on token limit ([#1053](https://github.com/DS4SD/docling/issues/1053)) ([`c84b973`](https://github.com/DS4SD/docling/commit/c84b973959a254db22ac9a7dc8810628e4808a2d)) +* Extend chunking docs, add FAQ on token limit ([#1053](https://github.com/docling-project/docling/issues/1053)) ([`c84b973`](https://github.com/docling-project/docling/commit/c84b973959a254db22ac9a7dc8810628e4808a2d)) -## [v2.24.0](https://github.com/DS4SD/docling/releases/tag/v2.24.0) - 2025-02-20 +## [v2.24.0](https://github.com/docling-project/docling/releases/tag/v2.24.0) - 2025-02-20 ### Feature -* Implement new reading-order model ([#916](https://github.com/DS4SD/docling/issues/916)) ([`c93e369`](https://github.com/DS4SD/docling/commit/c93e36988f1e1e461477223143c2c1fb2162d11f)) +* Implement new reading-order model ([#916](https://github.com/docling-project/docling/issues/916)) ([`c93e369`](https://github.com/docling-project/docling/commit/c93e36988f1e1e461477223143c2c1fb2162d11f)) -## [v2.23.1](https://github.com/DS4SD/docling/releases/tag/v2.23.1) - 2025-02-20 +## [v2.23.1](https://github.com/docling-project/docling/releases/tag/v2.23.1) - 2025-02-20 ### Fix -* Runtime error when Pandas Series is not always of string type 
([#1024](https://github.com/DS4SD/docling/issues/1024)) ([`6796f0a`](https://github.com/DS4SD/docling/commit/6796f0a13263281cd48712b3c71579bfd81bb0d1)) +* Runtime error when Pandas Series is not always of string type ([#1024](https://github.com/docling-project/docling/issues/1024)) ([`6796f0a`](https://github.com/docling-project/docling/commit/6796f0a13263281cd48712b3c71579bfd81bb0d1)) ### Documentation -* Revamp picture description example ([#1015](https://github.com/DS4SD/docling/issues/1015)) ([`27c0400`](https://github.com/DS4SD/docling/commit/27c04007bc1be7a6f6c90aaf04ea9f4ff8eb1f3d)) +* Revamp picture description example ([#1015](https://github.com/docling-project/docling/issues/1015)) ([`27c0400`](https://github.com/docling-project/docling/commit/27c04007bc1be7a6f6c90aaf04ea9f4ff8eb1f3d)) -## [v2.23.0](https://github.com/DS4SD/docling/releases/tag/v2.23.0) - 2025-02-17 +## [v2.23.0](https://github.com/docling-project/docling/releases/tag/v2.23.0) - 2025-02-17 ### Feature -* Support cuda:n GPU device allocation ([#694](https://github.com/DS4SD/docling/issues/694)) ([`77eb77b`](https://github.com/DS4SD/docling/commit/77eb77bdc2c07b632a1d171826d1855a5218399e)) -* **xml-jats:** Parse XML JATS documents ([#967](https://github.com/DS4SD/docling/issues/967)) ([`428b656`](https://github.com/DS4SD/docling/commit/428b656793cb75d108c69f20c254be7c198cee5c)) +* Support cuda:n GPU device allocation ([#694](https://github.com/docling-project/docling/issues/694)) ([`77eb77b`](https://github.com/docling-project/docling/commit/77eb77bdc2c07b632a1d171826d1855a5218399e)) +* **xml-jats:** Parse XML JATS documents ([#967](https://github.com/docling-project/docling/issues/967)) ([`428b656`](https://github.com/docling-project/docling/commit/428b656793cb75d108c69f20c254be7c198cee5c)) ### Fix -* Revise DocTags, fix iterate_items to output content_layer in items ([#965](https://github.com/DS4SD/docling/issues/965)) 
([`6e75f0b`](https://github.com/DS4SD/docling/commit/6e75f0b5d3ee42738a80049d4cf2fa6d34e8ab97)) +* Revise DocTags, fix iterate_items to output content_layer in items ([#965](https://github.com/docling-project/docling/issues/965)) ([`6e75f0b`](https://github.com/docling-project/docling/commit/6e75f0b5d3ee42738a80049d4cf2fa6d34e8ab97)) -## [v2.22.0](https://github.com/DS4SD/docling/releases/tag/v2.22.0) - 2025-02-14 +## [v2.22.0](https://github.com/docling-project/docling/releases/tag/v2.22.0) - 2025-02-14 ### Feature -* Add support for CSV input with new backend to transform CSV files to DoclingDocument ([#945](https://github.com/DS4SD/docling/issues/945)) ([`00d9405`](https://github.com/DS4SD/docling/commit/00d9405b0ac519d321ae54e8150f5facbaabbe14)) -* Introduce the enable_remote_services option to allow remote connections while processing ([#941](https://github.com/DS4SD/docling/issues/941)) ([`2716c7d`](https://github.com/DS4SD/docling/commit/2716c7d4ffb836664178178d3f8d01b7f9112595)) -* Allow artifacts_path to be defined as ENV ([#940](https://github.com/DS4SD/docling/issues/940)) ([`5101e25`](https://github.com/DS4SD/docling/commit/5101e2519e7a5bb727531b1412b1131a7cfbda52)) +* Add support for CSV input with new backend to transform CSV files to DoclingDocument ([#945](https://github.com/docling-project/docling/issues/945)) ([`00d9405`](https://github.com/docling-project/docling/commit/00d9405b0ac519d321ae54e8150f5facbaabbe14)) +* Introduce the enable_remote_services option to allow remote connections while processing ([#941](https://github.com/docling-project/docling/issues/941)) ([`2716c7d`](https://github.com/docling-project/docling/commit/2716c7d4ffb836664178178d3f8d01b7f9112595)) +* Allow artifacts_path to be defined as ENV ([#940](https://github.com/docling-project/docling/issues/940)) ([`5101e25`](https://github.com/docling-project/docling/commit/5101e2519e7a5bb727531b1412b1131a7cfbda52)) ### Fix -* Update Pillow constraints 
([#958](https://github.com/DS4SD/docling/issues/958)) ([`af19c03`](https://github.com/DS4SD/docling/commit/af19c03f6e5e0b24e12d6a3baac6c46a4c8b10d1)) -* Fix the initialization of the TesseractOcrModel ([#935](https://github.com/DS4SD/docling/issues/935)) ([`c47ae70`](https://github.com/DS4SD/docling/commit/c47ae700ece2ea4efee17f82e4667c1ce9a0ed2a)) +* Update Pillow constraints ([#958](https://github.com/docling-project/docling/issues/958)) ([`af19c03`](https://github.com/docling-project/docling/commit/af19c03f6e5e0b24e12d6a3baac6c46a4c8b10d1)) +* Fix the initialization of the TesseractOcrModel ([#935](https://github.com/docling-project/docling/issues/935)) ([`c47ae70`](https://github.com/docling-project/docling/commit/c47ae700ece2ea4efee17f82e4667c1ce9a0ed2a)) ### Documentation -* Update example Dockerfile with download CLI ([#929](https://github.com/DS4SD/docling/issues/929)) ([`7493d5b`](https://github.com/DS4SD/docling/commit/7493d5b01f8be60294afeffdfb54a62bb74bcc92)) -* Examples for picture descriptions ([#951](https://github.com/DS4SD/docling/issues/951)) ([`2d66e99`](https://github.com/DS4SD/docling/commit/2d66e99b69f39a282109c366fae3679f41c6e081)) +* Update example Dockerfile with download CLI ([#929](https://github.com/docling-project/docling/issues/929)) ([`7493d5b`](https://github.com/docling-project/docling/commit/7493d5b01f8be60294afeffdfb54a62bb74bcc92)) +* Examples for picture descriptions ([#951](https://github.com/docling-project/docling/issues/951)) ([`2d66e99`](https://github.com/docling-project/docling/commit/2d66e99b69f39a282109c366fae3679f41c6e081)) -## [v2.21.0](https://github.com/DS4SD/docling/releases/tag/v2.21.0) - 2025-02-10 +## [v2.21.0](https://github.com/docling-project/docling/releases/tag/v2.21.0) - 2025-02-10 ### Feature -* Add content_layer property to items to address body, furniture and other roles ([#735](https://github.com/DS4SD/docling/issues/735)) 
([`cf78d5b`](https://github.com/DS4SD/docling/commit/cf78d5b7b9f12728270e673857fd299efc01a7db)) +* Add content_layer property to items to address body, furniture and other roles ([#735](https://github.com/docling-project/docling/issues/735)) ([`cf78d5b`](https://github.com/docling-project/docling/commit/cf78d5b7b9f12728270e673857fd299efc01a7db)) -## [v2.20.0](https://github.com/DS4SD/docling/releases/tag/v2.20.0) - 2025-02-07 +## [v2.20.0](https://github.com/docling-project/docling/releases/tag/v2.20.0) - 2025-02-07 ### Feature -* Describe pictures using vision models ([#259](https://github.com/DS4SD/docling/issues/259)) ([`4cc6e3e`](https://github.com/DS4SD/docling/commit/4cc6e3ea5e858b367136acc729b723ea0552d22a)) +* Describe pictures using vision models ([#259](https://github.com/docling-project/docling/issues/259)) ([`4cc6e3e`](https://github.com/docling-project/docling/commit/4cc6e3ea5e858b367136acc729b723ea0552d22a)) ### Fix -* Remove unused httpx ([#919](https://github.com/DS4SD/docling/issues/919)) ([`c18f47c`](https://github.com/DS4SD/docling/commit/c18f47c5c032c49bf3175aecd2236df37c0e9ae1)) +* Remove unused httpx ([#919](https://github.com/docling-project/docling/issues/919)) ([`c18f47c`](https://github.com/docling-project/docling/commit/c18f47c5c032c49bf3175aecd2236df37c0e9ae1)) -## [v2.19.0](https://github.com/DS4SD/docling/releases/tag/v2.19.0) - 2025-02-07 +## [v2.19.0](https://github.com/docling-project/docling/releases/tag/v2.19.0) - 2025-02-07 ### Feature -* New artifacts path and CLI utility ([#876](https://github.com/DS4SD/docling/issues/876)) ([`ed74fe2`](https://github.com/DS4SD/docling/commit/ed74fe2ec0a702834f0deacfdb5717c8c587dab1)) +* New artifacts path and CLI utility ([#876](https://github.com/docling-project/docling/issues/876)) ([`ed74fe2`](https://github.com/docling-project/docling/commit/ed74fe2ec0a702834f0deacfdb5717c8c587dab1)) ### Fix -* **markdown:** Handle nested lists ([#910](https://github.com/DS4SD/docling/issues/910)) 
([`90b766e`](https://github.com/DS4SD/docling/commit/90b766e2ae1695a759191df37c272efc09be5ee3)) -* Test cases for RTL programmatic PDFs and fixes for the formula model ([#903](https://github.com/DS4SD/docling/issues/903)) ([`9114ada`](https://github.com/DS4SD/docling/commit/9114ada7bc4dd45ce0046de2f9d00a80ccb25c79)) -* **msword_backend:** Handle conversion error in label parsing ([#896](https://github.com/DS4SD/docling/issues/896)) ([`722a6eb`](https://github.com/DS4SD/docling/commit/722a6eb7b994a0261312a356df80b2fced121812)) -* Enrichment models batch size and expose picture classifier ([#878](https://github.com/DS4SD/docling/issues/878)) ([`5ad6de0`](https://github.com/DS4SD/docling/commit/5ad6de05600315617b574bd12af553e00b4d316e)) +* **markdown:** Handle nested lists ([#910](https://github.com/docling-project/docling/issues/910)) ([`90b766e`](https://github.com/docling-project/docling/commit/90b766e2ae1695a759191df37c272efc09be5ee3)) +* Test cases for RTL programmatic PDFs and fixes for the formula model ([#903](https://github.com/docling-project/docling/issues/903)) ([`9114ada`](https://github.com/docling-project/docling/commit/9114ada7bc4dd45ce0046de2f9d00a80ccb25c79)) +* **msword_backend:** Handle conversion error in label parsing ([#896](https://github.com/docling-project/docling/issues/896)) ([`722a6eb`](https://github.com/docling-project/docling/commit/722a6eb7b994a0261312a356df80b2fced121812)) +* Enrichment models batch size and expose picture classifier ([#878](https://github.com/docling-project/docling/issues/878)) ([`5ad6de0`](https://github.com/docling-project/docling/commit/5ad6de05600315617b574bd12af553e00b4d316e)) ### Documentation -* Introduce example with custom models for RapidOCR ([#874](https://github.com/DS4SD/docling/issues/874)) ([`6d3fea0`](https://github.com/DS4SD/docling/commit/6d3fea019635bd6ca94bd36c3928b28c245d638d)) +* Introduce example with custom models for RapidOCR ([#874](https://github.com/docling-project/docling/issues/874)) 
([`6d3fea0`](https://github.com/docling-project/docling/commit/6d3fea019635bd6ca94bd36c3928b28c245d638d)) -## [v2.18.0](https://github.com/DS4SD/docling/releases/tag/v2.18.0) - 2025-02-03 +## [v2.18.0](https://github.com/docling-project/docling/releases/tag/v2.18.0) - 2025-02-03 ### Feature -* Expose equation exports ([#869](https://github.com/DS4SD/docling/issues/869)) ([`6a76b49`](https://github.com/DS4SD/docling/commit/6a76b49a4756fd00503d0baec5db8d23be8207e8)) -* Add option to define page range ([#852](https://github.com/DS4SD/docling/issues/852)) ([`70d68b6`](https://github.com/DS4SD/docling/commit/70d68b6164c6c7029b39dd65c5a278278768c381)) -* **docx:** Support of SDTs in docx backend ([#853](https://github.com/DS4SD/docling/issues/853)) ([`d727b04`](https://github.com/DS4SD/docling/commit/d727b04ad080df0b3811902059e0fe0539f7037e)) -* Python 3.13 support ([#841](https://github.com/DS4SD/docling/issues/841)) ([`4df085a`](https://github.com/DS4SD/docling/commit/4df085aa6c6f5cc043f4f7a9f0c1b4af43f95e8f)) +* Expose equation exports ([#869](https://github.com/docling-project/docling/issues/869)) ([`6a76b49`](https://github.com/docling-project/docling/commit/6a76b49a4756fd00503d0baec5db8d23be8207e8)) +* Add option to define page range ([#852](https://github.com/docling-project/docling/issues/852)) ([`70d68b6`](https://github.com/docling-project/docling/commit/70d68b6164c6c7029b39dd65c5a278278768c381)) +* **docx:** Support of SDTs in docx backend ([#853](https://github.com/docling-project/docling/issues/853)) ([`d727b04`](https://github.com/docling-project/docling/commit/d727b04ad080df0b3811902059e0fe0539f7037e)) +* Python 3.13 support ([#841](https://github.com/docling-project/docling/issues/841)) ([`4df085a`](https://github.com/docling-project/docling/commit/4df085aa6c6f5cc043f4f7a9f0c1b4af43f95e8f)) ### Fix -* **markdown:** Fix parsing if doc ending with table ([#873](https://github.com/DS4SD/docling/issues/873)) 
([`5ac2887`](https://github.com/DS4SD/docling/commit/5ac2887e4ad52ed6e7147e3af1e3ee5eb0006a70)) -* **markdown:** Add support for HTML content ([#855](https://github.com/DS4SD/docling/issues/855)) ([`94751a7`](https://github.com/DS4SD/docling/commit/94751a78f4f61b78f64952190717440ec6d84c62)) -* **docx:** Merged table cells not properly converted ([#857](https://github.com/DS4SD/docling/issues/857)) ([`0cd81a8`](https://github.com/DS4SD/docling/commit/0cd81a81226c0d4aa4f20e4e58c3b33e4fe50ce0)) -* Processing of placeholder shapes in pptx that have text but no bbox ([#868](https://github.com/DS4SD/docling/issues/868)) ([`eff16b6`](https://github.com/DS4SD/docling/commit/eff16b62ccdb0eb764eeacee550563898784dd6a)) -* KeyError in tableformer prediction ([#854](https://github.com/DS4SD/docling/issues/854)) ([`b1cf796`](https://github.com/DS4SD/docling/commit/b1cf796730901222ad0882ff44efa0ef43a743ee)) -* Fixed docx import with headers that are also lists ([#842](https://github.com/DS4SD/docling/issues/842)) ([`2c037ae`](https://github.com/DS4SD/docling/commit/2c037ae62e123967eddf065ccb2abbaf78cdcab3)) -* Use new add_code in html backend and add more typing hints ([#850](https://github.com/DS4SD/docling/issues/850)) ([`2a1f8af`](https://github.com/DS4SD/docling/commit/2a1f8afe7e8d9d508aebcfd3998ee1625c938933)) -* **markdown:** Fix empty block handling ([#843](https://github.com/DS4SD/docling/issues/843)) ([`bccb022`](https://github.com/DS4SD/docling/commit/bccb022fc82d4d0ef2ed2d8bea5f5d8e6400c1d9)) -* Fix for the crash when encountering WMF images in pptx and docx ([#837](https://github.com/DS4SD/docling/issues/837)) ([`fea0a99`](https://github.com/DS4SD/docling/commit/fea0a99a95d97e72687f48f8174d31102655483e)) +* **markdown:** Fix parsing if doc ending with table ([#873](https://github.com/docling-project/docling/issues/873)) ([`5ac2887`](https://github.com/docling-project/docling/commit/5ac2887e4ad52ed6e7147e3af1e3ee5eb0006a70)) +* **markdown:** Add support for HTML 
content ([#855](https://github.com/docling-project/docling/issues/855)) ([`94751a7`](https://github.com/docling-project/docling/commit/94751a78f4f61b78f64952190717440ec6d84c62)) +* **docx:** Merged table cells not properly converted ([#857](https://github.com/docling-project/docling/issues/857)) ([`0cd81a8`](https://github.com/docling-project/docling/commit/0cd81a81226c0d4aa4f20e4e58c3b33e4fe50ce0)) +* Processing of placeholder shapes in pptx that have text but no bbox ([#868](https://github.com/docling-project/docling/issues/868)) ([`eff16b6`](https://github.com/docling-project/docling/commit/eff16b62ccdb0eb764eeacee550563898784dd6a)) +* KeyError in tableformer prediction ([#854](https://github.com/docling-project/docling/issues/854)) ([`b1cf796`](https://github.com/docling-project/docling/commit/b1cf796730901222ad0882ff44efa0ef43a743ee)) +* Fixed docx import with headers that are also lists ([#842](https://github.com/docling-project/docling/issues/842)) ([`2c037ae`](https://github.com/docling-project/docling/commit/2c037ae62e123967eddf065ccb2abbaf78cdcab3)) +* Use new add_code in html backend and add more typing hints ([#850](https://github.com/docling-project/docling/issues/850)) ([`2a1f8af`](https://github.com/docling-project/docling/commit/2a1f8afe7e8d9d508aebcfd3998ee1625c938933)) +* **markdown:** Fix empty block handling ([#843](https://github.com/docling-project/docling/issues/843)) ([`bccb022`](https://github.com/docling-project/docling/commit/bccb022fc82d4d0ef2ed2d8bea5f5d8e6400c1d9)) +* Fix for the crash when encountering WMF images in pptx and docx ([#837](https://github.com/docling-project/docling/issues/837)) ([`fea0a99`](https://github.com/docling-project/docling/commit/fea0a99a95d97e72687f48f8174d31102655483e)) ### Documentation -* Updated the readme with upcoming features ([#831](https://github.com/DS4SD/docling/issues/831)) ([`d7c0828`](https://github.com/DS4SD/docling/commit/d7c082894e3ef85881665d20167198adcbc1becd)) -* Add example for inspection 
of picture content ([#624](https://github.com/DS4SD/docling/issues/624)) ([`f9144f2`](https://github.com/DS4SD/docling/commit/f9144f2bb6b322244c9d37683dca1e537ec6d781)) +* Updated the readme with upcoming features ([#831](https://github.com/docling-project/docling/issues/831)) ([`d7c0828`](https://github.com/docling-project/docling/commit/d7c082894e3ef85881665d20167198adcbc1becd)) +* Add example for inspection of picture content ([#624](https://github.com/docling-project/docling/issues/624)) ([`f9144f2`](https://github.com/docling-project/docling/commit/f9144f2bb6b322244c9d37683dca1e537ec6d781)) -## [v2.17.0](https://github.com/DS4SD/docling/releases/tag/v2.17.0) - 2025-01-28 +## [v2.17.0](https://github.com/docling-project/docling/releases/tag/v2.17.0) - 2025-01-28 ### Feature -* **CLI:** Expose code and formula models in the CLI ([#820](https://github.com/DS4SD/docling/issues/820)) ([`6882e6c`](https://github.com/DS4SD/docling/commit/6882e6c38df30e4d4a1b83e01b13900ca7ea001f)) -* Add platform info to CLI version printout ([#816](https://github.com/DS4SD/docling/issues/816)) ([`95b293a`](https://github.com/DS4SD/docling/commit/95b293a72356f94c7076e3649be970c8a51121a3)) -* **ocr:** Expose `rec_keys_path` in RapidOcrOptions to support custom dictionaries ([#786](https://github.com/DS4SD/docling/issues/786)) ([`5332755`](https://github.com/DS4SD/docling/commit/53327552e83ced079ae50d8067ba7a8ce80cd9ad)) -* Introduce automatic language detection in TesseractOcrCliModel ([#800](https://github.com/DS4SD/docling/issues/800)) ([`3be2fb5`](https://github.com/DS4SD/docling/commit/3be2fb581fe5a2ebd5cec9c86bb22eb1dec6fd0f)) +* **CLI:** Expose code and formula models in the CLI ([#820](https://github.com/docling-project/docling/issues/820)) ([`6882e6c`](https://github.com/docling-project/docling/commit/6882e6c38df30e4d4a1b83e01b13900ca7ea001f)) +* Add platform info to CLI version printout ([#816](https://github.com/docling-project/docling/issues/816)) 
([`95b293a`](https://github.com/docling-project/docling/commit/95b293a72356f94c7076e3649be970c8a51121a3)) +* **ocr:** Expose `rec_keys_path` in RapidOcrOptions to support custom dictionaries ([#786](https://github.com/docling-project/docling/issues/786)) ([`5332755`](https://github.com/docling-project/docling/commit/53327552e83ced079ae50d8067ba7a8ce80cd9ad)) +* Introduce automatic language detection in TesseractOcrCliModel ([#800](https://github.com/docling-project/docling/issues/800)) ([`3be2fb5`](https://github.com/docling-project/docling/commit/3be2fb581fe5a2ebd5cec9c86bb22eb1dec6fd0f)) ### Fix -* Fix single newline handling in MD backend ([#824](https://github.com/DS4SD/docling/issues/824)) ([`5aed9f8`](https://github.com/DS4SD/docling/commit/5aed9f8aeba1624ba1a721e2ed3ba4aceaa7a482)) -* Use file extension if filetype fails with PDF ([#827](https://github.com/DS4SD/docling/issues/827)) ([`adf6353`](https://github.com/DS4SD/docling/commit/adf635348365f82daa64e3f879076a7baf71edc0)) -* Parse html with omitted body tag ([#818](https://github.com/DS4SD/docling/issues/818)) ([`a112d7a`](https://github.com/DS4SD/docling/commit/a112d7a03512e8a00842a100416426254d6ecfc0)) +* Fix single newline handling in MD backend ([#824](https://github.com/docling-project/docling/issues/824)) ([`5aed9f8`](https://github.com/docling-project/docling/commit/5aed9f8aeba1624ba1a721e2ed3ba4aceaa7a482)) +* Use file extension if filetype fails with PDF ([#827](https://github.com/docling-project/docling/issues/827)) ([`adf6353`](https://github.com/docling-project/docling/commit/adf635348365f82daa64e3f879076a7baf71edc0)) +* Parse html with omitted body tag ([#818](https://github.com/docling-project/docling/issues/818)) ([`a112d7a`](https://github.com/docling-project/docling/commit/a112d7a03512e8a00842a100416426254d6ecfc0)) ### Documentation -* Document Docling JSON parsing ([#819](https://github.com/DS4SD/docling/issues/819)) 
([`6875913`](https://github.com/DS4SD/docling/commit/6875913e34abacb8d71b5d31543adbf7b5bd5e92)) -* Add SSL verification error mitigation ([#821](https://github.com/DS4SD/docling/issues/821)) ([`5139b48`](https://github.com/DS4SD/docling/commit/5139b48e4e62bb061d956c132958ec2e6d88e40a)) -* **backend XML:** Do not delete temp file in notebook ([#817](https://github.com/DS4SD/docling/issues/817)) ([`4d41db3`](https://github.com/DS4SD/docling/commit/4d41db3f7abb86c8c65386bf94e7eb0bf22bb82b)) -* Typo ([#814](https://github.com/DS4SD/docling/issues/814)) ([`8a4ec77`](https://github.com/DS4SD/docling/commit/8a4ec77576b8a9fd60d0047939665d00cf93b4dd)) -* Added markdown headings to enable TOC in github pages ([#808](https://github.com/DS4SD/docling/issues/808)) ([`b885b2f`](https://github.com/DS4SD/docling/commit/b885b2fa3c2519c399ed4b9a3dd4c2f6f62235d1)) -* Description of supported formats and backends ([#788](https://github.com/DS4SD/docling/issues/788)) ([`c2ae1cc`](https://github.com/DS4SD/docling/commit/c2ae1cc4cab0f9e693c7ca460fe8afa5b515ee94)) +* Document Docling JSON parsing ([#819](https://github.com/docling-project/docling/issues/819)) ([`6875913`](https://github.com/docling-project/docling/commit/6875913e34abacb8d71b5d31543adbf7b5bd5e92)) +* Add SSL verification error mitigation ([#821](https://github.com/docling-project/docling/issues/821)) ([`5139b48`](https://github.com/docling-project/docling/commit/5139b48e4e62bb061d956c132958ec2e6d88e40a)) +* **backend XML:** Do not delete temp file in notebook ([#817](https://github.com/docling-project/docling/issues/817)) ([`4d41db3`](https://github.com/docling-project/docling/commit/4d41db3f7abb86c8c65386bf94e7eb0bf22bb82b)) +* Typo ([#814](https://github.com/docling-project/docling/issues/814)) ([`8a4ec77`](https://github.com/docling-project/docling/commit/8a4ec77576b8a9fd60d0047939665d00cf93b4dd)) +* Added markdown headings to enable TOC in github pages ([#808](https://github.com/docling-project/docling/issues/808)) 
([`b885b2f`](https://github.com/docling-project/docling/commit/b885b2fa3c2519c399ed4b9a3dd4c2f6f62235d1)) +* Description of supported formats and backends ([#788](https://github.com/docling-project/docling/issues/788)) ([`c2ae1cc`](https://github.com/docling-project/docling/commit/c2ae1cc4cab0f9e693c7ca460fe8afa5b515ee94)) -## [v2.16.0](https://github.com/DS4SD/docling/releases/tag/v2.16.0) - 2025-01-24 +## [v2.16.0](https://github.com/docling-project/docling/releases/tag/v2.16.0) - 2025-01-24 ### Feature -* New document picture classifier ([#805](https://github.com/DS4SD/docling/issues/805)) ([`16a218d`](https://github.com/DS4SD/docling/commit/16a218d871c48fd9cc636b77f7b597dc40cbeeec)) -* Add Docling JSON ingestion ([#783](https://github.com/DS4SD/docling/issues/783)) ([`88a0e66`](https://github.com/DS4SD/docling/commit/88a0e66adc19238f57a942b0504926cdaeacd8cc)) -* Code and equation model for PDF and code blocks in markdown ([#752](https://github.com/DS4SD/docling/issues/752)) ([`3213b24`](https://github.com/DS4SD/docling/commit/3213b247ad6870ff984271f09f7720be68d9479b)) -* Add "auto" language for TesseractOcr ([#759](https://github.com/DS4SD/docling/issues/759)) ([`8543c22`](https://github.com/DS4SD/docling/commit/8543c22687fee40459d393bf4adcfc059712de02)) +* New document picture classifier ([#805](https://github.com/docling-project/docling/issues/805)) ([`16a218d`](https://github.com/docling-project/docling/commit/16a218d871c48fd9cc636b77f7b597dc40cbeeec)) +* Add Docling JSON ingestion ([#783](https://github.com/docling-project/docling/issues/783)) ([`88a0e66`](https://github.com/docling-project/docling/commit/88a0e66adc19238f57a942b0504926cdaeacd8cc)) +* Code and equation model for PDF and code blocks in markdown ([#752](https://github.com/docling-project/docling/issues/752)) ([`3213b24`](https://github.com/docling-project/docling/commit/3213b247ad6870ff984271f09f7720be68d9479b)) +* Add "auto" language for TesseractOcr 
([#759](https://github.com/docling-project/docling/issues/759)) ([`8543c22`](https://github.com/docling-project/docling/commit/8543c22687fee40459d393bf4adcfc059712de02)) ### Fix -* Added extraction of byte-images in excel ([#804](https://github.com/DS4SD/docling/issues/804)) ([`a458e29`](https://github.com/DS4SD/docling/commit/a458e298ca64da2c6df29d953e95645525817bed)) -* Update docling-parse-v2 backend version with new parsing fixes ([#769](https://github.com/DS4SD/docling/issues/769)) ([`670a08b`](https://github.com/DS4SD/docling/commit/670a08bdedda847ff3b6942bcaa1a2adef79afe2)) +* Added extraction of byte-images in excel ([#804](https://github.com/docling-project/docling/issues/804)) ([`a458e29`](https://github.com/docling-project/docling/commit/a458e298ca64da2c6df29d953e95645525817bed)) +* Update docling-parse-v2 backend version with new parsing fixes ([#769](https://github.com/docling-project/docling/issues/769)) ([`670a08b`](https://github.com/docling-project/docling/commit/670a08bdedda847ff3b6942bcaa1a2adef79afe2)) ### Documentation -* Fix minor typos ([#801](https://github.com/DS4SD/docling/issues/801)) ([`c58f75d`](https://github.com/DS4SD/docling/commit/c58f75d0f75040e32820cc2915ec00755211c02f)) -* Add Azure RAG example ([#675](https://github.com/DS4SD/docling/issues/675)) ([`9020a93`](https://github.com/DS4SD/docling/commit/9020a934be35b0798c972eb77a22fb62ce654ca5)) -* Fix links between docs pages ([#697](https://github.com/DS4SD/docling/issues/697)) ([`c49b352`](https://github.com/DS4SD/docling/commit/c49b3526fb7b72e8007f785b1fcfdf58c2457756)) -* Fix correct Accelerator pipeline options in docs/examples/custom_convert.py ([#733](https://github.com/DS4SD/docling/issues/733)) ([`7686083`](https://github.com/DS4SD/docling/commit/768608351d40376c3504546f52e967195536b3d5)) -* Example to translate documents ([#739](https://github.com/DS4SD/docling/issues/739)) ([`f7e1cbf`](https://github.com/DS4SD/docling/commit/f7e1cbf629ae5f3e279296e72f656b7a453ab7a3)) +* 
Fix minor typos ([#801](https://github.com/docling-project/docling/issues/801)) ([`c58f75d`](https://github.com/docling-project/docling/commit/c58f75d0f75040e32820cc2915ec00755211c02f)) +* Add Azure RAG example ([#675](https://github.com/docling-project/docling/issues/675)) ([`9020a93`](https://github.com/docling-project/docling/commit/9020a934be35b0798c972eb77a22fb62ce654ca5)) +* Fix links between docs pages ([#697](https://github.com/docling-project/docling/issues/697)) ([`c49b352`](https://github.com/docling-project/docling/commit/c49b3526fb7b72e8007f785b1fcfdf58c2457756)) +* Fix correct Accelerator pipeline options in docs/examples/custom_convert.py ([#733](https://github.com/docling-project/docling/issues/733)) ([`7686083`](https://github.com/docling-project/docling/commit/768608351d40376c3504546f52e967195536b3d5)) +* Example to translate documents ([#739](https://github.com/docling-project/docling/issues/739)) ([`f7e1cbf`](https://github.com/docling-project/docling/commit/f7e1cbf629ae5f3e279296e72f656b7a453ab7a3)) -## [v2.15.1](https://github.com/DS4SD/docling/releases/tag/v2.15.1) - 2025-01-10 +## [v2.15.1](https://github.com/docling-project/docling/releases/tag/v2.15.1) - 2025-01-10 ### Fix -* Improve OCR results, stricten criteria before dropping bitmap areas ([#719](https://github.com/DS4SD/docling/issues/719)) ([`5a060f2`](https://github.com/DS4SD/docling/commit/5a060f237d1decd0ff9db9e73478978419315778)) -* Allow earlier requests versions ([#716](https://github.com/DS4SD/docling/issues/716)) ([`e64b5a2`](https://github.com/DS4SD/docling/commit/e64b5a2f628acc340a6d94ee6f1ada2aa267cecc)) +* Improve OCR results, stricten criteria before dropping bitmap areas ([#719](https://github.com/docling-project/docling/issues/719)) ([`5a060f2`](https://github.com/docling-project/docling/commit/5a060f237d1decd0ff9db9e73478978419315778)) +* Allow earlier requests versions ([#716](https://github.com/docling-project/docling/issues/716)) 
([`e64b5a2`](https://github.com/docling-project/docling/commit/e64b5a2f628acc340a6d94ee6f1ada2aa267cecc)) ### Documentation -* Add pointers to LangChain-side docs ([#718](https://github.com/DS4SD/docling/issues/718)) ([`9a6b5c8`](https://github.com/DS4SD/docling/commit/9a6b5c8c8debc81e0ddcbe91df6afbbeb29e97e6)) -* Add LangChain docs ([#717](https://github.com/DS4SD/docling/issues/717)) ([`4fa8028`](https://github.com/DS4SD/docling/commit/4fa8028bd8120d7557e1d45ba31e200e130af698)) +* Add pointers to LangChain-side docs ([#718](https://github.com/docling-project/docling/issues/718)) ([`9a6b5c8`](https://github.com/docling-project/docling/commit/9a6b5c8c8debc81e0ddcbe91df6afbbeb29e97e6)) +* Add LangChain docs ([#717](https://github.com/docling-project/docling/issues/717)) ([`4fa8028`](https://github.com/docling-project/docling/commit/4fa8028bd8120d7557e1d45ba31e200e130af698)) -## [v2.15.0](https://github.com/DS4SD/docling/releases/tag/v2.15.0) - 2025-01-08 +## [v2.15.0](https://github.com/docling-project/docling/releases/tag/v2.15.0) - 2025-01-08 ### Feature -* Added http header support for document converter and cli ([#642](https://github.com/DS4SD/docling/issues/642)) ([`0ee849e`](https://github.com/DS4SD/docling/commit/0ee849e8bc8cf24d1c5597af3fe20a7fa19a29e0)) +* Added http header support for document converter and cli ([#642](https://github.com/docling-project/docling/issues/642)) ([`0ee849e`](https://github.com/docling-project/docling/commit/0ee849e8bc8cf24d1c5597af3fe20a7fa19a29e0)) ### Fix -* Correct scaling of debug visualizations, tune OCR ([#700](https://github.com/DS4SD/docling/issues/700)) ([`5cb4cf6`](https://github.com/DS4SD/docling/commit/5cb4cf6f19f91e6c87141e93400c4b54b93aa5d7)) -* Let BeautifulSoup detect the HTML encoding ([#695](https://github.com/DS4SD/docling/issues/695)) ([`42856fd`](https://github.com/DS4SD/docling/commit/42856fdf79559188ec4617bc5d3a007286f114d2)) -* **mspowerpoint:** Handle invalid images in PowerPoint slides 
([#650](https://github.com/DS4SD/docling/issues/650)) ([`d49650c`](https://github.com/DS4SD/docling/commit/d49650c54ffa60bc6d6106970e104071689bc7b0)) +* Correct scaling of debug visualizations, tune OCR ([#700](https://github.com/docling-project/docling/issues/700)) ([`5cb4cf6`](https://github.com/docling-project/docling/commit/5cb4cf6f19f91e6c87141e93400c4b54b93aa5d7)) +* Let BeautifulSoup detect the HTML encoding ([#695](https://github.com/docling-project/docling/issues/695)) ([`42856fd`](https://github.com/docling-project/docling/commit/42856fdf79559188ec4617bc5d3a007286f114d2)) +* **mspowerpoint:** Handle invalid images in PowerPoint slides ([#650](https://github.com/docling-project/docling/issues/650)) ([`d49650c`](https://github.com/docling-project/docling/commit/d49650c54ffa60bc6d6106970e104071689bc7b0)) ### Documentation -* Specify docstring types ([#702](https://github.com/DS4SD/docling/issues/702)) ([`ead396a`](https://github.com/DS4SD/docling/commit/ead396ab407f6bbd43176abd6ed2bed7ed8c7c43)) -* Add link to rag with granite ([#698](https://github.com/DS4SD/docling/issues/698)) ([`6701f34`](https://github.com/DS4SD/docling/commit/6701f34c855992c52918b210c65a2edb1c827c01)) -* Add integrations, revamp docs ([#693](https://github.com/DS4SD/docling/issues/693)) ([`2d24fae`](https://github.com/DS4SD/docling/commit/2d24faecd96bfa656b2b8c80f25cdf251a50526a)) -* Add OpenContracts as an integration ([#679](https://github.com/DS4SD/docling/issues/679)) ([`569038d`](https://github.com/DS4SD/docling/commit/569038df4205703f87517ea58da7902d143e7699)) -* Add Weaviate RAG recipe notebook ([#451](https://github.com/DS4SD/docling/issues/451)) ([`2b591f9`](https://github.com/DS4SD/docling/commit/2b591f98726ed0d883236dd0550201b95203eebb)) -* Document Haystack & Vectara support ([#628](https://github.com/DS4SD/docling/issues/628)) ([`fc645ea`](https://github.com/DS4SD/docling/commit/fc645ea531ddc67959640b428007851d641c923e)) +* Specify docstring types 
([#702](https://github.com/docling-project/docling/issues/702)) ([`ead396a`](https://github.com/docling-project/docling/commit/ead396ab407f6bbd43176abd6ed2bed7ed8c7c43)) +* Add link to rag with granite ([#698](https://github.com/docling-project/docling/issues/698)) ([`6701f34`](https://github.com/docling-project/docling/commit/6701f34c855992c52918b210c65a2edb1c827c01)) +* Add integrations, revamp docs ([#693](https://github.com/docling-project/docling/issues/693)) ([`2d24fae`](https://github.com/docling-project/docling/commit/2d24faecd96bfa656b2b8c80f25cdf251a50526a)) +* Add OpenContracts as an integration ([#679](https://github.com/docling-project/docling/issues/679)) ([`569038d`](https://github.com/docling-project/docling/commit/569038df4205703f87517ea58da7902d143e7699)) +* Add Weaviate RAG recipe notebook ([#451](https://github.com/docling-project/docling/issues/451)) ([`2b591f9`](https://github.com/docling-project/docling/commit/2b591f98726ed0d883236dd0550201b95203eebb)) +* Document Haystack & Vectara support ([#628](https://github.com/docling-project/docling/issues/628)) ([`fc645ea`](https://github.com/docling-project/docling/commit/fc645ea531ddc67959640b428007851d641c923e)) -## [v2.14.0](https://github.com/DS4SD/docling/releases/tag/v2.14.0) - 2024-12-18 +## [v2.14.0](https://github.com/docling-project/docling/releases/tag/v2.14.0) - 2024-12-18 ### Feature -* Create a backend to transform PubMed XML files to DoclingDocument ([#557](https://github.com/DS4SD/docling/issues/557)) ([`fd03480`](https://github.com/DS4SD/docling/commit/fd034802b65a0e567531b8ecc9a283aaf030e050)) +* Create a backend to transform PubMed XML files to DoclingDocument ([#557](https://github.com/docling-project/docling/issues/557)) ([`fd03480`](https://github.com/docling-project/docling/commit/fd034802b65a0e567531b8ecc9a283aaf030e050)) -## [v2.13.0](https://github.com/DS4SD/docling/releases/tag/v2.13.0) - 2024-12-17 +## 
[v2.13.0](https://github.com/docling-project/docling/releases/tag/v2.13.0) - 2024-12-17 ### Feature -* Updated Layout processing with forms and key-value areas ([#530](https://github.com/DS4SD/docling/issues/530)) ([`60dc852`](https://github.com/DS4SD/docling/commit/60dc852f16dc1adbb5e9284c81a146043a301ec1)) -* Create a backend to parse USPTO patents into DoclingDocument ([#606](https://github.com/DS4SD/docling/issues/606)) ([`4e08750`](https://github.com/DS4SD/docling/commit/4e087504cc4b04210574e69f616badcddfa1f8e5)) -* Add Easyocr parameter recog_network ([#613](https://github.com/DS4SD/docling/issues/613)) ([`3b53bd3`](https://github.com/DS4SD/docling/commit/3b53bd38c8efcc5ba54421fbfa90d047f1a61f82)) +* Updated Layout processing with forms and key-value areas ([#530](https://github.com/docling-project/docling/issues/530)) ([`60dc852`](https://github.com/docling-project/docling/commit/60dc852f16dc1adbb5e9284c81a146043a301ec1)) +* Create a backend to parse USPTO patents into DoclingDocument ([#606](https://github.com/docling-project/docling/issues/606)) ([`4e08750`](https://github.com/docling-project/docling/commit/4e087504cc4b04210574e69f616badcddfa1f8e5)) +* Add Easyocr parameter recog_network ([#613](https://github.com/docling-project/docling/issues/613)) ([`3b53bd3`](https://github.com/docling-project/docling/commit/3b53bd38c8efcc5ba54421fbfa90d047f1a61f82)) ### Documentation -* Add Haystack RAG example ([#615](https://github.com/DS4SD/docling/issues/615)) ([`3e599c7`](https://github.com/DS4SD/docling/commit/3e599c7bbeef211dc346e9bc1d3a249113fcc4e4)) -* Fix the path to the run_with_accelerator.py example ([#608](https://github.com/DS4SD/docling/issues/608)) ([`3bb3bf5`](https://github.com/DS4SD/docling/commit/3bb3bf57150c9705a055982e6fb0cc8d1408f161)) +* Add Haystack RAG example ([#615](https://github.com/docling-project/docling/issues/615)) ([`3e599c7`](https://github.com/docling-project/docling/commit/3e599c7bbeef211dc346e9bc1d3a249113fcc4e4)) +* Fix the 
path to the run_with_accelerator.py example ([#608](https://github.com/docling-project/docling/issues/608)) ([`3bb3bf5`](https://github.com/docling-project/docling/commit/3bb3bf57150c9705a055982e6fb0cc8d1408f161)) -## [v2.12.0](https://github.com/DS4SD/docling/releases/tag/v2.12.0) - 2024-12-13 +## [v2.12.0](https://github.com/docling-project/docling/releases/tag/v2.12.0) - 2024-12-13 ### Feature -* Introduce support for GPU Accelerators ([#593](https://github.com/DS4SD/docling/issues/593)) ([`19fad92`](https://github.com/DS4SD/docling/commit/19fad9261cb61f732a0426393866c8c1a9efbf4f)) +* Introduce support for GPU Accelerators ([#593](https://github.com/docling-project/docling/issues/593)) ([`19fad92`](https://github.com/docling-project/docling/commit/19fad9261cb61f732a0426393866c8c1a9efbf4f)) -## [v2.11.0](https://github.com/DS4SD/docling/releases/tag/v2.11.0) - 2024-12-12 +## [v2.11.0](https://github.com/docling-project/docling/releases/tag/v2.11.0) - 2024-12-12 ### Feature -* Add timeout limit to document parsing job. DS4SD#270 ([#552](https://github.com/DS4SD/docling/issues/552)) ([`3da166e`](https://github.com/DS4SD/docling/commit/3da166eafa3c119de961510341cb92397652c222)) +* Add timeout limit to document parsing job. 
DS4SD#270 ([#552](https://github.com/docling-project/docling/issues/552)) ([`3da166e`](https://github.com/docling-project/docling/commit/3da166eafa3c119de961510341cb92397652c222)) ### Fix -* Do not import python modules from deepsearch-glm ([#569](https://github.com/DS4SD/docling/issues/569)) ([`aee9c0b`](https://github.com/DS4SD/docling/commit/aee9c0b324a07190ad03ad3a6266e76c465d4cdf)) -* Handle no result from RapidOcr reader ([#558](https://github.com/DS4SD/docling/issues/558)) ([`f45499c`](https://github.com/DS4SD/docling/commit/f45499ce9349fe55538dfb36d74c395e9193d9b1)) -* Make enum serializable with human-readable value ([#555](https://github.com/DS4SD/docling/issues/555)) ([`a7df337`](https://github.com/DS4SD/docling/commit/a7df337654fa5fa7633af8740fb5e4cc4a06f250)) +* Do not import python modules from deepsearch-glm ([#569](https://github.com/docling-project/docling/issues/569)) ([`aee9c0b`](https://github.com/docling-project/docling/commit/aee9c0b324a07190ad03ad3a6266e76c465d4cdf)) +* Handle no result from RapidOcr reader ([#558](https://github.com/docling-project/docling/issues/558)) ([`f45499c`](https://github.com/docling-project/docling/commit/f45499ce9349fe55538dfb36d74c395e9193d9b1)) +* Make enum serializable with human-readable value ([#555](https://github.com/docling-project/docling/issues/555)) ([`a7df337`](https://github.com/docling-project/docling/commit/a7df337654fa5fa7633af8740fb5e4cc4a06f250)) ### Documentation -* Update chunking usage docs, minor reorg ([#550](https://github.com/DS4SD/docling/issues/550)) ([`d0c9e8e`](https://github.com/DS4SD/docling/commit/d0c9e8e508d7edef5e733be6cdea2cea0a9a0695)) +* Update chunking usage docs, minor reorg ([#550](https://github.com/docling-project/docling/issues/550)) ([`d0c9e8e`](https://github.com/docling-project/docling/commit/d0c9e8e508d7edef5e733be6cdea2cea0a9a0695)) -## [v2.10.0](https://github.com/DS4SD/docling/releases/tag/v2.10.0) - 2024-12-09 +## 
[v2.10.0](https://github.com/docling-project/docling/releases/tag/v2.10.0) - 2024-12-09 ### Feature -* Docling-parse v2 as default PDF backend ([#549](https://github.com/DS4SD/docling/issues/549)) ([`aca57f0`](https://github.com/DS4SD/docling/commit/aca57f0527dddcc027dc1ee840e2e492ab997170)) +* Docling-parse v2 as default PDF backend ([#549](https://github.com/docling-project/docling/issues/549)) ([`aca57f0`](https://github.com/docling-project/docling/commit/aca57f0527dddcc027dc1ee840e2e492ab997170)) ### Fix -* Call into docling-core for legacy document transform ([#551](https://github.com/DS4SD/docling/issues/551)) ([`7972d47`](https://github.com/DS4SD/docling/commit/7972d47f88604f02d6a32527116c4d78eb1005e2)) -* Introduce Image format options in CLI. Silence the tqdm downloading messages. ([#544](https://github.com/DS4SD/docling/issues/544)) ([`78f61a8`](https://github.com/DS4SD/docling/commit/78f61a8522d3a19ecc1d605e8441fb543ca0fa96)) +* Call into docling-core for legacy document transform ([#551](https://github.com/docling-project/docling/issues/551)) ([`7972d47`](https://github.com/docling-project/docling/commit/7972d47f88604f02d6a32527116c4d78eb1005e2)) +* Introduce Image format options in CLI. Silence the tqdm downloading messages. 
([#544](https://github.com/docling-project/docling/issues/544)) ([`78f61a8`](https://github.com/docling-project/docling/commit/78f61a8522d3a19ecc1d605e8441fb543ca0fa96)) -## [v2.9.0](https://github.com/DS4SD/docling/releases/tag/v2.9.0) - 2024-12-09 +## [v2.9.0](https://github.com/docling-project/docling/releases/tag/v2.9.0) - 2024-12-09 ### Feature -* Expose new hybrid chunker, update docs ([#384](https://github.com/DS4SD/docling/issues/384)) ([`c8ecdd9`](https://github.com/DS4SD/docling/commit/c8ecdd987e80227db3850ea729ecb36d2b609040)) -* **MS Word backend:** Make detection of headers and other styles localization agnostic ([#534](https://github.com/DS4SD/docling/issues/534)) ([`3e073df`](https://github.com/DS4SD/docling/commit/3e073dfbebbc65f995d4df946c1650699a26782c)) +* Expose new hybrid chunker, update docs ([#384](https://github.com/docling-project/docling/issues/384)) ([`c8ecdd9`](https://github.com/docling-project/docling/commit/c8ecdd987e80227db3850ea729ecb36d2b609040)) +* **MS Word backend:** Make detection of headers and other styles localization agnostic ([#534](https://github.com/docling-project/docling/issues/534)) ([`3e073df`](https://github.com/docling-project/docling/commit/3e073dfbebbc65f995d4df946c1650699a26782c)) ### Fix -* Correcting DefaultText ID for MS Word backend ([#537](https://github.com/DS4SD/docling/issues/537)) ([`eb7ffcd`](https://github.com/DS4SD/docling/commit/eb7ffcdd1cda1caa8ec8ba2fc313ff1e7d9acd4f)) -* Add `py.typed` marker file ([#531](https://github.com/DS4SD/docling/issues/531)) ([`9102fe1`](https://github.com/DS4SD/docling/commit/9102fe1adcd43432e5fb3f35af704b7442c5d633)) -* Enable HTML export in CLI and add options for image mode ([#513](https://github.com/DS4SD/docling/issues/513)) ([`0d11e30`](https://github.com/DS4SD/docling/commit/0d11e30dd813020c0189de849cd7b2e285d08694)) -* Missing text in docx (t tag) when embedded in a table ([#528](https://github.com/DS4SD/docling/issues/528)) 
([`b730b2d`](https://github.com/DS4SD/docling/commit/b730b2d7a04a8773a00ed88889d28b0c476ba052)) -* Restore pydantic version pin after fixes ([#512](https://github.com/DS4SD/docling/issues/512)) ([`c830b92`](https://github.com/DS4SD/docling/commit/c830b92b2e043ea63d216f65b3f9d88d2a8c33f7)) -* Folder input in cli ([#511](https://github.com/DS4SD/docling/issues/511)) ([`8ada0bc`](https://github.com/DS4SD/docling/commit/8ada0bccc744df94f755adf71cf8b163e6304375)) +* Correcting DefaultText ID for MS Word backend ([#537](https://github.com/docling-project/docling/issues/537)) ([`eb7ffcd`](https://github.com/docling-project/docling/commit/eb7ffcdd1cda1caa8ec8ba2fc313ff1e7d9acd4f)) +* Add `py.typed` marker file ([#531](https://github.com/docling-project/docling/issues/531)) ([`9102fe1`](https://github.com/docling-project/docling/commit/9102fe1adcd43432e5fb3f35af704b7442c5d633)) +* Enable HTML export in CLI and add options for image mode ([#513](https://github.com/docling-project/docling/issues/513)) ([`0d11e30`](https://github.com/docling-project/docling/commit/0d11e30dd813020c0189de849cd7b2e285d08694)) +* Missing text in docx (t tag) when embedded in a table ([#528](https://github.com/docling-project/docling/issues/528)) ([`b730b2d`](https://github.com/docling-project/docling/commit/b730b2d7a04a8773a00ed88889d28b0c476ba052)) +* Restore pydantic version pin after fixes ([#512](https://github.com/docling-project/docling/issues/512)) ([`c830b92`](https://github.com/docling-project/docling/commit/c830b92b2e043ea63d216f65b3f9d88d2a8c33f7)) +* Folder input in cli ([#511](https://github.com/docling-project/docling/issues/511)) ([`8ada0bc`](https://github.com/docling-project/docling/commit/8ada0bccc744df94f755adf71cf8b163e6304375)) ### Documentation -* Document new integrations ([#532](https://github.com/DS4SD/docling/issues/532)) ([`e780333`](https://github.com/DS4SD/docling/commit/e7803334409a343a59c536c529a03d6f5cdbfe15)) +* Document new integrations 
([#532](https://github.com/docling-project/docling/issues/532)) ([`e780333`](https://github.com/docling-project/docling/commit/e7803334409a343a59c536c529a03d6f5cdbfe15)) -## [v2.8.3](https://github.com/DS4SD/docling/releases/tag/v2.8.3) - 2024-12-03 +## [v2.8.3](https://github.com/docling-project/docling/releases/tag/v2.8.3) - 2024-12-03 ### Fix -* Improve handling of disallowed formats ([#429](https://github.com/DS4SD/docling/issues/429)) ([`34c7c79`](https://github.com/DS4SD/docling/commit/34c7c798580476a86ce8abec30b1115fbb36fdd8)) +* Improve handling of disallowed formats ([#429](https://github.com/docling-project/docling/issues/429)) ([`34c7c79`](https://github.com/docling-project/docling/commit/34c7c798580476a86ce8abec30b1115fbb36fdd8)) -## [v2.8.2](https://github.com/DS4SD/docling/releases/tag/v2.8.2) - 2024-12-03 +## [v2.8.2](https://github.com/docling-project/docling/releases/tag/v2.8.2) - 2024-12-03 ### Fix -* ParserError EOF inside string (#470) ([#472](https://github.com/DS4SD/docling/issues/472)) ([`c90c41c`](https://github.com/DS4SD/docling/commit/c90c41c391de4366db554d7a71ce9a35467c981e)) -* PermissionError when using tesseract_ocr_cli_model ([#496](https://github.com/DS4SD/docling/issues/496)) ([`d3f84b2`](https://github.com/DS4SD/docling/commit/d3f84b2457125feacd0c21d6513e7ae69a308ea5)) +* ParserError EOF inside string (#470) ([#472](https://github.com/docling-project/docling/issues/472)) ([`c90c41c`](https://github.com/docling-project/docling/commit/c90c41c391de4366db554d7a71ce9a35467c981e)) +* PermissionError when using tesseract_ocr_cli_model ([#496](https://github.com/docling-project/docling/issues/496)) ([`d3f84b2`](https://github.com/docling-project/docling/commit/d3f84b2457125feacd0c21d6513e7ae69a308ea5)) ### Documentation -* Add styling for faq ([#502](https://github.com/DS4SD/docling/issues/502)) ([`5ba3807`](https://github.com/DS4SD/docling/commit/5ba3807f315a01b1a4e8df9bab40e34a4238205a)) -* Typo in faq 
([#484](https://github.com/DS4SD/docling/issues/484)) ([`33cff98`](https://github.com/DS4SD/docling/commit/33cff98d360c02a382a66850c696a0cf511659ac)) -* Add automatic api reference ([#475](https://github.com/DS4SD/docling/issues/475)) ([`d487210`](https://github.com/DS4SD/docling/commit/d4872103b8f24e38b37a8cd3ac414d3e02e7d6e8)) -* Introduce faq section ([#468](https://github.com/DS4SD/docling/issues/468)) ([`8ccb3c6`](https://github.com/DS4SD/docling/commit/8ccb3c6db69318789af7deec26cfa2a3fd71302e)) +* Add styling for faq ([#502](https://github.com/docling-project/docling/issues/502)) ([`5ba3807`](https://github.com/docling-project/docling/commit/5ba3807f315a01b1a4e8df9bab40e34a4238205a)) +* Typo in faq ([#484](https://github.com/docling-project/docling/issues/484)) ([`33cff98`](https://github.com/docling-project/docling/commit/33cff98d360c02a382a66850c696a0cf511659ac)) +* Add automatic api reference ([#475](https://github.com/docling-project/docling/issues/475)) ([`d487210`](https://github.com/docling-project/docling/commit/d4872103b8f24e38b37a8cd3ac414d3e02e7d6e8)) +* Introduce faq section ([#468](https://github.com/docling-project/docling/issues/468)) ([`8ccb3c6`](https://github.com/docling-project/docling/commit/8ccb3c6db69318789af7deec26cfa2a3fd71302e)) ### Performance -* Prevent temp file leftovers, reuse core type ([#487](https://github.com/DS4SD/docling/issues/487)) ([`051789d`](https://github.com/DS4SD/docling/commit/051789d01706d3823dd6307eca4dc5faacd1b7ce)) +* Prevent temp file leftovers, reuse core type ([#487](https://github.com/docling-project/docling/issues/487)) ([`051789d`](https://github.com/docling-project/docling/commit/051789d01706d3823dd6307eca4dc5faacd1b7ce)) -## [v2.8.1](https://github.com/DS4SD/docling/releases/tag/v2.8.1) - 2024-11-29 +## [v2.8.1](https://github.com/docling-project/docling/releases/tag/v2.8.1) - 2024-11-29 ### Fix -* **cli:** Expose debug options ([#467](https://github.com/DS4SD/docling/issues/467)) 
([`dd8de46`](https://github.com/DS4SD/docling/commit/dd8de462676993b81926610fd573d51d3272cbaf)) -* Remove unused deps ([#466](https://github.com/DS4SD/docling/issues/466)) ([`af63818`](https://github.com/DS4SD/docling/commit/af63818df5636c4cbe77c0a01e6dcc0d47c4bfdb)) +* **cli:** Expose debug options ([#467](https://github.com/docling-project/docling/issues/467)) ([`dd8de46`](https://github.com/docling-project/docling/commit/dd8de462676993b81926610fd573d51d3272cbaf)) +* Remove unused deps ([#466](https://github.com/docling-project/docling/issues/466)) ([`af63818`](https://github.com/docling-project/docling/commit/af63818df5636c4cbe77c0a01e6dcc0d47c4bfdb)) ### Documentation -* Extend integration docs & README ([#456](https://github.com/DS4SD/docling/issues/456)) ([`84c46fd`](https://github.com/DS4SD/docling/commit/84c46fdeb344502edf9647c610c4828ab0ffb9dd)) +* Extend integration docs & README ([#456](https://github.com/docling-project/docling/issues/456)) ([`84c46fd`](https://github.com/docling-project/docling/commit/84c46fdeb344502edf9647c610c4828ab0ffb9dd)) -## [v2.8.0](https://github.com/DS4SD/docling/releases/tag/v2.8.0) - 2024-11-27 +## [v2.8.0](https://github.com/docling-project/docling/releases/tag/v2.8.0) - 2024-11-27 ### Feature -* **ocr:** Added support for RapidOCR engine ([#415](https://github.com/DS4SD/docling/issues/415)) ([`85b2999`](https://github.com/DS4SD/docling/commit/85b29990be6468516b6dbe49f880d9f1f4c11c5a)) +* **ocr:** Added support for RapidOCR engine ([#415](https://github.com/docling-project/docling/issues/415)) ([`85b2999`](https://github.com/docling-project/docling/commit/85b29990be6468516b6dbe49f880d9f1f4c11c5a)) ### Fix -* Use correct image index in word backend ([#442](https://github.com/DS4SD/docling/issues/442)) ([`767563b`](https://github.com/DS4SD/docling/commit/767563bf8b331304892285c0789bba481acaa1b5)) -* Update tests and examples for docling-core 2.5.1 ([#449](https://github.com/DS4SD/docling/issues/449)) 
([`29807a2`](https://github.com/DS4SD/docling/commit/29807a2d687896c67ada934c6a626401f5930e50)) +* Use correct image index in word backend ([#442](https://github.com/docling-project/docling/issues/442)) ([`767563b`](https://github.com/docling-project/docling/commit/767563bf8b331304892285c0789bba481acaa1b5)) +* Update tests and examples for docling-core 2.5.1 ([#449](https://github.com/docling-project/docling/issues/449)) ([`29807a2`](https://github.com/docling-project/docling/commit/29807a2d687896c67ada934c6a626401f5930e50)) -## [v2.7.1](https://github.com/DS4SD/docling/releases/tag/v2.7.1) - 2024-11-26 +## [v2.7.1](https://github.com/docling-project/docling/releases/tag/v2.7.1) - 2024-11-26 ### Fix -* Fixes for wordx ([#432](https://github.com/DS4SD/docling/issues/432)) ([`d0a1180`](https://github.com/DS4SD/docling/commit/d0a118047804765b1b8532e72e08272e678c0c93)) -* Force pydantic < 2.10.0 ([#407](https://github.com/DS4SD/docling/issues/407)) ([`d7072b4`](https://github.com/DS4SD/docling/commit/d7072b4b56227756eb2c7abd3a6e7387eeefe7c1)) +* Fixes for wordx ([#432](https://github.com/docling-project/docling/issues/432)) ([`d0a1180`](https://github.com/docling-project/docling/commit/d0a118047804765b1b8532e72e08272e678c0c93)) +* Force pydantic < 2.10.0 ([#407](https://github.com/docling-project/docling/issues/407)) ([`d7072b4`](https://github.com/docling-project/docling/commit/d7072b4b56227756eb2c7abd3a6e7387eeefe7c1)) ### Documentation -* Add DocETL, Kotaemon, spaCy integrations; minor docs improvements ([#408](https://github.com/DS4SD/docling/issues/408)) ([`7a45b92`](https://github.com/DS4SD/docling/commit/7a45b92078b3a9fdd8f0650002eddc03e9d780af)) +* Add DocETL, Kotaemon, spaCy integrations; minor docs improvements ([#408](https://github.com/docling-project/docling/issues/408)) ([`7a45b92`](https://github.com/docling-project/docling/commit/7a45b92078b3a9fdd8f0650002eddc03e9d780af)) -## [v2.7.0](https://github.com/DS4SD/docling/releases/tag/v2.7.0) - 2024-11-20 
+## [v2.7.0](https://github.com/docling-project/docling/releases/tag/v2.7.0) - 2024-11-20 ### Feature -* Add support for `ocrmac` OCR engine on macOS ([#276](https://github.com/DS4SD/docling/issues/276)) ([`6efa96c`](https://github.com/DS4SD/docling/commit/6efa96c983fc509b2c7b35a4a25a714284f2f782)) +* Add support for `ocrmac` OCR engine on macOS ([#276](https://github.com/docling-project/docling/issues/276)) ([`6efa96c`](https://github.com/docling-project/docling/commit/6efa96c983fc509b2c7b35a4a25a714284f2f782)) ### Fix -* Python3.9 support ([#396](https://github.com/DS4SD/docling/issues/396)) ([`7b013ab`](https://github.com/DS4SD/docling/commit/7b013abcf31ba49e2141dfd408bc8c23e8d87d91)) -* Propagate document limits to converter ([#388](https://github.com/DS4SD/docling/issues/388)) ([`32ebf55`](https://github.com/DS4SD/docling/commit/32ebf55e3338dd22f9a23c55595f15835794d961)) +* Python3.9 support ([#396](https://github.com/docling-project/docling/issues/396)) ([`7b013ab`](https://github.com/docling-project/docling/commit/7b013abcf31ba49e2141dfd408bc8c23e8d87d91)) +* Propagate document limits to converter ([#388](https://github.com/docling-project/docling/issues/388)) ([`32ebf55`](https://github.com/docling-project/docling/commit/32ebf55e3338dd22f9a23c55595f15835794d961)) -## [v2.6.0](https://github.com/DS4SD/docling/releases/tag/v2.6.0) - 2024-11-19 +## [v2.6.0](https://github.com/docling-project/docling/releases/tag/v2.6.0) - 2024-11-19 ### Feature -* Added support for exporting DocItem to an image when page image is available ([#379](https://github.com/DS4SD/docling/issues/379)) ([`3f91e7d`](https://github.com/DS4SD/docling/commit/3f91e7d3f166901c139ab036c4d9dad5fa560aa9)) -* Expose ocr-lang in CLI ([#375](https://github.com/DS4SD/docling/issues/375)) ([`ed785ea`](https://github.com/DS4SD/docling/commit/ed785ea122d8d736c2031a38fce81dc5c19e244c)) -* Added excel backend ([#334](https://github.com/DS4SD/docling/issues/334)) 
([`926dfd2`](https://github.com/DS4SD/docling/commit/926dfd29d51c52628fe9fe8acb0ad0121c88e58a)) -* Extracting picture data for raster images found in PPTX ([#349](https://github.com/DS4SD/docling/issues/349)) ([`7a97d71`](https://github.com/DS4SD/docling/commit/7a97d7119f69a83042477d4272e8ef93a2252cc8)) +* Added support for exporting DocItem to an image when page image is available ([#379](https://github.com/docling-project/docling/issues/379)) ([`3f91e7d`](https://github.com/docling-project/docling/commit/3f91e7d3f166901c139ab036c4d9dad5fa560aa9)) +* Expose ocr-lang in CLI ([#375](https://github.com/docling-project/docling/issues/375)) ([`ed785ea`](https://github.com/docling-project/docling/commit/ed785ea122d8d736c2031a38fce81dc5c19e244c)) +* Added excel backend ([#334](https://github.com/docling-project/docling/issues/334)) ([`926dfd2`](https://github.com/docling-project/docling/commit/926dfd29d51c52628fe9fe8acb0ad0121c88e58a)) +* Extracting picture data for raster images found in PPTX ([#349](https://github.com/docling-project/docling/issues/349)) ([`7a97d71`](https://github.com/docling-project/docling/commit/7a97d7119f69a83042477d4272e8ef93a2252cc8)) ### Fix -* Fixing images in the input Word files ([#330](https://github.com/DS4SD/docling/issues/330)) ([`8533039`](https://github.com/DS4SD/docling/commit/8533039b0c0eff131b524da765f15c3279b554c5)) -* Reduce logging by keeping option for more verbose ([#323](https://github.com/DS4SD/docling/issues/323)) ([`8b437ad`](https://github.com/DS4SD/docling/commit/8b437adcde4acc1d309c81c707c264bcca05d394)) +* Fixing images in the input Word files ([#330](https://github.com/docling-project/docling/issues/330)) ([`8533039`](https://github.com/docling-project/docling/commit/8533039b0c0eff131b524da765f15c3279b554c5)) +* Reduce logging by keeping option for more verbose ([#323](https://github.com/docling-project/docling/issues/323)) 
([`8b437ad`](https://github.com/docling-project/docling/commit/8b437adcde4acc1d309c81c707c264bcca05d394)) ### Documentation -* Fixed typo in v2 example v2 ([#378](https://github.com/DS4SD/docling/issues/378)) ([`911c3bd`](https://github.com/DS4SD/docling/commit/911c3bda27c4108167b89fa70ec8204c854c583b)) -* Add automatic generation of CLI reference ([#325](https://github.com/DS4SD/docling/issues/325)) ([`ca8524e`](https://github.com/DS4SD/docling/commit/ca8524ecaea93cca0c808c8e7dee29fda0c1977e)) -* Add architecture outline ([#341](https://github.com/DS4SD/docling/issues/341)) ([`25fd149`](https://github.com/DS4SD/docling/commit/25fd149c3839343f8bd42ae993e35f80acda2a52)) -* Fix parameter in usage.md ([#332](https://github.com/DS4SD/docling/issues/332)) ([`835e077`](https://github.com/DS4SD/docling/commit/835e077b021d0a3615247906dd82ecfa19f3cf98)) +* Fixed typo in v2 example v2 ([#378](https://github.com/docling-project/docling/issues/378)) ([`911c3bd`](https://github.com/docling-project/docling/commit/911c3bda27c4108167b89fa70ec8204c854c583b)) +* Add automatic generation of CLI reference ([#325](https://github.com/docling-project/docling/issues/325)) ([`ca8524e`](https://github.com/docling-project/docling/commit/ca8524ecaea93cca0c808c8e7dee29fda0c1977e)) +* Add architecture outline ([#341](https://github.com/docling-project/docling/issues/341)) ([`25fd149`](https://github.com/docling-project/docling/commit/25fd149c3839343f8bd42ae993e35f80acda2a52)) +* Fix parameter in usage.md ([#332](https://github.com/docling-project/docling/issues/332)) ([`835e077`](https://github.com/docling-project/docling/commit/835e077b021d0a3615247906dd82ecfa19f3cf98)) -## [v2.5.2](https://github.com/DS4SD/docling/releases/tag/v2.5.2) - 2024-11-13 +## [v2.5.2](https://github.com/docling-project/docling/releases/tag/v2.5.2) - 2024-11-13 ### Fix -* Skip glm model downloads ([#322](https://github.com/DS4SD/docling/issues/322)) 
([`c9341bf`](https://github.com/DS4SD/docling/commit/c9341bf22e08920284cbc14821c190eaf6abf8a6)) +* Skip glm model downloads ([#322](https://github.com/docling-project/docling/issues/322)) ([`c9341bf`](https://github.com/docling-project/docling/commit/c9341bf22e08920284cbc14821c190eaf6abf8a6)) -## [v2.5.1](https://github.com/DS4SD/docling/releases/tag/v2.5.1) - 2024-11-12 +## [v2.5.1](https://github.com/docling-project/docling/releases/tag/v2.5.1) - 2024-11-12 ### Fix -* Handling of single-cell tables in DOCX backend ([#314](https://github.com/DS4SD/docling/issues/314)) ([`fb8ba86`](https://github.com/DS4SD/docling/commit/fb8ba861e28eda0079daa44fb1ea3ed17745f1d2)) +* Handling of single-cell tables in DOCX backend ([#314](https://github.com/docling-project/docling/issues/314)) ([`fb8ba86`](https://github.com/docling-project/docling/commit/fb8ba861e28eda0079daa44fb1ea3ed17745f1d2)) ### Documentation -* Hybrid RAG with Qdrant ([#312](https://github.com/DS4SD/docling/issues/312)) ([`7f5d35e`](https://github.com/DS4SD/docling/commit/7f5d35ea3c225ce1ce7328825842f98755c0104f)) -* Add Data Prep Kit integration ([#316](https://github.com/DS4SD/docling/issues/316)) ([`93fc1be`](https://github.com/DS4SD/docling/commit/93fc1be61abfe0669daf26c0984a51ec8675bf62)) +* Hybrid RAG with Qdrant ([#312](https://github.com/docling-project/docling/issues/312)) ([`7f5d35e`](https://github.com/docling-project/docling/commit/7f5d35ea3c225ce1ce7328825842f98755c0104f)) +* Add Data Prep Kit integration ([#316](https://github.com/docling-project/docling/issues/316)) ([`93fc1be`](https://github.com/docling-project/docling/commit/93fc1be61abfe0669daf26c0984a51ec8675bf62)) -## [v2.5.0](https://github.com/DS4SD/docling/releases/tag/v2.5.0) - 2024-11-12 +## [v2.5.0](https://github.com/docling-project/docling/releases/tag/v2.5.0) - 2024-11-12 ### Feature -* **OCR:** Introduce the OcrOptions.force_full_page_ocr parameter that forces a full page OCR scanning 
([#290](https://github.com/DS4SD/docling/issues/290)) ([`c6b3763`](https://github.com/DS4SD/docling/commit/c6b3763ecb6ef862840a30978ee177b907f86505)) +* **OCR:** Introduce the OcrOptions.force_full_page_ocr parameter that forces a full page OCR scanning ([#290](https://github.com/docling-project/docling/issues/290)) ([`c6b3763`](https://github.com/docling-project/docling/commit/c6b3763ecb6ef862840a30978ee177b907f86505)) ### Fix -* Configure env prefix for docling settings ([#315](https://github.com/DS4SD/docling/issues/315)) ([`5d4a10b`](https://github.com/DS4SD/docling/commit/5d4a10b121317fa481208dacbee47032b08ff928)) -* Added handling of grouped elements in pptx backend ([#307](https://github.com/DS4SD/docling/issues/307)) ([`81c8243`](https://github.com/DS4SD/docling/commit/81c8243a8bf177feed8f87ea283b5bb6836350cb)) -* Allow mps usage for easyocr ([#286](https://github.com/DS4SD/docling/issues/286)) ([`97f214e`](https://github.com/DS4SD/docling/commit/97f214efddcf66f0734a95c17c08936f6111d113)) +* Configure env prefix for docling settings ([#315](https://github.com/docling-project/docling/issues/315)) ([`5d4a10b`](https://github.com/docling-project/docling/commit/5d4a10b121317fa481208dacbee47032b08ff928)) +* Added handling of grouped elements in pptx backend ([#307](https://github.com/docling-project/docling/issues/307)) ([`81c8243`](https://github.com/docling-project/docling/commit/81c8243a8bf177feed8f87ea283b5bb6836350cb)) +* Allow mps usage for easyocr ([#286](https://github.com/docling-project/docling/issues/286)) ([`97f214e`](https://github.com/docling-project/docling/commit/97f214efddcf66f0734a95c17c08936f6111d113)) ### Documentation -* Add navigation indices ([#305](https://github.com/DS4SD/docling/issues/305)) ([`1239ade`](https://github.com/DS4SD/docling/commit/1239ade2750349d13d4e865d88449b232bbad944)) +* Add navigation indices ([#305](https://github.com/docling-project/docling/issues/305)) 
([`1239ade`](https://github.com/docling-project/docling/commit/1239ade2750349d13d4e865d88449b232bbad944)) -## [v2.4.2](https://github.com/DS4SD/docling/releases/tag/v2.4.2) - 2024-11-08 +## [v2.4.2](https://github.com/docling-project/docling/releases/tag/v2.4.2) - 2024-11-08 ### Fix -* **EasyOcrModel:** Support the use_gpu pipeline parameter in EasyOcrModel. Initialize easyocr ([#282](https://github.com/DS4SD/docling/issues/282)) ([`0eb065e`](https://github.com/DS4SD/docling/commit/0eb065e9b6e4619d4c412ed98bc7408915ca3f95)) +* **EasyOcrModel:** Support the use_gpu pipeline parameter in EasyOcrModel. Initialize easyocr ([#282](https://github.com/docling-project/docling/issues/282)) ([`0eb065e`](https://github.com/docling-project/docling/commit/0eb065e9b6e4619d4c412ed98bc7408915ca3f95)) -## [v2.4.1](https://github.com/DS4SD/docling/releases/tag/v2.4.1) - 2024-11-08 +## [v2.4.1](https://github.com/docling-project/docling/releases/tag/v2.4.1) - 2024-11-08 ### Fix -* **tesserocr:** Raise Exception if tesserocr has not loaded any languages ([#279](https://github.com/DS4SD/docling/issues/279)) ([`704d792`](https://github.com/DS4SD/docling/commit/704d792a7997c4ca34f9f9045ed4ae02b4f5df47)) -* Dockerfile example copy command ([#234](https://github.com/DS4SD/docling/issues/234)) ([`90836db`](https://github.com/DS4SD/docling/commit/90836db90accf4a66c9c20544c98452840e3a308)) +* **tesserocr:** Raise Exception if tesserocr has not loaded any languages ([#279](https://github.com/docling-project/docling/issues/279)) ([`704d792`](https://github.com/docling-project/docling/commit/704d792a7997c4ca34f9f9045ed4ae02b4f5df47)) +* Dockerfile example copy command ([#234](https://github.com/docling-project/docling/issues/234)) ([`90836db`](https://github.com/docling-project/docling/commit/90836db90accf4a66c9c20544c98452840e3a308)) ### Documentation -* Update badges & credits ([#248](https://github.com/DS4SD/docling/issues/248)) 
([`a84ec27`](https://github.com/DS4SD/docling/commit/a84ec276b0997c4ba9b32e18e911a966124dc3bc)) -* Add coming-soon section ([#235](https://github.com/DS4SD/docling/issues/235)) ([`5ce02c5`](https://github.com/DS4SD/docling/commit/5ce02c5c598a2efa615ad15f0ead8d752d3ad2ea)) -* Add artifacts-path param to CLI ([#233](https://github.com/DS4SD/docling/issues/233)) ([`d5e65ae`](https://github.com/DS4SD/docling/commit/d5e65aedac23d6849c805a0e88dd06f2a285eb18)) +* Update badges & credits ([#248](https://github.com/docling-project/docling/issues/248)) ([`a84ec27`](https://github.com/docling-project/docling/commit/a84ec276b0997c4ba9b32e18e911a966124dc3bc)) +* Add coming-soon section ([#235](https://github.com/docling-project/docling/issues/235)) ([`5ce02c5`](https://github.com/docling-project/docling/commit/5ce02c5c598a2efa615ad15f0ead8d752d3ad2ea)) +* Add artifacts-path param to CLI ([#233](https://github.com/docling-project/docling/issues/233)) ([`d5e65ae`](https://github.com/docling-project/docling/commit/d5e65aedac23d6849c805a0e88dd06f2a285eb18)) -## [v2.4.0](https://github.com/DS4SD/docling/releases/tag/v2.4.0) - 2024-11-04 +## [v2.4.0](https://github.com/docling-project/docling/releases/tag/v2.4.0) - 2024-11-04 ### Feature -* Pdf backend, table mode as options and artifacts path ([#203](https://github.com/DS4SD/docling/issues/203)) ([`40ad987`](https://github.com/DS4SD/docling/commit/40ad98730356218359d6fa9b3deb5bc094d6c699)) +* Pdf backend, table mode as options and artifacts path ([#203](https://github.com/docling-project/docling/issues/203)) ([`40ad987`](https://github.com/docling-project/docling/commit/40ad98730356218359d6fa9b3deb5bc094d6c699)) ### Documentation -* Add explicit artifacts path example ([#224](https://github.com/DS4SD/docling/issues/224)) ([`eeee3b4`](https://github.com/DS4SD/docling/commit/eeee3b4371cb8207a8e7a887acba3fc5afc6de4d)) -* Update custom convert and dockerfile ([#226](https://github.com/DS4SD/docling/issues/226)) 
([`5f5fea9`](https://github.com/DS4SD/docling/commit/5f5fea90a963f73a92b551dfefb353fa3e9657d7)) -* Correct spelling of 'individual' ([#219](https://github.com/DS4SD/docling/issues/219)) ([`41acaa9`](https://github.com/DS4SD/docling/commit/41acaa9e2ef4cff8d781f79fb5ae1b31762fa644)) -* Update LlamaIndex docs ([#196](https://github.com/DS4SD/docling/issues/196)) ([`244ca69`](https://github.com/DS4SD/docling/commit/244ca69cfd8a17b449a0a6baaf062b0b5b798bb1)) +* Add explicit artifacts path example ([#224](https://github.com/docling-project/docling/issues/224)) ([`eeee3b4`](https://github.com/docling-project/docling/commit/eeee3b4371cb8207a8e7a887acba3fc5afc6de4d)) +* Update custom convert and dockerfile ([#226](https://github.com/docling-project/docling/issues/226)) ([`5f5fea9`](https://github.com/docling-project/docling/commit/5f5fea90a963f73a92b551dfefb353fa3e9657d7)) +* Correct spelling of 'individual' ([#219](https://github.com/docling-project/docling/issues/219)) ([`41acaa9`](https://github.com/docling-project/docling/commit/41acaa9e2ef4cff8d781f79fb5ae1b31762fa644)) +* Update LlamaIndex docs ([#196](https://github.com/docling-project/docling/issues/196)) ([`244ca69`](https://github.com/docling-project/docling/commit/244ca69cfd8a17b449a0a6baaf062b0b5b798bb1)) -## [v2.3.1](https://github.com/DS4SD/docling/releases/tag/v2.3.1) - 2024-10-30 +## [v2.3.1](https://github.com/docling-project/docling/releases/tag/v2.3.1) - 2024-10-30 ### Fix -* Simplify torch dependencies and update pinned docling deps ([#190](https://github.com/DS4SD/docling/issues/190)) ([`eb679cc`](https://github.com/DS4SD/docling/commit/eb679ccbb484fc3ef50dcf00be54ccd488d4a34d)) -* Allow to explicitly initialize the pipeline ([#189](https://github.com/DS4SD/docling/issues/189)) ([`904d24d`](https://github.com/DS4SD/docling/commit/904d24d6005d113c50bde0ad398fdaafbbfb3027)) +* Simplify torch dependencies and update pinned docling deps ([#190](https://github.com/docling-project/docling/issues/190)) 
([`eb679cc`](https://github.com/docling-project/docling/commit/eb679ccbb484fc3ef50dcf00be54ccd488d4a34d)) +* Allow to explicitly initialize the pipeline ([#189](https://github.com/docling-project/docling/issues/189)) ([`904d24d`](https://github.com/docling-project/docling/commit/904d24d6005d113c50bde0ad398fdaafbbfb3027)) -## [v2.3.0](https://github.com/DS4SD/docling/releases/tag/v2.3.0) - 2024-10-30 +## [v2.3.0](https://github.com/docling-project/docling/releases/tag/v2.3.0) - 2024-10-30 ### Feature -* Add pipeline timings and toggle visualization, establish debug settings ([#183](https://github.com/DS4SD/docling/issues/183)) ([`2a2c65b`](https://github.com/DS4SD/docling/commit/2a2c65bf4f89a715c27310eaa9cd9db635e0f673)) +* Add pipeline timings and toggle visualization, establish debug settings ([#183](https://github.com/docling-project/docling/issues/183)) ([`2a2c65b`](https://github.com/docling-project/docling/commit/2a2c65bf4f89a715c27310eaa9cd9db635e0f673)) ### Fix -* Fix duplicate title and heading + add e2e tests for html and docx ([#186](https://github.com/DS4SD/docling/issues/186)) ([`f542460`](https://github.com/DS4SD/docling/commit/f542460af3c7676e5f8dee3b6ce729b139560cd6)) +* Fix duplicate title and heading + add e2e tests for html and docx ([#186](https://github.com/docling-project/docling/issues/186)) ([`f542460`](https://github.com/docling-project/docling/commit/f542460af3c7676e5f8dee3b6ce729b139560cd6)) -## [v2.2.1](https://github.com/DS4SD/docling/releases/tag/v2.2.1) - 2024-10-28 +## [v2.2.1](https://github.com/docling-project/docling/releases/tag/v2.2.1) - 2024-10-28 ### Fix -* Fix header levels for DOCX & HTML ([#184](https://github.com/DS4SD/docling/issues/184)) ([`b9f5c74`](https://github.com/DS4SD/docling/commit/b9f5c74a7d13827c2b7887ddbf0b4eb43edd0846)) -* Handling of long sequence of unescaped underscore chars in markdown ([#173](https://github.com/DS4SD/docling/issues/173)) 
([`94d0729`](https://github.com/DS4SD/docling/commit/94d0729c500b0be8ac4a1cd3025b42048f6e8d5a)) -* HTML backend, fixes for Lists and nested texts ([#180](https://github.com/DS4SD/docling/issues/180)) ([`7d19418`](https://github.com/DS4SD/docling/commit/7d19418b779408c345473af684de6b7f60872b6e)) -* MD Backend, fixes to properly handle trailing inline text and emphasis in headers ([#178](https://github.com/DS4SD/docling/issues/178)) ([`88c1673`](https://github.com/DS4SD/docling/commit/88c16730571afdd3bfb8894f64d41b5e99bc5a5b)) +* Fix header levels for DOCX & HTML ([#184](https://github.com/docling-project/docling/issues/184)) ([`b9f5c74`](https://github.com/docling-project/docling/commit/b9f5c74a7d13827c2b7887ddbf0b4eb43edd0846)) +* Handling of long sequence of unescaped underscore chars in markdown ([#173](https://github.com/docling-project/docling/issues/173)) ([`94d0729`](https://github.com/docling-project/docling/commit/94d0729c500b0be8ac4a1cd3025b42048f6e8d5a)) +* HTML backend, fixes for Lists and nested texts ([#180](https://github.com/docling-project/docling/issues/180)) ([`7d19418`](https://github.com/docling-project/docling/commit/7d19418b779408c345473af684de6b7f60872b6e)) +* MD Backend, fixes to properly handle trailing inline text and emphasis in headers ([#178](https://github.com/docling-project/docling/issues/178)) ([`88c1673`](https://github.com/docling-project/docling/commit/88c16730571afdd3bfb8894f64d41b5e99bc5a5b)) ### Documentation -* Update LlamaIndex docs for Docling v2 ([#182](https://github.com/DS4SD/docling/issues/182)) ([`2cece27`](https://github.com/DS4SD/docling/commit/2cece27208c4bce715d20000b845794dfb97843d)) -* Fix batch convert ([#177](https://github.com/DS4SD/docling/issues/177)) ([`189d3c2`](https://github.com/DS4SD/docling/commit/189d3c2d44ec389856f48696eaa78ac9f02f8cde)) -* Add export with embedded images ([#175](https://github.com/DS4SD/docling/issues/175)) 
([`8d356aa`](https://github.com/DS4SD/docling/commit/8d356aa24715433d458eff4f5f0937ff5cb9cc69)) +* Update LlamaIndex docs for Docling v2 ([#182](https://github.com/docling-project/docling/issues/182)) ([`2cece27`](https://github.com/docling-project/docling/commit/2cece27208c4bce715d20000b845794dfb97843d)) +* Fix batch convert ([#177](https://github.com/docling-project/docling/issues/177)) ([`189d3c2`](https://github.com/docling-project/docling/commit/189d3c2d44ec389856f48696eaa78ac9f02f8cde)) +* Add export with embedded images ([#175](https://github.com/docling-project/docling/issues/175)) ([`8d356aa`](https://github.com/docling-project/docling/commit/8d356aa24715433d458eff4f5f0937ff5cb9cc69)) -## [v2.2.0](https://github.com/DS4SD/docling/releases/tag/v2.2.0) - 2024-10-23 +## [v2.2.0](https://github.com/docling-project/docling/releases/tag/v2.2.0) - 2024-10-23 ### Feature -* Update to docling-parse v2 without history ([#170](https://github.com/DS4SD/docling/issues/170)) ([`4116819`](https://github.com/DS4SD/docling/commit/4116819b515a0611e8e5bf2bb0e1e39f1096b7bf)) -* Support AsciiDoc and Markdown input format ([#168](https://github.com/DS4SD/docling/issues/168)) ([`3023f18`](https://github.com/DS4SD/docling/commit/3023f18ba0462a224f75ea40953b5605abef6427)) +* Update to docling-parse v2 without history ([#170](https://github.com/docling-project/docling/issues/170)) ([`4116819`](https://github.com/docling-project/docling/commit/4116819b515a0611e8e5bf2bb0e1e39f1096b7bf)) +* Support AsciiDoc and Markdown input format ([#168](https://github.com/docling-project/docling/issues/168)) ([`3023f18`](https://github.com/docling-project/docling/commit/3023f18ba0462a224f75ea40953b5605abef6427)) ### Fix -* Set valid=false for invalid backends ([#171](https://github.com/DS4SD/docling/issues/171)) ([`3496b48`](https://github.com/DS4SD/docling/commit/3496b4838fd52cb0d74eadf78b27c19f633871b1)) +* Set valid=false for invalid backends 
([#171](https://github.com/docling-project/docling/issues/171)) ([`3496b48`](https://github.com/docling-project/docling/commit/3496b4838fd52cb0d74eadf78b27c19f633871b1)) -## [v2.1.0](https://github.com/DS4SD/docling/releases/tag/v2.1.0) - 2024-10-18 +## [v2.1.0](https://github.com/docling-project/docling/releases/tag/v2.1.0) - 2024-10-18 ### Feature -* Add coverage_threshold to skip OCR for small images ([#161](https://github.com/DS4SD/docling/issues/161)) ([`b346faf`](https://github.com/DS4SD/docling/commit/b346faf622190c4895dffdc1ee2365b3f7808cbc)) +* Add coverage_threshold to skip OCR for small images ([#161](https://github.com/docling-project/docling/issues/161)) ([`b346faf`](https://github.com/docling-project/docling/commit/b346faf622190c4895dffdc1ee2365b3f7808cbc)) ### Fix -* Fix legacy doc ref ([#162](https://github.com/DS4SD/docling/issues/162)) ([`63bef59`](https://github.com/DS4SD/docling/commit/63bef59d9ed6cfd937aefd36a4ef38a54a10dac5)) +* Fix legacy doc ref ([#162](https://github.com/docling-project/docling/issues/162)) ([`63bef59`](https://github.com/docling-project/docling/commit/63bef59d9ed6cfd937aefd36a4ef38a54a10dac5)) ### Documentation -* Typo fix ([#155](https://github.com/DS4SD/docling/issues/155)) ([`f799e77`](https://github.com/DS4SD/docling/commit/f799e777c1248559eb2f84bc334e392cd3c98e49)) -* Add graphical band in readme ([#154](https://github.com/DS4SD/docling/issues/154)) ([`034a411`](https://github.com/DS4SD/docling/commit/034a4110573df3ac88fd623970958f02309dd6da)) -* Add use docling ([#150](https://github.com/DS4SD/docling/issues/150)) ([`61c092f`](https://github.com/DS4SD/docling/commit/61c092f445ccde8ed5d7c0f2fa91a3d19a1f7a1e)) +* Typo fix ([#155](https://github.com/docling-project/docling/issues/155)) ([`f799e77`](https://github.com/docling-project/docling/commit/f799e777c1248559eb2f84bc334e392cd3c98e49)) +* Add graphical band in readme ([#154](https://github.com/docling-project/docling/issues/154)) 
([`034a411`](https://github.com/docling-project/docling/commit/034a4110573df3ac88fd623970958f02309dd6da)) +* Add use docling ([#150](https://github.com/docling-project/docling/issues/150)) ([`61c092f`](https://github.com/docling-project/docling/commit/61c092f445ccde8ed5d7c0f2fa91a3d19a1f7a1e)) -## [v2.0.0](https://github.com/DS4SD/docling/releases/tag/v2.0.0) - 2024-10-16 +## [v2.0.0](https://github.com/docling-project/docling/releases/tag/v2.0.0) - 2024-10-16 ### Feature -* Docling v2 ([#117](https://github.com/DS4SD/docling/issues/117)) ([`7d3be0e`](https://github.com/DS4SD/docling/commit/7d3be0edebb420f5840499aa04e4ab928d33cda2)) +* Docling v2 ([#117](https://github.com/docling-project/docling/issues/117)) ([`7d3be0e`](https://github.com/docling-project/docling/commit/7d3be0edebb420f5840499aa04e4ab928d33cda2)) ### Breaking -* Docling v2 ([#117](https://github.com/DS4SD/docling/issues/117)) ([`7d3be0e`](https://github.com/DS4SD/docling/commit/7d3be0edebb420f5840499aa04e4ab928d33cda2)) +* Docling v2 ([#117](https://github.com/docling-project/docling/issues/117)) ([`7d3be0e`](https://github.com/docling-project/docling/commit/7d3be0edebb420f5840499aa04e4ab928d33cda2)) ### Documentation -* Introduce docs site ([#141](https://github.com/DS4SD/docling/issues/141)) ([`d504432`](https://github.com/DS4SD/docling/commit/d504432c1ee250ea417e8239ff5c16c5bec5a2c7)) +* Introduce docs site ([#141](https://github.com/docling-project/docling/issues/141)) ([`d504432`](https://github.com/docling-project/docling/commit/d504432c1ee250ea417e8239ff5c16c5bec5a2c7)) -## [v1.20.0](https://github.com/DS4SD/docling/releases/tag/v1.20.0) - 2024-10-11 +## [v1.20.0](https://github.com/docling-project/docling/releases/tag/v1.20.0) - 2024-10-11 ### Feature -* New experimental docling-parse v2 backend ([#131](https://github.com/DS4SD/docling/issues/131)) ([`5e4944f`](https://github.com/DS4SD/docling/commit/5e4944f15f0ac1faf3e6a532c3e3ab4da56517a3)) +* New experimental docling-parse v2 backend 
([#131](https://github.com/docling-project/docling/issues/131)) ([`5e4944f`](https://github.com/docling-project/docling/commit/5e4944f15f0ac1faf3e6a532c3e3ab4da56517a3)) -## [v1.19.1](https://github.com/DS4SD/docling/releases/tag/v1.19.1) - 2024-10-11 +## [v1.19.1](https://github.com/docling-project/docling/releases/tag/v1.19.1) - 2024-10-11 ### Fix -* Remove stderr from tesseract cli and introduce fuzziness in the text validation of OCR tests ([#138](https://github.com/DS4SD/docling/issues/138)) ([`dae2a3b`](https://github.com/DS4SD/docling/commit/dae2a3b66732e1e135b00cce24226c7d9f2eb2e4)) +* Remove stderr from tesseract cli and introduce fuzziness in the text validation of OCR tests ([#138](https://github.com/docling-project/docling/issues/138)) ([`dae2a3b`](https://github.com/docling-project/docling/commit/dae2a3b66732e1e135b00cce24226c7d9f2eb2e4)) ### Documentation -* Simplify LlamaIndex example using Docling extension ([#135](https://github.com/DS4SD/docling/issues/135)) ([`5f1bd9e`](https://github.com/DS4SD/docling/commit/5f1bd9e9c8a19c667d1d587a557c3c36df494762)) +* Simplify LlamaIndex example using Docling extension ([#135](https://github.com/docling-project/docling/issues/135)) ([`5f1bd9e`](https://github.com/docling-project/docling/commit/5f1bd9e9c8a19c667d1d587a557c3c36df494762)) -## [v1.19.0](https://github.com/DS4SD/docling/releases/tag/v1.19.0) - 2024-10-08 +## [v1.19.0](https://github.com/docling-project/docling/releases/tag/v1.19.0) - 2024-10-08 ### Feature -* Add options for choosing OCR engines ([#118](https://github.com/DS4SD/docling/issues/118)) ([`f96ea86`](https://github.com/DS4SD/docling/commit/f96ea86a00fd1aafaa57025e46b5288b43958725)) +* Add options for choosing OCR engines ([#118](https://github.com/docling-project/docling/issues/118)) ([`f96ea86`](https://github.com/docling-project/docling/commit/f96ea86a00fd1aafaa57025e46b5288b43958725)) -## [v1.18.0](https://github.com/DS4SD/docling/releases/tag/v1.18.0) - 2024-10-03 +## 
[v1.18.0](https://github.com/docling-project/docling/releases/tag/v1.18.0) - 2024-10-03 ### Feature -* New torch-based docling models ([#120](https://github.com/DS4SD/docling/issues/120)) ([`2422f70`](https://github.com/DS4SD/docling/commit/2422f706a1b02a679bcbaaba097fef2f69aba0f4)) +* New torch-based docling models ([#120](https://github.com/docling-project/docling/issues/120)) ([`2422f70`](https://github.com/docling-project/docling/commit/2422f706a1b02a679bcbaaba097fef2f69aba0f4)) -## [v1.17.0](https://github.com/DS4SD/docling/releases/tag/v1.17.0) - 2024-10-03 +## [v1.17.0](https://github.com/docling-project/docling/releases/tag/v1.17.0) - 2024-10-03 ### Feature -* Windows support ([#122](https://github.com/DS4SD/docling/issues/122)) ([`d44c62d`](https://github.com/DS4SD/docling/commit/d44c62d7ce6990bbc78bf53315dd76d35d1f6c2e)) +* Windows support ([#122](https://github.com/docling-project/docling/issues/122)) ([`d44c62d`](https://github.com/docling-project/docling/commit/d44c62d7ce6990bbc78bf53315dd76d35d1f6c2e)) -## [v1.16.1](https://github.com/DS4SD/docling/releases/tag/v1.16.1) - 2024-09-27 +## [v1.16.1](https://github.com/docling-project/docling/releases/tag/v1.16.1) - 2024-09-27 ### Fix -* Allow usage of opencv 4.6.x ([#110](https://github.com/DS4SD/docling/issues/110)) ([`34bd887`](https://github.com/DS4SD/docling/commit/34bd887a7f9c11b2b051bdb4707dfdc5f43e6ad5)) +* Allow usage of opencv 4.6.x ([#110](https://github.com/docling-project/docling/issues/110)) ([`34bd887`](https://github.com/docling-project/docling/commit/34bd887a7f9c11b2b051bdb4707dfdc5f43e6ad5)) ### Documentation -* Document chunking ([#111](https://github.com/DS4SD/docling/issues/111)) ([`c05b692`](https://github.com/DS4SD/docling/commit/c05b692d69b6dae1ac5f518e84b17f32e7d94372)) +* Document chunking ([#111](https://github.com/docling-project/docling/issues/111)) ([`c05b692`](https://github.com/docling-project/docling/commit/c05b692d69b6dae1ac5f518e84b17f32e7d94372)) -## 
[v1.16.0](https://github.com/DS4SD/docling/releases/tag/v1.16.0) - 2024-09-27 +## [v1.16.0](https://github.com/docling-project/docling/releases/tag/v1.16.0) - 2024-09-27 ### Feature -* Support tableformer model choice ([#90](https://github.com/DS4SD/docling/issues/90)) ([`d6df76f`](https://github.com/DS4SD/docling/commit/d6df76f90b249bf48a509b63fa18f570be39482e)) +* Support tableformer model choice ([#90](https://github.com/docling-project/docling/issues/90)) ([`d6df76f`](https://github.com/docling-project/docling/commit/d6df76f90b249bf48a509b63fa18f570be39482e)) -## [v1.15.0](https://github.com/DS4SD/docling/releases/tag/v1.15.0) - 2024-09-24 +## [v1.15.0](https://github.com/docling-project/docling/releases/tag/v1.15.0) - 2024-09-24 ### Feature -* Add figure in markdown ([#98](https://github.com/DS4SD/docling/issues/98)) ([`6a03c20`](https://github.com/DS4SD/docling/commit/6a03c208ecc9176b0be413594114ce8a3f213371)) +* Add figure in markdown ([#98](https://github.com/docling-project/docling/issues/98)) ([`6a03c20`](https://github.com/docling-project/docling/commit/6a03c208ecc9176b0be413594114ce8a3f213371)) -## [v1.14.0](https://github.com/DS4SD/docling/releases/tag/v1.14.0) - 2024-09-24 +## [v1.14.0](https://github.com/docling-project/docling/releases/tag/v1.14.0) - 2024-09-24 ### Feature -* Add URL support to CLI ([#99](https://github.com/DS4SD/docling/issues/99)) ([`3c46e42`](https://github.com/DS4SD/docling/commit/3c46e4266cf1ad8d3a99aa33636d84d34222b4fe)) +* Add URL support to CLI ([#99](https://github.com/docling-project/docling/issues/99)) ([`3c46e42`](https://github.com/docling-project/docling/commit/3c46e4266cf1ad8d3a99aa33636d84d34222b4fe)) ### Fix -* Fix OCR setting for pypdfium, minor refactor ([#102](https://github.com/DS4SD/docling/issues/102)) ([`d96b96c`](https://github.com/DS4SD/docling/commit/d96b96c8481a8ae68545a34aaf9b8d5a6637a6be)) +* Fix OCR setting for pypdfium, minor refactor ([#102](https://github.com/docling-project/docling/issues/102)) 
([`d96b96c`](https://github.com/docling-project/docling/commit/d96b96c8481a8ae68545a34aaf9b8d5a6637a6be)) ### Documentation -* Document CLI, minor README revamp ([#100](https://github.com/DS4SD/docling/issues/100)) ([`f8f2303`](https://github.com/DS4SD/docling/commit/f8f2303348c4bbcb7903ff172746a69607e53271)) +* Document CLI, minor README revamp ([#100](https://github.com/docling-project/docling/issues/100)) ([`f8f2303`](https://github.com/docling-project/docling/commit/f8f2303348c4bbcb7903ff172746a69607e53271)) -## [v1.13.1](https://github.com/DS4SD/docling/releases/tag/v1.13.1) - 2024-09-23 +## [v1.13.1](https://github.com/docling-project/docling/releases/tag/v1.13.1) - 2024-09-23 ### Fix -* Updated the render_as_doctags with the new arguments from docling-core ([#93](https://github.com/DS4SD/docling/issues/93)) ([`4794ce4`](https://github.com/DS4SD/docling/commit/4794ce460a542a730fd5a72a7be7f94a07ed5d12)) +* Updated the render_as_doctags with the new arguments from docling-core ([#93](https://github.com/docling-project/docling/issues/93)) ([`4794ce4`](https://github.com/docling-project/docling/commit/4794ce460a542a730fd5a72a7be7f94a07ed5d12)) -## [v1.13.0](https://github.com/DS4SD/docling/releases/tag/v1.13.0) - 2024-09-18 +## [v1.13.0](https://github.com/docling-project/docling/releases/tag/v1.13.0) - 2024-09-18 ### Feature -* Add table exports ([#86](https://github.com/DS4SD/docling/issues/86)) ([`f19bd43`](https://github.com/DS4SD/docling/commit/f19bd437984f77067d33d591e25c5d5c92d7e0a9)) +* Add table exports ([#86](https://github.com/docling-project/docling/issues/86)) ([`f19bd43`](https://github.com/docling-project/docling/commit/f19bd437984f77067d33d591e25c5d5c92d7e0a9)) ### Fix -* Bumped the glm version and adjusted the tests ([#83](https://github.com/DS4SD/docling/issues/83)) ([`442443a`](https://github.com/DS4SD/docling/commit/442443a102d91b19a7eb38b316dada89c86ea8a8)) +* Bumped the glm version and adjusted the tests 
([#83](https://github.com/docling-project/docling/issues/83)) ([`442443a`](https://github.com/docling-project/docling/commit/442443a102d91b19a7eb38b316dada89c86ea8a8)) ### Documentation -* Updated Docling logo.png with transparent background ([#88](https://github.com/DS4SD/docling/issues/88)) ([`0da7519`](https://github.com/DS4SD/docling/commit/0da75198967c9cffd42be3f3acd6ade2341fc1f5)) +* Updated Docling logo.png with transparent background ([#88](https://github.com/docling-project/docling/issues/88)) ([`0da7519`](https://github.com/docling-project/docling/commit/0da75198967c9cffd42be3f3acd6ade2341fc1f5)) -## [v1.12.2](https://github.com/DS4SD/docling/releases/tag/v1.12.2) - 2024-09-17 +## [v1.12.2](https://github.com/docling-project/docling/releases/tag/v1.12.2) - 2024-09-17 ### Fix -* **tests:** Adjust the test data to match the new version of LayoutPredictor ([#82](https://github.com/DS4SD/docling/issues/82)) ([`fa9699f`](https://github.com/DS4SD/docling/commit/fa9699fa3cd2d367382d7b952d0365983a870848)) +* **tests:** Adjust the test data to match the new version of LayoutPredictor ([#82](https://github.com/docling-project/docling/issues/82)) ([`fa9699f`](https://github.com/docling-project/docling/commit/fa9699fa3cd2d367382d7b952d0365983a870848)) -## [v1.12.1](https://github.com/DS4SD/docling/releases/tag/v1.12.1) - 2024-09-16 +## [v1.12.1](https://github.com/docling-project/docling/releases/tag/v1.12.1) - 2024-09-16 ### Fix -* CLI compatibility with python 3.10 and 3.11 ([#79](https://github.com/DS4SD/docling/issues/79)) ([`2870fdc`](https://github.com/DS4SD/docling/commit/2870fdc857d02efeb8f1de7852e9577dd3eb2f51)) +* CLI compatibility with python 3.10 and 3.11 ([#79](https://github.com/docling-project/docling/issues/79)) ([`2870fdc`](https://github.com/docling-project/docling/commit/2870fdc857d02efeb8f1de7852e9577dd3eb2f51)) -## [v1.12.0](https://github.com/DS4SD/docling/releases/tag/v1.12.0) - 2024-09-13 +## 
[v1.12.0](https://github.com/docling-project/docling/releases/tag/v1.12.0) - 2024-09-13 ### Feature -* Add docling cli ([#75](https://github.com/DS4SD/docling/issues/75)) ([`9899078`](https://github.com/DS4SD/docling/commit/98990784dfa6009b72ee2e1508948b22b00245ec)) +* Add docling cli ([#75](https://github.com/docling-project/docling/issues/75)) ([`9899078`](https://github.com/docling-project/docling/commit/98990784dfa6009b72ee2e1508948b22b00245ec)) ### Documentation -* Showcase RAG with LlamaIndex and LangChain ([#71](https://github.com/DS4SD/docling/issues/71)) ([`53569a1`](https://github.com/DS4SD/docling/commit/53569a10238a821dfbbfcef9d2376d179e62a1db)) +* Showcase RAG with LlamaIndex and LangChain ([#71](https://github.com/docling-project/docling/issues/71)) ([`53569a1`](https://github.com/docling-project/docling/commit/53569a10238a821dfbbfcef9d2376d179e62a1db)) -## [v1.11.0](https://github.com/DS4SD/docling/releases/tag/v1.11.0) - 2024-09-10 +## [v1.11.0](https://github.com/docling-project/docling/releases/tag/v1.11.0) - 2024-09-10 ### Feature -* Adding txt and doctags output ([#68](https://github.com/DS4SD/docling/issues/68)) ([`bdfdfbf`](https://github.com/DS4SD/docling/commit/bdfdfbf092fdaca43ddef28f763ef04456b82890)) +* Adding txt and doctags output ([#68](https://github.com/docling-project/docling/issues/68)) ([`bdfdfbf`](https://github.com/docling-project/docling/commit/bdfdfbf092fdaca43ddef28f763ef04456b82890)) -## [v1.10.0](https://github.com/DS4SD/docling/releases/tag/v1.10.0) - 2024-09-10 +## [v1.10.0](https://github.com/docling-project/docling/releases/tag/v1.10.0) - 2024-09-10 ### Feature -* Linux arm64 support and reducing dependencies ([#69](https://github.com/DS4SD/docling/issues/69)) ([`27a7a15`](https://github.com/DS4SD/docling/commit/27a7a152e1123df7a22c44bb1adab5acce8f5390)) +* Linux arm64 support and reducing dependencies ([#69](https://github.com/docling-project/docling/issues/69)) 
([`27a7a15`](https://github.com/docling-project/docling/commit/27a7a152e1123df7a22c44bb1adab5acce8f5390)) -## [v1.9.0](https://github.com/DS4SD/docling/releases/tag/v1.9.0) - 2024-09-03 +## [v1.9.0](https://github.com/docling-project/docling/releases/tag/v1.9.0) - 2024-09-03 ### Feature -* Export document pages as multimodal output ([#54](https://github.com/DS4SD/docling/issues/54)) ([`1de2e4f`](https://github.com/DS4SD/docling/commit/1de2e4f924f562139c2a1e6314364845f9256575)) +* Export document pages as multimodal output ([#54](https://github.com/docling-project/docling/issues/54)) ([`1de2e4f`](https://github.com/docling-project/docling/commit/1de2e4f924f562139c2a1e6314364845f9256575)) ### Documentation -* Update MAINTAINERS.md ([#59](https://github.com/DS4SD/docling/issues/59)) ([`69e5d95`](https://github.com/DS4SD/docling/commit/69e5d951a389a9d36134629cfa2a0496c3bf095a)) -* Mention quackling on README ([#58](https://github.com/DS4SD/docling/issues/58)) ([`85b7348`](https://github.com/DS4SD/docling/commit/85b7348846c87b28981f23c4855e49857c5bb782)) +* Update MAINTAINERS.md ([#59](https://github.com/docling-project/docling/issues/59)) ([`69e5d95`](https://github.com/docling-project/docling/commit/69e5d951a389a9d36134629cfa2a0496c3bf095a)) +* Mention quackling on README ([#58](https://github.com/docling-project/docling/issues/58)) ([`85b7348`](https://github.com/docling-project/docling/commit/85b7348846c87b28981f23c4855e49857c5bb782)) -## [v1.8.5](https://github.com/DS4SD/docling/releases/tag/v1.8.5) - 2024-08-30 +## [v1.8.5](https://github.com/docling-project/docling/releases/tag/v1.8.5) - 2024-08-30 ### Fix -* Add unit tests ([#51](https://github.com/DS4SD/docling/issues/51)) ([`48f4d1b`](https://github.com/DS4SD/docling/commit/48f4d1ba5288b54d96740a1132b0d7977bef01cf)) +* Add unit tests ([#51](https://github.com/docling-project/docling/issues/51)) ([`48f4d1b`](https://github.com/docling-project/docling/commit/48f4d1ba5288b54d96740a1132b0d7977bef01cf)) -## 
[v1.8.4](https://github.com/DS4SD/docling/releases/tag/v1.8.4) - 2024-08-30 +## [v1.8.4](https://github.com/docling-project/docling/releases/tag/v1.8.4) - 2024-08-30 ### Fix -* Propagate row_section in tables ([#57](https://github.com/DS4SD/docling/issues/57)) ([`de85e46`](https://github.com/DS4SD/docling/commit/de85e46ced293bdef7957f72fff301fec178cc94)) +* Propagate row_section in tables ([#57](https://github.com/docling-project/docling/issues/57)) ([`de85e46`](https://github.com/docling-project/docling/commit/de85e46ced293bdef7957f72fff301fec178cc94)) ### Documentation -* Add instructions for cpu-only installation ([#56](https://github.com/DS4SD/docling/issues/56)) ([`a8a60d5`](https://github.com/DS4SD/docling/commit/a8a60d52b17fc25e71a421d4f89240bc7f02e154)) +* Add instructions for cpu-only installation ([#56](https://github.com/docling-project/docling/issues/56)) ([`a8a60d5`](https://github.com/docling-project/docling/commit/a8a60d52b17fc25e71a421d4f89240bc7f02e154)) -## [v1.8.3](https://github.com/DS4SD/docling/releases/tag/v1.8.3) - 2024-08-28 +## [v1.8.3](https://github.com/docling-project/docling/releases/tag/v1.8.3) - 2024-08-28 ### Fix -* Table cells overlap and model warnings ([#53](https://github.com/DS4SD/docling/issues/53)) ([`f49ee82`](https://github.com/DS4SD/docling/commit/f49ee825c3b95ffd5de29242aec764b074c773f7)) +* Table cells overlap and model warnings ([#53](https://github.com/docling-project/docling/issues/53)) ([`f49ee82`](https://github.com/docling-project/docling/commit/f49ee825c3b95ffd5de29242aec764b074c773f7)) -## [v1.8.2](https://github.com/DS4SD/docling/releases/tag/v1.8.2) - 2024-08-27 +## [v1.8.2](https://github.com/docling-project/docling/releases/tag/v1.8.2) - 2024-08-27 ### Fix -* Refine conversion result ([#52](https://github.com/DS4SD/docling/issues/52)) ([`e46a66a`](https://github.com/DS4SD/docling/commit/e46a66a17606a26f351b798ecf4fdeae71465f9c)) +* Refine conversion result 
([#52](https://github.com/docling-project/docling/issues/52)) ([`e46a66a`](https://github.com/docling-project/docling/commit/e46a66a17606a26f351b798ecf4fdeae71465f9c)) ### Documentation -* Update interface in README ([#50](https://github.com/DS4SD/docling/issues/50)) ([`fe817b1`](https://github.com/DS4SD/docling/commit/fe817b11d730c55d48b6a60fc4e6f173da51a66b)) +* Update interface in README ([#50](https://github.com/docling-project/docling/issues/50)) ([`fe817b1`](https://github.com/docling-project/docling/commit/fe817b11d730c55d48b6a60fc4e6f173da51a66b)) -## [v1.8.1](https://github.com/DS4SD/docling/releases/tag/v1.8.1) - 2024-08-26 +## [v1.8.1](https://github.com/docling-project/docling/releases/tag/v1.8.1) - 2024-08-26 ### Fix -* Align output formats ([#49](https://github.com/DS4SD/docling/issues/49)) ([`8cc147b`](https://github.com/DS4SD/docling/commit/8cc147bc56753144915709a48b08830d0c3ad44e)) +* Align output formats ([#49](https://github.com/docling-project/docling/issues/49)) ([`8cc147b`](https://github.com/docling-project/docling/commit/8cc147bc56753144915709a48b08830d0c3ad44e)) -## [v1.8.0](https://github.com/DS4SD/docling/releases/tag/v1.8.0) - 2024-08-23 +## [v1.8.0](https://github.com/docling-project/docling/releases/tag/v1.8.0) - 2024-08-23 ### Feature -* Page-level error reporting from PDF backend, introduce PARTIAL_SUCCESS status ([#47](https://github.com/DS4SD/docling/issues/47)) ([`a294b7e`](https://github.com/DS4SD/docling/commit/a294b7e64a4d66ebb9fd328c084e5f74647805ee)) +* Page-level error reporting from PDF backend, introduce PARTIAL_SUCCESS status ([#47](https://github.com/docling-project/docling/issues/47)) ([`a294b7e`](https://github.com/docling-project/docling/commit/a294b7e64a4d66ebb9fd328c084e5f74647805ee)) -## [v1.7.1](https://github.com/DS4SD/docling/releases/tag/v1.7.1) - 2024-08-23 +## [v1.7.1](https://github.com/docling-project/docling/releases/tag/v1.7.1) - 2024-08-23 ### Fix -* Better raise exception when a page fails to parse 
([#46](https://github.com/DS4SD/docling/issues/46)) ([`8808463`](https://github.com/DS4SD/docling/commit/8808463cecd7ff3a92bd99d2e3d65fd248672c9e)) -* Upgrade docling-parse to 1.1.1, safety checks for failed parse on pages ([#45](https://github.com/DS4SD/docling/issues/45)) ([`7e84533`](https://github.com/DS4SD/docling/commit/7e845332992ab37386daee087573773051bfd065)) +* Better raise exception when a page fails to parse ([#46](https://github.com/docling-project/docling/issues/46)) ([`8808463`](https://github.com/docling-project/docling/commit/8808463cecd7ff3a92bd99d2e3d65fd248672c9e)) +* Upgrade docling-parse to 1.1.1, safety checks for failed parse on pages ([#45](https://github.com/docling-project/docling/issues/45)) ([`7e84533`](https://github.com/docling-project/docling/commit/7e845332992ab37386daee087573773051bfd065)) -## [v1.7.0](https://github.com/DS4SD/docling/releases/tag/v1.7.0) - 2024-08-22 +## [v1.7.0](https://github.com/docling-project/docling/releases/tag/v1.7.0) - 2024-08-22 ### Feature -* Upgrade docling-parse PDF backend and interface to use page-by-page parsing ([#44](https://github.com/DS4SD/docling/issues/44)) ([`a8c6b29`](https://github.com/DS4SD/docling/commit/a8c6b29a67ca303d6eec3fabb6b5e75ad5a7676d)) +* Upgrade docling-parse PDF backend and interface to use page-by-page parsing ([#44](https://github.com/docling-project/docling/issues/44)) ([`a8c6b29`](https://github.com/docling-project/docling/commit/a8c6b29a67ca303d6eec3fabb6b5e75ad5a7676d)) -## [v1.6.3](https://github.com/DS4SD/docling/releases/tag/v1.6.3) - 2024-08-22 +## [v1.6.3](https://github.com/docling-project/docling/releases/tag/v1.6.3) - 2024-08-22 ### Fix -* Usage of bytesio with docling-parse ([#43](https://github.com/DS4SD/docling/issues/43)) ([`fac5745`](https://github.com/DS4SD/docling/commit/fac5745dc846281bfae64bc631658bb2a2c90982)) +* Usage of bytesio with docling-parse ([#43](https://github.com/docling-project/docling/issues/43)) 
([`fac5745`](https://github.com/docling-project/docling/commit/fac5745dc846281bfae64bc631658bb2a2c90982)) -## [v1.6.2](https://github.com/DS4SD/docling/releases/tag/v1.6.2) - 2024-08-22 +## [v1.6.2](https://github.com/docling-project/docling/releases/tag/v1.6.2) - 2024-08-22 ### Fix -* Remove [ocr] extra to fix wheel install ([#42](https://github.com/DS4SD/docling/issues/42)) ([`6995268`](https://github.com/DS4SD/docling/commit/69952682edd014a3f252e9c87edffa7c34f1033b)) +* Remove [ocr] extra to fix wheel install ([#42](https://github.com/docling-project/docling/issues/42)) ([`6995268`](https://github.com/docling-project/docling/commit/69952682edd014a3f252e9c87edffa7c34f1033b)) -## [v1.6.1](https://github.com/DS4SD/docling/releases/tag/v1.6.1) - 2024-08-21 +## [v1.6.1](https://github.com/docling-project/docling/releases/tag/v1.6.1) - 2024-08-21 ### Fix -* Add scipy as dependency ([#40](https://github.com/DS4SD/docling/issues/40)) ([`f19871a`](https://github.com/DS4SD/docling/commit/f19871a5a164b5369da10f7753d7c7da7fde35cc)) +* Add scipy as dependency ([#40](https://github.com/docling-project/docling/issues/40)) ([`f19871a`](https://github.com/docling-project/docling/commit/f19871a5a164b5369da10f7753d7c7da7fde35cc)) -## [v1.6.0](https://github.com/DS4SD/docling/releases/tag/v1.6.0) - 2024-08-20 +## [v1.6.0](https://github.com/docling-project/docling/releases/tag/v1.6.0) - 2024-08-20 ### Feature -* Add adaptive OCR, factor out treatment of OCR areas and cell filtering ([#38](https://github.com/DS4SD/docling/issues/38)) ([`e94d317`](https://github.com/DS4SD/docling/commit/e94d317c022d2b916332d43cdc2aa90fd4738df9)) +* Add adaptive OCR, factor out treatment of OCR areas and cell filtering ([#38](https://github.com/docling-project/docling/issues/38)) ([`e94d317`](https://github.com/docling-project/docling/commit/e94d317c022d2b916332d43cdc2aa90fd4738df9)) -## [v1.5.0](https://github.com/DS4SD/docling/releases/tag/v1.5.0) - 2024-08-20 +## 
[v1.5.0](https://github.com/docling-project/docling/releases/tag/v1.5.0) - 2024-08-20 ### Feature -* Allow computing page images on-demand with scale and cache them ([#36](https://github.com/DS4SD/docling/issues/36)) ([`78347bf`](https://github.com/DS4SD/docling/commit/78347bf679c393378eab0bd383929fced88afeae)) +* Allow computing page images on-demand with scale and cache them ([#36](https://github.com/docling-project/docling/issues/36)) ([`78347bf`](https://github.com/docling-project/docling/commit/78347bf679c393378eab0bd383929fced88afeae)) ### Documentation -* Add technical paper ref ([#37](https://github.com/DS4SD/docling/issues/37)) ([`a13114b`](https://github.com/DS4SD/docling/commit/a13114bafdcf4b62eb97df32cbfaf5695596b77c)) +* Add technical paper ref ([#37](https://github.com/docling-project/docling/issues/37)) ([`a13114b`](https://github.com/docling-project/docling/commit/a13114bafdcf4b62eb97df32cbfaf5695596b77c)) -## [v1.4.0](https://github.com/DS4SD/docling/releases/tag/v1.4.0) - 2024-08-14 +## [v1.4.0](https://github.com/docling-project/docling/releases/tag/v1.4.0) - 2024-08-14 ### Feature -* Update parser with bytesio interface and set as new default backend ([#32](https://github.com/DS4SD/docling/issues/32)) ([`90dd676`](https://github.com/DS4SD/docling/commit/90dd676422f87584395a8949fa842fc9a6bdbd19)) +* Update parser with bytesio interface and set as new default backend ([#32](https://github.com/docling-project/docling/issues/32)) ([`90dd676`](https://github.com/docling-project/docling/commit/90dd676422f87584395a8949fa842fc9a6bdbd19)) ### Fix -* Allow newer torch versions ([#34](https://github.com/DS4SD/docling/issues/34)) ([`349b0e9`](https://github.com/DS4SD/docling/commit/349b0e914f7194ee778571a7189b7eaff6f5966a)) +* Allow newer torch versions ([#34](https://github.com/docling-project/docling/issues/34)) ([`349b0e9`](https://github.com/docling-project/docling/commit/349b0e914f7194ee778571a7189b7eaff6f5966a)) -## 
[v1.3.0](https://github.com/DS4SD/docling/releases/tag/v1.3.0) - 2024-08-12 +## [v1.3.0](https://github.com/docling-project/docling/releases/tag/v1.3.0) - 2024-08-12 ### Feature -* Output page images and extracted bbox ([#31](https://github.com/DS4SD/docling/issues/31)) ([`63d80ed`](https://github.com/DS4SD/docling/commit/63d80edca2fa4e64a07d8b00172d563d81ecb781)) +* Output page images and extracted bbox ([#31](https://github.com/docling-project/docling/issues/31)) ([`63d80ed`](https://github.com/docling-project/docling/commit/63d80edca2fa4e64a07d8b00172d563d81ecb781)) -## [v1.2.1](https://github.com/DS4SD/docling/releases/tag/v1.2.1) - 2024-08-07 +## [v1.2.1](https://github.com/docling-project/docling/releases/tag/v1.2.1) - 2024-08-07 ### Fix -* Update (vuln) deps ([#29](https://github.com/DS4SD/docling/issues/29)) ([`79ef8d2`](https://github.com/DS4SD/docling/commit/79ef8d2f2f6732f94c6777877ac9d0a45915ac84)) -* Type of path_or_stream in PdfDocumentBackend ([#28](https://github.com/DS4SD/docling/issues/28)) ([`794b20a`](https://github.com/DS4SD/docling/commit/794b20a50ad089b39d4a4a84dcd826935b2b83ed)) +* Update (vuln) deps ([#29](https://github.com/docling-project/docling/issues/29)) ([`79ef8d2`](https://github.com/docling-project/docling/commit/79ef8d2f2f6732f94c6777877ac9d0a45915ac84)) +* Type of path_or_stream in PdfDocumentBackend ([#28](https://github.com/docling-project/docling/issues/28)) ([`794b20a`](https://github.com/docling-project/docling/commit/794b20a50ad089b39d4a4a84dcd826935b2b83ed)) ### Documentation -* Improve examples ([#27](https://github.com/DS4SD/docling/issues/27)) ([`9550db8`](https://github.com/DS4SD/docling/commit/9550db8e64c4d638a429be33c10f10f18871f795)) +* Improve examples ([#27](https://github.com/docling-project/docling/issues/27)) ([`9550db8`](https://github.com/docling-project/docling/commit/9550db8e64c4d638a429be33c10f10f18871f795)) -## [v1.2.0](https://github.com/DS4SD/docling/releases/tag/v1.2.0) - 2024-08-07 +## 
[v1.2.0](https://github.com/docling-project/docling/releases/tag/v1.2.0) - 2024-08-07 ### Feature -* Introducing docling_backend ([#26](https://github.com/DS4SD/docling/issues/26)) ([`b8f5e38`](https://github.com/DS4SD/docling/commit/b8f5e38a8c8b3fd734fa119cae216a3da0b363f7)) +* Introducing docling_backend ([#26](https://github.com/docling-project/docling/issues/26)) ([`b8f5e38`](https://github.com/docling-project/docling/commit/b8f5e38a8c8b3fd734fa119cae216a3da0b363f7)) -## [v1.1.2](https://github.com/DS4SD/docling/releases/tag/v1.1.2) - 2024-07-31 +## [v1.1.2](https://github.com/docling-project/docling/releases/tag/v1.1.2) - 2024-07-31 ### Fix -* Set page number using 1-based indexing ([#22](https://github.com/DS4SD/docling/issues/22)) ([`d2d9543`](https://github.com/DS4SD/docling/commit/d2d9543415d37c54add917803b96d9959dc4d2cc)) +* Set page number using 1-based indexing ([#22](https://github.com/docling-project/docling/issues/22)) ([`d2d9543`](https://github.com/docling-project/docling/commit/d2d9543415d37c54add917803b96d9959dc4d2cc)) -## [v1.1.1](https://github.com/DS4SD/docling/releases/tag/v1.1.1) - 2024-07-30 +## [v1.1.1](https://github.com/docling-project/docling/releases/tag/v1.1.1) - 2024-07-30 ### Fix -* Correct text extraction for table cells ([#21](https://github.com/DS4SD/docling/issues/21)) ([`f4bf3d2`](https://github.com/DS4SD/docling/commit/f4bf3d25b955b71729833a18aa3a5b643fecfa75)) +* Correct text extraction for table cells ([#21](https://github.com/docling-project/docling/issues/21)) ([`f4bf3d2`](https://github.com/docling-project/docling/commit/f4bf3d25b955b71729833a18aa3a5b643fecfa75)) -## [v1.1.0](https://github.com/DS4SD/docling/releases/tag/v1.1.0) - 2024-07-26 +## [v1.1.0](https://github.com/docling-project/docling/releases/tag/v1.1.0) - 2024-07-26 ### Feature -* Add simplified single-doc conversion ([#20](https://github.com/DS4SD/docling/issues/20)) 
([`d603137`](https://github.com/DS4SD/docling/commit/d60313738340c20f9af64dfe51e28b7670ff64ef)) +* Add simplified single-doc conversion ([#20](https://github.com/docling-project/docling/issues/20)) ([`d603137`](https://github.com/docling-project/docling/commit/d60313738340c20f9af64dfe51e28b7670ff64ef)) -## [v1.0.2](https://github.com/DS4SD/docling/releases/tag/v1.0.2) - 2024-07-24 +## [v1.0.2](https://github.com/docling-project/docling/releases/tag/v1.0.2) - 2024-07-24 ### Fix -* Add easyocr to main deps for valid extra ([#19](https://github.com/DS4SD/docling/issues/19)) ([`54b3dda`](https://github.com/DS4SD/docling/commit/54b3dda141fc09e8c17ba4cb301d0c4394b680d8)) +* Add easyocr to main deps for valid extra ([#19](https://github.com/docling-project/docling/issues/19)) ([`54b3dda`](https://github.com/docling-project/docling/commit/54b3dda141fc09e8c17ba4cb301d0c4394b680d8)) -## [v1.0.1](https://github.com/DS4SD/docling/releases/tag/v1.0.1) - 2024-07-24 +## [v1.0.1](https://github.com/docling-project/docling/releases/tag/v1.0.1) - 2024-07-24 ### Fix -* Expose ocr as extra ([#18](https://github.com/DS4SD/docling/issues/18)) ([`b0725e0`](https://github.com/DS4SD/docling/commit/b0725e0aa693058b4962efa69730777dbe1d5bec)) +* Expose ocr as extra ([#18](https://github.com/docling-project/docling/issues/18)) ([`b0725e0`](https://github.com/docling-project/docling/commit/b0725e0aa693058b4962efa69730777dbe1d5bec)) -## [v1.0.0](https://github.com/DS4SD/docling/releases/tag/v1.0.0) - 2024-07-18 +## [v1.0.0](https://github.com/docling-project/docling/releases/tag/v1.0.0) - 2024-07-18 ### Feature -* V1.0.0 release ([#16](https://github.com/DS4SD/docling/issues/16)) ([`71c3a9c`](https://github.com/DS4SD/docling/commit/71c3a9c8cde5b3a8884430eddcb33a9fbd7bf354)) +* V1.0.0 release ([#16](https://github.com/docling-project/docling/issues/16)) ([`71c3a9c`](https://github.com/docling-project/docling/commit/71c3a9c8cde5b3a8884430eddcb33a9fbd7bf354)) ### Breaking -* v1.0.0 release 
([#16](https://github.com/DS4SD/docling/issues/16)) ([`71c3a9c`](https://github.com/DS4SD/docling/commit/71c3a9c8cde5b3a8884430eddcb33a9fbd7bf354)) +* v1.0.0 release ([#16](https://github.com/docling-project/docling/issues/16)) ([`71c3a9c`](https://github.com/docling-project/docling/commit/71c3a9c8cde5b3a8884430eddcb33a9fbd7bf354)) -## [v0.4.0](https://github.com/DS4SD/docling/releases/tag/v0.4.0) - 2024-07-17 +## [v0.4.0](https://github.com/docling-project/docling/releases/tag/v0.4.0) - 2024-07-17 ### Feature -* Optimize table extraction quality, add configuration options ([#11](https://github.com/DS4SD/docling/issues/11)) ([`e9526bb`](https://github.com/DS4SD/docling/commit/e9526bb11e21dc85c787af5c38e6f77eaca05f69)) +* Optimize table extraction quality, add configuration options ([#11](https://github.com/docling-project/docling/issues/11)) ([`e9526bb`](https://github.com/docling-project/docling/commit/e9526bb11e21dc85c787af5c38e6f77eaca05f69)) -## [v0.3.1](https://github.com/DS4SD/docling/releases/tag/v0.3.1) - 2024-07-17 +## [v0.3.1](https://github.com/docling-project/docling/releases/tag/v0.3.1) - 2024-07-17 ### Fix -* Missing type for default values ([#12](https://github.com/DS4SD/docling/issues/12)) ([`d1d1724`](https://github.com/DS4SD/docling/commit/d1d1724537d6a1f37591cdea44052207caae2ee2)) +* Missing type for default values ([#12](https://github.com/docling-project/docling/issues/12)) ([`d1d1724`](https://github.com/docling-project/docling/commit/d1d1724537d6a1f37591cdea44052207caae2ee2)) ### Documentation -* Reflect supported Python versions, add badges ([#10](https://github.com/DS4SD/docling/issues/10)) ([`2baa35c`](https://github.com/DS4SD/docling/commit/2baa35c548dd6d15dba449eb1dc707f8f08c0a2a)) +* Reflect supported Python versions, add badges ([#10](https://github.com/docling-project/docling/issues/10)) ([`2baa35c`](https://github.com/docling-project/docling/commit/2baa35c548dd6d15dba449eb1dc707f8f08c0a2a)) -## 
[v0.3.0](https://github.com/DS4SD/docling/releases/tag/v0.3.0) - 2024-07-17 +## [v0.3.0](https://github.com/docling-project/docling/releases/tag/v0.3.0) - 2024-07-17 ### Feature -* Enable python 3.12 support by updating glm ([#8](https://github.com/DS4SD/docling/issues/8)) ([`fb72688`](https://github.com/DS4SD/docling/commit/fb72688ff7413083c864fe62d2dbfc420c1e5268)) +* Enable python 3.12 support by updating glm ([#8](https://github.com/docling-project/docling/issues/8)) ([`fb72688`](https://github.com/docling-project/docling/commit/fb72688ff7413083c864fe62d2dbfc420c1e5268)) ### Documentation -* Add setup with pypi to Readme ([#7](https://github.com/DS4SD/docling/issues/7)) ([`2803222`](https://github.com/DS4SD/docling/commit/2803222ee1708481c779d435dbf1c031929d3cf6)) +* Add setup with pypi to Readme ([#7](https://github.com/docling-project/docling/issues/7)) ([`2803222`](https://github.com/docling-project/docling/commit/2803222ee1708481c779d435dbf1c031929d3cf6)) -## [v0.2.0](https://github.com/DS4SD/docling/releases/tag/v0.2.0) - 2024-07-16 +## [v0.2.0](https://github.com/docling-project/docling/releases/tag/v0.2.0) - 2024-07-16 ### Feature -* Build with ci ([#6](https://github.com/DS4SD/docling/issues/6)) ([`b1479cf`](https://github.com/DS4SD/docling/commit/b1479cf4ecf8a586703b31c7cf6917b3293c6a85)) +* Build with ci ([#6](https://github.com/docling-project/docling/issues/6)) ([`b1479cf`](https://github.com/docling-project/docling/commit/b1479cf4ecf8a586703b31c7cf6917b3293c6a85)) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 770b923f..843c6daf 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -2,13 +2,13 @@ Our project welcomes external contributions. If you have an itch, please feel free to scratch it. -To contribute code or documentation, please submit a [pull request](https://github.com/DS4SD/docling/pulls). +To contribute code or documentation, please submit a [pull request](https://github.com/docling-project/docling/pulls). 
A good way to familiarize yourself with the codebase and contribution process is -to look for and tackle low-hanging fruit in the [issue tracker](https://github.com/DS4SD/docling/issues). +to look for and tackle low-hanging fruit in the [issue tracker](https://github.com/docling-project/docling/issues). Before embarking on a more ambitious contribution, please quickly [get in touch](#communication) with us. -For general questions or support requests, please refer to the [discussion section](https://github.com/DS4SD/docling/discussions). +For general questions or support requests, please refer to the [discussion section](https://github.com/docling-project/docling/discussions). **Note: We appreciate your effort and want to avoid situations where a contribution requires extensive rework (by you or by us), sits in the backlog for a long time, or @@ -16,14 +16,14 @@ cannot be accepted at all!** ### Proposing New Features -If you would like to implement a new feature, please [raise an issue](https://github.com/DS4SD/docling/issues) +If you would like to implement a new feature, please [raise an issue](https://github.com/docling-project/docling/issues) before sending a pull request so the feature can be discussed. This is to avoid you spending valuable time working on a feature that the project developers are not interested in accepting into the codebase. ### Fixing Bugs -If you would like to fix a bug, please [raise an issue](https://github.com/DS4SD/docling/issues) before sending a +If you would like to fix a bug, please [raise an issue](https://github.com/docling-project/docling/issues) before sending a pull request so it can be tracked. ### Merge Approval @@ -78,7 +78,7 @@ This project strictly adheres to using dependencies that are compatible with the ## Communication -Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions). 
+Please feel free to connect with us using the [discussion section](https://github.com/docling-project/docling/discussions). diff --git a/README.md b/README.md index 842253e9..69eccd7c 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@

- - Docling + + Docling

@@ -11,7 +11,7 @@

[![arXiv](https://img.shields.io/badge/arXiv-2408.09869-b31b1b.svg)](https://arxiv.org/abs/2408.09869) -[![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://ds4sd.github.io/docling/) +[![Docs](https://img.shields.io/badge/docs-live-brightgreen)](https://docling-project.github.io/docling/) [![PyPI version](https://img.shields.io/pypi/v/docling)](https://pypi.org/project/docling/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/docling)](https://pypi.org/project/docling/) [![Poetry](https://img.shields.io/endpoint?url=https://python-poetry.org/badge/v0.json)](https://python-poetry.org/) @@ -19,7 +19,7 @@ [![Imports: isort](https://img.shields.io/badge/%20imports-isort-%231674b1?style=flat&labelColor=ef8336)](https://pycqa.github.io/isort/) [![Pydantic v2](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/pydantic/pydantic/main/docs/badge/v2.json)](https://pydantic.dev) [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit&logoColor=white)](https://github.com/pre-commit/pre-commit) -[![License MIT](https://img.shields.io/github/license/DS4SD/docling)](https://opensource.org/licenses/MIT) +[![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT) [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling) Docling simplifies document processing, parsing diverse formats — including advanced PDF understanding — and providing seamless integrations with the gen AI ecosystem. @@ -51,7 +51,7 @@ pip install docling Works on macOS, Linux and Windows environments. Both x86_64 and arm64 architectures. -More [detailed installation instructions](https://ds4sd.github.io/docling/installation/) are available in the docs. +More [detailed installation instructions](https://docling-project.github.io/docling/installation/) are available in the docs. 
## Getting started @@ -66,28 +66,28 @@ result = converter.convert(source) print(result.document.export_to_markdown()) # output: "## Docling Technical Report[...]" ``` -More [advanced usage options](https://ds4sd.github.io/docling/usage/) are available in +More [advanced usage options](https://docling-project.github.io/docling/usage/) are available in the docs. ## Documentation -Check out Docling's [documentation](https://ds4sd.github.io/docling/), for details on +Check out Docling's [documentation](https://docling-project.github.io/docling/), for details on installation, usage, concepts, recipes, extensions, and more. ## Examples -Go hands-on with our [examples](https://ds4sd.github.io/docling/examples/), +Go hands-on with our [examples](https://docling-project.github.io/docling/examples/), demonstrating how to address different application use cases with Docling. ## Integrations To further accelerate your AI application development, check out Docling's native -[integrations](https://ds4sd.github.io/docling/integrations/) with popular frameworks +[integrations](https://docling-project.github.io/docling/integrations/) with popular frameworks and tools. ## Get help and support -Please feel free to connect with us using the [discussion section](https://github.com/DS4SD/docling/discussions). +Please feel free to connect with us using the [discussion section](https://github.com/docling-project/docling/discussions). ## Technical report @@ -95,7 +95,7 @@ For more details on Docling's inner workings, check out the [Docling Technical R ## Contributing -Please read [Contributing to Docling](https://github.com/DS4SD/docling/blob/main/CONTRIBUTING.md) for details. +Please read [Contributing to Docling](https://github.com/docling-project/docling/blob/main/CONTRIBUTING.md) for details. ## References @@ -123,6 +123,6 @@ For individual model usage, please refer to the model licenses found in the orig Docling has been brought to you by IBM. 
-[supported_formats]: https://ds4sd.github.io/docling/usage/supported_formats/ -[docling_document]: https://ds4sd.github.io/docling/concepts/docling_document/ -[integrations]: https://ds4sd.github.io/docling/integrations/ +[supported_formats]: https://docling-project.github.io/docling/usage/supported_formats/ +[docling_document]: https://docling-project.github.io/docling/concepts/docling_document/ +[integrations]: https://docling-project.github.io/docling/integrations/ diff --git a/docling/backend/asciidoc_backend.py b/docling/backend/asciidoc_backend.py index 397bfc44..09891eb8 100644 --- a/docling/backend/asciidoc_backend.py +++ b/docling/backend/asciidoc_backend.py @@ -380,7 +380,7 @@ class AsciiDocBackend(DeclarativeDocumentBackend): end_row_offset_idx=row_idx + row_span, start_col_offset_idx=col_idx, end_col_offset_idx=col_idx + col_span, - col_header=False, + column_header=row_idx == 0, row_header=False, ) data.table_cells.append(cell) diff --git a/docling/backend/csv_backend.py b/docling/backend/csv_backend.py index 9097acf8..9159bd47 100644 --- a/docling/backend/csv_backend.py +++ b/docling/backend/csv_backend.py @@ -111,7 +111,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend): end_row_offset_idx=row_idx + 1, start_col_offset_idx=col_idx, end_col_offset_idx=col_idx + 1, - col_header=row_idx == 0, # First row as header + column_header=row_idx == 0, # First row as header row_header=False, ) table_data.table_cells.append(cell) diff --git a/docling/backend/docx/__init__.py b/docling/backend/docx/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docling/backend/docx/latex/__init__.py b/docling/backend/docx/latex/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/docling/backend/docx/latex/latex_dict.py b/docling/backend/docx/latex/latex_dict.py new file mode 100644 index 00000000..280358be --- /dev/null +++ b/docling/backend/docx/latex/latex_dict.py @@ -0,0 +1,271 @@ +# -*- coding: utf-8 -*- + +""" +Adapted from 
https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py +On 23/01/2025 +""" + +from __future__ import unicode_literals + +CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~") + +BLANK = "" +BACKSLASH = "\\" +ALN = "&" + +CHR = { + # Unicode : Latex Math Symbols + # Top accents + "\u0300": "\\grave{{{0}}}", + "\u0301": "\\acute{{{0}}}", + "\u0302": "\\hat{{{0}}}", + "\u0303": "\\tilde{{{0}}}", + "\u0304": "\\bar{{{0}}}", + "\u0305": "\\overbar{{{0}}}", + "\u0306": "\\breve{{{0}}}", + "\u0307": "\\dot{{{0}}}", + "\u0308": "\\ddot{{{0}}}", + "\u0309": "\\ovhook{{{0}}}", + "\u030a": "\\ocirc{{{0}}}", + "\u030c": "\\check{{{0}}}", + "\u0310": "\\candra{{{0}}}", + "\u0312": "\\oturnedcomma{{{0}}}", + "\u0315": "\\ocommatopright{{{0}}}", + "\u031a": "\\droang{{{0}}}", + "\u0338": "\\not{{{0}}}", + "\u20d0": "\\leftharpoonaccent{{{0}}}", + "\u20d1": "\\rightharpoonaccent{{{0}}}", + "\u20d2": "\\vertoverlay{{{0}}}", + "\u20d6": "\\overleftarrow{{{0}}}", + "\u20d7": "\\vec{{{0}}}", + "\u20db": "\\dddot{{{0}}}", + "\u20dc": "\\ddddot{{{0}}}", + "\u20e1": "\\overleftrightarrow{{{0}}}", + "\u20e7": "\\annuity{{{0}}}", + "\u20e9": "\\widebridgeabove{{{0}}}", + "\u20f0": "\\asteraccent{{{0}}}", + # Bottom accents + "\u0330": "\\wideutilde{{{0}}}", + "\u0331": "\\underbar{{{0}}}", + "\u20e8": "\\threeunderdot{{{0}}}", + "\u20ec": "\\underrightharpoondown{{{0}}}", + "\u20ed": "\\underleftharpoondown{{{0}}}", + "\u20ee": "\\underleftarrow{{{0}}}", + "\u20ef": "\\underrightarrow{{{0}}}", + # Over | group + "\u23b4": "\\overbracket{{{0}}}", + "\u23dc": "\\overparen{{{0}}}", + "\u23de": "\\overbrace{{{0}}}", + # Under| group + "\u23b5": "\\underbracket{{{0}}}", + "\u23dd": "\\underparen{{{0}}}", + "\u23df": "\\underbrace{{{0}}}", +} + +CHR_BO = { + # Big operators, + "\u2140": "\\Bbbsum", + "\u220f": "\\prod", + "\u2210": "\\coprod", + "\u2211": "\\sum", + "\u222b": "\\int", + "\u22c0": "\\bigwedge", + "\u22c1": "\\bigvee", + "\u22c2": "\\bigcap", + "\u22c3": "\\bigcup", +
"\u2a00": "\\bigodot", + "\u2a01": "\\bigoplus", + "\u2a02": "\\bigotimes", +} + +T = { + "\u2192": "\\rightarrow ", + # Greek letters + "\U0001d6fc": "\\alpha ", + "\U0001d6fd": "\\beta ", + "\U0001d6fe": "\\gamma ", + "\U0001d6ff": "\\delta ", + "\U0001d700": "\\epsilon ", + "\U0001d701": "\\zeta ", + "\U0001d702": "\\eta ", + "\U0001d703": "\\theta ", + "\U0001d704": "\\iota ", + "\U0001d705": "\\kappa ", + "\U0001d706": "\\lambda ", + "\U0001d707": "\\mu ", + "\U0001d708": "\\nu ", + "\U0001d709": "\\xi ", + "\U0001d70a": "\\omicron ", + "\U0001d70b": "\\pi ", + "\U0001d70c": "\\rho ", + "\U0001d70d": "\\varsigma ", + "\U0001d70e": "\\sigma ", + "\U0001d70f": "\\tau ", + "\U0001d710": "\\upsilon ", + "\U0001d711": "\\phi ", + "\U0001d712": "\\chi ", + "\U0001d713": "\\psi ", + "\U0001d714": "\\omega ", + "\U0001d715": "\\partial ", + "\U0001d716": "\\varepsilon ", + "\U0001d717": "\\vartheta ", + "\U0001d718": "\\varkappa ", + "\U0001d719": "\\varphi ", + "\U0001d71a": "\\varrho ", + "\U0001d71b": "\\varpi ", + # Relation symbols + "\u2190": "\\leftarrow ", + "\u2191": "\\uparrow ", + "\u2192": "\\rightarrow ", + "\u2193": "\\downarrow ", + "\u2194": "\\leftrightarrow ", + "\u2195": "\\updownarrow ", + "\u2196": "\\nwarrow ", + "\u2197": "\\nearrow ", + "\u2198": "\\searrow ", + "\u2199": "\\swarrow ", + "\u22ee": "\\vdots ", + "\u22ef": "\\cdots ", + "\u22f0": "\\adots ", + "\u22f1": "\\ddots ", + "\u2260": "\\ne ", + "\u2264": "\\leq ", + "\u2265": "\\geq ", + "\u2266": "\\leqq ", + "\u2267": "\\geqq ", + "\u2268": "\\lneqq ", + "\u2269": "\\gneqq ", + "\u226a": "\\ll ", + "\u226b": "\\gg ", + "\u2208": "\\in ", + "\u2209": "\\notin ", + "\u220b": "\\ni ", + "\u220c": "\\nni ", + # Ordinary symbols + "\u221e": "\\infty ", + # Binary relations + "\u00b1": "\\pm ", + "\u2213": "\\mp ", + # Italic, Latin, uppercase + "\U0001d434": "A", + "\U0001d435": "B", + "\U0001d436": "C", + "\U0001d437": "D", + "\U0001d438": "E", + "\U0001d439": "F", + "\U0001d43a": "G", +
"\U0001d43b": "H", + "\U0001d43c": "I", + "\U0001d43d": "J", + "\U0001d43e": "K", + "\U0001d43f": "L", + "\U0001d440": "M", + "\U0001d441": "N", + "\U0001d442": "O", + "\U0001d443": "P", + "\U0001d444": "Q", + "\U0001d445": "R", + "\U0001d446": "S", + "\U0001d447": "T", + "\U0001d448": "U", + "\U0001d449": "V", + "\U0001d44a": "W", + "\U0001d44b": "X", + "\U0001d44c": "Y", + "\U0001d44d": "Z", + # Italic, Latin, lowercase + "\U0001d44e": "a", + "\U0001d44f": "b", + "\U0001d450": "c", + "\U0001d451": "d", + "\U0001d452": "e", + "\U0001d453": "f", + "\U0001d454": "g", + "\U0001d456": "i", + "\U0001d457": "j", + "\U0001d458": "k", + "\U0001d459": "l", + "\U0001d45a": "m", + "\U0001d45b": "n", + "\U0001d45c": "o", + "\U0001d45d": "p", + "\U0001d45e": "q", + "\U0001d45f": "r", + "\U0001d460": "s", + "\U0001d461": "t", + "\U0001d462": "u", + "\U0001d463": "v", + "\U0001d464": "w", + "\U0001d465": "x", + "\U0001d466": "y", + "\U0001d467": "z", +} + +FUNC = { + "sin": "\\sin({fe})", + "cos": "\\cos({fe})", + "tan": "\\tan({fe})", + "arcsin": "\\arcsin({fe})", + "arccos": "\\arccos({fe})", + "arctan": "\\arctan({fe})", + "arccot": "\\arccot({fe})", + "sinh": "\\sinh({fe})", + "cosh": "\\cosh({fe})", + "tanh": "\\tanh({fe})", + "coth": "\\coth({fe})", + "sec": "\\sec({fe})", + "csc": "\\csc({fe})", +} + +FUNC_PLACE = "{fe}" + +BRK = "\\\\" + +CHR_DEFAULT = { + "ACC_VAL": "\\hat{{{0}}}", +} + +POS = { + "top": "\\overline{{{0}}}", # not sure + "bot": "\\underline{{{0}}}", +} + +POS_DEFAULT = { + "BAR_VAL": "\\overline{{{0}}}", +} + +SUB = "_{{{0}}}" + +SUP = "^{{{0}}}" + +F = { + "bar": "\\frac{{{num}}}{{{den}}}", + "skw": r"^{{{num}}}/_{{{den}}}", + "noBar": "\\genfrac{{}}{{}}{{0pt}}{{}}{{{num}}}{{{den}}}", + "lin": "{{{num}}}/{{{den}}}", +} +F_DEFAULT = "\\frac{{{num}}}{{{den}}}" + +D = "\\left{left}{text}\\right{right}" + +D_DEFAULT = { + "left": "(", + "right": ")", + "null": ".", +} + +RAD = "\\sqrt[{deg}]{{{text}}}" +RAD_DEFAULT = "\\sqrt{{{text}}}" +ARR = "{text}" + 
+LIM_FUNC = { + "lim": "\\lim_{{{lim}}}", + "max": "\\max_{{{lim}}}", + "min": "\\min_{{{lim}}}", +} + +LIM_TO = ("\\rightarrow", "\\to") + +LIM_UPP = "\\overset{{{lim}}}{{{text}}}" + +M = "\\begin{{matrix}}{text}\\end{{matrix}}" diff --git a/docling/backend/docx/latex/omml.py b/docling/backend/docx/latex/omml.py new file mode 100644 index 00000000..add0de71 --- /dev/null +++ b/docling/backend/docx/latex/omml.py @@ -0,0 +1,453 @@ +""" +Office Math Markup Language (OMML) + +Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py +On 23/01/2025 +""" + +import lxml.etree as ET +from pylatexenc.latexencode import UnicodeToLatexEncoder + +from docling.backend.docx.latex.latex_dict import ( + ALN, + ARR, + BACKSLASH, + BLANK, + BRK, + CHARS, + CHR, + CHR_BO, + CHR_DEFAULT, + D_DEFAULT, + F_DEFAULT, + FUNC, + FUNC_PLACE, + LIM_FUNC, + LIM_TO, + LIM_UPP, + POS, + POS_DEFAULT, + RAD, + RAD_DEFAULT, + SUB, + SUP, + D, + F, + M, + T, +) + +OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}" + + +def load(stream): + tree = ET.parse(stream) + for omath in tree.findall(OMML_NS + "oMath"): + yield oMath2Latex(omath) + + +def load_string(string): + root = ET.fromstring(string) + for omath in root.findall(OMML_NS + "oMath"): + yield oMath2Latex(omath) + + +def escape_latex(strs): + last = None + new_chr = [] + strs = strs.replace(r"\\", "\\") + for c in strs: + if (c in CHARS) and (last != BACKSLASH): + new_chr.append(BACKSLASH + c) + else: + new_chr.append(c) + last = c + return BLANK.join(new_chr) + + +def get_val(key, default=None, store=CHR): + if key is not None: + return key if not store else store.get(key, key) + else: + return default + + +class Tag2Method(object): + + def call_method(self, elm, stag=None): + getmethod = self.tag2meth.get + if stag is None: + stag = elm.tag.replace(OMML_NS, "") + method = getmethod(stag) + if method: + return method(self, elm) + else: + return None + + def process_children_list(self, elm, include=None): + 
""" + process children of the elm,return iterable + """ + for _e in list(elm): + if OMML_NS not in _e.tag: + continue + stag = _e.tag.replace(OMML_NS, "") + if include and (stag not in include): + continue + t = self.call_method(_e, stag=stag) + if t is None: + t = self.process_unknow(_e, stag) + if t is None: + continue + yield (stag, t, _e) + + def process_children_dict(self, elm, include=None): + """ + process children of the elm,return dict + """ + latex_chars = dict() + for stag, t, e in self.process_children_list(elm, include): + latex_chars[stag] = t + return latex_chars + + def process_children(self, elm, include=None): + """ + process children of the elm,return string + """ + return BLANK.join( + ( + t if not isinstance(t, Tag2Method) else str(t) + for stag, t, e in self.process_children_list(elm, include) + ) + ) + + def process_unknow(self, elm, stag): + return None + + +class Pr(Tag2Method): + + text = "" + + __val_tags = ("chr", "pos", "begChr", "endChr", "type") + + __innerdict = None # can't use the __dict__ + + """ common properties of element""" + + def __init__(self, elm): + self.__innerdict = {} + self.text = self.process_children(elm) + + def __str__(self): + return self.text + + def __unicode__(self): + return self.__str__(self) + + def __getattr__(self, name): + return self.__innerdict.get(name, None) + + def do_brk(self, elm): + self.__innerdict["brk"] = BRK + return BRK + + def do_common(self, elm): + stag = elm.tag.replace(OMML_NS, "") + if stag in self.__val_tags: + t = elm.get("{0}val".format(OMML_NS)) + self.__innerdict[stag] = t + return None + + tag2meth = { + "brk": do_brk, + "chr": do_common, + "pos": do_common, + "begChr": do_common, + "endChr": do_common, + "type": do_common, + } + + +class oMath2Latex(Tag2Method): + """ + Convert oMath element of omml to latex + """ + + _t_dict = T + + __direct_tags = ("box", "sSub", "sSup", "sSubSup", "num", "den", "deg", "e") + u = UnicodeToLatexEncoder( + 
replacement_latex_protection="braces-all", + unknown_char_policy="keep", + unknown_char_warning=False, + ) + + def __init__(self, element): + self._latex = self.process_children(element) + + def __str__(self): + return self.latex.replace(" ", " ") + + def __unicode__(self): + return self.__str__(self) + + def process_unknow(self, elm, stag): + if stag in self.__direct_tags: + return self.process_children(elm) + elif stag[-2:] == "Pr": + return Pr(elm) + else: + return None + + @property + def latex(self): + return self._latex + + def do_acc(self, elm): + """ + the accent function + """ + c_dict = self.process_children_dict(elm) + latex_s = get_val( + c_dict["accPr"].chr, default=CHR_DEFAULT.get("ACC_VAL"), store=CHR + ) + return latex_s.format(c_dict["e"]) + + def do_bar(self, elm): + """ + the bar function + """ + c_dict = self.process_children_dict(elm) + pr = c_dict["barPr"] + latex_s = get_val(pr.pos, default=POS_DEFAULT.get("BAR_VAL"), store=POS) + return pr.text + latex_s.format(c_dict["e"]) + + def do_d(self, elm): + """ + the delimiter object + """ + c_dict = self.process_children_dict(elm) + pr = c_dict["dPr"] + null = D_DEFAULT.get("null") + + s_val = get_val(pr.begChr, default=D_DEFAULT.get("left"), store=T) + e_val = get_val(pr.endChr, default=D_DEFAULT.get("right"), store=T) + delim = pr.text + D.format( + left=null if not s_val else escape_latex(s_val), + text=c_dict["e"], + right=null if not e_val else escape_latex(e_val), + ) + return delim + + def do_spre(self, elm): + """ + the Pre-Sub-Superscript object -- Not support yet + """ + pass + + def do_sub(self, elm): + text = self.process_children(elm) + return SUB.format(text) + + def do_sup(self, elm): + text = self.process_children(elm) + return SUP.format(text) + + def do_f(self, elm): + """ + the fraction object + """ + c_dict = self.process_children_dict(elm) + pr = c_dict["fPr"] + latex_s = get_val(pr.type, default=F_DEFAULT, store=F) + return pr.text + latex_s.format(num=c_dict.get("num"), 
den=c_dict.get("den")) + + def do_func(self, elm): + """ + the Function-Apply object (Examples:sin cos) + """ + c_dict = self.process_children_dict(elm) + func_name = c_dict.get("fName") + return func_name.replace(FUNC_PLACE, c_dict.get("e")) + + def do_fname(self, elm): + """ + the func name + """ + latex_chars = [] + for stag, t, e in self.process_children_list(elm): + if stag == "r": + if FUNC.get(t): + latex_chars.append(FUNC[t]) + else: + raise NotImplementedError("Not support func %s" % t) + else: + latex_chars.append(t) + t = BLANK.join(latex_chars) + return t if FUNC_PLACE in t else t + FUNC_PLACE # do_func will replace this + + def do_groupchr(self, elm): + """ + the Group-Character object + """ + c_dict = self.process_children_dict(elm) + pr = c_dict["groupChrPr"] + latex_s = get_val(pr.chr) + return pr.text + latex_s.format(c_dict["e"]) + + def do_rad(self, elm): + """ + the radical object + """ + c_dict = self.process_children_dict(elm) + text = c_dict.get("e") + deg_text = c_dict.get("deg") + if deg_text: + return RAD.format(deg=deg_text, text=text) + else: + return RAD_DEFAULT.format(text=text) + + def do_eqarr(self, elm): + """ + the Array object + """ + return ARR.format( + text=BRK.join( + [t for stag, t, e in self.process_children_list(elm, include=("e",))] + ) + ) + + def do_limlow(self, elm): + """ + the Lower-Limit object + """ + t_dict = self.process_children_dict(elm, include=("e", "lim")) + latex_s = LIM_FUNC.get(t_dict["e"]) + if not latex_s: + raise NotImplementedError("Not support lim %s" % t_dict["e"]) + else: + return latex_s.format(lim=t_dict.get("lim")) + + def do_limupp(self, elm): + """ + the Upper-Limit object + """ + t_dict = self.process_children_dict(elm, include=("e", "lim")) + return LIM_UPP.format(lim=t_dict.get("lim"), text=t_dict.get("e")) + + def do_lim(self, elm): + """ + the lower limit of the limLow object and the upper limit of the limUpp function + """ + return self.process_children(elm).replace(LIM_TO[0], LIM_TO[1]) + + def do_m(self, 
elm): + """ + the Matrix object + """ + rows = [] + for stag, t, e in self.process_children_list(elm): + if stag == "mPr": + pass + elif stag == "mr": + rows.append(t) + return M.format(text=BRK.join(rows)) + + def do_mr(self, elm): + """ + a single row of the matrix m + """ + return ALN.join( + [t for stag, t, e in self.process_children_list(elm, include=("e",))] + ) + + def do_nary(self, elm): + """ + the n-ary object + """ + res = [] + bo = "" + for stag, t, e in self.process_children_list(elm): + if stag == "naryPr": + bo = get_val(t.chr, store=CHR_BO) + else: + res.append(t) + return bo + BLANK.join(res) + + def process_unicode(self, s): + # s = s if isinstance(s,unicode) else unicode(s,'utf-8') + # print(s, self._t_dict.get(s, s), unicode_to_latex(s)) + # _str.append( self._t_dict.get(s, s) ) + + out_latex_str = self.u.unicode_to_latex(s) + + # print(s, out_latex_str) + + if ( + s.startswith("{") is False + and out_latex_str.startswith("{") + and s.endswith("}") is False + and out_latex_str.endswith("}") + ): + out_latex_str = f" {out_latex_str[1:-1]} " + + # print(s, out_latex_str) + + if "ensuremath" in out_latex_str: + out_latex_str = out_latex_str.replace("\\ensuremath{", " ") + out_latex_str = out_latex_str.replace("}", " ") + + # print(s, out_latex_str) + + if out_latex_str.strip().startswith("\\text"): + out_latex_str = f" \\text{{{out_latex_str}}} " + + # print(s, out_latex_str) + + return out_latex_str + + def do_r(self, elm): + """ + Get text from 'r' element,And try convert them to latex symbols + @todo text style support , (sty) + @todo \text (latex pure text support) + """ + _str = [] + _base_str = [] + for s in elm.findtext("./{0}t".format(OMML_NS)): + out_latex_str = self.process_unicode(s) + _str.append(out_latex_str) + _base_str.append(s) + + proc_str = escape_latex(BLANK.join(_str)) + base_proc_str = BLANK.join(_base_str) + + if "{" not in base_proc_str and "\\{" in proc_str: + proc_str = proc_str.replace("\\{", "{") + + if "}" not in 
base_proc_str and "\\}" in proc_str: + proc_str = proc_str.replace("\\}", "}") + + return proc_str + + tag2meth = { + "acc": do_acc, + "r": do_r, + "bar": do_bar, + "sub": do_sub, + "sup": do_sup, + "f": do_f, + "func": do_func, + "fName": do_fname, + "groupChr": do_groupchr, + "d": do_d, + "rad": do_rad, + "eqArr": do_eqarr, + "limLow": do_limlow, + "limUpp": do_limupp, + "lim": do_lim, + "m": do_m, + "mr": do_mr, + "nary": do_nary, + } diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index d14b422f..d2222971 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -134,7 +134,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.analyze_tag(cast(Tag, element), doc) except Exception as exc_child: _log.error( - f"Error processing child from tag{tag.name}: {exc_child}" + f"Error processing child from tag {tag.name}: {repr(exc_child)}" ) raise exc_child elif isinstance(element, NavigableString) and not isinstance( @@ -347,11 +347,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): content_layer=self.content_layer, ) self.level += 1 - - self.walk(element, doc) - - self.parents[self.level + 1] = None - self.level -= 1 + self.walk(element, doc) + self.parents[self.level + 1] = None + self.level -= 1 + else: + self.walk(element, doc) elif element.text.strip(): text = element.text.strip() @@ -457,7 +457,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): end_row_offset_idx=row_idx + row_span, start_col_offset_idx=col_idx, end_col_offset_idx=col_idx + col_span, - col_header=col_header, + column_header=col_header, row_header=((not col_header) and html_cell.name == "th"), ) data.table_cells.append(table_cell) diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index 19a21c19..a8cae1ce 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -136,7 +136,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): end_row_offset_idx=trow_ind 
+ row_span, start_col_offset_idx=tcol_ind, end_col_offset_idx=tcol_ind + col_span, - col_header=False, + column_header=trow_ind == 0, row_header=False, ) tcells.append(icell) diff --git a/docling/backend/msexcel_backend.py b/docling/backend/msexcel_backend.py index 19c25341..121637a1 100644 --- a/docling/backend/msexcel_backend.py +++ b/docling/backend/msexcel_backend.py @@ -164,7 +164,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): end_row_offset_idx=excel_cell.row + excel_cell.row_span, start_col_offset_idx=excel_cell.col, end_col_offset_idx=excel_cell.col + excel_cell.col_span, - col_header=False, + column_header=excel_cell.row == 0, row_header=False, ) table_data.table_cells.append(cell) @@ -173,7 +173,7 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): return doc - def _find_data_tables(self, sheet: Worksheet): + def _find_data_tables(self, sheet: Worksheet) -> List[ExcelTable]: """ Find all compact rectangular data tables in a sheet. """ @@ -340,47 +340,4 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend): except: _log.error("could not extract the image from excel sheets") - """ - for idx, chart in enumerate(sheet._charts): # type: ignore - try: - chart_path = f"chart_{idx + 1}.png" - _log.info( - f"Chart found, but dynamic rendering is required for: {chart_path}" - ) - - _log.info(f"Chart {idx + 1}:") - - # Chart type - # _log.info(f"Type: {type(chart).__name__}") - print(f"Type: {type(chart).__name__}") - - # Extract series data - for series_idx, series in enumerate(chart.series): - #_log.info(f"Series {series_idx + 1}:") - print(f"Series {series_idx + 1} type: {type(series).__name__}") - #print(f"x-values: {series.xVal}") - #print(f"y-values: {series.yVal}") - - print(f"xval type: {type(series.xVal).__name__}") - - xvals = [] - for _ in series.xVal.numLit.pt: - print(f"xval type: {type(_).__name__}") - if hasattr(_, 'v'): - xvals.append(_.v) - - print(f"x-values: {xvals}") - - yvals = [] - for _ in series.yVal: - if 
hasattr(_, 'v'): - yvals.append(_.v) - - print(f"y-values: {yvals}") - - except Exception as exc: - print(exc) - continue - """ - return doc diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index 8b86008b..231d6224 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -346,7 +346,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB end_row_offset_idx=row_idx + row_span, start_col_offset_idx=col_idx, end_col_offset_idx=col_idx + col_span, - col_header=False, + column_header=row_idx == 0, row_header=False, ) if len(cell.text.strip()) > 0: diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 1a504bcb..926ce083 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -26,6 +26,7 @@ from PIL import Image, UnidentifiedImageError from typing_extensions import override from docling.backend.abstract_backend import DeclarativeDocumentBackend +from docling.backend.docx.latex.omml import oMath2Latex from docling.datamodel.base_models import InputFormat from docling.datamodel.document import InputDocument @@ -260,6 +261,25 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): else: return label, None + def handle_equations_in_text(self, element, text): + only_texts = [] + only_equations = [] + texts_and_equations = [] + for subt in element.iter(): + tag_name = etree.QName(subt).localname + if tag_name == "t" and "math" not in subt.tag: + only_texts.append(subt.text) + texts_and_equations.append(subt.text) + elif "oMath" in subt.tag and "oMathPara" not in subt.tag: + latex_equation = str(oMath2Latex(subt)) + only_equations.append(latex_equation) + texts_and_equations.append(latex_equation) + + if "".join(only_texts) != text: + return text, [] + + return "".join(texts_and_equations), only_equations + def handle_text_elements( self, element: BaseOxmlElement, @@ -268,9 +288,12 @@ class 
MsWordDocumentBackend(DeclarativeDocumentBackend): ) -> None: paragraph = Paragraph(element, docx_obj) - if paragraph.text is None: + raw_text = paragraph.text + text, equations = self.handle_equations_in_text(element=element, text=raw_text) + + if text is None: return - text = paragraph.text.strip() + text = text.strip() # Common styles for bullet and numbered lists. # "List Bullet", "List Number", "List Paragraph" @@ -323,6 +346,45 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): elif "Heading" in p_style_id: self.add_header(doc, p_level, text) + elif len(equations) > 0: + if (raw_text is None or len(raw_text) == 0) and len(text) > 0: + # Standalone equation + level = self.get_level() + doc.add_text( + label=DocItemLabel.FORMULA, + parent=self.parents[level - 1], + text=text, + ) + else: + # Inline equation + level = self.get_level() + inline_equation = doc.add_group( + label=GroupLabel.INLINE, parent=self.parents[level - 1] + ) + text_tmp = text + for eq in equations: + if len(text_tmp) == 0: + break + pre_eq_text = text_tmp.split(eq, maxsplit=1)[0] + text_tmp = text_tmp.split(eq, maxsplit=1)[1] + if len(pre_eq_text) > 0: + doc.add_text( + label=DocItemLabel.PARAGRAPH, + parent=inline_equation, + text=pre_eq_text, + ) + doc.add_text( + label=DocItemLabel.FORMULA, + parent=inline_equation, + text=eq, + ) + if len(text_tmp) > 0: + doc.add_text( + label=DocItemLabel.PARAGRAPH, + parent=inline_equation, + text=text_tmp, + ) + elif p_style_id in [ "Paragraph", "Normal", @@ -539,7 +601,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): end_row_offset_idx=row.grid_cols_before + spanned_idx, start_col_offset_idx=col_idx, end_col_offset_idx=col_idx + cell.grid_span, - col_header=False, + column_header=row.grid_cols_before + row_idx == 0, row_header=False, ) data.table_cells.append(table_cell) diff --git a/docling/cli/models.py b/docling/cli/models.py index cc4a43ac..7bc313c1 100644 --- a/docling/cli/models.py +++ b/docling/cli/models.py @@ -121,7 
+121,7 @@ def download( "Using the CLI:", f"`docling --artifacts-path={output_dir} FILE`", "\n", - "Using Python: see the documentation at .", + "Using Python: see the documentation at .", ) diff --git a/docling/models/ocr_mac_model.py b/docling/models/ocr_mac_model.py index 9e6fe979..9d61828a 100644 --- a/docling/models/ocr_mac_model.py +++ b/docling/models/ocr_mac_model.py @@ -27,7 +27,7 @@ class OcrMacModel(BaseOcrModel): "ocrmac is not correctly installed. " "Please install it via `pip install ocrmac` to use this OCR engine. " "Alternatively, Docling has support for other OCR engines. See the documentation: " - "https://ds4sd.github.io/docling/installation/" + "https://docling-project.github.io/docling/installation/" ) try: from ocrmac import ocrmac diff --git a/docling/models/tesseract_ocr_model.py b/docling/models/tesseract_ocr_model.py index 7f4da180..00008636 100644 --- a/docling/models/tesseract_ocr_model.py +++ b/docling/models/tesseract_ocr_model.py @@ -32,14 +32,14 @@ class TesseractOcrModel(BaseOcrModel): "Note that tesserocr might have to be manually compiled for working with " "your Tesseract installation. The Docling documentation provides examples for it. " "Alternatively, Docling has support for other OCR engines. See the documentation: " - "https://ds4sd.github.io/docling/installation/" + "https://docling-project.github.io/docling/installation/" ) missing_langs_errmsg = ( "tesserocr is not correctly configured. No language models have been detected. " "Please ensure that the TESSDATA_PREFIX envvar points to tesseract languages dir. 
" "You can find more information how to setup other OCR engines in Docling " "documentation: " - "https://ds4sd.github.io/docling/installation/" + "https://docling-project.github.io/docling/installation/" ) try: diff --git a/docs/concepts/docling_document.md b/docs/concepts/docling_document.md index 1ac46f55..628285dd 100644 --- a/docs/concepts/docling_document.md +++ b/docs/concepts/docling_document.md @@ -7,7 +7,7 @@ pydantic datatype, which can express several features common to documents, such * Layout information (i.e. bounding boxes) for all items, if available * Provenance information -The definition of the Pydantic types is implemented in the module `docling_core.types.doc`, more details in [source code definitions](https://github.com/DS4SD/docling-core/tree/main/docling_core/types/doc). +The definition of the Pydantic types is implemented in the module `docling_core.types.doc`, more details in [source code definitions](https://github.com/docling-project/docling-core/tree/main/docling_core/types/doc). It also brings a set of document construction APIs to build up a `DoclingDocument` from scratch. 
diff --git a/docs/examples/backend_xml_rag.ipynb b/docs/examples/backend_xml_rag.ipynb index 0b2227f4..091f116d 100644 --- a/docs/examples/backend_xml_rag.ipynb +++ b/docs/examples/backend_xml_rag.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\"Open" + "\"Open" ] }, { @@ -36,7 +36,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This is an example of using [Docling](https://ds4sd.github.io/docling/) for converting structured data (XML) into a unified document\n", + "This is an example of using [Docling](https://docling-project.github.io/docling/) for converting structured data (XML) into a unified document\n", "representation format, `DoclingDocument`, and leverage its riched structured content for RAG applications.\n", "\n", "Data used in this example consist of patents from the [United States Patent and Trademark Office (USPTO)](https://www.uspto.gov/) and medical\n", diff --git a/docs/examples/hybrid_chunking.ipynb b/docs/examples/hybrid_chunking.ipynb index 6a5f5882..2f6d9457 100644 --- a/docs/examples/hybrid_chunking.ipynb +++ b/docs/examples/hybrid_chunking.ipynb @@ -103,7 +103,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "> 👉 **NOTE**: As you see above, using the `HybridChunker` can sometimes lead to a warning from the transformers library, however this is a \"false alarm\" — for details check [here](https://ds4sd.github.io/docling/faq/#hybridchunker-triggers-warning-token-indices-sequence-length-is-longer-than-the-specified-maximum-sequence-length-for-this-model)." + "> 👉 **NOTE**: As you see above, using the `HybridChunker` can sometimes lead to a warning from the transformers library, however this is a \"false alarm\" — for details check [here](https://docling-project.github.io/docling/faq/#hybridchunker-triggers-warning-token-indices-sequence-length-is-longer-than-the-specified-maximum-sequence-length-for-this-model)." 
] }, { diff --git a/docs/examples/pictures_description.ipynb b/docs/examples/pictures_description.ipynb index f50860db..feeb00ba 100644 --- a/docs/examples/pictures_description.ipynb +++ b/docs/examples/pictures_description.ipynb @@ -321,7 +321,7 @@ ], "metadata": { "kernelspec": { - "display_name": "docling-aMWN2FRM-py3.12", + "display_name": "docling-hgXEfXco-py3.12", "language": "python", "name": "python3" }, diff --git a/docs/examples/rag_azuresearch.ipynb b/docs/examples/rag_azuresearch.ipynb index dcfd19e3..9f867b1d 100644 --- a/docs/examples/rag_azuresearch.ipynb +++ b/docs/examples/rag_azuresearch.ipynb @@ -36,7 +36,7 @@ "## A recipe 🧑‍🍳 🐥 💚\n", "\n", "This notebook demonstrates how to build a Retrieval-Augmented Generation (RAG) system using:\n", - "- [Docling](https://ds4sd.github.io/docling/) for document parsing and chunking\n", + "- [Docling](https://docling-project.github.io/docling/) for document parsing and chunking\n", "- [Azure AI Search](https://azure.microsoft.com/products/ai-services/ai-search/?msockid=0109678bea39665431e37323ebff6723) for vector indexing and retrieval\n", "- [Azure OpenAI](https://azure.microsoft.com/products/ai-services/openai-service?msockid=0109678bea39665431e37323ebff6723) for embeddings and chat completion\n", "\n", diff --git a/docs/examples/rag_haystack.ipynb b/docs/examples/rag_haystack.ipynb index f4f61de8..b9541150 100644 --- a/docs/examples/rag_haystack.ipynb +++ b/docs/examples/rag_haystack.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\"Open" + "\"Open" ] }, { @@ -247,7 +247,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/Users/pva/work/github.com/DS4SD/docling/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_client.py:2232: FutureWarning: `stop_sequences` is a deprecated argument for `text_generation` task and will be removed in version '0.28.0'. 
Use `stop` instead.\n", + "/Users/pva/work/github.com/docling-project/docling/.venv/lib/python3.12/site-packages/huggingface_hub/inference/_client.py:2232: FutureWarning: `stop_sequences` is a deprecated argument for `text_generation` task and will be removed in version '0.28.0'. Use `stop` instead.\n", " warnings.warn(\n" ] } diff --git a/docs/examples/rag_langchain.ipynb b/docs/examples/rag_langchain.ipynb index ef8374aa..37c41709 100644 --- a/docs/examples/rag_langchain.ipynb +++ b/docs/examples/rag_langchain.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\"Open" + "\"Open" ] }, { @@ -168,7 +168,7 @@ "source": [ "> Note: a message saying `\"Token indices sequence length is longer than the specified\n", "maximum sequence length...\"` can be ignored in this case — details\n", - "[here](https://github.com/DS4SD/docling-core/issues/119#issuecomment-2577418826)." + "[here](https://github.com/docling-project/docling-core/issues/119#issuecomment-2577418826)." ] }, { diff --git a/docs/examples/rag_llamaindex.ipynb b/docs/examples/rag_llamaindex.ipynb index b0c5ddc1..e0fae9b7 100644 --- a/docs/examples/rag_llamaindex.ipynb +++ b/docs/examples/rag_llamaindex.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\"Open" + "\"Open" ] }, { diff --git a/docs/examples/rag_weaviate.ipynb b/docs/examples/rag_weaviate.ipynb index 2d32c6c5..7c020f43 100644 --- a/docs/examples/rag_weaviate.ipynb +++ b/docs/examples/rag_weaviate.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DS4SD/docling/blob/main/docs/examples/rag_weaviate.ipynb)" + "[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/docling-project/docling/blob/main/docs/examples/rag_weaviate.ipynb)" ] }, { @@ -29,7 +29,7 @@ "\n", "## A recipe 🧑‍🍳 🐥 💚\n", "\n", - 
"This is a code recipe that uses [Weaviate](https://weaviate.io/) to perform RAG over PDF documents parsed by [Docling](https://ds4sd.github.io/docling/).\n", + "This is a code recipe that uses [Weaviate](https://weaviate.io/) to perform RAG over PDF documents parsed by [Docling](https://docling-project.github.io/docling/).\n", "\n", "In this notebook, we accomplish the following:\n", "* Parse the top machine learning papers on [arXiv](https://arxiv.org/) using Docling\n", diff --git a/docs/examples/retrieval_qdrant.ipynb b/docs/examples/retrieval_qdrant.ipynb index cda50ac7..d32e1f31 100644 --- a/docs/examples/retrieval_qdrant.ipynb +++ b/docs/examples/retrieval_qdrant.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\"Open" ] }, @@ -109,7 +109,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/Users/pva/work/github.com/DS4SD/docling/.venv/lib/python3.12/site-packages/huggingface_hub/utils/tqdm.py:155: UserWarning: Cannot enable progress bars: environment variable `HF_HUB_DISABLE_PROGRESS_BARS=1` is set and has priority.\n", + "/Users/pva/work/github.com/docling-project/docling/.venv/lib/python3.12/site-packages/huggingface_hub/utils/tqdm.py:155: UserWarning: Cannot enable progress bars: environment variable `HF_HUB_DISABLE_PROGRESS_BARS=1` is set and has priority.\n", " warnings.warn(\n" ] } diff --git a/docs/faq/index.md b/docs/faq/index.md index d1d05a8f..781c5688 100644 --- a/docs/faq/index.md +++ b/docs/faq/index.md @@ -1,6 +1,6 @@ # FAQ -This is a collection of FAQ collected from the user questions on . +This is a collection of FAQ collected from the user questions on . ??? question "Is Python 3.13 supported?" @@ -41,7 +41,7 @@ This is a collection of FAQ collected from the user questions on . - When OCR is enabled, some engines also require model artifacts. 
For example EasyOCR, for which Docling has [special pipeline options](https://github.com/DS4SD/docling/blob/main/docling/datamodel/pipeline_options.py#L68) to control the runtime behavior. + When OCR is enabled, some engines also require model artifacts. For example EasyOCR, for which Docling has [special pipeline options](https://github.com/docling-project/docling/blob/main/docling/datamodel/pipeline_options.py#L68) to control the runtime behavior. ??? question "SSL error downloading model weights" @@ -174,6 +174,6 @@ This is a collection of FAQ collected from the user questions on =0.4.6)"] +[[package]] +name = "pylatexenc" +version = "2.10" +description = "Simple LaTeX parser providing latex-to-unicode and unicode-to-latex conversion" +optional = false +python-versions = "*" +files = [ + {file = "pylatexenc-2.10.tar.gz", hash = "sha256:3dd8fd84eb46dc30bee1e23eaab8d8fb5a7f507347b23e5f38ad9675c84f40d3"}, +] + [[package]] name = "pylint" version = "2.17.7" @@ -7806,4 +7816,4 @@ vlm = ["accelerate", "transformers", "transformers"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "da6afcbfeefb3a45560d4098c5a1345333fc833fd13e6408aacb06c6d18317f0" +content-hash = "86d3894f8f998af4b7f766ec5060f9f64d532d9b6611d4836271bc0fdfd796c7" diff --git a/pyproject.toml b/pyproject.toml index af24e6ce..e8a1d8c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,23 +2,43 @@ name = "docling" version = "2.26.0" # DO NOT EDIT, updated automatically description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications." 
-authors = ["Christoph Auer ", "Michele Dolfi ", "Maxim Lysak ", "Nikos Livathinos ", "Ahmed Nassar ", "Panos Vagenas ", "Peter Staar "] +authors = [ + "Christoph Auer ", + "Michele Dolfi ", + "Maxim Lysak ", + "Nikos Livathinos ", + "Ahmed Nassar ", + "Panos Vagenas ", + "Peter Staar ", +] license = "MIT" readme = "README.md" -repository = "https://github.com/DS4SD/docling" -homepage = "https://github.com/DS4SD/docling" -keywords= ["docling", "convert", "document", "pdf", "docx", "html", "markdown", "layout model", "segmentation", "table structure", "table former"] - classifiers = [ - "License :: OSI Approved :: MIT License", - "Operating System :: MacOS :: MacOS X", - "Operating System :: POSIX :: Linux", - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "Intended Audience :: Science/Research", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - "Programming Language :: Python :: 3" - ] -packages = [{include = "docling"}] +repository = "https://github.com/docling-project/docling" +homepage = "https://github.com/docling-project/docling" +keywords = [ + "docling", + "convert", + "document", + "pdf", + "docx", + "html", + "markdown", + "layout model", + "segmentation", + "table structure", + "table former", +] +classifiers = [ + "License :: OSI Approved :: MIT License", + "Operating System :: MacOS :: MacOS X", + "Operating System :: POSIX :: Linux", + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Programming Language :: Python :: 3", +] +packages = [{ include = "docling" }] [tool.poetry.dependencies] ###################### @@ -28,7 +48,7 @@ python = "^3.9" pydantic = "^2.0.0" docling-core = {extras = ["chunking"], version = "^2.23.0"} docling-ibm-models = "^3.4.0" -docling-parse = {git = "https://github.com/DS4SD/docling-parse", rev = "cau/api-move-to-docling-core"} 
+docling-parse = {git = "https://github.com/DS4SD/docling-parse", rev = "main"} filetype = "^1.2.0" pypdfium2 = "^4.30.0" pydantic-settings = "^2.3.0" @@ -40,7 +60,7 @@ certifi = ">=2024.7.4" rtree = "^1.3.0" scipy = [ { version = "^1.6.0", markers = "python_version >= '3.10'" }, - { version = ">=1.6.0,<1.14.0", markers = "python_version < '3.10'" } + { version = ">=1.6.0,<1.14.0", markers = "python_version < '3.10'" }, ] typer = "^0.12.5" python-docx = "^1.1.2" @@ -56,21 +76,22 @@ onnxruntime = [ # 1.19.2 is the last version with python3.9 support, # see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0 { version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" }, - { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" } + { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }, ] transformers = [ - {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true }, - {markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true } + { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true }, + { markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true }, ] accelerate = [ - {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^1.2.1", optional = true }, + { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^1.2.1", optional = true }, ] pillow = ">=10.0.0,<12.0.0" tqdm = "^4.65.0" +pylatexenc = "^2.10" [tool.poetry.group.dev.dependencies] -black = {extras = ["jupyter"], version = "^24.4.2"} +black = { extras = ["jupyter"], version = "^24.4.2" } pytest = "^7.2.2" pre-commit = "^3.7.1" mypy = "^1.10.1" @@ -93,7 +114,7 @@ types-tqdm = "^4.67.0.20241221" mkdocs-material = "^9.5.40" mkdocs-jupyter = "^0.25.0" mkdocs-click = "^0.8.1" 
-mkdocstrings = {extras = ["python"], version = "^0.27.0"} +mkdocstrings = { extras = ["python"], version = "^0.27.0" } griffe-pydantic = "^1.1.0" [tool.poetry.group.examples.dependencies] @@ -108,8 +129,8 @@ optional = true [tool.poetry.group.constraints.dependencies] numpy = [ - { version = ">=1.24.4,<3.0.0", markers = 'python_version >= "3.10"' }, - { version = ">=1.24.4,<2.1.0", markers = 'python_version < "3.10"' }, + { version = ">=1.24.4,<3.0.0", markers = 'python_version >= "3.10"' }, + { version = ">=1.24.4,<2.1.0", markers = 'python_version < "3.10"' }, ] [tool.poetry.group.mac_intel] @@ -117,12 +138,12 @@ optional = true [tool.poetry.group.mac_intel.dependencies] torch = [ - {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^2.2.2"}, - {markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~2.2.2"} + { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^2.2.2" }, + { markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~2.2.2" }, ] torchvision = [ - {markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^0"}, - {markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~0.17.2"} + { markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^0" }, + { markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~0.17.2" }, ] [tool.poetry.extras] @@ -147,7 +168,7 @@ include = '\.pyi?$' [tool.isort] profile = "black" line_length = 88 -py_version=39 +py_version = 39 [tool.mypy] pretty = true @@ -158,18 +179,19 @@ python_version = "3.10" [[tool.mypy.overrides]] module = [ - "docling_parse.*", - "pypdfium2.*", - "networkx.*", - "scipy.*", - "filetype.*", - "tesserocr.*", - "docling_ibm_models.*", - "easyocr.*", - "ocrmac.*", - "lxml.*", - "huggingface_hub.*", - "transformers.*", + "docling_parse.*", + "pypdfium2.*", + "networkx.*", + "scipy.*", + 
"filetype.*", + "tesserocr.*", + "docling_ibm_models.*", + "easyocr.*", + "ocrmac.*", + "lxml.*", + "huggingface_hub.*", + "transformers.*", + "pylatexenc.*", ] ignore_missing_imports = true diff --git a/tests/data/docx/equations.docx b/tests/data/docx/equations.docx new file mode 100644 index 00000000..8ab71b96 Binary files /dev/null and b/tests/data/docx/equations.docx differ diff --git a/tests/data/groundtruth/docling_v2/csv-comma-in-cell.csv.json b/tests/data/groundtruth/docling_v2/csv-comma-in-cell.csv.json index 310d5c84..6c4f7f2a 100644 --- a/tests/data/groundtruth/docling_v2/csv-comma-in-cell.csv.json +++ b/tests/data/groundtruth/docling_v2/csv-comma-in-cell.csv.json @@ -51,7 +51,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -63,7 +63,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -75,7 +75,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "3", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -87,7 +87,7 @@ "start_col_offset_idx": 3, "end_col_offset_idx": 4, "text": "4", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -296,7 +296,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -308,7 +308,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -320,7 +320,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "3", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -332,7 +332,7 @@ "start_col_offset_idx": 3, 
"end_col_offset_idx": 4, "text": "4", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } diff --git a/tests/data/groundtruth/docling_v2/csv-comma.csv.json b/tests/data/groundtruth/docling_v2/csv-comma.csv.json index 4e7f6df9..1b5d113d 100644 --- a/tests/data/groundtruth/docling_v2/csv-comma.csv.json +++ b/tests/data/groundtruth/docling_v2/csv-comma.csv.json @@ -51,7 +51,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Index", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -63,7 +63,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "Customer Id", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -75,7 +75,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "First Name", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -87,7 +87,7 @@ "start_col_offset_idx": 3, "end_col_offset_idx": 4, "text": "Last Name", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -99,7 +99,7 @@ "start_col_offset_idx": 4, "end_col_offset_idx": 5, "text": "Company", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -111,7 +111,7 @@ "start_col_offset_idx": 5, "end_col_offset_idx": 6, "text": "City", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -123,7 +123,7 @@ "start_col_offset_idx": 6, "end_col_offset_idx": 7, "text": "Country", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -135,7 +135,7 @@ "start_col_offset_idx": 7, "end_col_offset_idx": 8, "text": "Phone 1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -147,7 +147,7 @@ "start_col_offset_idx": 8, "end_col_offset_idx": 9, "text": "Phone 2", - 
"column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -159,7 +159,7 @@ "start_col_offset_idx": 9, "end_col_offset_idx": 10, "text": "Email", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -171,7 +171,7 @@ "start_col_offset_idx": 10, "end_col_offset_idx": 11, "text": "Subscription Date", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -183,7 +183,7 @@ "start_col_offset_idx": 11, "end_col_offset_idx": 12, "text": "Website", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -920,7 +920,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Index", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -932,7 +932,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "Customer Id", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -944,7 +944,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "First Name", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -956,7 +956,7 @@ "start_col_offset_idx": 3, "end_col_offset_idx": 4, "text": "Last Name", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -968,7 +968,7 @@ "start_col_offset_idx": 4, "end_col_offset_idx": 5, "text": "Company", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -980,7 +980,7 @@ "start_col_offset_idx": 5, "end_col_offset_idx": 6, "text": "City", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -992,7 +992,7 @@ "start_col_offset_idx": 6, "end_col_offset_idx": 7, "text": "Country", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1004,7 +1004,7 
@@ "start_col_offset_idx": 7, "end_col_offset_idx": 8, "text": "Phone 1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1016,7 +1016,7 @@ "start_col_offset_idx": 8, "end_col_offset_idx": 9, "text": "Phone 2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1028,7 +1028,7 @@ "start_col_offset_idx": 9, "end_col_offset_idx": 10, "text": "Email", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1040,7 +1040,7 @@ "start_col_offset_idx": 10, "end_col_offset_idx": 11, "text": "Subscription Date", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1052,7 +1052,7 @@ "start_col_offset_idx": 11, "end_col_offset_idx": 12, "text": "Website", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } diff --git a/tests/data/groundtruth/docling_v2/csv-inconsistent-header.csv.json b/tests/data/groundtruth/docling_v2/csv-inconsistent-header.csv.json index daa2844a..22ab07bc 100644 --- a/tests/data/groundtruth/docling_v2/csv-inconsistent-header.csv.json +++ b/tests/data/groundtruth/docling_v2/csv-inconsistent-header.csv.json @@ -51,7 +51,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -63,7 +63,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -75,7 +75,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "3", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -284,7 +284,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -296,7 +296,7 
@@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -308,7 +308,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "3", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, diff --git a/tests/data/groundtruth/docling_v2/csv-pipe.csv.json b/tests/data/groundtruth/docling_v2/csv-pipe.csv.json index 753fa138..b4ffd643 100644 --- a/tests/data/groundtruth/docling_v2/csv-pipe.csv.json +++ b/tests/data/groundtruth/docling_v2/csv-pipe.csv.json @@ -51,7 +51,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Index", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -63,7 +63,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "Customer Id", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -75,7 +75,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "First Name", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -87,7 +87,7 @@ "start_col_offset_idx": 3, "end_col_offset_idx": 4, "text": "Last Name", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -99,7 +99,7 @@ "start_col_offset_idx": 4, "end_col_offset_idx": 5, "text": "Company", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -111,7 +111,7 @@ "start_col_offset_idx": 5, "end_col_offset_idx": 6, "text": "City", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -123,7 +123,7 @@ "start_col_offset_idx": 6, "end_col_offset_idx": 7, "text": "Country", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -135,7 +135,7 @@ "start_col_offset_idx": 7, "end_col_offset_idx": 8, "text": 
"Phone 1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -147,7 +147,7 @@ "start_col_offset_idx": 8, "end_col_offset_idx": 9, "text": "Phone 2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -159,7 +159,7 @@ "start_col_offset_idx": 9, "end_col_offset_idx": 10, "text": "Email", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -171,7 +171,7 @@ "start_col_offset_idx": 10, "end_col_offset_idx": 11, "text": "Subscription Date", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -183,7 +183,7 @@ "start_col_offset_idx": 11, "end_col_offset_idx": 12, "text": "Website", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -920,7 +920,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Index", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -932,7 +932,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "Customer Id", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -944,7 +944,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "First Name", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -956,7 +956,7 @@ "start_col_offset_idx": 3, "end_col_offset_idx": 4, "text": "Last Name", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -968,7 +968,7 @@ "start_col_offset_idx": 4, "end_col_offset_idx": 5, "text": "Company", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -980,7 +980,7 @@ "start_col_offset_idx": 5, "end_col_offset_idx": 6, "text": "City", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ 
-992,7 +992,7 @@ "start_col_offset_idx": 6, "end_col_offset_idx": 7, "text": "Country", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1004,7 +1004,7 @@ "start_col_offset_idx": 7, "end_col_offset_idx": 8, "text": "Phone 1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1016,7 +1016,7 @@ "start_col_offset_idx": 8, "end_col_offset_idx": 9, "text": "Phone 2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1028,7 +1028,7 @@ "start_col_offset_idx": 9, "end_col_offset_idx": 10, "text": "Email", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1040,7 +1040,7 @@ "start_col_offset_idx": 10, "end_col_offset_idx": 11, "text": "Subscription Date", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1052,7 +1052,7 @@ "start_col_offset_idx": 11, "end_col_offset_idx": 12, "text": "Website", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } diff --git a/tests/data/groundtruth/docling_v2/csv-semicolon.csv.json b/tests/data/groundtruth/docling_v2/csv-semicolon.csv.json index a429ceeb..4e654b10 100644 --- a/tests/data/groundtruth/docling_v2/csv-semicolon.csv.json +++ b/tests/data/groundtruth/docling_v2/csv-semicolon.csv.json @@ -51,7 +51,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Index", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -63,7 +63,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "Customer Id", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -75,7 +75,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "First Name", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -87,7 
+87,7 @@ "start_col_offset_idx": 3, "end_col_offset_idx": 4, "text": "Last Name", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -99,7 +99,7 @@ "start_col_offset_idx": 4, "end_col_offset_idx": 5, "text": "Company", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -111,7 +111,7 @@ "start_col_offset_idx": 5, "end_col_offset_idx": 6, "text": "City", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -123,7 +123,7 @@ "start_col_offset_idx": 6, "end_col_offset_idx": 7, "text": "Country", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -135,7 +135,7 @@ "start_col_offset_idx": 7, "end_col_offset_idx": 8, "text": "Phone 1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -147,7 +147,7 @@ "start_col_offset_idx": 8, "end_col_offset_idx": 9, "text": "Phone 2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -159,7 +159,7 @@ "start_col_offset_idx": 9, "end_col_offset_idx": 10, "text": "Email", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -171,7 +171,7 @@ "start_col_offset_idx": 10, "end_col_offset_idx": 11, "text": "Subscription Date", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -183,7 +183,7 @@ "start_col_offset_idx": 11, "end_col_offset_idx": 12, "text": "Website", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -920,7 +920,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Index", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -932,7 +932,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "Customer Id", - "column_header": false, + 
"column_header": true, "row_header": false, "row_section": false }, @@ -944,7 +944,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "First Name", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -956,7 +956,7 @@ "start_col_offset_idx": 3, "end_col_offset_idx": 4, "text": "Last Name", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -968,7 +968,7 @@ "start_col_offset_idx": 4, "end_col_offset_idx": 5, "text": "Company", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -980,7 +980,7 @@ "start_col_offset_idx": 5, "end_col_offset_idx": 6, "text": "City", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -992,7 +992,7 @@ "start_col_offset_idx": 6, "end_col_offset_idx": 7, "text": "Country", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1004,7 +1004,7 @@ "start_col_offset_idx": 7, "end_col_offset_idx": 8, "text": "Phone 1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1016,7 +1016,7 @@ "start_col_offset_idx": 8, "end_col_offset_idx": 9, "text": "Phone 2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1028,7 +1028,7 @@ "start_col_offset_idx": 9, "end_col_offset_idx": 10, "text": "Email", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1040,7 +1040,7 @@ "start_col_offset_idx": 10, "end_col_offset_idx": 11, "text": "Subscription Date", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1052,7 +1052,7 @@ "start_col_offset_idx": 11, "end_col_offset_idx": 12, "text": "Website", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } diff --git 
a/tests/data/groundtruth/docling_v2/csv-tab.csv.json b/tests/data/groundtruth/docling_v2/csv-tab.csv.json index 5f6d4ac5..14ab073e 100644 --- a/tests/data/groundtruth/docling_v2/csv-tab.csv.json +++ b/tests/data/groundtruth/docling_v2/csv-tab.csv.json @@ -51,7 +51,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Index", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -63,7 +63,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "Customer Id", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -75,7 +75,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "First Name", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -87,7 +87,7 @@ "start_col_offset_idx": 3, "end_col_offset_idx": 4, "text": "Last Name", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -99,7 +99,7 @@ "start_col_offset_idx": 4, "end_col_offset_idx": 5, "text": "Company", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -111,7 +111,7 @@ "start_col_offset_idx": 5, "end_col_offset_idx": 6, "text": "City", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -123,7 +123,7 @@ "start_col_offset_idx": 6, "end_col_offset_idx": 7, "text": "Country", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -135,7 +135,7 @@ "start_col_offset_idx": 7, "end_col_offset_idx": 8, "text": "Phone 1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -147,7 +147,7 @@ "start_col_offset_idx": 8, "end_col_offset_idx": 9, "text": "Phone 2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -159,7 +159,7 @@ "start_col_offset_idx": 9, "end_col_offset_idx": 
10, "text": "Email", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -171,7 +171,7 @@ "start_col_offset_idx": 10, "end_col_offset_idx": 11, "text": "Subscription Date", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -183,7 +183,7 @@ "start_col_offset_idx": 11, "end_col_offset_idx": 12, "text": "Website", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -920,7 +920,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Index", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -932,7 +932,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "Customer Id", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -944,7 +944,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "First Name", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -956,7 +956,7 @@ "start_col_offset_idx": 3, "end_col_offset_idx": 4, "text": "Last Name", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -968,7 +968,7 @@ "start_col_offset_idx": 4, "end_col_offset_idx": 5, "text": "Company", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -980,7 +980,7 @@ "start_col_offset_idx": 5, "end_col_offset_idx": 6, "text": "City", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -992,7 +992,7 @@ "start_col_offset_idx": 6, "end_col_offset_idx": 7, "text": "Country", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1004,7 +1004,7 @@ "start_col_offset_idx": 7, "end_col_offset_idx": 8, "text": "Phone 1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": 
false }, @@ -1016,7 +1016,7 @@ "start_col_offset_idx": 8, "end_col_offset_idx": 9, "text": "Phone 2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1028,7 +1028,7 @@ "start_col_offset_idx": 9, "end_col_offset_idx": 10, "text": "Email", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1040,7 +1040,7 @@ "start_col_offset_idx": 10, "end_col_offset_idx": 11, "text": "Subscription Date", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1052,7 +1052,7 @@ "start_col_offset_idx": 11, "end_col_offset_idx": 12, "text": "Website", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } diff --git a/tests/data/groundtruth/docling_v2/csv-too-few-columns.csv.json b/tests/data/groundtruth/docling_v2/csv-too-few-columns.csv.json index e1275895..c17e6d28 100644 --- a/tests/data/groundtruth/docling_v2/csv-too-few-columns.csv.json +++ b/tests/data/groundtruth/docling_v2/csv-too-few-columns.csv.json @@ -51,7 +51,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -63,7 +63,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -75,7 +75,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "3", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -87,7 +87,7 @@ "start_col_offset_idx": 3, "end_col_offset_idx": 4, "text": "4", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -284,7 +284,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -296,7 
+296,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -308,7 +308,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "3", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -320,7 +320,7 @@ "start_col_offset_idx": 3, "end_col_offset_idx": 4, "text": "4", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } diff --git a/tests/data/groundtruth/docling_v2/csv-too-many-columns.csv.json b/tests/data/groundtruth/docling_v2/csv-too-many-columns.csv.json index 6df1ac90..6292b7a8 100644 --- a/tests/data/groundtruth/docling_v2/csv-too-many-columns.csv.json +++ b/tests/data/groundtruth/docling_v2/csv-too-many-columns.csv.json @@ -51,7 +51,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -63,7 +63,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -75,7 +75,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "3", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -87,7 +87,7 @@ "start_col_offset_idx": 3, "end_col_offset_idx": 4, "text": "4", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -308,7 +308,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -320,7 +320,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -332,7 +332,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, 
"text": "3", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -344,7 +344,7 @@ "start_col_offset_idx": 3, "end_col_offset_idx": 4, "text": "4", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, diff --git a/tests/data/groundtruth/docling_v2/equations.docx.itxt b/tests/data/groundtruth/docling_v2/equations.docx.itxt new file mode 100644 index 00000000..1412074d --- /dev/null +++ b/tests/data/groundtruth/docling_v2/equations.docx.itxt @@ -0,0 +1,40 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: inline: group group + item-2 at level 2: paragraph: This is a word document and this is an inline equation: + item-3 at level 2: formula: A= \pi r^{2} + item-4 at level 2: paragraph: . If instead, I want an equation by line, I can do this: + item-5 at level 1: paragraph: + item-6 at level 1: formula: a^{2}+b^{2}=c^{2} \text{ \texttimes } 23 + item-7 at level 1: paragraph: And that is an equation by itself. Cheers! + item-8 at level 1: paragraph: + item-9 at level 1: paragraph: This is another equation: + item-10 at level 1: formula: f\left(x\right)=a_{0}+\sum_{n=1} ... })+b_{n}\sin(\frac{n \pi x}{L})\right) + item-11 at level 1: paragraph: + item-12 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text. + item-13 at level 1: paragraph: + item-14 at level 1: paragraph: + item-15 at level 1: inline: group group + item-16 at level 2: paragraph: This is a word document and this is an inline equation: + item-17 at level 2: formula: A= \pi r^{2} + item-18 at level 2: paragraph: . If instead, I want an equation by line, I can do this: + item-19 at level 1: paragraph: + item-20 at level 1: formula: \left(x+a\right)^{n}=\sum_{k=0}^ ... ac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k} + item-21 at level 1: paragraph: + item-22 at level 1: paragraph: And that is an equation by itself. Cheers! 
+ item-23 at level 1: paragraph: + item-24 at level 1: paragraph: This is another equation: + item-25 at level 1: paragraph: + item-26 at level 1: formula: \left(1+x\right)^{n}=1+\frac{nx} ... ght)x^{2}}{2!}+ \text{ \textellipsis } + item-27 at level 1: paragraph: + item-28 at level 1: paragraph: This is text. This is text. This ... s is text. This is text. This is text. + item-29 at level 1: paragraph: + item-30 at level 1: paragraph: + item-31 at level 1: inline: group group + item-32 at level 2: paragraph: This is a word document and this is an inline equation: + item-33 at level 2: formula: A= \pi r^{2} + item-34 at level 2: paragraph: . If instead, I want an equation by line, I can do this: + item-35 at level 1: paragraph: + item-36 at level 1: formula: e^{x}=1+\frac{x}{1!}+\frac{x^{2} ... xtellipsis } , - \infty < x < \infty + item-37 at level 1: paragraph: + item-38 at level 1: paragraph: And that is an equation by itself. Cheers! + item-39 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/equations.docx.json b/tests/data/groundtruth/docling_v2/equations.docx.json new file mode 100644 index 00000000..7880ce44 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/equations.docx.json @@ -0,0 +1,616 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.2.0", + "name": "equations", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "binary_hash": 11121138535595486899, + "filename": "equations.docx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" + }, + { + "$ref": 
"#/texts/10" + }, + { + "$ref": "#/texts/11" + }, + { + "$ref": "#/texts/12" + }, + { + "$ref": "#/groups/1" + }, + { + "$ref": "#/texts/16" + }, + { + "$ref": "#/texts/17" + }, + { + "$ref": "#/texts/18" + }, + { + "$ref": "#/texts/19" + }, + { + "$ref": "#/texts/20" + }, + { + "$ref": "#/texts/21" + }, + { + "$ref": "#/texts/22" + }, + { + "$ref": "#/texts/23" + }, + { + "$ref": "#/texts/24" + }, + { + "$ref": "#/texts/25" + }, + { + "$ref": "#/texts/26" + }, + { + "$ref": "#/texts/27" + }, + { + "$ref": "#/groups/2" + }, + { + "$ref": "#/texts/31" + }, + { + "$ref": "#/texts/32" + }, + { + "$ref": "#/texts/33" + }, + { + "$ref": "#/texts/34" + }, + { + "$ref": "#/texts/35" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/13" + }, + { + "$ref": "#/texts/14" + }, + { + "$ref": "#/texts/15" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + }, + { + "self_ref": "#/groups/2", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/28" + }, + { + "$ref": "#/texts/29" + }, + { + "$ref": "#/texts/30" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "This is a word document and this is an inline equation: ", + "text": "This is a word document and this is an inline equation: " + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": 
"formula", + "prov": [], + "orig": "A= \\pi r^{2} ", + "text": "A= \\pi r^{2} " + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": ". If instead, I want an equation by line, I can do this:", + "text": ". If instead, I want an equation by line, I can do this:" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "formula", + "prov": [], + "orig": "a^{2}+b^{2}=c^{2} \\text{ \\texttimes } 23", + "text": "a^{2}+b^{2}=c^{2} \\text{ \\texttimes } 23" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "And that is an equation by itself. Cheers!", + "text": "And that is an equation by itself. Cheers!" 
+ }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "This is another equation:", + "text": "This is another equation:" + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "formula", + "prov": [], + "orig": "f\\left(x\\right)=a_{0}+\\sum_{n=1}^{ \\infty }\\left(a_{n}\\cos(\\frac{n \\pi x}{L})+b_{n}\\sin(\\frac{n \\pi x}{L})\\right)", + "text": "f\\left(x\\right)=a_{0}+\\sum_{n=1}^{ \\infty }\\left(a_{n}\\cos(\\frac{n \\pi x}{L})+b_{n}\\sin(\\frac{n \\pi x}{L})\\right)" + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.", + "text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. 
This is text. This is text. This is text. This is text. This is text. This is text." + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/12", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/13", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "This is a word document and this is an inline equation: ", + "text": "This is a word document and this is an inline equation: " + }, + { + "self_ref": "#/texts/14", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "formula", + "prov": [], + "orig": "A= \\pi r^{2} ", + "text": "A= \\pi r^{2} " + }, + { + "self_ref": "#/texts/15", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": ". If instead, I want an equation by line, I can do this:", + "text": ". 
If instead, I want an equation by line, I can do this:" + }, + { + "self_ref": "#/texts/16", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/17", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "formula", + "prov": [], + "orig": "\\left(x+a\\right)^{n}=\\sum_{k=0}^{n}\\left(\\genfrac{}{}{0pt}{}{n}{k}\\right)x^{k}a^{n-k}", + "text": "\\left(x+a\\right)^{n}=\\sum_{k=0}^{n}\\left(\\genfrac{}{}{0pt}{}{n}{k}\\right)x^{k}a^{n-k}" + }, + { + "self_ref": "#/texts/18", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/19", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "And that is an equation by itself. Cheers!", + "text": "And that is an equation by itself. Cheers!" 
+ }, + { + "self_ref": "#/texts/20", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/21", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "This is another equation:", + "text": "This is another equation:" + }, + { + "self_ref": "#/texts/22", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/23", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "formula", + "prov": [], + "orig": "\\left(1+x\\right)^{n}=1+\\frac{nx}{1!}+\\frac{n\\left(n-1\\right)x^{2}}{2!}+ \\text{ \\textellipsis }", + "text": "\\left(1+x\\right)^{n}=1+\\frac{nx}{1!}+\\frac{n\\left(n-1\\right)x^{2}}{2!}+ \\text{ \\textellipsis }" + }, + { + "self_ref": "#/texts/24", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/25", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text.", + "text": "This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. 
This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text." + }, + { + "self_ref": "#/texts/26", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/27", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/28", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "This is a word document and this is an inline equation: ", + "text": "This is a word document and this is an inline equation: " + }, + { + "self_ref": "#/texts/29", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "formula", + "prov": [], + "orig": "A= \\pi r^{2} ", + "text": "A= \\pi r^{2} " + }, + { + "self_ref": "#/texts/30", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": ". If instead, I want an equation by line, I can do this:", + "text": ". 
If instead, I want an equation by line, I can do this:" + }, + { + "self_ref": "#/texts/31", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/32", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "formula", + "prov": [], + "orig": "e^{x}=1+\\frac{x}{1!}+\\frac{x^{2}}{2!}+\\frac{x^{3}}{3!}+ \\text{ \\textellipsis } , - \\infty < x < \\infty", + "text": "e^{x}=1+\\frac{x}{1!}+\\frac{x^{2}}{2!}+\\frac{x^{3}}{3!}+ \\text{ \\textellipsis } , - \\infty < x < \\infty" + }, + { + "self_ref": "#/texts/33", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/34", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "And that is an equation by itself. Cheers!", + "text": "And that is an equation by itself. Cheers!" + }, + { + "self_ref": "#/texts/35", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/equations.docx.md b/tests/data/groundtruth/docling_v2/equations.docx.md new file mode 100644 index 00000000..a8253ebf --- /dev/null +++ b/tests/data/groundtruth/docling_v2/equations.docx.md @@ -0,0 +1,29 @@ +This is a word document and this is an inline equation: $A= \pi r^{2} $ . If instead, I want an equation by line, I can do this: + +$$a^{2}+b^{2}=c^{2} \text{ \texttimes } 23$$ + +And that is an equation by itself. Cheers! 
+ +This is another equation: + +$$f\left(x\right)=a_{0}+\sum_{n=1}^{ \infty }\left(a_{n}\cos(\frac{n \pi x}{L})+b_{n}\sin(\frac{n \pi x}{L})\right)$$ + +This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. + +This is a word document and this is an inline equation: $A= \pi r^{2} $ . If instead, I want an equation by line, I can do this: + +$$\left(x+a\right)^{n}=\sum_{k=0}^{n}\left(\genfrac{}{}{0pt}{}{n}{k}\right)x^{k}a^{n-k}$$ + +And that is an equation by itself. Cheers! + +This is another equation: + +$$\left(1+x\right)^{n}=1+\frac{nx}{1!}+\frac{n\left(n-1\right)x^{2}}{2!}+ \text{ \textellipsis }$$ + +This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. This is text. + +This is a word document and this is an inline equation: $A= \pi r^{2} $ . If instead, I want an equation by line, I can do this: + +$$e^{x}=1+\frac{x}{1!}+\frac{x^{2}}{2!}+\frac{x^{3}}{3!}+ \text{ \textellipsis } , - \infty < x < \infty$$ + +And that is an equation by itself. Cheers! 
\ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_03.html.json b/tests/data/groundtruth/docling_v2/example_03.html.json index bdfaaebe..5d373296 100644 --- a/tests/data/groundtruth/docling_v2/example_03.html.json +++ b/tests/data/groundtruth/docling_v2/example_03.html.json @@ -344,7 +344,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Header 1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -356,7 +356,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "Header 2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -368,7 +368,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "Header 3", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -493,7 +493,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Header 1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -505,7 +505,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "Header 2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -517,7 +517,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "Header 3", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } diff --git a/tests/data/groundtruth/docling_v2/example_04.html.json b/tests/data/groundtruth/docling_v2/example_04.html.json index 23c38904..854ba4c6 100644 --- a/tests/data/groundtruth/docling_v2/example_04.html.json +++ b/tests/data/groundtruth/docling_v2/example_04.html.json @@ -68,7 +68,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Header 1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -80,7 +80,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 3, "text": "Header 2 & 3 
(colspan)", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -181,7 +181,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Header 1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -193,7 +193,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 3, "text": "Header 2 & 3 (colspan)", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -205,7 +205,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 3, "text": "Header 2 & 3 (colspan)", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } diff --git a/tests/data/groundtruth/docling_v2/example_05.html.json b/tests/data/groundtruth/docling_v2/example_05.html.json index 3c5c1f0e..3168a637 100644 --- a/tests/data/groundtruth/docling_v2/example_05.html.json +++ b/tests/data/groundtruth/docling_v2/example_05.html.json @@ -68,7 +68,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Header 1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -80,7 +80,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 3, "text": "Header 2 & 3 (colspan)", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -181,7 +181,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Header 1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -193,7 +193,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 3, "text": "Header 2 & 3 (colspan)", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -205,7 +205,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 3, "text": "Header 2 & 3 (colspan)", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } diff --git 
a/tests/data/groundtruth/docling_v2/example_07.html.itxt b/tests/data/groundtruth/docling_v2/example_07.html.itxt new file mode 100644 index 00000000..675749eb --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_07.html.itxt @@ -0,0 +1,22 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: list: group list + item-2 at level 2: list_item: Asia + item-3 at level 3: list: group list + item-4 at level 4: list_item: China + item-5 at level 4: list_item: Japan + item-6 at level 4: list_item: Thailand + item-7 at level 2: list_item: Europe + item-8 at level 3: list: group list + item-9 at level 4: list_item: UK + item-10 at level 4: list_item: Germany + item-11 at level 4: list_item: Switzerland + item-12 at level 5: list: group list + item-13 at level 6: list: group list + item-14 at level 7: list_item: Bern + item-15 at level 7: list_item: Aargau + item-16 at level 4: list_item: Italy + item-17 at level 5: list: group list + item-18 at level 6: list: group list + item-19 at level 7: list_item: Piedmont + item-20 at level 7: list_item: Liguria + item-21 at level 2: list_item: Africa \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_07.html.json b/tests/data/groundtruth/docling_v2/example_07.html.json new file mode 100644 index 00000000..248a47bd --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_07.html.json @@ -0,0 +1,374 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.2.0", + "name": "example_07", + "origin": { + "mimetype": "text/html", + "binary_hash": 623628706615267627, + "filename": "example_07.html" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/groups/0" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": 
"#/body" + }, + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/13" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/texts/0" + }, + "children": [ + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/2", + "parent": { + "$ref": "#/texts/4" + }, + "children": [ + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/texts/10" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/3", + "parent": { + "$ref": "#/texts/7" + }, + "children": [ + { + "$ref": "#/groups/4" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/4", + "parent": { + "$ref": "#/groups/3" + }, + "children": [ + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/5", + "parent": { + "$ref": "#/texts/10" + }, + "children": [ + { + "$ref": "#/groups/6" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + }, + { + "self_ref": "#/groups/6", + "parent": { + "$ref": "#/groups/5" + }, + "children": [ + { + "$ref": "#/texts/11" + }, + { + "$ref": "#/texts/12" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/groups/1" + } + ], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Asia", + "text": "Asia", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": 
"list_item", + "prov": [], + "orig": "China", + "text": "China", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Japan", + "text": "Japan", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Thailand", + "text": "Thailand", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/groups/2" + } + ], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Europe", + "text": "Europe", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "UK", + "text": "UK", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Germany", + "text": "Germany", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/groups/2" + }, + "children": [ + { + "$ref": "#/groups/3" + } + ], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Switzerland", + "text": "Switzerland", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Bern", + "text": "Bern", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": 
"list_item", + "prov": [], + "orig": "Aargau", + "text": "Aargau", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/groups/2" + }, + "children": [ + { + "$ref": "#/groups/5" + } + ], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Italy", + "text": "Italy", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/groups/6" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Piedmont", + "text": "Piedmont", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/12", + "parent": { + "$ref": "#/groups/6" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Liguria", + "text": "Liguria", + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/13", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Africa", + "text": "Africa", + "enumerated": false, + "marker": "-" + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/example_07.html.md b/tests/data/groundtruth/docling_v2/example_07.html.md new file mode 100644 index 00000000..0a4b9fac --- /dev/null +++ b/tests/data/groundtruth/docling_v2/example_07.html.md @@ -0,0 +1,14 @@ +- Asia + - China + - Japan + - Thailand +- Europe + - UK + - Germany + - Switzerland + - Bern + - Aargau + - Italy + - Piedmont + - Liguria +- Africa \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json b/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json index 6783cf3f..b24c46ed 100644 --- a/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json +++ b/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json @@ -960,7 
+960,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 4, "text": "Class1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -972,7 +972,7 @@ "start_col_offset_idx": 4, "end_col_offset_idx": 7, "text": "Class2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1385,7 +1385,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 4, "text": "Class1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1397,7 +1397,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 4, "text": "Class1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1409,7 +1409,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 4, "text": "Class1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1421,7 +1421,7 @@ "start_col_offset_idx": 4, "end_col_offset_idx": 7, "text": "Class2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1433,7 +1433,7 @@ "start_col_offset_idx": 4, "end_col_offset_idx": 7, "text": "Class2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1445,7 +1445,7 @@ "start_col_offset_idx": 4, "end_col_offset_idx": 7, "text": "Class2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } diff --git a/tests/data/groundtruth/docling_v2/tablecell.docx.json b/tests/data/groundtruth/docling_v2/tablecell.docx.json index 51341f35..2a070468 100644 --- a/tests/data/groundtruth/docling_v2/tablecell.docx.json +++ b/tests/data/groundtruth/docling_v2/tablecell.docx.json @@ -176,7 +176,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Tab1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -188,7 +188,7 @@ "start_col_offset_idx": 1, 
"end_col_offset_idx": 2, "text": "Tab2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -200,7 +200,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "Tab3", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -289,7 +289,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Tab1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -301,7 +301,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "Tab2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -313,7 +313,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "Tab3", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } diff --git a/tests/data/groundtruth/docling_v2/test-01.xlsx.json b/tests/data/groundtruth/docling_v2/test-01.xlsx.json index 19a701c5..173cd5fb 100644 --- a/tests/data/groundtruth/docling_v2/test-01.xlsx.json +++ b/tests/data/groundtruth/docling_v2/test-01.xlsx.json @@ -136,7 +136,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "first ", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -148,7 +148,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "second ", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -160,7 +160,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "third", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -393,7 +393,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "first ", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -405,7 +405,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "second ", - "column_header": false, 
+ "column_header": true, "row_header": false, "row_section": false }, @@ -417,7 +417,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "third", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } @@ -675,7 +675,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "col-1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -687,7 +687,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "col-2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -699,7 +699,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "col-3", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -711,7 +711,7 @@ "start_col_offset_idx": 3, "end_col_offset_idx": 4, "text": "col-4", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1112,7 +1112,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "col-1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1124,7 +1124,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "col-2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1136,7 +1136,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "col-3", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1148,7 +1148,7 @@ "start_col_offset_idx": 3, "end_col_offset_idx": 4, "text": "col-4", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } @@ -1578,7 +1578,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "col-1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1590,7 +1590,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 
2, "text": "col-2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1602,7 +1602,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "col-3", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1763,7 +1763,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "col-1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1775,7 +1775,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "col-2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1787,7 +1787,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "col-3", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } @@ -1969,7 +1969,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "col-1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1981,7 +1981,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "col-2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1993,7 +1993,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "col-3", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -2154,7 +2154,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "col-1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -2166,7 +2166,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "col-2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -2178,7 +2178,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "col-3", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } @@ -2360,7 
+2360,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "first ", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -2372,7 +2372,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 3, "text": "header", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -2545,7 +2545,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "first ", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -2557,7 +2557,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 3, "text": "header", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -2569,7 +2569,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 3, "text": "header", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } @@ -2583,7 +2583,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "first ", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -2827,7 +2827,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "first (f)", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -2839,7 +2839,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 3, "text": "header (f)", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -3012,7 +3012,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "first (f)", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -3024,7 +3024,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 3, "text": "header (f)", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -3036,7 +3036,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 3, "text": "header (f)", - "column_header": 
false, + "column_header": true, "row_header": false, "row_section": false } @@ -3050,7 +3050,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "first (f)", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.json b/tests/data/groundtruth/docling_v2/wiki_duck.html.json index 5a4928be..eb93fe29 100644 --- a/tests/data/groundtruth/docling_v2/wiki_duck.html.json +++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.json @@ -7914,7 +7914,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 2, "text": "Duck\n", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -7950,7 +7950,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 2, "text": "Scientific classification \n", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -8130,7 +8130,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 2, "text": "Subfamilies\n", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -8159,7 +8159,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 2, "text": "Duck\n", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -8171,7 +8171,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 2, "text": "Duck\n", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } @@ -8237,7 +8237,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 2, "text": "Scientific classification \n", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -8249,7 +8249,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 2, "text": "Scientific classification \n", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } @@ -8445,7 +8445,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 
2, "text": "Subfamilies\n", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -8457,7 +8457,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 2, "text": "Subfamilies\n", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } @@ -8513,7 +8513,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 2, "text": "Authority control databases ", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -8578,7 +8578,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 2, "text": "Authority control databases ", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -8590,7 +8590,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 2, "text": "Authority control databases ", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } diff --git a/tests/data/groundtruth/docling_v2/word_sample.docx.json b/tests/data/groundtruth/docling_v2/word_sample.docx.json index 2f5dfa32..1d305cbc 100644 --- a/tests/data/groundtruth/docling_v2/word_sample.docx.json +++ b/tests/data/groundtruth/docling_v2/word_sample.docx.json @@ -490,7 +490,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -502,7 +502,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "Food", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -514,7 +514,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "Calories per portion", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -639,7 +639,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -651,7 +651,7 @@ 
"start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "Food", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -663,7 +663,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "Calories per portion", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } diff --git a/tests/data/groundtruth/docling_v2/word_tables.docx.html b/tests/data/groundtruth/docling_v2/word_tables.docx.html index 00a790c0..2dc087f7 100644 --- a/tests/data/groundtruth/docling_v2/word_tables.docx.html +++ b/tests/data/groundtruth/docling_v2/word_tables.docx.html @@ -71,19 +71,19 @@

Test with tables

A uniform table

-
Header 0.0Header 0.1Header 0.2
Cell 1.0Cell 1.1Cell 1.2
Cell 2.0Cell 2.1Cell 2.2
+
Header 0.0Header 0.1Header 0.2
Cell 1.0Cell 1.1Cell 1.2
Cell 2.0Cell 2.1Cell 2.2

A non-uniform table with horizontal spans

-
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 1.2
Cell 2.0Merged Cell 2.1 2.2
+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 1.2
Cell 2.0Merged Cell 2.1 2.2

A non-uniform table with horizontal spans in inner columns

-
Header 0.0Header 0.1Header 0.2Header 0.3
Cell 1.0Merged Cell 1.1 1.2Cell 1.3
Cell 2.0Merged Cell 2.1 2.2Cell 2.3
+
Header 0.0Header 0.1Header 0.2Header 0.3
Cell 1.0Merged Cell 1.1 1.2Cell 1.3
Cell 2.0Merged Cell 2.1 2.2Cell 2.3

A non-uniform table with vertical spans

-
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 2.1Cell 1.2
Cell 2.0Cell 2.2
Cell 3.0Merged Cell 3.1 4.1Cell 3.2
Cell 4.0Cell 4.2
+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 2.1Cell 1.2
Cell 2.0Cell 2.2
Cell 3.0Merged Cell 3.1 4.1Cell 3.2
Cell 4.0Cell 4.2

A non-uniform table with all kinds of spans and empty cells

-
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 2.1Cell 1.2
Cell 2.0Cell 2.2
Cell 3.0Merged Cell 3.1 4.1Cell 3.2
Cell 4.0Cell 4.2Merged Cell 4.4 5.4
Cell 8.4
+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 2.1Cell 1.2
Cell 2.0Cell 2.2
Cell 3.0Merged Cell 3.1 4.1Cell 3.2
Cell 4.0Cell 4.2Merged Cell 4.4 5.4
Cell 8.4

\ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_tables.docx.json b/tests/data/groundtruth/docling_v2/word_tables.docx.json index 5a2642be..db919637 100644 --- a/tests/data/groundtruth/docling_v2/word_tables.docx.json +++ b/tests/data/groundtruth/docling_v2/word_tables.docx.json @@ -261,7 +261,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Header 0.0", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -273,7 +273,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "Header 0.1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -285,7 +285,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "Header 0.2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -374,7 +374,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Header 0.0", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -386,7 +386,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "Header 0.1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -398,7 +398,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "Header 0.2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } @@ -504,7 +504,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Header 0.0", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -516,7 +516,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "Header 0.1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -528,7 +528,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "Header 0.2", - "column_header": false, + "column_header": true, "row_header": false, 
"row_section": false }, @@ -593,7 +593,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Header 0.0", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -605,7 +605,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "Header 0.1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -617,7 +617,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "Header 0.2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } @@ -723,7 +723,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Header 0.0", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -735,7 +735,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "Header 0.1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -747,7 +747,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "Header 0.2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -759,7 +759,7 @@ "start_col_offset_idx": 3, "end_col_offset_idx": 4, "text": "Header 0.3", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -848,7 +848,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Header 0.0", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -860,7 +860,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "Header 0.1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -872,7 +872,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "Header 0.2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -884,7 +884,7 @@ "start_col_offset_idx": 3, "end_col_offset_idx": 4, 
"text": "Header 0.3", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } @@ -1014,7 +1014,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Header 0.0", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1026,7 +1026,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "Header 0.1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1038,7 +1038,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "Header 0.2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1175,7 +1175,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Header 0.0", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1187,7 +1187,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "Header 0.1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1199,7 +1199,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "Header 0.2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } @@ -1381,7 +1381,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Header 0.0", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1393,7 +1393,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "Header 0.1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1405,7 +1405,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "Header 0.2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1417,7 +1417,7 @@ "start_col_offset_idx": 3, "end_col_offset_idx": 4, "text": "", - "column_header": false, + "column_header": true, "row_header": 
false, "row_section": false }, @@ -1429,7 +1429,7 @@ "start_col_offset_idx": 4, "end_col_offset_idx": 5, "text": "", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1818,7 +1818,7 @@ "start_col_offset_idx": 0, "end_col_offset_idx": 1, "text": "Header 0.0", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1830,7 +1830,7 @@ "start_col_offset_idx": 1, "end_col_offset_idx": 2, "text": "Header 0.1", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1842,7 +1842,7 @@ "start_col_offset_idx": 2, "end_col_offset_idx": 3, "text": "Header 0.2", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1854,7 +1854,7 @@ "start_col_offset_idx": 3, "end_col_offset_idx": 4, "text": "", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false }, @@ -1866,7 +1866,7 @@ "start_col_offset_idx": 4, "end_col_offset_idx": 5, "text": "", - "column_header": false, + "column_header": true, "row_header": false, "row_section": false } diff --git a/tests/data/html/example_07.html b/tests/data/html/example_07.html new file mode 100644 index 00000000..a8c58c17 --- /dev/null +++ b/tests/data/html/example_07.html @@ -0,0 +1,40 @@ + + +
    +
  • Asia +
      +
    • China
    • +
    • Japan
    • +
    • Thailand
    • +
    +
  • +
  • Europe +
      +
    • UK
    • +
    • Germany
    • +
    • Switzerland +
        +
      • +
          +
        • Bern
        • +
        • Aargau
        • +
        +
      • +
      +
    • +
    • Italy +
        +
      • +
          +
        • Piedmont
        • +
        • Liguria
        • +
        +
      • +
      +
    • +
    +
  • +
  • Africa
  • +
+ + \ No newline at end of file diff --git a/tests/test_backend_csv.py b/tests/test_backend_csv.py index cb39f000..2eee27bf 100644 --- a/tests/test_backend_csv.py +++ b/tests/test_backend_csv.py @@ -59,7 +59,11 @@ def test_e2e_valid_csv_conversions(): pred_itxt, str(gt_path) + ".itxt" ), "export to indented-text" - assert verify_document(doc, str(gt_path) + ".json"), "export to json" + assert verify_document( + pred_doc=doc, + gtfile=str(gt_path) + ".json", + generate=GENERATE, + ), "export to json" def test_e2e_invalid_csv_conversions(): diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index 17c134b4..f9843c78 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -91,4 +91,8 @@ def test_e2e_docx_conversions(): if docx_path.name == "word_tables.docx": pred_html: str = doc.export_to_html() - assert verify_export(pred_html, str(gt_path) + ".html"), "export to html" + assert verify_export( + pred_text=pred_html, + gtfile=str(gt_path) + ".html", + generate=GENERATE, + ), "export to html" diff --git a/tests/test_input_doc.py b/tests/test_input_doc.py index 198dc965..66afe286 100644 --- a/tests/test_input_doc.py +++ b/tests/test_input_doc.py @@ -179,7 +179,7 @@ def test_guess_format(tmp_path): # Non-Docling JSON # TODO: Docling JSON is currently the single supported JSON flavor and the pipeline # will try to validate *any* JSON (based on suffix/MIME) as Docling JSON; proper - # disambiguation seen as part of https://github.com/DS4SD/docling/issues/802 + # disambiguation seen as part of https://github.com/docling-project/docling/issues/802 test_str = "{}" stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode())) assert dci._guess_format(stream) == InputFormat.JSON_DOCLING