test: update results with new docling-core

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Michele Dolfi 2025-01-30 11:39:51 +01:00
parent f9144f2bb6
commit 22a64ccf45
17 changed files with 43 additions and 43 deletions

poetry.lock (generated)

@@ -861,13 +861,13 @@ files = [
[[package]]
name = "docling-core"
-version = "2.15.1"
+version = "2.16.1"
description = "A python library to define and validate data types in Docling."
optional = false
python-versions = "<4.0,>=3.9"
files = [
-{file = "docling_core-2.15.1-py3-none-any.whl", hash = "sha256:33152604e1f14d5caccbef099c73163c3f211d0b4d92403d262c308633cc0451"},
-{file = "docling_core-2.15.1.tar.gz", hash = "sha256:588d941b5bfc393a79e779ab64819c60763e7f182ec5221ee37da4be91dd802f"},
+{file = "docling_core-2.16.1-py3-none-any.whl", hash = "sha256:d26af2f49e9f1f65ae5dfca972e206860339c1f91adfe427fa67d1cf95cce241"},
+{file = "docling_core-2.16.1.tar.gz", hash = "sha256:676f51fa5797c91a86ccbc1fdaa020effcde4cc86aa9b094a0d5d775636871ba"},
]
[package.dependencies]
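The hunk above pins docling-core 2.16.1 in the lock file. As a minimal sketch (not part of this commit, and assuming the common `packaging` helper is available), one can verify that a local environment resolved the bumped release after `poetry install`:

```python
# Sketch: confirm the locked docling-core release is what is actually installed.
from importlib.metadata import version
from packaging.version import Version

installed = Version(version("docling-core"))
assert installed >= Version("2.16.1"), f"expected docling-core >= 2.16.1, found {installed}"
print(f"docling-core {installed} is installed")
```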
@@ -3823,10 +3823,10 @@ files = [
numpy = [
{version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
{version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
-{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
-{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
+{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
+{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
]
[[package]]
@@ -3849,10 +3849,10 @@ files = [
numpy = [
{version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
{version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
-{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
-{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
+{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
+{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
]
[[package]]
@@ -4037,8 +4037,8 @@ files = [
[package.dependencies]
numpy = [
{version = ">=1.22.4", markers = "python_version < \"3.11\""},
-{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
+{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
]
python-dateutil = ">=2.8.2"
pytz = ">=2020.1"
@ -7751,4 +7751,4 @@ tesserocr = ["tesserocr"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.9" python-versions = "^3.9"
content-hash = "08d30cee8d77f9beee32d5dbec1643367ecae2b4c4b47b57fcb337711471eb5c" content-hash = "9594a837bfcc96f1bad0d5a9210cec0008e33130f0a4557e7e711d066ac8e791"

pyproject.toml

@ -26,7 +26,7 @@ packages = [{include = "docling"}]
###################### ######################
python = "^3.9" python = "^3.9"
pydantic = "^2.0.0" pydantic = "^2.0.0"
docling-core = { version = "^2.15.1", extras = ["chunking"] } docling-core = {version = "^2.16.1", extras = ["chunking"]}
docling-ibm-models = "^3.3.0" docling-ibm-models = "^3.3.0"
deepsearch-glm = "^1.0.0" deepsearch-glm = "^1.0.0"
docling-parse = "^3.1.0" docling-parse = "^3.1.0"
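This looser constraint on docling-core is what pulls the new serialization behaviour into the test suite; the ground-truth diffs below were refreshed against it. A minimal sketch of how such an expected-output file is typically regenerated with docling's converter API (paths are illustrative placeholders, not taken from this commit):

```python
# Sketch: convert a test document and dump its markdown export, which is the
# format stored in the ground-truth files below. Paths are placeholders.
from pathlib import Path

from docling.document_converter import DocumentConverter

result = DocumentConverter().convert("tests/data/sample.pdf")  # hypothetical input
Path("sample.gt.md").write_text(result.document.export_to_markdown(), encoding="utf-8")
```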

File diff suppressed because one or more lines are too long

View File

@@ -141,13 +141,13 @@ tention encoding is then multiplied to the encoded image to produce a feature fo
The output features for each table cell are then fed into the feed-forward network (FFN). The FFN consists of a Multi-Layer Perceptron (3 layers with ReLU activation function) that predicts the normalized coordinates for the bounding box of each table cell. Finally, the predicted bounding boxes are classified based on whether they are empty or not using a linear layer.
-Loss Functions. We formulate a multi-task loss Eq. 2 to train our network. The Cross-Entropy loss (denoted as l$\_{s}$ ) is used to train the Structure Decoder which predicts the structure tokens. As for the Cell BBox Decoder it is trained with a combination of losses denoted as l$\_{box}$ . l$\_{box}$ consists of the generally used l$\_{1}$ loss for object detection and the IoU loss ( l$\_{iou}$ ) to be scale invariant as explained in [25]. In comparison to DETR, we do not use the Hungarian algorithm [15] to match the predicted bounding boxes with the ground-truth boxes, as we have already achieved a one-toone match through two steps: 1) Our token input sequence is naturally ordered, therefore the hidden states of the table data cells are also in order when they are provided as input to the Cell BBox Decoder , and 2) Our bounding boxes generation mechanism (see Sec. 3) ensures a one-to-one mapping between the cell content and its bounding box for all post-processed datasets.
+Loss Functions. We formulate a multi-task loss Eq. 2 to train our network. The Cross-Entropy loss (denoted as l$_{s}$ ) is used to train the Structure Decoder which predicts the structure tokens. As for the Cell BBox Decoder it is trained with a combination of losses denoted as l$_{box}$ . l$_{box}$ consists of the generally used l$_{1}$ loss for object detection and the IoU loss ( l$_{iou}$ ) to be scale invariant as explained in [25]. In comparison to DETR, we do not use the Hungarian algorithm [15] to match the predicted bounding boxes with the ground-truth boxes, as we have already achieved a one-toone match through two steps: 1) Our token input sequence is naturally ordered, therefore the hidden states of the table data cells are also in order when they are provided as input to the Cell BBox Decoder , and 2) Our bounding boxes generation mechanism (see Sec. 3) ensures a one-to-one mapping between the cell content and its bounding box for all post-processed datasets.
The loss used to train the TableFormer can be defined as following:
-l$\_{box}$ = λ$\_{iou}$l$\_{iou}$ + λ$\_{l}$$\_{1}$ l = λl$\_{s}$ + (1 - λ ) l$\_{box}$ (1)
-where λ ∈ [0, 1], and λ$\_{iou}$, λ$\_{l}$$\_{1}$ ∈$\_{R}$ are hyper-parameters.
+$$l$\_{box}$ = λ$\_{iou}$l$\_{iou}$ + λ$\_{l}$$_{1}$ l = λl$_{s}$ + (1 - λ ) l$_{box}$ (1)$$
+where λ ∈ [0, 1], and λ$\_{iou}$, λ$\_{l}$$_{1}$ ∈$_{R}$ are hyper-parameters.
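For readability, the two relations that the flattened ground-truth line above encodes (reconstructed from the surrounding description of l$_{s}$, l$_{iou}$ and l$_{1}$) are:

$$l_{box} = \lambda_{iou}\, l_{iou} + \lambda_{l_1}\, l_{1}, \qquad l = \lambda\, l_{s} + (1 - \lambda)\, l_{box} \qquad (1)$$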
## 5. Experimental Results
@@ -155,7 +155,7 @@ where λ ∈ [0, 1], and λ$\_{iou}$, λ$\_{l}$$\_{1}$ ∈$\_{R}$ are hyper-para
TableFormer uses ResNet-18 as the CNN Backbone Network . The input images are resized to 448*448 pixels and the feature map has a dimension of 28*28. Additionally, we enforce the following input constraints:
-Image width and height ≤ 1024 pixels Structural tags length ≤ 512 tokens. (2)
+$$Image width and height ≤ 1024 pixels Structural tags length ≤ 512 tokens. (2)$$
Although input constraints are used also by other methods, such as EDD, ours are less restrictive due to the improved
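Spelled out, the input constraints of Eq. (2) above are:

$$\text{image width, height} \leq 1024 \text{ pixels}, \qquad \text{structural tags length} \leq 512 \text{ tokens} \qquad (2)$$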
@@ -177,9 +177,9 @@ We also share our baseline results on the challenging SynthTabNet dataset. Throu
The Tree-Edit-Distance-Based Similarity (TEDS) metric was introduced in [37]. It represents the prediction, and ground-truth as a tree structure of HTML tags. This similarity is calculated as:
-TEDS ( T$\_{a}$, T$\_{b}$ ) = 1 - EditDist ( T$\_{a}$, T$\_{b}$ ) max ( | T$\_{a}$ | , | T$\_{b}$ | ) (3)
-where T$\_{a}$ and T$\_{b}$ represent tables in tree structure HTML format. EditDist denotes the tree-edit distance, and | T | represents the number of nodes in T .
+$$TEDS ( T$\_{a}$, T$\_{b}$ ) = 1 - EditDist ( T$\_{a}$, T$\_{b}$ ) max ( | T$\_{a}$ | , | T$\_{b}$ | ) (3)$$
+where T$_{a}$ and T$_{b}$ represent tables in tree structure HTML format. EditDist denotes the tree-edit distance, and | T | represents the number of nodes in T .
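Written with the fraction that the flattened ground-truth line drops, the TEDS score above reads:

$$\mathrm{TEDS}(T_a, T_b) = 1 - \frac{\mathrm{EditDist}(T_a, T_b)}{\max(|T_a|, |T_b|)} \qquad (3)$$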
## 5.4. Quantitative Analysis
@@ -240,17 +240,17 @@ b. Structure predicted by TableFormer, with superimposed matched PDF cell text:
<!-- image -->
| | | 論文ファイル | 論文ファイル | 参考文献 | 参考文献 |
-|----------------------------------------------------|-------------|----------------|----------------|------------|------------|
+|----------------------------------------------------|--------|----------|----------|--------|--------|
| 出典 | ファイル 数 | 英語 | 日本語 | 英語 | 日本語 |
| Association for Computational Linguistics(ACL2003) | 65 | 65 | 0 | 150 | 0 |
| Computational Linguistics(COLING2002) | 140 | 140 | 0 | 150 | 0 |
| 電気情報通信学会 2003 年総合大会 | 150 | 8 | 142 | 223 | 147 |
| 情報処理学会第 65 回全国大会 (2003) | 177 | 1 | 176 | 150 | 236 |
| 第 17 回人工知能学会全国大会 (2003) | 208 | 5 | 203 | 152 | 244 |
| 自然言語処理研究会第 146 〜 155 回 | 98 | 2 | 96 | 150 | 232 |
| WWW から収集した論文 | 107 | 73 | 34 | 147 | 96 |
| | 945 | 294 | 651 | 1122 | 955 |
Text is aligned to match original for ease of viewing
@@ -377,9 +377,9 @@ Here is a step-by-step description of the prediction postprocessing:
- 3.a. If all IOU scores in a column are below the threshold, discard all predictions (structure and bounding boxes) for that column.
- 4. Find the best-fitting content alignment for the predicted cells with good IOU per each column. The alignment of the column can be identified by the following formula:
-alignment = arg min c { D$\_{c}$ } D$\_{c}$ = max { x$\_{c}$ } - min { x$\_{c}$ } (4)
-where c is one of { left, centroid, right } and x$\_{c}$ is the xcoordinate for the corresponding point.
+$$alignment = arg min c { D$\_{c}$ } D$\_{c}$ = max { x$\_{c}$ } - min { x$\_{c}$ } (4)$$
+where c is one of { left, centroid, right } and x$_{c}$ is the xcoordinate for the corresponding point.
- 5. Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me-
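For reference, the column-alignment criterion of Eq. (4) a few lines above, written out:

$$\text{alignment} = \arg\min_{c} \{ D_c \}, \qquad D_c = \max\{x_c\} - \min\{x_c\}, \qquad c \in \{\text{left}, \text{centroid}, \text{right}\}$$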

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -16,7 +16,7 @@ Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod
Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet, consectetuer adipiscing elit, sed diam nonummy nibh euismod tincidunt.
-a 2 + 8 = 12
+$$a 2 + 8 = 12$$
Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long