Update tests

Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-27 04:24:45 +00:00 · 2024-12-10 16:47:58 +01:00 · 2024-12-10 16:47:58 +01:00 · 1de42bef6a
commit 1de42bef6a
parent 5e013294f9
34 changed files with 267 additions and 133 deletions
--- a/tests/data/groundtruth/docling_v1/2203.01017v2.doctags.txt
+++ b/tests/data/groundtruth/docling_v1/2203.01017v2.doctags.txt
@ -153,20 +153,41 @@
 </table>
 <paragraph><location><page_8><loc_9><loc_89><loc_10><loc_90></location>- a.</paragraph>
 <paragraph><location><page_8><loc_11><loc_89><loc_82><loc_90></location>- Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells</paragraph>
-<caption><location><page_8><loc_9><loc_87><loc_70><loc_88></location>Japanese language (previously unseen by TableFormer): Example table from FinTabNet:</caption>
-<caption><location><page_8><loc_9><loc_73><loc_63><loc_74></location>b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
+<subtitle-level-1><location><page_8><loc_9><loc_87><loc_46><loc_88></location>Japanese language (previously unseen by TableFormer):</subtitle-level-1>
+<subtitle-level-1><location><page_8><loc_50><loc_87><loc_70><loc_88></location>Example table from FinTabNet:</subtitle-level-1>
 <figure>
 <location><page_8><loc_8><loc_76><loc_49><loc_87></location>
-<caption>Japanese language (previously unseen by TableFormer): Example table from FinTabNet:b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
 </figure>
+<caption><location><page_8><loc_9><loc_73><loc_63><loc_74></location>b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
 <figure>
+<location><page_8><loc_50><loc_77><loc_91><loc_88></location>
+<caption>b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
+</figure>
+<table>
 <location><page_8><loc_9><loc_63><loc_49><loc_72></location>
-</figure>
+<row_0><col_0><body></col_0><col_1><body></col_1><col_2><col_header>論文ファイル</col_2><col_3><col_header>論文ファイル</col_3><col_4><col_header>参考文献</col_4><col_5><col_header>参考文献</col_5></row_0>
+<row_1><col_0><col_header>出典</col_0><col_1><col_header>ファイル 数</col_1><col_2><col_header>英語</col_2><col_3><col_header>日本語</col_3><col_4><col_header>英語</col_4><col_5><col_header>日本語</col_5></row_1>
+<row_2><col_0><row_header>Association for Computational Linguistics(ACL2003)</col_0><col_1><body>65</col_1><col_2><body>65</col_2><col_3><body>0</col_3><col_4><body>150</col_4><col_5><body>0</col_5></row_2>
+<row_3><col_0><row_header>Computational Linguistics(COLING2002)</col_0><col_1><body>140</col_1><col_2><body>140</col_2><col_3><body>0</col_3><col_4><body>150</col_4><col_5><body>0</col_5></row_3>
+<row_4><col_0><row_header>電気情報通信学会 2003 年総合大会</col_0><col_1><body>150</col_1><col_2><body>8</col_2><col_3><body>142</col_3><col_4><body>223</col_4><col_5><body>147</col_5></row_4>
+<row_5><col_0><row_header>情報処理学会第 65 回全国大会 (2003)</col_0><col_1><body>177</col_1><col_2><body>1</col_2><col_3><body>176</col_3><col_4><body>150</col_4><col_5><body>236</col_5></row_5>
+<row_6><col_0><row_header>第 17 回人工知能学会全国大会 (2003)</col_0><col_1><body>208</col_1><col_2><body>5</col_2><col_3><body>203</col_3><col_4><body>152</col_4><col_5><body>244</col_5></row_6>
+<row_7><col_0><row_header>自然言語処理研究会第 146 〜 155 回</col_0><col_1><body>98</col_1><col_2><body>2</col_2><col_3><body>96</col_3><col_4><body>150</col_4><col_5><body>232</col_5></row_7>
+<row_8><col_0><row_header>WWW から収集した論文</col_0><col_1><body>107</col_1><col_2><body>73</col_2><col_3><body>34</col_3><col_4><body>147</col_4><col_5><body>96</col_5></row_8>
+<row_9><col_0><body></col_0><col_1><body>945</col_1><col_2><body>294</col_2><col_3><body>651</col_3><col_4><body>1122</col_4><col_5><body>955</col_5></row_9>
+</table>
 <caption><location><page_8><loc_62><loc_62><loc_90><loc_63></location>Text is aligned to match original for ease of viewing</caption>
-<figure>
+<table>
 <location><page_8><loc_50><loc_64><loc_90><loc_72></location>
 <caption>Text is aligned to match original for ease of viewing</caption>
-</figure>
+<row_0><col_0><body></col_0><col_1><col_header>Shares (in millions)</col_1><col_2><col_header>Shares (in millions)</col_2><col_3><col_header>Weighted Average Grant Date Fair Value</col_3><col_4><col_header>Weighted Average Grant Date Fair Value</col_4></row_0>
+<row_1><col_0><body></col_0><col_1><col_header>RS U s</col_1><col_2><col_header>PSUs</col_2><col_3><col_header>RSUs</col_3><col_4><col_header>PSUs</col_4></row_1>
+<row_2><col_0><row_header>Nonvested on Janua ry 1</col_0><col_1><body>1. 1</col_1><col_2><body>0.3</col_2><col_3><body>90.10 $</col_3><col_4><body>$ 91.19</col_4></row_2>
+<row_3><col_0><row_header>Granted</col_0><col_1><body>0. 5</col_1><col_2><body>0.1</col_2><col_3><body>117.44</col_3><col_4><body>122.41</col_4></row_3>
+<row_4><col_0><row_header>Vested</col_0><col_1><body>(0. 5 )</col_1><col_2><body>(0.1)</col_2><col_3><body>87.08</col_3><col_4><body>81.14</col_4></row_4>
+<row_5><col_0><row_header>Canceled or forfeited</col_0><col_1><body>(0. 1 )</col_1><col_2><body>-</col_2><col_3><body>102.01</col_3><col_4><body>92.18</col_4></row_5>
+<row_6><col_0><row_header>Nonvested on December 31</col_0><col_1><body>1.0</col_1><col_2><body>0.3</col_2><col_3><body>104.85 $</col_3><col_4><body>$ 104.51</col_4></row_6>
+</table>
 <caption><location><page_8><loc_8><loc_54><loc_89><loc_59></location>Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.</caption>
 <figure>
 <location><page_8><loc_8><loc_44><loc_35><loc_52></location>
@ -275,7 +296,7 @@
 <paragraph><location><page_13><loc_10><loc_35><loc_45><loc_37></location>Figure 8: Example of a table with multi-line header.</paragraph>
 <caption><location><page_13><loc_50><loc_59><loc_89><loc_61></location>Figure 9: Example of a table with big empty distance between cells.</caption>
 <figure>
-<location><page_13><loc_51><loc_63><loc_91><loc_87></location>
+<location><page_13><loc_51><loc_63><loc_70><loc_68></location>
 <caption>Figure 9: Example of a table with big empty distance between cells.</caption>
 </figure>
 <caption><location><page_13><loc_51><loc_13><loc_89><loc_14></location>Figure 10: Example of a complex table with empty cells.</caption>
@ -298,11 +319,7 @@
 <location><page_14><loc_52><loc_55><loc_87><loc_89></location>
 <caption>Figure 13: Table predictions example on colorful table.</caption>
 </figure>
-<caption><location><page_14><loc_56><loc_13><loc_83><loc_14></location>Figure 14: Example with multi-line text.</caption>
-<figure>
-<location><page_14><loc_52><loc_25><loc_85><loc_31></location>
-<caption>Figure 14: Example with multi-line text.</caption>
-</figure>
+<paragraph><location><page_14><loc_56><loc_13><loc_83><loc_14></location>Figure 14: Example with multi-line text.</paragraph>
 <figure>
 <location><page_15><loc_9><loc_69><loc_46><loc_83></location>
 </figure>
@ -318,9 +335,6 @@
 <caption>Figure 15: Example with triangular table.</caption>
 </figure>
 <figure>
-<location><page_15><loc_53><loc_72><loc_86><loc_85></location>
-</figure>
-<figure>
 <location><page_15><loc_53><loc_41><loc_86><loc_54></location>
 </figure>
 <caption><location><page_15><loc_50><loc_15><loc_89><loc_18></location>Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.</caption>
--- a/tests/data/groundtruth/docling_v1/2203.01017v2.json
+++ b/tests/data/groundtruth/docling_v1/2203.01017v2.json
--- a/tests/data/groundtruth/docling_v1/2203.01017v2.md
+++ b/tests/data/groundtruth/docling_v1/2203.01017v2.md
@ -219,18 +219,40 @@ Table 4: Results of structure with content retrieved using cell detection on Pub

 - Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells

-Japanese language (previously unseen by TableFormer): Example table from FinTabNet:
+## Japanese language (previously unseen by TableFormer):
+
+## Example table from FinTabNet:
+
+
+<!-- image -->

 b. Structure predicted by TableFormer, with superimposed matched PDF cell text:
-
-Japanese language (previously unseen by TableFormer): Example table from FinTabNet:b. Structure predicted by TableFormer, with superimposed matched PDF cell text:
 <!-- image -->


-<!-- image -->
+
+|                                                    |             | 論文ファイル   | 論文ファイル   | 参考文献   | 参考文献   |
+|----------------------------------------------------|-------------|----------------|----------------|------------|------------|
+| 出典                                               | ファイル 数 | 英語           | 日本語         | 英語       | 日本語     |
+| Association for Computational Linguistics(ACL2003) | 65          | 65             | 0              | 150        | 0          |
+| Computational Linguistics(COLING2002)              | 140         | 140            | 0              | 150        | 0          |
+| 電気情報通信学会 2003 年総合大会                   | 150         | 8              | 142            | 223        | 147        |
+| 情報処理学会第 65 回全国大会 (2003)                | 177         | 1              | 176            | 150        | 236        |
+| 第 17 回人工知能学会全国大会 (2003)                | 208         | 5              | 203            | 152        | 244        |
+| 自然言語処理研究会第 146 〜 155 回                 | 98          | 2              | 96             | 150        | 232        |
+| WWW から収集した論文                               | 107         | 73             | 34             | 147        | 96         |
+|                                                    | 945         | 294            | 651            | 1122       | 955        |

 Text is aligned to match original for ease of viewing
-<!-- image -->
+
+|                          | Shares (in millions)   | Shares (in millions)   | Weighted Average Grant Date Fair Value   | Weighted Average Grant Date Fair Value   |
+|--------------------------|------------------------|------------------------|------------------------------------------|------------------------------------------|
+|                          | RS U s                 | PSUs                   | RSUs                                     | PSUs                                     |
+| Nonvested on Janua ry 1  | 1. 1                   | 0.3                    | 90.10 $                                  | $ 91.19                                  |
+| Granted                  | 0. 5                   | 0.1                    | 117.44                                   | 122.41                                   |
+| Vested                   | (0. 5 )                | (0.1)                  | 87.08                                    | 81.14                                    |
+| Canceled or forfeited    | (0. 1 )                | -                      | 102.01                                   | 92.18                                    |
+| Nonvested on December 31 | 1.0                    | 0.3                    | 104.85 $                                 | $ 104.51                                 |

 Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.
 <!-- image -->
@ -436,7 +458,6 @@ Figure 13: Table predictions example on colorful table.
 <!-- image -->

 Figure 14: Example with multi-line text.
-<!-- image -->


 <!-- image -->
@ -451,9 +472,6 @@ Figure 15: Example with triangular table.
 <!-- image -->


-<!-- image -->
-
-
 <!-- image -->

 Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.
--- a/tests/data/groundtruth/docling_v1/2203.01017v2.pages.json
+++ b/tests/data/groundtruth/docling_v1/2203.01017v2.pages.json
--- a/tests/data/groundtruth/docling_v1/2206.01062.doctags.txt
+++ b/tests/data/groundtruth/docling_v1/2206.01062.doctags.txt
@ -3,16 +3,17 @@
 <paragraph><location><page_1><loc_15><loc_77><loc_32><loc_83></location>Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com</paragraph>
 <paragraph><location><page_1><loc_42><loc_77><loc_58><loc_83></location>Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com</paragraph>
 <paragraph><location><page_1><loc_69><loc_77><loc_85><loc_83></location>Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com</paragraph>
-<paragraph><location><page_1><loc_28><loc_71><loc_45><loc_76></location>Ahmed S. Nassar IBM Research Rueschlikon, Switzerland</paragraph>
-<paragraph><location><page_1><loc_29><loc_70><loc_44><loc_71></location>ahn@zurich.ibm.com</paragraph>
+<paragraph><location><page_1><loc_28><loc_70><loc_45><loc_76></location>Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com</paragraph>
+<paragraph><location><page_1><loc_55><loc_70><loc_72><loc_76></location>Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com</paragraph>
 <subtitle-level-1><location><page_1><loc_9><loc_67><loc_18><loc_69></location>ABSTRACT</subtitle-level-1>
 <paragraph><location><page_1><loc_9><loc_33><loc_48><loc_67></location>Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis.</paragraph>
 <subtitle-level-1><location><page_1><loc_9><loc_29><loc_22><loc_30></location>CCS CONCEPTS</subtitle-level-1>
 <paragraph><location><page_1><loc_9><loc_25><loc_49><loc_29></location>· Information systems → Document structure ; · Applied computing → Document analysis ; · Computing methodologies → Machine learning ; Computer vision ; Object detection ;</paragraph>
 <paragraph><location><page_1><loc_9><loc_15><loc_48><loc_20></location>Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).</paragraph>
-<paragraph><location><page_1><loc_9><loc_12><loc_32><loc_15></location>KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08.</paragraph>
+<paragraph><location><page_1><loc_9><loc_14><loc_32><loc_15></location>KDD '22, August 14-18, 2022, Washington, DC, USA</paragraph>
+<paragraph><location><page_1><loc_9><loc_13><loc_31><loc_14></location>© 2022 Copyright held by the owner/author(s).</paragraph>
+<paragraph><location><page_1><loc_9><loc_12><loc_26><loc_13></location>ACM ISBN 978-1-4503-9385-0/22/08.</paragraph>
 <paragraph><location><page_1><loc_9><loc_11><loc_27><loc_12></location>https://doi.org/10.1145/3534678.3539043</paragraph>
-<paragraph><location><page_1><loc_55><loc_70><loc_72><loc_76></location>Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com</paragraph>
 <caption><location><page_1><loc_52><loc_29><loc_91><loc_32></location>Figure 1: Four examples of complex page layouts across different document categories</caption>
 <figure>
 <location><page_1><loc_53><loc_34><loc_90><loc_68></location>
@ -152,6 +153,10 @@
 <row_11><col_0><row_header>Title</col_0><col_1><body>77</col_1><col_2><body>Sec.-h.</col_2><col_3><body>Sec.-h.</col_3><col_4><body>Sec.-h.</col_4></row_11>
 <row_12><col_0><row_header>Overall</col_0><col_1><body>72</col_1><col_2><body>73</col_2><col_3><body>78</col_3><col_4><body>77</col_4></row_12>
 </table>
+<subtitle-level-1><location><page_7><loc_9><loc_58><loc_21><loc_60></location>Learning Curve</subtitle-level-1>
+<paragraph><location><page_7><loc_9><loc_33><loc_48><loc_58></location>One of the fundamental questions related to any dataset is if it is "large enough". To answer this question for DocLayNet, we performed a data ablation study in which we evaluated a Mask R-CNN model trained on increasing fractions of the DocLayNet dataset. As can be seen in Figure 5, the mAP score rises sharply in the beginning and eventually levels out. To estimate the error-bar on the metrics, we ran the training five times on the entire data-set. This resulted in a 1% error-bar, depicted by the shaded area in Figure 5. In the inset of Figure 5, we show the exact same data-points, but with a logarithmic scale on the x-axis. As is expected, the mAP score increases linearly as a function of the data-size in the inset. The curve ultimately flattens out between the 80% and 100% mark, with the 80% mark falling within the error-bars of the 100% mark. This provides a good indication that the model would not improve significantly by yet increasing the data size. Rather, it would probably benefit more from improved data consistency (as discussed in Section 3), data augmentation methods [23], or the addition of more document categories and styles.</paragraph>
+<subtitle-level-1><location><page_7><loc_9><loc_30><loc_27><loc_32></location>Impact of Class Labels</subtitle-level-1>
+<paragraph><location><page_7><loc_9><loc_11><loc_48><loc_30></location>The choice and number of labels can have a significant effect on the overall model performance. Since PubLayNet, DocBank and DocLayNet all have different label sets, it is of particular interest to understand and quantify this influence of the label set on the model performance. We investigate this by either down-mapping labels into more common ones (e.g. Caption → Text ) or excluding them from the annotations entirely. Furthermore, it must be stressed that all mappings and exclusions were performed on the data before model training. In Table 3, we present the mAP scores for a Mask R-CNN R50 network on different label sets. Where a label is down-mapped, we show its corresponding label, otherwise it was excluded. We present three different label sets, with 6, 5 and 4 different labels respectively. The set of 5 labels contains the same labels as PubLayNet. However, due to the different definition of</paragraph>
 <table>
 <location><page_7><loc_58><loc_61><loc_85><loc_81></location>
 <row_0><col_0><body>Class-count</col_0><col_1><col_header>11</col_1><col_2><col_header>11</col_2><col_3><col_header>5</col_3><col_4><col_header>5</col_4></row_0>
@ -169,10 +174,6 @@
 <row_12><col_0><row_header>Title</col_0><col_1><body>77</col_1><col_2><body>81</col_2><col_3><body></col_3><col_4><body></col_4></row_12>
 <row_13><col_0><row_header>All</col_0><col_1><body>72</col_1><col_2><body>84</col_2><col_3><body>78</col_3><col_4><body>87</col_4></row_13>
 </table>
-<subtitle-level-1><location><page_7><loc_9><loc_58><loc_21><loc_60></location>Learning Curve</subtitle-level-1>
-<paragraph><location><page_7><loc_9><loc_33><loc_48><loc_58></location>One of the fundamental questions related to any dataset is if it is "large enough". To answer this question for DocLayNet, we performed a data ablation study in which we evaluated a Mask R-CNN model trained on increasing fractions of the DocLayNet dataset. As can be seen in Figure 5, the mAP score rises sharply in the beginning and eventually levels out. To estimate the error-bar on the metrics, we ran the training five times on the entire data-set. This resulted in a 1% error-bar, depicted by the shaded area in Figure 5. In the inset of Figure 5, we show the exact same data-points, but with a logarithmic scale on the x-axis. As is expected, the mAP score increases linearly as a function of the data-size in the inset. The curve ultimately flattens out between the 80% and 100% mark, with the 80% mark falling within the error-bars of the 100% mark. This provides a good indication that the model would not improve significantly by yet increasing the data size. Rather, it would probably benefit more from improved data consistency (as discussed in Section 3), data augmentation methods [23], or the addition of more document categories and styles.</paragraph>
-<subtitle-level-1><location><page_7><loc_9><loc_30><loc_27><loc_32></location>Impact of Class Labels</subtitle-level-1>
-<paragraph><location><page_7><loc_9><loc_11><loc_48><loc_30></location>The choice and number of labels can have a significant effect on the overall model performance. Since PubLayNet, DocBank and DocLayNet all have different label sets, it is of particular interest to understand and quantify this influence of the label set on the model performance. We investigate this by either down-mapping labels into more common ones (e.g. Caption → Text ) or excluding them from the annotations entirely. Furthermore, it must be stressed that all mappings and exclusions were performed on the data before model training. In Table 3, we present the mAP scores for a Mask R-CNN R50 network on different label sets. Where a label is down-mapped, we show its corresponding label, otherwise it was excluded. We present three different label sets, with 6, 5 and 4 different labels respectively. The set of 5 labels contains the same labels as PubLayNet. However, due to the different definition of</paragraph>
 <paragraph><location><page_7><loc_52><loc_47><loc_91><loc_58></location>lists in PubLayNet (grouped list-items) versus DocLayNet (separate list-items), the label set of size 4 is the closest to PubLayNet, in the assumption that the List is down-mapped to Text in PubLayNet. The results in Table 3 show that the prediction accuracy on the remaining class labels does not change significantly when other classes are merged into them. The overall macro-average improves by around 5%, in particular when Page-footer and Page-header are excluded.</paragraph>
 <subtitle-level-1><location><page_7><loc_52><loc_44><loc_90><loc_46></location>Impact of Document Split in Train and Test Set</subtitle-level-1>
 <paragraph><location><page_7><loc_52><loc_25><loc_91><loc_44></location>Many documents in DocLayNet have a unique styling. In order to avoid overfitting on a particular style, we have split the train-, test- and validation-sets of DocLayNet on document boundaries, i.e. every document contributes pages to only one set. To the best of our knowledge, this was not considered in PubLayNet or DocBank. To quantify how this affects model performance, we trained and evaluated a Mask R-CNN R50 model on a modified dataset version. Here, the train-, test- and validation-sets were obtained by a randomised draw over the individual pages. As can be seen in Table 4, the difference in model performance is surprisingly large: pagewise splitting gains ˜ 10% in mAP over the document-wise splitting. Thus, random page-wise splitting of DocLayNet can easily lead to accidental overestimation of model performance and should be avoided.</paragraph>
--- a/tests/data/groundtruth/docling_v1/2206.01062.json
+++ b/tests/data/groundtruth/docling_v1/2206.01062.json
--- a/tests/data/groundtruth/docling_v1/2206.01062.md
+++ b/tests/data/groundtruth/docling_v1/2206.01062.md
@ -6,9 +6,9 @@ Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com

 Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com

-Ahmed S. Nassar IBM Research Rueschlikon, Switzerland
+Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com

-ahn@zurich.ibm.com
+Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com

 ## ABSTRACT

@ -20,12 +20,14 @@ Accurate document layout analysis is a key requirement for highquality PDF docum

 Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).

-KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08.
+KDD '22, August 14-18, 2022, Washington, DC, USA
+
+© 2022 Copyright held by the owner/author(s).
+
+ACM ISBN 978-1-4503-9385-0/22/08.

 https://doi.org/10.1145/3534678.3539043

-Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com
-
 Figure 1: Four examples of complex page layouts across different document categories
 <!-- image -->

@ -210,6 +212,14 @@ Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wis
 | Title          |   77 | Sec.-h. | Sec.-h. | Sec.-h. |
 | Overall        |   72 | 73      | 78      | 77      |

+## Learning Curve
+
+One of the fundamental questions related to any dataset is if it is "large enough". To answer this question for DocLayNet, we performed a data ablation study in which we evaluated a Mask R-CNN model trained on increasing fractions of the DocLayNet dataset. As can be seen in Figure 5, the mAP score rises sharply in the beginning and eventually levels out. To estimate the error-bar on the metrics, we ran the training five times on the entire data-set. This resulted in a 1% error-bar, depicted by the shaded area in Figure 5. In the inset of Figure 5, we show the exact same data-points, but with a logarithmic scale on the x-axis. As is expected, the mAP score increases linearly as a function of the data-size in the inset. The curve ultimately flattens out between the 80% and 100% mark, with the 80% mark falling within the error-bars of the 100% mark. This provides a good indication that the model would not improve significantly by yet increasing the data size. Rather, it would probably benefit more from improved data consistency (as discussed in Section 3), data augmentation methods [23], or the addition of more document categories and styles.
+
+## Impact of Class Labels
+
+The choice and number of labels can have a significant effect on the overall model performance. Since PubLayNet, DocBank and DocLayNet all have different label sets, it is of particular interest to understand and quantify this influence of the label set on the model performance. We investigate this by either down-mapping labels into more common ones (e.g. Caption → Text ) or excluding them from the annotations entirely. Furthermore, it must be stressed that all mappings and exclusions were performed on the data before model training. In Table 3, we present the mAP scores for a Mask R-CNN R50 network on different label sets. Where a label is down-mapped, we show its corresponding label, otherwise it was excluded. We present three different label sets, with 6, 5 and 4 different labels respectively. The set of 5 labels contains the same labels as PubLayNet. However, due to the different definition of
+


 | Class-count    | 11   | 11   | 5   | 5    |
@ -228,14 +238,6 @@ Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wis
 | Title          | 77   | 81   |     |      |
 | All            | 72   | 84   | 78  | 87   |

-## Learning Curve
-
-One of the fundamental questions related to any dataset is if it is "large enough". To answer this question for DocLayNet, we performed a data ablation study in which we evaluated a Mask R-CNN model trained on increasing fractions of the DocLayNet dataset. As can be seen in Figure 5, the mAP score rises sharply in the beginning and eventually levels out. To estimate the error-bar on the metrics, we ran the training five times on the entire data-set. This resulted in a 1% error-bar, depicted by the shaded area in Figure 5. In the inset of Figure 5, we show the exact same data-points, but with a logarithmic scale on the x-axis. As is expected, the mAP score increases linearly as a function of the data-size in the inset. The curve ultimately flattens out between the 80% and 100% mark, with the 80% mark falling within the error-bars of the 100% mark. This provides a good indication that the model would not improve significantly by yet increasing the data size. Rather, it would probably benefit more from improved data consistency (as discussed in Section 3), data augmentation methods [23], or the addition of more document categories and styles.
-
-## Impact of Class Labels
-
-The choice and number of labels can have a significant effect on the overall model performance. Since PubLayNet, DocBank and DocLayNet all have different label sets, it is of particular interest to understand and quantify this influence of the label set on the model performance. We investigate this by either down-mapping labels into more common ones (e.g. Caption → Text ) or excluding them from the annotations entirely. Furthermore, it must be stressed that all mappings and exclusions were performed on the data before model training. In Table 3, we present the mAP scores for a Mask R-CNN R50 network on different label sets. Where a label is down-mapped, we show its corresponding label, otherwise it was excluded. We present three different label sets, with 6, 5 and 4 different labels respectively. The set of 5 labels contains the same labels as PubLayNet. However, due to the different definition of
-
 lists in PubLayNet (grouped list-items) versus DocLayNet (separate list-items), the label set of size 4 is the closest to PubLayNet, in the assumption that the List is down-mapped to Text in PubLayNet. The results in Table 3 show that the prediction accuracy on the remaining class labels does not change significantly when other classes are merged into them. The overall macro-average improves by around 5%, in particular when Page-footer and Page-header are excluded.

 ## Impact of Document Split in Train and Test Set
--- a/tests/data/groundtruth/docling_v1/2206.01062.pages.json
+++ b/tests/data/groundtruth/docling_v1/2206.01062.pages.json
--- a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json
--- a/tests/data/groundtruth/docling_v1/2305.03393v1.doctags.txt
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1.doctags.txt
@ -1,7 +1,6 @@
 <document>
 <subtitle-level-1><location><page_1><loc_22><loc_82><loc_79><loc_85></location>Optimized Table Tokenization for Table Structure Recognition</subtitle-level-1>
-<paragraph><location><page_1><loc_23><loc_75><loc_78><loc_79></location>Maksym Lysak [0000 − 0002 − 3723 − $^{6960]}$, Ahmed Nassar[0000 − 0002 − 9468 − $^{0822]}$, Nikolaos Livathinos [0000 − 0001 − 8513 − $^{3491]}$, Christoph Auer[0000 − 0001 − 5761 − $^{0422]}$, [0000 − 0002 − 8088 − 0823]</paragraph>
-<paragraph><location><page_1><loc_38><loc_74><loc_49><loc_75></location>and Peter Staar</paragraph>
+<paragraph><location><page_1><loc_23><loc_74><loc_78><loc_79></location>Maksym Lysak [0000 − 0002 − 3723 − $^{6960]}$, Ahmed Nassar[0000 − 0002 − 9468 − $^{0822]}$, Nikolaos Livathinos [0000 − 0001 − 8513 − $^{3491]}$, Christoph Auer[0000 − 0001 − 5761 − $^{0422]}$, and Peter Staar [0000 − 0002 − 8088 − 0823]</paragraph>
 <paragraph><location><page_1><loc_46><loc_72><loc_55><loc_73></location>IBM Research</paragraph>
 <paragraph><location><page_1><loc_36><loc_70><loc_64><loc_71></location>{mly,ahn,nli,cau,taa}@zurich.ibm.com</paragraph>
 <paragraph><location><page_1><loc_27><loc_41><loc_74><loc_66></location>Abstract. Extracting tables from documents is a crucial task in any document conversion pipeline. Recently, transformer-based models have demonstrated that table-structure can be recognized with impressive accuracy using Image-to-Markup-Sequence (Im2Seq) approaches. Taking only the image of a table, such models predict a sequence of tokens (e.g. in HTML, LaTeX) which represent the structure of the table. Since the token representation of the table structure has a significant impact on the accuracy and run-time performance of any Im2Seq model, we investigate in this paper how table-structure representation can be optimised. We propose a new, optimised table-structure language (OTSL) with a minimized vocabulary and specific rules. The benefits of OTSL are that it reduces the number of tokens to 5 (HTML needs 28+) and shortens the sequence length to half of HTML on average. Consequently, model accuracy improves significantly, inference time is halved compared to HTML-based models, and the predicted table structures are always syntactically correct. This in turn eliminates most post-processing needs. Popular table structure data-sets will be published in OTSL format to the community.</paragraph>
--- a/tests/data/groundtruth/docling_v1/2305.03393v1.json
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1.json
--- a/tests/data/groundtruth/docling_v1/2305.03393v1.md
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1.md
@ -1,8 +1,6 @@
 ## Optimized Table Tokenization for Table Structure Recognition

-Maksym Lysak [0000 − 0002 − 3723 − $^{6960]}$, Ahmed Nassar[0000 − 0002 − 9468 − $^{0822]}$, Nikolaos Livathinos [0000 − 0001 − 8513 − $^{3491]}$, Christoph Auer[0000 − 0001 − 5761 − $^{0422]}$, [0000 − 0002 − 8088 − 0823]
-
-and Peter Staar
+Maksym Lysak [0000 − 0002 − 3723 − $^{6960]}$, Ahmed Nassar[0000 − 0002 − 9468 − $^{0822]}$, Nikolaos Livathinos [0000 − 0001 − 8513 − $^{3491]}$, Christoph Auer[0000 − 0001 − 5761 − $^{0422]}$, and Peter Staar [0000 − 0002 − 8088 − 0823]

 IBM Research

--- a/tests/data/groundtruth/docling_v1/2305.03393v1.pages.json
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1.pages.json
--- a/tests/data/groundtruth/docling_v1/redp5110_sampled.doctags.txt
+++ b/tests/data/groundtruth/docling_v1/redp5110_sampled.doctags.txt
@ -5,7 +5,10 @@
 </figure>
 <subtitle-level-1><location><page_1><loc_6><loc_79><loc_96><loc_89></location>Row and Column Access Control Support in IBM DB2 for i</subtitle-level-1>
 <figure>
-<location><page_1><loc_3><loc_1><loc_96><loc_64></location>
+<location><page_1><loc_5><loc_11><loc_96><loc_63></location>
+</figure>
+<figure>
+<location><page_1><loc_52><loc_2><loc_95><loc_10></location>
 </figure>
 <subtitle-level-1><location><page_2><loc_11><loc_88><loc_28><loc_91></location>Contents</subtitle-level-1>
 <table>
@ -102,9 +105,7 @@
 <location><page_5><loc_5><loc_70><loc_39><loc_91></location>
 </figure>
 <paragraph><location><page_5><loc_13><loc_65><loc_19><loc_66></location>Chapter 1.</paragraph>
-<figure>
-<location><page_5><loc_78><loc_82><loc_89><loc_91></location>
-</figure>
+<paragraph><location><page_5><loc_82><loc_84><loc_85><loc_88></location>1</paragraph>
 <subtitle-level-1><location><page_5><loc_22><loc_61><loc_89><loc_68></location>Securing and protecting IBM DB2 data</subtitle-level-1>
 <paragraph><location><page_5><loc_22><loc_46><loc_89><loc_56></location>Recent news headlines are filled with reports of data breaches and cyber-attacks impacting global businesses of all sizes. The Identity Theft Resource Center$^{1}$ reports that almost 5000 data breaches have occurred since 2005, exposing over 600 million records of data. The financial cost of these data breaches is skyrocketing. Studies from the Ponemon Institute$^{2}$ revealed that the average cost of a data breach increased in 2013 by 15% globally and resulted in a brand equity loss of $9.4 million per attack. The average cost that is incurred for each lost record containing sensitive information increased more than 9% to $145 per record.</paragraph>
 <paragraph><location><page_5><loc_22><loc_38><loc_86><loc_44></location>Businesses must make a serious effort to secure their data and recognize that securing information assets is a cost of doing business. In many parts of the world and in many industries, securing the data is required by law and subject to audits. Data security is no longer an option; it is a requirement.</paragraph>
@ -153,8 +154,17 @@
 <row_4><col_0><body>USER_TYPE</col_0><col_1><body>VARCHAR(5)</col_1><col_2><body>Type of user profile: GLYPH<SM590000> USER: The user profile is a user. GLYPH<SM590000> GROUP: The user profile is a group.</col_2></row_4>
 </table>
 <paragraph><location><page_8><loc_22><loc_40><loc_89><loc_43></location>To discover who has authorization to define and manage RCAC, you can use the query that is shown in Example 2-1.</paragraph>
-<paragraph><location><page_8><loc_22><loc_38><loc_76><loc_39></location>Example 2-1 Query to determine who has authority to define and manage RCAC</paragraph>
-<table><location><page_8><loc_22><loc_26><loc_89><loc_37></location>SELECT function_id, user_name, usage, user_type FROM function_usage WHERE function_id=’QIBM_DB_SECADM’ ORDER BY user_name;</table>
+<caption><location><page_8><loc_22><loc_38><loc_76><loc_39></location>Example 2-1 Query to determine who has authority to define and manage RCAC</caption>
+<table>
+<location><page_8><loc_22><loc_26><loc_89><loc_37></location>
+<caption>Example 2-1 Query to determine who has authority to define and manage RCAC</caption>
+<row_0><col_0><body>SELECT</col_0><col_1><body>function_id, user_name,</col_1></row_0>
+<row_1><col_0><body></col_0><col_1><body>usage,</col_1></row_1>
+<row_2><col_0><body></col_0><col_1><body>user_type</col_1></row_2>
+<row_3><col_0><body>FROM</col_0><col_1><body>function_usage</col_1></row_3>
+<row_4><col_0><body>WHERE</col_0><col_1><body>function_id='QIBM_DB_SECADM'</col_1></row_4>
+<row_5><col_0><body>ORDER BY</col_0><col_1><body>user_name;</col_1></row_5>
+</table>
 <subtitle-level-1><location><page_8><loc_11><loc_20><loc_41><loc_22></location>2.2 Separation of duties</subtitle-level-1>
 <paragraph><location><page_8><loc_22><loc_10><loc_89><loc_18></location>Separation of duties helps businesses comply with industry regulations or organizational requirements and simplifies the management of authorities. Separation of duties is commonly used to prevent fraudulent activities or errors by a single person. It provides the ability for administrative functions to be divided across individuals without overlapping responsibilities, so that one user does not possess unlimited authority, such as with the *ALLOBJ authority.</paragraph>
 <paragraph><location><page_9><loc_22><loc_82><loc_89><loc_91></location>For example, assume that a business has assigned the duty to manage security on IBM i to Theresa. Before release IBM i 7.2, to grant privileges, Theresa had to have the same privileges Theresa was granting to others. Therefore, to grant *USE privileges to the PAYROLL table, Theresa had to have *OBJMGT and *USE authority (or a higher level of authority, such as *ALLOBJ). This requirement allowed Theresa to access the data in the PAYROLL table even though Theresa's job description was only to manage its security.</paragraph>
@ -258,7 +268,12 @@
 <paragraph><location><page_14><loc_22><loc_67><loc_89><loc_71></location>Now that you have created the row permission and the two column masks, RCAC must be activated. The row permission and the two column masks are enabled (last clause in the scripts), but now you must activate RCAC on the table. To do so, complete the following steps:</paragraph>
 <paragraph><location><page_14><loc_22><loc_65><loc_67><loc_66></location>- 1. Run the SQL statements that are shown in Example 3-10.</paragraph>
 <subtitle-level-1><location><page_14><loc_22><loc_62><loc_61><loc_63></location>Example 3-10 Activating RCAC on the EMPLOYEES table</subtitle-level-1>
-<paragraph><location><page_14><loc_22><loc_54><loc_62><loc_61></location>- /* Active Row Access Control (permissions) */ /* Active Column Access Control (masks) */ ALTER TABLE HR_SCHEMA.EMPLOYEES ACTIVATE ROW ACCESS CONTROL ACTIVATE COLUMN ACCESS CONTROL;</paragraph>
+<paragraph><location><page_14><loc_22><loc_60><loc_62><loc_61></location>- /* Active Row Access Control (permissions) */</paragraph>
+<paragraph><location><page_14><loc_22><loc_58><loc_58><loc_60></location>- /* Active Column Access Control (masks)</paragraph>
+<paragraph><location><page_14><loc_60><loc_58><loc_62><loc_60></location>*/</paragraph>
+<paragraph><location><page_14><loc_22><loc_57><loc_48><loc_58></location>ALTER TABLE HR_SCHEMA.EMPLOYEES</paragraph>
+<paragraph><location><page_14><loc_22><loc_55><loc_44><loc_56></location>ACTIVATE ROW ACCESS CONTROL</paragraph>
+<paragraph><location><page_14><loc_22><loc_54><loc_48><loc_55></location>ACTIVATE COLUMN ACCESS CONTROL;</paragraph>
 <paragraph><location><page_14><loc_22><loc_48><loc_88><loc_52></location>- 2. Look at the definition of the EMPLOYEE table, as shown in Figure 3-11. To do this, from the main navigation pane of System i Navigator, click Schemas  HR_SCHEMA  Tables , right-click the EMPLOYEES table, and click Definition .</paragraph>
 <caption><location><page_14><loc_11><loc_17><loc_57><loc_18></location>Figure 3-11 Selecting the EMPLOYEES table from System i Navigator</caption>
 <figure>
--- a/tests/data/groundtruth/docling_v1/redp5110_sampled.json
+++ b/tests/data/groundtruth/docling_v1/redp5110_sampled.json
--- a/tests/data/groundtruth/docling_v1/redp5110_sampled.md
+++ b/tests/data/groundtruth/docling_v1/redp5110_sampled.md
@ -6,6 +6,9 @@ Front cover
 ## Row and Column Access Control Support in IBM DB2 for i


+<!-- image -->
+
+
 <!-- image -->

 ## Contents
@ -138,8 +141,7 @@ Hernando Bedoya is a Senior IT Specialist at STG Lab Services and Training in Ro

 Chapter 1.

-
-<!-- image -->
+1

 ## Securing and protecting IBM DB2 data

@ -221,7 +223,13 @@ To discover who has authorization to define and manage RCAC, you can use the que

 Example 2-1 Query to determine who has authority to define and manage RCAC

-SELECT function_id, user_name, usage, user_type FROM function_usage WHERE function_id=’QIBM_DB_SECADM’ ORDER BY user_name;
+| SELECT   | function_id, user_name,      |
+|----------|------------------------------|
+|          | usage,                       |
+|          | user_type                    |
+| FROM     | function_usage               |
+| WHERE    | function_id='QIBM_DB_SECADM' |
+| ORDER BY | user_name;                   |

 ## 2.2 Separation of duties

@ -365,7 +373,17 @@ Now that you have created the row permission and the two column masks, RCAC must

 ## Example 3-10 Activating RCAC on the EMPLOYEES table

- /* Active Row Access Control (permissions) */ /* Active Column Access Control (masks) */ ALTER TABLE HR_SCHEMA.EMPLOYEES ACTIVATE ROW ACCESS CONTROL ACTIVATE COLUMN ACCESS CONTROL;
+- /* Active Row Access Control (permissions) */
+
+- /* Active Column Access Control (masks)
+
+*/
+
+ALTER TABLE HR_SCHEMA.EMPLOYEES
+
+ACTIVATE ROW ACCESS CONTROL
+
+ACTIVATE COLUMN ACCESS CONTROL;

 - 2. Look at the definition of the EMPLOYEE table, as shown in Figure 3-11. To do this, from the main navigation pane of System i Navigator, click Schemas  HR_SCHEMA  Tables , right-click the EMPLOYEES table, and click Definition .

--- a/tests/data/groundtruth/docling_v1/redp5110_sampled.pages.json
+++ b/tests/data/groundtruth/docling_v1/redp5110_sampled.pages.json
--- a/tests/data/groundtruth/docling_v2/2203.01017v2.doctags.txt
+++ b/tests/data/groundtruth/docling_v2/2203.01017v2.doctags.txt
@ -156,17 +156,39 @@
 <list_item><location><page_8><loc_9><loc_89><loc_10><loc_90></location>a.</list_item>
 <list_item><location><page_8><loc_11><loc_89><loc_82><loc_90></location>Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells</list_item>
 </unordered_list>
+<section_header_level_1><location><page_8><loc_9><loc_87><loc_46><loc_88></location>Japanese language (previously unseen by TableFormer):</section_header_level_1>
+<section_header_level_1><location><page_8><loc_50><loc_87><loc_70><loc_88></location>Example table from FinTabNet:</section_header_level_1>
 <figure>
 <location><page_8><loc_8><loc_76><loc_49><loc_87></location>
-<caption>Japanese language (previously unseen by TableFormer): Example table from FinTabNet:b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
 </figure>
 <figure>
+<location><page_8><loc_50><loc_77><loc_91><loc_88></location>
+<caption>b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
+</figure>
+<table>
 <location><page_8><loc_9><loc_63><loc_49><loc_72></location>
-</figure>
-<figure>
+<row_0><col_0><body></col_0><col_1><body></col_1><col_2><col_header>論文ファイル</col_2><col_3><col_header>論文ファイル</col_3><col_4><col_header>参考文献</col_4><col_5><col_header>参考文献</col_5></row_0>
+<row_1><col_0><col_header>出典</col_0><col_1><col_header>ファイル 数</col_1><col_2><col_header>英語</col_2><col_3><col_header>日本語</col_3><col_4><col_header>英語</col_4><col_5><col_header>日本語</col_5></row_1>
+<row_2><col_0><row_header>Association for Computational Linguistics(ACL2003)</col_0><col_1><body>65</col_1><col_2><body>65</col_2><col_3><body>0</col_3><col_4><body>150</col_4><col_5><body>0</col_5></row_2>
+<row_3><col_0><row_header>Computational Linguistics(COLING2002)</col_0><col_1><body>140</col_1><col_2><body>140</col_2><col_3><body>0</col_3><col_4><body>150</col_4><col_5><body>0</col_5></row_3>
+<row_4><col_0><row_header>電気情報通信学会 2003 年総合大会</col_0><col_1><body>150</col_1><col_2><body>8</col_2><col_3><body>142</col_3><col_4><body>223</col_4><col_5><body>147</col_5></row_4>
+<row_5><col_0><row_header>情報処理学会第 65 回全国大会 (2003)</col_0><col_1><body>177</col_1><col_2><body>1</col_2><col_3><body>176</col_3><col_4><body>150</col_4><col_5><body>236</col_5></row_5>
+<row_6><col_0><row_header>第 17 回人工知能学会全国大会 (2003)</col_0><col_1><body>208</col_1><col_2><body>5</col_2><col_3><body>203</col_3><col_4><body>152</col_4><col_5><body>244</col_5></row_6>
+<row_7><col_0><row_header>自然言語処理研究会第 146 〜 155 回</col_0><col_1><body>98</col_1><col_2><body>2</col_2><col_3><body>96</col_3><col_4><body>150</col_4><col_5><body>232</col_5></row_7>
+<row_8><col_0><row_header>WWW から収集した論文</col_0><col_1><body>107</col_1><col_2><body>73</col_2><col_3><body>34</col_3><col_4><body>147</col_4><col_5><body>96</col_5></row_8>
+<row_9><col_0><body></col_0><col_1><body>945</col_1><col_2><body>294</col_2><col_3><body>651</col_3><col_4><body>1122</col_4><col_5><body>955</col_5></row_9>
+</table>
+<table>
 <location><page_8><loc_50><loc_64><loc_90><loc_72></location>
 <caption>Text is aligned to match original for ease of viewing</caption>
-</figure>
+<row_0><col_0><body></col_0><col_1><col_header>Shares (in millions)</col_1><col_2><col_header>Shares (in millions)</col_2><col_3><col_header>Weighted Average Grant Date Fair Value</col_3><col_4><col_header>Weighted Average Grant Date Fair Value</col_4></row_0>
+<row_1><col_0><body></col_0><col_1><col_header>RS U s</col_1><col_2><col_header>PSUs</col_2><col_3><col_header>RSUs</col_3><col_4><col_header>PSUs</col_4></row_1>
+<row_2><col_0><row_header>Nonvested on Janua ry 1</col_0><col_1><body>1. 1</col_1><col_2><body>0.3</col_2><col_3><body>90.10 $</col_3><col_4><body>$ 91.19</col_4></row_2>
+<row_3><col_0><row_header>Granted</col_0><col_1><body>0. 5</col_1><col_2><body>0.1</col_2><col_3><body>117.44</col_3><col_4><body>122.41</col_4></row_3>
+<row_4><col_0><row_header>Vested</col_0><col_1><body>(0. 5 )</col_1><col_2><body>(0.1)</col_2><col_3><body>87.08</col_3><col_4><body>81.14</col_4></row_4>
+<row_5><col_0><row_header>Canceled or forfeited</col_0><col_1><body>(0. 1 )</col_1><col_2><body>-</col_2><col_3><body>102.01</col_3><col_4><body>92.18</col_4></row_5>
+<row_6><col_0><row_header>Nonvested on December 31</col_0><col_1><body>1.0</col_1><col_2><body>0.3</col_2><col_3><body>104.85 $</col_3><col_4><body>$ 104.51</col_4></row_6>
+</table>
 <figure>
 <location><page_8><loc_8><loc_44><loc_35><loc_52></location>
 <caption>Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.</caption>
@ -294,7 +316,7 @@
 <text><location><page_13><loc_8><loc_83><loc_47><loc_86></location>Aditional images with examples of TableFormer predictions and post-processing can be found below.</text>
 <paragraph><location><page_13><loc_10><loc_35><loc_45><loc_37></location>Figure 8: Example of a table with multi-line header.</paragraph>
 <figure>
-<location><page_13><loc_51><loc_63><loc_91><loc_87></location>
+<location><page_13><loc_51><loc_63><loc_70><loc_68></location>
 <caption>Figure 9: Example of a table with big empty distance between cells.</caption>
 </figure>
 <figure>
@ -313,10 +335,7 @@
 <location><page_14><loc_52><loc_55><loc_87><loc_89></location>
 <caption>Figure 13: Table predictions example on colorful table.</caption>
 </figure>
-<figure>
-<location><page_14><loc_52><loc_25><loc_85><loc_31></location>
-<caption>Figure 14: Example with multi-line text.</caption>
-</figure>
+<paragraph><location><page_14><loc_56><loc_13><loc_83><loc_14></location>Figure 14: Example with multi-line text.</paragraph>
 <figure>
 <location><page_15><loc_9><loc_69><loc_46><loc_83></location>
 </figure>
@ -331,9 +350,6 @@
 <caption>Figure 15: Example with triangular table.</caption>
 </figure>
 <figure>
-<location><page_15><loc_53><loc_72><loc_86><loc_85></location>
-</figure>
-<figure>
 <location><page_15><loc_53><loc_41><loc_86><loc_54></location>
 </figure>
 <figure>
--- a/tests/data/groundtruth/docling_v2/2203.01017v2.json
+++ b/tests/data/groundtruth/docling_v2/2203.01017v2.json
--- a/tests/data/groundtruth/docling_v2/2203.01017v2.md
+++ b/tests/data/groundtruth/docling_v2/2203.01017v2.md
@ -223,15 +223,38 @@ Table 4: Results of structure with content retrieved using cell detection on Pub
 - a.
 - Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells

-Japanese language (previously unseen by TableFormer): Example table from FinTabNet:b. Structure predicted by TableFormer, with superimposed matched PDF cell text:
+## Japanese language (previously unseen by TableFormer):
+
+## Example table from FinTabNet:

 <!-- image -->

+b. Structure predicted by TableFormer, with superimposed matched PDF cell text:
+
 <!-- image -->

+|                                                    |             | 論文ファイル   | 論文ファイル   | 参考文献   | 参考文献   |
+|----------------------------------------------------|-------------|----------------|----------------|------------|------------|
+| 出典                                               | ファイル 数 | 英語           | 日本語         | 英語       | 日本語     |
+| Association for Computational Linguistics(ACL2003) | 65          | 65             | 0              | 150        | 0          |
+| Computational Linguistics(COLING2002)              | 140         | 140            | 0              | 150        | 0          |
+| 電気情報通信学会 2003 年総合大会                   | 150         | 8              | 142            | 223        | 147        |
+| 情報処理学会第 65 回全国大会 (2003)                | 177         | 1              | 176            | 150        | 236        |
+| 第 17 回人工知能学会全国大会 (2003)                | 208         | 5              | 203            | 152        | 244        |
+| 自然言語処理研究会第 146 〜 155 回                 | 98          | 2              | 96             | 150        | 232        |
+| WWW から収集した論文                               | 107         | 73             | 34             | 147        | 96         |
+|                                                    | 945         | 294            | 651            | 1122       | 955        |
+
 Text is aligned to match original for ease of viewing

-<!-- image -->
+|                          | Shares (in millions)   | Shares (in millions)   | Weighted Average Grant Date Fair Value   | Weighted Average Grant Date Fair Value   |
+|--------------------------|------------------------|------------------------|------------------------------------------|------------------------------------------|
+|                          | RS U s                 | PSUs                   | RSUs                                     | PSUs                                     |
+| Nonvested on Janua ry 1  | 1. 1                   | 0.3                    | 90.10 $                                  | $ 91.19                                  |
+| Granted                  | 0. 5                   | 0.1                    | 117.44                                   | 122.41                                   |
+| Vested                   | (0. 5 )                | (0.1)                  | 87.08                                    | 81.14                                    |
+| Canceled or forfeited    | (0. 1 )                | -                      | 102.01                                   | 92.18                                    |
+| Nonvested on December 31 | 1.0                    | 0.3                    | 104.85 $                                 | $ 104.51                                 |

 Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.

@ -403,16 +426,12 @@ Figure 14: Example with multi-line text.

 <!-- image -->

-<!-- image -->
-
 Figure 15: Example with triangular table.

 <!-- image -->

 <!-- image -->

-<!-- image -->
-
 Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.

 <!-- image -->
--- a/tests/data/groundtruth/docling_v2/2203.01017v2.pages.json
+++ b/tests/data/groundtruth/docling_v2/2203.01017v2.pages.json
--- a/tests/data/groundtruth/docling_v2/2206.01062.doctags.txt
+++ b/tests/data/groundtruth/docling_v2/2206.01062.doctags.txt
@ -3,16 +3,17 @@
 <text><location><page_1><loc_15><loc_77><loc_32><loc_83></location>Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com</text>
 <text><location><page_1><loc_42><loc_77><loc_58><loc_83></location>Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com</text>
 <text><location><page_1><loc_69><loc_77><loc_85><loc_83></location>Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com</text>
-<text><location><page_1><loc_28><loc_71><loc_45><loc_76></location>Ahmed S. Nassar IBM Research Rueschlikon, Switzerland</text>
-<text><location><page_1><loc_29><loc_70><loc_44><loc_71></location>ahn@zurich.ibm.com</text>
+<text><location><page_1><loc_28><loc_70><loc_45><loc_76></location>Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com</text>
+<text><location><page_1><loc_55><loc_70><loc_72><loc_76></location>Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com</text>
 <section_header_level_1><location><page_1><loc_9><loc_67><loc_18><loc_69></location>ABSTRACT</section_header_level_1>
 <text><location><page_1><loc_9><loc_33><loc_48><loc_67></location>Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis.</text>
 <section_header_level_1><location><page_1><loc_9><loc_29><loc_22><loc_30></location>CCS CONCEPTS</section_header_level_1>
 <text><location><page_1><loc_9><loc_25><loc_49><loc_29></location>· Information systems → Document structure ; · Applied computing → Document analysis ; · Computing methodologies → Machine learning ; Computer vision ; Object detection ;</text>
 <text><location><page_1><loc_9><loc_15><loc_48><loc_20></location>Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).</text>
-<text><location><page_1><loc_9><loc_12><loc_32><loc_15></location>KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08.</text>
+<text><location><page_1><loc_9><loc_14><loc_32><loc_15></location>KDD '22, August 14-18, 2022, Washington, DC, USA</text>
+<text><location><page_1><loc_9><loc_13><loc_31><loc_14></location>© 2022 Copyright held by the owner/author(s).</text>
+<text><location><page_1><loc_9><loc_12><loc_26><loc_13></location>ACM ISBN 978-1-4503-9385-0/22/08.</text>
 <text><location><page_1><loc_9><loc_11><loc_27><loc_12></location>https://doi.org/10.1145/3534678.3539043</text>
-<text><location><page_1><loc_55><loc_70><loc_72><loc_76></location>Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com</text>
 <figure>
 <location><page_1><loc_53><loc_34><loc_90><loc_68></location>
 <caption>Figure 1: Four examples of complex page layouts across different document categories</caption>
@ -150,6 +151,10 @@
 <row_11><col_0><row_header>Title</col_0><col_1><body>77</col_1><col_2><body>Sec.-h.</col_2><col_3><body>Sec.-h.</col_3><col_4><body>Sec.-h.</col_4></row_11>
 <row_12><col_0><row_header>Overall</col_0><col_1><body>72</col_1><col_2><body>73</col_2><col_3><body>78</col_3><col_4><body>77</col_4></row_12>
 </table>
+<section_header_level_1><location><page_7><loc_9><loc_58><loc_21><loc_60></location>Learning Curve</section_header_level_1>
+<text><location><page_7><loc_9><loc_33><loc_48><loc_58></location>One of the fundamental questions related to any dataset is if it is "large enough". To answer this question for DocLayNet, we performed a data ablation study in which we evaluated a Mask R-CNN model trained on increasing fractions of the DocLayNet dataset. As can be seen in Figure 5, the mAP score rises sharply in the beginning and eventually levels out. To estimate the error-bar on the metrics, we ran the training five times on the entire data-set. This resulted in a 1% error-bar, depicted by the shaded area in Figure 5. In the inset of Figure 5, we show the exact same data-points, but with a logarithmic scale on the x-axis. As is expected, the mAP score increases linearly as a function of the data-size in the inset. The curve ultimately flattens out between the 80% and 100% mark, with the 80% mark falling within the error-bars of the 100% mark. This provides a good indication that the model would not improve significantly by yet increasing the data size. Rather, it would probably benefit more from improved data consistency (as discussed in Section 3), data augmentation methods [23], or the addition of more document categories and styles.</text>
+<section_header_level_1><location><page_7><loc_9><loc_30><loc_27><loc_32></location>Impact of Class Labels</section_header_level_1>
+<text><location><page_7><loc_9><loc_11><loc_48><loc_30></location>The choice and number of labels can have a significant effect on the overall model performance. Since PubLayNet, DocBank and DocLayNet all have different label sets, it is of particular interest to understand and quantify this influence of the label set on the model performance. We investigate this by either down-mapping labels into more common ones (e.g. Caption → Text ) or excluding them from the annotations entirely. Furthermore, it must be stressed that all mappings and exclusions were performed on the data before model training. In Table 3, we present the mAP scores for a Mask R-CNN R50 network on different label sets. Where a label is down-mapped, we show its corresponding label, otherwise it was excluded. We present three different label sets, with 6, 5 and 4 different labels respectively. The set of 5 labels contains the same labels as PubLayNet. However, due to the different definition of</text>
 <table>
 <location><page_7><loc_58><loc_61><loc_85><loc_81></location>
 <row_0><col_0><body>Class-count</col_0><col_1><col_header>11</col_1><col_2><col_header>11</col_2><col_3><col_header>5</col_3><col_4><col_header>5</col_4></row_0>
@ -167,10 +172,6 @@
 <row_12><col_0><row_header>Title</col_0><col_1><body>77</col_1><col_2><body>81</col_2><col_3><body></col_3><col_4><body></col_4></row_12>
 <row_13><col_0><row_header>All</col_0><col_1><body>72</col_1><col_2><body>84</col_2><col_3><body>78</col_3><col_4><body>87</col_4></row_13>
 </table>
-<section_header_level_1><location><page_7><loc_9><loc_58><loc_21><loc_60></location>Learning Curve</section_header_level_1>
-<text><location><page_7><loc_9><loc_33><loc_48><loc_58></location>One of the fundamental questions related to any dataset is if it is "large enough". To answer this question for DocLayNet, we performed a data ablation study in which we evaluated a Mask R-CNN model trained on increasing fractions of the DocLayNet dataset. As can be seen in Figure 5, the mAP score rises sharply in the beginning and eventually levels out. To estimate the error-bar on the metrics, we ran the training five times on the entire data-set. This resulted in a 1% error-bar, depicted by the shaded area in Figure 5. In the inset of Figure 5, we show the exact same data-points, but with a logarithmic scale on the x-axis. As is expected, the mAP score increases linearly as a function of the data-size in the inset. The curve ultimately flattens out between the 80% and 100% mark, with the 80% mark falling within the error-bars of the 100% mark. This provides a good indication that the model would not improve significantly by yet increasing the data size. Rather, it would probably benefit more from improved data consistency (as discussed in Section 3), data augmentation methods [23], or the addition of more document categories and styles.</text>
-<section_header_level_1><location><page_7><loc_9><loc_30><loc_27><loc_32></location>Impact of Class Labels</section_header_level_1>
-<text><location><page_7><loc_9><loc_11><loc_48><loc_30></location>The choice and number of labels can have a significant effect on the overall model performance. Since PubLayNet, DocBank and DocLayNet all have different label sets, it is of particular interest to understand and quantify this influence of the label set on the model performance. We investigate this by either down-mapping labels into more common ones (e.g. Caption → Text ) or excluding them from the annotations entirely. Furthermore, it must be stressed that all mappings and exclusions were performed on the data before model training. In Table 3, we present the mAP scores for a Mask R-CNN R50 network on different label sets. Where a label is down-mapped, we show its corresponding label, otherwise it was excluded. We present three different label sets, with 6, 5 and 4 different labels respectively. The set of 5 labels contains the same labels as PubLayNet. However, due to the different definition of</text>
 <text><location><page_7><loc_52><loc_47><loc_91><loc_58></location>lists in PubLayNet (grouped list-items) versus DocLayNet (separate list-items), the label set of size 4 is the closest to PubLayNet, in the assumption that the List is down-mapped to Text in PubLayNet. The results in Table 3 show that the prediction accuracy on the remaining class labels does not change significantly when other classes are merged into them. The overall macro-average improves by around 5%, in particular when Page-footer and Page-header are excluded.</text>
 <section_header_level_1><location><page_7><loc_52><loc_44><loc_90><loc_46></location>Impact of Document Split in Train and Test Set</section_header_level_1>
 <text><location><page_7><loc_52><loc_25><loc_91><loc_44></location>Many documents in DocLayNet have a unique styling. In order to avoid overfitting on a particular style, we have split the train-, test- and validation-sets of DocLayNet on document boundaries, i.e. every document contributes pages to only one set. To the best of our knowledge, this was not considered in PubLayNet or DocBank. To quantify how this affects model performance, we trained and evaluated a Mask R-CNN R50 model on a modified dataset version. Here, the train-, test- and validation-sets were obtained by a randomised draw over the individual pages. As can be seen in Table 4, the difference in model performance is surprisingly large: pagewise splitting gains ˜ 10% in mAP over the document-wise splitting. Thus, random page-wise splitting of DocLayNet can easily lead to accidental overestimation of model performance and should be avoided.</text>
--- a/tests/data/groundtruth/docling_v2/2206.01062.json
+++ b/tests/data/groundtruth/docling_v2/2206.01062.json
--- a/tests/data/groundtruth/docling_v2/2206.01062.md
+++ b/tests/data/groundtruth/docling_v2/2206.01062.md
@ -6,9 +6,9 @@ Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com

 Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com

-Ahmed S. Nassar IBM Research Rueschlikon, Switzerland
+Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com

-ahn@zurich.ibm.com
+Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com

 ## ABSTRACT

@ -20,12 +20,14 @@ Accurate document layout analysis is a key requirement for highquality PDF docum

 Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).

-KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08.
+KDD '22, August 14-18, 2022, Washington, DC, USA
+
+© 2022 Copyright held by the owner/author(s).
+
+ACM ISBN 978-1-4503-9385-0/22/08.

 https://doi.org/10.1145/3534678.3539043

-Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com
-
 Figure 1: Four examples of complex page layouts across different document categories

 <!-- image -->
@ -207,6 +209,14 @@ Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wis
 | Title          |   77 | Sec.-h. | Sec.-h. | Sec.-h. |
 | Overall        |   72 | 73      | 78      | 77      |

+## Learning Curve
+
+One of the fundamental questions related to any dataset is if it is "large enough". To answer this question for DocLayNet, we performed a data ablation study in which we evaluated a Mask R-CNN model trained on increasing fractions of the DocLayNet dataset. As can be seen in Figure 5, the mAP score rises sharply in the beginning and eventually levels out. To estimate the error-bar on the metrics, we ran the training five times on the entire data-set. This resulted in a 1% error-bar, depicted by the shaded area in Figure 5. In the inset of Figure 5, we show the exact same data-points, but with a logarithmic scale on the x-axis. As is expected, the mAP score increases linearly as a function of the data-size in the inset. The curve ultimately flattens out between the 80% and 100% mark, with the 80% mark falling within the error-bars of the 100% mark. This provides a good indication that the model would not improve significantly by yet increasing the data size. Rather, it would probably benefit more from improved data consistency (as discussed in Section 3), data augmentation methods [23], or the addition of more document categories and styles.
+
+## Impact of Class Labels
+
+The choice and number of labels can have a significant effect on the overall model performance. Since PubLayNet, DocBank and DocLayNet all have different label sets, it is of particular interest to understand and quantify this influence of the label set on the model performance. We investigate this by either down-mapping labels into more common ones (e.g. Caption → Text ) or excluding them from the annotations entirely. Furthermore, it must be stressed that all mappings and exclusions were performed on the data before model training. In Table 3, we present the mAP scores for a Mask R-CNN R50 network on different label sets. Where a label is down-mapped, we show its corresponding label, otherwise it was excluded. We present three different label sets, with 6, 5 and 4 different labels respectively. The set of 5 labels contains the same labels as PubLayNet. However, due to the different definition of
+
 | Class-count    | 11   | 11   | 5   | 5    |
 |----------------|------|------|-----|------|
 | Split          | Doc  | Page | Doc | Page |
@ -223,14 +233,6 @@ Table 4: Performance of a Mask R-CNN R50 network with document-wise and page-wis
 | Title          | 77   | 81   |     |      |
 | All            | 72   | 84   | 78  | 87   |

-## Learning Curve
-
-One of the fundamental questions related to any dataset is if it is "large enough". To answer this question for DocLayNet, we performed a data ablation study in which we evaluated a Mask R-CNN model trained on increasing fractions of the DocLayNet dataset. As can be seen in Figure 5, the mAP score rises sharply in the beginning and eventually levels out. To estimate the error-bar on the metrics, we ran the training five times on the entire data-set. This resulted in a 1% error-bar, depicted by the shaded area in Figure 5. In the inset of Figure 5, we show the exact same data-points, but with a logarithmic scale on the x-axis. As is expected, the mAP score increases linearly as a function of the data-size in the inset. The curve ultimately flattens out between the 80% and 100% mark, with the 80% mark falling within the error-bars of the 100% mark. This provides a good indication that the model would not improve significantly by yet increasing the data size. Rather, it would probably benefit more from improved data consistency (as discussed in Section 3), data augmentation methods [23], or the addition of more document categories and styles.
-
-## Impact of Class Labels
-
-The choice and number of labels can have a significant effect on the overall model performance. Since PubLayNet, DocBank and DocLayNet all have different label sets, it is of particular interest to understand and quantify this influence of the label set on the model performance. We investigate this by either down-mapping labels into more common ones (e.g. Caption → Text ) or excluding them from the annotations entirely. Furthermore, it must be stressed that all mappings and exclusions were performed on the data before model training. In Table 3, we present the mAP scores for a Mask R-CNN R50 network on different label sets. Where a label is down-mapped, we show its corresponding label, otherwise it was excluded. We present three different label sets, with 6, 5 and 4 different labels respectively. The set of 5 labels contains the same labels as PubLayNet. However, due to the different definition of
-
 lists in PubLayNet (grouped list-items) versus DocLayNet (separate list-items), the label set of size 4 is the closest to PubLayNet, in the assumption that the List is down-mapped to Text in PubLayNet. The results in Table 3 show that the prediction accuracy on the remaining class labels does not change significantly when other classes are merged into them. The overall macro-average improves by around 5%, in particular when Page-footer and Page-header are excluded.

 ## Impact of Document Split in Train and Test Set
--- a/tests/data/groundtruth/docling_v2/2206.01062.pages.json
+++ b/tests/data/groundtruth/docling_v2/2206.01062.pages.json
--- a/tests/data/groundtruth/docling_v2/2305.03393v1.doctags.txt
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1.doctags.txt
@ -1,7 +1,6 @@
 <document>
 <section_header_level_1><location><page_1><loc_22><loc_82><loc_79><loc_85></location>Optimized Table Tokenization for Table Structure Recognition</section_header_level_1>
-<text><location><page_1><loc_23><loc_75><loc_78><loc_79></location>Maksym Lysak [0000 − 0002 − 3723 − $^{6960]}$, Ahmed Nassar[0000 − 0002 − 9468 − $^{0822]}$, Nikolaos Livathinos [0000 − 0001 − 8513 − $^{3491]}$, Christoph Auer[0000 − 0001 − 5761 − $^{0422]}$, [0000 − 0002 − 8088 − 0823]</text>
-<text><location><page_1><loc_38><loc_74><loc_49><loc_75></location>and Peter Staar</text>
+<text><location><page_1><loc_23><loc_74><loc_78><loc_79></location>Maksym Lysak [0000 − 0002 − 3723 − $^{6960]}$, Ahmed Nassar[0000 − 0002 − 9468 − $^{0822]}$, Nikolaos Livathinos [0000 − 0001 − 8513 − $^{3491]}$, Christoph Auer[0000 − 0001 − 5761 − $^{0422]}$, and Peter Staar [0000 − 0002 − 8088 − 0823]</text>
 <text><location><page_1><loc_46><loc_72><loc_55><loc_73></location>IBM Research</text>
 <text><location><page_1><loc_36><loc_70><loc_64><loc_71></location>{mly,ahn,nli,cau,taa}@zurich.ibm.com</text>
 <text><location><page_1><loc_27><loc_41><loc_74><loc_66></location>Abstract. Extracting tables from documents is a crucial task in any document conversion pipeline. Recently, transformer-based models have demonstrated that table-structure can be recognized with impressive accuracy using Image-to-Markup-Sequence (Im2Seq) approaches. Taking only the image of a table, such models predict a sequence of tokens (e.g. in HTML, LaTeX) which represent the structure of the table. Since the token representation of the table structure has a significant impact on the accuracy and run-time performance of any Im2Seq model, we investigate in this paper how table-structure representation can be optimised. We propose a new, optimised table-structure language (OTSL) with a minimized vocabulary and specific rules. The benefits of OTSL are that it reduces the number of tokens to 5 (HTML needs 28+) and shortens the sequence length to half of HTML on average. Consequently, model accuracy improves significantly, inference time is halved compared to HTML-based models, and the predicted table structures are always syntactically correct. This in turn eliminates most post-processing needs. Popular table structure data-sets will be published in OTSL format to the community.</text>
--- a/tests/data/groundtruth/docling_v2/2305.03393v1.json
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1.json
--- a/tests/data/groundtruth/docling_v2/2305.03393v1.md
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1.md
@ -1,8 +1,6 @@
 ## Optimized Table Tokenization for Table Structure Recognition

-Maksym Lysak [0000 − 0002 − 3723 − $^{6960]}$, Ahmed Nassar[0000 − 0002 − 9468 − $^{0822]}$, Nikolaos Livathinos [0000 − 0001 − 8513 − $^{3491]}$, Christoph Auer[0000 − 0001 − 5761 − $^{0422]}$, [0000 − 0002 − 8088 − 0823]
-
-and Peter Staar
+Maksym Lysak [0000 − 0002 − 3723 − $^{6960]}$, Ahmed Nassar[0000 − 0002 − 9468 − $^{0822]}$, Nikolaos Livathinos [0000 − 0001 − 8513 − $^{3491]}$, Christoph Auer[0000 − 0001 − 5761 − $^{0422]}$, and Peter Staar [0000 − 0002 − 8088 − 0823]

 IBM Research

--- a/tests/data/groundtruth/docling_v2/2305.03393v1.pages.json
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1.pages.json
--- a/tests/data/groundtruth/docling_v2/redp5110_sampled.doctags.txt
+++ b/tests/data/groundtruth/docling_v2/redp5110_sampled.doctags.txt
@ -5,7 +5,10 @@
 </figure>
 <section_header_level_1><location><page_1><loc_6><loc_79><loc_96><loc_89></location>Row and Column Access Control Support in IBM DB2 for i</section_header_level_1>
 <figure>
-<location><page_1><loc_3><loc_1><loc_96><loc_64></location>
+<location><page_1><loc_5><loc_11><loc_96><loc_63></location>
+</figure>
+<figure>
+<location><page_1><loc_52><loc_2><loc_95><loc_10></location>
 </figure>
 <section_header_level_1><location><page_2><loc_11><loc_88><loc_28><loc_91></location>Contents</section_header_level_1>
 <table>
@ -106,9 +109,7 @@
 <location><page_5><loc_5><loc_70><loc_39><loc_91></location>
 </figure>
 <text><location><page_5><loc_13><loc_65><loc_19><loc_66></location>Chapter 1.</text>
-<figure>
-<location><page_5><loc_78><loc_82><loc_89><loc_91></location>
-</figure>
+<text><location><page_5><loc_82><loc_84><loc_85><loc_88></location>1</text>
 <section_header_level_1><location><page_5><loc_22><loc_61><loc_89><loc_68></location>Securing and protecting IBM DB2 data</section_header_level_1>
 <text><location><page_5><loc_22><loc_46><loc_89><loc_56></location>Recent news headlines are filled with reports of data breaches and cyber-attacks impacting global businesses of all sizes. The Identity Theft Resource Center$^{1}$ reports that almost 5000 data breaches have occurred since 2005, exposing over 600 million records of data. The financial cost of these data breaches is skyrocketing. Studies from the Ponemon Institute$^{2}$ revealed that the average cost of a data breach increased in 2013 by 15% globally and resulted in a brand equity loss of $9.4 million per attack. The average cost that is incurred for each lost record containing sensitive information increased more than 9% to $145 per record.</text>
 <text><location><page_5><loc_22><loc_38><loc_86><loc_44></location>Businesses must make a serious effort to secure their data and recognize that securing information assets is a cost of doing business. In many parts of the world and in many industries, securing the data is required by law and subject to audits. Data security is no longer an option; it is a requirement.</text>
@ -163,8 +164,16 @@
 <row_4><col_0><body>USER_TYPE</col_0><col_1><body>VARCHAR(5)</col_1><col_2><body>Type of user profile: GLYPH<SM590000> USER: The user profile is a user. GLYPH<SM590000> GROUP: The user profile is a group.</col_2></row_4>
 </table>
 <text><location><page_8><loc_22><loc_40><loc_89><loc_43></location>To discover who has authorization to define and manage RCAC, you can use the query that is shown in Example 2-1.</text>
-<paragraph><location><page_8><loc_22><loc_38><loc_76><loc_39></location>Example 2-1 Query to determine who has authority to define and manage RCAC</paragraph>
-<table><location><page_8><loc_22><loc_26><loc_89><loc_37></location>SELECT function_id, user_name, usage, user_type FROM function_usage WHERE function_id=’QIBM_DB_SECADM’ ORDER BY user_name;</table>
+<table>
+<location><page_8><loc_22><loc_26><loc_89><loc_37></location>
+<caption>Example 2-1 Query to determine who has authority to define and manage RCAC</caption>
+<row_0><col_0><body>SELECT</col_0><col_1><body>function_id, user_name,</col_1></row_0>
+<row_1><col_0><body></col_0><col_1><body>usage,</col_1></row_1>
+<row_2><col_0><body></col_0><col_1><body>user_type</col_1></row_2>
+<row_3><col_0><body>FROM</col_0><col_1><body>function_usage</col_1></row_3>
+<row_4><col_0><body>WHERE</col_0><col_1><body>function_id='QIBM_DB_SECADM'</col_1></row_4>
+<row_5><col_0><body>ORDER BY</col_0><col_1><body>user_name;</col_1></row_5>
+</table>
 <section_header_level_1><location><page_8><loc_11><loc_20><loc_41><loc_22></location>2.2 Separation of duties</section_header_level_1>
 <text><location><page_8><loc_22><loc_10><loc_89><loc_18></location>Separation of duties helps businesses comply with industry regulations or organizational requirements and simplifies the management of authorities. Separation of duties is commonly used to prevent fraudulent activities or errors by a single person. It provides the ability for administrative functions to be divided across individuals without overlapping responsibilities, so that one user does not possess unlimited authority, such as with the *ALLOBJ authority.</text>
 <text><location><page_9><loc_22><loc_82><loc_89><loc_91></location>For example, assume that a business has assigned the duty to manage security on IBM i to Theresa. Before release IBM i 7.2, to grant privileges, Theresa had to have the same privileges Theresa was granting to others. Therefore, to grant *USE privileges to the PAYROLL table, Theresa had to have *OBJMGT and *USE authority (or a higher level of authority, such as *ALLOBJ). This requirement allowed Theresa to access the data in the PAYROLL table even though Theresa's job description was only to manage its security.</text>
@ -272,7 +281,14 @@
 </unordered_list>
 <section_header_level_1><location><page_14><loc_22><loc_62><loc_61><loc_63></location>Example 3-10 Activating RCAC on the EMPLOYEES table</section_header_level_1>
 <unordered_list>
-<list_item><location><page_14><loc_22><loc_54><loc_62><loc_61></location>/* Active Row Access Control (permissions) */ /* Active Column Access Control (masks) */ ALTER TABLE HR_SCHEMA.EMPLOYEES ACTIVATE ROW ACCESS CONTROL ACTIVATE COLUMN ACCESS CONTROL;</list_item>
+<list_item><location><page_14><loc_22><loc_60><loc_62><loc_61></location>/* Active Row Access Control (permissions) */</list_item>
+<list_item><location><page_14><loc_22><loc_58><loc_58><loc_60></location>/* Active Column Access Control (masks)</list_item>
+</unordered_list>
+<text><location><page_14><loc_60><loc_58><loc_62><loc_60></location>*/</text>
+<text><location><page_14><loc_22><loc_57><loc_48><loc_58></location>ALTER TABLE HR_SCHEMA.EMPLOYEES</text>
+<text><location><page_14><loc_22><loc_55><loc_44><loc_56></location>ACTIVATE ROW ACCESS CONTROL</text>
+<text><location><page_14><loc_22><loc_54><loc_48><loc_55></location>ACTIVATE COLUMN ACCESS CONTROL;</text>
+<unordered_list>
 <list_item><location><page_14><loc_22><loc_48><loc_88><loc_52></location>2. Look at the definition of the EMPLOYEE table, as shown in Figure 3-11. To do this, from the main navigation pane of System i Navigator, click Schemas  HR_SCHEMA  Tables , right-click the EMPLOYEES table, and click Definition .</list_item>
 </unordered_list>
 <figure>
--- a/tests/data/groundtruth/docling_v2/redp5110_sampled.json
+++ b/tests/data/groundtruth/docling_v2/redp5110_sampled.json
--- a/tests/data/groundtruth/docling_v2/redp5110_sampled.md
+++ b/tests/data/groundtruth/docling_v2/redp5110_sampled.md
@ -6,6 +6,8 @@ Front cover

 <!-- image -->

+<!-- image -->
+
 ## Contents

 | Notices                                                                                                                                        | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . vii   |
@ -118,7 +120,7 @@ Hernando Bedoya is a Senior IT Specialist at STG Lab Services and Training in Ro

 Chapter 1.

-<!-- image -->
+1

 ## Securing and protecting IBM DB2 data

@ -196,7 +198,13 @@ To discover who has authorization to define and manage RCAC, you can use the que

 Example 2-1 Query to determine who has authority to define and manage RCAC

-SELECT function\_id, user\_name, usage, user\_type FROM function\_usage WHERE function\_id=’QIBM\_DB\_SECADM’ ORDER BY user\_name;
+| SELECT   | function\_id, user\_name,      |
+|----------|------------------------------|
+|          | usage,                       |
+|          | user\_type                    |
+| FROM     | function\_usage               |
+| WHERE    | function\_id='QIBM\_DB\_SECADM' |
+| ORDER BY | user\_name;                   |

 ## 2.2 Separation of duties

@ -334,7 +342,17 @@ Now that you have created the row permission and the two column masks, RCAC must

 ## Example 3-10 Activating RCAC on the EMPLOYEES table

- /* Active Row Access Control (permissions) */ /* Active Column Access Control (masks) */ ALTER TABLE HR\_SCHEMA.EMPLOYEES ACTIVATE ROW ACCESS CONTROL ACTIVATE COLUMN ACCESS CONTROL;
+- /* Active Row Access Control (permissions) */
+- /* Active Column Access Control (masks)
+
+*/
+
+ALTER TABLE HR\_SCHEMA.EMPLOYEES
+
+ACTIVATE ROW ACCESS CONTROL
+
+ACTIVATE COLUMN ACCESS CONTROL;
+
 - 2. Look at the definition of the EMPLOYEE table, as shown in Figure 3-11. To do this, from the main navigation pane of System i Navigator, click Schemas  HR\_SCHEMA  Tables , right-click the EMPLOYEES table, and click Definition .

 Figure 3-11 Selecting the EMPLOYEES table from System i Navigator
--- a/tests/data/groundtruth/docling_v2/redp5110_sampled.pages.json
+++ b/tests/data/groundtruth/docling_v2/redp5110_sampled.pages.json
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test.json
@ -1 +1 @@
-{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test.pdf", "filename-prov": null, "document-hash": "73f23122e9edbdb0a115b448e03c8064a0ea8bdc21d02917ce220cf032454f31", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "8c5c5b766c1bdb92242142ca37260089b02380f9c57729703350f646cdf4771e", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [69.6796646118164, 689.012451171875, 504.87200927734375, 764.9216918945312], "page": 1, "span": [0, 94], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}
+{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "ocr_test.pdf", "filename-prov": null, "document-hash": "80f38f5b87a84870681556176a9622186fd200dd32c5557be9e0c0af05b8bc61", "#-pages": 1, "collection-name": null, "description": null, "page-hashes": [{"hash": "14d896dc8bcb7ee7c08c0347eb6be8dcb92a3782501992f1ea14d2e58077d4e3", "model": "default", "page": 1}]}, "main-text": [{"prov": [{"bbox": [69.6796646118164, 689.012451171875, 504.87200927734375, 764.9216918945312], "page": 1, "span": [0, 94], "__ref_s3_data": null}], "text": "Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package", "type": "paragraph", "payload": null, "name": "Text", "font": null}], "figures": [], "tables": [], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 841.9216918945312, "page": 1, "width": 595.201171875}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null}