Mirror of https://github.com/DS4SD/docling.git, synced 2025-07-29 21:44:32 +00:00
chore: check new core

Signed-off-by: Panos Vagenas <pva@zurich.ibm.com>

parent 7df157204b
commit 921a6e7c97

Diffstat: poetry.lock (generated), 36 lines changed
@@ -873,29 +873,33 @@ name = "docling-core"
 version = "2.23.3"
 description = "A python library to define and validate data types in Docling."
 optional = false
-python-versions = "<4.0,>=3.9"
-files = [
-    {file = "docling_core-2.23.3-py3-none-any.whl", hash = "sha256:a2166ffc41f8fdf6fdb99b33da6c7146eccf6382712ea92e95772604fb5af5e5"},
-    {file = "docling_core-2.23.3.tar.gz", hash = "sha256:a64ce41e0881c06962a2b3ec80e0665f84de0809dedf1bf84f3a14b75dd665c4"},
-]
+python-versions = "^3.9"
+files = []
+develop = false
 
 [package.dependencies]
-jsonref = ">=1.1.0,<2.0.0"
-jsonschema = ">=4.16.0,<5.0.0"
-latex2mathml = ">=3.77.0,<4.0.0"
-pandas = ">=2.1.4,<3.0.0"
+jsonref = "^1.1.0"
+jsonschema = "^4.16.0"
+latex2mathml = "^3.77.0"
+pandas = "^2.1.4"
 pillow = ">=10.0.0,<12.0.0"
-pydantic = ">=2.6.0,<2.10.0 || >2.10.0,<2.10.1 || >2.10.1,<2.10.2 || >2.10.2,<3.0.0"
+pydantic = ">=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2"
 pyyaml = ">=5.1,<7.0.0"
-semchunk = {version = ">=2.2.0,<3.0.0", optional = true, markers = "extra == \"chunking\""}
-tabulate = ">=0.9.0,<0.10.0"
-transformers = {version = ">=4.34.0,<5.0.0", optional = true, markers = "extra == \"chunking\""}
-typer = ">=0.12.5,<0.13.0"
-typing-extensions = ">=4.12.2,<5.0.0"
+semchunk = {version = "^2.2.0", optional = true}
+tabulate = "^0.9.0"
+transformers = {version = "^4.34.0", optional = true}
+typer = "^0.12.5"
+typing-extensions = "^4.12.2"
 
 [package.extras]
 chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"]
 
+[package.source]
+type = "git"
+url = "https://github.com/docling-project/docling-core.git"
+reference = "80704dce4b21656b50bf04182d4636f448c49827"
+resolved_reference = "80704dce4b21656b50bf04182d4636f448c49827"
+
 [[package]]
 name = "docling-ibm-models"
 version = "3.4.1"
@@ -7838,4 +7842,4 @@ vlm = ["accelerate", "transformers", "transformers"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "16324c95a8aae1a710c4151e509c59e9a97d8bb97d4c726861ab3215fbea0a9d"
+content-hash = "891d40e03852f5de1979c76134f1ea6b4876abf6cb14a27c3c22f4aa3d0e37f4"
@@ -46,7 +46,8 @@ packages = [{ include = "docling" }]
 ######################
 python = "^3.9"
 pydantic = "^2.0.0"
-docling-core = {extras = ["chunking"], version = "^2.23.1"}
+# docling-core = { git = "https://github.com/docling-project/docling-core.git", extras = ["chunking"], branch = "main" }
+docling-core = { git = "https://github.com/docling-project/docling-core.git", extras = ["chunking"], rev = "80704dce4b21656b50bf04182d4636f448c49827" }
 docling-ibm-models = "^3.4.0"
 docling-parse = "^4.0.0"
 filetype = "^1.2.0"
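For context, the hunk above uses Poetry's git-dependency syntax: `rev` pins docling-core to an exact commit, while the commented-out `branch = "main"` form left in place would track the branch head instead. A minimal sketch of the relevant pyproject.toml section, using only values that appear in the hunk above:

[tool.poetry.dependencies]
python = "^3.9"
# Pin docling-core to an exact commit of its git repository (value taken from the hunk above).
docling-core = { git = "https://github.com/docling-project/docling-core.git", extras = ["chunking"], rev = "80704dce4b21656b50bf04182d4636f448c49827" }
# Alternative kept commented out above: track the tip of a branch instead of a fixed commit.
# docling-core = { git = "https://github.com/docling-project/docling-core.git", extras = ["chunking"], branch = "main" }

After such an edit, re-running `poetry lock` resolves docling-core from the git source, which is what produces the new [package.source] block and the changed content-hash seen in the poetry.lock diff above.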
File diff suppressed because one or more lines are too long
@@ -106,8 +106,8 @@
<text><loc_252><loc_232><loc_445><loc_328>Cell Content. In this section, we evaluate the entire pipeline of recovering a table with content. Here we put our approach to test by capitalizing on extracting content from the PDF cells rather than decoding from images. Tab. 4 shows the TEDs score of HTML code representing the structure of the table along with the content inserted in the data cell and compared with the ground-truth. Our method achieved a 5.3% increase over the state-of-the-art, and commercial solutions. We believe our scores would be higher if the HTML ground-truth matched the extracted PDF cell content. Unfortunately, there are small discrepancies such as spacings around words or special characters with various unicode representations.</text>
<otsl><loc_272><loc_341><loc_426><loc_406><fcel>Model<ched>Simple<ched>TEDS Complex<ched>All<nl><rhed>Tabula<fcel>78.0<fcel>57.8<fcel>67.9<nl><rhed>Traprange<fcel>60.8<fcel>49.9<fcel>55.4<nl><rhed>Camelot<fcel>80.0<fcel>66.0<fcel>73.0<nl><rhed>Acrobat Pro<fcel>68.9<fcel>61.8<fcel>65.3<nl><rhed>EDD<fcel>91.2<fcel>85.4<fcel>88.3<nl><rhed>TableFormer<fcel>95.4<fcel>90.1<fcel>93.6<nl><caption><loc_252><loc_415><loc_445><loc_435>Table 4: Results of structure with content retrieved using cell detection on PubTabNet. In all cases the input is PDF documents with cropped tables.</caption></otsl>
<page_footer><loc_241><loc_463><loc_245><loc_469>7</page_footer>
<unordered_list><page_break>
<list_item><loc_44><loc_50><loc_50><loc_55>a.</list_item>
<page_break>
<unordered_list><list_item><loc_44><loc_50><loc_50><loc_55>a.</list_item>
<list_item><loc_54><loc_50><loc_408><loc_55>Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells</list_item>
</unordered_list>
<section_header_level_1><loc_44><loc_60><loc_232><loc_64>Japanese language (previously unseen by TableFormer):</section_header_level_1>
@@ -127,8 +127,8 @@
<unordered_list><list_item><loc_256><loc_438><loc_445><loc_450>[1] Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-</list_item>
</unordered_list>
<page_footer><loc_241><loc_463><loc_245><loc_469>8</page_footer>
<unordered_list><page_break>
<list_item><loc_57><loc_48><loc_234><loc_74>end object detection with transformers. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision - ECCV 2020 , pages 213-229, Cham, 2020. Springer International Publishing. 5</list_item>
<page_break>
<unordered_list><list_item><loc_57><loc_48><loc_234><loc_74>end object detection with transformers. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision - ECCV 2020 , pages 213-229, Cham, 2020. Springer International Publishing. 5</list_item>
<list_item><loc_45><loc_76><loc_234><loc_95>[2] Zewen Chi, Heyan Huang, Heng-Da Xu, Houjin Yu, Wanxuan Yin, and Xian-Ling Mao. Complicated table structure recognition. arXiv preprint arXiv:1908.04729 , 2019. 3</list_item>
<list_item><loc_45><loc_97><loc_234><loc_116>[3] Bertrand Couasnon and Aurelie Lemaitre. Recognition of Tables and Forms , pages 647-677. Springer London, London, 2014. 2</list_item>
<list_item><loc_45><loc_118><loc_234><loc_143>[4] Herv´e D´ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2</list_item>
@@ -229,7 +229,6 @@
<otsl><loc_68><loc_148><loc_195><loc_184></otsl>
<otsl><loc_69><loc_195><loc_195><loc_232></otsl>
<otsl><loc_68><loc_250><loc_203><loc_308></otsl>
<caption><loc_52><loc_317><loc_223><loc_323>Figure 8: Example of a table with multi-line header.</caption>
<otsl><loc_254><loc_64><loc_454><loc_86><caption><loc_252><loc_194><loc_445><loc_207>Figure 9: Example of a table with big empty distance between cells.</caption></otsl>
<otsl><loc_253><loc_98><loc_454><loc_117></otsl>
<otsl><loc_253><loc_124><loc_454><loc_147></otsl>
@@ -244,12 +243,10 @@
<page_break>
<otsl><loc_42><loc_173><loc_231><loc_217></otsl>
<picture><loc_42><loc_66><loc_231><loc_218></picture>
<caption><loc_41><loc_225><loc_234><loc_238>Figure 11: Simple table with different style and empty cells.</caption>
<otsl><loc_42><loc_286><loc_254><loc_310></otsl>
<otsl><loc_42><loc_318><loc_254><loc_342></otsl>
<otsl><loc_42><loc_350><loc_254><loc_374></otsl>
<picture><loc_41><loc_386><loc_145><loc_414></picture>
<caption><loc_45><loc_424><loc_230><loc_430>Figure 12: Simple table predictions and post processing.</caption>
<otsl><loc_261><loc_102><loc_437><loc_135></otsl>
<otsl><loc_261><loc_143><loc_437><loc_177></otsl>
<otsl><loc_268><loc_182><loc_428><loc_226></otsl>
@@ -273,7 +270,6 @@
<picture><loc_289><loc_308><loc_405><loc_401></picture>
<otsl><loc_289><loc_308><loc_405><loc_401></otsl>
<otsl><loc_41><loc_319><loc_261><loc_399></otsl>
<caption><loc_252><loc_412><loc_445><loc_425>Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.</caption>
<page_footer><loc_239><loc_463><loc_247><loc_469>15</page_footer>
<page_break>
<picture><loc_55><loc_160><loc_432><loc_314><caption><loc_41><loc_321><loc_445><loc_334>Figure 17: Example of long table. End-to-end example from initial PDF cells to prediction of bounding boxes, post processing and prediction of structure.</caption></picture>
@@ -51,6 +51,8 @@ To meet the design criteria listed above, we developed a new model called TableF

The paper is structured as follows. In Sec. 2, we give a brief overview of the current state-of-the-art. In Sec. 3, we describe the datasets on which we train. In Sec. 4, we introduce the TableFormer model-architecture and describe

$^{1}$https://github.com/IBM/SynthTabNet

its results & performance in Sec. 5. As a conclusion, we describe how this new model-architecture can be re-purposed for other tasks in the computer-vision community.

## 2. Previous work and State of the Art
@@ -82,7 +82,6 @@
<text><loc_44><loc_364><loc_241><loc_446>Phase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. Therefore we prepared a subset of pages with two different complexity levels, each with a practice and an exam part. 974 pages were reference-annotated by one proficient core team member. Annotation staff were then given the task to annotate the same subsets (blinded from the reference). By comparing the annotations of each staff member with the reference annotations, we could quantify how closely their annotations matched the reference. Only after passing two exam levels with high annotation quality, staff were admitted into the production phase. Practice iterations</text>
<picture><loc_258><loc_54><loc_457><loc_290></picture>
<text><loc_327><loc_289><loc_389><loc_291>05237a14f2524e3f53c8454b074409d05078038a6a36b770fcc8ec7e540deae0</text>
<caption><loc_260><loc_299><loc_457><loc_318>Figure 4: Examples of plausible annotation alternatives for the same page. Criteria in our annotation guideline can resolve cases A to C, while the case D remains ambiguous.</caption>
<text><loc_259><loc_332><loc_456><loc_344>were carried out over a timeframe of 12 weeks, after which 8 of the 40 initially allocated annotators did not pass the bar.</text>
<text><loc_259><loc_346><loc_457><loc_448>Phase 4: Production annotation. The previously selected 80K pages were annotated with the defined 11 class labels by 32 annotators. This production phase took around three months to complete. All annotations were created online through CCS, which visualises the programmatic PDF text-cells as an overlay on the page. The page annotation are obtained by drawing rectangular bounding-boxes, as shown in Figure 3. With regard to the annotation practices, we implemented a few constraints and capabilities on the tooling level. First, we only allow non-overlapping, vertically oriented, rectangular boxes. For the large majority of documents, this constraint was sufficient and it speeds up the annotation considerably in comparison with arbitrary segmentation shapes. Second, annotator staff were not able to see each other's annotations. This was enforced by design to avoid any bias in the annotation, which could skew the numbers of the inter-annotator agreement (see Table 1). We wanted</text>
<page_break>
@@ -53,6 +53,8 @@ In this paper, we present the DocLayNet dataset. It provides pageby-page layout
- (3) Detailed Label Set : We define 11 class labels to distinguish layout features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although not a superset of ours.
- (4) Redundant Annotations : A fraction of the pages in the DocLayNet data set carry more than one human annotation.

$^{1}$https://developer.ibm.com/exchanges/data/all/doclaynet

This enables experimentation with annotation uncertainty and quality control analysis.

- (5) Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide fixed train-, test- & validation-sets to ensure proportional representation of the class-labels. Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores.
@@ -85,6 +87,8 @@ We did not control the document selection with regard to language. The vast majo

To ensure that future benchmarks in the document-layout analysis community can be easily compared, we have split up DocLayNet into pre-defined train-, test- and validation-sets. In this way, we can avoid spurious variations in the evaluation scores due to random splitting in train-, test- and validation-sets. We also ensured that less frequent labels are represented in train and test sets in equal proportions.

$^{2}$e.g. AAPL from https://www.annualreports.com/

Table 1 shows the overall frequency and distribution of the labels among the different sets. Importantly, we ensure that subsets are only split on full-document boundaries. This avoids that pages of the same document are spread over train, test and validation set, which can give an undesired evaluation advantage to models and lead to overestimation of their prediction accuracy. We will show the impact of this decision in Section 5.

In order to accommodate the different types of models currently in use by the community, we provide DocLayNet in an augmented COCO format [16]. This entails the standard COCO ground-truth file (in JSON format) with the associated page images (in PNG format, 1025 × 1025 pixels). Furthermore, custom fields have been added to each COCO record to specify document category, original document filename and page number. In addition, we also provide the original PDF pages, as well as sidecar files containing parsed PDF text and text-cell coordinates (in JSON). All additional files are linked to the primary page images by their matching filenames.
@@ -125,6 +129,8 @@ Preparation work included uploading and parsing the sourced PDF documents in the

Phase 2: Label selection and guideline. We reviewed the collected documents and identified the most common structural features they exhibit. This was achieved by identifying recurrent layout elements and lead us to the definition of 11 distinct class labels. These 11 class labels are Caption , Footnote , Formula , List-item , Pagefooter , Page-header , Picture , Section-header , Table , Text , and Title . Critical factors that were considered for the choice of these class labels were (1) the overall occurrence of the label, (2) the specificity of the label, (3) recognisability on a single page (i.e. no need for context from previous or next page) and (4) overall coverage of the page. Specificity ensures that the choice of label is not ambiguous, while coverage ensures that all meaningful items on a page can be annotated. We refrained from class labels that are very specific to a document category, such as Abstract in the Scientific Articles category. We also avoided class labels that are tightly linked to the semantics of the text. Labels such as Author and Affiliation , as seen in DocBank, are often only distinguishable by discriminating on

$^{3}$https://arxiv.org/

the textual content of an element, which goes beyond visual layout recognition, in particular outside the Scientific Articles category.

At first sight, the task of visual document-layout interpretation appears intuitive enough to obtain plausible annotations in most cases. However, during early trial-runs in the core team, we observed many cases in which annotators use different annotation styles, especially for documents with challenging layouts. For example, if a figure is presented with subfigures, one annotator might draw a single figure bounding-box, while another might annotate each subfigure separately. The same applies for lists, where one might annotate all list items in one block or each list item separately. In essence, we observed that challenging layouts would be annotated in different but plausible ways. To illustrate this, we show in Figure 4 multiple examples of plausible but inconsistent annotations on the same pages.
File diff suppressed because one or more lines are too long
@@ -2,7 +2,6 @@
<text><loc_109><loc_94><loc_390><loc_183>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</text>
<text><loc_109><loc_185><loc_390><loc_213>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet,</text>
<code><loc_110><loc_231><loc_215><loc_257><_unknown_>function add(a, b) { return a + b; } console.log(add(3, 5));</code>
<caption><loc_182><loc_221><loc_317><loc_226>Listing 1: Simple JavaScript Program</caption>
<text><loc_109><loc_265><loc_390><loc_353>Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet. Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam erat, sed diam voluptua. At vero eos et accusam et justo duo dolores et ea rebum. Stet clita kasd gubergren, no sea takimata sanctus est Lorem ipsum dolor sit amet.</text>
<text><loc_109><loc_355><loc_390><loc_383>Duis autem vel eum iriure dolor in hendrerit in vulputate velit esse molestie consequat, vel illum dolore eu feugiat nulla facilisis at vero eros et accumsan et iusto odio dignissim qui blandit praesent luptatum zzril delenit augue duis dolore te feugait nulla facilisi. Lorem ipsum dolor sit amet,</text>
<page_footer><loc_248><loc_439><loc_252><loc_445>1</page_footer>
@@ -105,7 +105,6 @@
<text><loc_112><loc_155><loc_424><loc_168>The FUNCTION_USAGE view contains function usage configuration details. Table 2-1 describes the columns in the FUNCTION_USAGE view.</text>
<otsl><loc_111><loc_183><loc_446><loc_279><ched>Column name<ched>Data type<ched>Description<nl><fcel>FUNCTION_ID<fcel>VARCHAR(30)<fcel>ID of the function.<nl><fcel>USER_NAME<fcel>VARCHAR(10)<fcel>Name of the user profile that has a usage setting for this function.<nl><fcel>USAGE<fcel>VARCHAR(7)<fcel>Usage setting: GLYPH<SM590000> ALLOWED: The user profile is allowed to use the function. GLYPH<SM590000> DENIED: The user profile is not allowed to use the function.<nl><fcel>USER_TYPE<fcel>VARCHAR(5)<fcel>Type of user profile: GLYPH<SM590000> USER: The user profile is a user. GLYPH<SM590000> GROUP: The user profile is a group.<nl><caption><loc_112><loc_176><loc_232><loc_182>Table 2-1 FUNCTION_USAGE view</caption></otsl>
<text><loc_112><loc_286><loc_447><loc_299>To discover who has authorization to define and manage RCAC, you can use the query that is shown in Example 2-1.</text>
<caption><loc_112><loc_307><loc_378><loc_312>Example 2-1 Query to determine who has authority to define and manage RCAC</caption>
<text><loc_112><loc_318><loc_140><loc_324>SELECT</text>
<text><loc_149><loc_318><loc_206><loc_324>function_id,</text>
<text><loc_136><loc_326><loc_197><loc_331>user_name,</text>
@@ -132,14 +131,12 @@
<page_footer><loc_290><loc_477><loc_428><loc_482>Chapter 2. Roles and separation of duties</page_footer>
<page_footer><loc_438><loc_477><loc_447><loc_482>11</page_footer>
<page_break>
<caption><loc_112><loc_45><loc_432><loc_59>The SQL CREATE PERMISSION statement that is shown in Figure 3-1 is used to define and initially enable or disable the row access rules.</caption>
<picture><loc_111><loc_68><loc_446><loc_259><caption><loc_112><loc_261><loc_279><loc_267>Figure 3-1 CREATE PERMISSION SQL statement</caption></picture>
<section_header_level_1><loc_112><loc_278><loc_176><loc_285>Column mask</section_header_level_1>
<text><loc_112><loc_287><loc_443><loc_316>A column mask is a database object that manifests a column value access control rule for a specific column in a specific table. It uses a CASE expression that describes what you see when you access the column. For example, a teller can see only the last four digits of a tax identification number.</text>
<page_footer><loc_282><loc_477><loc_428><loc_482>Chapter 3. Row and Column Access Control</page_footer>
<page_footer><loc_438><loc_477><loc_447><loc_482>15</page_footer>
<page_break>
<caption><loc_112><loc_45><loc_337><loc_51>Table 3-1 summarizes these special registers and their values.</caption>
<otsl><loc_110><loc_65><loc_443><loc_129><ched>Special register<ched>Corresponding value<nl><fcel>USER or SESSION_USER<fcel>The effective user of the thread excluding adopted authority.<nl><fcel>CURRENT_USER<fcel>The effective user of the thread including adopted authority. When no adopted authority is present, this has the same value as USER.<nl><fcel>SYSTEM_USER<fcel>The authorization ID that initiated the connection.<nl><caption><loc_112><loc_59><loc_304><loc_64>Table 3-1 Special registers and their corresponding values</caption></otsl>
<text><loc_112><loc_135><loc_440><loc_149>Figure 3-5 shows the difference in the special register values when an adopted authority is used:</text>
<unordered_list><list_item><loc_112><loc_154><loc_336><loc_160>GLYPH<SM590000> A user connects to the server using the user profile ALICE.</list_item>
@@ -180,11 +177,10 @@
<list_item><loc_124><loc_243><loc_433><loc_249>To implement this column mask, run the SQL statement that is shown in Example 3-9.</list_item>
</unordered_list>
<code><loc_112><loc_267><loc_430><loc_432><_unknown_>CREATE MASK HR_SCHEMA.MASK_TAX_ID_ON_EMPLOYEES ON HR_SCHEMA.EMPLOYEES AS EMPLOYEES FOR COLUMN TAX_ID RETURN CASE WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'HR' ) = 1 THEN EMPLOYEES . TAX_ID WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER = EMPLOYEES . USER_ID THEN EMPLOYEES . TAX_ID WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER <> EMPLOYEES . USER_ID THEN ( 'XXX-XX-' CONCAT QSYS2 . SUBSTR ( EMPLOYEES . TAX_ID , 8 , 4 ) ) WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'EMP' ) = 1 THEN EMPLOYEES . TAX_ID ELSE 'XXX-XX-XXXX' END ENABLE ;</code>
<caption><loc_112><loc_257><loc_288><loc_262>Example 3-9 Creating a mask on the TAX_ID column</caption>
<page_footer><loc_282><loc_477><loc_428><loc_482>Chapter 3. Row and Column Access Control</page_footer>
<page_footer><loc_438><loc_477><loc_447><loc_482>27</page_footer>
<unordered_list><page_break>
<list_item><loc_112><loc_45><loc_368><loc_51>3. Figure 3-10 shows the masks that are created in the HR_SCHEMA.</list_item>
<page_break>
<unordered_list><list_item><loc_112><loc_45><loc_368><loc_51>3. Figure 3-10 shows the masks that are created in the HR_SCHEMA.</list_item>
</unordered_list>
<picture><loc_52><loc_60><loc_447><loc_107><caption><loc_53><loc_110><loc_239><loc_115>Figure 3-10 Column masks shown in System i Navigator</caption></picture>
<section_header_level_1><loc_53><loc_128><loc_167><loc_135>3.6.6 Activating RCAC</section_header_level_1>
@@ -204,8 +200,8 @@
<picture><loc_52><loc_270><loc_433><loc_408><caption><loc_53><loc_410><loc_284><loc_415>Figure 3-11 Selecting the EMPLOYEES table from System i Navigator</caption></picture>
<page_footer><loc_53><loc_477><loc_64><loc_482>28</page_footer>
<page_footer><loc_76><loc_477><loc_273><loc_482>Row and Column Access Control Support in IBM DB2 for i</page_footer>
<unordered_list><page_break>
<list_item><loc_112><loc_45><loc_420><loc_66>2. Figure 4-68 shows the Visual Explain of the same SQL statement, but with RCAC enabled. It is clear that the implementation of the SQL statement is more complex because the row permission rule becomes part of the WHERE clause.</list_item>
<page_break>
<unordered_list><list_item><loc_112><loc_45><loc_420><loc_66>2. Figure 4-68 shows the Visual Explain of the same SQL statement, but with RCAC enabled. It is clear that the implementation of the SQL statement is more complex because the row permission rule becomes part of the WHERE clause.</list_item>
<list_item><loc_112><loc_320><loc_447><loc_341>3. Compare the advised indexes that are provided by the Optimizer without RCAC and with RCAC enabled. Figure 4-69 shows the index advice for the SQL statement without RCAC enabled. The index being advised is for the ORDER BY clause.</list_item>
</unordered_list>
<picture><loc_112><loc_75><loc_446><loc_301><caption><loc_112><loc_303><loc_267><loc_309>Figure 4-68 Visual Explain with RCAC enabled</caption></picture>
@@ -133,6 +133,10 @@ This chapter describes how you can secure and protect data in DB2 for i. The fol
- GLYPH<SM590000> Current state of IBM i security
- GLYPH<SM590000> DB2 for i security controls

$^{1 }$http://www.idtheftcenter.org

$^{2 }$http://www.ponemon.org /

## 1.1 Security fundamentals

Before reviewing database security techniques, there are two fundamental steps in securing information assets that must be described:
@@ -3,5 +3,5 @@
<text><loc_63><loc_258><loc_370><loc_277>رول لا لاول ةيرو ص م لا ةو موكحلا امخونرب دالوسي ،قبس خمل خً فوو 2024( -)2026 اتلآا وحهلا ىاي اهو ،ةسيئر ةيجيتارلسا اد هع ةعبرع قي حت :</text>
<text><loc_58><loc_301><loc_367><loc_317>نــــــــم ما ةــــــــيا م رـ صم لا يم وقل ا اــــسن ا ءاــــ نب رــــــــــــــــــــصم لا عاـــــصت ا ءاـــــ نب يــــــــــــــــــــــسبا نت قتسظا ق يقحت را ر يــــــــــــــــــــــــساي سلا</text>
<text><loc_61><loc_344><loc_367><loc_385>خهلوسحخب امخونرب لا ت خفدالوسم ديدحت لت دق هن ع ىلإ رخ لإا ردجت لكواب د روووصم ةو ووي ر تخ فدال ووو س م ىووو اي سيوووئر 2023 ر اوو وو حلا تخووو ساو تخيوووصوتو ، كيال ا تخ اووصيل اه،ووولا امخوونربلاو ،تارا ووو لا ت خ فدا لوو سمو ،اه،ووولا ،ةوو ي ا ةيه، ولا تخ ي جيتا رلسحا فالبمو .</text>
<picture><loc_375><loc_119><loc_500><loc_386></picture>
<picture><loc_375><loc_119><loc_499><loc_386></picture>
</doctag>