Merge pull request #556 from DS4SD/cau/layout-processing-improvement

feat: layout processing improvements and bugfixes

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-27 12:34:22 +00:00 · 2024-12-10 16:29:07 +01:00 · 2024-12-10 16:29:07 +01:00 · b66fb830c9
commit b66fb830c9
parent 184eed4095
52 changed files with 401 additions and 739 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,14 @@
+## [v2.10.0](https://github.com/DS4SD/docling/releases/tag/v2.10.0) - 2024-12-09
+
+### Feature
+
+* Docling-parse v2 as default PDF backend ([#549](https://github.com/DS4SD/docling/issues/549)) ([`aca57f0`](https://github.com/DS4SD/docling/commit/aca57f0527dddcc027dc1ee840e2e492ab997170))
+
+### Fix
+
+* Call into docling-core for legacy document transform ([#551](https://github.com/DS4SD/docling/issues/551)) ([`7972d47`](https://github.com/DS4SD/docling/commit/7972d47f88604f02d6a32527116c4d78eb1005e2))
+* Introduce Image format options in CLI. Silence the tqdm downloading messages. ([#544](https://github.com/DS4SD/docling/issues/544)) ([`78f61a8`](https://github.com/DS4SD/docling/commit/78f61a8522d3a19ecc1d605e8441fb543ca0fa96))
+
 ## [v2.9.0](https://github.com/DS4SD/docling/releases/tag/v2.9.0) - 2024-12-09

 ### Feature
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -29,8 +29,10 @@ from docling.datamodel.pipeline_options import (
    AcceleratorDevice,
    AcceleratorOptions,
    EasyOcrOptions,
+    OcrEngine,
    OcrMacOptions,
    OcrOptions,
+    PdfBackend,
    PdfPipelineOptions,
    RapidOcrOptions,
    TableFormerMode,
@@ -70,22 +72,6 @@ def version_callback(value: bool):
        raise typer.Exit()


-# Define an enum for the backend options
-class PdfBackend(str, Enum):
-    PYPDFIUM2 = "pypdfium2"
-    DLPARSE_V1 = "dlparse_v1"
-    DLPARSE_V2 = "dlparse_v2"
-
-
-# Define an enum for the ocr engines
-class OcrEngine(str, Enum):
-    EASYOCR = "easyocr"
-    TESSERACT_CLI = "tesseract_cli"
-    TESSERACT = "tesseract"
-    OCRMAC = "ocrmac"
-    RAPIDOCR = "rapidocr"
-
-
 def export_documents(
    conv_results: Iterable[ConversionResult],
    output_dir: Path,
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -19,12 +19,12 @@ if TYPE_CHECKING:


 class ConversionStatus(str, Enum):
-    PENDING = auto()
-    STARTED = auto()
-    FAILURE = auto()
-    SUCCESS = auto()
-    PARTIAL_SUCCESS = auto()
-    SKIPPED = auto()
+    PENDING = "pending"
+    STARTED = "started"
+    FAILURE = "failure"
+    SUCCESS = "success"
+    PARTIAL_SUCCESS = "partial_success"
+    SKIPPED = "skipped"


 class InputFormat(str, Enum):
@@ -89,15 +89,15 @@ MimeTypeToFormat = {


 class DocInputType(str, Enum):
-    PATH = auto()
-    STREAM = auto()
+    PATH = "path"
+    STREAM = "stream"


 class DoclingComponentType(str, Enum):
-    DOCUMENT_BACKEND = auto()
-    MODEL = auto()
-    DOC_ASSEMBLER = auto()
-    USER_INPUT = auto()
+    DOCUMENT_BACKEND = "document_backend"
+    MODEL = "model"
+    DOC_ASSEMBLER = "doc_assembler"
+    USER_INPUT = "user_input"


 class ErrorItem(BaseModel):
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -33,6 +33,7 @@ from docling_core.types.legacy_doc.document import (
 from docling_core.types.legacy_doc.document import CCSFileInfoObject as DsFileInfoObject
 from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
 from docling_core.utils.file import resolve_source_to_stream
+from docling_core.utils.legacy import docling_document_to_legacy
 from pydantic import BaseModel
 from typing_extensions import deprecated

@@ -191,259 +192,7 @@ class ConversionResult(BaseModel):
    @property
    @deprecated("Use document instead.")
    def legacy_document(self):
-        reverse_label_mapping = {
-            DocItemLabel.CAPTION.value: "Caption",
-            DocItemLabel.FOOTNOTE.value: "Footnote",
-            DocItemLabel.FORMULA.value: "Formula",
-            DocItemLabel.LIST_ITEM.value: "List-item",
-            DocItemLabel.PAGE_FOOTER.value: "Page-footer",
-            DocItemLabel.PAGE_HEADER.value: "Page-header",
-            DocItemLabel.PICTURE.value: "Picture",  # low threshold adjust to capture chemical structures for examples.
-            DocItemLabel.SECTION_HEADER.value: "Section-header",
-            DocItemLabel.TABLE.value: "Table",
-            DocItemLabel.TEXT.value: "Text",
-            DocItemLabel.TITLE.value: "Title",
-            DocItemLabel.DOCUMENT_INDEX.value: "Document Index",
-            DocItemLabel.CODE.value: "Code",
-            DocItemLabel.CHECKBOX_SELECTED.value: "Checkbox-Selected",
-            DocItemLabel.CHECKBOX_UNSELECTED.value: "Checkbox-Unselected",
-            DocItemLabel.FORM.value: "Form",
-            DocItemLabel.KEY_VALUE_REGION.value: "Key-Value Region",
-            DocItemLabel.PARAGRAPH.value: "paragraph",
-        }
-
-        title = ""
-        desc = DsDocumentDescription(logs=[])
-
-        page_hashes = [
-            PageReference(
-                hash=create_hash(self.input.document_hash + ":" + str(p.page_no - 1)),
-                page=p.page_no,
-                model="default",
-            )
-            for p in self.document.pages.values()
-        ]
-
-        file_info = DsFileInfoObject(
-            filename=self.input.file.name,
-            document_hash=self.input.document_hash,
-            num_pages=self.input.page_count,
-            page_hashes=page_hashes,
-        )
-
-        main_text = []
-        tables = []
-        figures = []
-        equations = []
-        footnotes = []
-        page_headers = []
-        page_footers = []
-
-        embedded_captions = set()
-        for ix, (item, level) in enumerate(
-            self.document.iterate_items(self.document.body)
-        ):
-
-            if isinstance(item, (TableItem, PictureItem)) and len(item.captions) > 0:
-                caption = item.caption_text(self.document)
-                if caption:
-                    embedded_captions.add(caption)
-
-        for item, level in self.document.iterate_items():
-            if isinstance(item, DocItem):
-                item_type = item.label
-
-                if isinstance(item, (TextItem, ListItem, SectionHeaderItem)):
-
-                    if isinstance(item, ListItem) and item.marker:
-                        text = f"{item.marker} {item.text}"
-                    else:
-                        text = item.text
-
-                    # Can be empty.
-                    prov = [
-                        Prov(
-                            bbox=p.bbox.as_tuple(),
-                            page=p.page_no,
-                            span=[0, len(item.text)],
-                        )
-                        for p in item.prov
-                    ]
-                    main_text.append(
-                        BaseText(
-                            text=text,
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            name=reverse_label_mapping[item.label],
-                            prov=prov,
-                        )
-                    )
-
-                    # skip captions of they are embedded in the actual
-                    # floating object
-                    if item_type == DocItemLabel.CAPTION and text in embedded_captions:
-                        continue
-
-                elif isinstance(item, TableItem) and item.data:
-                    index = len(tables)
-                    ref_str = f"#/tables/{index}"
-                    main_text.append(
-                        Ref(
-                            name=reverse_label_mapping[item.label],
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            ref=ref_str,
-                        ),
-                    )
-
-                    # Initialise empty table data grid (only empty cells)
-                    table_data = [
-                        [
-                            TableCell(
-                                text="",
-                                # bbox=[0,0,0,0],
-                                spans=[[i, j]],
-                                obj_type="body",
-                            )
-                            for j in range(item.data.num_cols)
-                        ]
-                        for i in range(item.data.num_rows)
-                    ]
-
-                    # Overwrite cells in table data for which there is actual cell content.
-                    for cell in item.data.table_cells:
-                        for i in range(
-                            min(cell.start_row_offset_idx, item.data.num_rows),
-                            min(cell.end_row_offset_idx, item.data.num_rows),
-                        ):
-                            for j in range(
-                                min(cell.start_col_offset_idx, item.data.num_cols),
-                                min(cell.end_col_offset_idx, item.data.num_cols),
-                            ):
-                                celltype = "body"
-                                if cell.column_header:
-                                    celltype = "col_header"
-                                elif cell.row_header:
-                                    celltype = "row_header"
-                                elif cell.row_section:
-                                    celltype = "row_section"
-
-                                def make_spans(cell):
-                                    for rspan in range(
-                                        min(
-                                            cell.start_row_offset_idx,
-                                            item.data.num_rows,
-                                        ),
-                                        min(
-                                            cell.end_row_offset_idx, item.data.num_rows
-                                        ),
-                                    ):
-                                        for cspan in range(
-                                            min(
-                                                cell.start_col_offset_idx,
-                                                item.data.num_cols,
-                                            ),
-                                            min(
-                                                cell.end_col_offset_idx,
-                                                item.data.num_cols,
-                                            ),
-                                        ):
-                                            yield [rspan, cspan]
-
-                                spans = list(make_spans(cell))
-                                table_data[i][j] = GlmTableCell(
-                                    text=cell.text,
-                                    bbox=(
-                                        cell.bbox.as_tuple()
-                                        if cell.bbox is not None
-                                        else None
-                                    ),  # check if this is bottom-left
-                                    spans=spans,
-                                    obj_type=celltype,
-                                    col=j,
-                                    row=i,
-                                    row_header=cell.row_header,
-                                    row_section=cell.row_section,
-                                    col_header=cell.column_header,
-                                    row_span=[
-                                        cell.start_row_offset_idx,
-                                        cell.end_row_offset_idx,
-                                    ],
-                                    col_span=[
-                                        cell.start_col_offset_idx,
-                                        cell.end_col_offset_idx,
-                                    ],
-                                )
-
-                    # Compute the caption
-                    caption = item.caption_text(self.document)
-
-                    tables.append(
-                        DsSchemaTable(
-                            text=caption,
-                            num_cols=item.data.num_cols,
-                            num_rows=item.data.num_rows,
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            data=table_data,
-                            prov=[
-                                Prov(
-                                    bbox=p.bbox.as_tuple(),
-                                    page=p.page_no,
-                                    span=[0, 0],
-                                )
-                                for p in item.prov
-                            ],
-                        )
-                    )
-
-                elif isinstance(item, PictureItem):
-                    index = len(figures)
-                    ref_str = f"#/figures/{index}"
-                    main_text.append(
-                        Ref(
-                            name=reverse_label_mapping[item.label],
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            ref=ref_str,
-                        ),
-                    )
-
-                    # Compute the caption
-                    caption = item.caption_text(self.document)
-
-                    figures.append(
-                        Figure(
-                            prov=[
-                                Prov(
-                                    bbox=p.bbox.as_tuple(),
-                                    page=p.page_no,
-                                    span=[0, len(caption)],
-                                )
-                                for p in item.prov
-                            ],
-                            obj_type=layout_label_to_ds_type.get(item.label),
-                            text=caption,
-                            # data=[[]],
-                        )
-                    )
-
-        page_dimensions = [
-            PageDimensions(page=p.page_no, height=p.size.height, width=p.size.width)
-            for p in self.document.pages.values()
-        ]
-
-        ds_doc = DsDocument(
-            name=title,
-            description=desc,
-            file_info=file_info,
-            main_text=main_text,
-            equations=equations,
-            footnotes=footnotes,
-            page_headers=page_headers,
-            page_footers=page_footers,
-            tables=tables,
-            figures=figures,
-            page_dimensions=page_dimensions,
-        )
-
-        return ds_doc
+        return docling_document_to_legacy(self.document)


 class _DummyBackend(AbstractDocumentBackend):
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -190,6 +190,26 @@ class OcrMacOptions(OcrOptions):
    )


+# Define an enum for the backend options
+class PdfBackend(str, Enum):
+    """Enum of valid PDF backends."""
+
+    PYPDFIUM2 = "pypdfium2"
+    DLPARSE_V1 = "dlparse_v1"
+    DLPARSE_V2 = "dlparse_v2"
+
+
+# Define an enum for the ocr engines
+class OcrEngine(str, Enum):
+    """Enum of valid OCR engines."""
+
+    EASYOCR = "easyocr"
+    TESSERACT_CLI = "tesseract_cli"
+    TESSERACT = "tesseract"
+    OCRMAC = "ocrmac"
+    RAPIDOCR = "rapidocr"
+
+
 class PipelineOptions(BaseModel):
    """Base pipeline options."""

--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@@ -80,7 +80,7 @@ class LayoutModel(BasePageModel):
            DocItemLabel.TITLE: (255, 153, 153),  # Light Red (same as Section-Header)
            DocItemLabel.FOOTNOTE: (200, 200, 255),  # Light Blue
            DocItemLabel.DOCUMENT_INDEX: (220, 220, 220),  # Light Gray
-            DocItemLabel.CODE: (255, 223, 186),  # Peach
+            DocItemLabel.CODE: (125, 125, 125),  # Gray
            DocItemLabel.CHECKBOX_SELECTED: (255, 182, 193),  # Pale Green
            DocItemLabel.CHECKBOX_UNSELECTED: (255, 182, 193),  # Light Pink
            DocItemLabel.FORM: (200, 255, 255),  # Light Cyan
--- a/docling/models/rapid_ocr_model.py
+++ b/docling/models/rapid_ocr_model.py
@@ -97,24 +97,25 @@ class RapidOcrModel(BaseOcrModel):
                        del high_res_image
                        del im

-                        cells = [
-                            OcrCell(
-                                id=ix,
-                                text=line[1],
-                                confidence=line[2],
-                                bbox=BoundingBox.from_tuple(
-                                    coord=(
-                                        (line[0][0][0] / self.scale) + ocr_rect.l,
-                                        (line[0][0][1] / self.scale) + ocr_rect.t,
-                                        (line[0][2][0] / self.scale) + ocr_rect.l,
-                                        (line[0][2][1] / self.scale) + ocr_rect.t,
+                        if result is not None:
+                            cells = [
+                                OcrCell(
+                                    id=ix,
+                                    text=line[1],
+                                    confidence=line[2],
+                                    bbox=BoundingBox.from_tuple(
+                                        coord=(
+                                            (line[0][0][0] / self.scale) + ocr_rect.l,
+                                            (line[0][0][1] / self.scale) + ocr_rect.t,
+                                            (line[0][2][0] / self.scale) + ocr_rect.l,
+                                            (line[0][2][1] / self.scale) + ocr_rect.t,
+                                        ),
+                                        origin=CoordOrigin.TOPLEFT,
                                    ),
-                                    origin=CoordOrigin.TOPLEFT,
-                                ),
-                            )
-                            for ix, line in enumerate(result)
-                        ]
-                        all_ocr_cells.extend(cells)
+                                )
+                                for ix, line in enumerate(result)
+                            ]
+                            all_ocr_cells.extend(cells)

                    # Post-process the cells
                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
--- a/docling/models/table_structure_model.py
+++ b/docling/models/table_structure_model.py
@@ -71,6 +71,10 @@ class TableStructureModel(BasePageModel):
            x0, y0, x1, y1 = table_element.cluster.bbox.as_tuple()
            draw.rectangle([(x0, y0), (x1, y1)], outline="red")

+            for cell in table_element.cluster.cells:
+                x0, y0, x1, y1 = cell.bbox.as_tuple()
+                draw.rectangle([(x0, y0), (x1, y1)], outline="green")
+
            for tc in table_element.table_cells:
                if tc.bbox is not None:
                    x0, y0, x1, y1 = tc.bbox.as_tuple()
@@ -84,7 +88,6 @@ class TableStructureModel(BasePageModel):
                        text=f"{tc.start_row_offset_idx}, {tc.start_col_offset_idx}",
                        fill="black",
                    )
-
        if show:
            image.show()
        else:
@@ -136,41 +139,33 @@ class TableStructureModel(BasePageModel):
                        yield page
                        continue

-                    tokens = []
-                    for c in page.cells:
-                        for cluster, _ in in_tables:
-                            if c.bbox.area() > 0:
-                                if (
-                                    c.bbox.intersection_area_with(cluster.bbox)
-                                    / c.bbox.area()
-                                    > 0.2
-                                ):
-                                    # Only allow non empty stings (spaces) into the cells of a table
-                                    if len(c.text.strip()) > 0:
-                                        new_cell = copy.deepcopy(c)
-                                        new_cell.bbox = new_cell.bbox.scaled(
-                                            scale=self.scale
-                                        )
-
-                                        tokens.append(new_cell.model_dump())
-
                    page_input = {
-                        "tokens": tokens,
                        "width": page.size.width * self.scale,
                        "height": page.size.height * self.scale,
+                        "image": numpy.asarray(page.get_image(scale=self.scale)),
                    }
-                    page_input["image"] = numpy.asarray(
-                        page.get_image(scale=self.scale)
-                    )

                    table_clusters, table_bboxes = zip(*in_tables)

                    if len(table_bboxes):
-                        tf_output = self.tf_predictor.multi_table_predict(
-                            page_input, table_bboxes, do_matching=self.do_cell_matching
-                        )
+                        for table_cluster, tbl_box in in_tables:

-                        for table_cluster, table_out in zip(table_clusters, tf_output):
+                            tokens = []
+                            for c in table_cluster.cells:
+                                # Only allow non empty stings (spaces) into the cells of a table
+                                if len(c.text.strip()) > 0:
+                                    new_cell = copy.deepcopy(c)
+                                    new_cell.bbox = new_cell.bbox.scaled(
+                                        scale=self.scale
+                                    )
+
+                                    tokens.append(new_cell.model_dump())
+                            page_input["tokens"] = tokens
+
+                            tf_output = self.tf_predictor.multi_table_predict(
+                                page_input, [tbl_box], do_matching=self.do_cell_matching
+                            )
+                            table_out = tf_output[0]
                            table_cells = []
                            for element in table_out["tf_responses"]:

--- a/docling/utils/layout_postprocessor.py
+++ b/docling/utils/layout_postprocessor.py
@@ -156,16 +156,16 @@ class LayoutPostprocessor:
    SPECIAL_TYPES = WRAPPER_TYPES | {DocItemLabel.PICTURE}

    CONFIDENCE_THRESHOLDS = {
-        DocItemLabel.CAPTION: 0.35,
-        DocItemLabel.FOOTNOTE: 0.35,
-        DocItemLabel.FORMULA: 0.35,
-        DocItemLabel.LIST_ITEM: 0.35,
-        DocItemLabel.PAGE_FOOTER: 0.35,
-        DocItemLabel.PAGE_HEADER: 0.35,
-        DocItemLabel.PICTURE: 0.1,
+        DocItemLabel.CAPTION: 0.5,
+        DocItemLabel.FOOTNOTE: 0.5,
+        DocItemLabel.FORMULA: 0.5,
+        DocItemLabel.LIST_ITEM: 0.5,
+        DocItemLabel.PAGE_FOOTER: 0.5,
+        DocItemLabel.PAGE_HEADER: 0.5,
+        DocItemLabel.PICTURE: 0.5,
        DocItemLabel.SECTION_HEADER: 0.45,
        DocItemLabel.TABLE: 0.35,
-        DocItemLabel.TEXT: 0.45,
+        DocItemLabel.TEXT: 0.55,  # 0.45,
        DocItemLabel.TITLE: 0.45,
        DocItemLabel.CODE: 0.45,
        DocItemLabel.CHECKBOX_SELECTED: 0.45,
@@ -218,6 +218,12 @@ class LayoutPostprocessor:
        final_clusters = self._sort_clusters(
            self.regular_clusters + self.special_clusters
        )
+        for cluster in final_clusters:
+            cluster.cells = self._sort_cells(cluster.cells)
+            # Also sort cells in children if any
+            for child in cluster.children:
+                child.cells = self._sort_cells(child.cells)
+
        return final_clusters, self.cells

    def _process_regular_clusters(self) -> List[Cluster]:
@@ -273,6 +279,8 @@ class LayoutPostprocessor:
            if c.confidence >= self.CONFIDENCE_THRESHOLDS[c.label]
        ]

+        special_clusters = self._handle_cross_type_overlaps(special_clusters)
+
        for special in special_clusters:
            contained = []
            for cluster in self.regular_clusters:
@@ -283,14 +291,17 @@ class LayoutPostprocessor:
                        contained.append(cluster)

            if contained:
-                # Sort contained clusters by minimum cell ID
-                contained.sort(
-                    key=lambda cluster: (
-                        min(cell.id for cell in cluster.cells)
-                        if cluster.cells
-                        else sys.maxsize
-                    )
-                )
+                # # Sort contained clusters by minimum cell ID:
+                # contained.sort(
+                #     key=lambda cluster: (
+                #         min(cell.id for cell in cluster.cells)
+                #         if cluster.cells
+                #         else sys.maxsize
+                #     )
+                # )
+
+                # Sort contained clusters left-to-right, top-to-bottom
+                contained = self._sort_clusters(contained)
                special.children = contained

                # Adjust bbox only for wrapper types
@@ -318,6 +329,109 @@ class LayoutPostprocessor:

        return picture_clusters + wrapper_clusters

+    def _handle_cross_type_overlaps(self, special_clusters) -> List[Cluster]:
+        """Handle overlaps between regular and wrapper clusters before child assignment.
+
+        In particular, KEY_VALUE_REGION proposals that are almost identical to a TABLE
+        should be removed.
+        """
+        wrappers_to_remove = set()
+
+        for wrapper in special_clusters:
+            if wrapper.label != DocItemLabel.KEY_VALUE_REGION:
+                continue  # only treat KEY_VALUE_REGION for now.
+
+            for regular in self.regular_clusters:
+                if regular.label == DocItemLabel.TABLE:
+                    # Calculate overlap
+                    overlap = regular.bbox.intersection_area_with(wrapper.bbox)
+                    wrapper_area = wrapper.bbox.area()
+                    overlap_ratio = overlap / wrapper_area
+
+                    # If wrapper is mostly overlapping with a TABLE, remove the wrapper
+                    if overlap_ratio > 0.8:  # 80% overlap threshold
+                        wrappers_to_remove.add(wrapper.id)
+                        break
+
+        # Filter out the identified wrappers
+        special_clusters = [
+            cluster
+            for cluster in special_clusters
+            if cluster.id not in wrappers_to_remove
+        ]
+
+        return special_clusters
+
+    def _should_prefer_cluster(
+        self, candidate: Cluster, other: Cluster, params: dict
+    ) -> bool:
+        """Determine if candidate cluster should be preferred over other cluster based on rules.
+        Returns True if candidate should be preferred, False if not."""
+
+        # Rule 1: LIST_ITEM vs TEXT
+        if (
+            candidate.label == DocItemLabel.LIST_ITEM
+            and other.label == DocItemLabel.TEXT
+        ):
+            # Check if areas are similar (within 20% of each other)
+            area_ratio = candidate.bbox.area() / other.bbox.area()
+            area_similarity = abs(1 - area_ratio) < 0.2
+            if area_similarity:
+                return True
+
+        # Rule 2: CODE vs others
+        if candidate.label == DocItemLabel.CODE:
+            # Calculate how much of the other cluster is contained within the CODE cluster
+            overlap = other.bbox.intersection_area_with(candidate.bbox)
+            containment = overlap / other.bbox.area()
+            if containment > 0.8:  # other is 80% contained within CODE
+                return True
+
+        # If no label-based rules matched, fall back to area/confidence thresholds
+        area_ratio = candidate.bbox.area() / other.bbox.area()
+        conf_diff = other.confidence - candidate.confidence
+
+        if (
+            area_ratio <= params["area_threshold"]
+            and conf_diff > params["conf_threshold"]
+        ):
+            return False
+
+        return True  # Default to keeping candidate if no rules triggered rejection
+
+    def _select_best_cluster_from_group(
+        self,
+        group_clusters: List[Cluster],
+        params: dict,
+    ) -> Cluster:
+        """Select best cluster from a group of overlapping clusters based on all rules."""
+        current_best = None
+
+        for candidate in group_clusters:
+            should_select = True
+
+            for other in group_clusters:
+                if other == candidate:
+                    continue
+
+                if not self._should_prefer_cluster(candidate, other, params):
+                    should_select = False
+                    break
+
+            if should_select:
+                if current_best is None:
+                    current_best = candidate
+                else:
+                    # If both clusters pass rules, prefer the larger one unless confidence differs significantly
+                    if (
+                        candidate.bbox.area() > current_best.bbox.area()
+                        and current_best.confidence - candidate.confidence
+                        <= params["conf_threshold"]
+                    ):
+                        current_best = candidate
+
+        return current_best if current_best else group_clusters[0]
+
    def _remove_overlapping_clusters(
        self,
        clusters: List[Cluster],
@@ -360,36 +474,15 @@ class LayoutPostprocessor:
                continue

            group_clusters = [valid_clusters[cid] for cid in group]
-            current_best = None
+            best = self._select_best_cluster_from_group(group_clusters, params)

-            for candidate in group_clusters:
-                should_select = True
-                for other in group_clusters:
-                    if other == candidate:
-                        continue
-
-                    area_ratio = candidate.bbox.area() / other.bbox.area()
-                    conf_diff = other.confidence - candidate.confidence
-
-                    if (
-                        area_ratio <= params["area_threshold"]
-                        and conf_diff > params["conf_threshold"]
-                    ):
-                        should_select = False
-                        break
-
-                if should_select:
-                    if current_best is None or (
-                        candidate.bbox.area() > current_best.bbox.area()
-                        and current_best.confidence - candidate.confidence
-                        <= params["conf_threshold"]
-                    ):
-                        current_best = candidate
-
-            best = current_best if current_best else group_clusters[0]
+            # Simple cell merging - no special cases
            for cluster in group_clusters:
                if cluster != best:
                    best.cells.extend(cluster.cells)
+
+            best.cells = self._deduplicate_cells(best.cells)
+            best.cells = self._sort_cells(best.cells)
            result.append(best)

        return result
@@ -424,6 +517,16 @@ class LayoutPostprocessor:

        return current_best if current_best else clusters[0]

+    def _deduplicate_cells(self, cells: List[Cell]) -> List[Cell]:
+        """Ensure each cell appears only once, maintaining order of first appearance."""
+        seen_ids = set()
+        unique_cells = []
+        for cell in cells:
+            if cell.id not in seen_ids:
+                seen_ids.add(cell.id)
+                unique_cells.append(cell)
+        return unique_cells
+
    def _assign_cells_to_clusters(
        self, clusters: List[Cluster], min_overlap: float = 0.2
    ) -> List[Cluster]:
@ -452,6 +555,10 @@ class LayoutPostprocessor:
            if best_cluster is not None:
                best_cluster.cells.append(cell)

+        # Deduplicate cells in each cluster after assignment
+        for cluster in clusters:
+            cluster.cells = self._deduplicate_cells(cluster.cells)
+
        return clusters

    def _find_unassigned_cells(self, clusters: List[Cluster]) -> List[Cell]:
@ -487,13 +594,10 @@ class LayoutPostprocessor:

        return clusters

+    def _sort_cells(self, cells: List[Cell]) -> List[Cell]:
+        """Sort cells by ascending id, which is taken as their native reading order."""
+        return sorted(cells, key=lambda c: (c.id))
+
    def _sort_clusters(self, clusters: List[Cluster]) -> List[Cluster]:
        """Sort clusters in reading order (top-to-bottom, left-to-right)."""
-
-        def reading_order_key(cluster: Cluster) -> Tuple[float, float]:
-            if cluster.cells and cluster.label != DocItemLabel.PICTURE:
-                first_cell = min(cluster.cells, key=lambda c: (c.bbox.t, c.bbox.l))
-                return (first_cell.bbox.t, first_cell.bbox.l)
-            return (cluster.bbox.t, cluster.bbox.l)
-
-        return sorted(clusters, key=reading_order_key)
+        return sorted(clusters, key=lambda cluster: (cluster.bbox.t, cluster.bbox.l))
--- a/docs/concepts/architecture.md
+++ b/docs/concepts/architecture.md
@ -10,7 +10,7 @@ For each document format, the *document converter* knows which format-specific *

 The *conversion result* contains the [*Docling document*](./docling_document.md), Docling's fundamental document representation.

-Some typical scenarios for using a Docling document include directly calling its *export methods*, such as for markdown, dictionary etc., or having it chunked by a *chunker*.
+Some typical scenarios for using a Docling document include directly calling its *export methods*, such as for markdown, dictionary etc., or having it chunked by a [*chunker*](./chunking.md).

 For more details on Docling's architecture, check out the [Docling Technical Report](https://arxiv.org/abs/2408.09869).

--- a/docs/reference/cli.md
+++ b/docs/reference/cli.md
@ -1,4 +1,4 @@
-# CLI Reference
+# CLI reference

 This page provides documentation for our command line tools.

--- a/docs/api_reference/docling_document.md
+++ b/docs/api_reference/docling_document.md
--- a/docs/api_reference/document_converter.md
+++ b/docs/api_reference/document_converter.md
--- a/docs/api_reference/pipeline_options.md
+++ b/docs/api_reference/pipeline_options.md
--- a/docs/usage.md
+++ b/docs/usage.md
@ -22,9 +22,7 @@ A simple example would look like this:
 docling https://arxiv.org/pdf/2206.01062
 ```

-To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./cli.md).
-
-
+To see all available options (export formats etc.) run `docling --help`. More details in the [CLI reference page](./reference/cli.md).

 ### Advanced options

@ -130,29 +128,37 @@ You can limit the CPU threads used by Docling by setting the environment variabl

 ## Chunking

-You can perform a hierarchy-aware chunking of a Docling document as follows:
+You can chunk a Docling document using a [chunker](concepts/chunking.md), such as a
+`HybridChunker`, as shown below (for more details, check out
+[this example](examples/hybrid_chunking.ipynb)):

 ```python
 from docling.document_converter import DocumentConverter
-from docling_core.transforms.chunker import HierarchicalChunker
+from docling.chunking import HybridChunker

 conv_res = DocumentConverter().convert("https://arxiv.org/pdf/2206.01062")
 doc = conv_res.document
-chunks = list(HierarchicalChunker().chunk(doc))

-print(chunks[30])
+chunker = HybridChunker(tokenizer="BAAI/bge-small-en-v1.5")  # set tokenizer as needed
+chunk_iter = chunker.chunk(doc)
+```
+
+An example chunk would look like this:
+
+```python
+print(list(chunk_iter)[11])
 # {
-#   "text": "Lately, new types of ML models for document-layout analysis have emerged [...]",
+#   "text": "In this paper, we present the DocLayNet dataset. [...]",
 #   "meta": {
 #     "doc_items": [{
-#       "self_ref": "#/texts/40",
+#       "self_ref": "#/texts/28",
 #       "label": "text",
 #       "prov": [{
 #         "page_no": 2,
-#         "bbox": {"l": 317.06, "t": 325.81, "r": 559.18, "b": 239.97, ...},
-#       }]
-#     }],
-#     "headings": ["2 RELATED WORK"],
+#         "bbox": {"l": 53.29, "t": 287.14, "r": 295.56, "b": 212.37, ...},
+#       }], ...,
+#     }, ...],
+#     "headings": ["1 INTRODUCTION"],
 #   }
 # }
 ```
--- a/mkdocs.yml
+++ b/mkdocs.yml
@ -56,7 +56,6 @@ nav:
    - "Docling": index.md
    - Installation: installation.md
    - Usage: usage.md
-    - CLI: cli.md
    - FAQ: faq.md
    - Docling v2: v2.md
  - Concepts:
@ -77,15 +76,12 @@ nav:
      - "Multimodal export": examples/export_multimodal.py
      - "Force full page OCR": examples/full_page_ocr.py
      - "Accelerator options": examples/run_with_acclerators.py
+    - Chunking:
+      - "Hybrid chunking": examples/hybrid_chunking.ipynb
    - RAG / QA:
      - "RAG with LlamaIndex 🦙": examples/rag_llamaindex.ipynb
      - "RAG with LangChain 🦜🔗": examples/rag_langchain.ipynb
      - "Hybrid RAG with Qdrant": examples/hybrid_rag_qdrant.ipynb
-    - Chunking:
-      - "Hybrid chunking": examples/hybrid_chunking.ipynb
-    #   - Chunking: examples/chunking.md
-    # - CLI:
-    #   - CLI: examples/cli.md
  - Integrations:
    - Integrations: integrations/index.md
    - "🐝 Bee": integrations/bee.md
@ -100,10 +96,13 @@ nav:
    - "spaCy": integrations/spacy.md
    - "txtai": integrations/txtai.md
    # - "LangChain 🦜🔗": integrations/langchain.md
-  - API reference:
-    - Document Converter: api_reference/document_converter.md
-    - Pipeline options: api_reference/pipeline_options.md
-    - Docling Document: api_reference/docling_document.md
+  - Reference:
+    - Python API:
+      - Document Converter: reference/document_converter.md
+      - Pipeline options: reference/pipeline_options.md
+      - Docling Document: reference/docling_document.md
+    - CLI:
+      - CLI reference: reference/cli.md

 markdown_extensions:
  - pymdownx.superfences
--- a/poetry.lock
+++ b/poetry.lock
@ -922,27 +922,29 @@ name = "docling-core"
 version = "2.9.0"
 description = "A python library to define and validate data types in Docling."
 optional = false
-python-versions = "<4.0,>=3.9"
-files = [
-    {file = "docling_core-2.9.0-py3-none-any.whl", hash = "sha256:b44b077db5d2ac8a900f30a15abe329c165b1f2eb7f1c90d1275c423c1c3d668"},
-    {file = "docling_core-2.9.0.tar.gz", hash = "sha256:1bf12fe67ee4852330e9bac33fe62b45598ff885481e03a88fa8e1bf48252424"},
-]
+python-versions = "^3.9"
+files = []
+develop = false

 [package.dependencies]
-jsonref = ">=1.1.0,<2.0.0"
-jsonschema = ">=4.16.0,<5.0.0"
-pandas = ">=2.1.4,<3.0.0"
-pillow = ">=10.3.0,<11.0.0"
-pydantic = ">=2.6.0,<2.10.0 || >2.10.0,<2.10.1 || >2.10.1,<2.10.2 || >2.10.2,<3.0.0"
+jsonref = "^1.1.0"
+jsonschema = "^4.16.0"
+pandas = "^2.1.4"
+pillow = "^10.3.0"
+pydantic = ">=2.6.0,<3.0.0,!=2.10.0,!=2.10.1,!=2.10.2"
 pyyaml = ">=5.1,<7.0.0"
-semchunk = {version = ">=2.2.0,<3.0.0", optional = true, markers = "extra == \"chunking\""}
-tabulate = ">=0.9.0,<0.10.0"
-transformers = {version = ">=4.34.0,<5.0.0", optional = true, markers = "extra == \"chunking\""}
-typing-extensions = ">=4.12.2,<5.0.0"
+tabulate = "^0.9.0"
+typing-extensions = "^4.12.2"

 [package.extras]
 chunking = ["semchunk (>=2.2.0,<3.0.0)", "transformers (>=4.34.0,<5.0.0)"]

+[package.source]
+type = "git"
+url = "ssh://git@github.com/DS4SD/docling-core.git"
+reference = "cau/include-picture-contents"
+resolved_reference = "012f8ac38a2ba7e77110b3f7ad57af2a984232e5"
+
 [[package]]
 name = "docling-ibm-models"
 version = "2.0.7"
@ -2855,32 +2857,6 @@ files = [
    {file = "more_itertools-10.5.0-py3-none-any.whl", hash = "sha256:037b0d3203ce90cca8ab1defbbdac29d5f993fc20131f3664dc8d6acfa872aef"},
 ]

-[[package]]
-name = "mpire"
-version = "2.10.2"
-description = "A Python package for easy multiprocessing, but faster than multiprocessing"
-optional = false
-python-versions = "*"
-files = [
-    {file = "mpire-2.10.2-py3-none-any.whl", hash = "sha256:d627707f7a8d02aa4c7f7d59de399dec5290945ddf7fbd36cbb1d6ebb37a51fb"},
-    {file = "mpire-2.10.2.tar.gz", hash = "sha256:f66a321e93fadff34585a4bfa05e95bd946cf714b442f51c529038eb45773d97"},
-]
-
-[package.dependencies]
-multiprocess = [
-    {version = "*", optional = true, markers = "python_version < \"3.11\" and extra == \"dill\""},
-    {version = ">=0.70.15", optional = true, markers = "python_version >= \"3.11\" and extra == \"dill\""},
-]
-pygments = ">=2.0"
-pywin32 = {version = ">=301", markers = "platform_system == \"Windows\""}
-tqdm = ">=4.27"
-
-[package.extras]
-dashboard = ["flask"]
-dill = ["multiprocess", "multiprocess (>=0.70.15)"]
-docs = ["docutils (==0.17.1)", "sphinx (==3.2.1)", "sphinx-autodoc-typehints (==1.11.0)", "sphinx-rtd-theme (==0.5.0)", "sphinx-versions (==1.0.1)", "sphinxcontrib-images (==0.9.2)"]
-testing = ["ipywidgets", "multiprocess", "multiprocess (>=0.70.15)", "numpy", "pywin32 (>=301)", "rich"]
-
 [[package]]
 name = "mpmath"
 version = "1.3.0"
@ -6170,21 +6146,6 @@ files = [
 cryptography = ">=2.0"
 jeepney = ">=0.6"

-[[package]]
-name = "semchunk"
-version = "2.2.0"
-description = "A fast and lightweight Python library for splitting text into semantically meaningful chunks."
-optional = false
-python-versions = ">=3.9"
-files = [
-    {file = "semchunk-2.2.0-py3-none-any.whl", hash = "sha256:7db19ca90ddb48f99265e789e07a7bb111ae25185f9cc3d44b94e1e61b9067fc"},
-    {file = "semchunk-2.2.0.tar.gz", hash = "sha256:4de761ce614036fa3bea61adbe47e3ade7c96ac9b062f223b3ac353dbfd26743"},
-]
-
-[package.dependencies]
-mpire = {version = "*", extras = ["dill"]}
-tqdm = "*"
-
 [[package]]
 name = "semver"
 version = "2.13.0"
@ -7723,4 +7684,4 @@ tesserocr = ["tesserocr"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "6917af8d76aa1f85a159f0ab9546478b4bef194ae726c79196bac087c7368fef"
+content-hash = "c991515ef231d9eeead33cc876e8cb93fe31e949a5ab92918a4b77257d2700a3"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "2.9.0"  # DO NOT EDIT, updated automatically
+version = "2.10.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = ["Christoph Auer <cau@zurich.ibm.com>", "Michele Dolfi <dol@zurich.ibm.com>", "Maxim Lysak <mly@zurich.ibm.com>", "Nikos Livathinos <nli@zurich.ibm.com>", "Ahmed Nassar <ahn@zurich.ibm.com>", "Panos Vagenas <pva@zurich.ibm.com>", "Peter Staar <taa@zurich.ibm.com>"]
 license = "MIT"
@ -28,7 +28,8 @@ python = "^3.9"
 docling-ibm-models = { git = "ssh://git@github.com/DS4SD/docling-ibm-models.git", branch = "nli/performance" }
 deepsearch-glm = "^1.0.0"
 docling-parse = "^3.0.0"
-docling-core = { version = "^2.9.0", extras = ["chunking"] }
+#docling-core = { version = "^2.9.0", extras = ["chunking"] }
+docling-core = { git = "ssh://git@github.com/DS4SD/docling-core.git", branch = "cau/include-picture-contents" }
 pydantic = "^2.0.0"
 filetype = "^1.2.0"
 pypdfium2 = "^4.30.0"
--- a/tests/data/groundtruth/docling_v1/2203.01017v2.doctags.txt
+++ b/tests/data/groundtruth/docling_v1/2203.01017v2.doctags.txt
@ -153,41 +153,20 @@
 </table>
 <paragraph><location><page_8><loc_9><loc_89><loc_10><loc_90></location>- a.</paragraph>
 <paragraph><location><page_8><loc_11><loc_89><loc_82><loc_90></location>- Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells</paragraph>
-<paragraph><location><page_8><loc_9><loc_87><loc_46><loc_88></location>Japanese language (previously unseen by TableFormer):</paragraph>
-<paragraph><location><page_8><loc_50><loc_87><loc_70><loc_88></location>Example table from FinTabNet:</paragraph>
-<figure>
-<location><page_8><loc_8><loc_76><loc_49><loc_87></location>
-</figure>
+<caption><location><page_8><loc_9><loc_87><loc_70><loc_88></location>Japanese language (previously unseen by TableFormer): Example table from FinTabNet:</caption>
 <caption><location><page_8><loc_9><loc_73><loc_63><loc_74></location>b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
 <figure>
-<location><page_8><loc_50><loc_77><loc_91><loc_88></location>
-<caption>b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
+<location><page_8><loc_8><loc_76><loc_49><loc_87></location>
+<caption>Japanese language (previously unseen by TableFormer): Example table from FinTabNet:b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
 </figure>
-<table>
+<figure>
 <location><page_8><loc_9><loc_63><loc_49><loc_72></location>
-<row_0><col_0><body></col_0><col_1><body></col_1><col_2><col_header>論文ファイル</col_2><col_3><col_header>論文ファイル</col_3><col_4><col_header>参考文献</col_4><col_5><col_header>参考文献</col_5></row_0>
-<row_1><col_0><col_header>出典</col_0><col_1><col_header>ファイル 数</col_1><col_2><col_header>英語</col_2><col_3><col_header>日本語</col_3><col_4><col_header>英語</col_4><col_5><col_header>日本語</col_5></row_1>
-<row_2><col_0><row_header>Association for Computational Linguistics(ACL2003)</col_0><col_1><body>65</col_1><col_2><body>65</col_2><col_3><body>0</col_3><col_4><body>150</col_4><col_5><body>0</col_5></row_2>
-<row_3><col_0><row_header>Computational Linguistics(COLING2002)</col_0><col_1><body>140</col_1><col_2><body>140</col_2><col_3><body>0</col_3><col_4><body>150</col_4><col_5><body>0</col_5></row_3>
-<row_4><col_0><row_header>電気情報通信学会 2003 年総合大会</col_0><col_1><body>150</col_1><col_2><body>8</col_2><col_3><body>142</col_3><col_4><body>223</col_4><col_5><body>147</col_5></row_4>
-<row_5><col_0><row_header>情報処理学会第 65 回全国大会 (2003)</col_0><col_1><body>177</col_1><col_2><body>1</col_2><col_3><body>176</col_3><col_4><body>150</col_4><col_5><body>236</col_5></row_5>
-<row_6><col_0><row_header>第 17 回人工知能学会全国大会 (2003)</col_0><col_1><body>208</col_1><col_2><body>5</col_2><col_3><body>203</col_3><col_4><body>152</col_4><col_5><body>244</col_5></row_6>
-<row_7><col_0><row_header>自然言語処理研究会第 146 〜 155 回</col_0><col_1><body>98</col_1><col_2><body>2</col_2><col_3><body>96</col_3><col_4><body>150</col_4><col_5><body>232</col_5></row_7>
-<row_8><col_0><row_header>WWW から収集した論文</col_0><col_1><body>107</col_1><col_2><body>73</col_2><col_3><body>34</col_3><col_4><body>147</col_4><col_5><body>96</col_5></row_8>
-<row_9><col_0><body></col_0><col_1><body>945</col_1><col_2><body>294</col_2><col_3><body>651</col_3><col_4><body>1122</col_4><col_5><body>955</col_5></row_9>
-</table>
+</figure>
 <caption><location><page_8><loc_62><loc_62><loc_90><loc_63></location>Text is aligned to match original for ease of viewing</caption>
-<table>
+<figure>
 <location><page_8><loc_50><loc_64><loc_90><loc_72></location>
 <caption>Text is aligned to match original for ease of viewing</caption>
-<row_0><col_0><body></col_0><col_1><col_header>Shares (in millions)</col_1><col_2><col_header>Shares (in millions)</col_2><col_3><col_header>Weighted Average Grant Date Fair Value</col_3><col_4><col_header>Weighted Average Grant Date Fair Value</col_4></row_0>
-<row_1><col_0><body></col_0><col_1><col_header>RS U s</col_1><col_2><col_header>PSUs</col_2><col_3><col_header>RSUs</col_3><col_4><col_header>PSUs</col_4></row_1>
-<row_2><col_0><row_header>Nonvested on Janua ry 1</col_0><col_1><body>1. 1</col_1><col_2><body>0.3</col_2><col_3><body>90.10 $</col_3><col_4><body>$ 91.19</col_4></row_2>
-<row_3><col_0><row_header>Granted</col_0><col_1><body>0. 5</col_1><col_2><body>0.1</col_2><col_3><body>117.44</col_3><col_4><body>122.41</col_4></row_3>
-<row_4><col_0><row_header>Vested</col_0><col_1><body>(0. 5 )</col_1><col_2><body>(0.1)</col_2><col_3><body>87.08</col_3><col_4><body>81.14</col_4></row_4>
-<row_5><col_0><row_header>Canceled or forfeited</col_0><col_1><body>(0. 1 )</col_1><col_2><body>-</col_2><col_3><body>102.01</col_3><col_4><body>92.18</col_4></row_5>
-<row_6><col_0><row_header>Nonvested on December 31</col_0><col_1><body>1.0</col_1><col_2><body>0.3</col_2><col_3><body>104.85 $</col_3><col_4><body>$ 104.51</col_4></row_6>
-</table>
+</figure>
 <caption><location><page_8><loc_8><loc_54><loc_89><loc_59></location>Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.</caption>
 <figure>
 <location><page_8><loc_8><loc_44><loc_35><loc_52></location>
@ -296,7 +275,7 @@
 <paragraph><location><page_13><loc_10><loc_35><loc_45><loc_37></location>Figure 8: Example of a table with multi-line header.</paragraph>
 <caption><location><page_13><loc_50><loc_59><loc_89><loc_61></location>Figure 9: Example of a table with big empty distance between cells.</caption>
 <figure>
-<location><page_13><loc_51><loc_63><loc_70><loc_68></location>
+<location><page_13><loc_51><loc_63><loc_91><loc_87></location>
 <caption>Figure 9: Example of a table with big empty distance between cells.</caption>
 </figure>
 <caption><location><page_13><loc_51><loc_13><loc_89><loc_14></location>Figure 10: Example of a complex table with empty cells.</caption>
@ -319,7 +298,11 @@
 <location><page_14><loc_52><loc_55><loc_87><loc_89></location>
 <caption>Figure 13: Table predictions example on colorful table.</caption>
 </figure>
-<paragraph><location><page_14><loc_56><loc_13><loc_83><loc_14></location>Figure 14: Example with multi-line text.</paragraph>
+<caption><location><page_14><loc_56><loc_13><loc_83><loc_14></location>Figure 14: Example with multi-line text.</caption>
+<figure>
+<location><page_14><loc_52><loc_25><loc_85><loc_31></location>
+<caption>Figure 14: Example with multi-line text.</caption>
+</figure>
 <figure>
 <location><page_15><loc_9><loc_69><loc_46><loc_83></location>
 </figure>
@ -335,6 +318,9 @@
 <caption>Figure 15: Example with triangular table.</caption>
 </figure>
 <figure>
+<location><page_15><loc_53><loc_72><loc_86><loc_85></location>
+</figure>
+<figure>
 <location><page_15><loc_53><loc_41><loc_86><loc_54></location>
 </figure>
 <caption><location><page_15><loc_50><loc_15><loc_89><loc_18></location>Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.</caption>
--- a/tests/data/groundtruth/docling_v1/2203.01017v2.json
+++ b/tests/data/groundtruth/docling_v1/2203.01017v2.json
--- a/tests/data/groundtruth/docling_v1/2203.01017v2.md
+++ b/tests/data/groundtruth/docling_v1/2203.01017v2.md
@ -219,40 +219,18 @@ Table 4: Results of structure with content retrieved using cell detection on Pub

 - Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells

-Japanese language (previously unseen by TableFormer):
-
-Example table from FinTabNet:
-
-
-<!-- image -->
+Japanese language (previously unseen by TableFormer): Example table from FinTabNet:

 b. Structure predicted by TableFormer, with superimposed matched PDF cell text:
+
+Japanese language (previously unseen by TableFormer): Example table from FinTabNet:b. Structure predicted by TableFormer, with superimposed matched PDF cell text:
 <!-- image -->


-
-|                                                    |             | 論文ファイル   | 論文ファイル   | 参考文献   | 参考文献   |
-|----------------------------------------------------|-------------|----------------|----------------|------------|------------|
-| 出典                                               | ファイル 数 | 英語           | 日本語         | 英語       | 日本語     |
-| Association for Computational Linguistics(ACL2003) | 65          | 65             | 0              | 150        | 0          |
-| Computational Linguistics(COLING2002)              | 140         | 140            | 0              | 150        | 0          |
-| 電気情報通信学会 2003 年総合大会                   | 150         | 8              | 142            | 223        | 147        |
-| 情報処理学会第 65 回全国大会 (2003)                | 177         | 1              | 176            | 150        | 236        |
-| 第 17 回人工知能学会全国大会 (2003)                | 208         | 5              | 203            | 152        | 244        |
-| 自然言語処理研究会第 146 〜 155 回                 | 98          | 2              | 96             | 150        | 232        |
-| WWW から収集した論文                               | 107         | 73             | 34             | 147        | 96         |
-|                                                    | 945         | 294            | 651            | 1122       | 955        |
+<!-- image -->

 Text is aligned to match original for ease of viewing
-
-|                          | Shares (in millions)   | Shares (in millions)   | Weighted Average Grant Date Fair Value   | Weighted Average Grant Date Fair Value   |
-|--------------------------|------------------------|------------------------|------------------------------------------|------------------------------------------|
-|                          | RS U s                 | PSUs                   | RSUs                                     | PSUs                                     |
-| Nonvested on Janua ry 1  | 1. 1                   | 0.3                    | 90.10 $                                  | $ 91.19                                  |
-| Granted                  | 0. 5                   | 0.1                    | 117.44                                   | 122.41                                   |
-| Vested                   | (0. 5 )                | (0.1)                  | 87.08                                    | 81.14                                    |
-| Canceled or forfeited    | (0. 1 )                | -                      | 102.01                                   | 92.18                                    |
-| Nonvested on December 31 | 1.0                    | 0.3                    | 104.85 $                                 | $ 104.51                                 |
+<!-- image -->

 Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.
 <!-- image -->
@ -458,6 +436,7 @@ Figure 13: Table predictions example on colorful table.
 <!-- image -->

 Figure 14: Example with multi-line text.
+<!-- image -->


 <!-- image -->
@ -472,6 +451,9 @@ Figure 15: Example with triangular table.
 <!-- image -->


+<!-- image -->
+
+
 <!-- image -->

 Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.
--- a/tests/data/groundtruth/docling_v1/2203.01017v2.pages.json
+++ b/tests/data/groundtruth/docling_v1/2203.01017v2.pages.json
--- a/tests/data/groundtruth/docling_v1/2206.01062.doctags.txt
+++ b/tests/data/groundtruth/docling_v1/2206.01062.doctags.txt
@ -3,17 +3,16 @@
 <paragraph><location><page_1><loc_15><loc_77><loc_32><loc_83></location>Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com</paragraph>
 <paragraph><location><page_1><loc_42><loc_77><loc_58><loc_83></location>Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com</paragraph>
 <paragraph><location><page_1><loc_69><loc_77><loc_85><loc_83></location>Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com</paragraph>
-<paragraph><location><page_1><loc_28><loc_70><loc_45><loc_76></location>Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com</paragraph>
-<paragraph><location><page_1><loc_55><loc_70><loc_72><loc_76></location>Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com</paragraph>
+<paragraph><location><page_1><loc_28><loc_71><loc_45><loc_76></location>Ahmed S. Nassar IBM Research Rueschlikon, Switzerland</paragraph>
+<paragraph><location><page_1><loc_29><loc_70><loc_44><loc_71></location>ahn@zurich.ibm.com</paragraph>
 <subtitle-level-1><location><page_1><loc_9><loc_67><loc_18><loc_69></location>ABSTRACT</subtitle-level-1>
 <paragraph><location><page_1><loc_9><loc_33><loc_48><loc_67></location>Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis.</paragraph>
 <subtitle-level-1><location><page_1><loc_9><loc_29><loc_22><loc_30></location>CCS CONCEPTS</subtitle-level-1>
 <paragraph><location><page_1><loc_9><loc_25><loc_49><loc_29></location>· Information systems → Document structure ; · Applied computing → Document analysis ; · Computing methodologies → Machine learning ; Computer vision ; Object detection ;</paragraph>
 <paragraph><location><page_1><loc_9><loc_15><loc_48><loc_20></location>Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).</paragraph>
-<paragraph><location><page_1><loc_9><loc_14><loc_32><loc_15></location>KDD '22, August 14-18, 2022, Washington, DC, USA</paragraph>
-<paragraph><location><page_1><loc_9><loc_13><loc_31><loc_14></location>© 2022 Copyright held by the owner/author(s).</paragraph>
-<paragraph><location><page_1><loc_9><loc_12><loc_26><loc_13></location>ACM ISBN 978-1-4503-9385-0/22/08.</paragraph>
+<paragraph><location><page_1><loc_9><loc_12><loc_32><loc_15></location>KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08.</paragraph>
 <paragraph><location><page_1><loc_9><loc_11><loc_27><loc_12></location>https://doi.org/10.1145/3534678.3539043</paragraph>
+<paragraph><location><page_1><loc_55><loc_70><loc_72><loc_76></location>Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com</paragraph>
 <caption><location><page_1><loc_52><loc_29><loc_91><loc_32></location>Figure 1: Four examples of complex page layouts across different document categories</caption>
 <figure>
 <location><page_1><loc_53><loc_34><loc_90><loc_68></location>
--- a/tests/data/groundtruth/docling_v1/2206.01062.json
+++ b/tests/data/groundtruth/docling_v1/2206.01062.json
--- a/tests/data/groundtruth/docling_v1/2206.01062.md
+++ b/tests/data/groundtruth/docling_v1/2206.01062.md
@ -6,9 +6,9 @@ Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com

 Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com

-Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com
+Ahmed S. Nassar IBM Research Rueschlikon, Switzerland

-Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com
+ahn@zurich.ibm.com

 ## ABSTRACT

@ -20,14 +20,12 @@ Accurate document layout analysis is a key requirement for highquality PDF docum

 Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).

-KDD '22, August 14-18, 2022, Washington, DC, USA
-
-© 2022 Copyright held by the owner/author(s).
-
-ACM ISBN 978-1-4503-9385-0/22/08.
+KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08.

 https://doi.org/10.1145/3534678.3539043

+Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com
+
 Figure 1: Four examples of complex page layouts across different document categories
 <!-- image -->

--- a/tests/data/groundtruth/docling_v1/2206.01062.pages.json
+++ b/tests/data/groundtruth/docling_v1/2206.01062.pages.json
--- a/tests/data/groundtruth/docling_v1/2305.03393v1.doctags.txt
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1.doctags.txt
@ -1,6 +1,6 @@
 <document>
 <subtitle-level-1><location><page_1><loc_22><loc_82><loc_79><loc_85></location>Optimized Table Tokenization for Table Structure Recognition</subtitle-level-1>
-<paragraph><location><page_1><loc_23><loc_75><loc_78><loc_79></location>Maksym Lysak [0000 - 0002 - 3723 - $^{6960]}$, Ahmed Nassar[0000 - 0002 - 9468 - $^{0822]}$, Nikolaos Livathinos [0000 - 0001 - 8513 - $^{3491]}$, Christoph Auer[0000 - 0001 - 5761 - $^{0422]}$, [0000 - 0002 - 8088 - 0823]</paragraph>
+<paragraph><location><page_1><loc_23><loc_75><loc_78><loc_79></location>Maksym Lysak [0000 − 0002 − 3723 − $^{6960]}$, Ahmed Nassar[0000 − 0002 − 9468 − $^{0822]}$, Nikolaos Livathinos [0000 − 0001 − 8513 − $^{3491]}$, Christoph Auer[0000 − 0001 − 5761 − $^{0422]}$, [0000 − 0002 − 8088 − 0823]</paragraph>
 <paragraph><location><page_1><loc_38><loc_74><loc_49><loc_75></location>and Peter Staar</paragraph>
 <paragraph><location><page_1><loc_46><loc_72><loc_55><loc_73></location>IBM Research</paragraph>
 <paragraph><location><page_1><loc_36><loc_70><loc_64><loc_71></location>{mly,ahn,nli,cau,taa}@zurich.ibm.com</paragraph>
--- a/tests/data/groundtruth/docling_v1/2305.03393v1.json
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1.json
--- a/tests/data/groundtruth/docling_v1/2305.03393v1.md
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1.md
@ -1,6 +1,6 @@
 ## Optimized Table Tokenization for Table Structure Recognition

-Maksym Lysak [0000 - 0002 - 3723 - $^{6960]}$, Ahmed Nassar[0000 - 0002 - 9468 - $^{0822]}$, Nikolaos Livathinos [0000 - 0001 - 8513 - $^{3491]}$, Christoph Auer[0000 - 0001 - 5761 - $^{0422]}$, [0000 - 0002 - 8088 - 0823]
+Maksym Lysak [0000 − 0002 − 3723 − $^{6960]}$, Ahmed Nassar[0000 − 0002 − 9468 − $^{0822]}$, Nikolaos Livathinos [0000 − 0001 − 8513 − $^{3491]}$, Christoph Auer[0000 − 0001 − 5761 − $^{0422]}$, [0000 − 0002 − 8088 − 0823]

 and Peter Staar

--- a/tests/data/groundtruth/docling_v1/2305.03393v1.pages.json
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1.pages.json
--- a/tests/data/groundtruth/docling_v1/redp5110_sampled.doctags.txt
+++ b/tests/data/groundtruth/docling_v1/redp5110_sampled.doctags.txt
@ -5,10 +5,7 @@
 </figure>
 <subtitle-level-1><location><page_1><loc_6><loc_79><loc_96><loc_89></location>Row and Column Access Control Support in IBM DB2 for i</subtitle-level-1>
 <figure>
-<location><page_1><loc_5><loc_11><loc_96><loc_63></location>
-</figure>
-<figure>
-<location><page_1><loc_52><loc_2><loc_95><loc_10></location>
+<location><page_1><loc_3><loc_1><loc_96><loc_64></location>
 </figure>
 <subtitle-level-1><location><page_2><loc_11><loc_88><loc_28><loc_91></location>Contents</subtitle-level-1>
 <table>
@ -105,7 +102,9 @@
 <location><page_5><loc_5><loc_70><loc_39><loc_91></location>
 </figure>
 <paragraph><location><page_5><loc_13><loc_65><loc_19><loc_66></location>Chapter 1.</paragraph>
-<paragraph><location><page_5><loc_82><loc_84><loc_85><loc_88></location>1</paragraph>
+<figure>
+<location><page_5><loc_78><loc_82><loc_89><loc_91></location>
+</figure>
 <subtitle-level-1><location><page_5><loc_22><loc_61><loc_89><loc_68></location>Securing and protecting IBM DB2 data</subtitle-level-1>
 <paragraph><location><page_5><loc_22><loc_46><loc_89><loc_56></location>Recent news headlines are filled with reports of data breaches and cyber-attacks impacting global businesses of all sizes. The Identity Theft Resource Center$^{1}$ reports that almost 5000 data breaches have occurred since 2005, exposing over 600 million records of data. The financial cost of these data breaches is skyrocketing. Studies from the Ponemon Institute$^{2}$ revealed that the average cost of a data breach increased in 2013 by 15% globally and resulted in a brand equity loss of $9.4 million per attack. The average cost that is incurred for each lost record containing sensitive information increased more than 9% to $145 per record.</paragraph>
 <paragraph><location><page_5><loc_22><loc_38><loc_86><loc_44></location>Businesses must make a serious effort to secure their data and recognize that securing information assets is a cost of doing business. In many parts of the world and in many industries, securing the data is required by law and subject to audits. Data security is no longer an option; it is a requirement.</paragraph>
@ -155,17 +154,7 @@
 </table>
 <paragraph><location><page_8><loc_22><loc_40><loc_89><loc_43></location>To discover who has authorization to define and manage RCAC, you can use the query that is shown in Example 2-1.</paragraph>
 <paragraph><location><page_8><loc_22><loc_38><loc_76><loc_39></location>Example 2-1 Query to determine who has authority to define and manage RCAC</paragraph>
-<paragraph><location><page_8><loc_22><loc_35><loc_28><loc_36></location>SELECT</paragraph>
-<paragraph><location><page_8><loc_30><loc_35><loc_41><loc_36></location>function_id,</paragraph>
-<paragraph><location><page_8><loc_27><loc_34><loc_39><loc_35></location>user_name,</paragraph>
-<paragraph><location><page_8><loc_28><loc_32><loc_36><loc_33></location>usage,</paragraph>
-<paragraph><location><page_8><loc_27><loc_31><loc_39><loc_32></location>user_type</paragraph>
-<paragraph><location><page_8><loc_22><loc_29><loc_26><loc_30></location>FROM</paragraph>
-<paragraph><location><page_8><loc_29><loc_29><loc_43><loc_30></location>function_usage</paragraph>
-<paragraph><location><page_8><loc_22><loc_28><loc_27><loc_29></location>WHERE</paragraph>
-<paragraph><location><page_8><loc_29><loc_28><loc_54><loc_29></location>function_id=’QIBM_DB_SECADM’</paragraph>
-<paragraph><location><page_8><loc_22><loc_26><loc_29><loc_27></location>ORDER BY</paragraph>
-<paragraph><location><page_8><loc_31><loc_26><loc_39><loc_27></location>user_name;</paragraph>
+<table><location><page_8><loc_22><loc_26><loc_89><loc_37></location>SELECT function_id, user_name, usage, user_type FROM function_usage WHERE function_id=’QIBM_DB_SECADM’ ORDER BY user_name;</table>
 <subtitle-level-1><location><page_8><loc_11><loc_20><loc_41><loc_22></location>2.2 Separation of duties</subtitle-level-1>
 <paragraph><location><page_8><loc_22><loc_10><loc_89><loc_18></location>Separation of duties helps businesses comply with industry regulations or organizational requirements and simplifies the management of authorities. Separation of duties is commonly used to prevent fraudulent activities or errors by a single person. It provides the ability for administrative functions to be divided across individuals without overlapping responsibilities, so that one user does not possess unlimited authority, such as with the *ALLOBJ authority.</paragraph>
 <paragraph><location><page_9><loc_22><loc_82><loc_89><loc_91></location>For example, assume that a business has assigned the duty to manage security on IBM i to Theresa. Before release IBM i 7.2, to grant privileges, Theresa had to have the same privileges Theresa was granting to others. Therefore, to grant *USE privileges to the PAYROLL table, Theresa had to have *OBJMGT and *USE authority (or a higher level of authority, such as *ALLOBJ). This requirement allowed Theresa to access the data in the PAYROLL table even though Theresa's job description was only to manage its security.</paragraph>
@ -247,7 +236,7 @@
 <paragraph><location><page_12><loc_22><loc_34><loc_66><loc_35></location>- 1. There are user profiles for MGR, JANE, JUDY, and TONY.</paragraph>
 <paragraph><location><page_12><loc_22><loc_32><loc_65><loc_33></location>- 2. The user profile JANE specifies a group profile of MGR.</paragraph>
 <paragraph><location><page_12><loc_22><loc_28><loc_88><loc_31></location>- 3. If a user is connected to the server using user profile JANE, all of the following function invocations return a value of 1:</paragraph>
-<paragraph><location><page_12><loc_25><loc_19><loc_74><loc_27></location>VERIFY_GROUP_FOR_USER (CURRENT_USER, 'MGR') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR') The following function invocation returns a value of 0: VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JUDY', 'TONY') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR', 'STEVE')</paragraph>
+<paragraph><location><page_12><loc_25><loc_19><loc_74><loc_27></location>VERIFY_GROUP_FOR_USER (CURRENT_USER, 'MGR') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR', 'STEVE') The following function invocation returns a value of 0: VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JUDY', 'TONY')</paragraph>
 <paragraph><location><page_13><loc_22><loc_90><loc_27><loc_91></location>RETURN</paragraph>
 <paragraph><location><page_13><loc_22><loc_88><loc_26><loc_89></location>CASE</paragraph>
 <paragraph><location><page_13><loc_22><loc_67><loc_85><loc_88></location>WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'HR', 'EMP' ) = 1 THEN EMPLOYEES . DATE_OF_BIRTH WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER = EMPLOYEES . USER_ID THEN EMPLOYEES . DATE_OF_BIRTH WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER <> EMPLOYEES . USER_ID THEN ( 9999 || '-' || MONTH ( EMPLOYEES . DATE_OF_BIRTH ) || '-' || DAY (EMPLOYEES.DATE_OF_BIRTH )) ELSE NULL END ENABLE ;</paragraph>
@ -269,12 +258,7 @@
 <paragraph><location><page_14><loc_22><loc_67><loc_89><loc_71></location>Now that you have created the row permission and the two column masks, RCAC must be activated. The row permission and the two column masks are enabled (last clause in the scripts), but now you must activate RCAC on the table. To do so, complete the following steps:</paragraph>
 <paragraph><location><page_14><loc_22><loc_65><loc_67><loc_66></location>- 1. Run the SQL statements that are shown in Example 3-10.</paragraph>
 <subtitle-level-1><location><page_14><loc_22><loc_62><loc_61><loc_63></location>Example 3-10 Activating RCAC on the EMPLOYEES table</subtitle-level-1>
-<paragraph><location><page_14><loc_22><loc_60><loc_62><loc_61></location>- /* Active Row Access Control (permissions) */</paragraph>
-<paragraph><location><page_14><loc_22><loc_58><loc_58><loc_60></location>- /* Active Column Access Control (masks)</paragraph>
-<paragraph><location><page_14><loc_60><loc_58><loc_62><loc_60></location>*/</paragraph>
-<paragraph><location><page_14><loc_22><loc_57><loc_48><loc_58></location>ALTER TABLE HR_SCHEMA.EMPLOYEES</paragraph>
-<paragraph><location><page_14><loc_22><loc_55><loc_44><loc_56></location>ACTIVATE ROW ACCESS CONTROL</paragraph>
-<paragraph><location><page_14><loc_22><loc_54><loc_48><loc_55></location>ACTIVATE COLUMN ACCESS CONTROL;</paragraph>
+<paragraph><location><page_14><loc_22><loc_54><loc_62><loc_61></location>- /* Active Row Access Control (permissions) */ /* Active Column Access Control (masks) */ ALTER TABLE HR_SCHEMA.EMPLOYEES ACTIVATE ROW ACCESS CONTROL ACTIVATE COLUMN ACCESS CONTROL;</paragraph>
 <paragraph><location><page_14><loc_22><loc_48><loc_88><loc_52></location>- 2. Look at the definition of the EMPLOYEE table, as shown in Figure 3-11. To do this, from the main navigation pane of System i Navigator, click Schemas  HR_SCHEMA  Tables , right-click the EMPLOYEES table, and click Definition .</paragraph>
 <caption><location><page_14><loc_11><loc_17><loc_57><loc_18></location>Figure 3-11 Selecting the EMPLOYEES table from System i Navigator</caption>
 <figure>
--- a/tests/data/groundtruth/docling_v1/redp5110_sampled.json
+++ b/tests/data/groundtruth/docling_v1/redp5110_sampled.json
--- a/tests/data/groundtruth/docling_v1/redp5110_sampled.md
+++ b/tests/data/groundtruth/docling_v1/redp5110_sampled.md
@ -6,9 +6,6 @@ Front cover
 ## Row and Column Access Control Support in IBM DB2 for i


-<!-- image -->
-
-
 <!-- image -->

 ## Contents
@ -141,7 +138,8 @@ Hernando Bedoya is a Senior IT Specialist at STG Lab Services and Training in Ro

 Chapter 1.

-1
+
+<!-- image -->

 ## Securing and protecting IBM DB2 data

@ -223,27 +221,7 @@ To discover who has authorization to define and manage RCAC, you can use the que

 Example 2-1 Query to determine who has authority to define and manage RCAC

-SELECT
-
-function_id,
-
-user_name,
-
-usage,
-
-user_type
-
-FROM
-
-function_usage
-
-WHERE
-
-function_id=’QIBM_DB_SECADM’
-
-ORDER BY
-
-user_name;
+SELECT function_id, user_name, usage, user_type FROM function_usage WHERE function_id=’QIBM_DB_SECADM’ ORDER BY user_name;

 ## 2.2 Separation of duties

@ -350,7 +328,7 @@ Here is an example of using the VERIFY_GROUP_FOR_USER function:

 - 3. If a user is connected to the server using user profile JANE, all of the following function invocations return a value of 1:

-VERIFY_GROUP_FOR_USER (CURRENT_USER, 'MGR') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR') The following function invocation returns a value of 0: VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JUDY', 'TONY') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR', 'STEVE')
+VERIFY_GROUP_FOR_USER (CURRENT_USER, 'MGR') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR', 'STEVE') The following function invocation returns a value of 0: VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JUDY', 'TONY')

 RETURN

@ -387,17 +365,7 @@ Now that you have created the row permission and the two column masks, RCAC must

 ## Example 3-10 Activating RCAC on the EMPLOYEES table

- /* Active Row Access Control (permissions) */
-
- /* Active Column Access Control (masks)
-
-*/
-
-ALTER TABLE HR_SCHEMA.EMPLOYEES
-
-ACTIVATE ROW ACCESS CONTROL
-
-ACTIVATE COLUMN ACCESS CONTROL;
+- /* Active Row Access Control (permissions) */ /* Active Column Access Control (masks) */ ALTER TABLE HR_SCHEMA.EMPLOYEES ACTIVATE ROW ACCESS CONTROL ACTIVATE COLUMN ACCESS CONTROL;

 - 2. Look at the definition of the EMPLOYEE table, as shown in Figure 3-11. To do this, from the main navigation pane of System i Navigator, click Schemas  HR_SCHEMA  Tables , right-click the EMPLOYEES table, and click Definition .

--- a/tests/data/groundtruth/docling_v1/redp5110_sampled.pages.json
+++ b/tests/data/groundtruth/docling_v1/redp5110_sampled.pages.json
--- a/tests/data/groundtruth/docling_v2/2203.01017v2.doctags.txt
+++ b/tests/data/groundtruth/docling_v2/2203.01017v2.doctags.txt
@ -156,39 +156,17 @@
 <list_item><location><page_8><loc_9><loc_89><loc_10><loc_90></location>a.</list_item>
 <list_item><location><page_8><loc_11><loc_89><loc_82><loc_90></location>Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells</list_item>
 </unordered_list>
-<text><location><page_8><loc_9><loc_87><loc_46><loc_88></location>Japanese language (previously unseen by TableFormer):</text>
-<text><location><page_8><loc_50><loc_87><loc_70><loc_88></location>Example table from FinTabNet:</text>
 <figure>
 <location><page_8><loc_8><loc_76><loc_49><loc_87></location>
+<caption>Japanese language (previously unseen by TableFormer): Example table from FinTabNet:b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
 </figure>
 <figure>
-<location><page_8><loc_50><loc_77><loc_91><loc_88></location>
-<caption>b. Structure predicted by TableFormer, with superimposed matched PDF cell text:</caption>
-</figure>
-<table>
 <location><page_8><loc_9><loc_63><loc_49><loc_72></location>
-<row_0><col_0><body></col_0><col_1><body></col_1><col_2><col_header>論文ファイル</col_2><col_3><col_header>論文ファイル</col_3><col_4><col_header>参考文献</col_4><col_5><col_header>参考文献</col_5></row_0>
-<row_1><col_0><col_header>出典</col_0><col_1><col_header>ファイル 数</col_1><col_2><col_header>英語</col_2><col_3><col_header>日本語</col_3><col_4><col_header>英語</col_4><col_5><col_header>日本語</col_5></row_1>
-<row_2><col_0><row_header>Association for Computational Linguistics(ACL2003)</col_0><col_1><body>65</col_1><col_2><body>65</col_2><col_3><body>0</col_3><col_4><body>150</col_4><col_5><body>0</col_5></row_2>
-<row_3><col_0><row_header>Computational Linguistics(COLING2002)</col_0><col_1><body>140</col_1><col_2><body>140</col_2><col_3><body>0</col_3><col_4><body>150</col_4><col_5><body>0</col_5></row_3>
-<row_4><col_0><row_header>電気情報通信学会 2003 年総合大会</col_0><col_1><body>150</col_1><col_2><body>8</col_2><col_3><body>142</col_3><col_4><body>223</col_4><col_5><body>147</col_5></row_4>
-<row_5><col_0><row_header>情報処理学会第 65 回全国大会 (2003)</col_0><col_1><body>177</col_1><col_2><body>1</col_2><col_3><body>176</col_3><col_4><body>150</col_4><col_5><body>236</col_5></row_5>
-<row_6><col_0><row_header>第 17 回人工知能学会全国大会 (2003)</col_0><col_1><body>208</col_1><col_2><body>5</col_2><col_3><body>203</col_3><col_4><body>152</col_4><col_5><body>244</col_5></row_6>
-<row_7><col_0><row_header>自然言語処理研究会第 146 〜 155 回</col_0><col_1><body>98</col_1><col_2><body>2</col_2><col_3><body>96</col_3><col_4><body>150</col_4><col_5><body>232</col_5></row_7>
-<row_8><col_0><row_header>WWW から収集した論文</col_0><col_1><body>107</col_1><col_2><body>73</col_2><col_3><body>34</col_3><col_4><body>147</col_4><col_5><body>96</col_5></row_8>
-<row_9><col_0><body></col_0><col_1><body>945</col_1><col_2><body>294</col_2><col_3><body>651</col_3><col_4><body>1122</col_4><col_5><body>955</col_5></row_9>
-</table>
-<table>
+</figure>
+<figure>
 <location><page_8><loc_50><loc_64><loc_90><loc_72></location>
 <caption>Text is aligned to match original for ease of viewing</caption>
-<row_0><col_0><body></col_0><col_1><col_header>Shares (in millions)</col_1><col_2><col_header>Shares (in millions)</col_2><col_3><col_header>Weighted Average Grant Date Fair Value</col_3><col_4><col_header>Weighted Average Grant Date Fair Value</col_4></row_0>
-<row_1><col_0><body></col_0><col_1><col_header>RS U s</col_1><col_2><col_header>PSUs</col_2><col_3><col_header>RSUs</col_3><col_4><col_header>PSUs</col_4></row_1>
-<row_2><col_0><row_header>Nonvested on Janua ry 1</col_0><col_1><body>1. 1</col_1><col_2><body>0.3</col_2><col_3><body>90.10 $</col_3><col_4><body>$ 91.19</col_4></row_2>
-<row_3><col_0><row_header>Granted</col_0><col_1><body>0. 5</col_1><col_2><body>0.1</col_2><col_3><body>117.44</col_3><col_4><body>122.41</col_4></row_3>
-<row_4><col_0><row_header>Vested</col_0><col_1><body>(0. 5 )</col_1><col_2><body>(0.1)</col_2><col_3><body>87.08</col_3><col_4><body>81.14</col_4></row_4>
-<row_5><col_0><row_header>Canceled or forfeited</col_0><col_1><body>(0. 1 )</col_1><col_2><body>-</col_2><col_3><body>102.01</col_3><col_4><body>92.18</col_4></row_5>
-<row_6><col_0><row_header>Nonvested on December 31</col_0><col_1><body>1.0</col_1><col_2><body>0.3</col_2><col_3><body>104.85 $</col_3><col_4><body>$ 104.51</col_4></row_6>
-</table>
+</figure>
 <figure>
 <location><page_8><loc_8><loc_44><loc_35><loc_52></location>
 <caption>Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.</caption>
@ -316,7 +294,7 @@
 <text><location><page_13><loc_8><loc_83><loc_47><loc_86></location>Aditional images with examples of TableFormer predictions and post-processing can be found below.</text>
 <paragraph><location><page_13><loc_10><loc_35><loc_45><loc_37></location>Figure 8: Example of a table with multi-line header.</paragraph>
 <figure>
-<location><page_13><loc_51><loc_63><loc_70><loc_68></location>
+<location><page_13><loc_51><loc_63><loc_91><loc_87></location>
 <caption>Figure 9: Example of a table with big empty distance between cells.</caption>
 </figure>
 <figure>
@ -335,7 +313,10 @@
 <location><page_14><loc_52><loc_55><loc_87><loc_89></location>
 <caption>Figure 13: Table predictions example on colorful table.</caption>
 </figure>
-<paragraph><location><page_14><loc_56><loc_13><loc_83><loc_14></location>Figure 14: Example with multi-line text.</paragraph>
+<figure>
+<location><page_14><loc_52><loc_25><loc_85><loc_31></location>
+<caption>Figure 14: Example with multi-line text.</caption>
+</figure>
 <figure>
 <location><page_15><loc_9><loc_69><loc_46><loc_83></location>
 </figure>
@ -350,6 +331,9 @@
 <caption>Figure 15: Example with triangular table.</caption>
 </figure>
 <figure>
+<location><page_15><loc_53><loc_72><loc_86><loc_85></location>
+</figure>
+<figure>
 <location><page_15><loc_53><loc_41><loc_86><loc_54></location>
 </figure>
 <figure>
--- a/tests/data/groundtruth/docling_v2/2203.01017v2.json
+++ b/tests/data/groundtruth/docling_v2/2203.01017v2.json
--- a/tests/data/groundtruth/docling_v2/2203.01017v2.md
+++ b/tests/data/groundtruth/docling_v2/2203.01017v2.md
@ -223,38 +223,15 @@ Table 4: Results of structure with content retrieved using cell detection on Pub
 - a.
 - Red - PDF cells, Green - predicted bounding boxes, Blue - post-processed predictions matched to PDF cells

-Japanese language (previously unseen by TableFormer):
-
-Example table from FinTabNet:
+Japanese language (previously unseen by TableFormer): Example table from FinTabNet:b. Structure predicted by TableFormer, with superimposed matched PDF cell text:

 <!-- image -->

-b. Structure predicted by TableFormer, with superimposed matched PDF cell text:
-
 <!-- image -->

-|                                                    |             | 論文ファイル   | 論文ファイル   | 参考文献   | 参考文献   |
-|----------------------------------------------------|-------------|----------------|----------------|------------|------------|
-| 出典                                               | ファイル 数 | 英語           | 日本語         | 英語       | 日本語     |
-| Association for Computational Linguistics(ACL2003) | 65          | 65             | 0              | 150        | 0          |
-| Computational Linguistics(COLING2002)              | 140         | 140            | 0              | 150        | 0          |
-| 電気情報通信学会 2003 年総合大会                   | 150         | 8              | 142            | 223        | 147        |
-| 情報処理学会第 65 回全国大会 (2003)                | 177         | 1              | 176            | 150        | 236        |
-| 第 17 回人工知能学会全国大会 (2003)                | 208         | 5              | 203            | 152        | 244        |
-| 自然言語処理研究会第 146 〜 155 回                 | 98          | 2              | 96             | 150        | 232        |
-| WWW から収集した論文                               | 107         | 73             | 34             | 147        | 96         |
-|                                                    | 945         | 294            | 651            | 1122       | 955        |
-
 Text is aligned to match original for ease of viewing

-|                          | Shares (in millions)   | Shares (in millions)   | Weighted Average Grant Date Fair Value   | Weighted Average Grant Date Fair Value   |
-|--------------------------|------------------------|------------------------|------------------------------------------|------------------------------------------|
-|                          | RS U s                 | PSUs                   | RSUs                                     | PSUs                                     |
-| Nonvested on Janua ry 1  | 1. 1                   | 0.3                    | 90.10 $                                  | $ 91.19                                  |
-| Granted                  | 0. 5                   | 0.1                    | 117.44                                   | 122.41                                   |
-| Vested                   | (0. 5 )                | (0.1)                  | 87.08                                    | 81.14                                    |
-| Canceled or forfeited    | (0. 1 )                | -                      | 102.01                                   | 92.18                                    |
-| Nonvested on December 31 | 1.0                    | 0.3                    | 104.85 $                                 | $ 104.51                                 |
+<!-- image -->

 Figure 5: One of the benefits of TableFormer is that it is language agnostic, as an example, the left part of the illustration demonstrates TableFormer predictions on previously unseen language (Japanese). Additionally, we see that TableFormer is robust to variability in style and content, right side of the illustration shows the example of the TableFormer prediction from the FinTabNet dataset.

@ -426,12 +403,16 @@ Figure 14: Example with multi-line text.

 <!-- image -->

+<!-- image -->
+
 Figure 15: Example with triangular table.

 <!-- image -->

 <!-- image -->

+<!-- image -->
+
 Figure 16: Example of how post-processing helps to restore mis-aligned bounding boxes prediction artifact.

 <!-- image -->
--- a/tests/data/groundtruth/docling_v2/2203.01017v2.pages.json
+++ b/tests/data/groundtruth/docling_v2/2203.01017v2.pages.json
--- a/tests/data/groundtruth/docling_v2/2206.01062.doctags.txt
+++ b/tests/data/groundtruth/docling_v2/2206.01062.doctags.txt
@ -3,17 +3,16 @@
 <text><location><page_1><loc_15><loc_77><loc_32><loc_83></location>Birgit Pfitzmann IBM Research Rueschlikon, Switzerland bpf@zurich.ibm.com</text>
 <text><location><page_1><loc_42><loc_77><loc_58><loc_83></location>Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com</text>
 <text><location><page_1><loc_69><loc_77><loc_85><loc_83></location>Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com</text>
-<text><location><page_1><loc_28><loc_70><loc_45><loc_76></location>Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com</text>
-<text><location><page_1><loc_55><loc_70><loc_72><loc_76></location>Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com</text>
+<text><location><page_1><loc_28><loc_71><loc_45><loc_76></location>Ahmed S. Nassar IBM Research Rueschlikon, Switzerland</text>
+<text><location><page_1><loc_29><loc_70><loc_44><loc_71></location>ahn@zurich.ibm.com</text>
 <section_header_level_1><location><page_1><loc_9><loc_67><loc_18><loc_69></location>ABSTRACT</section_header_level_1>
 <text><location><page_1><loc_9><loc_33><loc_48><loc_67></location>Accurate document layout analysis is a key requirement for highquality PDF document conversion. With the recent availability of public, large ground-truth datasets such as PubLayNet and DocBank, deep-learning models have proven to be very effective at layout detection and segmentation. While these datasets are of adequate size to train such models, they severely lack in layout variability since they are sourced from scientific article repositories such as PubMed and arXiv only. Consequently, the accuracy of the layout segmentation drops significantly when these models are applied on more challenging and diverse layouts. In this paper, we present DocLayNet , a new, publicly available, document-layout annotation dataset in COCO format. It contains 80863 manually annotated pages from diverse data sources to represent a wide variability in layouts. For each PDF page, the layout annotations provide labelled bounding-boxes with a choice of 11 distinct classes. DocLayNet also provides a subset of double- and triple-annotated pages to determine the inter-annotator agreement. In multiple experiments, we provide baseline accuracy scores (in mAP) for a set of popular object detection models. We also demonstrate that these models fall approximately 10% behind the inter-annotator agreement. Furthermore, we provide evidence that DocLayNet is of sufficient size. Lastly, we compare models trained on PubLayNet, DocBank and DocLayNet, showing that layout predictions of the DocLayNettrained models are more robust and thus the preferred choice for general-purpose document-layout analysis.</text>
 <section_header_level_1><location><page_1><loc_9><loc_29><loc_22><loc_30></location>CCS CONCEPTS</section_header_level_1>
 <text><location><page_1><loc_9><loc_25><loc_49><loc_29></location>· Information systems → Document structure ; · Applied computing → Document analysis ; · Computing methodologies → Machine learning ; Computer vision ; Object detection ;</text>
 <text><location><page_1><loc_9><loc_15><loc_48><loc_20></location>Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).</text>
-<text><location><page_1><loc_9><loc_14><loc_32><loc_15></location>KDD '22, August 14-18, 2022, Washington, DC, USA</text>
-<text><location><page_1><loc_9><loc_13><loc_31><loc_14></location>© 2022 Copyright held by the owner/author(s).</text>
-<text><location><page_1><loc_9><loc_12><loc_26><loc_13></location>ACM ISBN 978-1-4503-9385-0/22/08.</text>
+<text><location><page_1><loc_9><loc_12><loc_32><loc_15></location>KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08.</text>
 <text><location><page_1><loc_9><loc_11><loc_27><loc_12></location>https://doi.org/10.1145/3534678.3539043</text>
+<text><location><page_1><loc_55><loc_70><loc_72><loc_76></location>Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com</text>
 <figure>
 <location><page_1><loc_53><loc_34><loc_90><loc_68></location>
 <caption>Figure 1: Four examples of complex page layouts across different document categories</caption>
--- a/tests/data/groundtruth/docling_v2/2206.01062.json
+++ b/tests/data/groundtruth/docling_v2/2206.01062.json
--- a/tests/data/groundtruth/docling_v2/2206.01062.md
+++ b/tests/data/groundtruth/docling_v2/2206.01062.md
@ -6,9 +6,9 @@ Christoph Auer IBM Research Rueschlikon, Switzerland cau@zurich.ibm.com

 Michele Dolfi IBM Research Rueschlikon, Switzerland dol@zurich.ibm.com

-Ahmed S. Nassar IBM Research Rueschlikon, Switzerland ahn@zurich.ibm.com
+Ahmed S. Nassar IBM Research Rueschlikon, Switzerland

-Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com
+ahn@zurich.ibm.com

 ## ABSTRACT

@ -20,14 +20,12 @@ Accurate document layout analysis is a key requirement for highquality PDF docum

 Permission to make digital or hard copies of part or all of this work for personal or classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. Copyrights for third-party components of this work must be honored. For all other uses, contact the owner/author(s).

-KDD '22, August 14-18, 2022, Washington, DC, USA
-
-© 2022 Copyright held by the owner/author(s).
-
-ACM ISBN 978-1-4503-9385-0/22/08.
+KDD '22, August 14-18, 2022, Washington, DC, USA © 2022 Copyright held by the owner/author(s). ACM ISBN 978-1-4503-9385-0/22/08.

 https://doi.org/10.1145/3534678.3539043

+Peter Staar IBM Research Rueschlikon, Switzerland taa@zurich.ibm.com
+
 Figure 1: Four examples of complex page layouts across different document categories

 <!-- image -->
--- a/tests/data/groundtruth/docling_v2/2206.01062.pages.json
+++ b/tests/data/groundtruth/docling_v2/2206.01062.pages.json
--- a/tests/data/groundtruth/docling_v2/2305.03393v1.doctags.txt
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1.doctags.txt
@ -1,6 +1,6 @@
 <document>
 <section_header_level_1><location><page_1><loc_22><loc_82><loc_79><loc_85></location>Optimized Table Tokenization for Table Structure Recognition</section_header_level_1>
-<text><location><page_1><loc_23><loc_75><loc_78><loc_79></location>Maksym Lysak [0000 - 0002 - 3723 - $^{6960]}$, Ahmed Nassar[0000 - 0002 - 9468 - $^{0822]}$, Nikolaos Livathinos [0000 - 0001 - 8513 - $^{3491]}$, Christoph Auer[0000 - 0001 - 5761 - $^{0422]}$, [0000 - 0002 - 8088 - 0823]</text>
+<text><location><page_1><loc_23><loc_75><loc_78><loc_79></location>Maksym Lysak [0000 − 0002 − 3723 − $^{6960]}$, Ahmed Nassar[0000 − 0002 − 9468 − $^{0822]}$, Nikolaos Livathinos [0000 − 0001 − 8513 − $^{3491]}$, Christoph Auer[0000 − 0001 − 5761 − $^{0422]}$, [0000 − 0002 − 8088 − 0823]</text>
 <text><location><page_1><loc_38><loc_74><loc_49><loc_75></location>and Peter Staar</text>
 <text><location><page_1><loc_46><loc_72><loc_55><loc_73></location>IBM Research</text>
 <text><location><page_1><loc_36><loc_70><loc_64><loc_71></location>{mly,ahn,nli,cau,taa}@zurich.ibm.com</text>
--- a/tests/data/groundtruth/docling_v2/2305.03393v1.json
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1.json
--- a/tests/data/groundtruth/docling_v2/2305.03393v1.md
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1.md
@ -1,6 +1,6 @@
 ## Optimized Table Tokenization for Table Structure Recognition

-Maksym Lysak [0000 - 0002 - 3723 - $^{6960]}$, Ahmed Nassar[0000 - 0002 - 9468 - $^{0822]}$, Nikolaos Livathinos [0000 - 0001 - 8513 - $^{3491]}$, Christoph Auer[0000 - 0001 - 5761 - $^{0422]}$, [0000 - 0002 - 8088 - 0823]
+Maksym Lysak [0000 − 0002 − 3723 − $^{6960]}$, Ahmed Nassar[0000 − 0002 − 9468 − $^{0822]}$, Nikolaos Livathinos [0000 − 0001 − 8513 − $^{3491]}$, Christoph Auer[0000 − 0001 − 5761 − $^{0422]}$, [0000 − 0002 − 8088 − 0823]

 and Peter Staar

--- a/tests/data/groundtruth/docling_v2/2305.03393v1.pages.json
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1.pages.json
--- a/tests/data/groundtruth/docling_v2/redp5110_sampled.doctags.txt
+++ b/tests/data/groundtruth/docling_v2/redp5110_sampled.doctags.txt
@ -5,10 +5,7 @@
 </figure>
 <section_header_level_1><location><page_1><loc_6><loc_79><loc_96><loc_89></location>Row and Column Access Control Support in IBM DB2 for i</section_header_level_1>
 <figure>
-<location><page_1><loc_5><loc_11><loc_96><loc_63></location>
-</figure>
-<figure>
-<location><page_1><loc_52><loc_2><loc_95><loc_10></location>
+<location><page_1><loc_3><loc_1><loc_96><loc_64></location>
 </figure>
 <section_header_level_1><location><page_2><loc_11><loc_88><loc_28><loc_91></location>Contents</section_header_level_1>
 <table>
@ -109,7 +106,9 @@
 <location><page_5><loc_5><loc_70><loc_39><loc_91></location>
 </figure>
 <text><location><page_5><loc_13><loc_65><loc_19><loc_66></location>Chapter 1.</text>
-<text><location><page_5><loc_82><loc_84><loc_85><loc_88></location>1</text>
+<figure>
+<location><page_5><loc_78><loc_82><loc_89><loc_91></location>
+</figure>
 <section_header_level_1><location><page_5><loc_22><loc_61><loc_89><loc_68></location>Securing and protecting IBM DB2 data</section_header_level_1>
 <text><location><page_5><loc_22><loc_46><loc_89><loc_56></location>Recent news headlines are filled with reports of data breaches and cyber-attacks impacting global businesses of all sizes. The Identity Theft Resource Center$^{1}$ reports that almost 5000 data breaches have occurred since 2005, exposing over 600 million records of data. The financial cost of these data breaches is skyrocketing. Studies from the Ponemon Institute$^{2}$ revealed that the average cost of a data breach increased in 2013 by 15% globally and resulted in a brand equity loss of $9.4 million per attack. The average cost that is incurred for each lost record containing sensitive information increased more than 9% to $145 per record.</text>
 <text><location><page_5><loc_22><loc_38><loc_86><loc_44></location>Businesses must make a serious effort to secure their data and recognize that securing information assets is a cost of doing business. In many parts of the world and in many industries, securing the data is required by law and subject to audits. Data security is no longer an option; it is a requirement.</text>
@ -165,17 +164,7 @@
 </table>
 <text><location><page_8><loc_22><loc_40><loc_89><loc_43></location>To discover who has authorization to define and manage RCAC, you can use the query that is shown in Example 2-1.</text>
 <paragraph><location><page_8><loc_22><loc_38><loc_76><loc_39></location>Example 2-1 Query to determine who has authority to define and manage RCAC</paragraph>
-<text><location><page_8><loc_22><loc_35><loc_28><loc_36></location>SELECT</text>
-<text><location><page_8><loc_30><loc_35><loc_41><loc_36></location>function_id,</text>
-<text><location><page_8><loc_27><loc_34><loc_39><loc_35></location>user_name,</text>
-<text><location><page_8><loc_28><loc_32><loc_36><loc_33></location>usage,</text>
-<text><location><page_8><loc_27><loc_31><loc_39><loc_32></location>user_type</text>
-<text><location><page_8><loc_22><loc_29><loc_26><loc_30></location>FROM</text>
-<text><location><page_8><loc_29><loc_29><loc_43><loc_30></location>function_usage</text>
-<text><location><page_8><loc_22><loc_28><loc_27><loc_29></location>WHERE</text>
-<text><location><page_8><loc_29><loc_28><loc_54><loc_29></location>function_id=’QIBM_DB_SECADM’</text>
-<text><location><page_8><loc_22><loc_26><loc_29><loc_27></location>ORDER BY</text>
-<text><location><page_8><loc_31><loc_26><loc_39><loc_27></location>user_name;</text>
+<table><location><page_8><loc_22><loc_26><loc_89><loc_37></location>SELECT function_id, user_name, usage, user_type FROM function_usage WHERE function_id=’QIBM_DB_SECADM’ ORDER BY user_name;</table>
 <section_header_level_1><location><page_8><loc_11><loc_20><loc_41><loc_22></location>2.2 Separation of duties</section_header_level_1>
 <text><location><page_8><loc_22><loc_10><loc_89><loc_18></location>Separation of duties helps businesses comply with industry regulations or organizational requirements and simplifies the management of authorities. Separation of duties is commonly used to prevent fraudulent activities or errors by a single person. It provides the ability for administrative functions to be divided across individuals without overlapping responsibilities, so that one user does not possess unlimited authority, such as with the *ALLOBJ authority.</text>
 <text><location><page_9><loc_22><loc_82><loc_89><loc_91></location>For example, assume that a business has assigned the duty to manage security on IBM i to Theresa. Before release IBM i 7.2, to grant privileges, Theresa had to have the same privileges Theresa was granting to others. Therefore, to grant *USE privileges to the PAYROLL table, Theresa had to have *OBJMGT and *USE authority (or a higher level of authority, such as *ALLOBJ). This requirement allowed Theresa to access the data in the PAYROLL table even though Theresa's job description was only to manage its security.</text>
@ -255,7 +244,7 @@
 <list_item><location><page_12><loc_22><loc_32><loc_65><loc_33></location>2. The user profile JANE specifies a group profile of MGR.</list_item>
 <list_item><location><page_12><loc_22><loc_28><loc_88><loc_31></location>3. If a user is connected to the server using user profile JANE, all of the following function invocations return a value of 1:</list_item>
 </unordered_list>
-<code><location><page_12><loc_25><loc_19><loc_74><loc_27></location>VERIFY_GROUP_FOR_USER (CURRENT_USER, 'MGR') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR') The following function invocation returns a value of 0: VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JUDY', 'TONY') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR', 'STEVE')</code>
+<code><location><page_12><loc_25><loc_19><loc_74><loc_27></location>VERIFY_GROUP_FOR_USER (CURRENT_USER, 'MGR') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR', 'STEVE') The following function invocation returns a value of 0: VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JUDY', 'TONY')</code>
 <text><location><page_13><loc_22><loc_90><loc_27><loc_91></location>RETURN</text>
 <text><location><page_13><loc_22><loc_88><loc_26><loc_89></location>CASE</text>
 <code><location><page_13><loc_22><loc_67><loc_85><loc_88></location>WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'HR', 'EMP' ) = 1 THEN EMPLOYEES . DATE_OF_BIRTH WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER = EMPLOYEES . USER_ID THEN EMPLOYEES . DATE_OF_BIRTH WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER <> EMPLOYEES . USER_ID THEN ( 9999 || '-' || MONTH ( EMPLOYEES . DATE_OF_BIRTH ) || '-' || DAY (EMPLOYEES.DATE_OF_BIRTH )) ELSE NULL END ENABLE ;</code>
@ -283,14 +272,7 @@
 </unordered_list>
 <section_header_level_1><location><page_14><loc_22><loc_62><loc_61><loc_63></location>Example 3-10 Activating RCAC on the EMPLOYEES table</section_header_level_1>
 <unordered_list>
-<list_item><location><page_14><loc_22><loc_60><loc_62><loc_61></location>/* Active Row Access Control (permissions) */</list_item>
-<list_item><location><page_14><loc_22><loc_58><loc_58><loc_60></location>/* Active Column Access Control (masks)</list_item>
-</unordered_list>
-<text><location><page_14><loc_60><loc_58><loc_62><loc_60></location>*/</text>
-<text><location><page_14><loc_22><loc_57><loc_48><loc_58></location>ALTER TABLE HR_SCHEMA.EMPLOYEES</text>
-<text><location><page_14><loc_22><loc_55><loc_44><loc_56></location>ACTIVATE ROW ACCESS CONTROL</text>
-<text><location><page_14><loc_22><loc_54><loc_48><loc_55></location>ACTIVATE COLUMN ACCESS CONTROL;</text>
-<unordered_list>
+<list_item><location><page_14><loc_22><loc_54><loc_62><loc_61></location>/* Active Row Access Control (permissions) */ /* Active Column Access Control (masks) */ ALTER TABLE HR_SCHEMA.EMPLOYEES ACTIVATE ROW ACCESS CONTROL ACTIVATE COLUMN ACCESS CONTROL;</list_item>
 <list_item><location><page_14><loc_22><loc_48><loc_88><loc_52></location>2. Look at the definition of the EMPLOYEE table, as shown in Figure 3-11. To do this, from the main navigation pane of System i Navigator, click Schemas  HR_SCHEMA  Tables , right-click the EMPLOYEES table, and click Definition .</list_item>
 </unordered_list>
 <figure>
--- a/tests/data/groundtruth/docling_v2/redp5110_sampled.json
+++ b/tests/data/groundtruth/docling_v2/redp5110_sampled.json
--- a/tests/data/groundtruth/docling_v2/redp5110_sampled.md
+++ b/tests/data/groundtruth/docling_v2/redp5110_sampled.md
@ -6,8 +6,6 @@ Front cover

 <!-- image -->

-<!-- image -->
-
 ## Contents

 | Notices                                                                                                                                        | . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . vii   |
@ -120,7 +118,7 @@ Hernando Bedoya is a Senior IT Specialist at STG Lab Services and Training in Ro

 Chapter 1.

-1
+<!-- image -->

 ## Securing and protecting IBM DB2 data

@ -198,27 +196,7 @@ To discover who has authorization to define and manage RCAC, you can use the que

 Example 2-1 Query to determine who has authority to define and manage RCAC

-SELECT
-
-function\_id,
-
-user\_name,
-
-usage,
-
-user\_type
-
-FROM
-
-function\_usage
-
-WHERE
-
-function\_id=’QIBM\_DB\_SECADM’
-
-ORDER BY
-
-user\_name;
+SELECT function\_id, user\_name, usage, user\_type FROM function\_usage WHERE function\_id=’QIBM\_DB\_SECADM’ ORDER BY user\_name;

 ## 2.2 Separation of duties

@ -318,7 +296,7 @@ Here is an example of using the VERIFY\_GROUP\_FOR\_USER function:
 - 3. If a user is connected to the server using user profile JANE, all of the following function invocations return a value of 1:

 ```
-VERIFY\_GROUP\_FOR\_USER (CURRENT\_USER, 'MGR') VERIFY\_GROUP\_FOR\_USER (CURRENT\_USER, 'JANE', 'MGR') The following function invocation returns a value of 0: VERIFY\_GROUP\_FOR\_USER (CURRENT\_USER, 'JUDY', 'TONY') VERIFY\_GROUP\_FOR\_USER (CURRENT\_USER, 'JANE', 'MGR', 'STEVE')
+VERIFY\_GROUP\_FOR\_USER (CURRENT\_USER, 'MGR') VERIFY\_GROUP\_FOR\_USER (CURRENT\_USER, 'JANE', 'MGR') VERIFY\_GROUP\_FOR\_USER (CURRENT\_USER, 'JANE', 'MGR', 'STEVE') The following function invocation returns a value of 0: VERIFY\_GROUP\_FOR\_USER (CURRENT\_USER, 'JUDY', 'TONY')
 ```

 RETURN
@ -356,17 +334,7 @@ Now that you have created the row permission and the two column masks, RCAC must

 ## Example 3-10 Activating RCAC on the EMPLOYEES table

- /* Active Row Access Control (permissions) */
- /* Active Column Access Control (masks)
-
-*/
-
-ALTER TABLE HR\_SCHEMA.EMPLOYEES
-
-ACTIVATE ROW ACCESS CONTROL
-
-ACTIVATE COLUMN ACCESS CONTROL;
-
+- /* Active Row Access Control (permissions) */ /* Active Column Access Control (masks) */ ALTER TABLE HR\_SCHEMA.EMPLOYEES ACTIVATE ROW ACCESS CONTROL ACTIVATE COLUMN ACCESS CONTROL;
 - 2. Look at the definition of the EMPLOYEE table, as shown in Figure 3-11. To do this, from the main navigation pane of System i Navigator, click Schemas  HR\_SCHEMA  Tables , right-click the EMPLOYEES table, and click Definition .

 Figure 3-11 Selecting the EMPLOYEES table from System i Navigator
--- a/tests/data/groundtruth/docling_v2/redp5110_sampled.pages.json
+++ b/tests/data/groundtruth/docling_v2/redp5110_sampled.pages.json
--- a/tests/test_e2e_conversion.py
+++ b/tests/test_e2e_conversion.py
@ -8,8 +8,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption

 from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2

-GENERATE_V1 = False
-GENERATE_V2 = False
+GENERATE_V1 = True
+GENERATE_V2 = True


 def get_pdf_paths():
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@ -18,8 +18,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption

 from .verify_utils import verify_conversion_result_v1, verify_conversion_result_v2

-GENERATE_V1 = False
-GENERATE_V2 = False
+GENERATE_V1 = True
+GENERATE_V2 = True


 def get_pdf_paths():