From 4eceefa47c197dc142c54e2a860ecdf8de068212 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Tue, 8 Jul 2025 07:38:48 +0200 Subject: [PATCH 01/13] feat: add TwoStageVlmModel Signed-off-by: Peter Staar --- docling/datamodel/pipeline_options.py | 1 + .../datamodel/pipeline_options_vlm_model.py | 7 ++ docling/models/layout_model.py | 65 +++++++++- .../vlm_models_inline/TwoStageVlmModel.py | 115 ++++++++++++++++++ docling/pipeline/vlm_pipeline.py | 38 ++++++ 5 files changed, 223 insertions(+), 3 deletions(-) create mode 100644 docling/models/vlm_models_inline/TwoStageVlmModel.py diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index fcf091ef..c267a639 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -269,6 +269,7 @@ class VlmPipelineOptions(PaginatedPipelineOptions): class LayoutOptions(BaseModel): """Options for layout processing.""" + repo_id: str = "ds4sd/docling-layout-heron" create_orphan_clusters: bool = True # Whether to create clusters for orphaned cells diff --git a/docling/datamodel/pipeline_options_vlm_model.py b/docling/datamodel/pipeline_options_vlm_model.py index bcea2493..a2b31d9d 100644 --- a/docling/datamodel/pipeline_options_vlm_model.py +++ b/docling/datamodel/pipeline_options_vlm_model.py @@ -87,3 +87,10 @@ class ApiVlmOptions(BaseVlmOptions): timeout: float = 60 concurrency: int = 1 response_format: ResponseFormat + + +class TwoStageVlmOptions(BaseVlmOptions): + kind: Literal["inline_model_options"] = "inline_two_stage_model_options" + + vlm_options: UnionInlineVlmOptions + layout_options: LayoutOptions diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index 44e7286d..9b90f4bd 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -156,6 +156,7 @@ class LayoutModel(BasePageModel): page_image = page.get_image(scale=1.0) assert page_image is not None + """ clusters = [] for ix, pred_item in enumerate( self.layout_predictor.predict(page_image) @@ -174,14 +175,16 @@ class LayoutModel(BasePageModel): cells=[], ) clusters.append(cluster) - + """ + clusters = self.predict_on_page(page_image) + if settings.debug.visualize_raw_layout: self.draw_clusters_and_cells_side_by_side( conv_res, page, clusters, mode_prefix="raw" ) # Apply postprocessing - + """ processed_clusters, processed_cells = LayoutPostprocessor( page, clusters, self.options ).postprocess() @@ -208,10 +211,66 @@ class LayoutModel(BasePageModel): page.predictions.layout = LayoutPrediction( clusters=processed_clusters ) - + """ + page, processed_clusters, processed_cells = self.postprocess_on_page(page, cluster) + if settings.debug.visualize_layout: self.draw_clusters_and_cells_side_by_side( conv_res, page, processed_clusters, mode_prefix="postprocessed" ) yield page + + def predict_on_page(self, page_image: Image) -> list[Cluster]: + + pred_items = self.layout_predictor.predict(page_image) + + clusters = [] + for ix, pred_item in enumerate(pred_items): + label = DocItemLabel( + pred_item["label"] + .lower() + .replace(" ", "_") + .replace("-", "_") + ) # Temporary, until docling-ibm-model uses docling-core types + cluster = Cluster( + id=ix, + label=label, + confidence=pred_item["confidence"], + bbox=BoundingBox.model_validate(pred_item), + cells=[], + ) + clusters.append(cluster) + + return clusters + + def postprocess_on_page(self, page: Page, cluster: list(Cluster)): + + processed_clusters, processed_cells = LayoutPostprocessor( + page, clusters, self.options + 
).postprocess() + # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally + + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Mean of empty slice|invalid value encountered in scalar divide", + RuntimeWarning, + "numpy", + ) + + conv_res.confidence.pages[page.page_no].layout_score = float( + np.mean([c.confidence for c in processed_clusters]) + ) + + conv_res.confidence.pages[page.page_no].ocr_score = float( + np.mean( + [c.confidence for c in processed_cells if c.from_ocr] + ) + ) + + page.predictions.layout = LayoutPrediction( + clusters=processed_clusters + ) + + return page, processed_clusters, processed_cells diff --git a/docling/models/vlm_models_inline/TwoStageVlmModel.py b/docling/models/vlm_models_inline/TwoStageVlmModel.py new file mode 100644 index 00000000..f540ea4b --- /dev/null +++ b/docling/models/vlm_models_inline/TwoStageVlmModel.py @@ -0,0 +1,115 @@ +import importlib.metadata +import logging +import time +from collections.abc import Iterable +from pathlib import Path +from typing import Any, Optional + +from docling.datamodel.accelerator_options import ( + AcceleratorOptions, +) +from docling.datamodel.base_models import Page, VlmPrediction +from docling.datamodel.document import ConversionResult +from docling.datamodel.pipeline_options_vlm_model import ( + InlineVlmOptions, + TransformersModelType, + TransformersPromptStyle, +) +from docling.models.base_model import BasePageModel +from docling.models.utils.hf_model_download import ( + HuggingFaceModelDownloadMixin, +) +from docling.utils.accelerator_utils import decide_device +from docling.utils.profiling import TimeRecorder + +_log = logging.getLogger(__name__) + + +class TwoStageVlmModel(BasePageModel, HuggingFaceModelDownloadMixin): + def __init__( + self, + *, + layout_model: LayoutModelModel, + vlm_model: BasePageModel, + ): + self.layout_model = layout_options + self.vlm_model = vlm_options + + def __call__( + self, conv_res: ConversionResult, page_batch: Iterable[Page] + ) -> Iterable[Page]: + for page in page_batch: + assert page._backend is not None + if not page._backend.is_valid(): + yield page + else: + with TimeRecorder(conv_res, "two-staged-vlm"): + assert page.size is not None + + page_image = page.get_image( + scale=self.vlm_options.scale, max_size=self.vlm_options.max_size + ) + + pred_clusters = self.layout_model.predict_on_page(page_image) + page, processed_clusters, processed_cells = self.layout_model.postprocess_on_page(page=page, + page_image=page_image) + + # Define prompt structure + if callable(self.vlm_options.prompt): + user_prompt = self.vlm_options.prompt(page.parsed_page) + else: + user_prompt = self.vlm_options.prompt + + prompt = self.formulate_prompt(user_prompt, processed_clusters) + + generated_text, generation_time = self.vlm_model.predict_on_image(page_image=page_image, + prompt=prompt) + + page.predictions.vlm_response = VlmPrediction( + text=generated_text, + generation_time=generation_time, + ) + + yield page + + def formulate_prompt(self, user_prompt: str, clusters:list[Cluster]) -> str: + """Formulate a prompt for the VLM.""" + + if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.RAW: + return user_prompt + + elif self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct": + _log.debug("Using specialized prompt for Phi-4") + # more info here: https://huggingface.co/microsoft/Phi-4-multimodal-instruct#loading-the-model-locally + + user_prompt = "<|user|>" + assistant_prompt = "<|assistant|>" + 
prompt_suffix = "<|end|>" + + prompt = f"{user_prompt}<|image_1|>{user_prompt}{prompt_suffix}{assistant_prompt}" + _log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}") + + return prompt + + elif self.vlm_options.transformers_prompt_style == TransformersPromptStyle.CHAT: + messages = [ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "This is a page from a document.", + }, + {"type": "image"}, + {"type": "text", "text": user_prompt}, + ], + } + ] + prompt = self.processor.apply_chat_template( + messages, add_generation_prompt=False + ) + return prompt + + raise RuntimeError( + f"Uknown prompt style `{self.vlm_options.transformers_prompt_style}`. Valid values are {', '.join(s.value for s in TransformersPromptStyle)}." + ) diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index ab474fab..4c38e02a 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -44,6 +44,8 @@ from docling.models.vlm_models_inline.mlx_model import HuggingFaceMlxModel from docling.pipeline.base_pipeline import PaginatedPipeline from docling.utils.profiling import ProfilingScope, TimeRecorder +from docling.models.layout_model import LayoutModel + _log = logging.getLogger(__name__) @@ -107,7 +109,43 @@ class VlmPipeline(PaginatedPipeline): raise ValueError( f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}" ) + elif isinstance(self.pipeline_options.vlm_options, TwoStageVlmOptions): + twostagevlm_options = cast(TwoStageVlmOptions, self.pipeline_options.vlm_options) + layout_options = twostagevlm_options.lay_options + vlm_options = twostagevlm_options.vlm_options + + layout_model = LayoutModel( + artifacts_path=artifacts_path, + accelerator_options=pipeline_options.accelerator_options, + options=layout_options, + ) + + if vlm_options.inference_framework == InferenceFramework.MLX: + vlm_model = HuggingFaceMlxModel( + enabled=True, # must be always enabled for this pipeline to make sense. + artifacts_path=artifacts_path, + accelerator_options=pipeline_options.accelerator_options, + vlm_options=vlm_options, + ) + self.build_pipe = [ + TwoStageVlmModel(layout_model=layout_model, vlm_model=vlm_model) + ] + elif vlm_options.inference_framework == InferenceFramework.TRANSFORMERS: + vlm_model = HuggingFaceTransformersVlmModel( + enabled=True, # must be always enabled for this pipeline to make sense. 
+ artifacts_path=artifacts_path, + accelerator_options=pipeline_options.accelerator_options, + vlm_options=vlm_options, + ) + self.build_pipe = [ + TwoStageVlmModel(layout_model=layout_model, vlm_model=vlm_model) + ] + else: + raise ValueError( + f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}" + ) + self.enrichment_pipe = [ # Other models working on `NodeItem` elements in the DoclingDocument ] From 810446c8dcb315762ab857d03fbf1601a62df1ea Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Tue, 8 Jul 2025 09:49:39 +0200 Subject: [PATCH 02/13] feat: working on a two stage VLM model Signed-off-by: Peter Staar --- docling/datamodel/pipeline_options.py | 1 + .../datamodel/pipeline_options_vlm_model.py | 5 +- docling/models/base_model.py | 14 ++++ docling/models/layout_model.py | 68 +++++++++---------- .../hf_transformers_model.py | 7 +- docling/models/vlm_models_inline/mlx_model.py | 8 ++- ...tageVlmModel.py => two_stage_vlm_model.py} | 33 +++++---- docling/pipeline/vlm_pipeline.py | 18 ++--- 8 files changed, 90 insertions(+), 64 deletions(-) rename docling/models/vlm_models_inline/{TwoStageVlmModel.py => two_stage_vlm_model.py} (82%) diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index c267a639..33940c45 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -16,6 +16,7 @@ from docling.datamodel import asr_model_specs # Import the following for backwards compatibility from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions +from docling.datamodel.asr_model_specs import WHISPER_TINY as whisper_tiny from docling.datamodel.pipeline_options_asr_model import ( InlineAsrOptions, ) diff --git a/docling/datamodel/pipeline_options_vlm_model.py b/docling/datamodel/pipeline_options_vlm_model.py index a2b31d9d..a38f0414 100644 --- a/docling/datamodel/pipeline_options_vlm_model.py +++ b/docling/datamodel/pipeline_options_vlm_model.py @@ -6,6 +6,7 @@ from pydantic import AnyUrl, BaseModel from typing_extensions import deprecated from docling.datamodel.accelerator_options import AcceleratorDevice +from docling.datamodel.pipeline_options import LayoutOptions class BaseVlmOptions(BaseModel): @@ -90,7 +91,7 @@ class ApiVlmOptions(BaseVlmOptions): class TwoStageVlmOptions(BaseVlmOptions): - kind: Literal["inline_model_options"] = "inline_two_stage_model_options" + kind: Literal["inline_two_stage_model_options"] = "inline_two_stage_model_options" - vlm_options: UnionInlineVlmOptions + vlm_options: InlineVlmOptions layout_options: LayoutOptions diff --git a/docling/models/base_model.py b/docling/models/base_model.py index b0a43f40..dd019216 100644 --- a/docling/models/base_model.py +++ b/docling/models/base_model.py @@ -3,6 +3,7 @@ from collections.abc import Iterable from typing import Generic, Optional, Protocol, Type from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeItem +from PIL import Image from typing_extensions import TypeVar from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page @@ -19,12 +20,25 @@ class BaseModelWithOptions(Protocol): class BasePageModel(ABC): + scale: float # scale with which the page-image needs to be created (dpi = 72*scale) + max_size: int # max size of width/height of page-image + @abstractmethod def __call__( self, conv_res: ConversionResult, page_batch: Iterable[Page] ) -> Iterable[Page]: pass +class BaseLayoutModel(BasePageModel): + @abstractmethod + def 
predict_on_page_image(self, *, page_image: Image.Image) -> list(Cluster): + pass + +class BaseVlmModel(BasePageModel): + @abstractmethod + def predict_on_page_image(self, *, page_image: Image.Image, prompt: str) -> str: + pass + EnrichElementT = TypeVar("EnrichElementT", default=NodeItem) diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index 9b90f4bd..58c9646e 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -7,6 +7,7 @@ from typing import Optional import numpy as np from docling_core.types.doc import DocItemLabel +from docling_core.types.doc.page import TextCell from PIL import Image from docling.datamodel.accelerator_options import AcceleratorOptions @@ -176,11 +177,11 @@ class LayoutModel(BasePageModel): ) clusters.append(cluster) """ - clusters = self.predict_on_page(page_image) - + predicted_clusters = self.predict_on_page(page_image=page_image) + if settings.debug.visualize_raw_layout: self.draw_clusters_and_cells_side_by_side( - conv_res, page, clusters, mode_prefix="raw" + conv_res, page, predicted_clusters, mode_prefix="raw" ) # Apply postprocessing @@ -212,8 +213,28 @@ class LayoutModel(BasePageModel): clusters=processed_clusters ) """ - page, processed_clusters, processed_cells = self.postprocess_on_page(page, cluster) - + page, processed_clusters, processed_cells = ( + self.postprocess_on_page(page=page, clusters=predicted_clusters) + ) + + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Mean of empty slice|invalid value encountered in scalar divide", + RuntimeWarning, + "numpy", + ) + + conv_res.confidence.pages[page.page_no].layout_score = float( + np.mean([c.confidence for c in processed_clusters]) + ) + + conv_res.confidence.pages[page.page_no].ocr_score = float( + np.mean( + [c.confidence for c in processed_cells if c.from_ocr] + ) + ) + if settings.debug.visualize_layout: self.draw_clusters_and_cells_side_by_side( conv_res, page, processed_clusters, mode_prefix="postprocessed" @@ -221,17 +242,13 @@ class LayoutModel(BasePageModel): yield page - def predict_on_page(self, page_image: Image) -> list[Cluster]: - + def predict_on_page(self, *, page_image: Image.Image) -> list[Cluster]: pred_items = self.layout_predictor.predict(page_image) clusters = [] for ix, pred_item in enumerate(pred_items): label = DocItemLabel( - pred_item["label"] - .lower() - .replace(" ", "_") - .replace("-", "_") + pred_item["label"].lower().replace(" ", "_").replace("-", "_") ) # Temporary, until docling-ibm-model uses docling-core types cluster = Cluster( id=ix, @@ -241,36 +258,17 @@ class LayoutModel(BasePageModel): cells=[], ) clusters.append(cluster) - + return clusters - def postprocess_on_page(self, page: Page, cluster: list(Cluster)): - + def postprocess_on_page( + self, *, page: Page, clusters: list[Cluster] + ) -> tuple[Page, list[Cluster], list[TextCell]]: processed_clusters, processed_cells = LayoutPostprocessor( page, clusters, self.options ).postprocess() # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally - - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", - "Mean of empty slice|invalid value encountered in scalar divide", - RuntimeWarning, - "numpy", - ) - conv_res.confidence.pages[page.page_no].layout_score = float( - np.mean([c.confidence for c in processed_clusters]) - ) - - conv_res.confidence.pages[page.page_no].ocr_score = float( - np.mean( - [c.confidence for c in processed_cells if c.from_ocr] - ) - ) - - page.predictions.layout = 
LayoutPrediction( - clusters=processed_clusters - ) + page.predictions.layout = LayoutPrediction(clusters=processed_clusters) return page, processed_clusters, processed_cells diff --git a/docling/models/vlm_models_inline/hf_transformers_model.py b/docling/models/vlm_models_inline/hf_transformers_model.py index d84925dd..3513de3e 100644 --- a/docling/models/vlm_models_inline/hf_transformers_model.py +++ b/docling/models/vlm_models_inline/hf_transformers_model.py @@ -15,7 +15,7 @@ from docling.datamodel.pipeline_options_vlm_model import ( TransformersModelType, TransformersPromptStyle, ) -from docling.models.base_model import BasePageModel +from docling.models.base_model import BasePageModel, BaseVlmModel from docling.models.utils.hf_model_download import ( HuggingFaceModelDownloadMixin, ) @@ -25,7 +25,7 @@ from docling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) -class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin): +class HuggingFaceTransformersVlmModel(BaseVlmModel, HuggingFaceModelDownloadMixin): def __init__( self, enabled: bool, @@ -37,6 +37,9 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix self.vlm_options = vlm_options + self.scale = self.vlm_options.scale + self.max_size = self.vlm_options.max_size + if self.enabled: import torch from transformers import ( diff --git a/docling/models/vlm_models_inline/mlx_model.py b/docling/models/vlm_models_inline/mlx_model.py index 647ce531..ddeea379 100644 --- a/docling/models/vlm_models_inline/mlx_model.py +++ b/docling/models/vlm_models_inline/mlx_model.py @@ -10,7 +10,7 @@ from docling.datamodel.accelerator_options import ( from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToken from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions -from docling.models.base_model import BasePageModel +from docling.models.base_model import BasePageModel, BaseVlmModel from docling.models.utils.hf_model_download import ( HuggingFaceModelDownloadMixin, ) @@ -19,7 +19,7 @@ from docling.utils.profiling import TimeRecorder _log = logging.getLogger(__name__) -class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin): +class HuggingFaceMlxModel(BaseVlmModel, HuggingFaceModelDownloadMixin): def __init__( self, enabled: bool, @@ -28,10 +28,12 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin): vlm_options: InlineVlmOptions, ): self.enabled = enabled - self.vlm_options = vlm_options + self.max_tokens = vlm_options.max_new_tokens self.temperature = vlm_options.temperature + self.scale = self.vlm_options.scale + self.max_size = self.vlm_options.max_size if self.enabled: try: diff --git a/docling/models/vlm_models_inline/TwoStageVlmModel.py b/docling/models/vlm_models_inline/two_stage_vlm_model.py similarity index 82% rename from docling/models/vlm_models_inline/TwoStageVlmModel.py rename to docling/models/vlm_models_inline/two_stage_vlm_model.py index f540ea4b..b31d46a9 100644 --- a/docling/models/vlm_models_inline/TwoStageVlmModel.py +++ b/docling/models/vlm_models_inline/two_stage_vlm_model.py @@ -15,7 +15,8 @@ from docling.datamodel.pipeline_options_vlm_model import ( TransformersModelType, TransformersPromptStyle, ) -from docling.models.base_model import BasePageModel +from docling.models.base_model import BasePageModel, BaseVlmModel +from docling.models.layout_model import LayoutModel from docling.models.utils.hf_model_download import ( 
HuggingFaceModelDownloadMixin, ) @@ -29,11 +30,11 @@ class TwoStageVlmModel(BasePageModel, HuggingFaceModelDownloadMixin): def __init__( self, *, - layout_model: LayoutModelModel, - vlm_model: BasePageModel, + layout_model: LayoutModel, + vlm_model: BaseVlmModel, ): - self.layout_model = layout_options - self.vlm_model = vlm_options + self.layout_model = layout_model + self.vlm_model = vlm_model def __call__( self, conv_res: ConversionResult, page_batch: Iterable[Page] @@ -47,23 +48,27 @@ class TwoStageVlmModel(BasePageModel, HuggingFaceModelDownloadMixin): assert page.size is not None page_image = page.get_image( - scale=self.vlm_options.scale, max_size=self.vlm_options.max_size + scale=self.vlm_model.scale, max_size=self.vlm_model.max_size + ) + + pred_clusters = self.layout_model.predict_on_page(page_image=page_image) + page, processed_clusters, processed_cells = ( + self.layout_model.postprocess_on_page( + page=page, clusters=pred_clusters + ) ) - pred_clusters = self.layout_model.predict_on_page(page_image) - page, processed_clusters, processed_cells = self.layout_model.postprocess_on_page(page=page, - page_image=page_image) - # Define prompt structure if callable(self.vlm_options.prompt): user_prompt = self.vlm_options.prompt(page.parsed_page) else: user_prompt = self.vlm_options.prompt - + prompt = self.formulate_prompt(user_prompt, processed_clusters) - generated_text, generation_time = self.vlm_model.predict_on_image(page_image=page_image, - prompt=prompt) + generated_text, generation_time = self.vlm_model.predict_on_image( + page_image=page_image, prompt=prompt + ) page.predictions.vlm_response = VlmPrediction( text=generated_text, @@ -72,7 +77,7 @@ class TwoStageVlmModel(BasePageModel, HuggingFaceModelDownloadMixin): yield page - def formulate_prompt(self, user_prompt: str, clusters:list[Cluster]) -> str: + def formulate_prompt(self, user_prompt: str, clusters: list[Cluster]) -> str: """Formulate a prompt for the VLM.""" if self.vlm_options.transformers_prompt_style == TransformersPromptStyle.RAW: diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index 4c38e02a..01be3693 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -26,9 +26,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend from docling.datamodel.base_models import InputFormat, Page from docling.datamodel.document import ConversionResult, InputDocument -from docling.datamodel.pipeline_options import ( - VlmPipelineOptions, -) +from docling.datamodel.pipeline_options import TwoStageVlmOptions, VlmPipelineOptions from docling.datamodel.pipeline_options_vlm_model import ( ApiVlmOptions, InferenceFramework, @@ -37,15 +35,17 @@ from docling.datamodel.pipeline_options_vlm_model import ( ) from docling.datamodel.settings import settings from docling.models.api_vlm_model import ApiVlmModel +from docling.models.layout_model import LayoutModel from docling.models.vlm_models_inline.hf_transformers_model import ( HuggingFaceTransformersVlmModel, ) from docling.models.vlm_models_inline.mlx_model import HuggingFaceMlxModel +from docling.models.vlm_models_inline.two_stage_vlm_model import ( + TwoStageVlmModel, +) from docling.pipeline.base_pipeline import PaginatedPipeline from docling.utils.profiling import ProfilingScope, TimeRecorder -from docling.models.layout_model import LayoutModel - _log = logging.getLogger(__name__) @@ -110,7 +110,9 @@ class VlmPipeline(PaginatedPipeline): f"Could not 
instantiate the right type of VLM pipeline: {vlm_options.inference_framework}" ) elif isinstance(self.pipeline_options.vlm_options, TwoStageVlmOptions): - twostagevlm_options = cast(TwoStageVlmOptions, self.pipeline_options.vlm_options) + twostagevlm_options = cast( + TwoStageVlmOptions, self.pipeline_options.vlm_options + ) layout_options = twostagevlm_options.lay_options vlm_options = twostagevlm_options.vlm_options @@ -120,7 +122,7 @@ class VlmPipeline(PaginatedPipeline): accelerator_options=pipeline_options.accelerator_options, options=layout_options, ) - + if vlm_options.inference_framework == InferenceFramework.MLX: vlm_model = HuggingFaceMlxModel( enabled=True, # must be always enabled for this pipeline to make sense. @@ -145,7 +147,7 @@ class VlmPipeline(PaginatedPipeline): raise ValueError( f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}" ) - + self.enrichment_pipe = [ # Other models working on `NodeItem` elements in the DoclingDocument ] From f2094f858baccd15a21e551b65f74152d8965984 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Tue, 8 Jul 2025 10:23:18 +0200 Subject: [PATCH 03/13] Establish layout_model spec and example instantations Signed-off-by: Christoph Auer --- docling/datamodel/pipeline_options.py | 11 ++ docling/models/layout_model.py | 37 +++--- docling/pipeline/standard_pdf_pipeline.py | 6 +- docling/utils/model_downloader.py | 3 +- pyproject.toml | 3 +- .../docling_v1/2305.03393v1-pg9.json | 4 +- .../docling_v1/2305.03393v1-pg9.pages.json | 58 +++++----- .../docling_v2/2203.01017v2.doctags.txt | 24 ++-- .../groundtruth/docling_v2/2203.01017v2.json | 108 +++++++++--------- .../groundtruth/docling_v2/2203.01017v2.md | 4 +- .../docling_v2/2206.01062.doctags.txt | 20 ++-- .../groundtruth/docling_v2/2206.01062.json | 68 +++++------ .../docling_v2/2305.03393v1-pg9.json | 4 +- .../docling_v2/2305.03393v1-pg9.pages.json | 58 +++++----- .../docling_v2/2305.03393v1.doctags.txt | 16 +-- .../groundtruth/docling_v2/2305.03393v1.json | 56 ++++----- .../docling_v2/multi_page.doctags.txt | 8 +- .../groundtruth/docling_v2/multi_page.json | 20 ++-- .../docling_v2/redp5110_sampled.doctags.txt | 24 ++-- .../docling_v2/redp5110_sampled.json | 18 +-- .../docling_v2/redp5110_sampled.md | 10 +- uv.lock | 68 ++++++----- 22 files changed, 322 insertions(+), 306 deletions(-) diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 11e085b7..42a4b21a 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -15,6 +15,15 @@ from docling.datamodel import asr_model_specs # Import the following for backwards compatibility from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions +from docling.datamodel.layout_model_specs import ( + DOCLING_LAYOUT_EGRET_LARGE, + DOCLING_LAYOUT_EGRET_MEDIUM, + DOCLING_LAYOUT_EGRET_XLARGE, + DOCLING_LAYOUT_HERON, + DOCLING_LAYOUT_HERON_101, + DOCLING_LAYOUT_V2, + LayoutModelConfig, +) from docling.datamodel.pipeline_options_asr_model import ( InlineAsrOptions, ) @@ -306,6 +315,8 @@ class PdfPipelineOptions(PaginatedPipelineOptions): True # Always True since parsed_page is now mandatory ) + layout_model_config: LayoutModelConfig = DOCLING_LAYOUT_V2 + class ProcessingPipeline(str, Enum): STANDARD = "standard" diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index da75bb8f..5d2748e5 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -12,6 +12,7 @@ from PIL 
import Image from docling.datamodel.accelerator_options import AcceleratorOptions from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page from docling.datamodel.document import ConversionResult +from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2, LayoutModelConfig from docling.datamodel.settings import settings from docling.models.base_model import BasePageModel from docling.models.utils.hf_model_download import download_hf_model @@ -24,9 +25,6 @@ _log = logging.getLogger(__name__) class LayoutModel(BasePageModel): - _model_repo_folder = "ds4sd--docling-models" - _model_path = "model_artifacts/layout" - TEXT_ELEM_LABELS = [ DocItemLabel.TEXT, DocItemLabel.FOOTNOTE, @@ -48,30 +46,36 @@ class LayoutModel(BasePageModel): CONTAINER_LABELS = [DocItemLabel.FORM, DocItemLabel.KEY_VALUE_REGION] def __init__( - self, artifacts_path: Optional[Path], accelerator_options: AcceleratorOptions + self, + artifacts_path: Optional[Path], + accelerator_options: AcceleratorOptions, + layout_model_config: LayoutModelConfig, ): from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor device = decide_device(accelerator_options.device) + self.layout_model_config = layout_model_config + model_repo_folder = layout_model_config.model_repo_folder + model_path = layout_model_config.model_path if artifacts_path is None: - artifacts_path = self.download_models() / self._model_path + artifacts_path = ( + self.download_models(layout_model_config=layout_model_config) + / model_path + ) else: - # will become the default in the future - if (artifacts_path / self._model_repo_folder).exists(): - artifacts_path = ( - artifacts_path / self._model_repo_folder / self._model_path - ) - elif (artifacts_path / self._model_path).exists(): + if (artifacts_path / model_repo_folder).exists(): + artifacts_path = artifacts_path / model_repo_folder / model_path + elif (artifacts_path / model_path).exists(): warnings.warn( "The usage of artifacts_path containing directly " - f"{self._model_path} is deprecated. Please point " + f"{model_path} is deprecated. 
Please point " "the artifacts_path to the parent containing " - f"the {self._model_repo_folder} folder.", + f"the {model_repo_folder} folder.", DeprecationWarning, stacklevel=3, ) - artifacts_path = artifacts_path / self._model_path + artifacts_path = artifacts_path / model_path self.layout_predictor = LayoutPredictor( artifact_path=str(artifacts_path), @@ -84,10 +88,11 @@ class LayoutModel(BasePageModel): local_dir: Optional[Path] = None, force: bool = False, progress: bool = False, + layout_model_config: LayoutModelConfig = DOCLING_LAYOUT_V2, ) -> Path: return download_hf_model( - repo_id="ds4sd/docling-models", - revision="v2.2.0", + repo_id=layout_model_config.repo_id, + revision=layout_model_config.revision, local_dir=local_dir, force=force, progress=progress, diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index ad4f36da..bd0c9924 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -10,6 +10,7 @@ from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend from docling.datamodel.base_models import AssembledUnit, Page from docling.datamodel.document import ConversionResult +from docling.datamodel.layout_model_specs import LayoutModelConfig from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.settings import settings from docling.models.base_ocr_model import BaseOcrModel @@ -36,8 +37,8 @@ _log = logging.getLogger(__name__) class StandardPdfPipeline(PaginatedPipeline): - _layout_model_path = LayoutModel._model_path - _table_model_path = TableStructureModel._model_path + # _layout_model_path = LayoutModel._model_path + # _table_model_path = TableStructureModel._model_path def __init__(self, pipeline_options: PdfPipelineOptions): super().__init__(pipeline_options) @@ -80,6 +81,7 @@ class StandardPdfPipeline(PaginatedPipeline): LayoutModel( artifacts_path=artifacts_path, accelerator_options=pipeline_options.accelerator_options, + layout_model_config=pipeline_options.layout_model_config, ), # Table structure model TableStructureModel( diff --git a/docling/utils/model_downloader.py b/docling/utils/model_downloader.py index 55383c03..a2994fb7 100644 --- a/docling/utils/model_downloader.py +++ b/docling/utils/model_downloader.py @@ -2,6 +2,7 @@ import logging from pathlib import Path from typing import Optional +from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2 from docling.datamodel.pipeline_options import ( granite_picture_description, smolvlm_picture_description, @@ -46,7 +47,7 @@ def download_models( if with_layout: _log.info("Downloading layout model...") LayoutModel.download_models( - local_dir=output_dir / LayoutModel._model_repo_folder, + local_dir=output_dir / DOCLING_LAYOUT_V2.model_repo_folder, force=force, progress=progress, ) diff --git a/pyproject.toml b/pyproject.toml index 7139c031..ee9ea944 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,7 +46,7 @@ dependencies = [ 'pydantic (>=2.0.0,<3.0.0)', 'docling-core[chunking] (>=2.39.0,<3.0.0)', 'docling-parse (>=4.0.0,<5.0.0)', - 'docling-ibm-models (>=3.6.0,<4)', + "docling-ibm-models>=3.6.0,<4", 'filetype (>=1.2.0,<2.0.0)', 'pypdfium2 (>=4.30.0,<5.0.0)', 'pydantic-settings (>=2.3.0,<3.0.0)', @@ -150,6 +150,7 @@ default-groups = "all" [tool.uv.sources] openai-whisper = { git = "https://github.com/openai/whisper.git", rev = "dd985ac4b90cafeef8712f2998d62c59c3e62d22" } +docling-ibm-models = { git = 
"https://github.com/docling-project/docling-ibm-models", rev = "nli/auto_layout_predictor" } [tool.setuptools.packages.find] include = ["docling*"] diff --git a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json index dd51e390..e938e2d7 100644 --- a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json +++ b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json @@ -213,10 +213,10 @@ "prov": [ { "bbox": [ - 139.66741943359375, + 139.6674041748047, 322.5054626464844, 475.00927734375, - 454.45458984375 + 454.4546203613281 ], "page": 1, "span": [ diff --git a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json index 3010fbb6..3c219d95 100644 --- a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json +++ b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json @@ -2705,7 +2705,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373534917831421, + "confidence": 0.9373533129692078, "cells": [ { "index": 0, @@ -2745,7 +2745,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858680725097656, + "confidence": 0.8858679533004761, "cells": [ { "index": 1, @@ -2785,7 +2785,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806433916091919, + "confidence": 0.9806435108184814, "cells": [ { "index": 2, @@ -2940,7 +2940,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.98504239320755, + "confidence": 0.9850425124168396, "cells": [ { "index": 7, @@ -3155,7 +3155,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591909050941467, + "confidence": 0.9591907262802124, "cells": [ { "index": 15, @@ -3339,8 +3339,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.66741943359375, - "t": 337.54541015625, + "l": 139.6674041748047, + "t": 337.5453796386719, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -7846,7 +7846,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589294195175171, + "confidence": 0.9589295387268066, "cells": [ { "index": 91, @@ -7911,7 +7911,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849975109100342, + "confidence": 0.9849976301193237, "cells": [ { "index": 93, @@ -8243,8 +8243,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.66741943359375, - "t": 337.54541015625, + "l": 139.6674041748047, + "t": 337.5453796386719, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -13641,7 +13641,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373534917831421, + "confidence": 0.9373533129692078, "cells": [ { "index": 0, @@ -13687,7 +13687,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858680725097656, + "confidence": 0.8858679533004761, "cells": [ { "index": 1, @@ -13733,7 +13733,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806433916091919, + "confidence": 0.9806435108184814, "cells": [ { "index": 2, @@ -13900,7 +13900,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.98504239320755, + "confidence": 0.9850425124168396, "cells": [ { "index": 7, @@ -14121,7 +14121,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591909050941467, + "confidence": 0.9591907262802124, "cells": [ { "index": 15, @@ -14311,8 +14311,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.66741943359375, - "t": 337.54541015625, + 
"l": 139.6674041748047, + "t": 337.5453796386719, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -19701,7 +19701,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589294195175171, + "confidence": 0.9589295387268066, "cells": [ { "index": 91, @@ -19772,7 +19772,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849975109100342, + "confidence": 0.9849976301193237, "cells": [ { "index": 93, @@ -20116,7 +20116,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806433916091919, + "confidence": 0.9806435108184814, "cells": [ { "index": 2, @@ -20283,7 +20283,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.98504239320755, + "confidence": 0.9850425124168396, "cells": [ { "index": 7, @@ -20504,7 +20504,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591909050941467, + "confidence": 0.9591907262802124, "cells": [ { "index": 15, @@ -20694,8 +20694,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.66741943359375, - "t": 337.54541015625, + "l": 139.6674041748047, + "t": 337.5453796386719, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -26084,7 +26084,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589294195175171, + "confidence": 0.9589295387268066, "cells": [ { "index": 91, @@ -26155,7 +26155,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849975109100342, + "confidence": 0.9849976301193237, "cells": [ { "index": 93, @@ -26499,7 +26499,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373534917831421, + "confidence": 0.9373533129692078, "cells": [ { "index": 0, @@ -26545,7 +26545,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858680725097656, + "confidence": 0.8858679533004761, "cells": [ { "index": 1, diff --git a/tests/data/groundtruth/docling_v2/2203.01017v2.doctags.txt b/tests/data/groundtruth/docling_v2/2203.01017v2.doctags.txt index b0c2b624..762dcf44 100644 --- a/tests/data/groundtruth/docling_v2/2203.01017v2.doctags.txt +++ b/tests/data/groundtruth/docling_v2/2203.01017v2.doctags.txt @@ -9,9 +9,9 @@ The occurrence of tables in documents is ubiquitous. They often summarise quantitative or factual data, which is cumbersome to describe in verbose text but nevertheless extremely valuable. Unfortunately, this compact representation is often not easy to parse by machines. There are many implicit conventions used to obtain a compact table representation. For example, tables often have complex columnand row-headers in order to reduce duplicated cell content. Lines of different shapes and sizes are leveraged to separate content or indicate a tree structure. Additionally, tables can also have empty/missing table-entries or multi-row textual table-entries. Fig. 1 shows a table which presents all these issues. 1 -Red-annotation of bounding boxes, Blue-predictions by TableFormer +Red-annotation of bounding boxes, Blue-predictions by TableFormer Structure predicted by TableFormer: - + Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: 'PMC2944238 004 02'. 01 2 134 35678 291011121314151617181920 @@ -124,8 +124,8 @@ We showcase several visualizations for the different components of our network on various "complex" tables within datasets presented in this work in Fig. 5 and Fig. 
6 As it is shown, our model is able to predict bounding boxes for all table cells, even for the empty ones. Additionally, our post-processing techniques can extract the cell content by matching the predicted bounding boxes to the PDF cells based on their overlap and spatial proximity. The left part of Fig. 5 demonstrates also the adaptability of our method to any language, as it can successfully extract Japanese text, although the training set contains only English content. We provide more visualizations including the intermediate steps in the supplementary material. Overall these illustrations justify the versatility of our method across a diverse range of table appearances and content type. In this paper, we presented TableFormer an end-to-end transformer based approach to predict table structures and bounding boxes of cells from an image. This approach enables us to recreate the table structure, and extract the cell content from PDF or OCR by using bounding boxes. Additionally, it provides the versatility required in real-world scenarios when dealing with various types of PDF documents, and languages. Furthermore, our method outperforms all state-of-the-arts with a wide margin. Finally, we introduce "SynthTabNet" a challenging synthetically generated dataset that reinforces missing characteristics from other datasets. References -Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to- - +Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to- + 8 end object detection with transformers. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision - ECCV 2020 , pages 213-229, Cham, 2020. Springer International Publishing. 5 @@ -157,7 +157,7 @@ 9 Computer Vision and Pattern Recognition , pages 658-666, 2019. 6 -Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 11621167, 2017. 1 +Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 11621167, 2017. 1 Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR international conference on document analysis and recognition (ICDAR) , volume 1, pages 1162-1167. IEEE, 2017. 3 Faisal Shafait and Ray Smith. Table detection in heterogeneous documents. In Proceedings of the 9th IAPR International Workshop on Document Analysis Systems , pages 6572, 2010. 2 Shoaib Ahmed Siddiqui, Imran Ali Fateh, Syed Tahseen Raza Rizvi, Andreas Dengel, and Sheraz Ahmed. Deeptabstr: Deep learning based table structure recognition. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1403-1409. IEEE, 2019. 3 @@ -171,7 +171,7 @@ Xu Zhong, Elaheh ShafieiBavani, and Antonio Jimeno Yepes. Image-based table recognition: Data, model, and evaluation. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision ECCV 2020 , pages 564-580, Cham, 2020. 
Springer International Publishing. 2, 3, 7 Xu Zhong, Jianbin Tang, and Antonio Jimeno Yepes. Publaynet: Largest dataset ever for document layout analysis. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1015-1022, 2019. 1 - + 10 TableFormer: Table Structure Understanding with Transformers Supplementary Material @@ -183,12 +183,12 @@ 1.2. Synthetic datasets Aiming to train and evaluate our models in a broader spectrum of table data we have synthesized four types of datasets. Each one contains tables with different appear- ances in regard to their size, structure, style and content. Every synthetic dataset contains 150k examples, summing up to 600k synthetic examples. All datasets are divided into Train, Test and Val splits (80%, 10%, 10%). The process of generating a synthetic dataset can be decomposed into the following steps: -Prepare styling and content templates: The styling templates have been manually designed and organized into groups of scope specific appearances (e.g. financial data, marketing data, etc.) Additionally, we have prepared curated collections of content templates by extracting the most frequently used terms out of non-synthetic datasets (e.g. PubTabNet, FinTabNet, etc.). +Prepare styling and content templates: The styling templates have been manually designed and organized into groups of scope specific appearances (e.g. financial data, marketing data, etc.) Additionally, we have prepared curated collections of content templates by extracting the most frequently used terms out of non-synthetic datasets (e.g. PubTabNet, FinTabNet, etc.). Generate table structures: The structure of each synthetic dataset assumes a horizontal table header which potentially spans over multiple rows and a table body that may contain a combination of row spans and column spans. However, spans are not allowed to cross the header - body boundary. The table structure is described by the parameters: Total number of table rows and columns, number of header rows, type of spans (header only spans, row only spans, column only spans, both row and column spans), maximum span size and the ratio of the table area covered by spans. Generate content: Based on the dataset theme , a set of suitable content templates is chosen first. Then, this content can be combined with purely random text to produce the synthetic content. Apply styling templates: Depending on the domain of the synthetic dataset, a set of styling templates is first manually selected. Then, a style is randomly selected to format the appearance of the synthesized table. Render the complete tables: The synthetic table is finally rendered by a web browser engine to generate the bounding boxes for each table cell. A batching technique is utilized to optimize the runtime overhead of the rendering process. - + 2. Prediction post-processing for PDF documents Although TableFormer can predict the table structure and the bounding boxes for tables recognized inside PDF documents, this is not enough when a full reconstruction of the original table is required. This happens mainly due the following reasons: 11 @@ -200,20 +200,20 @@ dian cell size for all table cells. The usage of median during the computations, helps to eliminate outliers caused by occasional column spans which are usually wider than the normal. However, it is possible to mitigate those limitations by combining the TableFormer predictions with the information already present inside a programmatic PDF document. 
More specifically, PDF documents can be seen as a sequence of PDF cells where each cell is described by its content and bounding box. If we are able to associate the PDF cells with the predicted table cells, we can directly link the PDF cell content to the table cell structure and use the PDF bounding boxes to correct misalignments in the predicted table cell bounding boxes. Here is a step-by-step description of the prediction postprocessing: -Get the minimal grid dimensions - number of rows and columns for the predicted table structure. This represents the most granular grid for the underlying table structure. +Get the minimal grid dimensions - number of rows and columns for the predicted table structure. This represents the most granular grid for the underlying table structure. Generate pair-wise matches between the bounding boxes of the PDF cells and the predicted cells. The Intersection Over Union (IOU) metric is used to evaluate the quality of the matches. Use a carefully selected IOU threshold to designate the matches as "good" ones and "bad" ones. 3.a. If all IOU scores in a column are below the threshold, discard all predictions (structure and bounding boxes) for that column. Find the best-fitting content alignment for the predicted cells with good IOU per each column. The alignment of the column can be identified by the following formula: - + where c is one of { left, centroid, right } and x$_{c}$ is the xcoordinate for the corresponding point. -Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me- +Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me- Snap all cells with bad IOU to their corresponding median x -coordinates and cell sizes. Generate a new set of pair-wise matches between the corrected bounding boxes and PDF cells. This time use a modified version of the IOU metric, where the area of the intersection between the predicted and PDF cells is divided by the PDF cell area. In case there are multiple matches for the same PDF cell, the prediction with the higher score is preferred. This covers the cases where the PDF cells are smaller than the area of predicted or corrected prediction cells. In some rare occasions, we have noticed that TableFormer can confuse a single column as two. When the postprocessing steps are applied, this results with two predicted columns pointing to the same PDF column. In such case we must de-duplicate the columns according to highest total column intersection score. Pick up the remaining orphan cells. There could be cases, when after applying all the previous post-processing steps, some PDF cells could still remain without any match to predicted cells. However, it is still possible to deduce the correct matching for an orphan PDF cell by mapping its bounding box on the geometry of the grid. This mapping decides if the content of the orphan cell will be appended to an already matched table cell, or a new table cell should be created to match with the orphan. - + 9a. Compute the top and bottom boundary of the horizontal band for each grid row (min/max y coordinates per row). 9b. Intersect the orphan's bounding box with the row bands, and map the cell to the closest grid row. 9c. Compute the left and right boundary of the vertical band for each grid column (min/max x coordinates per column). 
diff --git a/tests/data/groundtruth/docling_v2/2203.01017v2.json b/tests/data/groundtruth/docling_v2/2203.01017v2.json index a1919dd8..dc1853c9 100644 --- a/tests/data/groundtruth/docling_v2/2203.01017v2.json +++ b/tests/data/groundtruth/docling_v2/2203.01017v2.json @@ -1339,7 +1339,7 @@ "text": "Red-annotation of bounding boxes, Blue-predictions by TableFormer", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "b." }, { @@ -2095,7 +2095,7 @@ "text": "Structure predicted by TableFormer:", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "c." }, { @@ -11287,7 +11287,7 @@ "text": "Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[1]" }, { @@ -11378,7 +11378,7 @@ "text": "Zewen Chi, Heyan Huang, Heng-Da Xu, Houjin Yu, Wanxuan Yin, and Xian-Ling Mao. Complicated table structure recognition. arXiv preprint arXiv:1908.04729 , 2019. 3", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[2]" }, { @@ -11409,7 +11409,7 @@ "text": "Bertrand Couasnon and Aurelie Lemaitre. Recognition of Tables and Forms , pages 647-677. Springer London, London, 2014. 2", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[3]" }, { @@ -11440,7 +11440,7 @@ "text": "Herv\u00b4e D\u00b4ejean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), Apr. 2019. http://sac.founderit.com/. 2", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[4]" }, { @@ -11471,7 +11471,7 @@ "text": "Basilios Gatos, Dimitrios Danatsas, Ioannis Pratikakis, and Stavros J Perantonis. Automatic table detection in document images. In International Conference on Pattern Recognition and Image Analysis , pages 609-618. Springer, 2005. 2", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[5]" }, { @@ -11502,7 +11502,7 @@ "text": "Max G\u00a8obel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. 2", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[6]" }, { @@ -11533,7 +11533,7 @@ "text": "EA Green and M Krishnamoorthy. Recognition of tables using table grammars. procs. In Symposium on Document Analysis and Recognition (SDAIR'95) , pages 261-277. 2", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[7]" }, { @@ -11564,7 +11564,7 @@ "text": "Khurram Azeem Hashmi, Alain Pagani, Marcus Liwicki, Didier Stricker, and Muhammad Zeshan Afzal. Castabdetectors: Cascade network for table detection in document images with recursive feature pyramid and switchable atrous convolution. Journal of Imaging , 7(10), 2021. 1", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[8]" }, { @@ -11595,7 +11595,7 @@ "text": "Kaiming He, Georgia Gkioxari, Piotr Dollar, and Ross Girshick. Mask r-cnn. In Proceedings of the IEEE International Conference on Computer Vision (ICCV) , Oct 2017. 
1", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[9]" }, { @@ -11626,7 +11626,7 @@ "text": "Yelin He, X. Qi, Jiaquan Ye, Peng Gao, Yihao Chen, Bingcong Li, Xin Tang, and Rong Xiao. Pingan-vcgroup's solution for icdar 2021 competition on scientific table image recognition to latex. ArXiv , abs/2105.01846, 2021. 2", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[10]" }, { @@ -11657,7 +11657,7 @@ "text": "Jianying Hu, Ramanujan S Kashi, Daniel P Lopresti, and Gordon Wilfong. Medium-independent table detection. In Document Recognition and Retrieval VII , volume 3967, pages 291-302. International Society for Optics and Photonics, 1999. 2", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[11]" }, { @@ -11688,7 +11688,7 @@ "text": "Matthew Hurst. A constraint-based approach to table structure derivation. In Proceedings of the Seventh International Conference on Document Analysis and Recognition - Volume 2 , ICDAR '03, page 911, USA, 2003. IEEE Computer Society. 2", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[12]" }, { @@ -11719,7 +11719,7 @@ "text": "Thotreingam Kasar, Philippine Barlas, Sebastien Adam, Cl\u00b4ement Chatelain, and Thierry Paquet. Learning to detect tables in scanned document images using line information. In 2013 12th International Conference on Document Analysis and Recognition , pages 1185-1189. IEEE, 2013. 2", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[13]" }, { @@ -11750,7 +11750,7 @@ "text": "Pratik Kayal, Mrinal Anand, Harsh Desai, and Mayank Singh. Icdar 2021 competition on scientific table image recognition to latex, 2021. 2", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[14]" }, { @@ -11781,7 +11781,7 @@ "text": "Harold W Kuhn. The hungarian method for the assignment problem. Naval research logistics quarterly , 2(1-2):83-97, 1955. 6", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[15]" }, { @@ -11812,7 +11812,7 @@ "text": "Girish Kulkarni, Visruth Premraj, Vicente Ordonez, Sagnik Dhar, Siming Li, Yejin Choi, Alexander C. Berg, and Tamara L. Berg. Babytalk: Understanding and generating simple image descriptions. IEEE Transactions on Pattern Analysis and Machine Intelligence , 35(12):2891-2903, 2013. 4", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[16]" }, { @@ -11843,7 +11843,7 @@ "text": "Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou, and Zhoujun Li. Tablebank: A benchmark dataset for table detection and recognition, 2019. 2, 3", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[17]" }, { @@ -11874,7 +11874,7 @@ "text": "Yiren Li, Zheng Huang, Junchi Yan, Yi Zhou, Fan Ye, and Xianhui Liu. Gfte: Graph-based financial table extraction. In Alberto Del Bimbo, Rita Cucchiara, Stan Sclaroff, Giovanni Maria Farinella, Tao Mei, Marco Bertini, Hugo Jair Escalante, and Roberto Vezzani, editors, Pattern Recognition. ICPR International Workshops and Challenges , pages 644-658, Cham, 2021. Springer International Publishing. 
2, 3", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[18]" }, { @@ -11905,7 +11905,7 @@ "text": "Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter Staar. Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence , 35(17):15137-15145, May 2021. 1", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[19]" }, { @@ -11936,7 +11936,7 @@ "text": "Rujiao Long, Wen Wang, Nan Xue, Feiyu Gao, Zhibo Yang, Yongpan Wang, and Gui-Song Xia. Parsing table structures in the wild. In Proceedings of the IEEE/CVF International Conference on Computer Vision , pages 944-952, 2021. 2", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[20]" }, { @@ -11967,7 +11967,7 @@ "text": "Shubham Singh Paliwal, D Vishwanath, Rohit Rahul, Monika Sharma, and Lovekesh Vig. Tablenet: Deep learning model for end-to-end table detection and tabular data extraction from scanned document images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 128-133. IEEE, 2019. 1", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[21]" }, { @@ -11998,7 +11998,7 @@ "text": "Adam Paszke, Sam Gross, Francisco Massa, Adam Lerer, James Bradbury, Gregory Chanan, Trevor Killeen, Zeming Lin, Natalia Gimelshein, Luca Antiga, Alban Desmaison, Andreas Kopf, Edward Yang, Zachary DeVito, Martin Raison, Alykhan Tejani, Sasank Chilamkurthy, Benoit Steiner, Lu Fang, Junjie Bai, and Soumith Chintala. Pytorch: An imperative style, high-performance deep learning library. In H. Wallach, H. Larochelle, A. Beygelzimer, F. d'Alch\u00b4e-Buc, E. Fox, and R. Garnett, editors, Advances in Neural Information Processing Systems 32 , pages 8024-8035. Curran Associates, Inc., 2019. 6", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[22]" }, { @@ -12029,7 +12029,7 @@ "text": "Devashish Prasad, Ayan Gadpal, Kshitij Kapadni, Manish Visave, and Kavita Sultanpure. Cascadetabnet: An approach for end to end table detection and structure recognition from image-based documents. In Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition Workshops , pages 572-573, 2020. 1", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[23]" }, { @@ -12060,7 +12060,7 @@ "text": "Shah Rukh Qasim, Hassan Mahmood, and Faisal Shafait. Rethinking table recognition using graph neural networks. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 142-147. IEEE, 2019. 3", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[24]" }, { @@ -12091,7 +12091,7 @@ "text": "Hamid Rezatofighi, Nathan Tsoi, JunYoung Gwak, Amir Sadeghian, Ian Reid, and Silvio Savarese. Generalized intersection over union: A metric and a loss for bounding box regression. In Proceedings of the IEEE/CVF Conference on", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[25]" }, { @@ -12180,7 +12180,7 @@ "text": "Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. 
In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 11621167, 2017. 1", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[26]" }, { @@ -12211,7 +12211,7 @@ "text": "Sebastian Schreiber, Stefan Agne, Ivo Wolf, Andreas Dengel, and Sheraz Ahmed. Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In 2017 14th IAPR international conference on document analysis and recognition (ICDAR) , volume 1, pages 1162-1167. IEEE, 2017. 3", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[27]" }, { @@ -12242,7 +12242,7 @@ "text": "Faisal Shafait and Ray Smith. Table detection in heterogeneous documents. In Proceedings of the 9th IAPR International Workshop on Document Analysis Systems , pages 6572, 2010. 2", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[28]" }, { @@ -12273,7 +12273,7 @@ "text": "Shoaib Ahmed Siddiqui, Imran Ali Fateh, Syed Tahseen Raza Rizvi, Andreas Dengel, and Sheraz Ahmed. Deeptabstr: Deep learning based table structure recognition. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1403-1409. IEEE, 2019. 3", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[29]" }, { @@ -12304,7 +12304,7 @@ "text": "Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD , KDD '18, pages 774-782, New York, NY, USA, 2018. ACM. 1", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[30]" }, { @@ -12335,7 +12335,7 @@ "text": "Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez, \u0141 ukasz Kaiser, and Illia Polosukhin. Attention is all you need. In I. Guyon, U. V. Luxburg, S. Bengio, H. Wallach, R. Fergus, S. Vishwanathan, and R. Garnett, editors, Advances in Neural Information Processing Systems 30 , pages 5998-6008. Curran Associates, Inc., 2017. 5", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[31]" }, { @@ -12366,7 +12366,7 @@ "text": "Oriol Vinyals, Alexander Toshev, Samy Bengio, and Dumitru Erhan. Show and tell: A neural image caption generator. In Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition (CVPR) , June 2015. 2", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[32]" }, { @@ -12397,7 +12397,7 @@ "text": "Wenyuan Xue, Qingyong Li, and Dacheng Tao. Res2tim: reconstruct syntactic structures from table images. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 749-755. IEEE, 2019. 3", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[33]" }, { @@ -12428,7 +12428,7 @@ "text": "Wenyuan Xue, Baosheng Yu, Wen Wang, Dacheng Tao, and Qingyong Li. Tgrnet: A table graph reconstruction network for table structure recognition. arXiv preprint arXiv:2106.10598 , 2021. 3", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[34]" }, { @@ -12459,7 +12459,7 @@ "text": "Quanzeng You, Hailin Jin, Zhaowen Wang, Chen Fang, and Jiebo Luo. Image captioning with semantic attention. 
In Proceedings of the IEEE conference on computer vision and pattern recognition , pages 4651-4659, 2016. 4", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[35]" }, { @@ -12490,7 +12490,7 @@ "text": "Xinyi Zheng, Doug Burdick, Lucian Popa, Peter Zhong, and Nancy Xin Ru Wang. Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. Winter Conference for Applications in Computer Vision (WACV) , 2021. 2, 3", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[36]" }, { @@ -12521,7 +12521,7 @@ "text": "Xu Zhong, Elaheh ShafieiBavani, and Antonio Jimeno Yepes. Image-based table recognition: Data, model,", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[37]" }, { @@ -12583,7 +12583,7 @@ "text": "Xu Zhong, Jianbin Tang, and Antonio Jimeno Yepes. Publaynet: Largest dataset ever for document layout analysis. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1015-1022, 2019. 1", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[38]" }, { @@ -12922,7 +12922,7 @@ "text": "Prepare styling and content templates: The styling templates have been manually designed and organized into groups of scope specific appearances (e.g. financial data, marketing data, etc.) Additionally, we have prepared curated collections of content templates by extracting the most frequently used terms out of non-synthetic datasets (e.g. PubTabNet, FinTabNet, etc.).", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "1." }, { @@ -12953,7 +12953,7 @@ "text": "Generate table structures: The structure of each synthetic dataset assumes a horizontal table header which potentially spans over multiple rows and a table body that may contain a combination of row spans and column spans. However, spans are not allowed to cross the header - body boundary. The table structure is described by the parameters: Total number of table rows and columns, number of header rows, type of spans (header only spans, row only spans, column only spans, both row and column spans), maximum span size and the ratio of the table area covered by spans.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "2." }, { @@ -12984,7 +12984,7 @@ "text": "Generate content: Based on the dataset theme , a set of suitable content templates is chosen first. Then, this content can be combined with purely random text to produce the synthetic content.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "3." }, { @@ -13015,7 +13015,7 @@ "text": "Apply styling templates: Depending on the domain of the synthetic dataset, a set of styling templates is first manually selected. Then, a style is randomly selected to format the appearance of the synthesized table.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "4." }, { @@ -13046,7 +13046,7 @@ "text": "Render the complete tables: The synthetic table is finally rendered by a web browser engine to generate the bounding boxes for each table cell. A batching technique is utilized to optimize the runtime overhead of the rendering process.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "5." 
}, { @@ -15054,7 +15054,7 @@ "text": "Get the minimal grid dimensions - number of rows and columns for the predicted table structure. This represents the most granular grid for the underlying table structure.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "1." }, { @@ -15085,7 +15085,7 @@ "text": "Generate pair-wise matches between the bounding boxes of the PDF cells and the predicted cells. The Intersection Over Union (IOU) metric is used to evaluate the quality of the matches.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "2." }, { @@ -15116,7 +15116,7 @@ "text": "Use a carefully selected IOU threshold to designate the matches as \"good\" ones and \"bad\" ones.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "3." }, { @@ -15178,7 +15178,7 @@ "text": "Find the best-fitting content alignment for the predicted cells with good IOU per each column. The alignment of the column can be identified by the following formula:", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "4." }, { @@ -15267,7 +15267,7 @@ "text": "Use the alignment computed in step 4, to compute the median x -coordinate for all table columns and the me-", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "5." }, { @@ -15298,7 +15298,7 @@ "text": "Snap all cells with bad IOU to their corresponding median x -coordinates and cell sizes.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "6." }, { @@ -15329,7 +15329,7 @@ "text": "Generate a new set of pair-wise matches between the corrected bounding boxes and PDF cells. This time use a modified version of the IOU metric, where the area of the intersection between the predicted and PDF cells is divided by the PDF cell area. In case there are multiple matches for the same PDF cell, the prediction with the higher score is preferred. This covers the cases where the PDF cells are smaller than the area of predicted or corrected prediction cells.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "7." }, { @@ -15360,7 +15360,7 @@ "text": "In some rare occasions, we have noticed that TableFormer can confuse a single column as two. When the postprocessing steps are applied, this results with two predicted columns pointing to the same PDF column. In such case we must de-duplicate the columns according to highest total column intersection score.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "8." }, { @@ -15391,7 +15391,7 @@ "text": "Pick up the remaining orphan cells. There could be cases, when after applying all the previous post-processing steps, some PDF cells could still remain without any match to predicted cells. However, it is still possible to deduce the correct matching for an orphan PDF cell by mapping its bounding box on the geometry of the grid. This mapping decides if the content of the orphan cell will be appended to an already matched table cell, or a new table cell should be created to match with the orphan.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "9." 
}, { diff --git a/tests/data/groundtruth/docling_v2/2203.01017v2.md b/tests/data/groundtruth/docling_v2/2203.01017v2.md index 5cf44d8e..5c0cfb52 100644 --- a/tests/data/groundtruth/docling_v2/2203.01017v2.md +++ b/tests/data/groundtruth/docling_v2/2203.01017v2.md @@ -322,7 +322,7 @@ Computer Vision and Pattern Recognition , pages 658-666, 2019. 6 - [35] Quanzeng You, Hailin Jin, Zhaowen Wang, Chen Fang, and Jiebo Luo. Image captioning with semantic attention. In Proceedings of the IEEE conference on computer vision and pattern recognition , pages 4651-4659, 2016. 4 - [36] Xinyi Zheng, Doug Burdick, Lucian Popa, Peter Zhong, and Nancy Xin Ru Wang. Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. Winter Conference for Applications in Computer Vision (WACV) , 2021. 2, 3 - [37] Xu Zhong, Elaheh ShafieiBavani, and Antonio Jimeno Yepes. Image-based table recognition: Data, model, -- and evaluation. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision ECCV 2020 , pages 564-580, Cham, 2020. Springer International Publishing. 2, 3, 7 +13. and evaluation. In Andrea Vedaldi, Horst Bischof, Thomas Brox, and Jan-Michael Frahm, editors, Computer Vision ECCV 2020 , pages 564-580, Cham, 2020. Springer International Publishing. 2, 3, 7 - [38] Xu Zhong, Jianbin Tang, and Antonio Jimeno Yepes. Publaynet: Largest dataset ever for document layout analysis. In 2019 International Conference on Document Analysis and Recognition (ICDAR) , pages 1015-1022, 2019. 1 ## TableFormer: Table Structure Understanding with Transformers Supplementary Material @@ -369,7 +369,7 @@ Here is a step-by-step description of the prediction postprocessing: 1. Get the minimal grid dimensions - number of rows and columns for the predicted table structure. This represents the most granular grid for the underlying table structure. 2. Generate pair-wise matches between the bounding boxes of the PDF cells and the predicted cells. The Intersection Over Union (IOU) metric is used to evaluate the quality of the matches. 3. Use a carefully selected IOU threshold to designate the matches as "good" ones and "bad" ones. -- 3.a. If all IOU scores in a column are below the threshold, discard all predictions (structure and bounding boxes) for that column. +4. 3.a. If all IOU scores in a column are below the threshold, discard all predictions (structure and bounding boxes) for that column. 4. Find the best-fitting content alignment for the predicted cells with good IOU per each column. The alignment of the column can be identified by the following formula: diff --git a/tests/data/groundtruth/docling_v2/2206.01062.doctags.txt b/tests/data/groundtruth/docling_v2/2206.01062.doctags.txt index b5d1a737..63daf90b 100644 --- a/tests/data/groundtruth/docling_v2/2206.01062.doctags.txt +++ b/tests/data/groundtruth/docling_v2/2206.01062.doctags.txt @@ -25,15 +25,15 @@ Despite the substantial improvements achieved with machine-learning (ML) approaches and deep neural networks in recent years, document conversion remains a challenging problem, as demonstrated by the numerous public competitions held on this topic [1-4]. The challenge originates from the huge variability in PDF documents regarding layout, language and formats (scanned, programmatic or a combination of both). Engineering a single ML model that can be applied on all types of documents and provides high-quality layout segmentation remains to this day extremely challenging [5]. 
To highlight the variability in document layouts, we show a few example documents from the DocLayNet dataset in Figure 1. A key problem in the process of document conversion is to understand the structure of a single document page, i.e. which segments of text should be grouped together in a unit. To train models for this task, there are currently two large datasets available to the community, PubLayNet [6] and DocBank [7]. They were introduced in 2019 and 2020 respectively and significantly accelerated the implementation of layout detection and segmentation models due to their sizes of 300K and 500K ground-truth pages. These sizes were achieved by leveraging an automation approach. The benefit of automated ground-truth generation is obvious: one can generate large ground-truth datasets at virtually no cost. However, the automation introduces a constraint on the variability in the dataset, because corresponding structured source data must be available. PubLayNet and DocBank were both generated from scientific document repositories (PubMed and arXiv), which provide XML or L A T E X sources. Those scientific documents present a limited variability in their layouts, because they are typeset in uniform templates provided by the publishers. Obviously, documents such as technical manuals, annual company reports, legal text, government tenders, etc. have very different and partially unique layouts. As a consequence, the layout predictions obtained from models trained on PubLayNet or DocBank is very reasonable when applied on scientific documents. However, for more artistic or free-style layouts, we see sub-par prediction quality from these models, which we demonstrate in Section 5. In this paper, we present the DocLayNet dataset. It provides pageby-page layout annotation ground-truth using bounding-boxes for 11 distinct class labels on 80863 unique document pages, of which a fraction carry double- or triple-annotations. DocLayNet is similar in spirit to PubLayNet and DocBank and will likewise be made available to the public 1 in order to stimulate the document-layout analysis community. It distinguishes itself in the following aspects: -Human Annotation : In contrast to PubLayNet and DocBank, we relied on human annotation instead of automation approaches to generate the data set. +Human Annotation : In contrast to PubLayNet and DocBank, we relied on human annotation instead of automation approaches to generate the data set. Large Layout Variability : We include diverse and complex layouts from a large variety of public sources. Detailed Label Set : We define 11 class labels to distinguish layout features in high detail. PubLayNet provides 5 labels; DocBank provides 13, although not a superset of ours. Redundant Annotations : A fraction of the pages in the DocLayNet data set carry more than one human annotation. - + $^{1}$https://developer.ibm.com/exchanges/data/all/doclaynet This enables experimentation with annotation uncertainty and quality control analysis. -Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide fixed train-, test- & validation-sets to ensure proportional representation of the class-labels. Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores. - +Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide fixed train-, test- & validation-sets to ensure proportional representation of the class-labels. 
Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores. + All aspects outlined above are detailed in Section 3. In Section 4, we will elaborate on how we designed and executed this large-scale human annotation campaign. We will also share key insights and lessons learned that might prove helpful for other parties planning to set up annotation campaigns. In Section 5, we will present baseline accuracy numbers for a variety of object detection methods (Faster R-CNN, Mask R-CNN and YOLOv5) trained on DocLayNet. We further show how the model performance is impacted by varying the DocLayNet dataset size, reducing the label set and modifying the train/test-split. Last but not least, we compare the performance of models trained on PubLayNet, DocBank and DocLayNet and demonstrate that a model trained on DocLayNet provides overall more robust layout recovery. 2 RELATED WORK @@ -71,13 +71,13 @@ the textual content of an element, which goes beyond visual layout recognition, in particular outside the Scientific Articles category. At first sight, the task of visual document-layout interpretation appears intuitive enough to obtain plausible annotations in most cases. However, during early trial-runs in the core team, we observed many cases in which annotators use different annotation styles, especially for documents with challenging layouts. For example, if a figure is presented with subfigures, one annotator might draw a single figure bounding-box, while another might annotate each subfigure separately. The same applies for lists, where one might annotate all list items in one block or each list item separately. In essence, we observed that challenging layouts would be annotated in different but plausible ways. To illustrate this, we show in Figure 4 multiple examples of plausible but inconsistent annotations on the same pages. Obviously, this inconsistency in annotations is not desirable for datasets which are intended to be used for model training. To minimise these inconsistencies, we created a detailed annotation guideline. While perfect consistency across 40 annotation staff members is clearly not possible to achieve, we saw a huge improvement in annotation consistency after the introduction of our annotation guideline. A few selected, non-trivial highlights of the guideline are: -Every list-item is an individual object instance with class label List-item . This definition is different from PubLayNet and DocBank, where all list-items are grouped together into one List object. +Every list-item is an individual object instance with class label List-item . This definition is different from PubLayNet and DocBank, where all list-items are grouped together into one List object. A List-item is a paragraph with hanging indentation. Singleline elements can qualify as List-item if the neighbour elements expose hanging indentation. Bullet or enumeration symbols are not a requirement. For every Caption , there must be exactly one corresponding Picture or Table . Connected sub-pictures are grouped together in one Picture object. Formula numbers are included in a Formula object. Emphasised text (e.g. in italic or bold) at the beginning of a paragraph is not considered a Section-header , unless it appears exclusively on its own line. - + The complete annotation guideline is over 100 pages long and a detailed description is obviously out of scope for this paper. Nevertheless, it will be made publicly available alongside with DocLayNet for future reference. 
Phase 3: Training. After a first trial with a small group of people, we realised that providing the annotation guideline and a set of random practice pages did not yield the desired quality level for layout annotation. Therefore we prepared a subset of pages with two different complexity levels, each with a practice and an exam part. 974 pages were reference-annotated by one proficient core team member. Annotation staff were then given the task to annotate the same subsets (blinded from the reference). By comparing the annotations of each staff member with the reference annotations, we could quantify how closely their annotations matched the reference. Only after passing two exam levels with high annotation quality, staff were admitted into the production phase. Practice iterations @@ -126,7 +126,7 @@ From the dataset, we have derived on the one hand reference metrics for human performance on document-layout annotation (through double and triple annotations) and on the other hand evaluated the baseline performance of commonly used object detection methods. We also illustrated the impact of various dataset-related aspects on model performance through data-ablation experiments, both from a size and class-label perspective. Last but not least, we compared the accuracy of models trained on other public datasets and showed that DocLayNet trained models are more robust. To date, there is still a significant gap between human and ML accuracy on the layout interpretation task, and we hope that this work will inspire the research community to close that gap. REFERENCES -Max Göbel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. +Max Göbel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013. Christian Clausner, Apostolos Antonacopoulos, and Stefan Pletschacher. Icdar2017 competition on recognition of documents with complex layouts rdcl2017. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 1404-1410, 2017. Hervé Déjean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), April 2019. http://sac.founderit.com/. Antonio Jimeno Yepes, Peter Zhong, and Douglas Burdick. Competition on scientific literature parsing. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 605-617. LNCS 12824, SpringerVerlag, sep 2021. @@ -139,14 +139,14 @@ Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE Transactions on Pattern Analysis and Machine Intelligence , 39(6):1137-1149, 2017. Kaiming He, Georgia Gkioxari, Piotr Dollár, and Ross B. Girshick. Mask R-CNN. In IEEE International Conference on Computer Vision , ICCV, pages 2980-2988. IEEE Computer Society, Oct 2017. 
Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012, TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing, tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana, Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu - + DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis KDD ’22, August 14-18, 2022, Washington, DC, USA Text Caption List-Item Formula Table Section-Header Picture Page-Header Page-Footer Title Figure 6: Example layout predictions on selected pages from the DocLayNet test-set. (A, D) exhibit favourable results on coloured backgrounds. (B, C) show accurate list-item and paragraph differentiation despite densely-spaced lines. (E) demonstrates good table and figure distinction. (F) shows predictions on a Chinese patent with multiple overlaps, label confusion and missing boxes. Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang. ultralytics/yolov5: v6.0 - yolov5n nano models, roboflow integration, tensorflow export, opencv dnn support, October 2021. -Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021. +Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021. Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers. CoRR , abs/2005.12872, 2020. Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and efficient object detection. CoRR , abs/1911.09070, 2019. Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Dollár, and C. Lawrence Zitnick. Microsoft COCO: common objects in context, 2014. @@ -156,5 +156,5 @@ Peng Zhang, Can Li, Liang Qiao, Zhanzhan Cheng, Shiliang Pu, Yi Niu, and Fei Wu. Vsr: A unified framework for document layout analysis combining vision, semantics and relations, 2021. Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018. Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019. - + \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/2206.01062.json b/tests/data/groundtruth/docling_v2/2206.01062.json index 9a116a39..dbec5c77 100644 --- a/tests/data/groundtruth/docling_v2/2206.01062.json +++ b/tests/data/groundtruth/docling_v2/2206.01062.json @@ -10865,7 +10865,7 @@ "text": "Human Annotation : In contrast to PubLayNet and DocBank, we relied on human annotation instead of automation approaches to generate the data set.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "(1)" }, { @@ -10896,7 +10896,7 @@ "text": "Large Layout Variability : We include diverse and complex layouts from a large variety of public sources.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "(2)" }, { @@ -10927,7 +10927,7 @@ "text": "Detailed Label Set : We define 11 class labels to distinguish layout features in high detail. 
PubLayNet provides 5 labels; DocBank provides 13, although not a superset of ours.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "(3)" }, { @@ -10958,7 +10958,7 @@ "text": "Redundant Annotations : A fraction of the pages in the DocLayNet data set carry more than one human annotation.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "(4)" }, { @@ -11047,7 +11047,7 @@ "text": "Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide fixed train-, test- & validation-sets to ensure proportional representation of the class-labels. Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "(5)" }, { @@ -12429,7 +12429,7 @@ "text": "Every list-item is an individual object instance with class label List-item . This definition is different from PubLayNet and DocBank, where all list-items are grouped together into one List object.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "(1)" }, { @@ -12460,7 +12460,7 @@ "text": "A List-item is a paragraph with hanging indentation. Singleline elements can qualify as List-item if the neighbour elements expose hanging indentation. Bullet or enumeration symbols are not a requirement.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "(2)" }, { @@ -12491,7 +12491,7 @@ "text": "For every Caption , there must be exactly one corresponding Picture or Table .", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "(3)" }, { @@ -12522,7 +12522,7 @@ "text": "Connected sub-pictures are grouped together in one Picture object.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "(4)" }, { @@ -12553,7 +12553,7 @@ "text": "Formula numbers are included in a Formula object.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "(5)" }, { @@ -12584,7 +12584,7 @@ "text": "Emphasised text (e.g. in italic or bold) at the beginning of a paragraph is not considered a Section-header , unless it appears exclusively on its own line.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "(6)" }, { @@ -14712,7 +14712,7 @@ "text": "Max G\u00f6bel, Tamir Hassan, Ermelinda Oro, and Giorgio Orsi. Icdar 2013 table competition. In 2013 12th International Conference on Document Analysis and Recognition , pages 1449-1453, 2013.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[1]" }, { @@ -14743,7 +14743,7 @@ "text": "Christian Clausner, Apostolos Antonacopoulos, and Stefan Pletschacher. Icdar2017 competition on recognition of documents with complex layouts rdcl2017. In 2017 14th IAPR International Conference on Document Analysis and Recognition (ICDAR) , volume 01, pages 1404-1410, 2017.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[2]" }, { @@ -14774,7 +14774,7 @@ "text": "Herv\u00e9 D\u00e9jean, Jean-Luc Meunier, Liangcai Gao, Yilun Huang, Yu Fang, Florian Kleber, and Eva-Maria Lang. ICDAR 2019 Competition on Table Detection and Recognition (cTDaR), April 2019. 
http://sac.founderit.com/.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[3]" }, { @@ -14805,7 +14805,7 @@ "text": "Antonio Jimeno Yepes, Peter Zhong, and Douglas Burdick. Competition on scientific literature parsing. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 605-617. LNCS 12824, SpringerVerlag, sep 2021.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[4]" }, { @@ -14836,7 +14836,7 @@ "text": "Logan Markewich, Hao Zhang, Yubin Xing, Navid Lambert-Shirzad, Jiang Zhexin, Roy Lee, Zhi Li, and Seok-Bum Ko. Segmentation for document layout analysis: not dead yet. International Journal on Document Analysis and Recognition (IJDAR) , pages 1-11, 01 2022.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[5]" }, { @@ -14867,7 +14867,7 @@ "text": "Xu Zhong, Jianbin Tang, and Antonio Jimeno-Yepes. Publaynet: Largest dataset ever for document layout analysis. In Proceedings of the International Conference on Document Analysis and Recognition , ICDAR, pages 1015-1022, sep 2019.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[6]" }, { @@ -14898,7 +14898,7 @@ "text": "Minghao Li, Yiheng Xu, Lei Cui, Shaohan Huang, Furu Wei, Zhoujun Li, and Ming Zhou. Docbank: A benchmark dataset for document layout analysis. In Proceedings of the 28th International Conference on Computational Linguistics , COLING, pages 949-960. International Committee on Computational Linguistics, dec 2020.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[7]" }, { @@ -14929,7 +14929,7 @@ "text": "Riaz Ahmad, Muhammad Tanvir Afzal, and M. Qadir. Information extraction from pdf sources based on rule-based system using integrated formats. In SemWebEval@ESWC , 2016.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[8]" }, { @@ -14960,7 +14960,7 @@ "text": "Ross B. Girshick, Jeff Donahue, Trevor Darrell, and Jitendra Malik. Rich feature hierarchies for accurate object detection and semantic segmentation. In IEEE Conference on Computer Vision and Pattern Recognition , CVPR, pages 580-587. IEEE Computer Society, jun 2014.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[9]" }, { @@ -14991,7 +14991,7 @@ "text": "Ross B. Girshick. Fast R-CNN. In 2015 IEEE International Conference on Computer Vision , ICCV, pages 1440-1448. IEEE Computer Society, dec 2015.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[10]" }, { @@ -15022,7 +15022,7 @@ "text": "Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun. Faster r-cnn: Towards real-time object detection with region proposal networks. IEEE Transactions on Pattern Analysis and Machine Intelligence , 39(6):1137-1149, 2017.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[11]" }, { @@ -15053,7 +15053,7 @@ "text": "Kaiming He, Georgia Gkioxari, Piotr Doll\u00e1r, and Ross B. Girshick. Mask R-CNN. In IEEE International Conference on Computer Vision , ICCV, pages 2980-2988. 
IEEE Computer Society, Oct 2017.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[12]" }, { @@ -15084,7 +15084,7 @@ "text": "Glenn Jocher, Alex Stoken, Ayush Chaurasia, Jirka Borovec, NanoCode012, TaoXie, Yonghye Kwon, Kalen Michael, Liu Changyu, Jiacong Fang, Abhiram V, Laughing, tkianai, yxNONG, Piotr Skalski, Adam Hogan, Jebastin Nadar, imyhxy, Lorenzo Mammana, Alex Wang, Cristi Fati, Diego Montes, Jan Hajek, Laurentiu", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[13]" }, { @@ -15579,7 +15579,7 @@ "text": "Shoubin Li, Xuyan Ma, Shuaiqun Pan, Jun Hu, Lin Shi, and Qing Wang. Vtlayout: Fusion of visual and text features for document layout analysis, 2021.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[20]" }, { @@ -15610,7 +15610,7 @@ "text": "Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, and Sergey Zagoruyko. End-to-end object detection with transformers. CoRR , abs/2005.12872, 2020.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[14]" }, { @@ -15641,7 +15641,7 @@ "text": "Mingxing Tan, Ruoming Pang, and Quoc V. Le. Efficientdet: Scalable and efficient object detection. CoRR , abs/1911.09070, 2019.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[15]" }, { @@ -15672,7 +15672,7 @@ "text": "Tsung-Yi Lin, Michael Maire, Serge J. Belongie, Lubomir D. Bourdev, Ross B. Girshick, James Hays, Pietro Perona, Deva Ramanan, Piotr Doll\u00e1r, and C. Lawrence Zitnick. Microsoft COCO: common objects in context, 2014.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[16]" }, { @@ -15703,7 +15703,7 @@ "text": "Yuxin Wu, Alexander Kirillov, Francisco Massa, Wan-Yen Lo, and Ross Girshick. Detectron2, 2019.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[17]" }, { @@ -15734,7 +15734,7 @@ "text": "Nikolaos Livathinos, Cesar Berrospi, Maksym Lysak, Viktor Kuropiatnyk, Ahmed Nassar, Andre Carvalho, Michele Dolfi, Christoph Auer, Kasper Dinkla, and Peter W. J. Staar. Robust pdf document conversion using recurrent neural networks. In Proceedings of the 35th Conference on Artificial Intelligence , AAAI, pages 1513715145, feb 2021.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[18]" }, { @@ -15765,7 +15765,7 @@ "text": "Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. Layoutlm: Pre-training of text and layout for document image understanding. In Proceedings of the 26th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 1192-1200, New York, USA, 2020. Association for Computing Machinery.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[19]" }, { @@ -15796,7 +15796,7 @@ "text": "Peng Zhang, Can Li, Liang Qiao, Zhanzhan Cheng, Shiliang Pu, Yi Niu, and Fei Wu. Vsr: A unified framework for document layout analysis combining vision, semantics and relations, 2021.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[21]" }, { @@ -15827,7 +15827,7 @@ "text": "Peter W J Staar, Michele Dolfi, Christoph Auer, and Costas Bekas. Corpus conversion service: A machine learning platform to ingest documents at scale. 
In Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining , KDD, pages 774-782. ACM, 2018.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[22]" }, { @@ -15858,7 +15858,7 @@ "text": "Connor Shorten and Taghi M. Khoshgoftaar. A survey on image data augmentation for deep learning. Journal of Big Data , 6(1):60, 2019.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "[23]" } ], diff --git a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json index bdf39cc8..746f8835 100644 --- a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json +++ b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json @@ -336,8 +336,8 @@ { "page_no": 1, "bbox": { - "l": 139.66741943359375, - "t": 454.45458984375, + "l": 139.6674041748047, + "t": 454.4546203613281, "r": 475.00927734375, "b": 322.5054626464844, "coord_origin": "BOTTOMLEFT" diff --git a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json index 3010fbb6..3c219d95 100644 --- a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json +++ b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json @@ -2705,7 +2705,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373534917831421, + "confidence": 0.9373533129692078, "cells": [ { "index": 0, @@ -2745,7 +2745,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858680725097656, + "confidence": 0.8858679533004761, "cells": [ { "index": 1, @@ -2785,7 +2785,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806433916091919, + "confidence": 0.9806435108184814, "cells": [ { "index": 2, @@ -2940,7 +2940,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.98504239320755, + "confidence": 0.9850425124168396, "cells": [ { "index": 7, @@ -3155,7 +3155,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591909050941467, + "confidence": 0.9591907262802124, "cells": [ { "index": 15, @@ -3339,8 +3339,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.66741943359375, - "t": 337.54541015625, + "l": 139.6674041748047, + "t": 337.5453796386719, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -7846,7 +7846,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589294195175171, + "confidence": 0.9589295387268066, "cells": [ { "index": 91, @@ -7911,7 +7911,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849975109100342, + "confidence": 0.9849976301193237, "cells": [ { "index": 93, @@ -8243,8 +8243,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.66741943359375, - "t": 337.54541015625, + "l": 139.6674041748047, + "t": 337.5453796386719, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -13641,7 +13641,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373534917831421, + "confidence": 0.9373533129692078, "cells": [ { "index": 0, @@ -13687,7 +13687,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858680725097656, + "confidence": 0.8858679533004761, "cells": [ { "index": 1, @@ -13733,7 +13733,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806433916091919, + "confidence": 0.9806435108184814, "cells": [ { "index": 2, @@ -13900,7 +13900,7 @@ "b": 255.42400999999995, 
"coord_origin": "TOPLEFT" }, - "confidence": 0.98504239320755, + "confidence": 0.9850425124168396, "cells": [ { "index": 7, @@ -14121,7 +14121,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591909050941467, + "confidence": 0.9591907262802124, "cells": [ { "index": 15, @@ -14311,8 +14311,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.66741943359375, - "t": 337.54541015625, + "l": 139.6674041748047, + "t": 337.5453796386719, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -19701,7 +19701,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589294195175171, + "confidence": 0.9589295387268066, "cells": [ { "index": 91, @@ -19772,7 +19772,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849975109100342, + "confidence": 0.9849976301193237, "cells": [ { "index": 93, @@ -20116,7 +20116,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806433916091919, + "confidence": 0.9806435108184814, "cells": [ { "index": 2, @@ -20283,7 +20283,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.98504239320755, + "confidence": 0.9850425124168396, "cells": [ { "index": 7, @@ -20504,7 +20504,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591909050941467, + "confidence": 0.9591907262802124, "cells": [ { "index": 15, @@ -20694,8 +20694,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.66741943359375, - "t": 337.54541015625, + "l": 139.6674041748047, + "t": 337.5453796386719, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -26084,7 +26084,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589294195175171, + "confidence": 0.9589295387268066, "cells": [ { "index": 91, @@ -26155,7 +26155,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849975109100342, + "confidence": 0.9849976301193237, "cells": [ { "index": 93, @@ -26499,7 +26499,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373534917831421, + "confidence": 0.9373533129692078, "cells": [ { "index": 0, @@ -26545,7 +26545,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858680725097656, + "confidence": 0.8858679533004761, "cells": [ { "index": 1, diff --git a/tests/data/groundtruth/docling_v2/2305.03393v1.doctags.txt b/tests/data/groundtruth/docling_v2/2305.03393v1.doctags.txt index 2d0bedb6..4809cf8d 100644 --- a/tests/data/groundtruth/docling_v2/2305.03393v1.doctags.txt +++ b/tests/data/groundtruth/docling_v2/2305.03393v1.doctags.txt @@ -62,9 +62,9 @@ Fig. 3. OTSL description of table structure: A - table example; B - graphical representation of table structure; C - mapping structure on a grid; D - OTSL structure encoding; E - explanation on cell encoding 4.2 Language Syntax The OTSL representation follows these syntax rules: -Left-looking cell rule : The left neighbour of an "L" cell must be either another "L" cell or a "C" cell. +Left-looking cell rule : The left neighbour of an "L" cell must be either another "L" cell or a "C" cell. Up-looking cell rule : The upper neighbour of a "U" cell must be either another "U" cell or a "C" cell. - + 3. Cross cell rule : The left neighbour of an "X" cell must be either another "X" cell or a "U" cell, and the upper neighbour of an "X" cell must be either another "X" cell or an "L" cell. First row rule : Only "L" cells and "C" cells are allowed in the first row. 
@@ -114,15 +114,15 @@ First and foremost, given the same network configuration, inference time for a table-structure prediction is about 2 times faster compared to the conventional HTML approach. This is primarily owed to the shorter sequence length of the OTSL representation. Additional performance benefits can be obtained with HPO (hyper parameter optimization). As we demonstrate in our experiments, models trained on OTSL can be significantly smaller, e.g. by reducing the number of encoder and decoder layers, while preserving comparatively good prediction quality. This can further improve inference performance, yielding 5-6 times faster inference speed in OTSL with prediction quality comparable to models trained on HTML (see Table 1). Secondly, OTSL has more inherent structure and a significantly restricted vocabulary size. This allows autoregressive models to perform better in the TED metric, but especially with regards to prediction accuracy of the table-cell bounding boxes (see Table 2). As shown in Figure 5, we observe that the OTSL drastically reduces the drift for table cell bounding boxes at high row count and in sparse tables. This leads to more accurate predictions and a significant reduction in post-processing complexity, which is an undesired necessity in HTML-based Im2Seq models. Significant novelty lies in OTSL syntactical rules, which are few, simple and always backwards looking. Each new token can be validated only by analyzing the sequence of previous tokens, without requiring the entire sequence to detect mistakes. This in return allows to perform structural error detection and correction on-the-fly during sequence generation. References -Auer, C., Dolfi, M., Carvalho, A., Ramis, C.B., Staar, P.W.J.: Delivering document conversion as a cloud service with high throughput and responsiveness. CoRR abs/2206.00785 (2022). https://doi.org/10.48550/arXiv.2206.00785 , https://doi.org/10.48550/arXiv.2206.00785 +Auer, C., Dolfi, M., Carvalho, A., Ramis, C.B., Staar, P.W.J.: Delivering document conversion as a cloud service with high throughput and responsiveness. CoRR abs/2206.00785 (2022). https://doi.org/10.48550/arXiv.2206.00785 , https://doi.org/10.48550/arXiv.2206.00785 Chen, B., Peng, D., Zhang, J., Ren, Y., Jin, L.: Complex table structure recognition in the wild using transformer and identity matrix-based augmentation. In: Porwal, U., Fornés, A., Shafait, F. (eds.) Frontiers in Handwriting Recognition. pp. 545561. Springer International Publishing, Cham (2022) Chi, Z., Huang, H., Xu, H.D., Yu, H., Yin, W., Mao, X.L.: Complicated table structure recognition. arXiv preprint arXiv:1908.04729 (2019) Deng, Y., Rosenberg, D., Mann, G.: Challenges in end-to-end neural scientific table recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 894-901. IEEE (2019) - + Optimized Table Tokenization for Table Structure Recognition 13 -Kayal, P., Anand, M., Desai, H., Singh, M.: Tables to latex: structure and content extraction from scientific tables. International Journal on Document Analysis and Recognition (IJDAR) pp. 1-10 (2022) +Kayal, P., Anand, M., Desai, H., Singh, M.: Tables to latex: structure and content extraction from scientific tables. International Journal on Document Analysis and Recognition (IJDAR) pp. 1-10 (2022) Lee, E., Kwon, J., Yang, H., Park, J., Lee, S., Koo, H.I., Cho, N.I.: Table structure recognition based on grid shape graph. 
In: 2022 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC). pp. 18681873. IEEE (2022) Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: A benchmark dataset for table detection and recognition (2019) Livathinos, N., Berrospi, C., Lysak, M., Kuropiatnyk, V., Nassar, A., Carvalho, A., Dolfi, M., Auer, C., Dinkla, K., Staar, P.: Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence 35 (17), 15137-15145 (May 2021), https://ojs.aaai.org/index.php/ AAAI/article/view/17777 @@ -135,15 +135,15 @@ Staar, P.W.J., Dolfi, M., Auer, C., Bekas, C.: Corpus conversion service: A machine learning platform to ingest documents at scale. In: Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. pp. 774-782. KDD '18, Association for Computing Machinery, New York, NY, USA (2018). https://doi.org/10.1145/3219819.3219834 , https://doi.org/10. 1145/3219819.3219834 Wang, X.: Tabular Abstraction, Editing, and Formatting. Ph.D. thesis, CAN (1996), aAINN09397 Xue, W., Li, Q., Tao, D.: Res2tim: Reconstruct syntactic structures from table images. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 749-755. IEEE (2019) - + 14 M. Lysak, et al. -Xue, W., Yu, B., Wang, W., Tao, D., Li, Q.: Tgrnet: A table graph reconstruction network for table structure recognition. In: Proceedings of the IEEE/CVF International Conference on Computer Vision. pp. 1295-1304 (2021) +Xue, W., Yu, B., Wang, W., Tao, D., Li, Q.: Tgrnet: A table graph reconstruction network for table structure recognition. In: Proceedings of the IEEE/CVF International Conference on Computer Vision. pp. 1295-1304 (2021) Ye, J., Qi, X., He, Y., Chen, Y., Gu, D., Gao, P., Xiao, R.: Pingan-vcgroup's solution for icdar 2021 competition on scientific literature parsing task b: Table recognition to html (2021). https://doi.org/10.48550/ARXIV.2105.01848 , https://arxiv.org/abs/2105.01848 Zhang, Z., Zhang, J., Du, J., Wang, F.: Split, embed and merge: An accurate table structure recognizer. Pattern Recognition 126 , 108565 (2022) Zheng, X., Burdick, D., Popa, L., Zhong, X., Wang, N.X.R.: Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. In: 2021 IEEE Winter Conference on Applications of Computer Vision (WACV). pp. 697-706 (2021). https://doi.org/10.1109/WACV48630.2021. 00074 Zhong, X., ShafieiBavani, E., Jimeno Yepes, A.: Image-based table recognition: Data, model, and evaluation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.M. (eds.) Computer Vision - ECCV 2020. pp. 564-580. Springer International Publishing, Cham (2020) Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for document layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015-1022. IEEE (2019) - + \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/2305.03393v1.json b/tests/data/groundtruth/docling_v2/2305.03393v1.json index a9ff7da8..e16db1c9 100644 --- a/tests/data/groundtruth/docling_v2/2305.03393v1.json +++ b/tests/data/groundtruth/docling_v2/2305.03393v1.json @@ -7273,7 +7273,7 @@ "text": "Left-looking cell rule : The left neighbour of an \"L\" cell must be either another \"L\" cell or a \"C\" cell.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "1." 
}, { @@ -7304,7 +7304,7 @@ "text": "Up-looking cell rule : The upper neighbour of a \"U\" cell must be either another \"U\" cell or a \"C\" cell.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "2." }, { @@ -7396,7 +7396,7 @@ "text": "First row rule : Only \"L\" cells and \"C\" cells are allowed in the first row.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "4." }, { @@ -7427,7 +7427,7 @@ "text": "First column rule : Only \"U\" cells and \"C\" cells are allowed in the first column.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "5." }, { @@ -7458,7 +7458,7 @@ "text": "Rectangular rule : The table representation is always rectangular - all rows must have an equal number of tokens, terminated with \"NL\" token.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "6." }, { @@ -13818,7 +13818,7 @@ "text": "Auer, C., Dolfi, M., Carvalho, A., Ramis, C.B., Staar, P.W.J.: Delivering document conversion as a cloud service with high throughput and responsiveness. CoRR abs/2206.00785 (2022). https://doi.org/10.48550/arXiv.2206.00785 , https://doi.org/10.48550/arXiv.2206.00785", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "1." }, { @@ -13849,7 +13849,7 @@ "text": "Chen, B., Peng, D., Zhang, J., Ren, Y., Jin, L.: Complex table structure recognition in the wild using transformer and identity matrix-based augmentation. In: Porwal, U., Forn\u00e9s, A., Shafait, F. (eds.) Frontiers in Handwriting Recognition. pp. 545561. Springer International Publishing, Cham (2022)", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "2." }, { @@ -13880,7 +13880,7 @@ "text": "Chi, Z., Huang, H., Xu, H.D., Yu, H., Yin, W., Mao, X.L.: Complicated table structure recognition. arXiv preprint arXiv:1908.04729 (2019)", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "3." }, { @@ -13911,7 +13911,7 @@ "text": "Deng, Y., Rosenberg, D., Mann, G.: Challenges in end-to-end neural scientific table recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 894-901. IEEE (2019)", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "4." }, { @@ -14000,7 +14000,7 @@ "text": "Kayal, P., Anand, M., Desai, H., Singh, M.: Tables to latex: structure and content extraction from scientific tables. International Journal on Document Analysis and Recognition (IJDAR) pp. 1-10 (2022)", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "5." }, { @@ -14031,7 +14031,7 @@ "text": "Lee, E., Kwon, J., Yang, H., Park, J., Lee, S., Koo, H.I., Cho, N.I.: Table structure recognition based on grid shape graph. In: 2022 Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC). pp. 18681873. IEEE (2022)", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "6." }, { @@ -14062,7 +14062,7 @@ "text": "Li, M., Cui, L., Huang, S., Wei, F., Zhou, M., Li, Z.: Tablebank: A benchmark dataset for table detection and recognition (2019)", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "7." 
}, { @@ -14093,7 +14093,7 @@ "text": "Livathinos, N., Berrospi, C., Lysak, M., Kuropiatnyk, V., Nassar, A., Carvalho, A., Dolfi, M., Auer, C., Dinkla, K., Staar, P.: Robust pdf document conversion using recurrent neural networks. Proceedings of the AAAI Conference on Artificial Intelligence 35 (17), 15137-15145 (May 2021), https://ojs.aaai.org/index.php/ AAAI/article/view/17777", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "8." }, { @@ -14124,7 +14124,7 @@ "text": "Nassar, A., Livathinos, N., Lysak, M., Staar, P.: Tableformer: Table structure understanding with transformers. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4614-4623 (June 2022)", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "9." }, { @@ -14155,7 +14155,7 @@ "text": "Pfitzmann, B., Auer, C., Dolfi, M., Nassar, A.S., Staar, P.W.J.: Doclaynet: A large human-annotated dataset for document-layout segmentation. In: Zhang, A., Rangwala, H. (eds.) KDD '22: The 28th ACM SIGKDD Conference on Knowledge Discovery and Data Mining, Washington, DC, USA, August 14 - 18, 2022. pp. 3743-3751. ACM (2022). https://doi.org/10.1145/3534678.3539043 , https:// doi.org/10.1145/3534678.3539043", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "10." }, { @@ -14186,7 +14186,7 @@ "text": "Prasad, D., Gadpal, A., Kapadni, K., Visave, M., Sultanpure, K.: Cascadetabnet: An approach for end to end table detection and structure recognition from imagebased documents. In: Proceedings of the IEEE/CVF conference on computer vision and pattern recognition workshops. pp. 572-573 (2020)", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "11." }, { @@ -14217,7 +14217,7 @@ "text": "Schreiber, S., Agne, S., Wolf, I., Dengel, A., Ahmed, S.: Deepdesrt: Deep learning for detection and structure recognition of tables in document images. In: 2017 14th IAPR international conference on document analysis and recognition (ICDAR). vol. 1, pp. 1162-1167. IEEE (2017)", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "12." }, { @@ -14248,7 +14248,7 @@ "text": "Siddiqui, S.A., Fateh, I.A., Rizvi, S.T.R., Dengel, A., Ahmed, S.: Deeptabstr: Deep learning based table structure recognition. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1403-1409 (2019). https:// doi.org/10.1109/ICDAR.2019.00226", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "13." }, { @@ -14279,7 +14279,7 @@ "text": "Smock, B., Pesala, R., Abraham, R.: PubTables-1M: Towards comprehensive table extraction from unstructured documents. In: Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR). pp. 4634-4642 (June 2022)", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "14." }, { @@ -14310,7 +14310,7 @@ "text": "Staar, P.W.J., Dolfi, M., Auer, C., Bekas, C.: Corpus conversion service: A machine learning platform to ingest documents at scale. In: Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery & Data Mining. pp. 774-782. KDD '18, Association for Computing Machinery, New York, NY, USA (2018). https://doi.org/10.1145/3219819.3219834 , https://doi.org/10. 
1145/3219819.3219834", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "15." }, { @@ -14341,7 +14341,7 @@ "text": "Wang, X.: Tabular Abstraction, Editing, and Formatting. Ph.D. thesis, CAN (1996), aAINN09397", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "16." }, { @@ -14372,7 +14372,7 @@ "text": "Xue, W., Li, Q., Tao, D.: Res2tim: Reconstruct syntactic structures from table images. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 749-755. IEEE (2019)", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "17." }, { @@ -14461,7 +14461,7 @@ "text": "Xue, W., Yu, B., Wang, W., Tao, D., Li, Q.: Tgrnet: A table graph reconstruction network for table structure recognition. In: Proceedings of the IEEE/CVF International Conference on Computer Vision. pp. 1295-1304 (2021)", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "18." }, { @@ -14492,7 +14492,7 @@ "text": "Ye, J., Qi, X., He, Y., Chen, Y., Gu, D., Gao, P., Xiao, R.: Pingan-vcgroup's solution for icdar 2021 competition on scientific literature parsing task b: Table recognition to html (2021). https://doi.org/10.48550/ARXIV.2105.01848 , https://arxiv.org/abs/2105.01848", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "19." }, { @@ -14523,7 +14523,7 @@ "text": "Zhang, Z., Zhang, J., Du, J., Wang, F.: Split, embed and merge: An accurate table structure recognizer. Pattern Recognition 126 , 108565 (2022)", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "20." }, { @@ -14554,7 +14554,7 @@ "text": "Zheng, X., Burdick, D., Popa, L., Zhong, X., Wang, N.X.R.: Global table extractor (gte): A framework for joint table identification and cell structure recognition using visual context. In: 2021 IEEE Winter Conference on Applications of Computer Vision (WACV). pp. 697-706 (2021). https://doi.org/10.1109/WACV48630.2021. 00074", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "21." }, { @@ -14585,7 +14585,7 @@ "text": "Zhong, X., ShafieiBavani, E., Jimeno Yepes, A.: Image-based table recognition: Data, model, and evaluation. In: Vedaldi, A., Bischof, H., Brox, T., Frahm, J.M. (eds.) Computer Vision - ECCV 2020. pp. 564-580. Springer International Publishing, Cham (2020)", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "22." }, { @@ -14616,7 +14616,7 @@ "text": "Zhong, X., Tang, J., Yepes, A.J.: Publaynet: largest dataset ever for document layout analysis. In: 2019 International Conference on Document Analysis and Recognition (ICDAR). pp. 1015-1022. IEEE (2019)", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "23." } ], diff --git a/tests/data/groundtruth/docling_v2/multi_page.doctags.txt b/tests/data/groundtruth/docling_v2/multi_page.doctags.txt index 8a26c7d9..fe796df0 100644 --- a/tests/data/groundtruth/docling_v2/multi_page.doctags.txt +++ b/tests/data/groundtruth/docling_v2/multi_page.doctags.txt @@ -34,12 +34,12 @@ Key Features That Changed Word Processing The evolution of word processors wasn't just about hardware or software improvements-it was about the features that revolutionized how people wrote and edited. 
Some of these transformative features include: -Undo/Redo : Introduced in the 1980s, the ability to undo mistakes and redo actions made experimentation and error correction much easier. +Undo/Redo : Introduced in the 1980s, the ability to undo mistakes and redo actions made experimentation and error correction much easier. Spell Check and Grammar Check : By the 1990s, these became standard, allowing users to spot errors automatically. Templates : Pre-designed formats for documents, such as resumes, letters, and invoices, helped users save time. Track Changes : A game-changer for collaboration, this feature allowed multiple users to suggest edits while maintaining the original text. Real-Time Collaboration : Tools like Google Docs and Microsoft 365 enabled multiple users to edit the same document simultaneously, forever changing teamwork dynamics. - + The Cultural Impact of Word Processors The word processor didn't just change workplaces-it changed culture. It democratized writing, enabling anyone with access to a computer to produce professional-quality documents. This shift had profound implications for education, business, and creative fields: @@ -49,12 +49,12 @@ Word Processors in a Post-Digital Era As we move further into the 21st century, the role of the word processor continues to evolve: -Artificial Intelligence : Modern word processors are leveraging AI to suggest content improvements. Tools like Grammarly, ProWritingAid, and even native features in Word now analyze tone, conciseness, and clarity. Some AI systems can even generate entire paragraphs or rewrite sentences. +Artificial Intelligence : Modern word processors are leveraging AI to suggest content improvements. Tools like Grammarly, ProWritingAid, and even native features in Word now analyze tone, conciseness, and clarity. Some AI systems can even generate entire paragraphs or rewrite sentences. Integration with Other Tools : Word processors are no longer standalone. They integrate with task managers, cloud storage, and project management platforms. For instance, Google Docs syncs with Google Drive, while Microsoft Word integrates seamlessly with OneDrive and Teams. Voice Typing : Speech-to-text capabilities have made word processing more accessible, particularly for those with disabilities. Tools like Dragon NaturallySpeaking and built-in options in Google Docs and Microsoft Word have made dictation mainstream. Multimedia Documents : Word processing has expanded beyond text. Modern tools allow users to embed images, videos, charts, and interactive elements, transforming simple documents into rich multimedia experiences. Cross-Platform Accessibility : Thanks to cloud computing, documents can now be accessed and edited across devices. Whether you're on a desktop, tablet, or smartphone, you can continue working seamlessly. - + A Glimpse Into the Future The word processor's future lies in adaptability and intelligence. Some exciting possibilities include: Fully AI-Assisted Writing : Imagine a word processor that understands your writing style, drafts emails, or creates entire essays based on minimal input. 
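Note on the ground-truth churn around this point: the JSON fixture updates above and below consistently flip "enumerated" from false to true on list items that carry ordinal markers ("1.", "2.", ...), i.e. numbered list items are now reported as enumerated. A minimal spot-check of that pattern, assuming the docling-core JSON layout in which such items sit under the top-level "texts" array (illustrative sketch only, not part of the patch):

    import json
    from pathlib import Path

    # Load one of the updated ground-truth fixtures (path taken from the diff below).
    doc = json.loads(Path("tests/data/groundtruth/docling_v2/multi_page.json").read_text())

    # Report the enumerated flag for every item with a numeric marker ("1.", "2.", ...).
    for item in doc.get("texts", []):
        marker = item.get("marker") or ""
        if marker.rstrip(".").isdigit():
            print(marker, item.get("enumerated"))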
diff --git a/tests/data/groundtruth/docling_v2/multi_page.json b/tests/data/groundtruth/docling_v2/multi_page.json index 5644fc25..b6f49c4b 100644 --- a/tests/data/groundtruth/docling_v2/multi_page.json +++ b/tests/data/groundtruth/docling_v2/multi_page.json @@ -1225,7 +1225,7 @@ "text": "Undo/Redo : Introduced in the 1980s, the ability to undo mistakes and redo actions made experimentation and error correction much easier.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "1." }, { @@ -1256,7 +1256,7 @@ "text": "Spell Check and Grammar Check : By the 1990s, these became standard, allowing users to spot errors automatically.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "2." }, { @@ -1287,7 +1287,7 @@ "text": "Templates : Pre-designed formats for documents, such as resumes, letters, and invoices, helped users save time.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "3." }, { @@ -1318,7 +1318,7 @@ "text": "Track Changes : A game-changer for collaboration, this feature allowed multiple users to suggest edits while maintaining the original text.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "4." }, { @@ -1349,7 +1349,7 @@ "text": "Real-Time Collaboration : Tools like Google Docs and Microsoft 365 enabled multiple users to edit the same document simultaneously, forever changing teamwork dynamics.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "5." }, { @@ -1591,7 +1591,7 @@ "text": "Artificial Intelligence : Modern word processors are leveraging AI to suggest content improvements. Tools like Grammarly, ProWritingAid, and even native features in Word now analyze tone, conciseness, and clarity. Some AI systems can even generate entire paragraphs or rewrite sentences.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "1." }, { @@ -1622,7 +1622,7 @@ "text": "Integration with Other Tools : Word processors are no longer standalone. They integrate with task managers, cloud storage, and project management platforms. For instance, Google Docs syncs with Google Drive, while Microsoft Word integrates seamlessly with OneDrive and Teams.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "2." }, { @@ -1653,7 +1653,7 @@ "text": "Voice Typing : Speech-to-text capabilities have made word processing more accessible, particularly for those with disabilities. Tools like Dragon NaturallySpeaking and built-in options in Google Docs and Microsoft Word have made dictation mainstream.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "3." }, { @@ -1684,7 +1684,7 @@ "text": "Multimedia Documents : Word processing has expanded beyond text. Modern tools allow users to embed images, videos, charts, and interactive elements, transforming simple documents into rich multimedia experiences.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "4." }, { @@ -1715,7 +1715,7 @@ "text": "Cross-Platform Accessibility : Thanks to cloud computing, documents can now be accessed and edited across devices. Whether you're on a desktop, tablet, or smartphone, you can continue working seamlessly.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "5." 
}, { diff --git a/tests/data/groundtruth/docling_v2/redp5110_sampled.doctags.txt b/tests/data/groundtruth/docling_v2/redp5110_sampled.doctags.txt index d85ecd36..5775c5ca 100644 --- a/tests/data/groundtruth/docling_v2/redp5110_sampled.doctags.txt +++ b/tests/data/groundtruth/docling_v2/redp5110_sampled.doctags.txt @@ -161,10 +161,10 @@ The VERIFY_GROUP_FOR_USER function was added in IBM i 7.2. Although it is primarily intended for use with RCAC permissions and masks, it can be used in other SQL statements. The first parameter must be one of these three special registers: SESSION_USER, USER, or CURRENT_USER. The second and subsequent parameters are a list of user or group profiles. Each of these values must be 1 - 10 characters in length. These values are not validated for their existence, which means that you can specify the names of user profiles that do not exist without receiving any kind of error. If a special register value is in the list of user profiles or it is a member of a group profile included in the list, the function returns a long integer value of 1. Otherwise, it returns a value of 0. It never returns the null value. Here is an example of using the VERIFY_GROUP_FOR_USER function: -There are user profiles for MGR, JANE, JUDY, and TONY. +There are user profiles for MGR, JANE, JUDY, and TONY. The user profile JANE specifies a group profile of MGR. If a user is connected to the server using user profile JANE, all of the following function invocations return a value of 1: - + <_unknown_>VERIFY_GROUP_FOR_USER (CURRENT_USER, 'MGR') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR') VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JANE', 'MGR', 'STEVE') The following function invocation returns a value of 0: VERIFY_GROUP_FOR_USER (CURRENT_USER, 'JUDY', 'TONY') 20 Row and Column Access Control Support in IBM DB2 for i @@ -172,24 +172,24 @@ RETURN CASE <_unknown_>WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'HR', 'EMP' ) = 1 THEN EMPLOYEES . DATE_OF_BIRTH WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER = EMPLOYEES . USER_ID THEN EMPLOYEES . DATE_OF_BIRTH WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER <> EMPLOYEES . USER_ID THEN ( 9999 || '-' || MONTH ( EMPLOYEES . DATE_OF_BIRTH ) || '-' || DAY (EMPLOYEES.DATE_OF_BIRTH )) ELSE NULL END ENABLE ; -The other column to mask in this example is the TAX_ID information. In this example, the rules to enforce include the following ones: +The other column to mask in this example is the TAX_ID information. In this example, the rules to enforce include the following ones: -Human Resources can see the unmasked TAX_ID of the employees. -Employees can see only their own unmasked TAX_ID. -Managers see a masked version of TAX_ID with the first five characters replaced with the X character (for example, XXX-XX-1234). -Any other person sees the entire TAX_ID as masked, for example, XXX-XX-XXXX. To implement this column mask, run the SQL statement that is shown in Example 3-9. - + <_unknown_>CREATE MASK HR_SCHEMA.MASK_TAX_ID_ON_EMPLOYEES ON HR_SCHEMA.EMPLOYEES AS EMPLOYEES FOR COLUMN TAX_ID RETURN CASE WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'HR' ) = 1 THEN EMPLOYEES . TAX_ID WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER = EMPLOYEES . USER_ID THEN EMPLOYEES . TAX_ID WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER <> EMPLOYEES . USER_ID THEN ( 'XXX-XX-' CONCAT QSYS2 . SUBSTR ( EMPLOYEES . 
TAX_ID , 8 , 4 ) ) WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'EMP' ) = 1 THEN EMPLOYEES . TAX_ID ELSE 'XXX-XX-XXXX' END ENABLE ;Example 3-9 Creating a mask on the TAX_ID column Chapter 3. Row and Column Access Control 27 -Figure 3-10 shows the masks that are created in the HR_SCHEMA. - +Figure 3-10 shows the masks that are created in the HR_SCHEMA. + Figure 3-10 Column masks shown in System i Navigator 3.6.6 Activating RCAC Now that you have created the row permission and the two column masks, RCAC must be activated. The row permission and the two column masks are enabled (last clause in the scripts), but now you must activate RCAC on the table. To do so, complete the following steps: -Run the SQL statements that are shown in Example 3-10. - +Run the SQL statements that are shown in Example 3-10. + Example 3-10 Activating RCAC on the EMPLOYEES table /* Active Row Access Control (permissions) */ /* Active Column Access Control (masks) @@ -198,15 +198,15 @@ ALTER TABLE HR_SCHEMA.EMPLOYEES ACTIVATE ROW ACCESS CONTROL ACTIVATE COLUMN ACCESS CONTROL; -Look at the definition of the EMPLOYEE table, as shown in Figure 3-11. To do this, from the main navigation pane of System i Navigator, click Schemas  HR_SCHEMA  Tables , right-click the EMPLOYEES table, and click Definition . - +Look at the definition of the EMPLOYEE table, as shown in Figure 3-11. To do this, from the main navigation pane of System i Navigator, click Schemas  HR_SCHEMA  Tables , right-click the EMPLOYEES table, and click Definition . + Figure 3-11 Selecting the EMPLOYEES table from System i Navigator 28 Row and Column Access Control Support in IBM DB2 for i -Figure 4-68 shows the Visual Explain of the same SQL statement, but with RCAC enabled. It is clear that the implementation of the SQL statement is more complex because the row permission rule becomes part of the WHERE clause. +Figure 4-68 shows the Visual Explain of the same SQL statement, but with RCAC enabled. It is clear that the implementation of the SQL statement is more complex because the row permission rule becomes part of the WHERE clause. Compare the advised indexes that are provided by the Optimizer without RCAC and with RCAC enabled. Figure 4-69 shows the index advice for the SQL statement without RCAC enabled. The index being advised is for the ORDER BY clause. - + Figure 4-68 Visual Explain with RCAC enabled Figure 4-69 Index advice with no RCAC Chapter 4. Implementing Row and Column Access Control: Banking example diff --git a/tests/data/groundtruth/docling_v2/redp5110_sampled.json b/tests/data/groundtruth/docling_v2/redp5110_sampled.json index 32bf5ada..d2a0e027 100644 --- a/tests/data/groundtruth/docling_v2/redp5110_sampled.json +++ b/tests/data/groundtruth/docling_v2/redp5110_sampled.json @@ -6786,7 +6786,7 @@ "text": "There are user profiles for MGR, JANE, JUDY, and TONY.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "1." }, { @@ -6817,7 +6817,7 @@ "text": "The user profile JANE specifies a group profile of MGR.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "2." }, { @@ -6848,7 +6848,7 @@ "text": "If a user is connected to the server using user profile JANE, all of the following function invocations return a value of 1:", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "3." }, { @@ -7063,7 +7063,7 @@ "text": "The other column to mask in this example is the TAX_ID information. 
In this example, the rules to enforce include the following ones:", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "2." }, { @@ -7378,7 +7378,7 @@ "text": "Figure 3-10 shows the masks that are created in the HR_SCHEMA.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "3." }, { @@ -7497,7 +7497,7 @@ "text": "Run the SQL statements that are shown in Example 3-10.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "1." }, { @@ -7736,7 +7736,7 @@ "text": "Look at the definition of the EMPLOYEE table, as shown in Figure 3-11. To do this, from the main navigation pane of System i Navigator, click Schemas \uf0ae HR_SCHEMA \uf0ae Tables , right-click the EMPLOYEES table, and click Definition .", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "2." }, { @@ -7854,7 +7854,7 @@ "text": "Figure 4-68 shows the Visual Explain of the same SQL statement, but with RCAC enabled. It is clear that the implementation of the SQL statement is more complex because the row permission rule becomes part of the WHERE clause.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "2." }, { @@ -7914,7 +7914,7 @@ "text": "Compare the advised indexes that are provided by the Optimizer without RCAC and with RCAC enabled. Figure 4-69 shows the index advice for the SQL statement without RCAC enabled. The index being advised is for the ORDER BY clause.", "formatting": null, "hyperlink": null, - "enumerated": false, + "enumerated": true, "marker": "3." }, { diff --git a/tests/data/groundtruth/docling_v2/redp5110_sampled.md b/tests/data/groundtruth/docling_v2/redp5110_sampled.md index 31303c36..11277eab 100644 --- a/tests/data/groundtruth/docling_v2/redp5110_sampled.md +++ b/tests/data/groundtruth/docling_v2/redp5110_sampled.md @@ -335,11 +335,11 @@ WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'HR', 'EMP' ) = 1 THEN EMPLOYEES . D ``` 2. The other column to mask in this example is the TAX\_ID information. In this example, the rules to enforce include the following ones: -- -Human Resources can see the unmasked TAX\_ID of the employees. -- -Employees can see only their own unmasked TAX\_ID. -- -Managers see a masked version of TAX\_ID with the first five characters replaced with the X character (for example, XXX-XX-1234). -- -Any other person sees the entire TAX\_ID as masked, for example, XXX-XX-XXXX. -- To implement this column mask, run the SQL statement that is shown in Example 3-9. +2. -Human Resources can see the unmasked TAX\_ID of the employees. +3. -Employees can see only their own unmasked TAX\_ID. +4. -Managers see a masked version of TAX\_ID with the first five characters replaced with the X character (for example, XXX-XX-1234). +5. -Any other person sees the entire TAX\_ID as masked, for example, XXX-XX-XXXX. +6. To implement this column mask, run the SQL statement that is shown in Example 3-9. ``` CREATE MASK HR_SCHEMA.MASK_TAX_ID_ON_EMPLOYEES ON HR_SCHEMA.EMPLOYEES AS EMPLOYEES FOR COLUMN TAX_ID RETURN CASE WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'HR' ) = 1 THEN EMPLOYEES . TAX_ID WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER = EMPLOYEES . USER_ID THEN EMPLOYEES . TAX_ID WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'MGR' ) = 1 AND SESSION_USER <> EMPLOYEES . USER_ID THEN ( 'XXX-XX-' CONCAT QSYS2 . SUBSTR ( EMPLOYEES . 
TAX_ID , 8 , 4 ) ) WHEN VERIFY_GROUP_FOR_USER ( SESSION_USER , 'EMP' ) = 1 THEN EMPLOYEES . TAX_ID ELSE 'XXX-XX-XXXX' END ENABLE ; diff --git a/uv.lock b/uv.lock index c98b69b6..f72c2832 100644 --- a/uv.lock +++ b/uv.lock @@ -905,7 +905,7 @@ requires-dist = [ { name = "beautifulsoup4", specifier = ">=4.12.3,<5.0.0" }, { name = "certifi", specifier = ">=2024.7.4" }, { name = "docling-core", extras = ["chunking"], specifier = ">=2.39.0,<3.0.0" }, - { name = "docling-ibm-models", specifier = ">=3.6.0,<4" }, + { name = "docling-ibm-models", git = "https://github.com/docling-project/docling-ibm-models?rev=nli%2Fauto_layout_predictor" }, { name = "docling-parse", specifier = ">=4.0.0,<5.0.0" }, { name = "easyocr", specifier = ">=1.7,<2.0" }, { name = "filetype", specifier = ">=1.2.0,<2.0.0" }, @@ -1006,8 +1006,8 @@ chunking = [ [[package]] name = "docling-ibm-models" -version = "3.6.0" -source = { registry = "https://pypi.org/simple" } +version = "3.7.0" +source = { git = "https://github.com/docling-project/docling-ibm-models?rev=nli%2Fauto_layout_predictor#6860b6f68ef004f9a01cee2b146fc17798c39332" } dependencies = [ { name = "docling-core" }, { name = "huggingface-hub" }, @@ -1024,10 +1024,6 @@ dependencies = [ { name = "tqdm" }, { name = "transformers" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/dc/67/df762264044b91036da27bce2304d78035a38b9060e1a0cef46a98a510c7/docling_ibm_models-3.6.0.tar.gz", hash = "sha256:1ff8ef143d6a41f3d9ae22a2fed297524c3fe45235368e1919962a903b65d9d3", size = 85129, upload-time = "2025-06-20T09:19:30.194Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f4/d7/e7ea203d57e4f3eed7d4ccbbdc62e7828ee2ebcf2a9b7f61097537b5c88c/docling_ibm_models-3.6.0-py3-none-any.whl", hash = "sha256:f61a1ca278b55a9dc2570d4c69d62281a1dc9a1e6e08bbab57940b612cab383d", size = 84779, upload-time = "2025-06-20T09:19:28.923Z" }, -] [[package]] name = "docling-parse" @@ -3367,7 +3363,7 @@ name = "nvidia-cudnn-cu12" version = "9.5.1.17" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12", marker = "python_full_version < '3.10' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, + { name = "nvidia-cublas-cu12", marker = "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/2a/78/4535c9c7f859a64781e43c969a3a7e84c54634e319a996d43ef32ce46f83/nvidia_cudnn_cu12-9.5.1.17-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:30ac3869f6db17d170e0e556dd6cc5eee02647abc31ca856634d5a40f82c15b2", size = 570988386, upload-time = "2024-10-25T19:54:26.39Z" }, @@ -3378,7 +3374,7 @@ name = "nvidia-cufft-cu12" version = "11.3.0.4" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12", marker = "python_full_version < '3.10' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, + { name = "nvidia-nvjitlink-cu12", marker = "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/8f/16/73727675941ab8e6ffd86ca3a4b7b47065edcca7a997920b831f8147c99d/nvidia_cufft_cu12-11.3.0.4-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:ccba62eb9cef5559abd5e0d54ceed2d9934030f51163df018532142a8ec533e5", size = 200221632, upload-time = "2024-11-20T17:41:32.357Z" }, @@ -3407,9 +3403,9 @@ name = "nvidia-cusolver-cu12" version = "11.7.1.2" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12", marker = "python_full_version < '3.10' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, - { name = "nvidia-cusparse-cu12", marker = "python_full_version < '3.10' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, - { name = "nvidia-nvjitlink-cu12", marker = "python_full_version < '3.10' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, + { name = "nvidia-cublas-cu12", marker = "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "nvidia-cusparse-cu12", marker = "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, + { name = "nvidia-nvjitlink-cu12", marker = "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/f0/6e/c2cf12c9ff8b872e92b4a5740701e51ff17689c4d726fca91875b07f655d/nvidia_cusolver_cu12-11.7.1.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:e9e49843a7707e42022babb9bcfa33c29857a93b88020c4e4434656a655b698c", size = 158229790, upload-time = "2024-11-20T17:43:43.211Z" }, @@ -3421,7 +3417,7 @@ name = "nvidia-cusparse-cu12" version = "12.5.4.2" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12", marker = "python_full_version < '3.10' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, + { name = "nvidia-nvjitlink-cu12", marker = "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/06/1e/b8b7c2f4099a37b96af5c9bb158632ea9e5d9d27d7391d7eb8fc45236674/nvidia_cusparse_cu12-12.5.4.2-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7556d9eca156e18184b94947ade0fba5bb47d69cec46bf8660fd2c71a4b48b73", size = 216561367, upload-time = "2024-11-20T17:44:54.824Z" }, @@ -3466,10 +3462,10 @@ name = "ocrmac" version = "1.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "click", version = "8.1.8", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "click", version = "8.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "pillow" }, - { name = "pyobjc-framework-vision" }, + { name = "click", version = "8.1.8", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux')" }, + { name = "click", version = "8.2.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10' and 
sys_platform == 'darwin'" }, + { name = "pillow", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, + { name = "pyobjc-framework-vision", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/dd/dc/de3e9635774b97d9766f6815bbb3f5ec9bce347115f10d9abbf2733a9316/ocrmac-1.0.0.tar.gz", hash = "sha256:5b299e9030c973d1f60f82db000d6c2e5ff271601878c7db0885e850597d1d2e", size = 1463997, upload-time = "2024-11-07T12:00:00.197Z" } wheels = [ @@ -4496,7 +4492,7 @@ name = "pyobjc-framework-cocoa" version = "11.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pyobjc-core" }, + { name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/4b/c5/7a866d24bc026f79239b74d05e2cf3088b03263da66d53d1b4cf5207f5ae/pyobjc_framework_cocoa-11.1.tar.gz", hash = "sha256:87df76b9b73e7ca699a828ff112564b59251bb9bbe72e610e670a4dc9940d038", size = 5565335, upload-time = "2025-06-14T20:56:59.683Z" } wheels = [ @@ -4515,8 +4511,8 @@ name = "pyobjc-framework-coreml" version = "11.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pyobjc-core" }, - { name = "pyobjc-framework-cocoa" }, + { name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, + { name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/0d/5d/4309f220981d769b1a2f0dcb2c5c104490d31389a8ebea67e5595ce1cb74/pyobjc_framework_coreml-11.1.tar.gz", hash = "sha256:775923eefb9eac2e389c0821b10564372de8057cea89f1ea1cdaf04996c970a7", size = 82005, upload-time = "2025-06-14T20:57:12.004Z" } wheels = [ @@ -4535,8 +4531,8 @@ name = "pyobjc-framework-quartz" version = "11.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pyobjc-core" }, - { name = "pyobjc-framework-cocoa" }, + { name = "pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, + { name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/c7/ac/6308fec6c9ffeda9942fef72724f4094c6df4933560f512e63eac37ebd30/pyobjc_framework_quartz-11.1.tar.gz", hash = "sha256:a57f35ccfc22ad48c87c5932818e583777ff7276605fef6afad0ac0741169f75", size = 3953275, upload-time = "2025-06-14T20:58:17.924Z" } wheels = [ @@ -4555,10 +4551,10 @@ name = "pyobjc-framework-vision" version = "11.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "pyobjc-core" }, - { name = "pyobjc-framework-cocoa" }, - { name = "pyobjc-framework-coreml" }, - { name = "pyobjc-framework-quartz" }, + { name = 
"pyobjc-core", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, + { name = "pyobjc-framework-cocoa", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, + { name = "pyobjc-framework-coreml", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, + { name = "pyobjc-framework-quartz", marker = "(python_full_version < '3.10' and platform_machine != 'aarch64') or (python_full_version < '3.10' and sys_platform != 'linux') or sys_platform == 'darwin'" }, ] sdist = { url = "https://files.pythonhosted.org/packages/40/a8/7128da4d0a0103cabe58910a7233e2f98d18c590b1d36d4b3efaaedba6b9/pyobjc_framework_vision-11.1.tar.gz", hash = "sha256:26590512ee7758da3056499062a344b8a351b178be66d4b719327884dde4216b", size = 133721, upload-time = "2025-06-14T20:58:46.095Z" } wheels = [ @@ -5038,17 +5034,17 @@ version = "1.4.4" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "numpy", version = "2.0.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, + { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and python_full_version < '3.13') or (python_full_version >= '3.10' and platform_machine != 'arm64') or (python_full_version >= '3.10' and sys_platform != 'darwin')" }, { name = "onnxruntime", version = "1.19.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "onnxruntime", version = "1.22.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "opencv-python" }, - { name = "pillow" }, - { name = "pyclipper" }, - { name = "pyyaml" }, + { name = "onnxruntime", version = "1.22.0", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and python_full_version < '3.13') or (python_full_version >= '3.10' and platform_machine != 'arm64') or (python_full_version >= '3.10' and sys_platform != 'darwin')" }, + { name = "opencv-python", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, + { name = "pillow", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, + { name = "pyclipper", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, + { name = "pyyaml", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, { name = "shapely", version = "2.0.7", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.10'" }, - { name = "shapely", version = "2.1.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.10'" }, - { name = "six" }, - { name = "tqdm" }, + { name = "shapely", version = "2.1.1", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.10' and python_full_version < '3.13') or (python_full_version >= '3.10' and platform_machine != 'arm64') or (python_full_version >= 
'3.10' and sys_platform != 'darwin')" }, + { name = "six", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, + { name = "tqdm", marker = "python_full_version < '3.13' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/ba/12/1e5497183bdbe782dbb91bad1d0d2297dba4d2831b2652657f7517bfc6df/rapidocr_onnxruntime-1.4.4-py3-none-any.whl", hash = "sha256:971d7d5f223a7a808662229df1ef69893809d8457d834e6373d3854bc1782cbf", size = 14915192, upload-time = "2025-01-17T01:48:25.104Z" }, @@ -6344,7 +6340,7 @@ name = "triton" version = "3.3.1" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "setuptools", marker = "python_full_version < '3.10' or platform_machine != 'arm64' or sys_platform != 'darwin'" }, + { name = "setuptools", marker = "(python_full_version < '3.10' and platform_machine != 'arm64' and sys_platform == 'darwin') or (platform_machine != 'aarch64' and sys_platform == 'linux') or (sys_platform != 'darwin' and sys_platform != 'linux')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/8d/a9/549e51e9b1b2c9b854fd761a1d23df0ba2fbc60bd0c13b489ffa518cfcb7/triton-3.3.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b74db445b1c562844d3cfad6e9679c72e93fdfb1a90a24052b03bb5c49d1242e", size = 155600257, upload-time = "2025-05-29T23:39:36.085Z" }, From 517230b9c45e60e900fb334922617578726eab14 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Tue, 8 Jul 2025 13:07:56 +0200 Subject: [PATCH 04/13] Updated naming Signed-off-by: Christoph Auer --- docling/datamodel/layout_model_specs.py | 15 +++++++-------- docling/datamodel/pipeline_options.py | 14 +++++++------- docling/models/layout_model.py | 6 +++--- docling/pipeline/standard_pdf_pipeline.py | 3 --- docling/utils/model_downloader.py | 4 ++-- 5 files changed, 19 insertions(+), 23 deletions(-) diff --git a/docling/datamodel/layout_model_specs.py b/docling/datamodel/layout_model_specs.py index 08d5cd50..491d82df 100644 --- a/docling/datamodel/layout_model_specs.py +++ b/docling/datamodel/layout_model_specs.py @@ -29,42 +29,42 @@ class LayoutModelConfig(BaseModel): # HuggingFace Layout Models # Default Docling Layout Model -DOCLING_LAYOUT_V2 = LayoutModelConfig( - name="docling_layout_old", +docling_layout_v2 = LayoutModelConfig( + name="docling_layout_v2", repo_id="ds4sd/docling-layout-old", revision="main", model_path="", ) -DOCLING_LAYOUT_HERON = LayoutModelConfig( +docling_layout_heron = LayoutModelConfig( name="docling_layout_heron", repo_id="ds4sd/docling-layout-heron", revision="main", model_path="", ) -DOCLING_LAYOUT_HERON_101 = LayoutModelConfig( +docling_layout_heron_101 = LayoutModelConfig( name="docling_layout_heron_101", repo_id="ds4sd/docling-layout-heron-101", revision="main", model_path="", ) -DOCLING_LAYOUT_EGRET_MEDIUM = LayoutModelConfig( +docling_layout_egret_medium = LayoutModelConfig( name="docling_layout_egret_medium", repo_id="ds4sd/docling-layout-egret-medium", revision="main", model_path="", ) -DOCLING_LAYOUT_EGRET_LARGE = LayoutModelConfig( +docling_layout_egret_large = LayoutModelConfig( name="docling_layout_egret_large", repo_id="ds4sd/docling-layout-egret-large", revision="main", model_path="", ) -DOCLING_LAYOUT_EGRET_XLARGE = LayoutModelConfig( +docling_layout_egret_xlarge = LayoutModelConfig( name="docling_layout_egret_xlarge", repo_id="ds4sd/docling-layout-egret-xlarge", revision="main", @@ -82,7 +82,6 @@ 
DOCLING_LAYOUT_EGRET_XLARGE = LayoutModelConfig( class LayoutModelType(str, Enum): DOCLING_LAYOUT_V2 = "docling_layout_v2" - DOCLING_LAYOUT_OLD = "docling_layout_old" DOCLING_LAYOUT_HERON = "docling_layout_heron" DOCLING_LAYOUT_HERON_101 = "docling_layout_heron_101" DOCLING_LAYOUT_EGRET_MEDIUM = "docling_layout_egret_medium" diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index b4573384..fec3db76 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -17,13 +17,13 @@ from docling.datamodel import asr_model_specs # Import the following for backwards compatibility from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions from docling.datamodel.layout_model_specs import ( - DOCLING_LAYOUT_EGRET_LARGE, - DOCLING_LAYOUT_EGRET_MEDIUM, - DOCLING_LAYOUT_EGRET_XLARGE, - DOCLING_LAYOUT_HERON, - DOCLING_LAYOUT_HERON_101, - DOCLING_LAYOUT_V2, LayoutModelConfig, + docling_layout_egret_large, + docling_layout_egret_medium, + docling_layout_egret_xlarge, + docling_layout_heron, + docling_layout_heron_101, + docling_layout_v2, ) from docling.datamodel.pipeline_options_asr_model import ( InlineAsrOptions, @@ -279,7 +279,7 @@ class LayoutOptions(BaseModel): """Options for layout processing.""" create_orphan_clusters: bool = True # Whether to create clusters for orphaned cells - model: LayoutModelConfig = DOCLING_LAYOUT_V2 + model_spec: LayoutModelConfig = docling_layout_v2 class AsrPipelineOptions(PipelineOptions): diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index fdd5701f..fbe04313 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -12,7 +12,7 @@ from PIL import Image from docling.datamodel.accelerator_options import AcceleratorOptions from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page from docling.datamodel.document import ConversionResult -from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2, LayoutModelConfig +from docling.datamodel.layout_model_specs import LayoutModelConfig, docling_layout_v2 from docling.datamodel.pipeline_options import LayoutOptions from docling.datamodel.settings import settings from docling.models.base_model import BasePageModel @@ -57,7 +57,7 @@ class LayoutModel(BasePageModel): self.options = options device = decide_device(accelerator_options.device) - layout_model_config = options.model + layout_model_config = options.model_spec model_repo_folder = layout_model_config.model_repo_folder model_path = layout_model_config.model_path @@ -91,7 +91,7 @@ class LayoutModel(BasePageModel): local_dir: Optional[Path] = None, force: bool = False, progress: bool = False, - layout_model_config: LayoutModelConfig = DOCLING_LAYOUT_V2, + layout_model_config: LayoutModelConfig = docling_layout_v2, ) -> Path: return download_hf_model( repo_id=layout_model_config.repo_id, diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index b00a9ad7..c04ddca9 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -37,9 +37,6 @@ _log = logging.getLogger(__name__) class StandardPdfPipeline(PaginatedPipeline): - # _layout_model_path = LayoutModel._model_path - # _table_model_path = TableStructureModel._model_path - def __init__(self, pipeline_options: PdfPipelineOptions): super().__init__(pipeline_options) self.pipeline_options: PdfPipelineOptions diff --git 
a/docling/utils/model_downloader.py b/docling/utils/model_downloader.py index a2994fb7..b93efc83 100644 --- a/docling/utils/model_downloader.py +++ b/docling/utils/model_downloader.py @@ -2,7 +2,7 @@ import logging from pathlib import Path from typing import Optional -from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2 +from docling.datamodel.layout_model_specs import docling_layout_v2 from docling.datamodel.pipeline_options import ( granite_picture_description, smolvlm_picture_description, @@ -47,7 +47,7 @@ def download_models( if with_layout: _log.info("Downloading layout model...") LayoutModel.download_models( - local_dir=output_dir / DOCLING_LAYOUT_V2.model_repo_folder, + local_dir=output_dir / docling_layout_v2.model_repo_folder, force=force, progress=progress, ) From b5479ab9714416fb889b5c8eb406533037e1c4ec Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Tue, 8 Jul 2025 15:05:54 +0200 Subject: [PATCH 05/13] working on MyPy Signed-off-by: Peter Staar --- docling/datamodel/pipeline_options.py | 3 --- docling/models/base_model.py | 19 ++++++++++++++++--- .../vlm_models_inline/two_stage_vlm_model.py | 16 ++++++++++------ 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index c6ec97eb..329d9de5 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -16,9 +16,7 @@ from docling.datamodel import asr_model_specs # Import the following for backwards compatibility from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions - from docling.datamodel.asr_model_specs import WHISPER_TINY as whisper_tiny - from docling.datamodel.layout_model_specs import ( LayoutModelConfig, docling_layout_egret_large, @@ -28,7 +26,6 @@ from docling.datamodel.layout_model_specs import ( docling_layout_heron_101, docling_layout_v2, ) - from docling.datamodel.pipeline_options_asr_model import ( InlineAsrOptions, ) diff --git a/docling/models/base_model.py b/docling/models/base_model.py index dd019216..c8691e17 100644 --- a/docling/models/base_model.py +++ b/docling/models/base_model.py @@ -6,7 +6,12 @@ from docling_core.types.doc import BoundingBox, DocItem, DoclingDocument, NodeIt from PIL import Image from typing_extensions import TypeVar -from docling.datamodel.base_models import ItemAndImageEnrichmentElement, Page +from docling.datamodel.base_models import ( + Cluster, + ItemAndImageEnrichmentElement, + Page, + TextCell, +) from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import BaseOptions from docling.datamodel.settings import settings @@ -29,10 +34,18 @@ class BasePageModel(ABC): ) -> Iterable[Page]: pass + class BaseLayoutModel(BasePageModel): @abstractmethod - def predict_on_page_image(self, *, page_image: Image.Image) -> list(Cluster): - pass + def predict_on_page_image(self, *, page_image: Image.Image) -> list[Cluster]: + pass + + @abstractmethod + def postprocess_on_page_image( + self, *, page: Page, clusters: list[Cluster] + ) -> tuple[Page, list[Cluster], list[TextCell]]: + pass + class BaseVlmModel(BasePageModel): @abstractmethod diff --git a/docling/models/vlm_models_inline/two_stage_vlm_model.py b/docling/models/vlm_models_inline/two_stage_vlm_model.py index b31d46a9..131b874e 100644 --- a/docling/models/vlm_models_inline/two_stage_vlm_model.py +++ b/docling/models/vlm_models_inline/two_stage_vlm_model.py @@ -8,14 +8,14 @@ from typing import Any, Optional from 
docling.datamodel.accelerator_options import ( AcceleratorOptions, ) -from docling.datamodel.base_models import Page, VlmPrediction +from docling.datamodel.base_models import Cluster, Page, VlmPrediction from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options_vlm_model import ( InlineVlmOptions, TransformersModelType, TransformersPromptStyle, ) -from docling.models.base_model import BasePageModel, BaseVlmModel +from docling.models.base_model import BaseLayoutModel, BasePageModel, BaseVlmModel from docling.models.layout_model import LayoutModel from docling.models.utils.hf_model_download import ( HuggingFaceModelDownloadMixin, @@ -30,7 +30,7 @@ class TwoStageVlmModel(BasePageModel, HuggingFaceModelDownloadMixin): def __init__( self, *, - layout_model: LayoutModel, + layout_model: BaseLayoutModel, vlm_model: BaseVlmModel, ): self.layout_model = layout_model @@ -51,13 +51,17 @@ class TwoStageVlmModel(BasePageModel, HuggingFaceModelDownloadMixin): scale=self.vlm_model.scale, max_size=self.vlm_model.max_size ) - pred_clusters = self.layout_model.predict_on_page(page_image=page_image) + assert page_image is not None + + pred_clusters = self.layout_model.predict_on_page_image( + page_image=page_image + ) page, processed_clusters, processed_cells = ( - self.layout_model.postprocess_on_page( + self.layout_model.postprocess_on_page_image( page=page, clusters=pred_clusters ) ) - + # Define prompt structure if callable(self.vlm_options.prompt): user_prompt = self.vlm_options.prompt(page.parsed_page) From c10e2920a491ed12c5a32d4ac1c828fda3960c51 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Tue, 8 Jul 2025 16:37:20 +0200 Subject: [PATCH 06/13] refactoring redundant code and fixing mypy errors Signed-off-by: Peter Staar --- docling/models/base_model.py | 9 +- .../hf_transformers_model.py | 2 +- docling/models/vlm_models_inline/mlx_model.py | 85 ++++++++++++++++--- .../vlm_models_inline/two_stage_vlm_model.py | 58 ++----------- docling/pipeline/vlm_pipeline.py | 3 +- 5 files changed, 94 insertions(+), 63 deletions(-) diff --git a/docling/models/base_model.py b/docling/models/base_model.py index c8691e17..5bf32f48 100644 --- a/docling/models/base_model.py +++ b/docling/models/base_model.py @@ -11,6 +11,7 @@ from docling.datamodel.base_models import ( ItemAndImageEnrichmentElement, Page, TextCell, + VlmPredictionToken, ) from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import BaseOptions @@ -49,7 +50,13 @@ class BaseLayoutModel(BasePageModel): class BaseVlmModel(BasePageModel): @abstractmethod - def predict_on_page_image(self, *, page_image: Image.Image, prompt: str) -> str: + def get_user_prompt(self, page: Optional[Page]) -> str: + pass + + @abstractmethod + def predict_on_page_image( + self, *, page_image: Image.Image, prompt: str, output_tokens: bool = False + ) -> tuple[str, Optional[list[VlmPredictionToken]]]: pass diff --git a/docling/models/vlm_models_inline/hf_transformers_model.py b/docling/models/vlm_models_inline/hf_transformers_model.py index 3513de3e..2c7b4b0a 100644 --- a/docling/models/vlm_models_inline/hf_transformers_model.py +++ b/docling/models/vlm_models_inline/hf_transformers_model.py @@ -38,7 +38,7 @@ class HuggingFaceTransformersVlmModel(BaseVlmModel, HuggingFaceModelDownloadMixi self.vlm_options = vlm_options self.scale = self.vlm_options.scale - self.max_size = self.vlm_options.max_size + # self.max_size = self.vlm_options.max_size if self.enabled: import torch diff --git 
a/docling/models/vlm_models_inline/mlx_model.py b/docling/models/vlm_models_inline/mlx_model.py
index ddeea379..c28abe41 100644
--- a/docling/models/vlm_models_inline/mlx_model.py
+++ b/docling/models/vlm_models_inline/mlx_model.py
@@ -4,6 +4,8 @@
 from collections.abc import Iterable
 from pathlib import Path
 from typing import Optional
 
+from PIL import Image
+
 from docling.datamodel.accelerator_options import (
     AcceleratorOptions,
 )
@@ -33,7 +35,7 @@ class HuggingFaceMlxModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
         self.max_tokens = vlm_options.max_new_tokens
         self.temperature = vlm_options.temperature
         self.scale = self.vlm_options.scale
-        self.max_size = self.vlm_options.max_size
+        # self.max_size = self.vlm_options.max_size
 
         if self.enabled:
             try:
@@ -62,6 +64,55 @@ class HuggingFaceMlxModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
             self.vlm_model, self.processor = load(artifacts_path)
             self.config = load_config(artifacts_path)
 
+    def get_user_prompt(self, page: Optional[Page]) -> str:
+        if callable(self.vlm_options.prompt) and page is not None:
+            return self.vlm_options.prompt(page.parsed_page)
+        else:
+            user_prompt = self.vlm_options.prompt
+            prompt = self.apply_chat_template(
+                self.processor, self.config, user_prompt, num_images=1
+            )
+            return prompt
+
+    def predict_on_page_image(
+        self, *, page_image: Image.Image, prompt: str, output_tokens: bool = False
+    ) -> tuple[str, Optional[list[VlmPredictionToken]]]:
+        tokens = []
+        output = ""
+        for token in self.stream_generate(
+            self.vlm_model,
+            self.processor,
+            prompt,
+            [page_image],
+            max_tokens=self.max_tokens,
+            verbose=False,
+            temp=self.temperature,
+        ):
+            if len(token.logprobs.shape) == 1:
+                tokens.append(
+                    VlmPredictionToken(
+                        text=token.text,
+                        token=token.token,
+                        logprob=token.logprobs[token.token],
+                    )
+                )
+            elif len(token.logprobs.shape) == 2 and token.logprobs.shape[0] == 1:
+                tokens.append(
+                    VlmPredictionToken(
+                        text=token.text,
+                        token=token.token,
+                        logprob=token.logprobs[0, token.token],
+                    )
+                )
+            else:
+                _log.warning(f"incompatible shape for logprobs: {token.logprobs.shape}")
+
+            output += token.text
+            if "</doctag>" in token.text:
+                break
+
+        return output, tokens
+
     def __call__(
         self, conv_res: ConversionResult, page_batch: Iterable[Page]
     ) -> Iterable[Page]:
@@ -73,19 +124,23 @@ class HuggingFaceMlxModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
                 with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
                     assert page.size is not None
 
-                    hi_res_image = page.get_image(
+                    page_image = page.get_image(
                         scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
                     )
-                    if hi_res_image is not None:
-                        im_width, im_height = hi_res_image.size
+                    """
+                    if page_image is not None:
+                        im_width, im_height = page_image.size
+                    """
+                    assert page_image is not None
 
                     # populate page_tags with predicted doc tags
                     page_tags = ""
 
-                    if hi_res_image:
-                        if hi_res_image.mode != "RGB":
-                            hi_res_image = hi_res_image.convert("RGB")
+                    if page_image:
+                        if page_image.mode != "RGB":
+                            page_image = page_image.convert("RGB")
 
+                    """
                     if callable(self.vlm_options.prompt):
                         user_prompt = self.vlm_options.prompt(page.parsed_page)
                     else:
@@ -93,11 +148,12 @@ class HuggingFaceMlxModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
                         prompt = self.apply_chat_template(
                             self.processor, self.config, user_prompt, num_images=1
                         )
-
-                    start_time = time.time()
-                    _log.debug("start generating ...")
+                    """
+                    prompt = self.get_user_prompt(page)
 
                     # Call model to generate:
+                    start_time = time.time()
 
+                    """
                     tokens: list[VlmPredictionToken] = []
 
                     output = ""
@@ -105,7
+161,7 @@ class HuggingFaceMlxModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
                         self.vlm_model,
                         self.processor,
                         prompt,
-                        [hi_res_image],
+                        [page_image],
                         max_tokens=self.max_tokens,
                         verbose=False,
                         temp=self.temperature,
@@ -137,13 +193,20 @@ class HuggingFaceMlxModel(BaseVlmModel, HuggingFaceModelDownloadMixin):
                         output += token.text
                         if "</doctag>" in token.text:
                             break
+                    """
+                    output, tokens = self.predict_on_page_image(
+                        page_image=page_image, prompt=prompt, output_tokens=True
+                    )
 
                     generation_time = time.time() - start_time
                     page_tags = output
 
+                    """
                     _log.debug(
                         f"{generation_time:.2f} seconds for {len(tokens)} tokens ({len(tokens) / generation_time} tokens/sec)."
                     )
+                    """
+
                     page.predictions.vlm_response = VlmPrediction(
                         text=page_tags,
                         generation_time=generation_time,
diff --git a/docling/models/vlm_models_inline/two_stage_vlm_model.py b/docling/models/vlm_models_inline/two_stage_vlm_model.py
index 131b874e..846fe991 100644
--- a/docling/models/vlm_models_inline/two_stage_vlm_model.py
+++ b/docling/models/vlm_models_inline/two_stage_vlm_model.py
@@ -61,64 +61,24 @@ class TwoStageVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
                             page=page, clusters=pred_clusters
                         )
                     )
-
-                    # Define prompt structure
-                    if callable(self.vlm_options.prompt):
- ) + return user_prompt diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index 01be3693..aac61d8d 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -26,12 +26,13 @@ from docling.backend.md_backend import MarkdownDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend from docling.datamodel.base_models import InputFormat, Page from docling.datamodel.document import ConversionResult, InputDocument -from docling.datamodel.pipeline_options import TwoStageVlmOptions, VlmPipelineOptions +from docling.datamodel.pipeline_options import VlmPipelineOptions from docling.datamodel.pipeline_options_vlm_model import ( ApiVlmOptions, InferenceFramework, InlineVlmOptions, ResponseFormat, + TwoStageVlmOptions, ) from docling.datamodel.settings import settings from docling.models.api_vlm_model import ApiVlmModel From dcf6fd6a413d4a581344689a46ad8781a47db343 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Wed, 9 Jul 2025 06:48:03 +0200 Subject: [PATCH 07/13] fixed the MyPy complaining Signed-off-by: Peter Staar --- docling/datamodel/asr_model_specs.py | 13 ++-- docling/datamodel/pipeline_options.py | 13 +++- docling/models/layout_model.py | 16 +++-- .../hf_transformers_model.py | 67 ++++++++++++++++--- docling/pipeline/vlm_pipeline.py | 10 +-- 5 files changed, 91 insertions(+), 28 deletions(-) diff --git a/docling/datamodel/asr_model_specs.py b/docling/datamodel/asr_model_specs.py index 426b5851..5527dd5b 100644 --- a/docling/datamodel/asr_model_specs.py +++ b/docling/datamodel/asr_model_specs.py @@ -11,12 +11,13 @@ from docling.datamodel.pipeline_options_asr_model import ( # ApiAsrOptions, InferenceAsrFramework, InlineAsrNativeWhisperOptions, + InlineAsrOptions, TransformersModelType, ) _log = logging.getLogger(__name__) -WHISPER_TINY = InlineAsrNativeWhisperOptions( +WHISPER_TINY: InlineAsrOptions = InlineAsrNativeWhisperOptions( repo_id="tiny", inference_framework=InferenceAsrFramework.WHISPER, verbose=True, @@ -27,7 +28,7 @@ WHISPER_TINY = InlineAsrNativeWhisperOptions( max_time_chunk=30.0, ) -WHISPER_SMALL = InlineAsrNativeWhisperOptions( +WHISPER_SMALL: InlineAsrOptions = InlineAsrNativeWhisperOptions( repo_id="small", inference_framework=InferenceAsrFramework.WHISPER, verbose=True, @@ -38,7 +39,7 @@ WHISPER_SMALL = InlineAsrNativeWhisperOptions( max_time_chunk=30.0, ) -WHISPER_MEDIUM = InlineAsrNativeWhisperOptions( +WHISPER_MEDIUM: InlineAsrOptions = InlineAsrNativeWhisperOptions( repo_id="medium", inference_framework=InferenceAsrFramework.WHISPER, verbose=True, @@ -49,7 +50,7 @@ WHISPER_MEDIUM = InlineAsrNativeWhisperOptions( max_time_chunk=30.0, ) -WHISPER_BASE = InlineAsrNativeWhisperOptions( +WHISPER_BASE: InlineAsrOptions = InlineAsrNativeWhisperOptions( repo_id="base", inference_framework=InferenceAsrFramework.WHISPER, verbose=True, @@ -60,7 +61,7 @@ WHISPER_BASE = InlineAsrNativeWhisperOptions( max_time_chunk=30.0, ) -WHISPER_LARGE = InlineAsrNativeWhisperOptions( +WHISPER_LARGE: InlineAsrOptions = InlineAsrNativeWhisperOptions( repo_id="large", inference_framework=InferenceAsrFramework.WHISPER, verbose=True, @@ -71,7 +72,7 @@ WHISPER_LARGE = InlineAsrNativeWhisperOptions( max_time_chunk=30.0, ) -WHISPER_TURBO = InlineAsrNativeWhisperOptions( +WHISPER_TURBO: InlineAsrOptions = InlineAsrNativeWhisperOptions( repo_id="turbo", inference_framework=InferenceAsrFramework.WHISPER, verbose=True, diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 329d9de5..2b76a553 
100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -16,7 +16,15 @@ from docling.datamodel import asr_model_specs # Import the following for backwards compatibility from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions -from docling.datamodel.asr_model_specs import WHISPER_TINY as whisper_tiny +from docling.datamodel.asr_model_specs import ( + WHISPER_BASE, + WHISPER_LARGE, + WHISPER_MEDIUM, + WHISPER_SMALL, + WHISPER_TINY, + WHISPER_TINY as whisper_tiny, + WHISPER_TURBO, +) from docling.datamodel.layout_model_specs import ( LayoutModelConfig, docling_layout_egret_large, @@ -279,13 +287,12 @@ class VlmPipelineOptions(PaginatedPipelineOptions): class LayoutOptions(BaseModel): """Options for layout processing.""" - repo_id: str = "ds4sd/docling-layout-heron" create_orphan_clusters: bool = True # Whether to create clusters for orphaned cells model_spec: LayoutModelConfig = docling_layout_v2 class AsrPipelineOptions(PipelineOptions): - asr_options: Union[InlineAsrOptions] = asr_model_specs.WHISPER_TINY + asr_options: Union[InlineAsrOptions] = WHISPER_TINY artifacts_path: Optional[Union[Path, str]] = None diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index 2e72d957..6a668bf2 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -16,7 +16,7 @@ from docling.datamodel.document import ConversionResult from docling.datamodel.layout_model_specs import LayoutModelConfig, docling_layout_v2 from docling.datamodel.pipeline_options import LayoutOptions from docling.datamodel.settings import settings -from docling.models.base_model import BasePageModel +from docling.models.base_model import BaseLayoutModel from docling.models.utils.hf_model_download import download_hf_model from docling.utils.accelerator_utils import decide_device from docling.utils.layout_postprocessor import LayoutPostprocessor @@ -26,7 +26,7 @@ from docling.utils.visualization import draw_clusters _log = logging.getLogger(__name__) -class LayoutModel(BasePageModel): +class LayoutModel(BaseLayoutModel): TEXT_ELEM_LABELS = [ DocItemLabel.TEXT, DocItemLabel.FOOTNOTE, @@ -179,7 +179,9 @@ class LayoutModel(BasePageModel): ) clusters.append(cluster) """ - predicted_clusters = self.predict_on_page(page_image=page_image) + predicted_clusters = self.predict_on_page_image( + page_image=page_image + ) if settings.debug.visualize_raw_layout: self.draw_clusters_and_cells_side_by_side( @@ -216,7 +218,9 @@ class LayoutModel(BasePageModel): ) """ page, processed_clusters, processed_cells = ( - self.postprocess_on_page(page=page, clusters=predicted_clusters) + self.postprocess_on_page_image( + page=page, clusters=predicted_clusters + ) ) with warnings.catch_warnings(): @@ -244,7 +248,7 @@ class LayoutModel(BasePageModel): yield page - def predict_on_page(self, *, page_image: Image.Image) -> list[Cluster]: + def predict_on_page_image(self, *, page_image: Image.Image) -> list[Cluster]: pred_items = self.layout_predictor.predict(page_image) clusters = [] @@ -263,7 +267,7 @@ class LayoutModel(BasePageModel): return clusters - def postprocess_on_page( + def postprocess_on_page_image( self, *, page: Page, clusters: list[Cluster] ) -> tuple[Page, list[Cluster], list[TextCell]]: processed_clusters, processed_cells = LayoutPostprocessor( diff --git a/docling/models/vlm_models_inline/hf_transformers_model.py b/docling/models/vlm_models_inline/hf_transformers_model.py index 2c7b4b0a..4e892119 100644 --- 
a/docling/models/vlm_models_inline/hf_transformers_model.py +++ b/docling/models/vlm_models_inline/hf_transformers_model.py @@ -5,10 +5,12 @@ from collections.abc import Iterable from pathlib import Path from typing import Any, Optional +from PIL import Image + from docling.datamodel.accelerator_options import ( AcceleratorOptions, ) -from docling.datamodel.base_models import Page, VlmPrediction +from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToken from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options_vlm_model import ( InlineVlmOptions, @@ -122,6 +124,43 @@ class HuggingFaceTransformersVlmModel(BaseVlmModel, HuggingFaceModelDownloadMixi # Load generation config self.generation_config = GenerationConfig.from_pretrained(artifacts_path) + def get_user_prompt(self, page: Optional[Page]) -> str: + # Define prompt structure + user_prompt = "" + if callable(self.vlm_options.prompt) and page is not None: + user_prompt = self.vlm_options.prompt(page.parsed_page) + elif isinstance(self.vlm_options.prompt, str): + user_prompt = self.vlm_options.prompt + + prompt = self.formulate_prompt(user_prompt) + return prompt + + def predict_on_page_image( + self, *, page_image: Image.Image, prompt: str, output_tokens: bool = False + ) -> tuple[str, Optional[list[VlmPredictionToken]]]: + output = "" + + inputs = self.processor( + text=prompt, images=[page_image], return_tensors="pt" + ).to(self.device) + + # Call model to generate: + generated_ids = self.vlm_model.generate( + **inputs, + max_new_tokens=self.max_new_tokens, + use_cache=self.use_cache, + temperature=self.temperature, + generation_config=self.generation_config, + **self.vlm_options.extra_generation_config, + ) + + output = self.processor.batch_decode( + generated_ids[:, inputs["input_ids"].shape[1] :], + skip_special_tokens=False, + )[0] + + return output, [] + def __call__( self, conv_res: ConversionResult, page_batch: Iterable[Page] ) -> Iterable[Page]: @@ -133,22 +172,29 @@ class HuggingFaceTransformersVlmModel(BaseVlmModel, HuggingFaceModelDownloadMixi with TimeRecorder(conv_res, "vlm"): assert page.size is not None - hi_res_image = page.get_image( + page_image = page.get_image( scale=self.vlm_options.scale, max_size=self.vlm_options.max_size ) + assert page_image is not None + # Define prompt structure + """ if callable(self.vlm_options.prompt): user_prompt = self.vlm_options.prompt(page.parsed_page) else: user_prompt = self.vlm_options.prompt prompt = self.formulate_prompt(user_prompt) - - inputs = self.processor( - text=prompt, images=[hi_res_image], return_tensors="pt" - ).to(self.device) + """ + prompt = self.get_user_prompt(page=page) start_time = time.time() + + """ + inputs = self.processor( + text=prompt, images=[page_image], return_tensors="pt" + ).to(self.device) + # Call model to generate: generated_ids = self.vlm_model.generate( **inputs, @@ -169,9 +215,14 @@ class HuggingFaceTransformersVlmModel(BaseVlmModel, HuggingFaceModelDownloadMixi _log.debug( f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds." 
) + """ + generated_text = self.predict_on_page_image( + page_image=page_image, prompt=prompt, output_tokens=False + ) + page.predictions.vlm_response = VlmPrediction( - text=generated_texts, - generation_time=generation_time, + text=generated_text, + generation_time=time.time() - start_time, ) yield page diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index aac61d8d..0ee06efb 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -115,7 +115,7 @@ class VlmPipeline(PaginatedPipeline): TwoStageVlmOptions, self.pipeline_options.vlm_options ) - layout_options = twostagevlm_options.lay_options + layout_options = twostagevlm_options.layout_options vlm_options = twostagevlm_options.vlm_options layout_model = LayoutModel( @@ -125,24 +125,24 @@ class VlmPipeline(PaginatedPipeline): ) if vlm_options.inference_framework == InferenceFramework.MLX: - vlm_model = HuggingFaceMlxModel( + vlm_model_mlx = HuggingFaceMlxModel( enabled=True, # must be always enabled for this pipeline to make sense. artifacts_path=artifacts_path, accelerator_options=pipeline_options.accelerator_options, vlm_options=vlm_options, ) self.build_pipe = [ - TwoStageVlmModel(layout_model=layout_model, vlm_model=vlm_model) + TwoStageVlmModel(layout_model=layout_model, vlm_model=vlm_model_mlx) ] elif vlm_options.inference_framework == InferenceFramework.TRANSFORMERS: - vlm_model = HuggingFaceTransformersVlmModel( + vlm_model_hf = HuggingFaceTransformersVlmModel( enabled=True, # must be always enabled for this pipeline to make sense. artifacts_path=artifacts_path, accelerator_options=pipeline_options.accelerator_options, vlm_options=vlm_options, ) self.build_pipe = [ - TwoStageVlmModel(layout_model=layout_model, vlm_model=vlm_model) + TwoStageVlmModel(layout_model=layout_model, vlm_model=vlm_model_hf) ] else: raise ValueError( From 0f395688b8caa72c9c427b15216fc438273deb4c Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Thu, 10 Jul 2025 06:48:34 +0200 Subject: [PATCH 08/13] refactored the code and added vlm2stage as a cli option Signed-off-by: Peter Staar --- docling/datamodel/pipeline_options.py | 2 -- docling/datamodel/pipeline_options_vlm_model.py | 2 +- docling/datamodel/vlm_model_specs.py | 7 +++++++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 2b76a553..cea2594e 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -12,8 +12,6 @@ from pydantic import ( ) from typing_extensions import deprecated -from docling.datamodel import asr_model_specs - # Import the following for backwards compatibility from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions from docling.datamodel.asr_model_specs import ( diff --git a/docling/datamodel/pipeline_options_vlm_model.py b/docling/datamodel/pipeline_options_vlm_model.py index a38f0414..c5ed2e32 100644 --- a/docling/datamodel/pipeline_options_vlm_model.py +++ b/docling/datamodel/pipeline_options_vlm_model.py @@ -90,7 +90,7 @@ class ApiVlmOptions(BaseVlmOptions): response_format: ResponseFormat -class TwoStageVlmOptions(BaseVlmOptions): +class TwoStageVlmOptions(BaseModel): kind: Literal["inline_two_stage_model_options"] = "inline_two_stage_model_options" vlm_options: InlineVlmOptions diff --git a/docling/datamodel/vlm_model_specs.py b/docling/datamodel/vlm_model_specs.py index 5045c846..c8eefe3e 100644 --- a/docling/datamodel/vlm_model_specs.py +++ 
b/docling/datamodel/vlm_model_specs.py @@ -6,12 +6,14 @@ from pydantic import ( ) from docling.datamodel.accelerator_options import AcceleratorDevice +from docling.datamodel.layout_model_specs import docling_layout_heron from docling.datamodel.pipeline_options_vlm_model import ( ApiVlmOptions, InferenceFramework, InlineVlmOptions, ResponseFormat, TransformersModelType, + TwoStageVlmOptions, ) _log = logging.getLogger(__name__) @@ -137,8 +139,13 @@ GEMMA3_27B_MLX = InlineVlmOptions( temperature=0.0, ) +VLM2STAGE = TwoStageVlmOptions( + vlm_options=SMOLDOCLING_MLX, layout_options=docling_layout_heron +) + class VlmModelType(str, Enum): SMOLDOCLING = "smoldocling" GRANITE_VISION = "granite_vision" GRANITE_VISION_OLLAMA = "granite_vision_ollama" + VLM2STAGE = "docling2stage" From 70872e6539b0b392a26b648368f45f565339d1b0 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Thu, 10 Jul 2025 09:58:06 +0200 Subject: [PATCH 09/13] merged with main and refactored the code to fix MyPy Signed-off-by: Peter Staar --- docling/datamodel/pipeline_options.py | 8 ++++++++ docling/datamodel/vlm_model_specs.py | 7 +++++-- docling/models/layout_model.py | 4 ++-- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 2cfad594..630b16ef 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -14,6 +14,14 @@ from typing_extensions import deprecated # Import the following for backwards compatibility from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions +from docling.datamodel.asr_model_specs import ( + WHISPER_BASE, + WHISPER_LARGE, + WHISPER_MEDIUM, + WHISPER_SMALL, + WHISPER_TINY, + WHISPER_TURBO, +) from docling.datamodel.layout_model_specs import ( DOCLING_LAYOUT_EGRET_LARGE, DOCLING_LAYOUT_EGRET_MEDIUM, diff --git a/docling/datamodel/vlm_model_specs.py b/docling/datamodel/vlm_model_specs.py index c8eefe3e..25815a92 100644 --- a/docling/datamodel/vlm_model_specs.py +++ b/docling/datamodel/vlm_model_specs.py @@ -6,7 +6,10 @@ from pydantic import ( ) from docling.datamodel.accelerator_options import AcceleratorDevice -from docling.datamodel.layout_model_specs import docling_layout_heron +from docling.datamodel.layout_model_specs import ( + DOCLING_LAYOUT_HERON, + DOCLING_LAYOUT_V2, +) from docling.datamodel.pipeline_options_vlm_model import ( ApiVlmOptions, InferenceFramework, @@ -140,7 +143,7 @@ GEMMA3_27B_MLX = InlineVlmOptions( ) VLM2STAGE = TwoStageVlmOptions( - vlm_options=SMOLDOCLING_MLX, layout_options=docling_layout_heron + vlm_options=SMOLDOCLING_MLX, layout_options=DOCLING_LAYOUT_HERON ) diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index c8c60a64..2b7947da 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -16,7 +16,7 @@ from docling.datamodel.document import ConversionResult from docling.datamodel.layout_model_specs import DOCLING_LAYOUT_V2, LayoutModelConfig from docling.datamodel.pipeline_options import LayoutOptions from docling.datamodel.settings import settings -from docling.models.base_model import BaseLayoutModel +from docling.models.base_model import BaseLayoutModel, BasePageModel from docling.models.utils.hf_model_download import download_hf_model from docling.utils.accelerator_utils import decide_device from docling.utils.layout_postprocessor import LayoutPostprocessor @@ -26,7 +26,7 @@ from docling.utils.visualization import draw_clusters _log = 
logging.getLogger(__name__) -class LayoutModel(BasePageModel): +class LayoutModel(BaseLayoutModel): TEXT_ELEM_LABELS = [ DocItemLabel.TEXT, DocItemLabel.FOOTNOTE, From b2336830eb6c1b73956d93fc8da5737812823c71 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Thu, 10 Jul 2025 10:35:47 +0200 Subject: [PATCH 10/13] fixed the circular dependenciea Signed-off-by: Peter Staar --- docling/datamodel/asr_model_specs.py | 2 +- docling/datamodel/layout_model_specs.py | 2 -- docling/datamodel/pipeline_options.py | 4 ++++ docling/datamodel/pipeline_options_asr_model.py | 7 ++++--- docling/datamodel/pipeline_options_vlm_model.py | 10 ++++++---- 5 files changed, 15 insertions(+), 10 deletions(-) diff --git a/docling/datamodel/asr_model_specs.py b/docling/datamodel/asr_model_specs.py index 5527dd5b..b16ad8f9 100644 --- a/docling/datamodel/asr_model_specs.py +++ b/docling/datamodel/asr_model_specs.py @@ -12,7 +12,7 @@ from docling.datamodel.pipeline_options_asr_model import ( InferenceAsrFramework, InlineAsrNativeWhisperOptions, InlineAsrOptions, - TransformersModelType, + # TransformersModelType, ) _log = logging.getLogger(__name__) diff --git a/docling/datamodel/layout_model_specs.py b/docling/datamodel/layout_model_specs.py index b91fa7fe..ff5c8074 100644 --- a/docling/datamodel/layout_model_specs.py +++ b/docling/datamodel/layout_model_specs.py @@ -26,8 +26,6 @@ class LayoutModelConfig(BaseModel): return self.repo_id.replace("/", "--") -# HuggingFace Layout Models - # Default Docling Layout Model DOCLING_LAYOUT_V2 = LayoutModelConfig( name="docling_layout_v2", diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 630b16ef..40947fd9 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -278,6 +278,7 @@ class VlmPipelineOptions(PaginatedPipelineOptions): # If True, text from backend will be used instead of generated text vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = ( smoldocling_vlm_conversion_options + #SMOLDOCLING_TRANSFORMERS ) @@ -293,6 +294,9 @@ class AsrPipelineOptions(PipelineOptions): artifacts_path: Optional[Union[Path, str]] = None + + + class PdfPipelineOptions(PaginatedPipelineOptions): """Options for the PDF pipeline.""" diff --git a/docling/datamodel/pipeline_options_asr_model.py b/docling/datamodel/pipeline_options_asr_model.py index 20e2e453..12109ad1 100644 --- a/docling/datamodel/pipeline_options_asr_model.py +++ b/docling/datamodel/pipeline_options_asr_model.py @@ -5,10 +5,11 @@ from pydantic import AnyUrl, BaseModel from typing_extensions import deprecated from docling.datamodel.accelerator_options import AcceleratorDevice -from docling.datamodel.pipeline_options_vlm_model import ( + +# from docling.datamodel.pipeline_options_vlm_model import ( # InferenceFramework, - TransformersModelType, -) + # TransformersModelType, +# ) class BaseAsrOptions(BaseModel): diff --git a/docling/datamodel/pipeline_options_vlm_model.py b/docling/datamodel/pipeline_options_vlm_model.py index c5ed2e32..3cf2efb0 100644 --- a/docling/datamodel/pipeline_options_vlm_model.py +++ b/docling/datamodel/pipeline_options_vlm_model.py @@ -6,7 +6,6 @@ from pydantic import AnyUrl, BaseModel from typing_extensions import deprecated from docling.datamodel.accelerator_options import AcceleratorDevice -from docling.datamodel.pipeline_options import LayoutOptions class BaseVlmOptions(BaseModel): @@ -89,9 +88,12 @@ class ApiVlmOptions(BaseVlmOptions): concurrency: int = 1 response_format: ResponseFormat - +from 
docling.datamodel.layout_model_specs import ( + LayoutModelConfig, +) + class TwoStageVlmOptions(BaseModel): kind: Literal["inline_two_stage_model_options"] = "inline_two_stage_model_options" - vlm_options: InlineVlmOptions - layout_options: LayoutOptions + vlm_options: Union[InlineVlmOptions, ApiVlmOptions] # = SMOLDOCLING_TRANSFORMERS + layout_options: LayoutModelConfig # = DOCLING_LAYOUT_V2 From fb74d0c5b35ce4ce9892e2329fe7cfde9325b872 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Thu, 10 Jul 2025 15:11:53 +0200 Subject: [PATCH 11/13] working TwoStageVlmModel Signed-off-by: Peter Staar --- docling/datamodel/pipeline_options.py | 8 +++--- .../datamodel/pipeline_options_asr_model.py | 4 +-- .../datamodel/pipeline_options_vlm_model.py | 14 +++++----- docling/datamodel/vlm_model_specs.py | 4 ++- docling/pipeline/vlm_pipeline.py | 27 ++++++++++++------- 5 files changed, 34 insertions(+), 23 deletions(-) diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py index 40947fd9..4fb2885e 100644 --- a/docling/datamodel/pipeline_options.py +++ b/docling/datamodel/pipeline_options.py @@ -39,6 +39,7 @@ from docling.datamodel.pipeline_options_vlm_model import ( InferenceFramework, InlineVlmOptions, ResponseFormat, + TwoStageVlmOptions, ) from docling.datamodel.vlm_model_specs import ( GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options, @@ -276,9 +277,9 @@ class VlmPipelineOptions(PaginatedPipelineOptions): False # (To be used with vlms, or other generative models) ) # If True, text from backend will be used instead of generated text - vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = ( + vlm_options: Union[InlineVlmOptions, ApiVlmOptions, TwoStageVlmOptions] = ( smoldocling_vlm_conversion_options - #SMOLDOCLING_TRANSFORMERS + # SMOLDOCLING_TRANSFORMERS ) @@ -294,9 +295,6 @@ class AsrPipelineOptions(PipelineOptions): artifacts_path: Optional[Union[Path, str]] = None - - - class PdfPipelineOptions(PaginatedPipelineOptions): """Options for the PDF pipeline.""" diff --git a/docling/datamodel/pipeline_options_asr_model.py b/docling/datamodel/pipeline_options_asr_model.py index 12109ad1..f26aad76 100644 --- a/docling/datamodel/pipeline_options_asr_model.py +++ b/docling/datamodel/pipeline_options_asr_model.py @@ -7,8 +7,8 @@ from typing_extensions import deprecated from docling.datamodel.accelerator_options import AcceleratorDevice # from docling.datamodel.pipeline_options_vlm_model import ( - # InferenceFramework, - # TransformersModelType, +# InferenceFramework, +# TransformersModelType, # ) diff --git a/docling/datamodel/pipeline_options_vlm_model.py b/docling/datamodel/pipeline_options_vlm_model.py index 3cf2efb0..66c97ca4 100644 --- a/docling/datamodel/pipeline_options_vlm_model.py +++ b/docling/datamodel/pipeline_options_vlm_model.py @@ -6,6 +6,9 @@ from pydantic import AnyUrl, BaseModel from typing_extensions import deprecated from docling.datamodel.accelerator_options import AcceleratorDevice +from docling.datamodel.layout_model_specs import ( + LayoutModelConfig, +) class BaseVlmOptions(BaseModel): @@ -88,12 +91,11 @@ class ApiVlmOptions(BaseVlmOptions): concurrency: int = 1 response_format: ResponseFormat -from docling.datamodel.layout_model_specs import ( - LayoutModelConfig, -) - + class TwoStageVlmOptions(BaseModel): kind: Literal["inline_two_stage_model_options"] = "inline_two_stage_model_options" - vlm_options: Union[InlineVlmOptions, ApiVlmOptions] # = SMOLDOCLING_TRANSFORMERS - layout_options: LayoutModelConfig # = 
DOCLING_LAYOUT_V2 + response_format: ResponseFormat # final response of the VLM + + layout_options: LayoutModelConfig # = DOCLING_LAYOUT_V2 + vlm_options: Union[InlineVlmOptions, ApiVlmOptions] # = SMOLDOCLING_TRANSFORMERS diff --git a/docling/datamodel/vlm_model_specs.py b/docling/datamodel/vlm_model_specs.py index 25815a92..8025d02f 100644 --- a/docling/datamodel/vlm_model_specs.py +++ b/docling/datamodel/vlm_model_specs.py @@ -143,7 +143,9 @@ GEMMA3_27B_MLX = InlineVlmOptions( ) VLM2STAGE = TwoStageVlmOptions( - vlm_options=SMOLDOCLING_MLX, layout_options=DOCLING_LAYOUT_HERON + vlm_options=SMOLDOCLING_MLX, + layout_options=DOCLING_LAYOUT_HERON, + response_format=SMOLDOCLING_MLX.response_format, ) diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py index 0ee06efb..1c94d977 100644 --- a/docling/pipeline/vlm_pipeline.py +++ b/docling/pipeline/vlm_pipeline.py @@ -26,7 +26,7 @@ from docling.backend.md_backend import MarkdownDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend from docling.datamodel.base_models import InputFormat, Page from docling.datamodel.document import ConversionResult, InputDocument -from docling.datamodel.pipeline_options import VlmPipelineOptions +from docling.datamodel.pipeline_options import LayoutOptions, VlmPipelineOptions from docling.datamodel.pipeline_options_vlm_model import ( ApiVlmOptions, InferenceFramework, @@ -115,38 +115,47 @@ class VlmPipeline(PaginatedPipeline): TwoStageVlmOptions, self.pipeline_options.vlm_options ) - layout_options = twostagevlm_options.layout_options - vlm_options = twostagevlm_options.vlm_options + stage_1_options = twostagevlm_options.layout_options + stage_2_options = twostagevlm_options.vlm_options layout_model = LayoutModel( artifacts_path=artifacts_path, accelerator_options=pipeline_options.accelerator_options, - options=layout_options, + options=LayoutOptions( + create_orphan_clusters=False, model_spec=stage_1_options + ), ) - if vlm_options.inference_framework == InferenceFramework.MLX: + if ( + isinstance(stage_2_options, InlineVlmOptions) + and stage_2_options.inference_framework == InferenceFramework.MLX + ): vlm_model_mlx = HuggingFaceMlxModel( enabled=True, # must be always enabled for this pipeline to make sense. artifacts_path=artifacts_path, accelerator_options=pipeline_options.accelerator_options, - vlm_options=vlm_options, + vlm_options=stage_2_options, ) self.build_pipe = [ TwoStageVlmModel(layout_model=layout_model, vlm_model=vlm_model_mlx) ] - elif vlm_options.inference_framework == InferenceFramework.TRANSFORMERS: + elif ( + isinstance(stage_2_options, InlineVlmOptions) + and stage_2_options.inference_framework + == InferenceFramework.TRANSFORMERS + ): vlm_model_hf = HuggingFaceTransformersVlmModel( enabled=True, # must be always enabled for this pipeline to make sense. 
artifacts_path=artifacts_path, accelerator_options=pipeline_options.accelerator_options, - vlm_options=vlm_options, + vlm_options=stage_2_options, ) self.build_pipe = [ TwoStageVlmModel(layout_model=layout_model, vlm_model=vlm_model_hf) ] else: raise ValueError( - f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}" + f"Could not instantiate the right type of VLM pipeline: {stage_2_options}" ) self.enrichment_pipe = [ From b2d5c783ae115c469ecc07c28a7faca9f482bb49 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Thu, 10 Jul 2025 15:38:15 +0200 Subject: [PATCH 12/13] working two-stage vlm approach from the cli Signed-off-by: Peter Staar --- docling/cli/main.py | 7 +++++++ docling/datamodel/vlm_model_specs.py | 2 +- .../vlm_models_inline/hf_transformers_model.py | 4 +++- docling/models/vlm_models_inline/mlx_model.py | 5 ++++- .../models/vlm_models_inline/two_stage_vlm_model.py | 13 ++++++++----- 5 files changed, 23 insertions(+), 8 deletions(-) diff --git a/docling/cli/main.py b/docling/cli/main.py index ae275ea9..1b623b0d 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -63,6 +63,7 @@ from docling.datamodel.vlm_model_specs import ( GRANITE_VISION_TRANSFORMERS, SMOLDOCLING_MLX, SMOLDOCLING_TRANSFORMERS, + VLM2STAGE, VlmModelType, ) from docling.document_converter import ( @@ -627,6 +628,12 @@ def convert( # noqa: C901 "To run SmolDocling faster, please install mlx-vlm:\n" "pip install mlx-vlm" ) + elif vlm_model == VlmModelType.VLM2STAGE: + pipeline_options.vlm_options = VLM2STAGE + else: + raise ValueError( + f"{vlm_model} is not of type GRANITE_VISION, GRANITE_VISION_OLLAMA, SMOLDOCLING_TRANSFORMERS or VLM2STAGE" + ) pdf_format_option = PdfFormatOption( pipeline_cls=VlmPipeline, pipeline_options=pipeline_options diff --git a/docling/datamodel/vlm_model_specs.py b/docling/datamodel/vlm_model_specs.py index 8025d02f..906e4e9c 100644 --- a/docling/datamodel/vlm_model_specs.py +++ b/docling/datamodel/vlm_model_specs.py @@ -153,4 +153,4 @@ class VlmModelType(str, Enum): SMOLDOCLING = "smoldocling" GRANITE_VISION = "granite_vision" GRANITE_VISION_OLLAMA = "granite_vision_ollama" - VLM2STAGE = "docling2stage" + VLM2STAGE = "vlm2stage" diff --git a/docling/models/vlm_models_inline/hf_transformers_model.py b/docling/models/vlm_models_inline/hf_transformers_model.py index 4e892119..5434ee50 100644 --- a/docling/models/vlm_models_inline/hf_transformers_model.py +++ b/docling/models/vlm_models_inline/hf_transformers_model.py @@ -40,7 +40,9 @@ class HuggingFaceTransformersVlmModel(BaseVlmModel, HuggingFaceModelDownloadMixi self.vlm_options = vlm_options self.scale = self.vlm_options.scale - # self.max_size = self.vlm_options.max_size + self.max_size = 512 + if isinstance(self.vlm_options.max_size, int): + self.max_size = self.vlm_options.max_size if self.enabled: import torch diff --git a/docling/models/vlm_models_inline/mlx_model.py b/docling/models/vlm_models_inline/mlx_model.py index c28abe41..fa28de7f 100644 --- a/docling/models/vlm_models_inline/mlx_model.py +++ b/docling/models/vlm_models_inline/mlx_model.py @@ -35,7 +35,10 @@ class HuggingFaceMlxModel(BaseVlmModel, HuggingFaceModelDownloadMixin): self.max_tokens = vlm_options.max_new_tokens self.temperature = vlm_options.temperature self.scale = self.vlm_options.scale - # self.max_size = self.vlm_options.max_size + + self.max_size = 512 + if isinstance(self.vlm_options.max_size, int): + self.max_size = self.vlm_options.max_size if self.enabled: try: diff --git 
a/docling/models/vlm_models_inline/two_stage_vlm_model.py b/docling/models/vlm_models_inline/two_stage_vlm_model.py
index 846fe991..2ef18692 100644
--- a/docling/models/vlm_models_inline/two_stage_vlm_model.py
+++ b/docling/models/vlm_models_inline/two_stage_vlm_model.py
@@ -50,12 +50,12 @@ class TwoStageVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
 
                     page_image = page.get_image(
                         scale=self.vlm_model.scale, max_size=self.vlm_model.max_size
                     )
-                    assert page_image is not None
 
                     pred_clusters = self.layout_model.predict_on_page_image(
                         page_image=page_image
                     )
+
                     page, processed_clusters, processed_cells = (
                         self.layout_model.postprocess_on_page_image(
                             page=page, clusters=pred_clusters
@@ -68,14 +68,17 @@ class TwoStageVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
                     )
 
                     start_time = time.time()
-                    generated_text = self.vlm_model.predict_on_page_image(
-                        page_image=page_image, prompt=prompt
+                    generated_text, generated_tokens = (
+                        self.vlm_model.predict_on_page_image(
+                            page_image=page_image, prompt=prompt
+                        )
                     )
 
                     page.predictions.vlm_response = VlmPrediction(
-                        text=generated_text, generation_time=time.time() - start_time
+                        text=generated_text,
+                        generation_time=time.time() - start_time,
+                        generated_tokens=generated_tokens,
                     )
-
                     yield page
 
     def formulate_prompt(self, *, user_prompt: str, clusters: list[Cluster]) -> str:

From f4c1836c96625eb1afb1437081e81a96db1283b1 Mon Sep 17 00:00:00 2001
From: Peter Staar
Date: Thu, 10 Jul 2025 16:15:54 +0200
Subject: [PATCH 13/13] functional working two-stage, need to implement a good
 prompt now to leverage bounding boxes

Signed-off-by: Peter Staar
---
 .../vlm_models_inline/two_stage_vlm_model.py  | 38 +++++++++++++++++--
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/docling/models/vlm_models_inline/two_stage_vlm_model.py b/docling/models/vlm_models_inline/two_stage_vlm_model.py
index 2ef18692..2bf5958f 100644
--- a/docling/models/vlm_models_inline/two_stage_vlm_model.py
+++ b/docling/models/vlm_models_inline/two_stage_vlm_model.py
@@ -64,7 +64,10 @@ class TwoStageVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
 
                     user_prompt = self.vlm_model.get_user_prompt(page=page)
                     prompt = self.formulate_prompt(
-                        user_prompt=user_prompt, clusters=processed_clusters
+                        user_prompt=user_prompt,
+                        clusters=processed_clusters,
+                        image_width=page_image.width,
+                        image_height=page_image.height,
                     )
 
                     start_time = time.time()
@@ -73,15 +76,44 @@ class TwoStageVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
                             page_image=page_image, prompt=prompt
                         )
                     )
-
+                    print("generated-text: \n", generated_text, "\n")
                     page.predictions.vlm_response = VlmPrediction(
                         text=generated_text,
                         generation_time=time.time() - start_time,
                         generated_tokens=generated_tokens,
                     )
+                    exit(-1)
+
                     yield page
 
-    def formulate_prompt(self, *, user_prompt: str, clusters: list[Cluster]) -> str:
+    def formulate_prompt(
+        self,
+        *,
+        user_prompt: str,
+        clusters: list[Cluster],
+        image_width: int,
+        image_height: int,
+        vlm_width: int = 512,
+        vlm_height: int = 512,
+    ) -> str:
         """Formulate a prompt for the VLM."""
 
+        known_clusters = ["here is a list of unsorted text-blocks:", ""]
+        for cluster in clusters:
+            print(" => ", cluster)
+
+            loc_l = f""
+            loc_b = f""
+            loc_r = f""
+            loc_t = f""
+
+            known_clusters.append(
+                f"<{cluster.label}>{loc_l}{loc_b}{loc_r}{loc_t}"
+            )
+
+        known_clusters.append("")
+
+        user_prompt = "\n".join(known_clusters) + f"\n\n{user_prompt}"
+        print("user-prompt: ", user_prompt, "\n")
+
        return user_prompt
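
Note: the angle-bracket tokens inside formulate_prompt did not survive extraction in this copy (the loc_* f-strings and the list sentinels show up empty). A minimal sketch of what the loop body plausibly builds, assuming SmolDocling-style DocTags with <loc_..> values quantized to the vlm_width x vlm_height grid; the helper name, the rounding, and the closing label tag below are assumptions, not the patch's actual code:

from docling.datamodel.base_models import Cluster


def cluster_to_doctag(
    cluster: Cluster,
    image_width: int,
    image_height: int,
    vlm_width: int = 512,
    vlm_height: int = 512,
) -> str:
    # Map page-pixel bbox coordinates onto the fixed VLM grid (assumed 0..512).
    loc_l = f"<loc_{int(cluster.bbox.l / image_width * vlm_width)}>"
    loc_b = f"<loc_{int(cluster.bbox.b / image_height * vlm_height)}>"
    loc_r = f"<loc_{int(cluster.bbox.r / image_width * vlm_width)}>"
    loc_t = f"<loc_{int(cluster.bbox.t / image_height * vlm_height)}>"
    # One entry per predicted layout cluster, tagged with its label.
    return f"<{cluster.label}>{loc_l}{loc_b}{loc_r}{loc_t}</{cluster.label}>"

The empty strings in known_clusters would then be the opening and closing wrapper tokens around this block (for example a doctags-style pair); the exact tokens cannot be recovered from this copy.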
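
Note: to exercise the series end-to-end, a minimal driver along these lines should work, assuming the VLM2STAGE spec and the VlmPipeline wiring added above (the input path is a placeholder):

from docling.datamodel import vlm_model_specs
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.vlm_pipeline import VlmPipeline

# Two-stage spec introduced in this series: DOCLING_LAYOUT_HERON as stage 1,
# SMOLDOCLING_MLX as stage 2 (see VLM2STAGE in vlm_model_specs).
pipeline_options = VlmPipelineOptions(vlm_options=vlm_model_specs.VLM2STAGE)

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=VlmPipeline, pipeline_options=pipeline_options
        )
    }
)

result = converter.convert("example.pdf")  # placeholder input document
print(result.document.export_to_markdown())

The CLI path added in PATCH 12 should be roughly equivalent to: docling --pipeline vlm --vlm-model vlm2stage example.pdf (flag names taken from the existing CLI, so treat them as assumptions). Keep in mind that PATCH 13 still carries the temporary print/exit(-1) debugging in TwoStageVlmModel.__call__, so a run will stop after the first page until that is removed.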