Merge branch 'main' of github.com:DS4SD/docling into dev/add-asr-pipeline-v2

2025-07-25 19:44:34 +00:00 · 2025-06-23 09:08:58 +02:00 · 2025-06-23 09:08:58 +02:00 · caf18e634b
commit caf18e634b
parent 408b03ebbc d26dac61a8
98 changed files with 340943 additions and 330462 deletions
--- a/.github/dco.yml
+++ b/.github/dco.yml
@ -0,0 +1,2 @@
+allowRemediationCommits:
+  individual: true
--- a/.github/workflows/dco-advisor.yml
+++ b/.github/workflows/dco-advisor.yml
@ -0,0 +1,192 @@
+name: DCO Advisor Bot
+
+on:
+  pull_request_target:
+    types: [opened, reopened, synchronize]
+
+permissions:
+  pull-requests: write
+  issues: write
+
+jobs:
+  dco_advisor:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Handle DCO check result
+        uses: actions/github-script@v7
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          script: |
+            const pr = context.payload.pull_request || context.payload.check_run?.pull_requests?.[0];
+            if (!pr) return;
+
+            const prNumber = pr.number;
+            const baseRef = pr.base.ref;
+            const headSha =
+              context.payload.check_run?.head_sha ||
+              pr.head?.sha;
+            const username = pr.user.login;
+
+            console.log("HEAD SHA:", headSha);
+
+            const sleep = ms => new Promise(resolve => setTimeout(resolve, ms));
+
+            // Poll until DCO check has a conclusion (max 6 attempts, 30s)
+            let dcoCheck = null;
+            for (let attempt = 0; attempt < 6; attempt++) {
+              const { data: checks } = await github.rest.checks.listForRef({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                ref: headSha
+              });
+
+              
+              console.log("All check runs:");
+                checks.check_runs.forEach(run => {
+                console.log(`- ${run.name} (${run.status}/${run.conclusion}) @ ${run.head_sha}`);
+              });
+
+              dcoCheck = checks.check_runs.find(run =>
+                run.name.toLowerCase().includes("dco") &&
+              !run.name.toLowerCase().includes("dco_advisor") &&
+                run.head_sha === headSha
+              );
+
+
+              if (dcoCheck?.conclusion) break;
+              console.log(`Waiting for DCO check... (${attempt + 1})`);
+              await sleep(5000); // wait 5 seconds
+            }
+
+            if (!dcoCheck || !dcoCheck.conclusion) {
+              console.log("DCO check did not complete in time.");
+              return;
+            }
+
+            const isFailure = ["failure", "action_required"].includes(dcoCheck.conclusion);
+            console.log(`DCO check conclusion for ${headSha}: ${dcoCheck.conclusion} (treated as ${isFailure ? "failure" : "success"})`);
+
+            // Parse DCO output for commit SHAs and author
+            let badCommits = [];
+            let authorName = "";
+            let authorEmail = "";
+            let moreInfo = `More info: [DCO check report](${dcoCheck?.html_url})`;
+
+            if (isFailure) {
+                // listCommits is paginated (30 per page by default): fetch every page so no commit is skipped.
+                const commits = await github.paginate(github.rest.pulls.listCommits, {
+                    owner: context.repo.owner,
+                    repo: context.repo.repo,
+                    pull_number: prNumber,
+                    per_page: 100,
+                });
+                for (const commit of commits) {
+                    const signoffMatch = commit.commit.message.match(/^Signed-off-by:\s+.+<.+>$/m);
+                    if (!signoffMatch) {
+                        console.log(`Bad commit found ${commit.sha}`)
+                        badCommits.push({
+                            sha: commit.sha,
+                            authorName: commit.commit.author.name,
+                            authorEmail: commit.commit.author.email,
+                        });
+                    }
+                }
+            }
+
+            // If multiple authors are present, you could adapt the message accordingly
+            // For now, we'll just use the first one
+            if (badCommits.length > 0) {
+            authorName = badCommits[0].authorName;
+            authorEmail = badCommits[0].authorEmail;
+            }
+
+            // Generate remediation commit message if needed
+            let remediationSnippet = "";
+            if (badCommits.length && authorEmail) {
+              remediationSnippet = `git commit --allow-empty -s -m "DCO Remediation Commit for ${authorName} <${authorEmail}>\n\n` +
+                badCommits.map(c => `I, ${c.authorName} <${c.authorEmail}>, hereby add my Signed-off-by to this commit: ${c.sha}`).join('\n') +
+                `"`;
+            } else {
+              remediationSnippet = "# Unable to auto-generate remediation message. Please check the DCO check details.";
+            }
+
+            // Build comment
+            const commentHeader = '<!-- dco-advice-bot -->';
+            let body = "";
+
+            if (isFailure) {
+              body = [
+                commentHeader,
+                '❌ **DCO Check Failed**',
+                '',
+                `Hi @${username}, your pull request has failed the Developer Certificate of Origin (DCO) check.`,
+                '',
+                'This repository supports **remediation commits**, so you can fix this without rewriting history — but you must follow the required message format.',
+                '',
+                '---',
+                '',
+                '### 🛠 Quick Fix: Add a remediation commit',
+                'Run this command:',
+                '',
+                '```bash',
+                remediationSnippet,
+                'git push',
+                '```',
+                '',
+                '---',
+                '',
+                '<details>',
+                '<summary>🔧 Advanced: Sign off each commit directly</summary>',
+                '',
+                '**For the latest commit:**',
+                '```bash',
+                'git commit --amend --signoff',
+                'git push --force-with-lease',
+                '```',
+                '',
+                '**For multiple commits:**',
+                '```bash',
+                `git rebase --signoff origin/${baseRef}`,
+                'git push --force-with-lease',
+                '```',
+                '',
+                '</details>',
+                '',
+                moreInfo
+              ].join('\n');
+            } else {
+              body = [
+                commentHeader,
+                '✅ **DCO Check Passed**',
+                '',
+                `Thanks @${username}, all your commits are properly signed off. 🎉`
+              ].join('\n');
+            }
+
+            // Get every existing comment on the PR (paginated: a single page holds only 30 by default)
+            const comments = await github.paginate(github.rest.issues.listComments, {
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: prNumber
+            });
+
+            // Look for a previous bot comment so it gets updated instead of duplicated
+            const existingComment = comments.find(c =>
+              c.body?.includes(commentHeader)
+            );
+
+            if (existingComment) {
+              await github.rest.issues.updateComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                comment_id: existingComment.id,
+                body: body
+              });
+            } else {
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: prNumber,
+                body: body
+              });
+            }
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,26 @@
+## [v2.37.0](https://github.com/docling-project/docling/releases/tag/v2.37.0) - 2025-06-16
+
+### Feature
+
+* Make Page.parsed_page the only source of truth for text cells, add OCR cells to it ([#1745](https://github.com/docling-project/docling/issues/1745)) ([`7d3302c`](https://github.com/docling-project/docling/commit/7d3302cb48dd91cd29673d7c4eaf7326736d0685))
+* Support xlsm files ([#1520](https://github.com/docling-project/docling/issues/1520)) ([`df14022`](https://github.com/docling-project/docling/commit/df140227c3b8bcad0c68bf3d129930cccd96a07e))
+
+### Fix
+
+* Pptx line break and space handling ([#1664](https://github.com/docling-project/docling/issues/1664)) ([`f28d23c`](https://github.com/docling-project/docling/commit/f28d23cf03d059619d1d3482594596ab7c87d197))
+* **asciidoc:** Set default size when missing in image directive ([#1769](https://github.com/docling-project/docling/issues/1769)) ([`b886e4d`](https://github.com/docling-project/docling/commit/b886e4df312447d39f58cf6e3c45b0f863940321))
+* Handle NoneType error in MsPowerpointDocumentBackend ([#1747](https://github.com/docling-project/docling/issues/1747)) ([`7a275c7`](https://github.com/docling-project/docling/commit/7a275c763731d9c96b7cf32f2e27b8dc8bebacd7))
+* Prov for merged-elems ([#1728](https://github.com/docling-project/docling/issues/1728)) ([`6613b9e`](https://github.com/docling-project/docling/commit/6613b9e98bc8b89791dc0334de8970ff243aba82))
+* **tesseract:** Initialize df_osd to avoid uninitialized variable error ([#1718](https://github.com/docling-project/docling/issues/1718)) ([`e979750`](https://github.com/docling-project/docling/commit/e979750ce93b2fae89dbb60ff06333f80c1c2908))
+* Allow custom torch_dtype in vlm models ([#1735](https://github.com/docling-project/docling/issues/1735)) ([`f7f3113`](https://github.com/docling-project/docling/commit/f7f31137f10999fefdb70da7e5ef56536f650400))
+* Improve extraction from textboxes in Word docs ([#1701](https://github.com/docling-project/docling/issues/1701)) ([`9dbcb3d`](https://github.com/docling-project/docling/commit/9dbcb3d7d4f27d1c935c8681c57ed59524452d53))
+* Add WEBP to the list of image file extensions ([#1711](https://github.com/docling-project/docling/issues/1711)) ([`a2b83fe`](https://github.com/docling-project/docling/commit/a2b83fe4aea66c273a83bf17177e87d45d3f18d1))
+
+### Documentation
+
+* Update vlm models api examples with LM Studio ([#1759](https://github.com/docling-project/docling/issues/1759)) ([`0432a31`](https://github.com/docling-project/docling/commit/0432a31b2f7c9fe944c3a1d4b608ef938b4f2299))
+* Add open webui ([#1734](https://github.com/docling-project/docling/issues/1734)) ([`49b10e7`](https://github.com/docling-project/docling/commit/49b10e74191d4d580c9305ac08d9898a79346d7d))
+
 ## [v2.36.1](https://github.com/docling-project/docling/releases/tag/v2.36.1) - 2025-06-04

 ### Fix
--- a/docling/backend/asciidoc_backend.py
+++ b/docling/backend/asciidoc_backend.py
@ -2,7 +2,7 @@ import logging
 import re
 from io import BytesIO
 from pathlib import Path
-from typing import Set, Union
+from typing import Final, Set, Union

 from docling_core.types.doc import (
    DocItemLabel,
@ -22,6 +22,9 @@ from docling.datamodel.document import InputDocument

 _log = logging.getLogger(__name__)

+DEFAULT_IMAGE_WIDTH: Final = 128
+DEFAULT_IMAGE_HEIGHT: Final = 128
+

 class AsciiDocBackend(DeclarativeDocumentBackend):
    def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
@ -200,9 +203,11 @@ class AsciiDocBackend(DeclarativeDocumentBackend):

                item = self._parse_picture(line)

-                size = None
+                size: Size
                if "width" in item and "height" in item:
                    size = Size(width=int(item["width"]), height=int(item["height"]))
+                else:
+                    size = Size(width=DEFAULT_IMAGE_WIDTH, height=DEFAULT_IMAGE_HEIGHT)

                uri = None
                if (
@ -264,14 +269,16 @@ class AsciiDocBackend(DeclarativeDocumentBackend):

        return doc

-    def _get_current_level(self, parents):
+    @staticmethod
+    def _get_current_level(parents):
        for k, v in parents.items():
            if v is None and k > 0:
                return k - 1

        return 0

-    def _get_current_parent(self, parents):
+    @staticmethod
+    def _get_current_parent(parents):
        for k, v in parents.items():
            if v is None and k > 0:
                return parents[k - 1]
@ -279,17 +286,21 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
        return None

    #   =========   Title
-    def _is_title(self, line):
+    @staticmethod
+    def _is_title(line):
        return re.match(r"^= ", line)

-    def _parse_title(self, line):
+    @staticmethod
+    def _parse_title(line):
        return {"type": "title", "text": line[2:].strip(), "level": 0}

    #   =========   Section headers
-    def _is_section_header(self, line):
+    @staticmethod
+    def _is_section_header(line):
        return re.match(r"^==+\s+", line)

-    def _parse_section_header(self, line):
+    @staticmethod
+    def _parse_section_header(line):
        match = re.match(r"^(=+)\s+(.*)", line)

        marker = match.group(1)  # The list marker (e.g., "*", "-", "1.")
@ -303,10 +314,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
        }

    #   =========   Lists
-    def _is_list_item(self, line):
+    @staticmethod
+    def _is_list_item(line):
        return re.match(r"^(\s)*(\*|-|\d+\.|\w+\.) ", line)

-    def _parse_list_item(self, line):
+    @staticmethod
+    def _parse_list_item(line):
        """Extract the item marker (number or bullet symbol) and the text of the item."""

        match = re.match(r"^(\s*)(\*|-|\d+\.)\s+(.*)", line)
@ -342,14 +355,17 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
            }

    #   =========   Tables
-    def _is_table_line(self, line):
+    @staticmethod
+    def _is_table_line(line):
        return re.match(r"^\|.*\|", line)

-    def _parse_table_line(self, line):
+    @staticmethod
+    def _parse_table_line(line):
        # Split table cells and trim extra spaces
        return [cell.strip() for cell in line.split("|") if cell.strip()]

-    def _populate_table_as_grid(self, table_data):
+    @staticmethod
+    def _populate_table_as_grid(table_data):
        num_rows = len(table_data)

        # Adjust the table data into a grid format
@ -380,10 +396,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
        return data

    #   =========   Pictures
-    def _is_picture(self, line):
+    @staticmethod
+    def _is_picture(line):
        return re.match(r"^image::", line)

-    def _parse_picture(self, line):
+    @staticmethod
+    def _parse_picture(line):
        """
        Parse an image macro, extracting its path and attributes.
        Syntax: image::path/to/image.png[Alt Text, width=200, height=150, align=center]
@ -406,10 +424,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
        return {"type": "picture", "uri": line}

    #   =========   Captions
-    def _is_caption(self, line):
+    @staticmethod
+    def _is_caption(line):
        return re.match(r"^\.(.+)", line)

-    def _parse_caption(self, line):
+    @staticmethod
+    def _parse_caption(line):
        mtch = re.match(r"^\.(.+)", line)
        if mtch:
            text = mtch.group(1)
@ -418,5 +438,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
        return {"type": "caption", "text": ""}

    #   =========   Plain text
-    def _parse_text(self, line):
+    @staticmethod
+    def _parse_text(line):
        return {"type": "text", "text": line.strip()}
--- a/docling/backend/docling_parse_backend.py
+++ b/docling/backend/docling_parse_backend.py
@ -7,12 +7,17 @@ from typing import List, Optional, Union

 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
-from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    SegmentedPdfPage,
+    TextCell,
+)
 from docling_parse.pdf_parsers import pdf_parser_v1
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage

 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.backend.pypdfium2_backend import get_pdf_page_geometry
 from docling.datamodel.document import InputDocument

 _log = logging.getLogger(__name__)
@ -36,6 +41,51 @@ class DoclingParsePageBackend(PdfPageBackend):
    def is_valid(self) -> bool:
        return self.valid

+    def _compute_text_cells(self) -> List[TextCell]:
+        """Compute text cells from docling-parse data.
+
+        Cell boxes are rescaled from parser units to the page size and
+        converted to top-left origin. An invalid page yields an empty list.
+        """
+        cells: List[TextCell] = []
+        if not self.valid:
+            return cells
+
+        page_size = self.get_size()
+
+        parser_width = self._dpage["width"]
+        parser_height = self._dpage["height"]
+
+        for cell_index, cell in enumerate(self._dpage["cells"]):
+            x0, y0, x1, y1 = cell["box"]["device"]
+
+            # Normalize so that (x0, y0) is the lower-left corner.
+            if x1 < x0:
+                x0, x1 = x1, x0
+            if y1 < y0:
+                y0, y1 = y1, y0
+
+            text_piece = cell["content"]["rnormalized"]
+            cells.append(
+                TextCell(
+                    index=cell_index,
+                    text=text_piece,
+                    orig=text_piece,
+                    from_ocr=False,
+                    rect=BoundingRectangle.from_bounding_box(
+                        BoundingBox(
+                            l=x0 * page_size.width / parser_width,
+                            b=y0 * page_size.height / parser_height,
+                            r=x1 * page_size.width / parser_width,
+                            t=y1 * page_size.height / parser_height,
+                            coord_origin=CoordOrigin.BOTTOMLEFT,
+                        )
+                    ).to_top_left_origin(page_size.height),
+                )
+            )
+
+        return cells
+
    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        if not self.valid:
            return ""
@ -70,75 +120,27 @@ class DoclingParsePageBackend(PdfPageBackend):
        return text_piece

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
-        return None
+        if not self.valid:
+            return None
+
+        text_cells = self._compute_text_cells()
+
+        # Page geometry is derived from the pypdfium2 page object, not from the parser output
+        dimension = get_pdf_page_geometry(self._ppage)
+
+        # Only textline cells are provided; char and word cells stay empty and flagged as absent
+        return SegmentedPdfPage(
+            dimension=dimension,
+            textline_cells=text_cells,
+            char_cells=[],
+            word_cells=[],
+            has_lines=len(text_cells) > 0,
+            has_words=False,
+            has_chars=False,
+        )

    def get_text_cells(self) -> Iterable[TextCell]:
-        cells: List[TextCell] = []
-        cell_counter = 0
-
-        if not self.valid:
-            return cells
-
-        page_size = self.get_size()
-
-        parser_width = self._dpage["width"]
-        parser_height = self._dpage["height"]
-
-        for i in range(len(self._dpage["cells"])):
-            rect = self._dpage["cells"][i]["box"]["device"]
-            x0, y0, x1, y1 = rect
-
-            if x1 < x0:
-                x0, x1 = x1, x0
-            if y1 < y0:
-                y0, y1 = y1, y0
-
-            text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
-            cells.append(
-                TextCell(
-                    index=cell_counter,
-                    text=text_piece,
-                    orig=text_piece,
-                    from_ocr=False,
-                    rect=BoundingRectangle.from_bounding_box(
-                        BoundingBox(
-                            # l=x0, b=y0, r=x1, t=y1,
-                            l=x0 * page_size.width / parser_width,
-                            b=y0 * page_size.height / parser_height,
-                            r=x1 * page_size.width / parser_width,
-                            t=y1 * page_size.height / parser_height,
-                            coord_origin=CoordOrigin.BOTTOMLEFT,
-                        )
-                    ).to_top_left_origin(page_size.height),
-                )
-            )
-
-            cell_counter += 1
-
-        def draw_clusters_and_cells():
-            image = (
-                self.get_page_image()
-            )  # make new image to avoid drawing on the saved ones
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
-                cell_color = (
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                )
-                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-            image.show()
-
-        # before merge:
-        # draw_clusters_and_cells()
-
-        # cells = merge_horizontal_cells(cells)
-
-        # after merge:
-        # draw_clusters_and_cells()
-
-        return cells
+        return self._compute_text_cells()

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        AREA_THRESHOLD = 0  # 32 * 32
--- a/docling/backend/docling_parse_v2_backend.py
+++ b/docling/backend/docling_parse_v2_backend.py
@ -7,12 +7,19 @@ from typing import TYPE_CHECKING, List, Optional, Union

 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    PdfPageBoundaryType,
+    PdfPageGeometry,
+    SegmentedPdfPage,
+    TextCell,
+)
 from docling_parse.pdf_parsers import pdf_parser_v2
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage

 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.backend.pypdfium2_backend import get_pdf_page_geometry
 from docling.datamodel.base_models import Size
 from docling.utils.locks import pypdfium2_lock

@ -40,6 +47,55 @@ class DoclingParseV2PageBackend(PdfPageBackend):
    def is_valid(self) -> bool:
        return self.valid

+    def _compute_text_cells(self) -> List[TextCell]:
+        """Build page-scaled, top-left-origin text cells from docling-parse v2 data."""
+        cells: List[TextCell] = []
+        if not self.valid:
+            return cells
+
+        page_size = self.get_size()
+        parser_width = self._dpage["sanitized"]["dimension"]["width"]
+        parser_height = self._dpage["sanitized"]["dimension"]["height"]
+
+        cells_data = self._dpage["sanitized"]["cells"]["data"]
+        cells_header = self._dpage["sanitized"]["cells"]["header"]
+        if not cells_data:
+            return cells
+
+        # Resolve the column positions once; the header is shared by every cell row.
+        x0_col, y0_col, x1_col, y1_col, text_col = (
+            cells_header.index(key) for key in ("x0", "y0", "x1", "y1", "text")
+        )
+        for cell_index, cell_data in enumerate(cells_data):
+            x0, y0 = cell_data[x0_col], cell_data[y0_col]
+            x1, y1 = cell_data[x1_col], cell_data[y1_col]
+            # Normalize so that (x0, y0) is the lower-left corner.
+            if x1 < x0:
+                x0, x1 = x1, x0
+            if y1 < y0:
+                y0, y1 = y1, y0
+
+            text_piece = cell_data[text_col]
+            cells.append(
+                TextCell(
+                    index=cell_index,
+                    text=text_piece,
+                    orig=text_piece,
+                    from_ocr=False,
+                    rect=BoundingRectangle.from_bounding_box(
+                        BoundingBox(
+                            l=x0 * page_size.width / parser_width,
+                            b=y0 * page_size.height / parser_height,
+                            r=x1 * page_size.width / parser_width,
+                            t=y1 * page_size.height / parser_height,
+                            coord_origin=CoordOrigin.BOTTOMLEFT,
+                        )
+                    ).to_top_left_origin(page_size.height),
+                )
+            )
+
+        return cells
+
    def get_text_in_rect(self, bbox: BoundingBox) -> str:
        if not self.valid:
            return ""
@ -81,73 +137,27 @@ class DoclingParseV2PageBackend(PdfPageBackend):
        return text_piece

    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
-        return None
+        if not self.valid:
+            return None
+
+        text_cells = self._compute_text_cells()
+
+        # Get the PDF page geometry from pypdfium2
+        dimension = get_pdf_page_geometry(self._ppage)
+
+        # Only textline cells are provided; flag them via `has_lines`, as docling_parse_backend does
+        return SegmentedPdfPage(
+            dimension=dimension,
+            textline_cells=text_cells,
+            char_cells=[],
+            word_cells=[],
+            has_lines=len(text_cells) > 0,
+            has_words=False,
+            has_chars=False,
+        )

    def get_text_cells(self) -> Iterable[TextCell]:
-        cells: List[TextCell] = []
-        cell_counter = 0
-
-        if not self.valid:
-            return cells
-
-        page_size = self.get_size()
-
-        parser_width = self._dpage["sanitized"]["dimension"]["width"]
-        parser_height = self._dpage["sanitized"]["dimension"]["height"]
-
-        cells_data = self._dpage["sanitized"]["cells"]["data"]
-        cells_header = self._dpage["sanitized"]["cells"]["header"]
-
-        for i, cell_data in enumerate(cells_data):
-            x0 = cell_data[cells_header.index("x0")]
-            y0 = cell_data[cells_header.index("y0")]
-            x1 = cell_data[cells_header.index("x1")]
-            y1 = cell_data[cells_header.index("y1")]
-
-            if x1 < x0:
-                x0, x1 = x1, x0
-            if y1 < y0:
-                y0, y1 = y1, y0
-
-            text_piece = cell_data[cells_header.index("text")]
-            cells.append(
-                TextCell(
-                    index=cell_counter,
-                    text=text_piece,
-                    orig=text_piece,
-                    from_ocr=False,
-                    rect=BoundingRectangle.from_bounding_box(
-                        BoundingBox(
-                            # l=x0, b=y0, r=x1, t=y1,
-                            l=x0 * page_size.width / parser_width,
-                            b=y0 * page_size.height / parser_height,
-                            r=x1 * page_size.width / parser_width,
-                            t=y1 * page_size.height / parser_height,
-                            coord_origin=CoordOrigin.BOTTOMLEFT,
-                        )
-                    ).to_top_left_origin(page_size.height),
-                )
-            )
-            cell_counter += 1
-
-        def draw_clusters_and_cells():
-            image = (
-                self.get_page_image()
-            )  # make new image to avoid drawing on the saved ones
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.bbox.as_tuple()
-                cell_color = (
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                )
-                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-            image.show()
-
-        # draw_clusters_and_cells()
-
-        return cells
+        return self._compute_text_cells()

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
        AREA_THRESHOLD = 0  # 32 * 32
--- a/docling/backend/docling_parse_v4_backend.py
+++ b/docling/backend/docling_parse_v4_backend.py
@ -59,20 +59,6 @@ class DoclingParseV4PageBackend(PdfPageBackend):
        return self._dpage

    def get_text_cells(self) -> Iterable[TextCell]:
-        page_size = self.get_size()
-
-        [tc.to_top_left_origin(page_size.height) for tc in self._dpage.textline_cells]
-
-        # for cell in self._dpage.textline_cells:
-        #     rect = cell.rect
-        #
-        #     assert (
-        #         rect.to_bounding_box().l <= rect.to_bounding_box().r
-        #     ), f"left is > right on bounding box {rect.to_bounding_box()} of rect {rect}"
-        #     assert (
-        #         rect.to_bounding_box().t <= rect.to_bounding_box().b
-        #     ), f"top is > bottom on bounding box {rect.to_bounding_box()} of rect {rect}"
-
        return self._dpage.textline_cells

    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
@ -171,12 +157,28 @@ class DoclingParseV4DocumentBackend(PdfDocumentBackend):
        self, page_no: int, create_words: bool = True, create_textlines: bool = True
    ) -> DoclingParseV4PageBackend:
        with pypdfium2_lock:
+            seg_page = self.dp_doc.get_page(
+                page_no + 1,
+                create_words=create_words,
+                create_textlines=create_textlines,
+            )
+
+            # In Docling, all TextCell instances are expected with top-left origin.
+            [
+                tc.to_top_left_origin(seg_page.dimension.height)
+                for tc in seg_page.textline_cells
+            ]
+            [
+                tc.to_top_left_origin(seg_page.dimension.height)
+                for tc in seg_page.char_cells
+            ]
+            [
+                tc.to_top_left_origin(seg_page.dimension.height)
+                for tc in seg_page.word_cells
+            ]
+
            return DoclingParseV4PageBackend(
-                self.dp_doc.get_page(
-                    page_no + 1,
-                    create_words=create_words,
-                    create_textlines=create_textlines,
-                ),
+                seg_page,
                self._pdoc[page_no],
            )

--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@ -1,17 +1,15 @@
 import logging
 import re
 import warnings
+from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
 from typing import List, Optional, Set, Union

 import marko
 import marko.element
-import marko.ext
-import marko.ext.gfm
 import marko.inline
 from docling_core.types.doc import (
-    DocItem,
    DocItemLabel,
    DoclingDocument,
    DocumentOrigin,
@ -21,7 +19,9 @@ from docling_core.types.doc import (
    TableData,
    TextItem,
 )
+from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
 from marko import Markdown
+from pydantic import AnyUrl, TypeAdapter

 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
@ -71,7 +71,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):

        self.in_table = False
        self.md_table_buffer: list[str] = []
-        self.inline_texts: list[str] = []
        self._html_blocks: int = 0

        try:
@ -156,25 +155,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                doc.add_table(data=table_data)
        return

-    def _process_inline_text(
-        self, parent_item: Optional[NodeItem], doc: DoclingDocument
-    ):
-        txt = " ".join(self.inline_texts)
-        if len(txt) > 0:
-            doc.add_text(
-                label=DocItemLabel.PARAGRAPH,
-                parent=parent_item,
-                text=txt,
-            )
-        self.inline_texts = []
-
    def _iterate_elements(  # noqa: C901
        self,
+        *,
        element: marko.element.Element,
        depth: int,
        doc: DoclingDocument,
        visited: Set[marko.element.Element],
        parent_item: Optional[NodeItem] = None,
+        formatting: Optional[Formatting] = None,
+        hyperlink: Optional[Union[AnyUrl, Path]] = None,
    ):
        if element in visited:
            return
@ -183,44 +173,32 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
        # Check for different element types and process relevant details
        if isinstance(element, marko.block.Heading) and len(element.children) > 0:
            self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
            _log.debug(
                f" - Heading level {element.level}, content: {element.children[0].children}"  # type: ignore
            )
-            if element.level == 1:
-                doc_label = DocItemLabel.TITLE
+
+            if len(element.children) == 1:
+                child = element.children[0]
+                snippet_text = str(child.children)  # type: ignore
+                visited.add(child)
            else:
-                doc_label = DocItemLabel.SECTION_HEADER
+                snippet_text = ""  # inline group will be created

-            # Header could have arbitrary inclusion of bold, italic or emphasis,
-            # hence we need to traverse the tree to get full text of a header
-            strings: List[str] = []
-
-            # Define a recursive function to traverse the tree
-            def traverse(node: marko.block.BlockElement):
-                # Check if the node has a "children" attribute
-                if hasattr(node, "children"):
-                    # If "children" is a list, continue traversal
-                    if isinstance(node.children, list):
-                        for child in node.children:
-                            traverse(child)
-                    # If "children" is text, add it to header text
-                    elif isinstance(node.children, str):
-                        strings.append(node.children)
-
-            traverse(element)
-            snippet_text = "".join(strings)
-            if len(snippet_text) > 0:
-                if doc_label == DocItemLabel.SECTION_HEADER:
-                    parent_item = doc.add_heading(
-                        text=snippet_text,
-                        level=element.level - 1,
-                        parent=parent_item,
-                    )
-                else:
-                    parent_item = doc.add_text(
-                        label=doc_label, parent=parent_item, text=snippet_text
-                    )
+            if element.level == 1:
+                parent_item = doc.add_title(
+                    text=snippet_text,
+                    parent=parent_item,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
+                )
+            else:
+                parent_item = doc.add_heading(
+                    text=snippet_text,
+                    level=element.level - 1,
+                    parent=parent_item,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
+                )

        elif isinstance(element, marko.block.List):
            has_non_empty_list_items = False
@ -230,7 +208,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                    break

            self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
            _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
            if has_non_empty_list_items:
                label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
@ -240,41 +217,60 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):

        elif (
            isinstance(element, marko.block.ListItem)
-            and len(element.children) > 0
-            and isinstance((first_child := element.children[0]), marko.block.Paragraph)
+            and len(element.children) == 1
+            and isinstance((child := element.children[0]), marko.block.Paragraph)
+            and len(child.children) > 0
        ):
            self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
            _log.debug(" - List item")

-            snippet_text = str(first_child.children[0].children)  # type: ignore
-            is_numbered = False
-            if (
-                parent_item is not None
-                and isinstance(parent_item, DocItem)
-                and parent_item.label == GroupLabel.ORDERED_LIST
-            ):
-                is_numbered = True
-            doc.add_list_item(
-                enumerated=is_numbered, parent=parent_item, text=snippet_text
+            if len(child.children) == 1:
+                snippet_text = str(child.children[0].children)  # type: ignore
+                visited.add(child)
+            else:
+                snippet_text = ""  # inline group will be created
+            is_numbered = isinstance(parent_item, OrderedList)
+            if not isinstance(parent_item, (OrderedList, UnorderedList)):
+                _log.warning("ListItem would have not had a list parent, adding one.")
+                parent_item = doc.add_unordered_list(parent=parent_item)
+            parent_item = doc.add_list_item(
+                enumerated=is_numbered,
+                parent=parent_item,
+                text=snippet_text,
+                formatting=formatting,
+                hyperlink=hyperlink,
            )
-            visited.add(first_child)

        elif isinstance(element, marko.inline.Image):
            self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
            _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")

            fig_caption: Optional[TextItem] = None
            if element.title is not None and element.title != "":
                fig_caption = doc.add_text(
-                    label=DocItemLabel.CAPTION, text=element.title
+                    label=DocItemLabel.CAPTION,
+                    text=element.title,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
                )

            doc.add_picture(parent=parent_item, caption=fig_caption)

-        elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
-            self._process_inline_text(parent_item, doc)
+        elif isinstance(element, marko.inline.Emphasis):
+            _log.debug(f" - Emphasis: {element.children}")
+            formatting = deepcopy(formatting) if formatting else Formatting()
+            formatting.italic = True
+
+        elif isinstance(element, marko.inline.StrongEmphasis):
+            _log.debug(f" - StrongEmphasis: {element.children}")
+            formatting = deepcopy(formatting) if formatting else Formatting()
+            formatting.bold = True
+
+        elif isinstance(element, marko.inline.Link):
+            _log.debug(f" - Link: {element.children}")
+            hyperlink = TypeAdapter(Optional[Union[AnyUrl, Path]]).validate_python(
+                element.dest
+            )

        elif isinstance(element, marko.inline.RawText):
            _log.debug(f" - Paragraph (raw text): {element.children}")
@ -287,28 +283,41 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                    self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
                else:
                    self.md_table_buffer.append(snippet_text)
-            else:
+            elif snippet_text:
                self._close_table(doc)
-                # most likely just inline text
-                self.inline_texts.append(str(element.children))
+                doc.add_text(
+                    label=DocItemLabel.TEXT,
+                    parent=parent_item,
+                    text=snippet_text,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
+                )

        elif isinstance(element, marko.inline.CodeSpan):
            self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
            _log.debug(f" - Code Span: {element.children}")
            snippet_text = str(element.children).strip()
-            doc.add_code(parent=parent_item, text=snippet_text)
+            doc.add_code(
+                parent=parent_item,
+                text=snippet_text,
+                formatting=formatting,
+                hyperlink=hyperlink,
+            )

        elif (
            isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
            and len(element.children) > 0
-            and isinstance((first_child := element.children[0]), marko.inline.RawText)
-            and len(snippet_text := (first_child.children.strip())) > 0
+            and isinstance((child := element.children[0]), marko.inline.RawText)
+            and len(snippet_text := (child.children.strip())) > 0
        ):
            self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
            _log.debug(f" - Code Block: {element.children}")
-            doc.add_code(parent=parent_item, text=snippet_text)
+            doc.add_code(
+                parent=parent_item,
+                text=snippet_text,
+                formatting=formatting,
+                hyperlink=hyperlink,
+            )

        elif isinstance(element, marko.inline.LineBreak):
            if self.in_table:
@ -317,7 +326,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):

        elif isinstance(element, marko.block.HTMLBlock):
            self._html_blocks += 1
-            self._process_inline_text(parent_item, doc)
            self._close_table(doc)
            _log.debug(f"HTML Block: {element}")
            if (
@ -327,14 +335,25 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):

                # wrap in markers to enable post-processing in convert()
                text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
-                doc.add_code(parent=parent_item, text=text_to_add)
+                doc.add_code(
+                    parent=parent_item,
+                    text=text_to_add,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
+                )
        else:
            if not isinstance(element, str):
                self._close_table(doc)
                _log.debug(f"Some other element: {element}")

+        if (
+            isinstance(element, (marko.block.Paragraph, marko.block.Heading))
+            and len(element.children) > 1
+        ):
+            parent_item = doc.add_inline_group(parent=parent_item)
+
        processed_block_types = (
-            marko.block.Heading,
+            # marko.block.Heading,
            marko.block.CodeBlock,
            marko.block.FencedCode,
            marko.inline.RawText,
@ -351,6 +370,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                    doc=doc,
                    visited=visited,
                    parent_item=parent_item,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
                )

    def is_valid(self) -> bool:
@ -392,7 +413,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                parent_item=None,
                visited=set(),
            )
-            self._process_inline_text(None, doc)  # handle last hanging inline text
            self._close_table(doc=doc)  # handle any last hanging table

            # if HTML blocks were detected, export to HTML and delegate to HTML backend
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@ -20,6 +20,7 @@ from docling_core.types.doc.document import ContentLayer
 from PIL import Image, UnidentifiedImageError
 from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
+from pptx.oxml.text import CT_TextLineBreak

 from docling.backend.abstract_backend import (
    DeclarativeDocumentBackend,
@ -120,136 +121,91 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB

        return prov

-    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):  # noqa: C901
-        is_a_list = False
+    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
        is_list_group_created = False
        enum_list_item_value = 0
        new_list = None
-        bullet_type = "None"
-        list_label = GroupLabel.LIST
        doc_label = DocItemLabel.LIST_ITEM
        prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)

-        # Identify if shape contains lists
-        for paragraph in shape.text_frame.paragraphs:
-            # Check if paragraph is a bullet point using the `element` XML
+        def is_list_item(paragraph):
+            """Check if the paragraph is a list item."""
            p = paragraph._element
            if (
                p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
                is not None
            ):
-                bullet_type = "Bullet"
-                is_a_list = True
+                return (True, "Bullet")
            elif (
                p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
                is not None
            ):
-                bullet_type = "Numbered"
-                is_a_list = True
-            else:
-                is_a_list = False
-
-            if paragraph.level > 0:
+                return (True, "Numbered")
+            elif paragraph.level > 0:
                # Most likely a sub-list
-                is_a_list = True
-
-            if is_a_list:
-                # Determine if this is an unordered list or an ordered list.
-                # Set GroupLabel.ORDERED_LIST when it fits.
-                if bullet_type == "Numbered":
-                    list_label = GroupLabel.ORDERED_LIST
-
-            if is_a_list:
-                _log.debug("LIST DETECTED!")
+                return (True, "None")
            else:
-                _log.debug("No List")
-
-        # If there is a list inside of the shape, create a new docling list to assign list items to
-        # if is_a_list:
-        #     new_list = doc.add_group(
-        #         label=list_label, name=f"list", parent=parent_slide
-        #     )
+                return (False, "None")

        # Iterate through paragraphs to build up text
        for paragraph in shape.text_frame.paragraphs:
-            # p_text = paragraph.text.strip()
+            is_a_list, bullet_type = is_list_item(paragraph)
            p = paragraph._element
-            enum_list_item_value += 1
-            inline_paragraph_text = ""
-            inline_list_item_text = ""

-            for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
-                if len(e.text.strip()) > 0:
-                    e_is_a_list_item = False
-                    is_numbered = False
-                    if (
-                        p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
-                        is not None
-                    ):
-                        bullet_type = "Bullet"
-                        e_is_a_list_item = True
-                    elif (
-                        p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
-                        is not None
-                    ):
-                        bullet_type = "Numbered"
-                        is_numbered = True
-                        e_is_a_list_item = True
-                    else:
-                        e_is_a_list_item = False
+            # Convert line breaks to spaces and accumulate text
+            p_text = ""
+            for e in p.content_children:
+                if isinstance(e, CT_TextLineBreak):
+                    p_text += " "
+                else:
+                    p_text += e.text

-                    if e_is_a_list_item:
-                        if len(inline_paragraph_text) > 0:
-                            # output accumulated inline text:
-                            doc.add_text(
-                                label=doc_label,
-                                parent=parent_slide,
-                                text=inline_paragraph_text,
-                                prov=prov,
-                            )
-                        # Set marker and enumerated arguments if this is an enumeration element.
-                        inline_list_item_text += e.text
-                        # print(e.text)
-                    else:
-                        # Assign proper label to the text, depending if it's a Title or Section Header
-                        # For other types of text, assign - PARAGRAPH
-                        doc_label = DocItemLabel.PARAGRAPH
-                        if shape.is_placeholder:
-                            placeholder_type = shape.placeholder_format.type
-                            if placeholder_type in [
-                                PP_PLACEHOLDER.CENTER_TITLE,
-                                PP_PLACEHOLDER.TITLE,
-                            ]:
-                                # It's a title
-                                doc_label = DocItemLabel.TITLE
-                            elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
-                                DocItemLabel.SECTION_HEADER
-                        enum_list_item_value = 0
-                        inline_paragraph_text += e.text
+            if is_a_list:
+                enum_marker = ""
+                enumerated = bullet_type == "Numbered"
+
+                if not is_list_group_created:
+                    new_list = doc.add_group(
+                        label=GroupLabel.ORDERED_LIST
+                        if enumerated
+                        else GroupLabel.LIST,
+                        name="list",
+                        parent=parent_slide,
+                    )
+                    is_list_group_created = True
+                    enum_list_item_value = 0
+
+                if enumerated:
+                    enum_list_item_value += 1
+                    enum_marker = str(enum_list_item_value) + "."
+
+                doc.add_list_item(
+                    marker=enum_marker,
+                    enumerated=enumerated,
+                    parent=new_list,
+                    text=p_text,
+                    prov=prov,
+                )
+            else:  # is paragraph not a list item
+                # Assign proper label to the text, depending if it's a Title or Section Header
+                # For other types of text, assign - PARAGRAPH
+                doc_label = DocItemLabel.PARAGRAPH
+                if shape.is_placeholder:
+                    placeholder_type = shape.placeholder_format.type
+                    if placeholder_type in [
+                        PP_PLACEHOLDER.CENTER_TITLE,
+                        PP_PLACEHOLDER.TITLE,
+                    ]:
+                        # It's a title
+                        doc_label = DocItemLabel.TITLE
+                    elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
+                        DocItemLabel.SECTION_HEADER

-            if len(inline_paragraph_text) > 0:
                # output accumulated inline text:
                doc.add_text(
                    label=doc_label,
                    parent=parent_slide,
-                    text=inline_paragraph_text,
-                    prov=prov,
-                )
-
-            if len(inline_list_item_text) > 0:
-                enum_marker = ""
-                if is_numbered:
-                    enum_marker = str(enum_list_item_value) + "."
-                if not is_list_group_created:
-                    new_list = doc.add_group(
-                        label=list_label, name="list", parent=parent_slide
-                    )
-                    is_list_group_created = True
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_list,
-                    text=inline_list_item_text,
+                    text=p_text,
                    prov=prov,
                )
        return
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@ -14,7 +14,7 @@ from docling_core.types.doc import (
    TableCell,
    TableData,
 )
-from docling_core.types.doc.document import Formatting
+from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
 from docx import Document
 from docx.document import Document as DocxDocument
 from docx.oxml.table import CT_Tc
@ -84,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            self.valid = True
        except Exception as e:
            raise RuntimeError(
-                f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
+                f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
            ) from e

    @override
@ -251,9 +251,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                    self._handle_tables(element, docx_obj, doc)
                except Exception:
                    _log.debug("could not parse a table, broken docx table")
-
+            # Check for Image
            elif drawing_blip:
                self._handle_pictures(docx_obj, drawing_blip, doc)
+                # Check for Text after the Image
+                if (
+                    tag_name in ["p"]
+                    and element.find(".//w:t", namespaces=namespaces) is not None
+                ):
+                    self._handle_text_elements(element, docx_obj, doc)
            # Check for the sdt containers, like table of contents
            elif tag_name in ["sdt"]:
                sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@ -268,6 +274,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
                self._handle_text_elements(element, docx_obj, doc)
            else:
                _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
+
        return doc

    def _str_to_int(
@ -578,7 +585,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        all_paragraphs = []

        # Sort paragraphs within each container, then process containers
-        for container_id, paragraphs in container_paragraphs.items():
+        for paragraphs in container_paragraphs.values():
            # Sort by vertical position within each container
            sorted_container_paragraphs = sorted(
                paragraphs,
@ -689,14 +696,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        doc: DoclingDocument,
    ) -> None:
        paragraph = Paragraph(element, docx_obj)
-
+        paragraph_elements = self._get_paragraph_elements(paragraph)
        text, equations = self._handle_equations_in_text(
            element=element, text=paragraph.text
        )

        if text is None:
            return
-        paragraph_elements = self._get_paragraph_elements(paragraph)
        text = text.strip()

        # Common styles for bullet and numbered lists.
@ -912,6 +918,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        )
        return

+    def _add_formatted_list_item(
+        self,
+        doc: DoclingDocument,
+        elements: list,
+        marker: str,
+        enumerated: bool,
+        level: int,
+    ) -> None:
+        # This should not happen by construction
+        if not isinstance(self.parents[level], (OrderedList, UnorderedList)):
+            return
+        if len(elements) == 1:
+            text, format, hyperlink = elements[0]
+            doc.add_list_item(
+                marker=marker,
+                enumerated=enumerated,
+                parent=self.parents[level],
+                text=text,
+                formatting=format,
+                hyperlink=hyperlink,
+            )
+        else:
+            new_item = doc.add_list_item(
+                marker=marker,
+                enumerated=enumerated,
+                parent=self.parents[level],
+                text="",
+            )
+            new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
+            for text, format, hyperlink in elements:
+                doc.add_text(
+                    label=DocItemLabel.TEXT,
+                    parent=new_parent,
+                    text=text,
+                    formatting=format,
+                    hyperlink=hyperlink,
+                )
+
    def _add_list_item(
        self,
        *,
@ -921,6 +965,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
        elements: list,
        is_numbered: bool = False,
    ) -> None:
+        # TODO: this method is always called with is_numbered. Numbered lists should be properly addressed.
+        if not elements:
+            return None
        enum_marker = ""

        level = self._get_level()
@ -937,21 +984,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            if is_numbered:
                enum_marker = str(self.listIter) + "."
                is_numbered = True
-            new_parent = self._create_or_reuse_parent(
-                doc=doc,
-                prev_parent=self.parents[level],
-                paragraph_elements=elements,
+            self._add_formatted_list_item(
+                doc, elements, enum_marker, is_numbered, level
            )
-            for text, format, hyperlink in elements:
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_parent,
-                    text=text,
-                    formatting=format,
-                    hyperlink=hyperlink,
-                )
-
        elif (
            self._prev_numid() == numid
            and self.level_at_new_list is not None
@ -981,28 +1016,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            if is_numbered:
                enum_marker = str(self.listIter) + "."
                is_numbered = True
-
-            new_parent = self._create_or_reuse_parent(
-                doc=doc,
-                prev_parent=self.parents[self.level_at_new_list + ilevel],
-                paragraph_elements=elements,
+            self._add_formatted_list_item(
+                doc,
+                elements,
+                enum_marker,
+                is_numbered,
+                self.level_at_new_list + ilevel,
            )
-            for text, format, hyperlink in elements:
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_parent,
-                    text=text,
-                    formatting=format,
-                    hyperlink=hyperlink,
-                )
        elif (
            self._prev_numid() == numid
            and self.level_at_new_list is not None
            and prev_indent is not None
            and ilevel < prev_indent
        ):  # Close list
-            for k, v in self.parents.items():
+            for k in self.parents:
                if k > self.level_at_new_list + ilevel:
                    self.parents[k] = None

@ -1011,20 +1038,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            if is_numbered:
                enum_marker = str(self.listIter) + "."
                is_numbered = True
-            new_parent = self._create_or_reuse_parent(
-                doc=doc,
-                prev_parent=self.parents[self.level_at_new_list + ilevel],
-                paragraph_elements=elements,
+            self._add_formatted_list_item(
+                doc,
+                elements,
+                enum_marker,
+                is_numbered,
+                self.level_at_new_list + ilevel,
            )
-            for text, format, hyperlink in elements:
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_parent,
-                    text=text,
-                    formatting=format,
-                    hyperlink=hyperlink,
-                )
            self.listIter = 0

        elif self._prev_numid() == numid or prev_indent == ilevel:
@ -1033,21 +1053,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
            if is_numbered:
                enum_marker = str(self.listIter) + "."
                is_numbered = True
-            new_parent = self._create_or_reuse_parent(
-                doc=doc,
-                prev_parent=self.parents[level - 1],
-                paragraph_elements=elements,
+            self._add_formatted_list_item(
+                doc, elements, enum_marker, is_numbered, level - 1
            )
-            for text, format, hyperlink in elements:
-                # Add the list item to the parent group
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_parent,
-                    text=text,
-                    formatting=format,
-                    hyperlink=hyperlink,
-                )
+
        return

    def _handle_tables(
--- a/docling/backend/pypdfium2_backend.py
+++ b/docling/backend/pypdfium2_backend.py
@ -8,7 +8,13 @@ from typing import TYPE_CHECKING, List, Optional, Union
 import pypdfium2 as pdfium
 import pypdfium2.raw as pdfium_c
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
-from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    PdfPageBoundaryType,
+    PdfPageGeometry,
+    SegmentedPdfPage,
+    TextCell,
+)
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfTextPage
 from pypdfium2._helpers.misc import PdfiumError
@ -16,6 +22,76 @@ from pypdfium2._helpers.misc import PdfiumError
 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
 from docling.utils.locks import pypdfium2_lock

+
+def get_pdf_page_geometry(
+    ppage: pdfium.PdfPage,
+    angle: float = 0.0,
+    boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX,
+) -> PdfPageGeometry:
+    """
+    Create PdfPageGeometry from a pypdfium2 PdfPage object.
+
+    Args:
+        ppage: pypdfium2 PdfPage object
+        angle: Page rotation angle in degrees (default: 0.0)
+        boundary_type: The boundary type for the page (default: CROP_BOX)
+
+    Returns:
+        PdfPageGeometry with all the different bounding boxes properly set
+    """
+    with pypdfium2_lock:
+        # Get the main bounding box (intersection of crop_box and media_box)
+        bbox_tuple = ppage.get_bbox()
+        bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.BOTTOMLEFT)
+
+        # Get all the different page boxes from pypdfium2
+        media_box_tuple = ppage.get_mediabox()
+        crop_box_tuple = ppage.get_cropbox()
+        art_box_tuple = ppage.get_artbox()
+        bleed_box_tuple = ppage.get_bleedbox()
+        trim_box_tuple = ppage.get_trimbox()
+
+        # Convert to BoundingBox objects using existing from_tuple method
+        # pypdfium2 returns (x0, y0, x1, y1) in PDF coordinate system (bottom-left origin)
+        # Use bbox as fallback when specific box types are not defined
+        media_bbox = (
+            BoundingBox.from_tuple(media_box_tuple, CoordOrigin.BOTTOMLEFT)
+            if media_box_tuple
+            else bbox
+        )
+        crop_bbox = (
+            BoundingBox.from_tuple(crop_box_tuple, CoordOrigin.BOTTOMLEFT)
+            if crop_box_tuple
+            else bbox
+        )
+        art_bbox = (
+            BoundingBox.from_tuple(art_box_tuple, CoordOrigin.BOTTOMLEFT)
+            if art_box_tuple
+            else bbox
+        )
+        bleed_bbox = (
+            BoundingBox.from_tuple(bleed_box_tuple, CoordOrigin.BOTTOMLEFT)
+            if bleed_box_tuple
+            else bbox
+        )
+        trim_bbox = (
+            BoundingBox.from_tuple(trim_box_tuple, CoordOrigin.BOTTOMLEFT)
+            if trim_box_tuple
+            else bbox
+        )
+
+        return PdfPageGeometry(
+            angle=angle,
+            rect=BoundingRectangle.from_bounding_box(bbox),
+            boundary_type=boundary_type,
+            art_bbox=art_bbox,
+            bleed_bbox=bleed_bbox,
+            crop_bbox=crop_bbox,
+            media_bbox=media_bbox,
+            trim_bbox=trim_bbox,
+        )
+
+
 if TYPE_CHECKING:
    from docling.datamodel.document import InputDocument

@ -41,38 +117,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
    def is_valid(self) -> bool:
        return self.valid

-    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
-        AREA_THRESHOLD = 0  # 32 * 32
-        page_size = self.get_size()
-        with pypdfium2_lock:
-            for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
-                pos = obj.get_pos()
-                cropbox = BoundingBox.from_tuple(
-                    pos, origin=CoordOrigin.BOTTOMLEFT
-                ).to_top_left_origin(page_height=page_size.height)
-
-                if cropbox.area() > AREA_THRESHOLD:
-                    cropbox = cropbox.scaled(scale=scale)
-
-                    yield cropbox
-
-    def get_text_in_rect(self, bbox: BoundingBox) -> str:
-        with pypdfium2_lock:
-            if not self.text_page:
-                self.text_page = self._ppage.get_textpage()
-
-        if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
-            bbox = bbox.to_bottom_left_origin(self.get_size().height)
-
-        with pypdfium2_lock:
-            text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
-
-        return text_piece
-
-    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
-        return None
-
-    def get_text_cells(self) -> Iterable[TextCell]:
+    def _compute_text_cells(self) -> List[TextCell]:
+        """Compute text cells from pypdfium."""
        with pypdfium2_lock:
            if not self.text_page:
                self.text_page = self._ppage.get_textpage()
@ -203,30 +249,58 @@ class PyPdfiumPageBackend(PdfPageBackend):

            return merged_cells

-        def draw_clusters_and_cells():
-            image = (
-                self.get_page_image()
-            )  # make new image to avoid drawing on the saved ones
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
-                cell_color = (
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                )
-                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-            image.show()
+        return merge_horizontal_cells(cells)

-        # before merge:
-        # draw_clusters_and_cells()
+    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
+        AREA_THRESHOLD = 0  # 32 * 32
+        page_size = self.get_size()
+        with pypdfium2_lock:
+            for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
+                pos = obj.get_pos()
+                cropbox = BoundingBox.from_tuple(
+                    pos, origin=CoordOrigin.BOTTOMLEFT
+                ).to_top_left_origin(page_height=page_size.height)

-        cells = merge_horizontal_cells(cells)
+                if cropbox.area() > AREA_THRESHOLD:
+                    cropbox = cropbox.scaled(scale=scale)

-        # after merge:
-        # draw_clusters_and_cells()
+                    yield cropbox

-        return cells
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        """Return the text pdfium reports inside ``bbox`` on this page.
+
+        The text page is created on first use and kept on ``self.text_page``.
+        A ``bbox`` whose origin is not bottom-left is converted with the page
+        height before the lookup.
+        """
+        with pypdfium2_lock:
+            if not self.text_page:
+                self.text_page = self._ppage.get_textpage()
+
+        # The conversion happens between the two locked sections, not inside
+        # either of them.
+        needs_conversion = bbox.coord_origin != CoordOrigin.BOTTOMLEFT
+        query_box = (
+            bbox.to_bottom_left_origin(self.get_size().height)
+            if needs_conversion
+            else bbox
+        )
+
+        with pypdfium2_lock:
+            return self.text_page.get_text_bounded(*query_box.as_tuple())
+
+    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        """Build a ``SegmentedPdfPage`` holding this page's text-line cells.
+
+        Returns:
+            ``None`` when the page is not valid; otherwise a page carrying the
+            geometry from ``get_pdf_page_geometry`` and the cells from
+            ``_compute_text_cells``. Character and word cells are left empty
+            and flagged as not computed.
+        """
+        if not self.valid:
+            return None
+
+        text_cells = self._compute_text_cells()
+
+        # Get the PDF page geometry from pypdfium2
+        dimension = get_pdf_page_geometry(self._ppage)
+
+        return SegmentedPdfPage(
+            dimension=dimension,
+            textline_cells=text_cells,
+            char_cells=[],
+            word_cells=[],
+            # ``has_lines`` is the flag the OCR and layout post-processing
+            # steps set for textline cells; ``has_textlines`` is not used
+            # anywhere else for a parsed page.
+            has_lines=len(text_cells) > 0,
+            has_words=False,
+            has_chars=False,
+        )
+
+    def get_text_cells(self) -> Iterable[TextCell]:
+        """Return the page's text cells as built by ``_compute_text_cells``."""
+        cells = self._compute_text_cells()
+        return cells

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@ -235,7 +235,6 @@ class Page(BaseModel):
    page_no: int
    # page_hash: Optional[str] = None
    size: Optional[Size] = None
-    cells: List[TextCell] = []
    parsed_page: Optional[SegmentedPdfPage] = None
    predictions: PagePredictions = PagePredictions()
    assembled: Optional[AssembledUnit] = None
@ -248,12 +247,27 @@ class Page(BaseModel):
        float, Image
    ] = {}  # Cache of images in different scales. By default it is cleared during assembling.

+    @property
+    def cells(self) -> List[TextCell]:
+        """Text cells of the page, taken from ``parsed_page.textline_cells``.
+
+        Returns a fresh empty list while ``parsed_page`` is ``None``.
+        """
+        parsed = self.parsed_page
+        if parsed is None:
+            return []
+        return parsed.textline_cells
+
    def get_image(
-        self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
+        self,
+        scale: float = 1.0,
+        max_size: Optional[int] = None,
+        cropbox: Optional[BoundingBox] = None,
    ) -> Optional[Image]:
        if self._backend is None:
            return self._image_cache.get(scale, None)

+        if max_size:
+            assert self.size is not None
+            scale = min(scale, max_size / max(self.size.as_tuple()))
+
        if scale not in self._image_cache:
            if cropbox is None:
                self._image_cache[scale] = self._backend.get_page_image(scale=scale)
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@ -302,7 +302,9 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
        ),
    )

-    generate_parsed_pages: bool = False
+    generate_parsed_pages: Literal[True] = (
+        True  # Always True since parsed_page is now mandatory
+    )


 class ProcessingPipeline(str, Enum):
--- a/docling/datamodel/pipeline_options_vlm_model.py
+++ b/docling/datamodel/pipeline_options_vlm_model.py
@ -10,6 +10,8 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
 class BaseVlmOptions(BaseModel):
    kind: str
    prompt: str
+    scale: float = 2.0
+    max_size: Optional[int] = None


 class ResponseFormat(str, Enum):
@ -49,8 +51,6 @@ class InlineVlmOptions(BaseVlmOptions):
        AcceleratorDevice.MPS,
    ]

-    scale: float = 2.0
-
    temperature: float = 0.0
    stop_strings: List[str] = []
    extra_generation_config: Dict[str, Any] = {}
@ -76,7 +76,6 @@ class ApiVlmOptions(BaseVlmOptions):
    )  # Default to ollama
    headers: Dict[str, str] = {}
    params: Dict[str, Any] = {}
-    scale: float = 2.0
    timeout: float = 60
    concurrency: int = 1
    response_format: ResponseFormat
--- a/docling/models/api_vlm_model.py
+++ b/docling/models/api_vlm_model.py
@ -48,7 +48,9 @@ class ApiVlmModel(BasePageModel):
                with TimeRecorder(conv_res, "vlm"):
                    assert page.size is not None

-                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
+                    hi_res_image = page.get_image(
+                        scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+                    )
                    assert hi_res_image is not None
                    if hi_res_image:
                        if hi_res_image.mode != "RGB":
--- a/docling/models/base_model.py
+++ b/docling/models/base_model.py
@ -86,7 +86,7 @@ class BaseItemAndImageEnrichmentModel(
            coord_origin=bbox.coord_origin,
        )

-        page_ix = element_prov.page_no - 1
+        page_ix = element_prov.page_no - conv_res.pages[0].page_no - 1
        cropped_image = conv_res.pages[page_ix].get_image(
            scale=self.images_scale, cropbox=expanded_bbox
        )
--- a/docling/models/base_ocr_model.py
+++ b/docling/models/base_ocr_model.py
@ -7,6 +7,7 @@ from typing import List, Optional, Type

 import numpy as np
 from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import TextCell
 from PIL import Image, ImageDraw
 from rtree import index
 from scipy.ndimage import binary_dilation, find_objects, label
@ -107,7 +108,9 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
            return []

    # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
-    def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
+    def _filter_ocr_cells(
+        self, ocr_cells: List[TextCell], programmatic_cells: List[TextCell]
+    ) -> List[TextCell]:
        # Create R-tree index for programmatic cells
        p = index.Property()
        p.dimension = 2
@ -130,19 +133,38 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
        ]
        return filtered_ocr_cells

-    def post_process_cells(self, ocr_cells, programmatic_cells):
+    def post_process_cells(self, ocr_cells: List[TextCell], page: Page) -> None:
        r"""
-        Post-process the ocr and programmatic cells and return the final list of of cells
+        Post-process the OCR cells and update the page object.
+        Updates parsed_page.textline_cells directly since page.cells is now read-only.
        """
-        if self.options.force_full_page_ocr:
-            # If a full page OCR is forced, use only the OCR cells
-            cells = ocr_cells
-            return cells
+        # Get existing cells from the read-only property
+        existing_cells = page.cells

-        ## Remove OCR cells which overlap with programmatic cells.
-        filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
-        programmatic_cells.extend(filtered_ocr_cells)
-        return programmatic_cells
+        # Combine existing and OCR cells with overlap filtering
+        final_cells = self._combine_cells(existing_cells, ocr_cells)
+
+        assert page.parsed_page is not None
+
+        # Update parsed_page.textline_cells directly
+        page.parsed_page.textline_cells = final_cells
+        page.parsed_page.has_lines = len(final_cells) > 0
+
+    def _combine_cells(
+        self, existing_cells: List[TextCell], ocr_cells: List[TextCell]
+    ) -> List[TextCell]:
+        """Merge OCR cells into the existing cells and renumber the result.
+
+        With ``force_full_page_ocr`` set, the OCR cells are returned on their
+        own (the same list object that was passed in). Otherwise the result is
+        a new list: the existing cells followed by the OCR cells kept by
+        ``_filter_ocr_cells``. In both cases every cell's ``index`` is
+        overwritten in place with its position in the returned list.
+        """
+        if not self.options.force_full_page_ocr:
+            kept_ocr_cells = self._filter_ocr_cells(ocr_cells, existing_cells)
+            merged = [*existing_cells, *kept_ocr_cells]
+        else:
+            merged = ocr_cells
+
+        # Renumber so that indices match positions in the merged list.
+        for position, cell in enumerate(merged):
+            cell.index = position
+
+        return merged

    def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
        image = copy.deepcopy(page.image)
--- a/docling/models/easyocr_model.py
+++ b/docling/models/easyocr_model.py
@ -177,7 +177,7 @@ class EasyOcrModel(BaseOcrModel):
                        all_ocr_cells.extend(cells)

                    # Post-process the cells
-                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+                    self.post_process_cells(all_ocr_cells, page)

                # DEBUG code:
                if settings.debug.visualize_ocr:
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@ -176,9 +176,9 @@ class LayoutModel(BasePageModel):
                    # Apply postprocessing

                    processed_clusters, processed_cells = LayoutPostprocessor(
-                        page.cells, clusters, page.size
+                        page, clusters
                    ).postprocess()
-                    # processed_clusters, processed_cells = clusters, page.cells
+                    # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally

                    with warnings.catch_warnings():
                        warnings.filterwarnings(
@ -198,7 +198,6 @@ class LayoutModel(BasePageModel):
                            )
                        )

-                    page.cells = processed_cells
                    page.predictions.layout = LayoutPrediction(
                        clusters=processed_clusters
                    )
--- a/docling/models/ocr_mac_model.py
+++ b/docling/models/ocr_mac_model.py
@ -132,7 +132,7 @@ class OcrMacModel(BaseOcrModel):
                        all_ocr_cells.extend(cells)

                    # Post-process the cells
-                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+                    self.post_process_cells(all_ocr_cells, page)

                # DEBUG code:
                if settings.debug.visualize_ocr:
--- a/docling/models/page_preprocessing_model.py
+++ b/docling/models/page_preprocessing_model.py
@ -2,7 +2,7 @@ import re
 import warnings
 from collections.abc import Iterable
 from pathlib import Path
-from typing import Optional
+from typing import Literal, Optional

 import numpy as np
 from PIL import ImageDraw
@ -17,7 +17,6 @@ from docling.utils.profiling import TimeRecorder

 class PagePreprocessingOptions(BaseModel):
    images_scale: Optional[float]
-    create_parsed_page: bool


 class PagePreprocessingModel(BasePageModel):
@ -66,10 +65,8 @@ class PagePreprocessingModel(BasePageModel):
    def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
        assert page._backend is not None

-        page.cells = list(page._backend.get_text_cells())
-
-        if self.options.create_parsed_page:
-            page.parsed_page = page._backend.get_segmented_page()
+        page.parsed_page = page._backend.get_segmented_page()
+        assert page.parsed_page is not None

        # Rate the text quality from the PDF parser, and aggregate on page
        text_scores = []
--- a/docling/models/rapid_ocr_model.py
+++ b/docling/models/rapid_ocr_model.py
@ -134,7 +134,7 @@ class RapidOcrModel(BaseOcrModel):
                            all_ocr_cells.extend(cells)

                    # Post-process the cells
-                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+                    self.post_process_cells(all_ocr_cells, page)

                # DEBUG code:
                if settings.debug.visualize_ocr:
--- a/docling/models/readingorder_model.py
+++ b/docling/models/readingorder_model.py
@ -124,7 +124,7 @@ class ReadingOrderModel:
            page_no = page.page_no + 1
            size = page.size

-            assert size is not None
+            assert size is not None, "Page size is not initialized."

            out_doc.add_page(page_no=page_no, size=size)

--- a/docling/models/tesseract_ocr_cli_model.py
+++ b/docling/models/tesseract_ocr_cli_model.py
@ -306,7 +306,7 @@ class TesseractOcrCliModel(BaseOcrModel):
                            all_ocr_cells.append(cell)

                    # Post-process the cells
-                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+                    self.post_process_cells(all_ocr_cells, page)

                # DEBUG code:
                if settings.debug.visualize_ocr:
--- a/docling/models/tesseract_ocr_model.py
+++ b/docling/models/tesseract_ocr_model.py
@ -235,7 +235,7 @@ class TesseractOcrModel(BaseOcrModel):
                        all_ocr_cells.extend(cells)

                    # Post-process the cells
-                    page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+                    self.post_process_cells(all_ocr_cells, page)

                # DEBUG code:
                if settings.debug.visualize_ocr:
--- a/docling/models/vlm_models_inline/hf_transformers_model.py
+++ b/docling/models/vlm_models_inline/hf_transformers_model.py
@ -123,7 +123,9 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                with TimeRecorder(conv_res, "vlm"):
                    assert page.size is not None

-                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
+                    hi_res_image = page.get_image(
+                        scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+                    )

                    # Define prompt structure
                    prompt = self.formulate_prompt()
--- a/docling/models/vlm_models_inline/mlx_model.py
+++ b/docling/models/vlm_models_inline/mlx_model.py
@ -73,7 +73,9 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
                    assert page.size is not None

-                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
+                    hi_res_image = page.get_image(
+                        scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+                    )
                    if hi_res_image is not None:
                        im_width, im_height = hi_res_image.size

--- a/docling/pipeline/base_pipeline.py
+++ b/docling/pipeline/base_pipeline.py
@ -193,6 +193,17 @@ class PaginatedPipeline(BasePipeline):  # TODO this is a bad name.
                )
                raise e

+            # Filter out uninitialized pages (those with size=None) that may remain
+            # after timeout or processing failures to prevent assertion errors downstream
+            initial_page_count = len(conv_res.pages)
+            conv_res.pages = [page for page in conv_res.pages if page.size is not None]
+
+            if len(conv_res.pages) < initial_page_count:
+                _log.info(
+                    f"Filtered out {initial_page_count - len(conv_res.pages)} uninitialized pages "
+                    f"due to timeout or processing failures"
+                )
+
        return conv_res

    def _unload(self, conv_res: ConversionResult) -> ConversionResult:
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@ -72,7 +72,6 @@ class StandardPdfPipeline(PaginatedPipeline):
            PagePreprocessingModel(
                options=PagePreprocessingOptions(
                    images_scale=pipeline_options.images_scale,
-                    create_parsed_page=pipeline_options.generate_parsed_pages,
                )
            ),
            # OCR
--- a/docling/utils/layout_postprocessor.py
+++ b/docling/utils/layout_postprocessor.py
@ -8,7 +8,7 @@ from docling_core.types.doc import DocItemLabel, Size
 from docling_core.types.doc.page import TextCell
 from rtree import index

-from docling.datamodel.base_models import BoundingBox, Cluster
+from docling.datamodel.base_models import BoundingBox, Cluster, Page

 _log = logging.getLogger(__name__)

@ -194,11 +194,11 @@ class LayoutPostprocessor:
        DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
    }

-    def __init__(self, cells: List[TextCell], clusters: List[Cluster], page_size: Size):
-        """Initialize processor with cells and clusters."""
-        """Initialize processor with cells and spatial indices."""
-        self.cells = cells
-        self.page_size = page_size
+    def __init__(self, page: Page, clusters: List[Cluster]) -> None:
+        """Initialize processor with page and clusters."""
+        self.cells = page.cells
+        self.page = page
+        self.page_size = page.size
        self.all_clusters = clusters
        self.regular_clusters = [
            c for c in clusters if c.label not in self.SPECIAL_TYPES
@ -240,6 +240,10 @@ class LayoutPostprocessor:
            for child in cluster.children:
                child.cells = self._sort_cells(child.cells)

+        assert self.page.parsed_page is not None
+        self.page.parsed_page.textline_cells = self.cells
+        self.page.parsed_page.has_lines = len(self.cells) > 0
+
        return final_clusters, self.cells

    def _process_regular_clusters(self) -> List[Cluster]:
@ -301,6 +305,7 @@ class LayoutPostprocessor:
        special_clusters = self._handle_cross_type_overlaps(special_clusters)

        # Calculate page area from known page size
+        assert self.page_size is not None
        page_area = self.page_size.width * self.page_size.height
        if page_area > 0:
            # Filter out full-page pictures
--- a/docs/examples/batch_convert.py
+++ b/docs/examples/batch_convert.py
@ -121,14 +121,15 @@ def export_documents(
 def main():
    logging.basicConfig(level=logging.INFO)

+    data_folder = Path(__file__).parent / "../../tests/data"
    input_doc_paths = [
-        Path("./tests/data/pdf/2206.01062.pdf"),
-        Path("./tests/data/pdf/2203.01017v2.pdf"),
-        Path("./tests/data/pdf/2305.03393v1.pdf"),
-        Path("./tests/data/pdf/redp5110_sampled.pdf"),
+        data_folder / "pdf/2206.01062.pdf",
+        data_folder / "pdf/2203.01017v2.pdf",
+        data_folder / "pdf/2305.03393v1.pdf",
+        data_folder / "pdf/redp5110_sampled.pdf",
    ]

-    # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
+    # buf = BytesIO((data_folder / "pdf/2206.01062.pdf").open("rb").read())
    # docs = [DocumentStream(name="my_doc.pdf", stream=buf)]
    # input = DocumentConversionInput.from_streams(docs)

--- a/docs/examples/custom_convert.py
+++ b/docs/examples/custom_convert.py
@ -16,7 +16,8 @@ _log = logging.getLogger(__name__)
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"

    ###########################################################################

--- a/docs/examples/develop_formula_understanding.py
+++ b/docs/examples/develop_formula_understanding.py
@ -71,7 +71,8 @@ class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline):
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2203.01017v2.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2203.01017v2.pdf"

    pipeline_options = ExampleFormulaUnderstandingPipelineOptions()
    pipeline_options.do_formula_understanding = True
--- a/docs/examples/develop_picture_enrichment.py
+++ b/docs/examples/develop_picture_enrichment.py
@ -76,7 +76,8 @@ class ExamplePictureClassifierPipeline(StandardPdfPipeline):
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"

    pipeline_options = ExamplePictureClassifierPipelineOptions()
    pipeline_options.images_scale = 2.0
--- a/docs/examples/export_figures.py
+++ b/docs/examples/export_figures.py
@ -16,7 +16,8 @@ IMAGE_RESOLUTION_SCALE = 2.0
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
--- a/docs/examples/export_multimodal.py
+++ b/docs/examples/export_multimodal.py
@ -19,7 +19,8 @@ IMAGE_RESOLUTION_SCALE = 2.0
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
--- a/docs/examples/export_tables.py
+++ b/docs/examples/export_tables.py
@ -12,7 +12,8 @@ _log = logging.getLogger(__name__)
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"
    output_dir = Path("scratch")

    doc_converter = DocumentConverter()
--- a/docs/examples/full_page_ocr.py
+++ b/docs/examples/full_page_ocr.py
@ -9,7 +9,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption


 def main():
-    input_doc = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
@ -32,7 +33,7 @@ def main():
        }
    )

-    doc = converter.convert(input_doc).document
+    doc = converter.convert(input_doc_path).document
    md = doc.export_to_markdown()
    print(md)

--- a/docs/examples/pictures_description_api.py
+++ b/docs/examples/pictures_description_api.py
@ -96,7 +96,8 @@ def watsonx_vlm_options():
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"

    pipeline_options = PdfPipelineOptions(
        enable_remote_services=True  # <-- this is required!
--- a/docs/examples/run_with_accelerator.py
+++ b/docs/examples/run_with_accelerator.py
@ -10,7 +10,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption


 def main():
-    input_doc = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"

    # Explicitly set the accelerator
    # accelerator_options = AcceleratorOptions(
@ -47,7 +48,7 @@ def main():
    settings.debug.profile_pipeline_timings = True

    # Convert the document
-    conversion_result = converter.convert(input_doc)
+    conversion_result = converter.convert(input_doc_path)
    doc = conversion_result.document

    # List with total time per document
--- a/docs/examples/tesseract_lang_detection.py
+++ b/docs/examples/tesseract_lang_detection.py
@ -9,7 +9,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption


 def main():
-    input_doc = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"

    # Set lang=["auto"] with a tesseract OCR engine: TesseractOcrOptions, TesseractCliOcrOptions
    # ocr_options = TesseractOcrOptions(lang=["auto"])
@ -27,7 +28,7 @@ def main():
        }
    )

-    doc = converter.convert(input_doc).document
+    doc = converter.convert(input_doc_path).document
    md = doc.export_to_markdown()
    print(md)

--- a/docs/examples/translate.py
+++ b/docs/examples/translate.py
@ -30,7 +30,8 @@ def translate(text: str, src: str = "en", dest: str = "de"):
 def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter
--- a/docs/examples/vlm_pipeline_api_model.py
+++ b/docs/examples/vlm_pipeline_api_model.py
@ -95,8 +95,8 @@ def watsonx_vlm_options(model: str, prompt: str):
 def main():
    logging.basicConfig(level=logging.INFO)

-    # input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
-    input_doc_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2305.03393v1-pg9.pdf"

    pipeline_options = VlmPipelineOptions(
        enable_remote_services=True  # <-- this is required!
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [project]
 name = "docling"
-version = "2.36.1"  # DO NOT EDIT, updated automatically
+version = "2.37.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 license = "MIT"
 keywords = [
--- a/tests/data/asciidoc/test_03.asciidoc
+++ b/tests/data/asciidoc/test_03.asciidoc
@ -0,0 +1,29 @@
+:_mod-docs-content-type: PROCEDURE
+:experimental:
+
+[id="renaming-a-bookmark_{context}"]
+= Renaming a bookmark
+
+You can rename a bookmark to distinguish it from other bookmarks. If you have bookmarks to several folders that all share the same name, you can tell the bookmarks apart if you rename them.
+
+Renaming the bookmark does not rename the folder.
+
+.Procedure
+
+. Right-click the bookmark in the side bar.
+
+. Select *Rename…*.
+
+image::rename-bookmark-menu.png[Rename bookmark menu]
+
+. In the *Name* field, enter the new name for the bookmark.
+
+image::rename-bookmark-text.png[Bookmark name field]
+
+. Click btn:[Rename].
+
+.Verification
+
+* Check that the side bar lists the bookmark under the new name.
+
+image::renamed-bookmark.png[Renamed bookmark]
--- a/tests/data/docx/word_image_anchors.docx
+++ b/tests/data/docx/word_image_anchors.docx
--- a/tests/data/groundtruth/docling_v1/2203.01017v2.pages.json
+++ b/tests/data/groundtruth/docling_v1/2203.01017v2.pages.json
--- a/tests/data/groundtruth/docling_v1/2206.01062.pages.json
+++ b/tests/data/groundtruth/docling_v1/2206.01062.pages.json
--- a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json
--- a/tests/data/groundtruth/docling_v1/2305.03393v1.pages.json
+++ b/tests/data/groundtruth/docling_v1/2305.03393v1.pages.json
--- a/tests/data/groundtruth/docling_v1/amt_handbook_sample.pages.json
+++ b/tests/data/groundtruth/docling_v1/amt_handbook_sample.pages.json
--- a/tests/data/groundtruth/docling_v1/code_and_formula.pages.json
+++ b/tests/data/groundtruth/docling_v1/code_and_formula.pages.json
--- a/tests/data/groundtruth/docling_v1/multi_page.pages.json
+++ b/tests/data/groundtruth/docling_v1/multi_page.pages.json
--- a/tests/data/groundtruth/docling_v1/picture_classification.pages.json
+++ b/tests/data/groundtruth/docling_v1/picture_classification.pages.json
--- a/tests/data/groundtruth/docling_v1/redp5110_sampled.pages.json
+++ b/tests/data/groundtruth/docling_v1/redp5110_sampled.pages.json
--- a/tests/data/groundtruth/docling_v1/right_to_left_01.pages.json
+++ b/tests/data/groundtruth/docling_v1/right_to_left_01.pages.json
--- a/tests/data/groundtruth/docling_v1/right_to_left_02.pages.json
+++ b/tests/data/groundtruth/docling_v1/right_to_left_02.pages.json
--- a/tests/data/groundtruth/docling_v1/right_to_left_03.pages.json
+++ b/tests/data/groundtruth/docling_v1/right_to_left_03.pages.json
--- a/tests/data/groundtruth/docling_v2/2203.01017v2.pages.json
+++ b/tests/data/groundtruth/docling_v2/2203.01017v2.pages.json
--- a/tests/data/groundtruth/docling_v2/2206.01062.pages.json
+++ b/tests/data/groundtruth/docling_v2/2206.01062.pages.json
--- a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json
--- a/tests/data/groundtruth/docling_v2/2305.03393v1.pages.json
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1.pages.json
--- a/tests/data/groundtruth/docling_v2/amt_handbook_sample.pages.json
+++ b/tests/data/groundtruth/docling_v2/amt_handbook_sample.pages.json
--- a/tests/data/groundtruth/docling_v2/code_and_formula.pages.json
+++ b/tests/data/groundtruth/docling_v2/code_and_formula.pages.json
--- a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md
+++ b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.md
@ -0,0 +1,20 @@
+# Contribution guideline example
+
+This is simple.
+
+Foo *emphasis* **strong emphasis** ***both*** .
+
+Create your feature branch: `git checkout -b feature/AmazingFeature` .
+
+1. Pull the [**repository**](https://github.com/docling-project/docling) .
+2. Create your feature branch ( `git checkout -b feature/AmazingFeature` )
+3. Commit your changes ( `git commit -m 'Add some AmazingFeature'` )
+4. Push to the branch ( `git push origin feature/AmazingFeature` )
+5. Open a Pull Request
+
+## 
+
+*Second* section
+
+- **First** : Lorem ipsum.
+- **Second** : Dolor `sit` amet.
--- a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml
+++ b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml
@ -0,0 +1,565 @@
+body:
+  children:
+  - $ref: '#/texts/0'
+  - $ref: '#/texts/1'
+  - $ref: '#/groups/0'
+  - $ref: '#/groups/1'
+  - $ref: '#/groups/2'
+  - $ref: '#/texts/27'
+  - $ref: '#/groups/8'
+  content_layer: body
+  label: unspecified
+  name: _root_
+  self_ref: '#/body'
+form_items: []
+furniture:
+  children: []
+  content_layer: furniture
+  label: unspecified
+  name: _root_
+  self_ref: '#/furniture'
+groups:
+- children:
+  - $ref: '#/texts/2'
+  - $ref: '#/texts/3'
+  - $ref: '#/texts/4'
+  - $ref: '#/texts/5'
+  - $ref: '#/texts/6'
+  content_layer: body
+  label: inline
+  name: group
+  parent:
+    $ref: '#/body'
+  self_ref: '#/groups/0'
+- children:
+  - $ref: '#/texts/7'
+  - $ref: '#/texts/8'
+  - $ref: '#/texts/9'
+  content_layer: body
+  label: inline
+  name: group
+  parent:
+    $ref: '#/body'
+  self_ref: '#/groups/1'
+- children:
+  - $ref: '#/texts/10'
+  - $ref: '#/texts/14'
+  - $ref: '#/texts/18'
+  - $ref: '#/texts/22'
+  - $ref: '#/texts/26'
+  content_layer: body
+  label: ordered_list
+  name: list
+  parent:
+    $ref: '#/body'
+  self_ref: '#/groups/2'
+- children:
+  - $ref: '#/texts/11'
+  - $ref: '#/texts/12'
+  - $ref: '#/texts/13'
+  content_layer: body
+  label: inline
+  name: group
+  parent:
+    $ref: '#/texts/10'
+  self_ref: '#/groups/3'
+- children:
+  - $ref: '#/texts/15'
+  - $ref: '#/texts/16'
+  - $ref: '#/texts/17'
+  content_layer: body
+  label: inline
+  name: group
+  parent:
+    $ref: '#/texts/14'
+  self_ref: '#/groups/4'
+- children:
+  - $ref: '#/texts/19'
+  - $ref: '#/texts/20'
+  - $ref: '#/texts/21'
+  content_layer: body
+  label: inline
+  name: group
+  parent:
+    $ref: '#/texts/18'
+  self_ref: '#/groups/5'
+- children:
+  - $ref: '#/texts/23'
+  - $ref: '#/texts/24'
+  - $ref: '#/texts/25'
+  content_layer: body
+  label: inline
+  name: group
+  parent:
+    $ref: '#/texts/22'
+  self_ref: '#/groups/6'
+- children:
+  - $ref: '#/texts/28'
+  - $ref: '#/texts/29'
+  content_layer: body
+  label: inline
+  name: group
+  parent:
+    $ref: '#/texts/27'
+  self_ref: '#/groups/7'
+- children:
+  - $ref: '#/texts/30'
+  - $ref: '#/texts/33'
+  content_layer: body
+  label: list
+  name: list
+  parent:
+    $ref: '#/body'
+  self_ref: '#/groups/8'
+- children:
+  - $ref: '#/texts/31'
+  - $ref: '#/texts/32'
+  content_layer: body
+  label: inline
+  name: group
+  parent:
+    $ref: '#/texts/30'
+  self_ref: '#/groups/9'
+- children:
+  - $ref: '#/texts/34'
+  - $ref: '#/texts/35'
+  - $ref: '#/texts/36'
+  - $ref: '#/texts/37'
+  content_layer: body
+  label: inline
+  name: group
+  parent:
+    $ref: '#/texts/33'
+  self_ref: '#/groups/10'
+key_value_items: []
+name: inline_and_formatting
+origin:
+  binary_hash: 9342273634728023910
+  filename: inline_and_formatting.md
+  mimetype: text/markdown
+pages: {}
+pictures: []
+schema_name: DoclingDocument
+tables: []
+texts:
+- children: []
+  content_layer: body
+  label: title
+  orig: Contribution guideline example
+  parent:
+    $ref: '#/body'
+  prov: []
+  self_ref: '#/texts/0'
+  text: Contribution guideline example
+- children: []
+  content_layer: body
+  label: text
+  orig: This is simple.
+  parent:
+    $ref: '#/body'
+  prov: []
+  self_ref: '#/texts/1'
+  text: This is simple.
+- children: []
+  content_layer: body
+  label: text
+  orig: Foo
+  parent:
+    $ref: '#/groups/0'
+  prov: []
+  self_ref: '#/texts/2'
+  text: Foo
+- children: []
+  content_layer: body
+  formatting:
+    bold: false
+    italic: true
+    strikethrough: false
+    underline: false
+  label: text
+  orig: emphasis
+  parent:
+    $ref: '#/groups/0'
+  prov: []
+  self_ref: '#/texts/3'
+  text: emphasis
+- children: []
+  content_layer: body
+  formatting:
+    bold: true
+    italic: false
+    strikethrough: false
+    underline: false
+  label: text
+  orig: strong emphasis
+  parent:
+    $ref: '#/groups/0'
+  prov: []
+  self_ref: '#/texts/4'
+  text: strong emphasis
+- children: []
+  content_layer: body
+  formatting:
+    bold: true
+    italic: true
+    strikethrough: false
+    underline: false
+  label: text
+  orig: both
+  parent:
+    $ref: '#/groups/0'
+  prov: []
+  self_ref: '#/texts/5'
+  text: both
+- children: []
+  content_layer: body
+  label: text
+  orig: .
+  parent:
+    $ref: '#/groups/0'
+  prov: []
+  self_ref: '#/texts/6'
+  text: .
+- children: []
+  content_layer: body
+  label: text
+  orig: 'Create your feature branch:'
+  parent:
+    $ref: '#/groups/1'
+  prov: []
+  self_ref: '#/texts/7'
+  text: 'Create your feature branch:'
+- captions: []
+  children: []
+  code_language: unknown
+  content_layer: body
+  footnotes: []
+  label: code
+  orig: git checkout -b feature/AmazingFeature
+  parent:
+    $ref: '#/groups/1'
+  prov: []
+  references: []
+  self_ref: '#/texts/8'
+  text: git checkout -b feature/AmazingFeature
+- children: []
+  content_layer: body
+  label: text
+  orig: .
+  parent:
+    $ref: '#/groups/1'
+  prov: []
+  self_ref: '#/texts/9'
+  text: .
+- children:
+  - $ref: '#/groups/3'
+  content_layer: body
+  enumerated: true
+  label: list_item
+  marker: '-'
+  orig: ''
+  parent:
+    $ref: '#/groups/2'
+  prov: []
+  self_ref: '#/texts/10'
+  text: ''
+- children: []
+  content_layer: body
+  label: text
+  orig: Pull the
+  parent:
+    $ref: '#/groups/3'
+  prov: []
+  self_ref: '#/texts/11'
+  text: Pull the
+- children: []
+  content_layer: body
+  formatting:
+    bold: true
+    italic: false
+    strikethrough: false
+    underline: false
+  hyperlink: https://github.com/docling-project/docling
+  label: text
+  orig: repository
+  parent:
+    $ref: '#/groups/3'
+  prov: []
+  self_ref: '#/texts/12'
+  text: repository
+- children: []
+  content_layer: body
+  label: text
+  orig: .
+  parent:
+    $ref: '#/groups/3'
+  prov: []
+  self_ref: '#/texts/13'
+  text: .
+- children:
+  - $ref: '#/groups/4'
+  content_layer: body
+  enumerated: true
+  label: list_item
+  marker: '-'
+  orig: ''
+  parent:
+    $ref: '#/groups/2'
+  prov: []
+  self_ref: '#/texts/14'
+  text: ''
+- children: []
+  content_layer: body
+  label: text
+  orig: Create your feature branch (
+  parent:
+    $ref: '#/groups/4'
+  prov: []
+  self_ref: '#/texts/15'
+  text: Create your feature branch (
+- captions: []
+  children: []
+  code_language: unknown
+  content_layer: body
+  footnotes: []
+  label: code
+  orig: git checkout -b feature/AmazingFeature
+  parent:
+    $ref: '#/groups/4'
+  prov: []
+  references: []
+  self_ref: '#/texts/16'
+  text: git checkout -b feature/AmazingFeature
+- children: []
+  content_layer: body
+  label: text
+  orig: )
+  parent:
+    $ref: '#/groups/4'
+  prov: []
+  self_ref: '#/texts/17'
+  text: )
+- children:
+  - $ref: '#/groups/5'
+  content_layer: body
+  enumerated: true
+  label: list_item
+  marker: '-'
+  orig: ''
+  parent:
+    $ref: '#/groups/2'
+  prov: []
+  self_ref: '#/texts/18'
+  text: ''
+- children: []
+  content_layer: body
+  label: text
+  orig: Commit your changes (
+  parent:
+    $ref: '#/groups/5'
+  prov: []
+  self_ref: '#/texts/19'
+  text: Commit your changes (
+- captions: []
+  children: []
+  code_language: unknown
+  content_layer: body
+  footnotes: []
+  label: code
+  orig: git commit -m 'Add some AmazingFeature'
+  parent:
+    $ref: '#/groups/5'
+  prov: []
+  references: []
+  self_ref: '#/texts/20'
+  text: git commit -m 'Add some AmazingFeature'
+- children: []
+  content_layer: body
+  label: text
+  orig: )
+  parent:
+    $ref: '#/groups/5'
+  prov: []
+  self_ref: '#/texts/21'
+  text: )
+- children:
+  - $ref: '#/groups/6'
+  content_layer: body
+  enumerated: true
+  label: list_item
+  marker: '-'
+  orig: ''
+  parent:
+    $ref: '#/groups/2'
+  prov: []
+  self_ref: '#/texts/22'
+  text: ''
+- children: []
+  content_layer: body
+  label: text
+  orig: Push to the branch (
+  parent:
+    $ref: '#/groups/6'
+  prov: []
+  self_ref: '#/texts/23'
+  text: Push to the branch (
+- captions: []
+  children: []
+  code_language: unknown
+  content_layer: body
+  footnotes: []
+  label: code
+  orig: git push origin feature/AmazingFeature
+  parent:
+    $ref: '#/groups/6'
+  prov: []
+  references: []
+  self_ref: '#/texts/24'
+  text: git push origin feature/AmazingFeature
+- children: []
+  content_layer: body
+  label: text
+  orig: )
+  parent:
+    $ref: '#/groups/6'
+  prov: []
+  self_ref: '#/texts/25'
+  text: )
+- children: []
+  content_layer: body
+  enumerated: true
+  label: list_item
+  marker: '-'
+  orig: Open a Pull Request
+  parent:
+    $ref: '#/groups/2'
+  prov: []
+  self_ref: '#/texts/26'
+  text: Open a Pull Request
+- children:
+  - $ref: '#/groups/7'
+  content_layer: body
+  label: section_header
+  level: 1
+  orig: ''
+  parent:
+    $ref: '#/body'
+  prov: []
+  self_ref: '#/texts/27'
+  text: ''
+- children: []
+  content_layer: body
+  formatting:
+    bold: false
+    italic: true
+    strikethrough: false
+    underline: false
+  label: text
+  orig: Second
+  parent:
+    $ref: '#/groups/7'
+  prov: []
+  self_ref: '#/texts/28'
+  text: Second
+- children: []
+  content_layer: body
+  label: text
+  orig: section
+  parent:
+    $ref: '#/groups/7'
+  prov: []
+  self_ref: '#/texts/29'
+  text: section
+- children:
+  - $ref: '#/groups/9'
+  content_layer: body
+  enumerated: false
+  label: list_item
+  marker: '-'
+  orig: ''
+  parent:
+    $ref: '#/groups/8'
+  prov: []
+  self_ref: '#/texts/30'
+  text: ''
+- children: []
+  content_layer: body
+  formatting:
+    bold: true
+    italic: false
+    strikethrough: false
+    underline: false
+  label: text
+  orig: First
+  parent:
+    $ref: '#/groups/9'
+  prov: []
+  self_ref: '#/texts/31'
+  text: First
+- children: []
+  content_layer: body
+  label: text
+  orig: ': Lorem ipsum.'
+  parent:
+    $ref: '#/groups/9'
+  prov: []
+  self_ref: '#/texts/32'
+  text: ': Lorem ipsum.'
+- children:
+  - $ref: '#/groups/10'
+  content_layer: body
+  enumerated: false
+  label: list_item
+  marker: '-'
+  orig: ''
+  parent:
+    $ref: '#/groups/8'
+  prov: []
+  self_ref: '#/texts/33'
+  text: ''
+- children: []
+  content_layer: body
+  formatting:
+    bold: true
+    italic: false
+    strikethrough: false
+    underline: false
+  label: text
+  orig: Second
+  parent:
+    $ref: '#/groups/10'
+  prov: []
+  self_ref: '#/texts/34'
+  text: Second
+- children: []
+  content_layer: body
+  label: text
+  orig: ': Dolor'
+  parent:
+    $ref: '#/groups/10'
+  prov: []
+  self_ref: '#/texts/35'
+  text: ': Dolor'
+- captions: []
+  children: []
+  code_language: unknown
+  content_layer: body
+  footnotes: []
+  label: code
+  orig: sit
+  parent:
+    $ref: '#/groups/10'
+  prov: []
+  references: []
+  self_ref: '#/texts/36'
+  text: sit
+- children: []
+  content_layer: body
+  label: text
+  orig: amet.
+  parent:
+    $ref: '#/groups/10'
+  prov: []
+  self_ref: '#/texts/37'
+  text: amet.
+version: 1.3.0
--- a/tests/data/groundtruth/docling_v2/multi_page.pages.json
+++ b/tests/data/groundtruth/docling_v2/multi_page.pages.json
--- a/tests/data/groundtruth/docling_v2/picture_classification.pages.json
+++ b/tests/data/groundtruth/docling_v2/picture_classification.pages.json
--- a/tests/data/groundtruth/docling_v2/powerpoint_bad_text.pptx.itxt
+++ b/tests/data/groundtruth/docling_v2/powerpoint_bad_text.pptx.itxt
@ -0,0 +1,3 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: chapter: group slide-0
+    item-2 at level 2: title: X-Library The fully customisable ... llection exclusively for our customers
--- a/tests/data/groundtruth/docling_v2/powerpoint_bad_text.pptx.json
+++ b/tests/data/groundtruth/docling_v2/powerpoint_bad_text.pptx.json
@ -0,0 +1,86 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.3.0",
+  "name": "powerpoint_bad_text",
+  "origin": {
+    "mimetype": "application/vnd.ms-powerpoint",
+    "binary_hash": 1443005848482130016,
+    "filename": "powerpoint_bad_text.pptx"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/groups/0"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [
+    {
+      "self_ref": "#/groups/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/0"
+        }
+      ],
+      "content_layer": "body",
+      "name": "slide-0",
+      "label": "chapter"
+    }
+  ],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "title",
+      "prov": [
+        {
+          "page_no": 1,
+          "bbox": {
+            "l": 1041400.0,
+            "t": 4582390.0,
+            "r": 8083550.0,
+            "b": 1689099.0,
+            "coord_origin": "BOTTOMLEFT"
+          },
+          "charspan": [
+            0,
+            118
+          ]
+        }
+      ],
+      "orig": "X-Library The fully customisable and copyright-free standard content template collection exclusively for our customers",
+      "text": "X-Library The fully customisable and copyright-free standard content template collection exclusively for our customers"
+    }
+  ],
+  "pictures": [],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {
+    "1": {
+      "size": {
+        "width": 12190413.0,
+        "height": 6858000.0
+      },
+      "page_no": 1
+    }
+  }
+}
--- a/tests/data/groundtruth/docling_v2/powerpoint_bad_text.pptx.md
+++ b/tests/data/groundtruth/docling_v2/powerpoint_bad_text.pptx.md
@ -0,0 +1 @@
+# X-Library The fully customisable and copyright-free standard content template collection exclusively for our customers
--- a/tests/data/groundtruth/docling_v2/redp5110_sampled.pages.json
+++ b/tests/data/groundtruth/docling_v2/redp5110_sampled.pages.json
--- a/tests/data/groundtruth/docling_v2/right_to_left_01.pages.json
+++ b/tests/data/groundtruth/docling_v2/right_to_left_01.pages.json
--- a/tests/data/groundtruth/docling_v2/right_to_left_02.pages.json
+++ b/tests/data/groundtruth/docling_v2/right_to_left_02.pages.json
--- a/tests/data/groundtruth/docling_v2/right_to_left_03.pages.json
+++ b/tests/data/groundtruth/docling_v2/right_to_left_03.pages.json
--- a/tests/data/groundtruth/docling_v2/test_03.asciidoc.md
+++ b/tests/data/groundtruth/docling_v2/test_03.asciidoc.md
@ -0,0 +1,23 @@
+:\_mod-docs-content-type: PROCEDURE :experimental:
+
+# Renaming a bookmark
+
+[id="renaming-a-bookmark\_{context}"]
+
+You can rename a bookmark to distinguish it from other bookmarks. If you have bookmarks to several folders that all share the same name, you can tell the bookmarks apart if you rename them.
+
+Renaming the bookmark does not rename the folder.
+
+- Check that the side bar lists the bookmark under the new name.
+
+Procedure . Right-click the bookmark in the side bar. . Select *Rename…*. +
+
+<!-- image -->
+
+ In the *Name* field, enter the new name for the bookmark. +
+
+<!-- image -->
+
+ Click btn:[Rename]. .Verification
+
+<!-- image -->
--- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt
@ -17,14 +17,16 @@ item-0 at level 0: unspecified: group _root_
    item-16 at level 2: list_item: Italic bullet 1
    item-17 at level 2: list_item: Bold bullet 2
    item-18 at level 2: list_item: Underline bullet 3
-    item-19 at level 2: inline: group group
-      item-20 at level 3: list_item: Some
-      item-21 at level 3: list_item: italic
-      item-22 at level 3: list_item: bold
-      item-23 at level 3: list_item: underline
-    item-24 at level 2: list: group list
-      item-25 at level 3: inline: group group
-        item-26 at level 4: list_item: Nested
-        item-27 at level 4: list_item: italic
-        item-28 at level 4: list_item: bold
-  item-29 at level 1: paragraph: 
+    item-19 at level 2: list_item: 
+      item-20 at level 3: inline: group group
+        item-21 at level 4: text: Some
+        item-22 at level 4: text: italic
+        item-23 at level 4: text: bold
+        item-24 at level 4: text: underline
+    item-25 at level 2: list: group list
+      item-26 at level 3: list_item: 
+        item-27 at level 4: inline: group group
+          item-28 at level 5: text: Nested
+          item-29 at level 5: text: italic
+          item-30 at level 5: text: bold
+  item-31 at level 1: paragraph: 
--- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json
+++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json
@ -42,7 +42,7 @@
        "$ref": "#/groups/1"
      },
      {
-        "$ref": "#/texts/23"
+        "$ref": "#/texts/25"
      }
    ],
    "content_layer": "body",
@ -98,7 +98,7 @@
          "$ref": "#/texts/15"
        },
        {
-          "$ref": "#/groups/2"
+          "$ref": "#/texts/16"
        },
        {
          "$ref": "#/groups/3"
@ -111,12 +111,9 @@
    {
      "self_ref": "#/groups/2",
      "parent": {
-        "$ref": "#/groups/1"
+        "$ref": "#/texts/16"
      },
      "children": [
-        {
-          "$ref": "#/texts/16"
-        },
        {
          "$ref": "#/texts/17"
        },
@ -125,6 +122,9 @@
        },
        {
          "$ref": "#/texts/19"
+        },
+        {
+          "$ref": "#/texts/20"
        }
      ],
      "content_layer": "body",
@ -138,7 +138,7 @@
      },
      "children": [
        {
-          "$ref": "#/groups/4"
+          "$ref": "#/texts/21"
        }
      ],
      "content_layer": "body",
@ -148,17 +148,17 @@
    {
      "self_ref": "#/groups/4",
      "parent": {
-        "$ref": "#/groups/3"
+        "$ref": "#/texts/21"
      },
      "children": [
-        {
-          "$ref": "#/texts/20"
-        },
-        {
-          "$ref": "#/texts/21"
-        },
        {
          "$ref": "#/texts/22"
+        },
+        {
+          "$ref": "#/texts/23"
+        },
+        {
+          "$ref": "#/texts/24"
        }
      ],
      "content_layer": "body",
@ -461,20 +461,18 @@
    {
      "self_ref": "#/texts/16",
      "parent": {
-        "$ref": "#/groups/2"
+        "$ref": "#/groups/1"
      },
-      "children": [],
+      "children": [
+        {
+          "$ref": "#/groups/2"
+        }
+      ],
      "content_layer": "body",
      "label": "list_item",
      "prov": [],
-      "orig": "Some",
-      "text": "Some",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false
-      },
+      "orig": "",
+      "text": "",
      "enumerated": false,
      "marker": "-"
    },
@ -485,18 +483,16 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "list_item",
+      "label": "text",
      "prov": [],
-      "orig": "italic",
-      "text": "italic",
+      "orig": "Some",
+      "text": "Some",
      "formatting": {
        "bold": false,
-        "italic": true,
+        "italic": false,
        "underline": false,
        "strikethrough": false
-      },
-      "enumerated": false,
-      "marker": "-"
+      }
    },
    {
      "self_ref": "#/texts/18",
@ -505,67 +501,7 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "list_item",
-      "prov": [],
-      "orig": "bold",
-      "text": "bold",
-      "formatting": {
-        "bold": true,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false
-      },
-      "enumerated": false,
-      "marker": "-"
-    },
-    {
-      "self_ref": "#/texts/19",
-      "parent": {
-        "$ref": "#/groups/2"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "list_item",
-      "prov": [],
-      "orig": "underline",
-      "text": "underline",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": true,
-        "strikethrough": false
-      },
-      "enumerated": false,
-      "marker": "-"
-    },
-    {
-      "self_ref": "#/texts/20",
-      "parent": {
-        "$ref": "#/groups/4"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "list_item",
-      "prov": [],
-      "orig": "Nested",
-      "text": "Nested",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false
-      },
-      "enumerated": false,
-      "marker": "-"
-    },
-    {
-      "self_ref": "#/texts/21",
-      "parent": {
-        "$ref": "#/groups/4"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "list_item",
+      "label": "text",
      "prov": [],
      "orig": "italic",
      "text": "italic",
@ -574,7 +510,59 @@
        "italic": true,
        "underline": false,
        "strikethrough": false
+      }
+    },
+    {
+      "self_ref": "#/texts/19",
+      "parent": {
+        "$ref": "#/groups/2"
      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "bold",
+      "text": "bold",
+      "formatting": {
+        "bold": true,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false
+      }
+    },
+    {
+      "self_ref": "#/texts/20",
+      "parent": {
+        "$ref": "#/groups/2"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "underline",
+      "text": "underline",
+      "formatting": {
+        "bold": false,
+        "italic": false,
+        "underline": true,
+        "strikethrough": false
+      }
+    },
+    {
+      "self_ref": "#/texts/21",
+      "parent": {
+        "$ref": "#/groups/3"
+      },
+      "children": [
+        {
+          "$ref": "#/groups/4"
+        }
+      ],
+      "content_layer": "body",
+      "label": "list_item",
+      "prov": [],
+      "orig": "",
+      "text": "",
      "enumerated": false,
      "marker": "-"
    },
@ -585,7 +573,43 @@
      },
      "children": [],
      "content_layer": "body",
-      "label": "list_item",
+      "label": "text",
+      "prov": [],
+      "orig": "Nested",
+      "text": "Nested",
+      "formatting": {
+        "bold": false,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false
+      }
+    },
+    {
+      "self_ref": "#/texts/23",
+      "parent": {
+        "$ref": "#/groups/4"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [],
+      "orig": "italic",
+      "text": "italic",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false
+      }
+    },
+    {
+      "self_ref": "#/texts/24",
+      "parent": {
+        "$ref": "#/groups/4"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
      "prov": [],
      "orig": "bold",
      "text": "bold",
@ -594,12 +618,10 @@
        "italic": false,
        "underline": false,
        "strikethrough": false
-      },
-      "enumerated": false,
-      "marker": "-"
+      }
    },
    {
-      "self_ref": "#/texts/23",
+      "self_ref": "#/texts/25",
      "parent": {
        "$ref": "#/body"
      },
--- a/tests/data/groundtruth/docling_v2/word_image_anchors.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.itxt
@ -0,0 +1,16 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: paragraph: Transcript
+  item-2 at level 1: paragraph: February 20, 2025, 8:32PM
+  item-3 at level 1: picture
+  item-4 at level 1: inline: group group
+    item-5 at level 2: paragraph: This is test 1
+    item-6 at level 2: paragraph: 0:08
+Correct, he is not.
+  item-7 at level 1: paragraph: 
+  item-8 at level 1: picture
+  item-9 at level 1: inline: group group
+    item-10 at level 2: paragraph: This is test 2
+    item-11 at level 2: paragraph: 0:16
+Yeah, exactly.
+  item-12 at level 1: paragraph: 
+  item-13 at level 1: paragraph: 
--- a/tests/data/groundtruth/docling_v2/word_image_anchors.docx.json
+++ b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.json
@ -0,0 +1,286 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.3.0",
+  "name": "word_image_anchors",
+  "origin": {
+    "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+    "binary_hash": 2428692234257307633,
+    "filename": "word_image_anchors.docx"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/texts/1"
+      },
+      {
+        "$ref": "#/pictures/0"
+      },
+      {
+        "$ref": "#/groups/0"
+      },
+      {
+        "$ref": "#/texts/4"
+      },
+      {
+        "$ref": "#/pictures/1"
+      },
+      {
+        "$ref": "#/groups/1"
+      },
+      {
+        "$ref": "#/texts/7"
+      },
+      {
+        "$ref": "#/texts/8"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [
+    {
+      "self_ref": "#/groups/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/2"
+        },
+        {
+          "$ref": "#/texts/3"
+        }
+      ],
+      "content_layer": "body",
+      "name": "group",
+      "label": "inline"
+    },
+    {
+      "self_ref": "#/groups/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/5"
+        },
+        {
+          "$ref": "#/texts/6"
+        }
+      ],
+      "content_layer": "body",
+      "name": "group",
+      "label": "inline"
+    }
+  ],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "paragraph",
+      "prov": [],
+      "orig": "Transcript",
+      "text": "Transcript",
+      "formatting": {
+        "bold": true,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false
+      }
+    },
+    {
+      "self_ref": "#/texts/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "paragraph",
+      "prov": [],
+      "orig": "February 20, 2025, 8:32PM",
+      "text": "February 20, 2025, 8:32PM",
+      "formatting": {
+        "bold": false,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false
+      }
+    },
+    {
+      "self_ref": "#/texts/2",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "paragraph",
+      "prov": [],
+      "orig": "This is test 1",
+      "text": "This is test 1",
+      "formatting": {
+        "bold": true,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false
+      }
+    },
+    {
+      "self_ref": "#/texts/3",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "paragraph",
+      "prov": [],
+      "orig": "0:08\nCorrect, he is not.",
+      "text": "0:08\nCorrect, he is not.",
+      "formatting": {
+        "bold": false,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false
+      }
+    },
+    {
+      "self_ref": "#/texts/4",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "paragraph",
+      "prov": [],
+      "orig": "",
+      "text": ""
+    },
+    {
+      "self_ref": "#/texts/5",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "paragraph",
+      "prov": [],
+      "orig": "This is test 2",
+      "text": "This is test 2",
+      "formatting": {
+        "bold": true,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false
+      }
+    },
+    {
+      "self_ref": "#/texts/6",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "paragraph",
+      "prov": [],
+      "orig": "0:16\nYeah, exactly.",
+      "text": "0:16\nYeah, exactly.",
+      "formatting": {
+        "bold": false,
+        "italic": false,
+        "underline": false,
+        "strikethrough": false
+      }
+    },
+    {
+      "self_ref": "#/texts/7",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "paragraph",
+      "prov": [],
+      "orig": "",
+      "text": ""
+    },
+    {
+      "self_ref": "#/texts/8",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "paragraph",
+      "prov": [],
+      "orig": "",
+      "text": ""
+    }
+  ],
+  "pictures": [
+    {
+      "self_ref": "#/pictures/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "picture",
+      "prov": [],
+      "captions": [],
+      "references": [],
+      "footnotes": [],
+      "image": {
+        "mimetype": "image/png",
+        "dpi": 72,
+        "size": {
+          "width": 100.0,
+          "height": 100.0
+        },
+        "uri": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAYAAABw4pVUAAAAz0lEQVR4nO3bUW0CURRF0TukQvDSauBr0mACE1VBAzYQg5Lpdw0wO2EtA+cl+/6+GQAAAAAAAAAAAADe1DIR53X9mcNcdhnf5nm93Y8T8DElyzyuv/evlx/CMqeJOOz9AP4TJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiWp8+t/k8f6/bDrvPl28CAAAAAAAAAAAAAAAAzLv5A5bTEG2TIIlOAAAAAElFTkSuQmCC"
+      },
+      "annotations": []
+    },
+    {
+      "self_ref": "#/pictures/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "picture",
+      "prov": [],
+      "captions": [],
+      "references": [],
+      "footnotes": [],
+      "image": {
+        "mimetype": "image/png",
+        "dpi": 72,
+        "size": {
+          "width": 100.0,
+          "height": 100.0
+        },
+        "uri": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAYAAABw4pVUAAAJIElEQVR4nO2dbWxb1RnH/8+1c5O4bITEwJrRF1ZAI6gtL9oK29oxihAdTQOVoGhbKyS0MDWZJk1CQ+q0aR/4xLYvJNGabdK07MukrSUNaxEvg7aUlteuLUoHrUTbseylSRSgpLGd3Ac9596kSWzHvva1fXzv/UmW4jaxj5+/z73nPOec50/QnM3t5xdbUWOlZeBGgK8jNpYC3AxQHOAGEMXAXKN+mSgF5nGAxgAeBmiIyToH0GnDwklj0jqxq/fK/0BjCJrR2jn8ZcPCXSBaC9DtAC/39h3oDMBHwHzQMvD3ga74P6ERWgjS1jG8BjAeALgVQEuZ334QoAHA2t3fHX8dQRWktX0obpi1jzDjewSshgYwcIwIf7KSiT8M9DYPB0KQts7RlWDuANCuSw/NAAPoBVF3f1fjCZQRKq8QeBzgragqqA+Ep8olDJXj0kSm+XNi6kQVw8RdnEz+otSXspIK0rZ9eDuIngTQAH8wBuYd/T3xnqoSRIauERi/ZuYN8CFEtG8K1o9LMWT2XJBN20e+TwZ1gdmEnyFKssWde3qafuvpy3r5Ym0dI78B8BiCxc7+7qYfaCXIxvbRpZEa7gOwDsHkwFSKtj7b23iu4oLYs2z6M4BlCDZnAd5S7Gy/KEHu3z5yDxN2AVhUzOv4iE+JsfmZnqbnyy7Iph+O3kcWD2g8264UzAa17nm68W+F/DEV0TOeC8XIChPj3kJ6ChV4z3gpvEzl5FOA17u9pxhuR1PODTwUIzeLJFZ2zEokiDO0Dfpoyg3LnJh5L4gz6QvqPKMY1jmx804QSYcEcAbuJY85MSz+pq7WuGEc831uqtQQJS1Yq3MlJHP2EMnahmJ4ALOpYpkDI9d6hl9T6JVAYqnWiAq5ZKlNCDW1p3y0uKQLY1YqcX22lcesPUSWXUMxSkKDE9v8e4izM+R4adoToiBalWnjROYeonaHhJSULDFO6yFh76hsL0nvIfYmtpBykCHWlGFk9X8d0uqrbqjBj7YtQlODq3QbLAtIphgffcL44N+TeO1oEgfeSkJT2Eolrpo94orO/l/ZawuuvBjFYBhAXS2px9VxE2tWmdjWZmHvgQnsemECmkEq5sAvp/9hztdPNj7DZxAB8SsMfLc1hscfvQz1dXp93+bH3Ji98KTLLvRSEDGAO1abaH8wBp2QmNuLfmk9RM5n+BvDAL6y0sTa23RLzV2K/ax7iDosUzUcOprEmydS6udoBFixJIovLYng2msiMGuyX5YW1RNuXBHFwbd1utGr2D8xI4ik2MFlP7lUFBcnGK+8kZh5/uJh+2e5ibc/FMs6OpN7yjVXR6AZLaKBpOZVq9WZPp/w+vEkXjqcwOQUqoppDeyvkTpg6R+GzltIpeQQVBXhaOD0azntGgyYgQ//p2P3sTUw5By490ePK8u1X4zANDPf2D+6YOGtd+3BgF7wctHCkEP58BG3ttTgG7eZat6RKa0iYrwzqKMggGgRlQoJVGWX20xcvyyKDetqcfsqE7F6yiiGCPH7v45DV0SLqJSr0CCX6Jq776hVj3yQZKMMi/v2XFTDZX3h66JO7RD4kQvjjKMnU3j2lQm898EkdEe0kB7SDJ9yWYzwtVtMLGuOqEnkvoMJ3XtIs2FX1fEvEQNYujiCrZti+NVPPq9m8vpCcUOVOAoAREDzlRGVVpGRmJ5wQ9SpN4Vq49Cs5KJQZwI3LJcEYxRLFkcyDnsFyXFta4vh/OgF/Ou/mk0QiWLRmeJfVcbFeclF4blX7ecy+vrOxno0Xp5ZlSVfiOBba2rxx37NhsDMNe4WrKuEFw8nsHd/Qg13MxGJAC0r5qxea4OhyuL5kGPvpTD2cfZLsfQemUxqBVHKsGsU+o9TZyeRWCDjK72kvlazCTHzuAx7x+BDWlZEEVtgQ8PUFHAxo
dtghsZk2FuRUnal5tabanD557LfIkUM6UV6wcPSQ4bgMzbeWYcNa+vUWnsmZJR/bkizIa+ChqJS15ZYs2tpHsj+qju/eim5KMGXeYg8FpqHCOMTjBOn9BvLiBZRKTKMKuTrt5jq4RbpHYOnJ/H8oblzGD2g04ZUfEaAODs0pd+E0EG0iEr57Sl/zg/Tesbpc5P43V/G9UuZOIgWhl0LXcpv+5ePLzAGXp7Az57+RON1ETojWjhTVT4CwDcbHZIpVjfuMx9O4cjxJPa/mdR8HWRGA2crKfNBED0MjTj+fgqP/tSXc9bMiAbT+7LEJaDS7Qk6lqOBEsQp9zBY6UYFmMHpkhuzhldi2RBSGS7FfpYg1u4KtSYEl2I/J2eyqWPkH34+RaUj4lmyp7vp5unnc2aEYmZSkVYFGJoX8zmCiLOMY2YSUh7YiXlmQZzz0r1lakwI0Du/KlB6Eouou5wtCjSUHus0QezaG+SqkmZIIVBf/tWACE8V9B4h+ZMlxhkFEeXEc8nFy4e4QGKbzWQs60KIGGApz6UQrxlzYgtXgqi7P/MOz5sTdJh3LOT0lnN3w/2do3vDyqTemYk909X47YV+J+farbiRSRFgj9oUXIiSKpY5yCmIpIXFjcyzhgUUtrgzH5u9vHY3ONZwOz1pWTDZma+9nqsdcm0dI/tDhwTXHOjvbvpmvr/sav+PWMPZbmQheXLWiRlKIojt08dbbDufkDwsj7a49TZ0vUNOPJXEGi5M0+c0BdtciKdhQVsWxX1MrOFCUbLb5hXqZVjwHlLx6RNruPDylWYseW+hHoZCUZt67W8Brw9v9DPWq+uLcfkUQnNiv5kTzya07y4eT88hSMOY0R6I3BdRUj6rl2IInh8MkRSBuJFJZhM+hYj2yWfMNx3i6rVRQpQBFtGTPrJOGpP1jP6eeE+p3qCkR6ek4WKA5YflYCbuks9SSjGEsh2/tZ17xOaHXeV2Kg/1yYaEbGvgnr8byoxjqSTOMu06GMdkQTIQvbJvqlxCTFOxgCg3H7P2EfHP0GWDNwPHZK+tbO9caN27lGjxDbX9M8SyQbkElNsUYNA+n2HtLiQZ6EtB0syQLdxl10KX8tteV92WE8d8RM70yTGyfJZVAy0I5iHlt6XisxQZlrq2TlnbZrt4Jzc4JQrtqnhS+0uVm5IKR1JUh4akXIWqkGDhpJwDt4+B68tnvr6L5zB8YjIAAAAASUVORK5CYII="
+      },
+      "annotations": []
+    }
+  ],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {}
+}
--- a/tests/data/groundtruth/docling_v2/word_image_anchors.docx.md
+++ b/tests/data/groundtruth/docling_v2/word_image_anchors.docx.md
@ -0,0 +1,13 @@
+**Transcript**
+
+February 20, 2025, 8:32PM
+
+<!-- image -->
+
+**This is test 1** 0:08
+Correct, he is not.
+
+<!-- image -->
+
+**This is test 2** 0:16
+Yeah, exactly.
--- a/tests/data/md/inline_and_formatting.md
+++ b/tests/data/md/inline_and_formatting.md
@ -0,0 +1,18 @@
+# Contribution guideline example
+
+This is simple.
+
+Foo *emphasis* **strong emphasis** ***both***.
+
+Create your feature branch: `git checkout -b feature/AmazingFeature`.
+
+1. Pull the [**repository**](https://github.com/docling-project/docling).
+2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
+3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
+4. Push to the branch (`git push origin feature/AmazingFeature`)
+5. Open a Pull Request
+
+## *Second* section  <!-- inline groups in headings not yet supported by serializers -->
+
+- **First**: Lorem ipsum.
+- **Second**: Dolor `sit` amet.
--- a/tests/data/pptx/powerpoint_bad_text.pptx
+++ b/tests/data/pptx/powerpoint_bad_text.pptx
--- a/tests/data/webp/groundtruth/docling_v2/webp-test.pages.json
+++ b/tests/data/webp/groundtruth/docling_v2/webp-test.pages.json
@ -5,84 +5,159 @@
      "width": 2000.0,
      "height": 2829.0
    },
-    "cells": [
-      {
-        "index": 0,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
-        },
+    "parsed_page": {
+      "dimension": {
+        "angle": 0.0,
        "rect": {
-          "r_x0": 246.4065456254215,
-          "r_y0": 329.06770715202435,
-          "r_x1": 1691.991797818404,
-          "r_y1": 329.06770715202435,
-          "r_x2": 1691.991797818404,
-          "r_y2": 258.9040166758338,
-          "r_x3": 246.4065456254215,
-          "r_y3": 258.9040166758338,
-          "coord_origin": "TOPLEFT"
+          "r_x0": 0.0,
+          "r_y0": 0.0,
+          "r_x1": 2000.0,
+          "r_y1": 0.0,
+          "r_x2": 2000.0,
+          "r_y2": 2829.0,
+          "r_x3": 0.0,
+          "r_y3": 2829.0,
+          "coord_origin": "BOTTOMLEFT"
        },
-        "text": "Docling bundles PDF document conversion to",
-        "orig": "Docling bundles PDF document conversion to",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
+        "boundary_type": "crop_box",
+        "art_bbox": {
+          "l": 0.0,
+          "t": 2829.0,
+          "r": 2000.0,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "bleed_bbox": {
+          "l": 0.0,
+          "t": 2829.0,
+          "r": 2000.0,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "crop_bbox": {
+          "l": 0.0,
+          "t": 2829.0,
+          "r": 2000.0,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "media_bbox": {
+          "l": 0.0,
+          "t": 2829.0,
+          "r": 2000.0,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "trim_bbox": {
+          "l": 0.0,
+          "t": 2829.0,
+          "r": 2000.0,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        }
      },
-      {
-        "index": 1,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
+      "bitmap_resources": [
+        {
+          "index": 0,
+          "rect": {
+            "r_x0": 0.0,
+            "r_y0": 0.0,
+            "r_x1": 2000.0,
+            "r_y1": 0.0,
+            "r_x2": 2000.0,
+            "r_y2": 2829.0,
+            "r_x3": 0.0,
+            "r_y3": 2829.0,
+            "coord_origin": "BOTTOMLEFT"
+          },
+          "uri": null
+        }
+      ],
+      "char_cells": [],
+      "word_cells": [],
+      "textline_cells": [
+        {
+          "index": 0,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 246.4065456254215,
+            "r_y0": 329.06770715202435,
+            "r_x1": 1691.991797818404,
+            "r_y1": 329.06770715202435,
+            "r_x2": 1691.991797818404,
+            "r_y2": 258.9040166758338,
+            "r_x3": 246.4065456254215,
+            "r_y3": 258.9040166758338,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "Docling bundles PDF document conversion to",
+          "orig": "Docling bundles PDF document conversion to",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
        },
-        "rect": {
-          "r_x0": 234.08627147881114,
-          "r_y0": 419.5788697734327,
-          "r_x1": 1696.0985042090742,
-          "r_y1": 419.5788697734327,
-          "r_x2": 1696.0985042090742,
-          "r_y2": 349.4151792972422,
-          "r_x3": 234.08627147881114,
-          "r_y3": 349.4151792972422,
-          "coord_origin": "TOPLEFT"
+        {
+          "index": 1,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 234.08627147881114,
+            "r_y0": 419.5788697734327,
+            "r_x1": 1696.0985042090742,
+            "r_y1": 419.5788697734327,
+            "r_x2": 1696.0985042090742,
+            "r_y2": 349.4151792972422,
+            "r_x3": 234.08627147881114,
+            "r_y3": 349.4151792972422,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "JSON and Markdown in an easy self contained",
+          "orig": "JSON and Markdown in an easy self contained",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
        },
-        "text": "JSON and Markdown in an easy self contained",
-        "orig": "JSON and Markdown in an easy self contained",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
-      },
-      {
-        "index": 2,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
-        },
-        "rect": {
-          "r_x0": 242.29979922858777,
-          "r_y0": 509.8779072023336,
-          "r_x1": 513.3470125989277,
-          "r_y1": 509.8779072023336,
-          "r_x2": 513.3470125989277,
-          "r_y2": 439.9752910477536,
-          "r_x3": 242.29979922858777,
-          "r_y3": 439.9752910477536,
-          "coord_origin": "TOPLEFT"
-        },
-        "text": "package",
-        "orig": "package",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
-      }
-    ],
-    "parsed_page": null,
+        {
+          "index": 2,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 242.29979922858777,
+            "r_y0": 509.8779072023336,
+            "r_x1": 513.3470125989277,
+            "r_y1": 509.8779072023336,
+            "r_x2": 513.3470125989277,
+            "r_y2": 439.9752910477536,
+            "r_x3": 242.29979922858777,
+            "r_y3": 439.9752910477536,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "package",
+          "orig": "package",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
+        }
+      ],
+      "has_chars": false,
+      "has_words": false,
+      "has_lines": true,
+      "image": null,
+      "lines": []
+    },
    "predictions": {
      "layout": {
        "clusters": [
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test.pages.json
@ -5,84 +5,143 @@
      "width": 595.201171875,
      "height": 841.9216918945312
    },
-    "cells": [
-      {
-        "index": 0,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
-        },
+    "parsed_page": {
+      "dimension": {
+        "angle": 0.0,
        "rect": {
-          "r_x0": 73.34702132031646,
-          "r_y0": 97.99999977896755,
-          "r_x1": 503.64955224479564,
-          "r_y1": 97.99999977896755,
-          "r_x2": 503.64955224479564,
-          "r_y2": 76.99999977896756,
-          "r_x3": 73.34702132031646,
-          "r_y3": 76.99999977896756,
-          "coord_origin": "TOPLEFT"
+          "r_x0": 0.0,
+          "r_y0": 0.0,
+          "r_x1": 595.201171875,
+          "r_y1": 0.0,
+          "r_x2": 595.201171875,
+          "r_y2": 841.9216918945312,
+          "r_x3": 0.0,
+          "r_y3": 841.9216918945312,
+          "coord_origin": "BOTTOMLEFT"
        },
-        "text": "Docling bundles PDF document conversion to",
-        "orig": "Docling bundles PDF document conversion to",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
+        "boundary_type": "crop_box",
+        "art_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "bleed_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "crop_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "media_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "trim_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        }
      },
-      {
-        "index": 1,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
+      "bitmap_resources": [],
+      "char_cells": [],
+      "word_cells": [],
+      "textline_cells": [
+        {
+          "index": 0,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 73.34702132031646,
+            "r_y0": 97.99999977896755,
+            "r_x1": 503.64955224479564,
+            "r_y1": 97.99999977896755,
+            "r_x2": 503.64955224479564,
+            "r_y2": 76.99999977896756,
+            "r_x3": 73.34702132031646,
+            "r_y3": 76.99999977896756,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "Docling bundles PDF document conversion to",
+          "orig": "Docling bundles PDF document conversion to",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
        },
-        "rect": {
-          "r_x0": 69.6796630536824,
-          "r_y0": 124.83139494707741,
-          "r_x1": 504.8720051760782,
-          "r_y1": 124.83139494707741,
-          "r_x2": 504.8720051760782,
-          "r_y2": 104.00000011573796,
-          "r_x3": 69.6796630536824,
-          "r_y3": 104.00000011573796,
-          "coord_origin": "TOPLEFT"
+        {
+          "index": 1,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 69.6796630536824,
+            "r_y0": 124.83139494707741,
+            "r_x1": 504.8720051760782,
+            "r_y1": 124.83139494707741,
+            "r_x2": 504.8720051760782,
+            "r_y2": 104.00000011573796,
+            "r_x3": 69.6796630536824,
+            "r_y3": 104.00000011573796,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "JSON and Markdown in an easy self contained",
+          "orig": "JSON and Markdown in an easy self contained",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
        },
-        "text": "JSON and Markdown in an easy self contained",
-        "orig": "JSON and Markdown in an easy self contained",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
-      },
-      {
-        "index": 2,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
-        },
-        "rect": {
-          "r_x0": 71.84193505100733,
-          "r_y0": 152.90926970226084,
-          "r_x1": 153.088934155825,
-          "r_y1": 152.90926970226084,
-          "r_x2": 153.088934155825,
-          "r_y2": 129.797125232046,
-          "r_x3": 71.84193505100733,
-          "r_y3": 129.797125232046,
-          "coord_origin": "TOPLEFT"
-        },
-        "text": "package",
-        "orig": "package",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
-      }
-    ],
-    "parsed_page": null,
+        {
+          "index": 2,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 71.84193505100733,
+            "r_y0": 152.90926970226084,
+            "r_x1": 153.088934155825,
+            "r_y1": 152.90926970226084,
+            "r_x2": 153.088934155825,
+            "r_y2": 129.797125232046,
+            "r_x3": 71.84193505100733,
+            "r_y3": 129.797125232046,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "package",
+          "orig": "package",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
+        }
+      ],
+      "has_chars": false,
+      "has_words": false,
+      "has_lines": true,
+      "image": null,
+      "lines": []
+    },
    "predictions": {
      "layout": {
        "clusters": [
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_180.pages.json
@ -5,84 +5,143 @@
      "width": 595.201171875,
      "height": 841.9216918945312
    },
-    "cells": [
-      {
-        "index": 0,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
-        },
+    "parsed_page": {
+      "dimension": {
+        "angle": 0.0,
        "rect": {
-          "r_x0": 89.2388782764286,
-          "r_y0": 764.898293373551,
-          "r_x1": 521.9863147998661,
-          "r_y1": 764.898293373551,
-          "r_x2": 521.9863147998661,
-          "r_y2": 744.0929853494625,
-          "r_x3": 89.2388782764286,
-          "r_y3": 744.0929853494625,
-          "coord_origin": "TOPLEFT"
+          "r_x0": 0.0,
+          "r_y0": 0.0,
+          "r_x1": 595.201171875,
+          "r_y1": 0.0,
+          "r_x2": 595.201171875,
+          "r_y2": 841.9216918945312,
+          "r_x3": 0.0,
+          "r_y3": 841.9216918945312,
+          "coord_origin": "BOTTOMLEFT"
        },
-        "text": "Docling bundles PDF document conversion to",
-        "orig": "Docling bundles PDF document conversion to",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
+        "boundary_type": "crop_box",
+        "art_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "bleed_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "crop_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "media_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "trim_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        }
      },
-      {
-        "index": 1,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
+      "bitmap_resources": [],
+      "char_cells": [],
+      "word_cells": [],
+      "textline_cells": [
+        {
+          "index": 0,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 89.2388782764286,
+            "r_y0": 764.898293373551,
+            "r_x1": 521.9863147998661,
+            "r_y1": 764.898293373551,
+            "r_x2": 521.9863147998661,
+            "r_y2": 744.0929853494625,
+            "r_x3": 89.2388782764286,
+            "r_y3": 744.0929853494625,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "Docling bundles PDF document conversion to",
+          "orig": "Docling bundles PDF document conversion to",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
        },
-        "rect": {
-          "r_x0": 89.23887497045128,
-          "r_y0": 739.1977118987292,
-          "r_x1": 523.208764293368,
-          "r_y1": 739.1977118987292,
-          "r_x2": 523.208764293368,
-          "r_y2": 717.1685676116198,
-          "r_x3": 89.23887497045128,
-          "r_y3": 717.1685676116198,
-          "coord_origin": "TOPLEFT"
+        {
+          "index": 1,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 89.23887497045128,
+            "r_y0": 739.1977118987292,
+            "r_x1": 523.208764293368,
+            "r_y1": 739.1977118987292,
+            "r_x2": 523.208764293368,
+            "r_y2": 717.1685676116198,
+            "r_x3": 89.23887497045128,
+            "r_y3": 717.1685676116198,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "JSON and Markdown in an easy self contained",
+          "orig": "JSON and Markdown in an easy self contained",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
        },
-        "text": "JSON and Markdown in an easy self contained",
-        "orig": "JSON and Markdown in an easy self contained",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
-      },
-      {
-        "index": 2,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
-        },
-        "rect": {
-          "r_x0": 441.2561096985719,
-          "r_y0": 710.0268078458798,
-          "r_x1": 522.0347860494834,
-          "r_y1": 710.0268078458798,
-          "r_x2": 522.0347860494834,
-          "r_y2": 690.0429592741025,
-          "r_x3": 441.2561096985719,
-          "r_y3": 690.0429592741025,
-          "coord_origin": "TOPLEFT"
-        },
-        "text": "package",
-        "orig": "package",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
-      }
-    ],
-    "parsed_page": null,
+        {
+          "index": 2,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 441.2561096985719,
+            "r_y0": 710.0268078458798,
+            "r_x1": 522.0347860494834,
+            "r_y1": 710.0268078458798,
+            "r_x2": 522.0347860494834,
+            "r_y2": 690.0429592741025,
+            "r_x3": 441.2561096985719,
+            "r_y3": 690.0429592741025,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "package",
+          "orig": "package",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
+        }
+      ],
+      "has_chars": false,
+      "has_words": false,
+      "has_lines": true,
+      "image": null,
+      "lines": []
+    },
    "predictions": {
      "layout": {
        "clusters": [
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_270.pages.json
@ -5,84 +5,143 @@
      "width": 841.9216918945312,
      "height": 595.201171875
    },
-    "cells": [
-      {
-        "index": 0,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
-        },
+    "parsed_page": {
+      "dimension": {
+        "angle": 0.0,
        "rect": {
-          "r_x0": 744.0930045534915,
-          "r_y0": 504.87200373583954,
-          "r_x1": 764.8982839673505,
-          "r_y1": 504.87200373583954,
-          "r_x2": 764.8982839673505,
-          "r_y2": 73.34702001188118,
-          "r_x3": 744.0930045534915,
-          "r_y3": 73.34702001188118,
-          "coord_origin": "TOPLEFT"
+          "r_x0": 0.0,
+          "r_y0": 0.0,
+          "r_x1": 595.201171875,
+          "r_y1": 0.0,
+          "r_x2": 595.201171875,
+          "r_y2": 841.9216918945312,
+          "r_x3": 0.0,
+          "r_y3": 841.9216918945312,
+          "coord_origin": "BOTTOMLEFT"
        },
-        "text": "Docling bundles PDF document conversion to",
-        "orig": "Docling bundles PDF document conversion to",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
+        "boundary_type": "crop_box",
+        "art_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "bleed_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "crop_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "media_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "trim_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        }
      },
-      {
-        "index": 1,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
+      "bitmap_resources": [],
+      "char_cells": [],
+      "word_cells": [],
+      "textline_cells": [
+        {
+          "index": 0,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 744.0930045534915,
+            "r_y0": 504.87200373583954,
+            "r_x1": 764.8982839673505,
+            "r_y1": 504.87200373583954,
+            "r_x2": 764.8982839673505,
+            "r_y2": 73.34702001188118,
+            "r_x3": 744.0930045534915,
+            "r_y3": 73.34702001188118,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "Docling bundles PDF document conversion to",
+          "orig": "Docling bundles PDF document conversion to",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
        },
-        "rect": {
-          "r_x0": 717.168585936602,
-          "r_y0": 504.8720061466397,
-          "r_x1": 737.9738558137178,
-          "r_y1": 504.8720061466397,
-          "r_x2": 737.9738558137178,
-          "r_y2": 70.90211682372312,
-          "r_x3": 717.168585936602,
-          "r_y3": 70.90211682372312,
-          "coord_origin": "TOPLEFT"
+        {
+          "index": 1,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 717.168585936602,
+            "r_y0": 504.8720061466397,
+            "r_x1": 737.9738558137178,
+            "r_y1": 504.8720061466397,
+            "r_x2": 737.9738558137178,
+            "r_y2": 70.90211682372312,
+            "r_x3": 717.168585936602,
+            "r_y3": 70.90211682372312,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "JSON and Markdown in an easy self contained",
+          "orig": "JSON and Markdown in an easy self contained",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
        },
-        "text": "JSON and Markdown in an easy self contained",
-        "orig": "JSON and Markdown in an easy self contained",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
-      },
-      {
-        "index": 2,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
-        },
-        "rect": {
-          "r_x0": 690.2441821046808,
-          "r_y0": 152.80629773131633,
-          "r_x1": 709.8255852011977,
-          "r_y1": 152.80629773131633,
-          "r_x2": 709.8255852011977,
-          "r_y2": 72.124570639845,
-          "r_x3": 690.2441821046808,
-          "r_y3": 72.124570639845,
-          "coord_origin": "TOPLEFT"
-        },
-        "text": "package",
-        "orig": "package",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
-      }
-    ],
-    "parsed_page": null,
+        {
+          "index": 2,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 690.2441821046808,
+            "r_y0": 152.80629773131633,
+            "r_x1": 709.8255852011977,
+            "r_y1": 152.80629773131633,
+            "r_x2": 709.8255852011977,
+            "r_y2": 72.124570639845,
+            "r_x3": 690.2441821046808,
+            "r_y3": 72.124570639845,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "package",
+          "orig": "package",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
+        }
+      ],
+      "has_chars": false,
+      "has_words": false,
+      "has_lines": true,
+      "image": null,
+      "lines": []
+    },
    "predictions": {
      "layout": {
        "clusters": [
--- a/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v1/ocr_test_rotated_90.pages.json
@ -5,84 +5,143 @@
      "width": 841.9216918945312,
      "height": 595.201171875
    },
-    "cells": [
-      {
-        "index": 0,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
-        },
+    "parsed_page": {
+      "dimension": {
+        "angle": 0.0,
        "rect": {
-          "r_x0": 77.10171545548258,
-          "r_y0": 520.7638571913312,
-          "r_x1": 96.68315797053792,
-          "r_y1": 520.7638571913312,
-          "r_x2": 96.68315797053792,
-          "r_y2": 89.2388734673729,
-          "r_x3": 77.10171545548258,
-          "r_y3": 89.2388734673729,
-          "coord_origin": "TOPLEFT"
+          "r_x0": 0.0,
+          "r_y0": 0.0,
+          "r_x1": 595.201171875,
+          "r_y1": 0.0,
+          "r_x2": 595.201171875,
+          "r_y2": 841.9216918945312,
+          "r_x3": 0.0,
+          "r_y3": 841.9216918945312,
+          "coord_origin": "BOTTOMLEFT"
        },
-        "text": "Docling bundles PDF document conversion to",
-        "orig": "Docling bundles PDF document conversion to",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
+        "boundary_type": "crop_box",
+        "art_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "bleed_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "crop_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "media_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "trim_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        }
      },
-      {
-        "index": 1,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
+      "bitmap_resources": [],
+      "char_cells": [],
+      "word_cells": [],
+      "textline_cells": [
+        {
+          "index": 0,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 77.10171545548258,
+            "r_y0": 520.7638571913312,
+            "r_x1": 96.68315797053792,
+            "r_y1": 520.7638571913312,
+            "r_x2": 96.68315797053792,
+            "r_y2": 89.2388734673729,
+            "r_x3": 77.10171545548258,
+            "r_y3": 89.2388734673729,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "Docling bundles PDF document conversion to",
+          "orig": "Docling bundles PDF document conversion to",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
        },
-        "rect": {
-          "r_x0": 100.64168123325977,
-          "r_y0": 523.3236155182395,
-          "r_x1": 126.08064862014129,
-          "r_y1": 523.3236155182395,
-          "r_x2": 126.08064862014129,
-          "r_y2": 89.1266754140729,
-          "r_x3": 100.64168123325977,
-          "r_y3": 89.1266754140729,
-          "coord_origin": "TOPLEFT"
+        {
+          "index": 1,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 100.64168123325977,
+            "r_y0": 523.3236155182395,
+            "r_x1": 126.08064862014129,
+            "r_y1": 523.3236155182395,
+            "r_x2": 126.08064862014129,
+            "r_y2": 89.1266754140729,
+            "r_x3": 100.64168123325977,
+            "r_y3": 89.1266754140729,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "JSON and Markdown in an easy self contained",
+          "orig": "JSON and Markdown in an easy self contained",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
        },
-        "text": "JSON and Markdown in an easy self contained",
-        "orig": "JSON and Markdown in an easy self contained",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
-      },
-      {
-        "index": 2,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
-        },
-        "rect": {
-          "r_x0": 131.21306574279092,
-          "r_y0": 521.0762158417759,
-          "r_x1": 152.19606490864376,
-          "r_y1": 521.0762158417759,
-          "r_x2": 152.19606490864376,
-          "r_y2": 441.0071698212682,
-          "r_x3": 131.21306574279092,
-          "r_y3": 441.0071698212682,
-          "coord_origin": "TOPLEFT"
-        },
-        "text": "package",
-        "orig": "package",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
-      }
-    ],
-    "parsed_page": null,
+        {
+          "index": 2,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 131.21306574279092,
+            "r_y0": 521.0762158417759,
+            "r_x1": 152.19606490864376,
+            "r_y1": 521.0762158417759,
+            "r_x2": 152.19606490864376,
+            "r_y2": 441.0071698212682,
+            "r_x3": 131.21306574279092,
+            "r_y3": 441.0071698212682,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "package",
+          "orig": "package",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
+        }
+      ],
+      "has_chars": false,
+      "has_words": false,
+      "has_lines": true,
+      "image": null,
+      "lines": []
+    },
    "predictions": {
      "layout": {
        "clusters": [
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test.pages.json
@ -5,84 +5,143 @@
      "width": 595.201171875,
      "height": 841.9216918945312
    },
-    "cells": [
-      {
-        "index": 0,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
-        },
+    "parsed_page": {
+      "dimension": {
+        "angle": 0.0,
        "rect": {
-          "r_x0": 73.34702132031646,
-          "r_y0": 97.99999977896755,
-          "r_x1": 503.64955224479564,
-          "r_y1": 97.99999977896755,
-          "r_x2": 503.64955224479564,
-          "r_y2": 76.99999977896756,
-          "r_x3": 73.34702132031646,
-          "r_y3": 76.99999977896756,
-          "coord_origin": "TOPLEFT"
+          "r_x0": 0.0,
+          "r_y0": 0.0,
+          "r_x1": 595.201171875,
+          "r_y1": 0.0,
+          "r_x2": 595.201171875,
+          "r_y2": 841.9216918945312,
+          "r_x3": 0.0,
+          "r_y3": 841.9216918945312,
+          "coord_origin": "BOTTOMLEFT"
        },
-        "text": "Docling bundles PDF document conversion to",
-        "orig": "Docling bundles PDF document conversion to",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
+        "boundary_type": "crop_box",
+        "art_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "bleed_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "crop_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "media_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "trim_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        }
      },
-      {
-        "index": 1,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
+      "bitmap_resources": [],
+      "char_cells": [],
+      "word_cells": [],
+      "textline_cells": [
+        {
+          "index": 0,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 73.34702132031646,
+            "r_y0": 97.99999977896755,
+            "r_x1": 503.64955224479564,
+            "r_y1": 97.99999977896755,
+            "r_x2": 503.64955224479564,
+            "r_y2": 76.99999977896756,
+            "r_x3": 73.34702132031646,
+            "r_y3": 76.99999977896756,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "Docling bundles PDF document conversion to",
+          "orig": "Docling bundles PDF document conversion to",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
        },
-        "rect": {
-          "r_x0": 69.6796630536824,
-          "r_y0": 124.83139494707741,
-          "r_x1": 504.8720051760782,
-          "r_y1": 124.83139494707741,
-          "r_x2": 504.8720051760782,
-          "r_y2": 104.00000011573796,
-          "r_x3": 69.6796630536824,
-          "r_y3": 104.00000011573796,
-          "coord_origin": "TOPLEFT"
+        {
+          "index": 1,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 69.6796630536824,
+            "r_y0": 124.83139494707741,
+            "r_x1": 504.8720051760782,
+            "r_y1": 124.83139494707741,
+            "r_x2": 504.8720051760782,
+            "r_y2": 104.00000011573796,
+            "r_x3": 69.6796630536824,
+            "r_y3": 104.00000011573796,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "JSON and Markdown in an easy self contained",
+          "orig": "JSON and Markdown in an easy self contained",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
        },
-        "text": "JSON and Markdown in an easy self contained",
-        "orig": "JSON and Markdown in an easy self contained",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
-      },
-      {
-        "index": 2,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
-        },
-        "rect": {
-          "r_x0": 71.84193505100733,
-          "r_y0": 152.90926970226084,
-          "r_x1": 153.088934155825,
-          "r_y1": 152.90926970226084,
-          "r_x2": 153.088934155825,
-          "r_y2": 129.797125232046,
-          "r_x3": 71.84193505100733,
-          "r_y3": 129.797125232046,
-          "coord_origin": "TOPLEFT"
-        },
-        "text": "package",
-        "orig": "package",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
-      }
-    ],
-    "parsed_page": null,
+        {
+          "index": 2,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 71.84193505100733,
+            "r_y0": 152.90926970226084,
+            "r_x1": 153.088934155825,
+            "r_y1": 152.90926970226084,
+            "r_x2": 153.088934155825,
+            "r_y2": 129.797125232046,
+            "r_x3": 71.84193505100733,
+            "r_y3": 129.797125232046,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "package",
+          "orig": "package",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
+        }
+      ],
+      "has_chars": false,
+      "has_words": false,
+      "has_lines": true,
+      "image": null,
+      "lines": []
+    },
    "predictions": {
      "layout": {
        "clusters": [
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.pages.json
@ -5,84 +5,143 @@
      "width": 595.201171875,
      "height": 841.9216918945312
    },
-    "cells": [
-      {
-        "index": 0,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
-        },
+    "parsed_page": {
+      "dimension": {
+        "angle": 0.0,
        "rect": {
-          "r_x0": 89.2388782764286,
-          "r_y0": 764.898293373551,
-          "r_x1": 521.9863147998661,
-          "r_y1": 764.898293373551,
-          "r_x2": 521.9863147998661,
-          "r_y2": 744.0929853494625,
-          "r_x3": 89.2388782764286,
-          "r_y3": 744.0929853494625,
-          "coord_origin": "TOPLEFT"
+          "r_x0": 0.0,
+          "r_y0": 0.0,
+          "r_x1": 595.201171875,
+          "r_y1": 0.0,
+          "r_x2": 595.201171875,
+          "r_y2": 841.9216918945312,
+          "r_x3": 0.0,
+          "r_y3": 841.9216918945312,
+          "coord_origin": "BOTTOMLEFT"
        },
-        "text": "Docling bundles PDF document conversion to",
-        "orig": "Docling bundles PDF document conversion to",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
+        "boundary_type": "crop_box",
+        "art_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "bleed_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "crop_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "media_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "trim_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        }
      },
-      {
-        "index": 1,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
+      "bitmap_resources": [],
+      "char_cells": [],
+      "word_cells": [],
+      "textline_cells": [
+        {
+          "index": 0,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 89.2388782764286,
+            "r_y0": 764.898293373551,
+            "r_x1": 521.9863147998661,
+            "r_y1": 764.898293373551,
+            "r_x2": 521.9863147998661,
+            "r_y2": 744.0929853494625,
+            "r_x3": 89.2388782764286,
+            "r_y3": 744.0929853494625,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "Docling bundles PDF document conversion to",
+          "orig": "Docling bundles PDF document conversion to",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
        },
-        "rect": {
-          "r_x0": 89.23887497045128,
-          "r_y0": 739.1977118987292,
-          "r_x1": 523.208764293368,
-          "r_y1": 739.1977118987292,
-          "r_x2": 523.208764293368,
-          "r_y2": 717.1685676116198,
-          "r_x3": 89.23887497045128,
-          "r_y3": 717.1685676116198,
-          "coord_origin": "TOPLEFT"
+        {
+          "index": 1,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 89.23887497045128,
+            "r_y0": 739.1977118987292,
+            "r_x1": 523.208764293368,
+            "r_y1": 739.1977118987292,
+            "r_x2": 523.208764293368,
+            "r_y2": 717.1685676116198,
+            "r_x3": 89.23887497045128,
+            "r_y3": 717.1685676116198,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "JSON and Markdown in an easy self contained",
+          "orig": "JSON and Markdown in an easy self contained",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
        },
-        "text": "JSON and Markdown in an easy self contained",
-        "orig": "JSON and Markdown in an easy self contained",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
-      },
-      {
-        "index": 2,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
-        },
-        "rect": {
-          "r_x0": 441.2561096985719,
-          "r_y0": 710.0268078458798,
-          "r_x1": 522.0347860494834,
-          "r_y1": 710.0268078458798,
-          "r_x2": 522.0347860494834,
-          "r_y2": 690.0429592741025,
-          "r_x3": 441.2561096985719,
-          "r_y3": 690.0429592741025,
-          "coord_origin": "TOPLEFT"
-        },
-        "text": "package",
-        "orig": "package",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
-      }
-    ],
-    "parsed_page": null,
+        {
+          "index": 2,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 441.2561096985719,
+            "r_y0": 710.0268078458798,
+            "r_x1": 522.0347860494834,
+            "r_y1": 710.0268078458798,
+            "r_x2": 522.0347860494834,
+            "r_y2": 690.0429592741025,
+            "r_x3": 441.2561096985719,
+            "r_y3": 690.0429592741025,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "package",
+          "orig": "package",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
+        }
+      ],
+      "has_chars": false,
+      "has_words": false,
+      "has_lines": true,
+      "image": null,
+      "lines": []
+    },
    "predictions": {
      "layout": {
        "clusters": [
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.pages.json
@ -5,84 +5,143 @@
      "width": 841.9216918945312,
      "height": 595.201171875
    },
-    "cells": [
-      {
-        "index": 0,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
-        },
+    "parsed_page": {
+      "dimension": {
+        "angle": 0.0,
        "rect": {
-          "r_x0": 744.0930045534915,
-          "r_y0": 504.87200373583954,
-          "r_x1": 764.8982839673505,
-          "r_y1": 504.87200373583954,
-          "r_x2": 764.8982839673505,
-          "r_y2": 73.34702001188118,
-          "r_x3": 744.0930045534915,
-          "r_y3": 73.34702001188118,
-          "coord_origin": "TOPLEFT"
+          "r_x0": 0.0,
+          "r_y0": 0.0,
+          "r_x1": 595.201171875,
+          "r_y1": 0.0,
+          "r_x2": 595.201171875,
+          "r_y2": 841.9216918945312,
+          "r_x3": 0.0,
+          "r_y3": 841.9216918945312,
+          "coord_origin": "BOTTOMLEFT"
        },
-        "text": "Docling bundles PDF document conversion to",
-        "orig": "Docling bundles PDF document conversion to",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
+        "boundary_type": "crop_box",
+        "art_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "bleed_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "crop_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "media_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "trim_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        }
      },
-      {
-        "index": 1,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
+      "bitmap_resources": [],
+      "char_cells": [],
+      "word_cells": [],
+      "textline_cells": [
+        {
+          "index": 0,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 744.0930045534915,
+            "r_y0": 504.87200373583954,
+            "r_x1": 764.8982839673505,
+            "r_y1": 504.87200373583954,
+            "r_x2": 764.8982839673505,
+            "r_y2": 73.34702001188118,
+            "r_x3": 744.0930045534915,
+            "r_y3": 73.34702001188118,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "Docling bundles PDF document conversion to",
+          "orig": "Docling bundles PDF document conversion to",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
        },
-        "rect": {
-          "r_x0": 717.168585936602,
-          "r_y0": 504.8720061466397,
-          "r_x1": 737.9738558137178,
-          "r_y1": 504.8720061466397,
-          "r_x2": 737.9738558137178,
-          "r_y2": 70.90211682372312,
-          "r_x3": 717.168585936602,
-          "r_y3": 70.90211682372312,
-          "coord_origin": "TOPLEFT"
+        {
+          "index": 1,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 717.168585936602,
+            "r_y0": 504.8720061466397,
+            "r_x1": 737.9738558137178,
+            "r_y1": 504.8720061466397,
+            "r_x2": 737.9738558137178,
+            "r_y2": 70.90211682372312,
+            "r_x3": 717.168585936602,
+            "r_y3": 70.90211682372312,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "JSON and Markdown in an easy self contained",
+          "orig": "JSON and Markdown in an easy self contained",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
        },
-        "text": "JSON and Markdown in an easy self contained",
-        "orig": "JSON and Markdown in an easy self contained",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
-      },
-      {
-        "index": 2,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
-        },
-        "rect": {
-          "r_x0": 690.2441821046808,
-          "r_y0": 152.80629773131633,
-          "r_x1": 709.8255852011977,
-          "r_y1": 152.80629773131633,
-          "r_x2": 709.8255852011977,
-          "r_y2": 72.124570639845,
-          "r_x3": 690.2441821046808,
-          "r_y3": 72.124570639845,
-          "coord_origin": "TOPLEFT"
-        },
-        "text": "package",
-        "orig": "package",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
-      }
-    ],
-    "parsed_page": null,
+        {
+          "index": 2,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 690.2441821046808,
+            "r_y0": 152.80629773131633,
+            "r_x1": 709.8255852011977,
+            "r_y1": 152.80629773131633,
+            "r_x2": 709.8255852011977,
+            "r_y2": 72.124570639845,
+            "r_x3": 690.2441821046808,
+            "r_y3": 72.124570639845,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "package",
+          "orig": "package",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
+        }
+      ],
+      "has_chars": false,
+      "has_words": false,
+      "has_lines": true,
+      "image": null,
+      "lines": []
+    },
    "predictions": {
      "layout": {
        "clusters": [
--- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.pages.json
+++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.pages.json
@ -5,84 +5,143 @@
      "width": 841.9216918945312,
      "height": 595.201171875
    },
-    "cells": [
-      {
-        "index": 0,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
-        },
+    "parsed_page": {
+      "dimension": {
+        "angle": 0.0,
        "rect": {
-          "r_x0": 77.10171545548258,
-          "r_y0": 520.7638571913312,
-          "r_x1": 96.68315797053792,
-          "r_y1": 520.7638571913312,
-          "r_x2": 96.68315797053792,
-          "r_y2": 89.2388734673729,
-          "r_x3": 77.10171545548258,
-          "r_y3": 89.2388734673729,
-          "coord_origin": "TOPLEFT"
+          "r_x0": 0.0,
+          "r_y0": 0.0,
+          "r_x1": 595.201171875,
+          "r_y1": 0.0,
+          "r_x2": 595.201171875,
+          "r_y2": 841.9216918945312,
+          "r_x3": 0.0,
+          "r_y3": 841.9216918945312,
+          "coord_origin": "BOTTOMLEFT"
        },
-        "text": "Docling bundles PDF document conversion to",
-        "orig": "Docling bundles PDF document conversion to",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
+        "boundary_type": "crop_box",
+        "art_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "bleed_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "crop_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "media_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        },
+        "trim_bbox": {
+          "l": 0.0,
+          "t": 841.9216918945312,
+          "r": 595.201171875,
+          "b": 0.0,
+          "coord_origin": "BOTTOMLEFT"
+        }
      },
-      {
-        "index": 1,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
+      "bitmap_resources": [],
+      "char_cells": [],
+      "word_cells": [],
+      "textline_cells": [
+        {
+          "index": 0,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 77.10171545548258,
+            "r_y0": 520.7638571913312,
+            "r_x1": 96.68315797053792,
+            "r_y1": 520.7638571913312,
+            "r_x2": 96.68315797053792,
+            "r_y2": 89.2388734673729,
+            "r_x3": 77.10171545548258,
+            "r_y3": 89.2388734673729,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "Docling bundles PDF document conversion to",
+          "orig": "Docling bundles PDF document conversion to",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
        },
-        "rect": {
-          "r_x0": 100.64168123325977,
-          "r_y0": 523.3236155182395,
-          "r_x1": 126.08064862014129,
-          "r_y1": 523.3236155182395,
-          "r_x2": 126.08064862014129,
-          "r_y2": 89.1266754140729,
-          "r_x3": 100.64168123325977,
-          "r_y3": 89.1266754140729,
-          "coord_origin": "TOPLEFT"
+        {
+          "index": 1,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 100.64168123325977,
+            "r_y0": 523.3236155182395,
+            "r_x1": 126.08064862014129,
+            "r_y1": 523.3236155182395,
+            "r_x2": 126.08064862014129,
+            "r_y2": 89.1266754140729,
+            "r_x3": 100.64168123325977,
+            "r_y3": 89.1266754140729,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "JSON and Markdown in an easy self contained",
+          "orig": "JSON and Markdown in an easy self contained",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
        },
-        "text": "JSON and Markdown in an easy self contained",
-        "orig": "JSON and Markdown in an easy self contained",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
-      },
-      {
-        "index": 2,
-        "rgba": {
-          "r": 0,
-          "g": 0,
-          "b": 0,
-          "a": 255
-        },
-        "rect": {
-          "r_x0": 131.21306574279092,
-          "r_y0": 521.0762158417759,
-          "r_x1": 152.19606490864376,
-          "r_y1": 521.0762158417759,
-          "r_x2": 152.19606490864376,
-          "r_y2": 441.0071698212682,
-          "r_x3": 131.21306574279092,
-          "r_y3": 441.0071698212682,
-          "coord_origin": "TOPLEFT"
-        },
-        "text": "package",
-        "orig": "package",
-        "text_direction": "left_to_right",
-        "confidence": 1.0,
-        "from_ocr": true
-      }
-    ],
-    "parsed_page": null,
+        {
+          "index": 2,
+          "rgba": {
+            "r": 0,
+            "g": 0,
+            "b": 0,
+            "a": 255
+          },
+          "rect": {
+            "r_x0": 131.21306574279092,
+            "r_y0": 521.0762158417759,
+            "r_x1": 152.19606490864376,
+            "r_y1": 521.0762158417759,
+            "r_x2": 152.19606490864376,
+            "r_y2": 441.0071698212682,
+            "r_x3": 131.21306574279092,
+            "r_y3": 441.0071698212682,
+            "coord_origin": "TOPLEFT"
+          },
+          "text": "package",
+          "orig": "package",
+          "text_direction": "left_to_right",
+          "confidence": 1.0,
+          "from_ocr": true
+        }
+      ],
+      "has_chars": false,
+      "has_words": false,
+      "has_lines": true,
+      "image": null,
+      "lines": []
+    },
    "predictions": {
      "layout": {
        "clusters": [
--- a/tests/test_backend_asciidoc.py
+++ b/tests/test_backend_asciidoc.py
@ -2,7 +2,11 @@ import glob
 import os
 from pathlib import Path

-from docling.backend.asciidoc_backend import AsciiDocBackend
+from docling.backend.asciidoc_backend import (
+    DEFAULT_IMAGE_HEIGHT,
+    DEFAULT_IMAGE_WIDTH,
+    AsciiDocBackend,
+)
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument

@ -18,6 +22,24 @@ def _get_backend(fname):
    return doc_backend


+def test_parse_picture():
+    line = (
+        "image::images/example1.png[Example Image, width=200, height=150, align=center]"
+    )
+    res = AsciiDocBackend._parse_picture(line)
+    assert res
+    assert res.get("width", 0) == "200"
+    assert res.get("height", 0) == "150"
+    assert res.get("uri", "") == "images/example1.png"
+
+    line = "image::renamed-bookmark.png[Renamed bookmark]"
+    res = AsciiDocBackend._parse_picture(line)
+    assert res
+    assert "width" not in res
+    assert "height" not in res
+    assert res.get("uri", "") == "renamed-bookmark.png"
+
+
 def test_asciidocs_examples():
    fnames = sorted(glob.glob("./tests/data/asciidoc/*.asciidoc"))

--- a/tests/test_backend_markdown.py
+++ b/tests/test_backend_markdown.py
@ -2,7 +2,7 @@ from pathlib import Path

 from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import InputDocument
+from docling.datamodel.document import DoclingDocument, InputDocument

 from .test_data_gen_flag import GEN_TEST_DATA

@ -11,12 +11,15 @@ def test_convert_valid():
    fmt = InputFormat.MD
    cls = MarkdownDocumentBackend

-    test_data_path = Path("tests") / "data"
-    relevant_paths = sorted((test_data_path / "md").rglob("*.md"))
+    root_path = Path("tests") / "data"
+    relevant_paths = sorted((root_path / "md").rglob("*.md"))
    assert len(relevant_paths) > 0

+    yaml_filter = ["inline_and_formatting"]
+
    for in_path in relevant_paths:
-        gt_path = test_data_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
+        md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
+        yaml_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.yaml"

        in_doc = InputDocument(
            path_or_stream=in_path,
@ -33,9 +36,17 @@ def test_convert_valid():
        act_data = act_doc.export_to_markdown()

        if GEN_TEST_DATA:
-            with open(gt_path, mode="w", encoding="utf-8") as f:
+            with open(md_gt_path, mode="w", encoding="utf-8") as f:
                f.write(f"{act_data}\n")
+
+            if in_path.stem in yaml_filter:
+                with open(yaml_gt_path, mode="w", encoding="utf-8") as f:
+                    act_doc.save_as_yaml(yaml_gt_path)
        else:
-            with open(gt_path, encoding="utf-8") as f:
+            with open(md_gt_path, encoding="utf-8") as f:
                exp_data = f.read().rstrip()
-            assert exp_data == act_data
+            assert act_data == exp_data
+
+            if in_path.stem in yaml_filter:
+                exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
+                assert act_doc == exp_doc
--- a/tests/test_backend_msword.py
+++ b/tests/test_backend_msword.py
@ -9,6 +9,7 @@ from docling.datamodel.document import (
    DoclingDocument,
    InputDocument,
    SectionHeaderItem,
+    TextItem,
 )
 from docling.document_converter import DocumentConverter

@ -96,18 +97,18 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):

        pred_md: str = doc.export_to_markdown()
        assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
-            "export to md"
+            f"export to markdown failed on {docx_path}"
        )

        pred_itxt: str = doc._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
-            "export to indented-text"
+            f"export to indented-text failed on {docx_path}"
        )

        assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), (
-            "document document"
+            f"DoclingDocument verification failed on {docx_path}"
        )

        if docx_path.name == "word_tables.docx":
@ -116,7 +117,7 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
                pred_text=pred_html,
                gtfile=str(gt_path) + ".html",
                generate=GENERATE,
-            ), "export to html"
+            ), f"export to html failed on {docx_path}"


 flaky_path = Path("tests/data/docx/textbox.docx")
@ -131,3 +132,42 @@ def test_e2e_docx_conversions():
@pytest.mark.xfail(strict=False)
 def test_textbox_conversion():
    _test_e2e_docx_conversions_impl(docx_paths=[flaky_path])
+
+
+def test_text_after_image_anchors():
+    """
+    Test to analyse whether text gets parsed after image anchors.
+    """
+
+    in_path = Path("tests/data/docx/word_image_anchors.docx")
+    in_doc = InputDocument(
+        path_or_stream=in_path,
+        format=InputFormat.DOCX,
+        backend=MsWordDocumentBackend,
+    )
+    backend = MsWordDocumentBackend(
+        in_doc=in_doc,
+        path_or_stream=in_path,
+    )
+    doc = backend.convert()
+
+    found_text_after_anchor_1 = found_text_after_anchor_2 = (
+        found_text_after_anchor_3
+    ) = found_text_after_anchor_4 = False
+    for item, _ in doc.iterate_items():
+        if isinstance(item, TextItem):
+            if item.text == "This is test 1":
+                found_text_after_anchor_1 = True
+            elif item.text == "0:08\nCorrect, he is not.":
+                found_text_after_anchor_2 = True
+            elif item.text == "This is test 2":
+                found_text_after_anchor_3 = True
+            elif item.text == "0:16\nYeah, exactly.":
+                found_text_after_anchor_4 = True
+
+    assert (
+        found_text_after_anchor_1
+        and found_text_after_anchor_2
+        and found_text_after_anchor_3
+        and found_text_after_anchor_4
+    )
--- a/tests/test_code_formula.py
+++ b/tests/test_code_formula.py
@ -60,3 +60,25 @@ def test_code_and_formula_conversion():
    gt = "a ^ { 2 } + 8 = 1 2"
    predicted = formula_blocks[0].text
    assert predicted == gt, f"mismatch in text {predicted=}, {gt=}"
+
+
+def test_formula_conversion_with_page_range():
+    """Convert the code-and-formula PDF with ``page_range=(2, 2)``.
+
+    Expects exactly one formula text item in the result, whose text equals
+    the ground-truth string.
+    """
+    pdf_path = Path("tests/data/pdf/code_and_formula.pdf")
+    converter = get_converter()
+
+    print(f"converting {pdf_path} with page range")
+
+    doc_result: ConversionResult = converter.convert(
+        pdf_path,
+        page_range=(2, 2),
+    )
+
+    # Keep only the text items labelled as formulas.
+    formula_blocks = []
+    for el in doc_result.document.texts:
+        if isinstance(el, TextItem) and el.label == DocItemLabel.FORMULA:
+            formula_blocks.append(el)
+    assert len(formula_blocks) == 1
+
+    # NOTE: the names `predicted` and `gt` are part of the failure message
+    # below (f-string `=` specifier), so they must not be renamed.
+    gt = "a ^ { 2 } + 8 = 1 2"
+    predicted = formula_blocks[0].text
+    assert predicted == gt, f"mismatch in text {predicted=}, {gt=}"
--- a/tests/test_e2e_ocr_conversion.py
+++ b/tests/test_e2e_ocr_conversion.py
@ -57,14 +57,14 @@ def test_e2e_conversions():
    pdf_paths = get_pdf_paths()

    engines: List[Tuple[OcrOptions, bool]] = [
-        (EasyOcrOptions(), False),
        (TesseractOcrOptions(), True),
        (TesseractCliOcrOptions(), True),
-        (EasyOcrOptions(force_full_page_ocr=True), False),
+        (EasyOcrOptions(), False),
        (TesseractOcrOptions(force_full_page_ocr=True), True),
        (TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
        (TesseractCliOcrOptions(force_full_page_ocr=True), True),
        (TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
+        (EasyOcrOptions(force_full_page_ocr=True), False),
    ]

    # rapidocr is only available for Python >=3.6,<3.13
--- a/uv.lock
+++ b/uv.lock
@ -818,7 +818,7 @@ wheels = [

 [[package]]
 name = "docling"
-version = "2.36.1"
+version = "2.37.0"
 source = { editable = "." }
 dependencies = [
    { name = "beautifulsoup4" },
				`@ -0,0 +1 @@`
				`# X-Library The fully customisable and copyright-free standard content template collection exclusively for our customers`