Merge branch 'main' of github.com:DS4SD/docling into dev/add-asr-pipeline-v2

This commit is contained in:
Christoph Auer 2025-06-23 09:08:58 +02:00
commit caf18e634b
98 changed files with 340943 additions and 330462 deletions

.github/dco.yml (new file)

@@ -0,0 +1,2 @@
allowRemediationCommits:
  individual: true

.github/workflows/dco-advisor.yml (new file)

@@ -0,0 +1,192 @@
name: DCO Advisor Bot
on:
pull_request_target:
types: [opened, reopened, synchronize]
permissions:
pull-requests: write
issues: write
jobs:
dco_advisor:
runs-on: ubuntu-latest
steps:
- name: Handle DCO check result
uses: actions/github-script@v7
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
script: |
const pr = context.payload.pull_request || context.payload.check_run?.pull_requests?.[0];
if (!pr) return;
const prNumber = pr.number;
const baseRef = pr.base.ref;
const headSha =
context.payload.check_run?.head_sha ||
pr.head?.sha;
const username = pr.user.login;
console.log("HEAD SHA:", headSha);
const sleep = ms => new Promise(resolve => setTimeout(resolve, ms));
// Poll until DCO check has a conclusion (max 6 attempts, 30s)
let dcoCheck = null;
for (let attempt = 0; attempt < 6; attempt++) {
const { data: checks } = await github.rest.checks.listForRef({
owner: context.repo.owner,
repo: context.repo.repo,
ref: headSha
});
console.log("All check runs:");
checks.check_runs.forEach(run => {
console.log(`- ${run.name} (${run.status}/${run.conclusion}) @ ${run.head_sha}`);
});
dcoCheck = checks.check_runs.find(run =>
run.name.toLowerCase().includes("dco") &&
!run.name.toLowerCase().includes("dco_advisor") &&
run.head_sha === headSha
);
if (dcoCheck?.conclusion) break;
console.log(`Waiting for DCO check... (${attempt + 1})`);
await sleep(5000); // wait 5 seconds
}
if (!dcoCheck || !dcoCheck.conclusion) {
console.log("DCO check did not complete in time.");
return;
}
const isFailure = ["failure", "action_required"].includes(dcoCheck.conclusion);
console.log(`DCO check conclusion for ${headSha}: ${dcoCheck.conclusion} (treated as ${isFailure ? "failure" : "success"})`);
// Parse DCO output for commit SHAs and author
let badCommits = [];
let authorName = "";
let authorEmail = "";
let moreInfo = `More info: [DCO check report](${dcoCheck?.html_url})`;
if (isFailure) {
const { data: commits } = await github.rest.pulls.listCommits({
owner: context.repo.owner,
repo: context.repo.repo,
pull_number: prNumber,
});
for (const commit of commits) {
const commitMessage = commit.commit.message;
const signoffMatch = commitMessage.match(/^Signed-off-by:\s+.+<.+>$/m);
if (!signoffMatch) {
console.log(`Bad commit found ${commit.sha}`)
badCommits.push({
sha: commit.sha,
authorName: commit.commit.author.name,
authorEmail: commit.commit.author.email,
});
}
}
}
// If multiple authors are present, you could adapt the message accordingly
// For now, we'll just use the first one
if (badCommits.length > 0) {
authorName = badCommits[0].authorName;
authorEmail = badCommits[0].authorEmail;
}
// Generate remediation commit message if needed
let remediationSnippet = "";
if (badCommits.length && authorEmail) {
remediationSnippet = `git commit --allow-empty -s -m "DCO Remediation Commit for ${authorName} <${authorEmail}>\n\n` +
badCommits.map(c => `I, ${c.authorName} <${c.authorEmail}>, hereby add my Signed-off-by to this commit: ${c.sha}`).join('\n') +
`"`;
} else {
remediationSnippet = "# Unable to auto-generate remediation message. Please check the DCO check details.";
}
// Build comment
const commentHeader = '<!-- dco-advice-bot -->';
let body = "";
if (isFailure) {
body = [
commentHeader,
'❌ **DCO Check Failed**',
'',
`Hi @${username}, your pull request has failed the Developer Certificate of Origin (DCO) check.`,
'',
'This repository supports **remediation commits**, so you can fix this without rewriting history — but you must follow the required message format.',
'',
'---',
'',
'### 🛠 Quick Fix: Add a remediation commit',
'Run this command:',
'',
'```bash',
remediationSnippet,
'git push',
'```',
'',
'---',
'',
'<details>',
'<summary>🔧 Advanced: Sign off each commit directly</summary>',
'',
'**For the latest commit:**',
'```bash',
'git commit --amend --signoff',
'git push --force-with-lease',
'```',
'',
'**For multiple commits:**',
'```bash',
`git rebase --signoff origin/${baseRef}`,
'git push --force-with-lease',
'```',
'',
'</details>',
'',
moreInfo
].join('\n');
} else {
body = [
commentHeader,
'✅ **DCO Check Passed**',
'',
`Thanks @${username}, all your commits are properly signed off. 🎉`
].join('\n');
}
// Get existing comments on the PR
const { data: comments } = await github.rest.issues.listComments({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber
});
// Look for a previous bot comment
const existingComment = comments.find(c =>
c.body.includes("<!-- dco-advice-bot -->")
);
if (existingComment) {
await github.rest.issues.updateComment({
owner: context.repo.owner,
repo: context.repo.repo,
comment_id: existingComment.id,
body: body
});
} else {
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: prNumber,
body: body
});
}

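For reference, the remediation message the bot asks for follows a fixed format: one `Signed-off-by` attestation per unsigned commit. A minimal Python sketch of that format is below; the author name, email, and SHA are placeholders, not real PR data, and the real workflow builds the same string inside the github-script step above.

```python
# Sketch only: mirrors the remediation message assembled by the workflow above.
# Author name/email and SHAs below are placeholders, not real PR data.
def build_remediation_message(author_name: str, author_email: str, bad_commits: list) -> str:
    lines = [f"DCO Remediation Commit for {author_name} <{author_email}>", ""]
    lines += [
        f"I, {c['name']} <{c['email']}>, hereby add my Signed-off-by to this commit: {c['sha']}"
        for c in bad_commits
    ]
    return "\n".join(lines)


message = build_remediation_message(
    "Jane Doe",
    "jane@example.com",
    [{"name": "Jane Doe", "email": "jane@example.com", "sha": "abc1234"}],
)
print(message)  # then: git commit --allow-empty -s -m "<message>" && git push
```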

@@ -1,3 +1,26 @@
## [v2.37.0](https://github.com/docling-project/docling/releases/tag/v2.37.0) - 2025-06-16
### Feature
* Make Page.parsed_page the only source of truth for text cells, add OCR cells to it ([#1745](https://github.com/docling-project/docling/issues/1745)) ([`7d3302c`](https://github.com/docling-project/docling/commit/7d3302cb48dd91cd29673d7c4eaf7326736d0685))
* Support xlsm files ([#1520](https://github.com/docling-project/docling/issues/1520)) ([`df14022`](https://github.com/docling-project/docling/commit/df140227c3b8bcad0c68bf3d129930cccd96a07e))
### Fix
* Pptx line break and space handling ([#1664](https://github.com/docling-project/docling/issues/1664)) ([`f28d23c`](https://github.com/docling-project/docling/commit/f28d23cf03d059619d1d3482594596ab7c87d197))
* **asciidoc:** Set default size when missing in image directive ([#1769](https://github.com/docling-project/docling/issues/1769)) ([`b886e4d`](https://github.com/docling-project/docling/commit/b886e4df312447d39f58cf6e3c45b0f863940321))
* Handle NoneType error in MsPowerpointDocumentBackend ([#1747](https://github.com/docling-project/docling/issues/1747)) ([`7a275c7`](https://github.com/docling-project/docling/commit/7a275c763731d9c96b7cf32f2e27b8dc8bebacd7))
* Prov for merged-elems ([#1728](https://github.com/docling-project/docling/issues/1728)) ([`6613b9e`](https://github.com/docling-project/docling/commit/6613b9e98bc8b89791dc0334de8970ff243aba82))
* **tesseract:** Initialize df_osd to avoid uninitialized variable error ([#1718](https://github.com/docling-project/docling/issues/1718)) ([`e979750`](https://github.com/docling-project/docling/commit/e979750ce93b2fae89dbb60ff06333f80c1c2908))
* Allow custom torch_dtype in vlm models ([#1735](https://github.com/docling-project/docling/issues/1735)) ([`f7f3113`](https://github.com/docling-project/docling/commit/f7f31137f10999fefdb70da7e5ef56536f650400))
* Improve extraction from textboxes in Word docs ([#1701](https://github.com/docling-project/docling/issues/1701)) ([`9dbcb3d`](https://github.com/docling-project/docling/commit/9dbcb3d7d4f27d1c935c8681c57ed59524452d53))
* Add WEBP to the list of image file extensions ([#1711](https://github.com/docling-project/docling/issues/1711)) ([`a2b83fe`](https://github.com/docling-project/docling/commit/a2b83fe4aea66c273a83bf17177e87d45d3f18d1))
### Documentation
* Update vlm models api examples with LM Studio ([#1759](https://github.com/docling-project/docling/issues/1759)) ([`0432a31`](https://github.com/docling-project/docling/commit/0432a31b2f7c9fe944c3a1d4b608ef938b4f2299))
* Add open webui ([#1734](https://github.com/docling-project/docling/issues/1734)) ([`49b10e7`](https://github.com/docling-project/docling/commit/49b10e74191d4d580c9305ac08d9898a79346d7d))
## [v2.36.1](https://github.com/docling-project/docling/releases/tag/v2.36.1) - 2025-06-04
### Fix


@@ -2,7 +2,7 @@ import logging
 import re
 from io import BytesIO
 from pathlib import Path
-from typing import Set, Union
+from typing import Final, Set, Union

 from docling_core.types.doc import (
     DocItemLabel,
@@ -22,6 +22,9 @@ from docling.datamodel.document import InputDocument
 _log = logging.getLogger(__name__)

+DEFAULT_IMAGE_WIDTH: Final = 128
+DEFAULT_IMAGE_HEIGHT: Final = 128
+

 class AsciiDocBackend(DeclarativeDocumentBackend):
     def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
@@ -200,9 +203,11 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
                 item = self._parse_picture(line)

-                size = None
+                size: Size
                 if "width" in item and "height" in item:
                     size = Size(width=int(item["width"]), height=int(item["height"]))
+                else:
+                    size = Size(width=DEFAULT_IMAGE_WIDTH, height=DEFAULT_IMAGE_HEIGHT)

                 uri = None
                 if (
@@ -264,14 +269,16 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return doc

-    def _get_current_level(self, parents):
+    @staticmethod
+    def _get_current_level(parents):
         for k, v in parents.items():
             if v is None and k > 0:
                 return k - 1

         return 0

-    def _get_current_parent(self, parents):
+    @staticmethod
+    def _get_current_parent(parents):
         for k, v in parents.items():
             if v is None and k > 0:
                 return parents[k - 1]
@@ -279,17 +286,21 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return None

     # ========= Title
-    def _is_title(self, line):
+    @staticmethod
+    def _is_title(line):
         return re.match(r"^= ", line)

-    def _parse_title(self, line):
+    @staticmethod
+    def _parse_title(line):
         return {"type": "title", "text": line[2:].strip(), "level": 0}

     # ========= Section headers
-    def _is_section_header(self, line):
+    @staticmethod
+    def _is_section_header(line):
         return re.match(r"^==+\s+", line)

-    def _parse_section_header(self, line):
+    @staticmethod
+    def _parse_section_header(line):
         match = re.match(r"^(=+)\s+(.*)", line)

         marker = match.group(1)  # The list marker (e.g., "*", "-", "1.")
@@ -303,10 +314,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         }

     # ========= Lists
-    def _is_list_item(self, line):
+    @staticmethod
+    def _is_list_item(line):
         return re.match(r"^(\s)*(\*|-|\d+\.|\w+\.) ", line)

-    def _parse_list_item(self, line):
+    @staticmethod
+    def _parse_list_item(line):
         """Extract the item marker (number or bullet symbol) and the text of the item."""
         match = re.match(r"^(\s*)(\*|-|\d+\.)\s+(.*)", line)
@@ -342,14 +355,17 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         }

     # ========= Tables
-    def _is_table_line(self, line):
+    @staticmethod
+    def _is_table_line(line):
         return re.match(r"^\|.*\|", line)

-    def _parse_table_line(self, line):
+    @staticmethod
+    def _parse_table_line(line):
         # Split table cells and trim extra spaces
         return [cell.strip() for cell in line.split("|") if cell.strip()]

-    def _populate_table_as_grid(self, table_data):
+    @staticmethod
+    def _populate_table_as_grid(table_data):
         num_rows = len(table_data)

         # Adjust the table data into a grid format
@@ -380,10 +396,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return data

     # ========= Pictures
-    def _is_picture(self, line):
+    @staticmethod
+    def _is_picture(line):
         return re.match(r"^image::", line)

-    def _parse_picture(self, line):
+    @staticmethod
+    def _parse_picture(line):
         """
         Parse an image macro, extracting its path and attributes.
         Syntax: image::path/to/image.png[Alt Text, width=200, height=150, align=center]
@@ -406,10 +424,12 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return {"type": "picture", "uri": line}

     # ========= Captions
-    def _is_caption(self, line):
+    @staticmethod
+    def _is_caption(line):
         return re.match(r"^\.(.+)", line)

-    def _parse_caption(self, line):
+    @staticmethod
+    def _parse_caption(line):
         mtch = re.match(r"^\.(.+)", line)
         if mtch:
             text = mtch.group(1)
@@ -418,5 +438,6 @@ class AsciiDocBackend(DeclarativeDocumentBackend):
         return {"type": "caption", "text": ""}

     # ========= Plain text
-    def _parse_text(self, line):
+    @staticmethod
+    def _parse_text(line):
         return {"type": "text", "text": line.strip()}

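The hunk above makes the AsciiDoc backend fall back to a 128x128 size when an `image::` directive carries no `width`/`height` attributes. A standalone sketch of that behaviour follows; the regex and attribute handling are simplified assumptions, not the backend's exact parser.

```python
import re

DEFAULT_IMAGE_WIDTH = 128
DEFAULT_IMAGE_HEIGHT = 128


def parse_image_size(line: str):
    """Return (width, height) for an AsciiDoc image macro, with defaults when missing."""
    attrs = {}
    match = re.match(r"^image::(?P<uri>\S+)\[(?P<body>.*)\]$", line.strip())
    if match:
        for part in match.group("body").split(","):
            if "=" in part:
                key, value = part.split("=", 1)
                attrs[key.strip()] = value.strip()
    if "width" in attrs and "height" in attrs:
        return int(attrs["width"]), int(attrs["height"])
    return DEFAULT_IMAGE_WIDTH, DEFAULT_IMAGE_HEIGHT


print(parse_image_size("image::pics/cat.png[A cat, width=200, height=150]"))  # (200, 150)
print(parse_image_size("image::pics/cat.png[A cat]"))                         # (128, 128)
```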

@@ -7,12 +7,17 @@ from typing import List, Optional, Union
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin, Size
-from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    SegmentedPdfPage,
+    TextCell,
+)
 from docling_parse.pdf_parsers import pdf_parser_v1
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage

 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.backend.pypdfium2_backend import get_pdf_page_geometry
 from docling.datamodel.document import InputDocument

 _log = logging.getLogger(__name__)
@@ -36,6 +41,51 @@ class DoclingParsePageBackend(PdfPageBackend):
     def is_valid(self) -> bool:
         return self.valid

+    def _compute_text_cells(self) -> List[TextCell]:
+        """Compute text cells from docling-parse data."""
+        cells: List[TextCell] = []
+        cell_counter = 0
+
+        if not self.valid:
+            return cells
+
+        page_size = self.get_size()
+        parser_width = self._dpage["width"]
+        parser_height = self._dpage["height"]
+
+        for i in range(len(self._dpage["cells"])):
+            rect = self._dpage["cells"][i]["box"]["device"]
+            x0, y0, x1, y1 = rect
+
+            if x1 < x0:
+                x0, x1 = x1, x0
+            if y1 < y0:
+                y0, y1 = y1, y0
+
+            text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
+            cells.append(
+                TextCell(
+                    index=cell_counter,
+                    text=text_piece,
+                    orig=text_piece,
+                    from_ocr=False,
+                    rect=BoundingRectangle.from_bounding_box(
+                        BoundingBox(
+                            l=x0 * page_size.width / parser_width,
+                            b=y0 * page_size.height / parser_height,
+                            r=x1 * page_size.width / parser_width,
+                            t=y1 * page_size.height / parser_height,
+                            coord_origin=CoordOrigin.BOTTOMLEFT,
+                        )
+                    ).to_top_left_origin(page_size.height),
+                )
+            )
+            cell_counter += 1
+
+        return cells
+
     def get_text_in_rect(self, bbox: BoundingBox) -> str:
         if not self.valid:
             return ""
@@ -70,75 +120,27 @@ class DoclingParsePageBackend(PdfPageBackend):
         return text_piece

     def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
-        return None
+        if not self.valid:
+            return None
+
+        text_cells = self._compute_text_cells()
+
+        # Get the PDF page geometry from pypdfium2
+        dimension = get_pdf_page_geometry(self._ppage)
+
+        # Create SegmentedPdfPage
+        return SegmentedPdfPage(
+            dimension=dimension,
+            textline_cells=text_cells,
+            char_cells=[],
+            word_cells=[],
+            has_lines=len(text_cells) > 0,
+            has_words=False,
+            has_chars=False,
+        )

     def get_text_cells(self) -> Iterable[TextCell]:
-        cells: List[TextCell] = []
-        cell_counter = 0
-
-        if not self.valid:
-            return cells
-
-        page_size = self.get_size()
-        parser_width = self._dpage["width"]
-        parser_height = self._dpage["height"]
-
-        for i in range(len(self._dpage["cells"])):
-            rect = self._dpage["cells"][i]["box"]["device"]
-            x0, y0, x1, y1 = rect
-
-            if x1 < x0:
-                x0, x1 = x1, x0
-            if y1 < y0:
-                y0, y1 = y1, y0
-
-            text_piece = self._dpage["cells"][i]["content"]["rnormalized"]
-            cells.append(
-                TextCell(
-                    index=cell_counter,
-                    text=text_piece,
-                    orig=text_piece,
-                    from_ocr=False,
-                    rect=BoundingRectangle.from_bounding_box(
-                        BoundingBox(
-                            # l=x0, b=y0, r=x1, t=y1,
-                            l=x0 * page_size.width / parser_width,
-                            b=y0 * page_size.height / parser_height,
-                            r=x1 * page_size.width / parser_width,
-                            t=y1 * page_size.height / parser_height,
-                            coord_origin=CoordOrigin.BOTTOMLEFT,
-                        )
-                    ).to_top_left_origin(page_size.height),
-                )
-            )
-            cell_counter += 1
-
-        def draw_clusters_and_cells():
-            image = (
-                self.get_page_image()
-            )  # make new image to avoid drawing on the saved ones
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
-                cell_color = (
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                )
-                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-            image.show()
-
-        # before merge:
-        # draw_clusters_and_cells()
-
-        # cells = merge_horizontal_cells(cells)
-
-        # after merge:
-        # draw_clusters_and_cells()
-
-        return cells
+        return self._compute_text_cells()

     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
         AREA_THRESHOLD = 0  # 32 * 32

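Both the refactored `get_text_cells` and the new `get_segmented_page` above go through `_compute_text_cells`, whose core arithmetic is rescaling the parser's "device" boxes to page coordinates and flipping them to a top-left origin. That arithmetic in isolation, with plain tuples instead of docling-core types:

```python
def parser_box_to_page_box(rect, parser_size, page_size):
    """Rescale a (x0, y0, x1, y1) device box from parser space to page space
    and flip it from a bottom-left to a top-left origin."""
    x0, y0, x1, y1 = rect
    # normalize possibly swapped corners
    if x1 < x0:
        x0, x1 = x1, x0
    if y1 < y0:
        y0, y1 = y1, y0
    sx = page_size[0] / parser_size[0]
    sy = page_size[1] / parser_size[1]
    left, right = x0 * sx, x1 * sx
    bottom, top = y0 * sy, y1 * sy
    page_height = page_size[1]
    # top-left origin: y grows downwards
    return (left, page_height - top, right, page_height - bottom)


print(parser_box_to_page_box((10, 20, 110, 40), (1000, 800), (500, 400)))
# (5.0, 380.0, 55.0, 390.0)
```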

@@ -7,12 +7,19 @@ from typing import TYPE_CHECKING, List, Optional, Union
 import pypdfium2 as pdfium
 from docling_core.types.doc import BoundingBox, CoordOrigin
-from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell
+from docling_core.types.doc.page import (
+    BoundingRectangle,
+    PdfPageBoundaryType,
+    PdfPageGeometry,
+    SegmentedPdfPage,
+    TextCell,
+)
 from docling_parse.pdf_parsers import pdf_parser_v2
 from PIL import Image, ImageDraw
 from pypdfium2 import PdfPage

 from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
+from docling.backend.pypdfium2_backend import get_pdf_page_geometry
 from docling.datamodel.base_models import Size
 from docling.utils.locks import pypdfium2_lock
@@ -40,6 +47,55 @@ class DoclingParseV2PageBackend(PdfPageBackend):
     def is_valid(self) -> bool:
         return self.valid

+    def _compute_text_cells(self) -> List[TextCell]:
+        """Compute text cells from docling-parse v2 data."""
+        cells: List[TextCell] = []
+        cell_counter = 0
+
+        if not self.valid:
+            return cells
+
+        page_size = self.get_size()
+
+        parser_width = self._dpage["sanitized"]["dimension"]["width"]
+        parser_height = self._dpage["sanitized"]["dimension"]["height"]
+
+        cells_data = self._dpage["sanitized"]["cells"]["data"]
+        cells_header = self._dpage["sanitized"]["cells"]["header"]
+
+        for i, cell_data in enumerate(cells_data):
+            x0 = cell_data[cells_header.index("x0")]
+            y0 = cell_data[cells_header.index("y0")]
+            x1 = cell_data[cells_header.index("x1")]
+            y1 = cell_data[cells_header.index("y1")]
+
+            if x1 < x0:
+                x0, x1 = x1, x0
+            if y1 < y0:
+                y0, y1 = y1, y0
+
+            text_piece = cell_data[cells_header.index("text")]
+            cells.append(
+                TextCell(
+                    index=cell_counter,
+                    text=text_piece,
+                    orig=text_piece,
+                    from_ocr=False,
+                    rect=BoundingRectangle.from_bounding_box(
+                        BoundingBox(
+                            l=x0 * page_size.width / parser_width,
+                            b=y0 * page_size.height / parser_height,
+                            r=x1 * page_size.width / parser_width,
+                            t=y1 * page_size.height / parser_height,
+                            coord_origin=CoordOrigin.BOTTOMLEFT,
+                        )
+                    ).to_top_left_origin(page_size.height),
+                )
+            )
+            cell_counter += 1
+
+        return cells
+
     def get_text_in_rect(self, bbox: BoundingBox) -> str:
         if not self.valid:
             return ""
@@ -81,73 +137,27 @@ class DoclingParseV2PageBackend(PdfPageBackend):
         return text_piece

     def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
-        return None
+        if not self.valid:
+            return None
+
+        text_cells = self._compute_text_cells()
+
+        # Get the PDF page geometry from pypdfium2
+        dimension = get_pdf_page_geometry(self._ppage)
+
+        # Create SegmentedPdfPage
+        return SegmentedPdfPage(
+            dimension=dimension,
+            textline_cells=text_cells,
+            char_cells=[],
+            word_cells=[],
+            has_textlines=len(text_cells) > 0,
+            has_words=False,
+            has_chars=False,
+        )

     def get_text_cells(self) -> Iterable[TextCell]:
-        cells: List[TextCell] = []
-        cell_counter = 0
-
-        if not self.valid:
-            return cells
-
-        page_size = self.get_size()
-
-        parser_width = self._dpage["sanitized"]["dimension"]["width"]
-        parser_height = self._dpage["sanitized"]["dimension"]["height"]
-
-        cells_data = self._dpage["sanitized"]["cells"]["data"]
-        cells_header = self._dpage["sanitized"]["cells"]["header"]
-
-        for i, cell_data in enumerate(cells_data):
-            x0 = cell_data[cells_header.index("x0")]
-            y0 = cell_data[cells_header.index("y0")]
-            x1 = cell_data[cells_header.index("x1")]
-            y1 = cell_data[cells_header.index("y1")]
-
-            if x1 < x0:
-                x0, x1 = x1, x0
-            if y1 < y0:
-                y0, y1 = y1, y0
-
-            text_piece = cell_data[cells_header.index("text")]
-            cells.append(
-                TextCell(
-                    index=cell_counter,
-                    text=text_piece,
-                    orig=text_piece,
-                    from_ocr=False,
-                    rect=BoundingRectangle.from_bounding_box(
-                        BoundingBox(
-                            # l=x0, b=y0, r=x1, t=y1,
-                            l=x0 * page_size.width / parser_width,
-                            b=y0 * page_size.height / parser_height,
-                            r=x1 * page_size.width / parser_width,
-                            t=y1 * page_size.height / parser_height,
-                            coord_origin=CoordOrigin.BOTTOMLEFT,
-                        )
-                    ).to_top_left_origin(page_size.height),
-                )
-            )
-            cell_counter += 1
-
-        def draw_clusters_and_cells():
-            image = (
-                self.get_page_image()
-            )  # make new image to avoid drawing on the saved ones
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.bbox.as_tuple()
-                cell_color = (
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                )
-                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-            image.show()
-
-        # draw_clusters_and_cells()
-
-        return cells
+        return self._compute_text_cells()

     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
         AREA_THRESHOLD = 0  # 32 * 32

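The v2 parser payload used here stores cells as flat rows plus a shared header list, so fields are looked up by column name. A tiny sketch of that access pattern; the sample header and row are made up, the field names follow the diff:

```python
# Columnar cell data as produced by docling-parse v2: one shared header list,
# one flat value list per cell. Caching column indices avoids repeated .index() calls.
cells_header = ["x0", "y0", "x1", "y1", "text"]
cells_data = [[72.0, 700.1, 180.5, 712.3, "Hello world"]]

col = {name: cells_header.index(name) for name in cells_header}
for row in cells_data:
    bbox = (row[col["x0"]], row[col["y0"]], row[col["x1"]], row[col["y1"]])
    print(bbox, row[col["text"]])
```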

@@ -59,20 +59,6 @@
         return self._dpage

     def get_text_cells(self) -> Iterable[TextCell]:
-        page_size = self.get_size()
-
-        [tc.to_top_left_origin(page_size.height) for tc in self._dpage.textline_cells]
-
-        # for cell in self._dpage.textline_cells:
-        #     rect = cell.rect
-        #
-        #     assert (
-        #         rect.to_bounding_box().l <= rect.to_bounding_box().r
-        #     ), f"left is > right on bounding box {rect.to_bounding_box()} of rect {rect}"
-        #     assert (
-        #         rect.to_bounding_box().t <= rect.to_bounding_box().b
-        #     ), f"top is > bottom on bounding box {rect.to_bounding_box()} of rect {rect}"
-
         return self._dpage.textline_cells

     def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
@@ -171,12 +157,28 @@
         self, page_no: int, create_words: bool = True, create_textlines: bool = True
     ) -> DoclingParseV4PageBackend:
         with pypdfium2_lock:
+            seg_page = self.dp_doc.get_page(
+                page_no + 1,
+                create_words=create_words,
+                create_textlines=create_textlines,
+            )
+
+            # In Docling, all TextCell instances are expected with top-left origin.
+            [
+                tc.to_top_left_origin(seg_page.dimension.height)
+                for tc in seg_page.textline_cells
+            ]
+            [
+                tc.to_top_left_origin(seg_page.dimension.height)
+                for tc in seg_page.char_cells
+            ]
+            [
+                tc.to_top_left_origin(seg_page.dimension.height)
+                for tc in seg_page.word_cells
+            ]
+
             return DoclingParseV4PageBackend(
-                self.dp_doc.get_page(
-                    page_no + 1,
-                    create_words=create_words,
-                    create_textlines=create_textlines,
-                ),
+                seg_page,
                 self._pdoc[page_no],
             )


@@ -1,17 +1,15 @@
 import logging
 import re
 import warnings
+from copy import deepcopy
 from io import BytesIO
 from pathlib import Path
 from typing import List, Optional, Set, Union

 import marko
 import marko.element
-import marko.ext
-import marko.ext.gfm
 import marko.inline
 from docling_core.types.doc import (
-    DocItem,
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
@@ -21,7 +19,9 @@ from docling_core.types.doc import (
     TableData,
     TextItem,
 )
+from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
 from marko import Markdown
+from pydantic import AnyUrl, TypeAdapter

 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.backend.html_backend import HTMLDocumentBackend
@@ -71,7 +71,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         self.in_table = False
         self.md_table_buffer: list[str] = []
-        self.inline_texts: list[str] = []
         self._html_blocks: int = 0

         try:
@@ -156,25 +155,16 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             doc.add_table(data=table_data)
         return

-    def _process_inline_text(
-        self, parent_item: Optional[NodeItem], doc: DoclingDocument
-    ):
-        txt = " ".join(self.inline_texts)
-        if len(txt) > 0:
-            doc.add_text(
-                label=DocItemLabel.PARAGRAPH,
-                parent=parent_item,
-                text=txt,
-            )
-        self.inline_texts = []
-
     def _iterate_elements(  # noqa: C901
         self,
+        *,
         element: marko.element.Element,
         depth: int,
         doc: DoclingDocument,
         visited: Set[marko.element.Element],
         parent_item: Optional[NodeItem] = None,
+        formatting: Optional[Formatting] = None,
+        hyperlink: Optional[Union[AnyUrl, Path]] = None,
     ):
         if element in visited:
             return
@@ -183,44 +173,32 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         # Check for different element types and process relevant details
         if isinstance(element, marko.block.Heading) and len(element.children) > 0:
             self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
             _log.debug(
                 f" - Heading level {element.level}, content: {element.children[0].children}"  # type: ignore
             )
-            if element.level == 1:
-                doc_label = DocItemLabel.TITLE
-            else:
-                doc_label = DocItemLabel.SECTION_HEADER
-
-            # Header could have arbitrary inclusion of bold, italic or emphasis,
-            # hence we need to traverse the tree to get full text of a header
-            strings: List[str] = []
-
-            # Define a recursive function to traverse the tree
-            def traverse(node: marko.block.BlockElement):
-                # Check if the node has a "children" attribute
-                if hasattr(node, "children"):
-                    # If "children" is a list, continue traversal
-                    if isinstance(node.children, list):
-                        for child in node.children:
-                            traverse(child)
-                    # If "children" is text, add it to header text
-                    elif isinstance(node.children, str):
-                        strings.append(node.children)
-
-            traverse(element)
-            snippet_text = "".join(strings)
-            if len(snippet_text) > 0:
-                if doc_label == DocItemLabel.SECTION_HEADER:
-                    parent_item = doc.add_heading(
-                        text=snippet_text,
-                        level=element.level - 1,
-                        parent=parent_item,
-                    )
-                else:
-                    parent_item = doc.add_text(
-                        label=doc_label, parent=parent_item, text=snippet_text
-                    )
+            if len(element.children) == 1:
+                child = element.children[0]
+                snippet_text = str(child.children)  # type: ignore
+                visited.add(child)
+            else:
+                snippet_text = ""  # inline group will be created
+
+            if element.level == 1:
+                parent_item = doc.add_title(
+                    text=snippet_text,
+                    parent=parent_item,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
+                )
+            else:
+                parent_item = doc.add_heading(
+                    text=snippet_text,
+                    level=element.level - 1,
+                    parent=parent_item,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
+                )

         elif isinstance(element, marko.block.List):
             has_non_empty_list_items = False
@@ -230,7 +208,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     break

             self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
             _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}")
             if has_non_empty_list_items:
                 label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST
@@ -240,41 +217,60 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         elif (
             isinstance(element, marko.block.ListItem)
-            and len(element.children) > 0
-            and isinstance((first_child := element.children[0]), marko.block.Paragraph)
+            and len(element.children) == 1
+            and isinstance((child := element.children[0]), marko.block.Paragraph)
+            and len(child.children) > 0
         ):
             self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
             _log.debug(" - List item")

-            snippet_text = str(first_child.children[0].children)  # type: ignore
-            is_numbered = False
-            if (
-                parent_item is not None
-                and isinstance(parent_item, DocItem)
-                and parent_item.label == GroupLabel.ORDERED_LIST
-            ):
-                is_numbered = True
-            doc.add_list_item(
-                enumerated=is_numbered, parent=parent_item, text=snippet_text
+            if len(child.children) == 1:
+                snippet_text = str(child.children[0].children)  # type: ignore
+                visited.add(child)
+            else:
+                snippet_text = ""  # inline group will be created
+
+            is_numbered = isinstance(parent_item, OrderedList)
+            if not isinstance(parent_item, (OrderedList, UnorderedList)):
+                _log.warning("ListItem would have not had a list parent, adding one.")
+                parent_item = doc.add_unordered_list(parent=parent_item)
+            parent_item = doc.add_list_item(
+                enumerated=is_numbered,
+                parent=parent_item,
+                text=snippet_text,
+                formatting=formatting,
+                hyperlink=hyperlink,
             )
-            visited.add(first_child)

         elif isinstance(element, marko.inline.Image):
             self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")

             fig_caption: Optional[TextItem] = None
             if element.title is not None and element.title != "":
                 fig_caption = doc.add_text(
-                    label=DocItemLabel.CAPTION, text=element.title
+                    label=DocItemLabel.CAPTION,
+                    text=element.title,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
                 )

             doc.add_picture(parent=parent_item, caption=fig_caption)

-        elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0:
-            self._process_inline_text(parent_item, doc)
+        elif isinstance(element, marko.inline.Emphasis):
+            _log.debug(f" - Emphasis: {element.children}")
+            formatting = deepcopy(formatting) if formatting else Formatting()
+            formatting.italic = True
+
+        elif isinstance(element, marko.inline.StrongEmphasis):
+            _log.debug(f" - StrongEmphasis: {element.children}")
+            formatting = deepcopy(formatting) if formatting else Formatting()
+            formatting.bold = True
+
+        elif isinstance(element, marko.inline.Link):
+            _log.debug(f" - Link: {element.children}")
+            hyperlink = TypeAdapter(Optional[Union[AnyUrl, Path]]).validate_python(
+                element.dest
+            )

         elif isinstance(element, marko.inline.RawText):
             _log.debug(f" - Paragraph (raw text): {element.children}")
@@ -287,28 +283,41 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text
                 else:
                     self.md_table_buffer.append(snippet_text)
-            else:
+            elif snippet_text:
                 self._close_table(doc)
-                # most likely just inline text
-                self.inline_texts.append(str(element.children))
+                doc.add_text(
+                    label=DocItemLabel.TEXT,
+                    parent=parent_item,
+                    text=snippet_text,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
+                )

         elif isinstance(element, marko.inline.CodeSpan):
             self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Code Span: {element.children}")
             snippet_text = str(element.children).strip()
-            doc.add_code(parent=parent_item, text=snippet_text)
+            doc.add_code(
+                parent=parent_item,
+                text=snippet_text,
+                formatting=formatting,
+                hyperlink=hyperlink,
+            )

         elif (
             isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode))
             and len(element.children) > 0
-            and isinstance((first_child := element.children[0]), marko.inline.RawText)
-            and len(snippet_text := (first_child.children.strip())) > 0
+            and isinstance((child := element.children[0]), marko.inline.RawText)
+            and len(snippet_text := (child.children.strip())) > 0
         ):
             self._close_table(doc)
-            self._process_inline_text(parent_item, doc)
             _log.debug(f" - Code Block: {element.children}")
-            doc.add_code(parent=parent_item, text=snippet_text)
+            doc.add_code(
+                parent=parent_item,
+                text=snippet_text,
+                formatting=formatting,
+                hyperlink=hyperlink,
+            )

         elif isinstance(element, marko.inline.LineBreak):
             if self.in_table:
@@ -317,7 +326,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
         elif isinstance(element, marko.block.HTMLBlock):
             self._html_blocks += 1
-            self._process_inline_text(parent_item, doc)
             self._close_table(doc)
             _log.debug(f"HTML Block: {element}")
             if (
@@ -327,14 +335,25 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 # wrap in markers to enable post-processing in convert()
                 text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
-                doc.add_code(parent=parent_item, text=text_to_add)
+                doc.add_code(
+                    parent=parent_item,
+                    text=text_to_add,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
+                )

         else:
             if not isinstance(element, str):
                 self._close_table(doc)
                 _log.debug(f"Some other element: {element}")

+        if (
+            isinstance(element, (marko.block.Paragraph, marko.block.Heading))
+            and len(element.children) > 1
+        ):
+            parent_item = doc.add_inline_group(parent=parent_item)
+
         processed_block_types = (
-            marko.block.Heading,
+            # marko.block.Heading,
             marko.block.CodeBlock,
             marko.block.FencedCode,
             marko.inline.RawText,
@@ -351,6 +370,8 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                     doc=doc,
                     visited=visited,
                     parent_item=parent_item,
+                    formatting=formatting,
+                    hyperlink=hyperlink,
                 )

     def is_valid(self) -> bool:
@@ -392,7 +413,6 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
                 parent_item=None,
                 visited=set(),
             )
-            self._process_inline_text(None, doc)  # handle last hanging inline text
             self._close_table(doc=doc)  # handle any last hanging table

         # if HTML blocks were detected, export to HTML and delegate to HTML backend

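The traversal above now threads `formatting` and `hyperlink` down the marko AST instead of flattening inline text. Below is a self-contained sketch of that idea, carrying a small state object through a recursive walk. It uses only marko (as the backend does) and none of the docling-core APIs, and assumes `Markdown().parse()` yields inline nodes as it does for the backend above:

```python
from dataclasses import dataclass, replace
from typing import Optional

import marko.inline
from marko import Markdown


@dataclass(frozen=True)
class InlineState:
    bold: bool = False
    italic: bool = False
    hyperlink: Optional[str] = None


def walk(element, state: InlineState = InlineState()) -> None:
    """Print each raw-text run with the formatting state inherited from its ancestors."""
    if isinstance(element, marko.inline.StrongEmphasis):
        state = replace(state, bold=True)
    elif isinstance(element, marko.inline.Emphasis):
        state = replace(state, italic=True)
    elif isinstance(element, marko.inline.Link):
        state = replace(state, hyperlink=element.dest)
    elif isinstance(element, marko.inline.RawText):
        print(f"{element.children!r} -> {state}")
        return
    for child in getattr(element, "children", []) or []:
        if not isinstance(child, str):
            walk(child, state)


doc = Markdown().parse("A **bold [link](https://example.com)** and *italics*.")
walk(doc)
```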

@@ -20,6 +20,7 @@ from docling_core.types.doc.document import ContentLayer
 from PIL import Image, UnidentifiedImageError
 from pptx import Presentation
 from pptx.enum.shapes import MSO_SHAPE_TYPE, PP_PLACEHOLDER
+from pptx.oxml.text import CT_TextLineBreak

 from docling.backend.abstract_backend import (
     DeclarativeDocumentBackend,
@@ -120,136 +121,91 @@
         return prov

-    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):  # noqa: C901
-        is_a_list = False
+    def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size):
         is_list_group_created = False
         enum_list_item_value = 0
         new_list = None
-        bullet_type = "None"
-        list_label = GroupLabel.LIST
         doc_label = DocItemLabel.LIST_ITEM
         prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size)

-        # Identify if shape contains lists
-        for paragraph in shape.text_frame.paragraphs:
-            # Check if paragraph is a bullet point using the `element` XML
-            p = paragraph._element
-            if (
-                p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
-                is not None
-            ):
-                bullet_type = "Bullet"
-                is_a_list = True
-            elif (
-                p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
-                is not None
-            ):
-                bullet_type = "Numbered"
-                is_a_list = True
-            else:
-                is_a_list = False
-
-            if paragraph.level > 0:
-                # Most likely a sub-list
-                is_a_list = True
-
-            if is_a_list:
-                # Determine if this is an unordered list or an ordered list.
-                # Set GroupLabel.ORDERED_LIST when it fits.
-                if bullet_type == "Numbered":
-                    list_label = GroupLabel.ORDERED_LIST
-
-        if is_a_list:
-            _log.debug("LIST DETECTED!")
-        else:
-            _log.debug("No List")
-
-        # If there is a list inside of the shape, create a new docling list to assign list items to
-        # if is_a_list:
-        #     new_list = doc.add_group(
-        #         label=list_label, name=f"list", parent=parent_slide
-        #     )
+        def is_list_item(paragraph):
+            """Check if the paragraph is a list item."""
+            p = paragraph._element
+            if (
+                p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
+                is not None
+            ):
+                return (True, "Bullet")
+            elif (
+                p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
+                is not None
+            ):
+                return (True, "Numbered")
+            elif paragraph.level > 0:
+                # Most likely a sub-list
+                return (True, "None")
+            else:
+                return (False, "None")

         # Iterate through paragraphs to build up text
         for paragraph in shape.text_frame.paragraphs:
-            # p_text = paragraph.text.strip()
+            is_a_list, bullet_type = is_list_item(paragraph)
             p = paragraph._element
-            enum_list_item_value += 1
-            inline_paragraph_text = ""
-            inline_list_item_text = ""

-            for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
-                if len(e.text.strip()) > 0:
-                    e_is_a_list_item = False
-                    is_numbered = False
-                    if (
-                        p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
-                        is not None
-                    ):
-                        bullet_type = "Bullet"
-                        e_is_a_list_item = True
-                    elif (
-                        p.find(".//a:buAutoNum", namespaces={"a": self.namespaces["a"]})
-                        is not None
-                    ):
-                        bullet_type = "Numbered"
-                        is_numbered = True
-                        e_is_a_list_item = True
-                    else:
-                        e_is_a_list_item = False
-
-                    if e_is_a_list_item:
-                        if len(inline_paragraph_text) > 0:
-                            # output accumulated inline text:
-                            doc.add_text(
-                                label=doc_label,
-                                parent=parent_slide,
-                                text=inline_paragraph_text,
-                                prov=prov,
-                            )
-                        # Set marker and enumerated arguments if this is an enumeration element.
-                        inline_list_item_text += e.text
-                        # print(e.text)
-                    else:
-                        # Assign proper label to the text, depending if it's a Title or Section Header
-                        # For other types of text, assign - PARAGRAPH
-                        doc_label = DocItemLabel.PARAGRAPH
-                        if shape.is_placeholder:
-                            placeholder_type = shape.placeholder_format.type
-                            if placeholder_type in [
-                                PP_PLACEHOLDER.CENTER_TITLE,
-                                PP_PLACEHOLDER.TITLE,
-                            ]:
-                                # It's a title
-                                doc_label = DocItemLabel.TITLE
-                            elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
-                                DocItemLabel.SECTION_HEADER
-                        enum_list_item_value = 0
-                        inline_paragraph_text += e.text
-
-            if len(inline_paragraph_text) > 0:
-                # output accumulated inline text:
-                doc.add_text(
-                    label=doc_label,
-                    parent=parent_slide,
-                    text=inline_paragraph_text,
-                    prov=prov,
-                )
-
-            if len(inline_list_item_text) > 0:
-                enum_marker = ""
-                if is_numbered:
-                    enum_marker = str(enum_list_item_value) + "."
-                if not is_list_group_created:
-                    new_list = doc.add_group(
-                        label=list_label, name="list", parent=parent_slide
-                    )
-                    is_list_group_created = True
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_list,
-                    text=inline_list_item_text,
-                    prov=prov,
-                )
+            # Convert line breaks to spaces and accumulate text
+            p_text = ""
+            for e in p.content_children:
+                if isinstance(e, CT_TextLineBreak):
+                    p_text += " "
+                else:
+                    p_text += e.text
+
+            if is_a_list:
+                enum_marker = ""
+                enumerated = bullet_type == "Numbered"
+
+                if not is_list_group_created:
+                    new_list = doc.add_group(
+                        label=GroupLabel.ORDERED_LIST
+                        if enumerated
+                        else GroupLabel.LIST,
+                        name="list",
+                        parent=parent_slide,
+                    )
+                    is_list_group_created = True
+                    enum_list_item_value = 0
+
+                if enumerated:
+                    enum_list_item_value += 1
+                    enum_marker = str(enum_list_item_value) + "."
+
+                doc.add_list_item(
+                    marker=enum_marker,
+                    enumerated=enumerated,
+                    parent=new_list,
+                    text=p_text,
+                    prov=prov,
+                )
+            else:  # is paragraph not a list item
+                # Assign proper label to the text, depending if it's a Title or Section Header
+                # For other types of text, assign - PARAGRAPH
+                doc_label = DocItemLabel.PARAGRAPH
+                if shape.is_placeholder:
+                    placeholder_type = shape.placeholder_format.type
+                    if placeholder_type in [
+                        PP_PLACEHOLDER.CENTER_TITLE,
+                        PP_PLACEHOLDER.TITLE,
+                    ]:
+                        # It's a title
+                        doc_label = DocItemLabel.TITLE
+                    elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
+                        DocItemLabel.SECTION_HEADER
+
+                # output accumulated inline text:
+                doc.add_text(
+                    label=doc_label,
+                    parent=parent_slide,
+                    text=p_text,
+                    prov=prov,
+                )
         return

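The new `is_list_item` helper classifies each paragraph straight from its DrawingML XML (`a:buChar` for bullets, `a:buAutoNum` for numbering, the indentation level for sub-lists), and line breaks inside runs are now folded into spaces. A standalone sketch of the classification check with lxml; the namespace URI is the standard DrawingML one and the XML snippet is made up:

```python
from lxml import etree

A_NS = {"a": "http://schemas.openxmlformats.org/drawingml/2006/main"}


def classify_paragraph(p, level: int = 0):
    """Return (is_list_item, bullet_type) using the same XML checks as the diff."""
    if p.find(".//a:buChar", namespaces=A_NS) is not None:
        return True, "Bullet"
    if p.find(".//a:buAutoNum", namespaces=A_NS) is not None:
        return True, "Numbered"
    if level > 0:
        return True, "None"  # most likely a sub-list
    return False, "None"


xml = (
    '<a:p xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">'
    '<a:pPr><a:buChar char="-"/></a:pPr><a:r><a:t>item</a:t></a:r></a:p>'
)
print(classify_paragraph(etree.fromstring(xml)))  # (True, 'Bullet')
```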

@@ -14,7 +14,7 @@ from docling_core.types.doc import (
     TableCell,
     TableData,
 )
-from docling_core.types.doc.document import Formatting
+from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList
 from docx import Document
 from docx.document import Document as DocxDocument
 from docx.oxml.table import CT_Tc
@@ -84,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
             self.valid = True
         except Exception as e:
             raise RuntimeError(
-                f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}"
+                f"MsWordDocumentBackend could not load document with hash {self.document_hash}"
             ) from e

     @override
@@ -251,9 +251,15 @@
                     self._handle_tables(element, docx_obj, doc)
                 except Exception:
                     _log.debug("could not parse a table, broken docx table")
+            # Check for Image
             elif drawing_blip:
                 self._handle_pictures(docx_obj, drawing_blip, doc)
+                # Check for Text after the Image
+                if (
+                    tag_name in ["p"]
+                    and element.find(".//w:t", namespaces=namespaces) is not None
+                ):
+                    self._handle_text_elements(element, docx_obj, doc)
             # Check for the sdt containers, like table of contents
             elif tag_name in ["sdt"]:
                 sdt_content = element.find(".//w:sdtContent", namespaces=namespaces)
@@ -268,6 +274,7 @@
                 self._handle_text_elements(element, docx_obj, doc)
             else:
                 _log.debug(f"Ignoring element in DOCX with tag: {tag_name}")
+
         return doc

     def _str_to_int(
@@ -578,7 +585,7 @@
         all_paragraphs = []

         # Sort paragraphs within each container, then process containers
-        for container_id, paragraphs in container_paragraphs.items():
+        for paragraphs in container_paragraphs.values():
             # Sort by vertical position within each container
             sorted_container_paragraphs = sorted(
                 paragraphs,
@@ -689,14 +696,13 @@
         doc: DoclingDocument,
     ) -> None:
         paragraph = Paragraph(element, docx_obj)
-        paragraph_elements = self._get_paragraph_elements(paragraph)
         text, equations = self._handle_equations_in_text(
             element=element, text=paragraph.text
         )

         if text is None:
             return
+        paragraph_elements = self._get_paragraph_elements(paragraph)
         text = text.strip()

         # Common styles for bullet and numbered lists.
@@ -912,6 +918,44 @@
         )
         return

+    def _add_formatted_list_item(
+        self,
+        doc: DoclingDocument,
+        elements: list,
+        marker: str,
+        enumerated: bool,
+        level: int,
+    ) -> None:
+        # This should not happen by construction
+        if not isinstance(self.parents[level], (OrderedList, UnorderedList)):
+            return
+        if len(elements) == 1:
+            text, format, hyperlink = elements[0]
+            doc.add_list_item(
+                marker=marker,
+                enumerated=enumerated,
+                parent=self.parents[level],
+                text=text,
+                formatting=format,
+                hyperlink=hyperlink,
+            )
+        else:
+            new_item = doc.add_list_item(
+                marker=marker,
+                enumerated=enumerated,
+                parent=self.parents[level],
+                text="",
+            )
+            new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item)
+            for text, format, hyperlink in elements:
+                doc.add_text(
+                    label=DocItemLabel.TEXT,
+                    parent=new_parent,
+                    text=text,
+                    formatting=format,
+                    hyperlink=hyperlink,
+                )
+
     def _add_list_item(
         self,
         *,
@@ -921,6 +965,9 @@
         elements: list,
         is_numbered: bool = False,
     ) -> None:
+        # TODO: this method is always called with is_numbered. Numbered lists should be properly addressed.
+        if not elements:
+            return None
         enum_marker = ""

         level = self._get_level()
@@ -937,21 +984,9 @@
             if is_numbered:
                 enum_marker = str(self.listIter) + "."
                 is_numbered = True
-            new_parent = self._create_or_reuse_parent(
-                doc=doc,
-                prev_parent=self.parents[level],
-                paragraph_elements=elements,
+            self._add_formatted_list_item(
+                doc, elements, enum_marker, is_numbered, level
             )
-            for text, format, hyperlink in elements:
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_parent,
-                    text=text,
-                    formatting=format,
-                    hyperlink=hyperlink,
-                )
         elif (
             self._prev_numid() == numid
             and self.level_at_new_list is not None
@@ -981,28 +1016,20 @@
             if is_numbered:
                 enum_marker = str(self.listIter) + "."
                 is_numbered = True
-            new_parent = self._create_or_reuse_parent(
-                doc=doc,
-                prev_parent=self.parents[self.level_at_new_list + ilevel],
-                paragraph_elements=elements,
+            self._add_formatted_list_item(
+                doc,
+                elements,
+                enum_marker,
+                is_numbered,
+                self.level_at_new_list + ilevel,
             )
-            for text, format, hyperlink in elements:
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_parent,
-                    text=text,
-                    formatting=format,
-                    hyperlink=hyperlink,
-                )
         elif (
             self._prev_numid() == numid
             and self.level_at_new_list is not None
             and prev_indent is not None
             and ilevel < prev_indent
         ):  # Close list
-            for k, v in self.parents.items():
+            for k in self.parents:
                 if k > self.level_at_new_list + ilevel:
                     self.parents[k] = None
@@ -1011,20 +1038,13 @@
             if is_numbered:
                 enum_marker = str(self.listIter) + "."
                 is_numbered = True
-            new_parent = self._create_or_reuse_parent(
-                doc=doc,
-                prev_parent=self.parents[self.level_at_new_list + ilevel],
-                paragraph_elements=elements,
+            self._add_formatted_list_item(
+                doc,
+                elements,
+                enum_marker,
+                is_numbered,
+                self.level_at_new_list + ilevel,
             )
-            for text, format, hyperlink in elements:
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_parent,
-                    text=text,
-                    formatting=format,
-                    hyperlink=hyperlink,
-                )

             self.listIter = 0
         elif self._prev_numid() == numid or prev_indent == ilevel:
@@ -1033,21 +1053,10 @@
             if is_numbered:
                 enum_marker = str(self.listIter) + "."
                 is_numbered = True
-            new_parent = self._create_or_reuse_parent(
-                doc=doc,
-                prev_parent=self.parents[level - 1],
-                paragraph_elements=elements,
+            self._add_formatted_list_item(
+                doc, elements, enum_marker, is_numbered, level - 1
             )
-            for text, format, hyperlink in elements:
-                # Add the list item to the parent group
-                doc.add_list_item(
-                    marker=enum_marker,
-                    enumerated=is_numbered,
-                    parent=new_parent,
-                    text=text,
-                    formatting=format,
-                    hyperlink=hyperlink,
-                )
         return

     def _handle_tables(

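The new `_add_formatted_list_item` keeps a single-run item as a plain list item and, for multi-run items, adds an empty list item with an inline group holding one text node per run. A hedged, docling-free sketch of that branching with made-up container types:

```python
from dataclasses import dataclass, field
from typing import List, Optional, Tuple

Run = Tuple[str, Optional[str], Optional[str]]  # (text, formatting, hyperlink)


@dataclass
class Node:
    kind: str
    text: str = ""
    children: List["Node"] = field(default_factory=list)


def add_list_item(parent: Node, runs: List[Run]) -> None:
    """One run -> a plain list item; several runs -> empty item + inline group."""
    if len(runs) == 1:
        text, _fmt, _link = runs[0]
        parent.children.append(Node("list_item", text))
        return
    item = Node("list_item", "")
    inline = Node("inline_group")
    item.children.append(inline)
    for text, _fmt, _link in runs:
        inline.children.append(Node("text", text))
    parent.children.append(item)


root = Node("list")
add_list_item(root, [("plain item", None, None)])
add_list_item(root, [("bold part", "bold", None), (" and a link", None, "https://example.com")])
print(root)
```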

@ -8,7 +8,13 @@ from typing import TYPE_CHECKING, List, Optional, Union
import pypdfium2 as pdfium import pypdfium2 as pdfium
import pypdfium2.raw as pdfium_c import pypdfium2.raw as pdfium_c
from docling_core.types.doc import BoundingBox, CoordOrigin, Size from docling_core.types.doc import BoundingBox, CoordOrigin, Size
from docling_core.types.doc.page import BoundingRectangle, SegmentedPdfPage, TextCell from docling_core.types.doc.page import (
BoundingRectangle,
PdfPageBoundaryType,
PdfPageGeometry,
SegmentedPdfPage,
TextCell,
)
from PIL import Image, ImageDraw from PIL import Image, ImageDraw
from pypdfium2 import PdfTextPage from pypdfium2 import PdfTextPage
from pypdfium2._helpers.misc import PdfiumError from pypdfium2._helpers.misc import PdfiumError
@ -16,6 +22,76 @@ from pypdfium2._helpers.misc import PdfiumError
from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend from docling.backend.pdf_backend import PdfDocumentBackend, PdfPageBackend
from docling.utils.locks import pypdfium2_lock from docling.utils.locks import pypdfium2_lock
def get_pdf_page_geometry(
ppage: pdfium.PdfPage,
angle: float = 0.0,
boundary_type: PdfPageBoundaryType = PdfPageBoundaryType.CROP_BOX,
) -> PdfPageGeometry:
"""
Create PdfPageGeometry from a pypdfium2 PdfPage object.
Args:
ppage: pypdfium2 PdfPage object
angle: Page rotation angle in degrees (default: 0.0)
boundary_type: The boundary type for the page (default: CROP_BOX)
Returns:
PdfPageGeometry with all the different bounding boxes properly set
"""
with pypdfium2_lock:
# Get the main bounding box (intersection of crop_box and media_box)
bbox_tuple = ppage.get_bbox()
bbox = BoundingBox.from_tuple(bbox_tuple, CoordOrigin.BOTTOMLEFT)
# Get all the different page boxes from pypdfium2
media_box_tuple = ppage.get_mediabox()
crop_box_tuple = ppage.get_cropbox()
art_box_tuple = ppage.get_artbox()
bleed_box_tuple = ppage.get_bleedbox()
trim_box_tuple = ppage.get_trimbox()
# Convert to BoundingBox objects using existing from_tuple method
# pypdfium2 returns (x0, y0, x1, y1) in PDF coordinate system (bottom-left origin)
# Use bbox as fallback when specific box types are not defined
media_bbox = (
BoundingBox.from_tuple(media_box_tuple, CoordOrigin.BOTTOMLEFT)
if media_box_tuple
else bbox
)
crop_bbox = (
BoundingBox.from_tuple(crop_box_tuple, CoordOrigin.BOTTOMLEFT)
if crop_box_tuple
else bbox
)
art_bbox = (
BoundingBox.from_tuple(art_box_tuple, CoordOrigin.BOTTOMLEFT)
if art_box_tuple
else bbox
)
bleed_bbox = (
BoundingBox.from_tuple(bleed_box_tuple, CoordOrigin.BOTTOMLEFT)
if bleed_box_tuple
else bbox
)
trim_bbox = (
BoundingBox.from_tuple(trim_box_tuple, CoordOrigin.BOTTOMLEFT)
if trim_box_tuple
else bbox
)
return PdfPageGeometry(
angle=angle,
rect=BoundingRectangle.from_bounding_box(bbox),
boundary_type=boundary_type,
art_bbox=art_bbox,
bleed_bbox=bleed_bbox,
crop_bbox=crop_bbox,
media_bbox=media_bbox,
trim_bbox=trim_bbox,
)
if TYPE_CHECKING: if TYPE_CHECKING:
from docling.datamodel.document import InputDocument from docling.datamodel.document import InputDocument
@ -41,38 +117,8 @@ class PyPdfiumPageBackend(PdfPageBackend):
def is_valid(self) -> bool: def is_valid(self) -> bool:
return self.valid return self.valid
def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]: def _compute_text_cells(self) -> List[TextCell]:
AREA_THRESHOLD = 0 # 32 * 32 """Compute text cells from pypdfium."""
page_size = self.get_size()
with pypdfium2_lock:
for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
pos = obj.get_pos()
cropbox = BoundingBox.from_tuple(
pos, origin=CoordOrigin.BOTTOMLEFT
).to_top_left_origin(page_height=page_size.height)
if cropbox.area() > AREA_THRESHOLD:
cropbox = cropbox.scaled(scale=scale)
yield cropbox
def get_text_in_rect(self, bbox: BoundingBox) -> str:
with pypdfium2_lock:
if not self.text_page:
self.text_page = self._ppage.get_textpage()
if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
bbox = bbox.to_bottom_left_origin(self.get_size().height)
with pypdfium2_lock:
text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
return text_piece
def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
return None
def get_text_cells(self) -> Iterable[TextCell]:
with pypdfium2_lock: with pypdfium2_lock:
if not self.text_page: if not self.text_page:
self.text_page = self._ppage.get_textpage() self.text_page = self._ppage.get_textpage()
@ -203,30 +249,58 @@ class PyPdfiumPageBackend(PdfPageBackend):
                return merged_cells

-        def draw_clusters_and_cells():
-            image = (
-                self.get_page_image()
-            )  # make new image to avoid drawing on the saved ones
-            draw = ImageDraw.Draw(image)
-            for c in cells:
-                x0, y0, x1, y1 = c.rect.to_bounding_box().as_tuple()
-                cell_color = (
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                    random.randint(30, 140),
-                )
-                draw.rectangle([(x0, y0), (x1, y1)], outline=cell_color)
-            image.show()
-
-        # before merge:
-        # draw_clusters_and_cells()
-
-        cells = merge_horizontal_cells(cells)
-
-        # after merge:
-        # draw_clusters_and_cells()
-
-        return cells
+        return merge_horizontal_cells(cells)
+
+    def get_bitmap_rects(self, scale: float = 1) -> Iterable[BoundingBox]:
+        AREA_THRESHOLD = 0  # 32 * 32
+        page_size = self.get_size()
+        with pypdfium2_lock:
+            for obj in self._ppage.get_objects(filter=[pdfium_c.FPDF_PAGEOBJ_IMAGE]):
+                pos = obj.get_pos()
+                cropbox = BoundingBox.from_tuple(
+                    pos, origin=CoordOrigin.BOTTOMLEFT
+                ).to_top_left_origin(page_height=page_size.height)
+                if cropbox.area() > AREA_THRESHOLD:
+                    cropbox = cropbox.scaled(scale=scale)
+                    yield cropbox
+
+    def get_text_in_rect(self, bbox: BoundingBox) -> str:
+        with pypdfium2_lock:
+            if not self.text_page:
+                self.text_page = self._ppage.get_textpage()
+        if bbox.coord_origin != CoordOrigin.BOTTOMLEFT:
+            bbox = bbox.to_bottom_left_origin(self.get_size().height)
+        with pypdfium2_lock:
+            text_piece = self.text_page.get_text_bounded(*bbox.as_tuple())
+        return text_piece
+
+    def get_segmented_page(self) -> Optional[SegmentedPdfPage]:
+        if not self.valid:
+            return None
+
+        text_cells = self._compute_text_cells()
+
+        # Get the PDF page geometry from pypdfium2
+        dimension = get_pdf_page_geometry(self._ppage)
+
+        # Create SegmentedPdfPage
+        return SegmentedPdfPage(
+            dimension=dimension,
+            textline_cells=text_cells,
+            char_cells=[],
+            word_cells=[],
+            has_textlines=len(text_cells) > 0,
+            has_words=False,
+            has_chars=False,
+        )
+
+    def get_text_cells(self) -> Iterable[TextCell]:
+        return self._compute_text_cells()

    def get_page_image(
        self, scale: float = 1, cropbox: Optional[BoundingBox] = None
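A rough sketch of how the reworked accessors fit together (the backend variable name is an assumption):

# Hypothetical sketch: text cells are computed once and reused by both accessors.
seg = page_backend.get_segmented_page()  # page_backend: a valid PyPdfiumPageBackend instance (assumed)
if seg is not None:
    print(len(seg.textline_cells), seg.dimension.boundary_type)
cells = page_backend.get_text_cells()  # same cells, produced via _compute_text_cells()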

View File

@ -235,7 +235,6 @@ class Page(BaseModel):
    page_no: int
    # page_hash: Optional[str] = None
    size: Optional[Size] = None
-    cells: List[TextCell] = []
    parsed_page: Optional[SegmentedPdfPage] = None
    predictions: PagePredictions = PagePredictions()
    assembled: Optional[AssembledUnit] = None
@@ -248,12 +247,27 @@ class Page(BaseModel):
        float, Image
    ] = {}  # Cache of images in different scales. By default it is cleared during assembling.

+    @property
+    def cells(self) -> List[TextCell]:
+        """Return text cells as a read-only view of parsed_page.textline_cells."""
+        if self.parsed_page is not None:
+            return self.parsed_page.textline_cells
+        else:
+            return []

    def get_image(
-        self, scale: float = 1.0, cropbox: Optional[BoundingBox] = None
+        self,
+        scale: float = 1.0,
+        max_size: Optional[int] = None,
+        cropbox: Optional[BoundingBox] = None,
    ) -> Optional[Image]:
        if self._backend is None:
            return self._image_cache.get(scale, None)
+        if max_size:
+            assert self.size is not None
+            scale = min(scale, max_size / max(self.size.as_tuple()))
        if scale not in self._image_cache:
            if cropbox is None:
                self._image_cache[scale] = self._backend.get_page_image(scale=scale)
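A short sketch of the new Page surface (values are illustrative; assumes parsed_page was populated by the preprocessing step):

# Hypothetical sketch: cells is now a read-only view, and max_size caps the rendered image.
print(len(page.cells))                          # == len(page.parsed_page.textline_cells), or 0 if unset
img = page.get_image(scale=2.0, max_size=1024)  # scale is reduced so the longer page side stays at most ~1024 px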

View File

@ -302,7 +302,9 @@ class PdfPipelineOptions(PaginatedPipelineOptions):
        ),
    )

-    generate_parsed_pages: bool = False
+    generate_parsed_pages: Literal[True] = (
+        True  # Always True since parsed_page is now mandatory
+    )


class ProcessingPipeline(str, Enum):

View File

@ -10,6 +10,8 @@ from docling.datamodel.accelerator_options import AcceleratorDevice
class BaseVlmOptions(BaseModel):
    kind: str
    prompt: str
+    scale: float = 2.0
+    max_size: Optional[int] = None


class ResponseFormat(str, Enum):
@@ -49,8 +51,6 @@ class InlineVlmOptions(BaseVlmOptions):
        AcceleratorDevice.MPS,
    ]

-    scale: float = 2.0
    temperature: float = 0.0
    stop_strings: List[str] = []
    extra_generation_config: Dict[str, Any] = {}
@@ -76,7 +76,6 @@ class ApiVlmOptions(BaseVlmOptions):
    )  # Default to ollama
    headers: Dict[str, str] = {}
    params: Dict[str, Any] = {}
-    scale: float = 2.0
    timeout: float = 60
    concurrency: int = 1
    response_format: ResponseFormat
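Because scale and max_size now live on BaseVlmOptions, every VLM options class accepts them; a hedged sketch with ApiVlmOptions (the field values and endpoint are assumptions):

# Hypothetical sketch: cap the image size sent to a remote VLM.
opts = ApiVlmOptions(
    url="http://localhost:11434/v1/chat/completions",  # assumed ollama-style endpoint
    prompt="Convert this page to markdown.",
    response_format=ResponseFormat.MARKDOWN,
    scale=2.0,
    max_size=1024,  # page images are downscaled so their longer side stays around 1024 px
)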

View File

@ -48,7 +48,9 @@ class ApiVlmModel(BasePageModel):
                with TimeRecorder(conv_res, "vlm"):
                    assert page.size is not None

-                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
+                    hi_res_image = page.get_image(
+                        scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+                    )
                    assert hi_res_image is not None
                    if hi_res_image:
                        if hi_res_image.mode != "RGB":

View File

@ -86,7 +86,7 @@ class BaseItemAndImageEnrichmentModel(
            coord_origin=bbox.coord_origin,
        )

-        page_ix = element_prov.page_no - 1
+        page_ix = element_prov.page_no - conv_res.pages[0].page_no - 1
        cropped_image = conv_res.pages[page_ix].get_image(
            scale=self.images_scale, cropbox=expanded_bbox
        )
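The new index arithmetic accounts for conversions that start at an arbitrary page; a small worked example, assuming Page.page_no is 0-based as elsewhere in this diff:

# Hypothetical numbers: a conversion covering document pages 5-8.
# conv_res.pages[0].page_no == 4 and element_prov.page_no == 6, so
# page_ix = 6 - 4 - 1 == 1  -> conv_res.pages[1], i.e. document page 6.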

View File

@ -7,6 +7,7 @@ from typing import List, Optional, Type
import numpy as np
from docling_core.types.doc import BoundingBox, CoordOrigin
+from docling_core.types.doc.page import TextCell
from PIL import Image, ImageDraw
from rtree import index
from scipy.ndimage import binary_dilation, find_objects, label
@@ -107,7 +108,9 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
            return []

    # Filters OCR cells by dropping any OCR cell that intersects with an existing programmatic cell.
-    def _filter_ocr_cells(self, ocr_cells, programmatic_cells):
+    def _filter_ocr_cells(
+        self, ocr_cells: List[TextCell], programmatic_cells: List[TextCell]
+    ) -> List[TextCell]:
        # Create R-tree index for programmatic cells
        p = index.Property()
        p.dimension = 2
@@ -130,19 +133,38 @@ class BaseOcrModel(BasePageModel, BaseModelWithOptions):
        ]
        return filtered_ocr_cells

-    def post_process_cells(self, ocr_cells, programmatic_cells):
-        r"""
-        Post-process the ocr and programmatic cells and return the final list of cells
-        """
-        if self.options.force_full_page_ocr:
-            # If a full page OCR is forced, use only the OCR cells
-            cells = ocr_cells
-            return cells
-
-        ## Remove OCR cells which overlap with programmatic cells.
-        filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, programmatic_cells)
-        programmatic_cells.extend(filtered_ocr_cells)
-        return programmatic_cells
+    def post_process_cells(self, ocr_cells: List[TextCell], page: Page) -> None:
+        r"""
+        Post-process the OCR cells and update the page object.
+        Updates parsed_page.textline_cells directly since page.cells is now read-only.
+        """
+        # Get existing cells from the read-only property
+        existing_cells = page.cells
+
+        # Combine existing and OCR cells with overlap filtering
+        final_cells = self._combine_cells(existing_cells, ocr_cells)
+
+        assert page.parsed_page is not None
+
+        # Update parsed_page.textline_cells directly
+        page.parsed_page.textline_cells = final_cells
+        page.parsed_page.has_lines = len(final_cells) > 0
+
+    def _combine_cells(
+        self, existing_cells: List[TextCell], ocr_cells: List[TextCell]
+    ) -> List[TextCell]:
+        """Combine existing and OCR cells with filtering and re-indexing."""
+        if self.options.force_full_page_ocr:
+            combined = ocr_cells
+        else:
+            filtered_ocr_cells = self._filter_ocr_cells(ocr_cells, existing_cells)
+            combined = list(existing_cells) + filtered_ocr_cells
+        # Re-index in-place
+        for i, cell in enumerate(combined):
+            cell.index = i
+        return combined

    def draw_ocr_rects_and_cells(self, conv_res, page, ocr_rects, show: bool = False):
        image = copy.deepcopy(page.image)
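A hedged sketch of the call pattern the OCR models below now follow (variable names as in those models):

# Hypothetical sketch inside an OCR model's __call__:
self.post_process_cells(all_ocr_cells, page)          # writes into page.parsed_page.textline_cells
assert page.cells is page.parsed_page.textline_cells  # page.cells is only a view now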

View File

@ -177,7 +177,7 @@ class EasyOcrModel(BaseOcrModel):
                    all_ocr_cells.extend(cells)

                # Post-process the cells
-                page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+                self.post_process_cells(all_ocr_cells, page)

                # DEBUG code:
                if settings.debug.visualize_ocr:

View File

@ -176,9 +176,9 @@ class LayoutModel(BasePageModel):
                    # Apply postprocessing
                    processed_clusters, processed_cells = LayoutPostprocessor(
-                        page.cells, clusters, page.size
+                        page, clusters
                    ).postprocess()
-                    # processed_clusters, processed_cells = clusters, page.cells
+                    # Note: LayoutPostprocessor updates page.cells and page.parsed_page internally

                    with warnings.catch_warnings():
                        warnings.filterwarnings(
@@ -198,7 +198,6 @@ class LayoutModel(BasePageModel):
                            )
                        )

-                    page.cells = processed_cells
                    page.predictions.layout = LayoutPrediction(
                        clusters=processed_clusters
                    )

View File

@ -132,7 +132,7 @@ class OcrMacModel(BaseOcrModel):
                    all_ocr_cells.extend(cells)

                # Post-process the cells
-                page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+                self.post_process_cells(all_ocr_cells, page)

                # DEBUG code:
                if settings.debug.visualize_ocr:

View File

@ -2,7 +2,7 @@ import re
import warnings
from collections.abc import Iterable
from pathlib import Path
-from typing import Optional
+from typing import Literal, Optional

import numpy as np
from PIL import ImageDraw
@@ -17,7 +17,6 @@ from docling.utils.profiling import TimeRecorder
class PagePreprocessingOptions(BaseModel):
    images_scale: Optional[float]
-    create_parsed_page: bool


class PagePreprocessingModel(BasePageModel):
@@ -66,10 +65,8 @@ class PagePreprocessingModel(BasePageModel):
    def _parse_page_cells(self, conv_res: ConversionResult, page: Page) -> Page:
        assert page._backend is not None

-        page.cells = list(page._backend.get_text_cells())
-
-        if self.options.create_parsed_page:
-            page.parsed_page = page._backend.get_segmented_page()
+        page.parsed_page = page._backend.get_segmented_page()
+        assert page.parsed_page is not None

        # Rate the text quality from the PDF parser, and aggregate on page
        text_scores = []

View File

@ -134,7 +134,7 @@ class RapidOcrModel(BaseOcrModel):
                    all_ocr_cells.extend(cells)

                # Post-process the cells
-                page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+                self.post_process_cells(all_ocr_cells, page)

                # DEBUG code:
                if settings.debug.visualize_ocr:

View File

@ -124,7 +124,7 @@ class ReadingOrderModel:
            page_no = page.page_no + 1
            size = page.size

-            assert size is not None
+            assert size is not None, "Page size is not initialized."
            out_doc.add_page(page_no=page_no, size=size)

View File

@ -306,7 +306,7 @@ class TesseractOcrCliModel(BaseOcrModel):
                        all_ocr_cells.append(cell)

                # Post-process the cells
-                page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+                self.post_process_cells(all_ocr_cells, page)

                # DEBUG code:
                if settings.debug.visualize_ocr:

View File

@ -235,7 +235,7 @@ class TesseractOcrModel(BaseOcrModel):
                    all_ocr_cells.extend(cells)

                # Post-process the cells
-                page.cells = self.post_process_cells(all_ocr_cells, page.cells)
+                self.post_process_cells(all_ocr_cells, page)

                # DEBUG code:
                if settings.debug.visualize_ocr:

View File

@ -123,7 +123,9 @@ class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMix
                with TimeRecorder(conv_res, "vlm"):
                    assert page.size is not None

-                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
+                    hi_res_image = page.get_image(
+                        scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+                    )

                    # Define prompt structure
                    prompt = self.formulate_prompt()

View File

@ -73,7 +73,9 @@ class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
                with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
                    assert page.size is not None

-                    hi_res_image = page.get_image(scale=self.vlm_options.scale)
+                    hi_res_image = page.get_image(
+                        scale=self.vlm_options.scale, max_size=self.vlm_options.max_size
+                    )
                    if hi_res_image is not None:
                        im_width, im_height = hi_res_image.size

View File

@ -193,6 +193,17 @@ class PaginatedPipeline(BasePipeline): # TODO this is a bad name.
                )
                raise e

+        # Filter out uninitialized pages (those with size=None) that may remain
+        # after timeout or processing failures to prevent assertion errors downstream
+        initial_page_count = len(conv_res.pages)
+        conv_res.pages = [page for page in conv_res.pages if page.size is not None]
+
+        if len(conv_res.pages) < initial_page_count:
+            _log.info(
+                f"Filtered out {initial_page_count - len(conv_res.pages)} uninitialized pages "
+                f"due to timeout or processing failures"
+            )
+
        return conv_res

    def _unload(self, conv_res: ConversionResult) -> ConversionResult:

View File

@ -72,7 +72,6 @@ class StandardPdfPipeline(PaginatedPipeline):
            PagePreprocessingModel(
                options=PagePreprocessingOptions(
                    images_scale=pipeline_options.images_scale,
-                    create_parsed_page=pipeline_options.generate_parsed_pages,
                )
            ),
            # OCR

View File

@ -8,7 +8,7 @@ from docling_core.types.doc import DocItemLabel, Size
from docling_core.types.doc.page import TextCell
from rtree import index

-from docling.datamodel.base_models import BoundingBox, Cluster
+from docling.datamodel.base_models import BoundingBox, Cluster, Page

_log = logging.getLogger(__name__)
@@ -194,11 +194,11 @@ class LayoutPostprocessor:
        DocItemLabel.TITLE: DocItemLabel.SECTION_HEADER,
    }

-    def __init__(self, cells: List[TextCell], clusters: List[Cluster], page_size: Size):
-        """Initialize processor with cells and clusters."""
-        """Initialize processor with cells and spatial indices."""
-        self.cells = cells
-        self.page_size = page_size
+    def __init__(self, page: Page, clusters: List[Cluster]) -> None:
+        """Initialize processor with page and clusters."""
+        self.cells = page.cells
+        self.page = page
+        self.page_size = page.size
        self.all_clusters = clusters
        self.regular_clusters = [
            c for c in clusters if c.label not in self.SPECIAL_TYPES
@@ -240,6 +240,10 @@ class LayoutPostprocessor:
            for child in cluster.children:
                child.cells = self._sort_cells(child.cells)

+        assert self.page.parsed_page is not None
+        self.page.parsed_page.textline_cells = self.cells
+        self.page.parsed_page.has_lines = len(self.cells) > 0
+
        return final_clusters, self.cells

    def _process_regular_clusters(self) -> List[Cluster]:
@@ -301,6 +305,7 @@ class LayoutPostprocessor:
        special_clusters = self._handle_cross_type_overlaps(special_clusters)

        # Calculate page area from known page size
+        assert self.page_size is not None
        page_area = self.page_size.width * self.page_size.height
        if page_area > 0:
            # Filter out full-page pictures
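A sketch of the updated call site, matching the layout model change above (variable names assumed):

# Hypothetical sketch: the postprocessor now receives the Page and writes results back itself.
processed_clusters, processed_cells = LayoutPostprocessor(page, clusters).postprocess()
# page.parsed_page.textline_cells == processed_cells afterwards; no page.cells assignment is needed.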

View File

@ -121,14 +121,15 @@ def export_documents(
def main():
    logging.basicConfig(level=logging.INFO)

+    data_folder = Path(__file__).parent / "../../tests/data"
    input_doc_paths = [
-        Path("./tests/data/pdf/2206.01062.pdf"),
-        Path("./tests/data/pdf/2203.01017v2.pdf"),
-        Path("./tests/data/pdf/2305.03393v1.pdf"),
-        Path("./tests/data/pdf/redp5110_sampled.pdf"),
+        data_folder / "pdf/2206.01062.pdf",
+        data_folder / "pdf/2203.01017v2.pdf",
+        data_folder / "pdf/2305.03393v1.pdf",
+        data_folder / "pdf/redp5110_sampled.pdf",
    ]

-    # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
+    # buf = BytesIO((data_folder / "pdf/2206.01062.pdf").open("rb").read())
    # docs = [DocumentStream(name="my_doc.pdf", stream=buf)]
    # input = DocumentConversionInput.from_streams(docs)

View File

@ -16,7 +16,8 @@ _log = logging.getLogger(__name__)
def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"

    ###########################################################################

View File

@ -71,7 +71,8 @@ class ExampleFormulaUnderstandingPipeline(StandardPdfPipeline):
def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2203.01017v2.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2203.01017v2.pdf"

    pipeline_options = ExampleFormulaUnderstandingPipelineOptions()
    pipeline_options.do_formula_understanding = True

View File

@ -76,7 +76,8 @@ class ExamplePictureClassifierPipeline(StandardPdfPipeline):
def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"

    pipeline_options = ExamplePictureClassifierPipelineOptions()
    pipeline_options.images_scale = 2.0

View File

@ -16,7 +16,8 @@ IMAGE_RESOLUTION_SCALE = 2.0
def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter

View File

@ -19,7 +19,8 @@ IMAGE_RESOLUTION_SCALE = 2.0
def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter

View File

@ -12,7 +12,8 @@ _log = logging.getLogger(__name__)
def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"
    output_dir = Path("scratch")

    doc_converter = DocumentConverter()

View File

@ -9,7 +9,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
def main():
-    input_doc = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"

    pipeline_options = PdfPipelineOptions()
    pipeline_options.do_ocr = True
@@ -32,7 +33,7 @@ def main():
        }
    )

-    doc = converter.convert(input_doc).document
+    doc = converter.convert(input_doc_path).document
    md = doc.export_to_markdown()
    print(md)

View File

@ -96,7 +96,8 @@ def watsonx_vlm_options():
def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"

    pipeline_options = PdfPipelineOptions(
        enable_remote_services=True  # <-- this is required!

View File

@ -10,7 +10,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
def main():
-    input_doc = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"

    # Explicitly set the accelerator
    # accelerator_options = AcceleratorOptions(
@@ -47,7 +48,7 @@ def main():
    settings.debug.profile_pipeline_timings = True

    # Convert the document
-    conversion_result = converter.convert(input_doc)
+    conversion_result = converter.convert(input_doc_path)
    doc = conversion_result.document

    # List with total time per document

View File

@ -9,7 +9,8 @@ from docling.document_converter import DocumentConverter, PdfFormatOption
def main():
-    input_doc = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"

    # Set lang=["auto"] with a tesseract OCR engine: TesseractOcrOptions, TesseractCliOcrOptions
    # ocr_options = TesseractOcrOptions(lang=["auto"])
@@ -27,7 +28,7 @@ def main():
        }
    )

-    doc = converter.convert(input_doc).document
+    doc = converter.convert(input_doc_path).document
    md = doc.export_to_markdown()
    print(md)

View File

@ -30,7 +30,8 @@ def translate(text: str, src: str = "en", dest: str = "de"):
def main():
    logging.basicConfig(level=logging.INFO)

-    input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2206.01062.pdf"
    output_dir = Path("scratch")

    # Important: For operating with page images, we must keep them, otherwise the DocumentConverter

View File

@ -95,8 +95,8 @@ def watsonx_vlm_options(model: str, prompt: str):
def main():
    logging.basicConfig(level=logging.INFO)

-    # input_doc_path = Path("./tests/data/pdf/2206.01062.pdf")
-    input_doc_path = Path("./tests/data/pdf/2305.03393v1-pg9.pdf")
+    data_folder = Path(__file__).parent / "../../tests/data"
+    input_doc_path = data_folder / "pdf/2305.03393v1-pg9.pdf"

    pipeline_options = VlmPipelineOptions(
        enable_remote_services=True  # <-- this is required!

View File

@ -1,6 +1,6 @@
[project]
name = "docling"
-version = "2.36.1"  # DO NOT EDIT, updated automatically
+version = "2.37.0"  # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
license = "MIT"
keywords = [

29
tests/data/asciidoc/test_03.asciidoc vendored Normal file
View File

@ -0,0 +1,29 @@
:_mod-docs-content-type: PROCEDURE
:experimental:
[id="renaming-a-bookmark_{context}"]
= Renaming a bookmark
You can rename a bookmark to distinguish it from other bookmarks. If you have bookmarks to several folders that all share the same name, you can tell the bookmarks apart if you rename them.
Renaming the bookmark does not rename the folder.
.Procedure
. Right-click the bookmark in the side bar.
. Select *Rename…*.
+
image::rename-bookmark-menu.png[Rename bookmark menu]
. In the *Name* field, enter the new name for the bookmark.
+
image::rename-bookmark-text.png[Bookmark name field]
. Click btn:[Rename].
.Verification
* Check that the side bar lists the bookmark under the new name.
+
image::renamed-bookmark.png[Renamed bookmark]

BIN
tests/data/docx/word_image_anchors.docx vendored Normal file

Binary file not shown.

View File

@ -0,0 +1,20 @@
# Contribution guideline example
This is simple.
Foo *emphasis* **strong emphasis** ***both*** .
Create your feature branch: `git checkout -b feature/AmazingFeature` .
1. Pull the [**repository**](https://github.com/docling-project/docling) .
2. Create your feature branch ( `git checkout -b feature/AmazingFeature` )
3. Commit your changes ( `git commit -m 'Add some AmazingFeature'` )
4. Push to the branch ( `git push origin feature/AmazingFeature` )
5. Open a Pull Request
##
*Second* section
- **First** : Lorem ipsum.
- **Second** : Dolor `sit` amet.

View File

@ -0,0 +1,565 @@
body:
children:
- $ref: '#/texts/0'
- $ref: '#/texts/1'
- $ref: '#/groups/0'
- $ref: '#/groups/1'
- $ref: '#/groups/2'
- $ref: '#/texts/27'
- $ref: '#/groups/8'
content_layer: body
label: unspecified
name: _root_
self_ref: '#/body'
form_items: []
furniture:
children: []
content_layer: furniture
label: unspecified
name: _root_
self_ref: '#/furniture'
groups:
- children:
- $ref: '#/texts/2'
- $ref: '#/texts/3'
- $ref: '#/texts/4'
- $ref: '#/texts/5'
- $ref: '#/texts/6'
content_layer: body
label: inline
name: group
parent:
$ref: '#/body'
self_ref: '#/groups/0'
- children:
- $ref: '#/texts/7'
- $ref: '#/texts/8'
- $ref: '#/texts/9'
content_layer: body
label: inline
name: group
parent:
$ref: '#/body'
self_ref: '#/groups/1'
- children:
- $ref: '#/texts/10'
- $ref: '#/texts/14'
- $ref: '#/texts/18'
- $ref: '#/texts/22'
- $ref: '#/texts/26'
content_layer: body
label: ordered_list
name: list
parent:
$ref: '#/body'
self_ref: '#/groups/2'
- children:
- $ref: '#/texts/11'
- $ref: '#/texts/12'
- $ref: '#/texts/13'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/10'
self_ref: '#/groups/3'
- children:
- $ref: '#/texts/15'
- $ref: '#/texts/16'
- $ref: '#/texts/17'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/14'
self_ref: '#/groups/4'
- children:
- $ref: '#/texts/19'
- $ref: '#/texts/20'
- $ref: '#/texts/21'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/18'
self_ref: '#/groups/5'
- children:
- $ref: '#/texts/23'
- $ref: '#/texts/24'
- $ref: '#/texts/25'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/22'
self_ref: '#/groups/6'
- children:
- $ref: '#/texts/28'
- $ref: '#/texts/29'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/27'
self_ref: '#/groups/7'
- children:
- $ref: '#/texts/30'
- $ref: '#/texts/33'
content_layer: body
label: list
name: list
parent:
$ref: '#/body'
self_ref: '#/groups/8'
- children:
- $ref: '#/texts/31'
- $ref: '#/texts/32'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/30'
self_ref: '#/groups/9'
- children:
- $ref: '#/texts/34'
- $ref: '#/texts/35'
- $ref: '#/texts/36'
- $ref: '#/texts/37'
content_layer: body
label: inline
name: group
parent:
$ref: '#/texts/33'
self_ref: '#/groups/10'
key_value_items: []
name: inline_and_formatting
origin:
binary_hash: 9342273634728023910
filename: inline_and_formatting.md
mimetype: text/markdown
pages: {}
pictures: []
schema_name: DoclingDocument
tables: []
texts:
- children: []
content_layer: body
label: title
orig: Contribution guideline example
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/0'
text: Contribution guideline example
- children: []
content_layer: body
label: text
orig: This is simple.
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/1'
text: This is simple.
- children: []
content_layer: body
label: text
orig: Foo
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/2'
text: Foo
- children: []
content_layer: body
formatting:
bold: false
italic: true
strikethrough: false
underline: false
label: text
orig: emphasis
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/3'
text: emphasis
- children: []
content_layer: body
formatting:
bold: true
italic: false
strikethrough: false
underline: false
label: text
orig: strong emphasis
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/4'
text: strong emphasis
- children: []
content_layer: body
formatting:
bold: true
italic: true
strikethrough: false
underline: false
label: text
orig: both
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/5'
text: both
- children: []
content_layer: body
label: text
orig: .
parent:
$ref: '#/groups/0'
prov: []
self_ref: '#/texts/6'
text: .
- children: []
content_layer: body
label: text
orig: 'Create your feature branch:'
parent:
$ref: '#/groups/1'
prov: []
self_ref: '#/texts/7'
text: 'Create your feature branch:'
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: git checkout -b feature/AmazingFeature
parent:
$ref: '#/groups/1'
prov: []
references: []
self_ref: '#/texts/8'
text: git checkout -b feature/AmazingFeature
- children: []
content_layer: body
label: text
orig: .
parent:
$ref: '#/groups/1'
prov: []
self_ref: '#/texts/9'
text: .
- children:
- $ref: '#/groups/3'
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/10'
text: ''
- children: []
content_layer: body
label: text
orig: Pull the
parent:
$ref: '#/groups/3'
prov: []
self_ref: '#/texts/11'
text: Pull the
- children: []
content_layer: body
formatting:
bold: true
italic: false
strikethrough: false
underline: false
hyperlink: https://github.com/docling-project/docling
label: text
orig: repository
parent:
$ref: '#/groups/3'
prov: []
self_ref: '#/texts/12'
text: repository
- children: []
content_layer: body
label: text
orig: .
parent:
$ref: '#/groups/3'
prov: []
self_ref: '#/texts/13'
text: .
- children:
- $ref: '#/groups/4'
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/14'
text: ''
- children: []
content_layer: body
label: text
orig: Create your feature branch (
parent:
$ref: '#/groups/4'
prov: []
self_ref: '#/texts/15'
text: Create your feature branch (
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: git checkout -b feature/AmazingFeature
parent:
$ref: '#/groups/4'
prov: []
references: []
self_ref: '#/texts/16'
text: git checkout -b feature/AmazingFeature
- children: []
content_layer: body
label: text
orig: )
parent:
$ref: '#/groups/4'
prov: []
self_ref: '#/texts/17'
text: )
- children:
- $ref: '#/groups/5'
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/18'
text: ''
- children: []
content_layer: body
label: text
orig: Commit your changes (
parent:
$ref: '#/groups/5'
prov: []
self_ref: '#/texts/19'
text: Commit your changes (
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: git commit -m 'Add some AmazingFeature'
parent:
$ref: '#/groups/5'
prov: []
references: []
self_ref: '#/texts/20'
text: git commit -m 'Add some AmazingFeature'
- children: []
content_layer: body
label: text
orig: )
parent:
$ref: '#/groups/5'
prov: []
self_ref: '#/texts/21'
text: )
- children:
- $ref: '#/groups/6'
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/22'
text: ''
- children: []
content_layer: body
label: text
orig: Push to the branch (
parent:
$ref: '#/groups/6'
prov: []
self_ref: '#/texts/23'
text: Push to the branch (
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: git push origin feature/AmazingFeature
parent:
$ref: '#/groups/6'
prov: []
references: []
self_ref: '#/texts/24'
text: git push origin feature/AmazingFeature
- children: []
content_layer: body
label: text
orig: )
parent:
$ref: '#/groups/6'
prov: []
self_ref: '#/texts/25'
text: )
- children: []
content_layer: body
enumerated: true
label: list_item
marker: '-'
orig: Open a Pull Request
parent:
$ref: '#/groups/2'
prov: []
self_ref: '#/texts/26'
text: Open a Pull Request
- children:
- $ref: '#/groups/7'
content_layer: body
label: section_header
level: 1
orig: ''
parent:
$ref: '#/body'
prov: []
self_ref: '#/texts/27'
text: ''
- children: []
content_layer: body
formatting:
bold: false
italic: true
strikethrough: false
underline: false
label: text
orig: Second
parent:
$ref: '#/groups/7'
prov: []
self_ref: '#/texts/28'
text: Second
- children: []
content_layer: body
label: text
orig: section
parent:
$ref: '#/groups/7'
prov: []
self_ref: '#/texts/29'
text: section
- children:
- $ref: '#/groups/9'
content_layer: body
enumerated: false
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/8'
prov: []
self_ref: '#/texts/30'
text: ''
- children: []
content_layer: body
formatting:
bold: true
italic: false
strikethrough: false
underline: false
label: text
orig: First
parent:
$ref: '#/groups/9'
prov: []
self_ref: '#/texts/31'
text: First
- children: []
content_layer: body
label: text
orig: ': Lorem ipsum.'
parent:
$ref: '#/groups/9'
prov: []
self_ref: '#/texts/32'
text: ': Lorem ipsum.'
- children:
- $ref: '#/groups/10'
content_layer: body
enumerated: false
label: list_item
marker: '-'
orig: ''
parent:
$ref: '#/groups/8'
prov: []
self_ref: '#/texts/33'
text: ''
- children: []
content_layer: body
formatting:
bold: true
italic: false
strikethrough: false
underline: false
label: text
orig: Second
parent:
$ref: '#/groups/10'
prov: []
self_ref: '#/texts/34'
text: Second
- children: []
content_layer: body
label: text
orig: ': Dolor'
parent:
$ref: '#/groups/10'
prov: []
self_ref: '#/texts/35'
text: ': Dolor'
- captions: []
children: []
code_language: unknown
content_layer: body
footnotes: []
label: code
orig: sit
parent:
$ref: '#/groups/10'
prov: []
references: []
self_ref: '#/texts/36'
text: sit
- children: []
content_layer: body
label: text
orig: amet.
parent:
$ref: '#/groups/10'
prov: []
self_ref: '#/texts/37'
text: amet.
version: 1.3.0

View File

@ -0,0 +1,3 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: chapter: group slide-0
item-2 at level 2: title: X-Library The fully customisable ... llection exclusively for our customers

View File

@ -0,0 +1,86 @@
{
"schema_name": "DoclingDocument",
"version": "1.3.0",
"name": "powerpoint_bad_text",
"origin": {
"mimetype": "application/vnd.ms-powerpoint",
"binary_hash": 1443005848482130016,
"filename": "powerpoint_bad_text.pptx"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/groups/0"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/0"
}
],
"content_layer": "body",
"name": "slide-0",
"label": "chapter"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "title",
"prov": [
{
"page_no": 1,
"bbox": {
"l": 1041400.0,
"t": 4582390.0,
"r": 8083550.0,
"b": 1689099.0,
"coord_origin": "BOTTOMLEFT"
},
"charspan": [
0,
118
]
}
],
"orig": "X-Library The fully customisable and copyright-free standard content template collection exclusively for our customers",
"text": "X-Library The fully customisable and copyright-free standard content template collection exclusively for our customers"
}
],
"pictures": [],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {
"1": {
"size": {
"width": 12190413.0,
"height": 6858000.0
},
"page_no": 1
}
}
}

View File

@ -0,0 +1 @@
# X-Library The fully customisable and copyright-free standard content template collection exclusively for our customers

View File

@ -0,0 +1,23 @@
:\_mod-docs-content-type: PROCEDURE :experimental:
# Renaming a bookmark
[id="renaming-a-bookmark\_{context}"]
You can rename a bookmark to distinguish it from other bookmarks. If you have bookmarks to several folders that all share the same name, you can tell the bookmarks apart if you rename them.
Renaming the bookmark does not rename the folder.
- Check that the side bar lists the bookmark under the new name.
Procedure . Right-click the bookmark in the side bar. . Select *Rename…*. +
<!-- image -->
In the *Name* field, enter the new name for the bookmark. +
<!-- image -->
Click btn:[Rename]. .Verification
<!-- image -->

View File

@ -17,14 +17,16 @@ item-0 at level 0: unspecified: group _root_
item-16 at level 2: list_item: Italic bullet 1
item-17 at level 2: list_item: Bold bullet 2
item-18 at level 2: list_item: Underline bullet 3
-item-19 at level 2: inline: group group
-item-20 at level 3: list_item: Some
-item-21 at level 3: list_item: italic
-item-22 at level 3: list_item: bold
-item-23 at level 3: list_item: underline
-item-24 at level 2: list: group list
-item-25 at level 3: inline: group group
-item-26 at level 4: list_item: Nested
-item-27 at level 4: list_item: italic
-item-28 at level 4: list_item: bold
-item-29 at level 1: paragraph:
+item-19 at level 2: list_item:
+item-20 at level 3: inline: group group
+item-21 at level 4: text: Some
+item-22 at level 4: text: italic
+item-23 at level 4: text: bold
+item-24 at level 4: text: underline
+item-25 at level 2: list: group list
+item-26 at level 3: list_item:
+item-27 at level 4: inline: group group
+item-28 at level 5: text: Nested
+item-29 at level 5: text: italic
+item-30 at level 5: text: bold
+item-31 at level 1: paragraph:

View File

@ -42,7 +42,7 @@
"$ref": "#/groups/1" "$ref": "#/groups/1"
}, },
{ {
"$ref": "#/texts/23" "$ref": "#/texts/25"
} }
], ],
"content_layer": "body", "content_layer": "body",
@ -98,7 +98,7 @@
"$ref": "#/texts/15" "$ref": "#/texts/15"
}, },
{ {
"$ref": "#/groups/2" "$ref": "#/texts/16"
}, },
{ {
"$ref": "#/groups/3" "$ref": "#/groups/3"
@ -111,12 +111,9 @@
{ {
"self_ref": "#/groups/2", "self_ref": "#/groups/2",
"parent": { "parent": {
"$ref": "#/groups/1" "$ref": "#/texts/16"
}, },
"children": [ "children": [
{
"$ref": "#/texts/16"
},
{ {
"$ref": "#/texts/17" "$ref": "#/texts/17"
}, },
@ -125,6 +122,9 @@
}, },
{ {
"$ref": "#/texts/19" "$ref": "#/texts/19"
},
{
"$ref": "#/texts/20"
} }
], ],
"content_layer": "body", "content_layer": "body",
@ -138,7 +138,7 @@
}, },
"children": [ "children": [
{ {
"$ref": "#/groups/4" "$ref": "#/texts/21"
} }
], ],
"content_layer": "body", "content_layer": "body",
@ -148,17 +148,17 @@
{ {
"self_ref": "#/groups/4", "self_ref": "#/groups/4",
"parent": { "parent": {
"$ref": "#/groups/3" "$ref": "#/texts/21"
}, },
"children": [ "children": [
{
"$ref": "#/texts/20"
},
{
"$ref": "#/texts/21"
},
{ {
"$ref": "#/texts/22" "$ref": "#/texts/22"
},
{
"$ref": "#/texts/23"
},
{
"$ref": "#/texts/24"
} }
], ],
"content_layer": "body", "content_layer": "body",
@ -461,20 +461,18 @@
{ {
"self_ref": "#/texts/16", "self_ref": "#/texts/16",
"parent": { "parent": {
"$ref": "#/groups/2" "$ref": "#/groups/1"
}, },
"children": [], "children": [
{
"$ref": "#/groups/2"
}
],
"content_layer": "body", "content_layer": "body",
"label": "list_item", "label": "list_item",
"prov": [], "prov": [],
"orig": "Some", "orig": "",
"text": "Some", "text": "",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
},
"enumerated": false, "enumerated": false,
"marker": "-" "marker": "-"
}, },
@ -485,18 +483,16 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "list_item", "label": "text",
"prov": [], "prov": [],
"orig": "italic", "orig": "Some",
"text": "italic", "text": "Some",
"formatting": { "formatting": {
"bold": false, "bold": false,
"italic": true, "italic": false,
"underline": false, "underline": false,
"strikethrough": false "strikethrough": false
}, }
"enumerated": false,
"marker": "-"
}, },
{ {
"self_ref": "#/texts/18", "self_ref": "#/texts/18",
@ -505,67 +501,7 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "list_item", "label": "text",
"prov": [],
"orig": "bold",
"text": "bold",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/19",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "underline",
"text": "underline",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/20",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "Nested",
"text": "Nested",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
},
"enumerated": false,
"marker": "-"
},
{
"self_ref": "#/texts/21",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "list_item",
"prov": [], "prov": [],
"orig": "italic", "orig": "italic",
"text": "italic", "text": "italic",
@ -574,7 +510,59 @@
"italic": true, "italic": true,
"underline": false, "underline": false,
"strikethrough": false "strikethrough": false
}
},
{
"self_ref": "#/texts/19",
"parent": {
"$ref": "#/groups/2"
}, },
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "bold",
"text": "bold",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/20",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "underline",
"text": "underline",
"formatting": {
"bold": false,
"italic": false,
"underline": true,
"strikethrough": false
}
},
{
"self_ref": "#/texts/21",
"parent": {
"$ref": "#/groups/3"
},
"children": [
{
"$ref": "#/groups/4"
}
],
"content_layer": "body",
"label": "list_item",
"prov": [],
"orig": "",
"text": "",
"enumerated": false, "enumerated": false,
"marker": "-" "marker": "-"
}, },
@ -585,7 +573,43 @@
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
"label": "list_item", "label": "text",
"prov": [],
"orig": "Nested",
"text": "Nested",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/23",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "italic",
"text": "italic",
"formatting": {
"bold": false,
"italic": true,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/24",
"parent": {
"$ref": "#/groups/4"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [], "prov": [],
"orig": "bold", "orig": "bold",
"text": "bold", "text": "bold",
@ -594,12 +618,10 @@
"italic": false, "italic": false,
"underline": false, "underline": false,
"strikethrough": false "strikethrough": false
}, }
"enumerated": false,
"marker": "-"
}, },
{ {
"self_ref": "#/texts/23", "self_ref": "#/texts/25",
"parent": { "parent": {
"$ref": "#/body" "$ref": "#/body"
}, },

View File

@ -0,0 +1,16 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: paragraph: Transcript
item-2 at level 1: paragraph: February 20, 2025, 8:32PM
item-3 at level 1: picture
item-4 at level 1: inline: group group
item-5 at level 2: paragraph: This is test 1
item-6 at level 2: paragraph: 0:08
Correct, he is not.
item-7 at level 1: paragraph:
item-8 at level 1: picture
item-9 at level 1: inline: group group
item-10 at level 2: paragraph: This is test 2
item-11 at level 2: paragraph: 0:16
Yeah, exactly.
item-12 at level 1: paragraph:
item-13 at level 1: paragraph:

View File

@ -0,0 +1,286 @@
{
"schema_name": "DoclingDocument",
"version": "1.3.0",
"name": "word_image_anchors",
"origin": {
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"binary_hash": 2428692234257307633,
"filename": "word_image_anchors.docx"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/1"
},
{
"$ref": "#/pictures/0"
},
{
"$ref": "#/groups/0"
},
{
"$ref": "#/texts/4"
},
{
"$ref": "#/pictures/1"
},
{
"$ref": "#/groups/1"
},
{
"$ref": "#/texts/7"
},
{
"$ref": "#/texts/8"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/2"
},
{
"$ref": "#/texts/3"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/5"
},
{
"$ref": "#/texts/6"
}
],
"content_layer": "body",
"name": "group",
"label": "inline"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "Transcript",
"text": "Transcript",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "February 20, 2025, 8:32PM",
"text": "February 20, 2025, 8:32PM",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "This is test 1",
"text": "This is test 1",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "0:08\nCorrect, he is not.",
"text": "0:08\nCorrect, he is not.",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "This is test 2",
"text": "This is test 2",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/6",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "0:16\nYeah, exactly.",
"text": "0:16\nYeah, exactly.",
"formatting": {
"bold": false,
"italic": false,
"underline": false,
"strikethrough": false
}
},
{
"self_ref": "#/texts/7",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
},
{
"self_ref": "#/texts/8",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "paragraph",
"prov": [],
"orig": "",
"text": ""
}
],
"pictures": [
{
"self_ref": "#/pictures/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "picture",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"image": {
"mimetype": "image/png",
"dpi": 72,
"size": {
"width": 100.0,
"height": 100.0
},
"uri": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAYAAABw4pVUAAAAz0lEQVR4nO3bUW0CURRF0TukQvDSauBr0mACE1VBAzYQg5Lpdw0wO2EtA+cl+/6+GQAAAAAAAAAAAADe1DIR53X9mcNcdhnf5nm93Y8T8DElyzyuv/evlx/CMqeJOOz9AP4TJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiBIkRJEaQGEFiWp8+t/k8f6/bDrvPl28CAAAAAAAAAAAAAAAAzLv5A5bTEG2TIIlOAAAAAElFTkSuQmCC"
},
"annotations": []
},
{
"self_ref": "#/pictures/1",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "picture",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"image": {
"mimetype": "image/png",
"dpi": 72,
"size": {
"width": 100.0,
"height": 100.0
},
"uri": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAGQAAABkCAYAAABw4pVUAAAJIElEQVR4nO2dbWxb1RnH/8+1c5O4bITEwJrRF1ZAI6gtL9oK29oxihAdTQOVoGhbKyS0MDWZJk1CQ+q0aR/4xLYvJNGabdK07MukrSUNaxEvg7aUlteuLUoHrUTbseylSRSgpLGd3Ac9596kSWzHvva1fXzv/UmW4jaxj5+/z73nPOec50/QnM3t5xdbUWOlZeBGgK8jNpYC3AxQHOAGEMXAXKN+mSgF5nGAxgAeBmiIyToH0GnDwklj0jqxq/fK/0BjCJrR2jn8ZcPCXSBaC9DtAC/39h3oDMBHwHzQMvD3ga74P6ERWgjS1jG8BjAeALgVQEuZ334QoAHA2t3fHX8dQRWktX0obpi1jzDjewSshgYwcIwIf7KSiT8M9DYPB0KQts7RlWDuANCuSw/NAAPoBVF3f1fjCZQRKq8QeBzgragqqA+Ep8olDJXj0kSm+XNi6kQVw8RdnEz+otSXspIK0rZ9eDuIngTQAH8wBuYd/T3xnqoSRIauERi/ZuYN8CFEtG8K1o9LMWT2XJBN20e+TwZ1gdmEnyFKssWde3qafuvpy3r5Ym0dI78B8BiCxc7+7qYfaCXIxvbRpZEa7gOwDsHkwFSKtj7b23iu4oLYs2z6M4BlCDZnAd5S7Gy/KEHu3z5yDxN2AVhUzOv4iE+JsfmZnqbnyy7Iph+O3kcWD2g8264UzAa17nm68W+F/DEV0TOeC8XIChPj3kJ6ChV4z3gpvEzl5FOA17u9pxhuR1PODTwUIzeLJFZ2zEokiDO0Dfpoyg3LnJh5L4gz6QvqPKMY1jmx804QSYcEcAbuJY85MSz+pq7WuGEc831uqtQQJS1Yq3MlJHP2EMnahmJ4ALOpYpkDI9d6hl9T6JVAYqnWiAq5ZKlNCDW1p3y0uKQLY1YqcX22lcesPUSWXUMxSkKDE9v8e4izM+R4adoToiBalWnjROYeonaHhJSULDFO6yFh76hsL0nvIfYmtpBykCHWlGFk9X8d0uqrbqjBj7YtQlODq3QbLAtIphgffcL44N+TeO1oEgfeSkJT2Eolrpo94orO/l/ZawuuvBjFYBhAXS2px9VxE2tWmdjWZmHvgQnsemECmkEq5sAvp/9hztdPNj7DZxAB8SsMfLc1hscfvQz1dXp93+bH3Ji98KTLLvRSEDGAO1abaH8wBp2QmNuLfmk9RM5n+BvDAL6y0sTa23RLzV2K/ax7iDosUzUcOprEmydS6udoBFixJIovLYng2msiMGuyX5YW1RNuXBHFwbd1utGr2D8xI4ik2MFlP7lUFBcnGK+8kZh5/uJh+2e5ibc/FMs6OpN7yjVXR6AZLaKBpOZVq9WZPp/w+vEkXjqcwOQUqoppDeyvkTpg6R+GzltIpeQQVBXhaOD0azntGgyYgQ//p2P3sTUw5By490ePK8u1X4zANDPf2D+6YOGtd+3BgF7wctHCkEP58BG3ttTgG7eZat6RKa0iYrwzqKMggGgRlQoJVGWX20xcvyyKDetqcfsqE7F6yiiGCPH7v45DV0SLqJSr0CCX6Jq776hVj3yQZKMMi/v2XFTDZX3h66JO7RD4kQvjjKMnU3j2lQm898EkdEe0kB7SDJ9yWYzwtVtMLGuOqEnkvoMJ3XtIs2FX1fEvEQNYujiCrZti+NVPPq9m8vpCcUOVOAoAREDzlRGVVpGRmJ5wQ9SpN4Vq49Cs5KJQZwI3LJcEYxRLFkcyDnsFyXFta4vh/OgF/Ou/mk0QiWLRmeJfVcbFeclF4blX7ecy+vrOxno0Xp5ZlSVfiOBba2rxx37NhsDMNe4WrKuEFw8nsHd/Qg13MxGJAC0r5qxea4OhyuL5kGPvpTD2cfZLsfQemUxqBVHKsGsU+o9TZyeRWCDjK72kvlazCTHzuAx7x+BDWlZEEVtgQ8PUFHAxodtghsZk2FuRUnal5tabanD557LfIkUM6UV6wcPSQ4bgMzbeWYcNa+vUWnsmZJR/bkizIa+ChqJS15ZYs2tpHsj+qju/eim5KMGXeYg8FpqHCOMTjBOn9BvLiBZRKTKMKuTrt5jq4RbpHYOnJ/H8oblzGD2g04ZUfEaAODs0pd+E0EG0iEr57Sl/zg/Tesbpc5P43V/G9UuZOIgWhl0LXcpv+5ePLzAGXp7Az57+RON1ETojWjhTVT4CwDcbHZIpVjfuMx9O4cjxJPa/mdR8HWRGA2crKfNBED0MjTj+fgqP/tSXc9bMiAbT+7LEJaDS7Qk6lqOBEsQp9zBY6UYFmMHpkhuzhldi2RBSGS7FfpYg1u4KtSYEl2I/J2eyqWPkH34+RaUj4lmyp7vp5unnc2aEYmZSkVYFGJoX8zmCiLOMY2YSUh7YiXlmQZzz0r1lakwI0Du/KlB6Eouou5wtCjSUHus0QezaG+SqkmZIIVBf/tWACE8V9B4h+ZMlxhkFEeXEc8nFy4e4QGKbzWQs60KIGGApz6UQrxlzYgtXgqi7P/MOz5sTdJh3LOT0lnN3w/2do3vDyqTemYk909X47YV+J+farbiRSRFgj9oUXIiSKpY5yCmIpIXFjcyzhgUUtrgzH5u9vHY3ONZwOz1pWTDZma+9nqsdcm0dI/tDhwTXHOjvbvpmvr/sav+PWMPZbmQheXLWiRlKIojt08dbbDufkDwsj7a49TZ0vUNOPJXEGi5M0+c0BdtciKdhQVsWxX1MrOFCUbLb5hXqZVjwHlLx6RNruPDylWYseW+hHoZCUZt67W8Brw9v9DPWq+uLcfkUQnNiv5kTzya07y4eT88hSMOY0R6I3BdRUj6rl2IInh8MkRSBuJFJZhM+hYj2yWfMNx3i6rVRQpQBFtGTPrJOGpP1jP6eeE+p3qCkR6ek4WKA5YflYCbuks9SSjGEsh2/tZ17xOaHXeV2Kg/1yYaEbGvgnr8byoxjqSTOMu06GMdkQTIQvbJvqlxCTFOxgCg3H7P2EfHP0GWDNwPHZK+tbO9caN27lGjxDbX9M8SyQbkElNsUYNA+n2HtLiQZ6EtB0syQLdxl10KX8tteV92WE8d8RM70yTGyfJZVAy0I5iHlt6XisxQZlrq2TlnbZrt4Jzc4JQrtqnhS+0uVm5IKR1JUh4akXIWqkGDhpJwDt4+B68tnvr6L5zB8YjIAAAAASUVORK5CYII="
},
"annotations": []
}
],
"tables": [],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@ -0,0 +1,13 @@
**Transcript**
February 20, 2025, 8:32PM
<!-- image -->
**This is test 1** 0:08
Correct, he is not.
<!-- image -->
**This is test 2** 0:16
Yeah, exactly.

18
tests/data/md/inline_and_formatting.md vendored Normal file
View File

@ -0,0 +1,18 @@
# Contribution guideline example
This is simple.
Foo *emphasis* **strong emphasis** ***both***.
Create your feature branch: `git checkout -b feature/AmazingFeature`.
1. Pull the [**repository**](https://github.com/docling-project/docling).
2. Create your feature branch (`git checkout -b feature/AmazingFeature`)
3. Commit your changes (`git commit -m 'Add some AmazingFeature'`)
4. Push to the branch (`git push origin feature/AmazingFeature`)
5. Open a Pull Request
## *Second* section <!-- inline groups in headings not yet supported by serializers -->
- **First**: Lorem ipsum.
- **Second**: Dolor `sit` amet.

BIN
tests/data/pptx/powerpoint_bad_text.pptx vendored Normal file

Binary file not shown.

View File

@ -5,84 +5,159 @@
    "width": 2000.0,
    "height": 2829.0
  },
This hunk replaces the flat "cells" array and the "parsed_page": null entry with a populated "parsed_page" object. The three OCR text cells (black, text_direction "left_to_right", confidence 1.0, from_ocr true, TOPLEFT rects) move unchanged into "parsed_page.textline_cells":
  0: "Docling bundles PDF document conversion to", x 246.4065456254215 to 1691.991797818404, y 258.9040166758338 to 329.06770715202435
  1: "JSON and Markdown in an easy self contained", x 234.08627147881114 to 1696.0985042090742, y 349.4151792972422 to 419.5788697734327
  2: "package", x 242.29979922858777 to 513.3470125989277, y 439.9752910477536 to 509.8779072023336
"parsed_page" also gains "dimension" (angle 0.0, full-page rect and art/bleed/crop/media/trim bboxes of 2000.0 x 2829.0, coord_origin BOTTOMLEFT, boundary_type "crop_box"), one full-page "bitmap_resources" entry (index 0, uri null), empty "char_cells" and "word_cells", and "has_chars": false, "has_words": false, "has_lines": true, "image": null, "lines": [].
  "predictions": {
    "layout": {
      "clusters": [

View File

@ -5,84 +5,143 @@
    "width": 595.201171875,
    "height": 841.9216918945312
  },
This hunk replaces the flat "cells" array and the "parsed_page": null entry with a populated "parsed_page" object. The three OCR text cells (black, text_direction "left_to_right", confidence 1.0, from_ocr true, TOPLEFT rects) move unchanged into "parsed_page.textline_cells":
  0: "Docling bundles PDF document conversion to", x 73.34702132031646 to 503.64955224479564, y 76.99999977896756 to 97.99999977896755
  1: "JSON and Markdown in an easy self contained", x 69.6796630536824 to 504.8720051760782, y 104.00000011573796 to 124.83139494707741
  2: "package", x 71.84193505100733 to 153.088934155825, y 129.797125232046 to 152.90926970226084
"parsed_page" also gains "dimension" (angle 0.0, full-page rect and art/bleed/crop/media/trim bboxes of 595.201171875 x 841.9216918945312, coord_origin BOTTOMLEFT, boundary_type "crop_box"), empty "bitmap_resources", "char_cells" and "word_cells", and "has_chars": false, "has_words": false, "has_lines": true, "image": null, "lines": [].
  "predictions": {
    "layout": {
      "clusters": [

View File

@ -5,84 +5,143 @@
    "width": 595.201171875,
    "height": 841.9216918945312
  },
This hunk replaces the flat "cells" array and the "parsed_page": null entry with a populated "parsed_page" object. The three OCR text cells (black, text_direction "left_to_right", confidence 1.0, from_ocr true, TOPLEFT rects) move unchanged into "parsed_page.textline_cells":
  0: "Docling bundles PDF document conversion to", x 89.2388782764286 to 521.9863147998661, y 744.0929853494625 to 764.898293373551
  1: "JSON and Markdown in an easy self contained", x 89.23887497045128 to 523.208764293368, y 717.1685676116198 to 739.1977118987292
  2: "package", x 441.2561096985719 to 522.0347860494834, y 690.0429592741025 to 710.0268078458798
"parsed_page" also gains "dimension" (angle 0.0, full-page rect and art/bleed/crop/media/trim bboxes of 595.201171875 x 841.9216918945312, coord_origin BOTTOMLEFT, boundary_type "crop_box"), empty "bitmap_resources", "char_cells" and "word_cells", and "has_chars": false, "has_words": false, "has_lines": true, "image": null, "lines": [].
  "predictions": {
    "layout": {
      "clusters": [

View File

@ -5,84 +5,143 @@
    "width": 841.9216918945312,
    "height": 595.201171875
  },
This hunk replaces the flat "cells" array and the "parsed_page": null entry with a populated "parsed_page" object. The three OCR text cells (black, text_direction "left_to_right", confidence 1.0, from_ocr true, TOPLEFT rects) move unchanged into "parsed_page.textline_cells":
  0: "Docling bundles PDF document conversion to", x 744.0930045534915 to 764.8982839673505, y 73.34702001188118 to 504.87200373583954
  1: "JSON and Markdown in an easy self contained", x 717.168585936602 to 737.9738558137178, y 70.90211682372312 to 504.8720061466397
  2: "package", x 690.2441821046808 to 709.8255852011977, y 72.124570639845 to 152.80629773131633
"parsed_page" also gains "dimension" (angle 0.0, full-page rect and art/bleed/crop/media/trim bboxes of 595.201171875 x 841.9216918945312, coord_origin BOTTOMLEFT, boundary_type "crop_box"), empty "bitmap_resources", "char_cells" and "word_cells", and "has_chars": false, "has_words": false, "has_lines": true, "image": null, "lines": [].
  "predictions": {
    "layout": {
      "clusters": [

View File

@ -5,84 +5,143 @@
    "width": 841.9216918945312,
    "height": 595.201171875
  },
This hunk replaces the flat "cells" array and the "parsed_page": null entry with a populated "parsed_page" object. The three OCR text cells (black, text_direction "left_to_right", confidence 1.0, from_ocr true, TOPLEFT rects) move unchanged into "parsed_page.textline_cells":
  0: "Docling bundles PDF document conversion to", x 77.10171545548258 to 96.68315797053792, y 89.2388734673729 to 520.7638571913312
  1: "JSON and Markdown in an easy self contained", x 100.64168123325977 to 126.08064862014129, y 89.1266754140729 to 523.3236155182395
  2: "package", x 131.21306574279092 to 152.19606490864376, y 441.0071698212682 to 521.0762158417759
"parsed_page" also gains "dimension" (angle 0.0, full-page rect and art/bleed/crop/media/trim bboxes of 595.201171875 x 841.9216918945312, coord_origin BOTTOMLEFT, boundary_type "crop_box"), empty "bitmap_resources", "char_cells" and "word_cells", and "has_chars": false, "has_words": false, "has_lines": true, "image": null, "lines": [].
  "predictions": {
    "layout": {
      "clusters": [

View File

@ -5,84 +5,143 @@
    "width": 595.201171875,
    "height": 841.9216918945312
  },
This hunk replaces the flat "cells" array and the "parsed_page": null entry with a populated "parsed_page" object. The three OCR text cells (black, text_direction "left_to_right", confidence 1.0, from_ocr true, TOPLEFT rects) move unchanged into "parsed_page.textline_cells":
  0: "Docling bundles PDF document conversion to", x 73.34702132031646 to 503.64955224479564, y 76.99999977896756 to 97.99999977896755
  1: "JSON and Markdown in an easy self contained", x 69.6796630536824 to 504.8720051760782, y 104.00000011573796 to 124.83139494707741
  2: "package", x 71.84193505100733 to 153.088934155825, y 129.797125232046 to 152.90926970226084
"parsed_page" also gains "dimension" (angle 0.0, full-page rect and art/bleed/crop/media/trim bboxes of 595.201171875 x 841.9216918945312, coord_origin BOTTOMLEFT, boundary_type "crop_box"), empty "bitmap_resources", "char_cells" and "word_cells", and "has_chars": false, "has_words": false, "has_lines": true, "image": null, "lines": [].
  "predictions": {
    "layout": {
      "clusters": [

View File

@ -5,84 +5,143 @@
    "width": 595.201171875,
    "height": 841.9216918945312
  },
This hunk replaces the flat "cells" array and the "parsed_page": null entry with a populated "parsed_page" object. The three OCR text cells (black, text_direction "left_to_right", confidence 1.0, from_ocr true, TOPLEFT rects) move unchanged into "parsed_page.textline_cells":
  0: "Docling bundles PDF document conversion to", x 89.2388782764286 to 521.9863147998661, y 744.0929853494625 to 764.898293373551
  1: "JSON and Markdown in an easy self contained", x 89.23887497045128 to 523.208764293368, y 717.1685676116198 to 739.1977118987292
  2: "package", x 441.2561096985719 to 522.0347860494834, y 690.0429592741025 to 710.0268078458798
"parsed_page" also gains "dimension" (angle 0.0, full-page rect and art/bleed/crop/media/trim bboxes of 595.201171875 x 841.9216918945312, coord_origin BOTTOMLEFT, boundary_type "crop_box"), empty "bitmap_resources", "char_cells" and "word_cells", and "has_chars": false, "has_words": false, "has_lines": true, "image": null, "lines": [].
  "predictions": {
    "layout": {
      "clusters": [

View File

@ -5,84 +5,143 @@
    "width": 841.9216918945312,
    "height": 595.201171875
  },
This hunk replaces the flat "cells" array and the "parsed_page": null entry with a populated "parsed_page" object. The three OCR text cells (black, text_direction "left_to_right", confidence 1.0, from_ocr true, TOPLEFT rects) move unchanged into "parsed_page.textline_cells":
  0: "Docling bundles PDF document conversion to", x 744.0930045534915 to 764.8982839673505, y 73.34702001188118 to 504.87200373583954
  1: "JSON and Markdown in an easy self contained", x 717.168585936602 to 737.9738558137178, y 70.90211682372312 to 504.8720061466397
  2: "package", x 690.2441821046808 to 709.8255852011977, y 72.124570639845 to 152.80629773131633
"parsed_page" also gains "dimension" (angle 0.0, full-page rect and art/bleed/crop/media/trim bboxes of 595.201171875 x 841.9216918945312, coord_origin BOTTOMLEFT, boundary_type "crop_box"), empty "bitmap_resources", "char_cells" and "word_cells", and "has_chars": false, "has_words": false, "has_lines": true, "image": null, "lines": [].
  "predictions": {
    "layout": {
      "clusters": [

View File

@ -5,84 +5,143 @@
    "width": 841.9216918945312,
    "height": 595.201171875
  },
This hunk replaces the flat "cells" array and the "parsed_page": null entry with a populated "parsed_page" object. The three OCR text cells (black, text_direction "left_to_right", confidence 1.0, from_ocr true, TOPLEFT rects) move unchanged into "parsed_page.textline_cells":
  0: "Docling bundles PDF document conversion to", x 77.10171545548258 to 96.68315797053792, y 89.2388734673729 to 520.7638571913312
  1: "JSON and Markdown in an easy self contained", x 100.64168123325977 to 126.08064862014129, y 89.1266754140729 to 523.3236155182395
  2: "package", x 131.21306574279092 to 152.19606490864376, y 441.0071698212682 to 521.0762158417759
"parsed_page" also gains "dimension" (angle 0.0, full-page rect and art/bleed/crop/media/trim bboxes of 595.201171875 x 841.9216918945312, coord_origin BOTTOMLEFT, boundary_type "crop_box"), empty "bitmap_resources", "char_cells" and "word_cells", and "has_chars": false, "has_words": false, "has_lines": true, "image": null, "lines": [].
  "predictions": {
    "layout": {
      "clusters": [

View File

@ -2,7 +2,11 @@ import glob
 import os
 from pathlib import Path

-from docling.backend.asciidoc_backend import AsciiDocBackend
+from docling.backend.asciidoc_backend import (
+    DEFAULT_IMAGE_HEIGHT,
+    DEFAULT_IMAGE_WIDTH,
+    AsciiDocBackend,
+)
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import InputDocument

@ -18,6 +22,24 @@ def _get_backend(fname):
     return doc_backend


+def test_parse_picture():
+    line = (
+        "image::images/example1.png[Example Image, width=200, height=150, align=center]"
+    )
+    res = AsciiDocBackend._parse_picture(line)
+    assert res
+    assert res.get("width", 0) == "200"
+    assert res.get("height", 0) == "150"
+    assert res.get("uri", "") == "images/example1.png"
+
+    line = "image::renamed-bookmark.png[Renamed bookmark]"
+    res = AsciiDocBackend._parse_picture(line)
+    assert res
+    assert "width" not in res
+    assert "height" not in res
+    assert res.get("uri", "") == "renamed-bookmark.png"
+
+
 def test_asciidocs_examples():
     fnames = sorted(glob.glob("./tests/data/asciidoc/*.asciidoc"))
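
The new test drives AsciiDocBackend._parse_picture with AsciiDoc block image macros and expects a dict carrying uri, width and height. As a rough illustration of the kind of parsing involved (a hypothetical sketch, not the backend's actual implementation), such a line can be split with a regular expression:

```python
import re
from typing import Optional

# Hypothetical stand-in for the kind of parsing _parse_picture performs; not docling's implementation.
_IMAGE_MACRO = re.compile(r"^image::(?P<uri>\S+)\[(?P<attrs>.*)\]\s*$")


def parse_image_macro(line: str) -> Optional[dict]:
    """Return {'uri': ..., 'width': ..., 'height': ...} for an AsciiDoc block image macro, else None."""
    m = _IMAGE_MACRO.match(line.strip())
    if not m:
        return None
    result = {"uri": m.group("uri")}
    for attr in m.group("attrs").split(","):
        key, _, value = attr.strip().partition("=")
        if key in ("width", "height") and value:
            result[key] = value.strip()
    return result


print(parse_image_macro(
    "image::images/example1.png[Example Image, width=200, height=150, align=center]"
))
# {'uri': 'images/example1.png', 'width': '200', 'height': '150'}
```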

View File

@ -2,7 +2,7 @@ from pathlib import Path
 from docling.backend.md_backend import MarkdownDocumentBackend
 from docling.datamodel.base_models import InputFormat
-from docling.datamodel.document import InputDocument
+from docling.datamodel.document import DoclingDocument, InputDocument

 from .test_data_gen_flag import GEN_TEST_DATA

@ -11,12 +11,15 @@ def test_convert_valid():
     fmt = InputFormat.MD
     cls = MarkdownDocumentBackend
-    test_data_path = Path("tests") / "data"
-    relevant_paths = sorted((test_data_path / "md").rglob("*.md"))
+    root_path = Path("tests") / "data"
+    relevant_paths = sorted((root_path / "md").rglob("*.md"))
     assert len(relevant_paths) > 0
+    yaml_filter = ["inline_and_formatting"]

     for in_path in relevant_paths:
-        gt_path = test_data_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
+        md_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.md"
+        yaml_gt_path = root_path / "groundtruth" / "docling_v2" / f"{in_path.name}.yaml"

         in_doc = InputDocument(
             path_or_stream=in_path,
@ -33,9 +36,17 @@ def test_convert_valid():
         act_data = act_doc.export_to_markdown()

         if GEN_TEST_DATA:
-            with open(gt_path, mode="w", encoding="utf-8") as f:
+            with open(md_gt_path, mode="w", encoding="utf-8") as f:
                 f.write(f"{act_data}\n")
+
+            if in_path.stem in yaml_filter:
+                with open(yaml_gt_path, mode="w", encoding="utf-8") as f:
+                    act_doc.save_as_yaml(yaml_gt_path)
         else:
-            with open(gt_path, encoding="utf-8") as f:
+            with open(md_gt_path, encoding="utf-8") as f:
                 exp_data = f.read().rstrip()
-            assert exp_data == act_data
+            assert act_data == exp_data
+
+            if in_path.stem in yaml_filter:
+                exp_doc = DoclingDocument.load_from_yaml(yaml_gt_path)
+                assert act_doc == exp_doc
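
Beyond the test itself, the YAML round-trip added here is a convenient way to snapshot a converted document. A minimal sketch, assuming a hypothetical local file sample.md; save_as_yaml and load_from_yaml are the same DoclingDocument methods the test uses:

```python
from pathlib import Path

from docling.datamodel.document import DoclingDocument
from docling.document_converter import DocumentConverter

# Hypothetical input file; any Markdown document works.
source = Path("sample.md")

doc: DoclingDocument = DocumentConverter().convert(source).document

# Persist the full document model and load it back, mirroring the ground-truth check in the test.
yaml_path = source.with_suffix(".yaml")
doc.save_as_yaml(yaml_path)
reloaded = DoclingDocument.load_from_yaml(yaml_path)

assert reloaded == doc  # the round-trip should be lossless
```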

View File

@ -9,6 +9,7 @@ from docling.datamodel.document import (
     DoclingDocument,
     InputDocument,
     SectionHeaderItem,
+    TextItem,
 )
 from docling.document_converter import DocumentConverter

@ -96,18 +97,18 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
         pred_md: str = doc.export_to_markdown()
         assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
-            "export to md"
+            f"export to markdown failed on {docx_path}"
         )

         pred_itxt: str = doc._export_to_indented_text(
             max_text_len=70, explicit_tables=False
         )
         assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
-            "export to indented-text"
+            f"export to indented-text failed on {docx_path}"
         )

         assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), (
-            "document document"
+            f"DoclingDocument verification failed on {docx_path}"
         )

         if docx_path.name == "word_tables.docx":
@ -116,7 +117,7 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
                 pred_text=pred_html,
                 gtfile=str(gt_path) + ".html",
                 generate=GENERATE,
-            ), "export to html"
+            ), f"export to html failed on {docx_path}"


 flaky_path = Path("tests/data/docx/textbox.docx")

@ -131,3 +132,42 @@ def test_e2e_docx_conversions():

 @pytest.mark.xfail(strict=False)
 def test_textbox_conversion():
     _test_e2e_docx_conversions_impl(docx_paths=[flaky_path])
+
+
+def test_text_after_image_anchors():
+    """
+    Test to analyse whether text gets parsed after image anchors.
+    """
+    in_path = Path("tests/data/docx/word_image_anchors.docx")
+    in_doc = InputDocument(
+        path_or_stream=in_path,
+        format=InputFormat.DOCX,
+        backend=MsWordDocumentBackend,
+    )
+    backend = MsWordDocumentBackend(
+        in_doc=in_doc,
+        path_or_stream=in_path,
+    )
+    doc = backend.convert()
+
+    found_text_after_anchor_1 = found_text_after_anchor_2 = (
+        found_text_after_anchor_3
+    ) = found_text_after_anchor_4 = False
+    for item, _ in doc.iterate_items():
+        if isinstance(item, TextItem):
+            if item.text == "This is test 1":
+                found_text_after_anchor_1 = True
+            elif item.text == "0:08\nCorrect, he is not.":
+                found_text_after_anchor_2 = True
+            elif item.text == "This is test 2":
+                found_text_after_anchor_3 = True
+            elif item.text == "0:16\nYeah, exactly.":
+                found_text_after_anchor_4 = True
+
+    assert (
+        found_text_after_anchor_1
+        and found_text_after_anchor_2
+        and found_text_after_anchor_3
+        and found_text_after_anchor_4
+    )
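
The anchor check above boils down to walking the converted document and inspecting TextItem contents. The same pattern works for ad-hoc inspection outside pytest; a short sketch, with word_sample.docx as a hypothetical input:

```python
from pathlib import Path

from docling.datamodel.document import TextItem
from docling.document_converter import DocumentConverter

# Hypothetical input file; any .docx works.
doc = DocumentConverter().convert(Path("word_sample.docx")).document

# iterate_items() walks the document tree in reading order, yielding (item, level) pairs.
for item, _ in doc.iterate_items():
    if isinstance(item, TextItem):
        print(repr(item.text))
```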

View File

@ -60,3 +60,25 @@ def test_code_and_formula_conversion():
     gt = "a ^ { 2 } + 8 = 1 2"
     predicted = formula_blocks[0].text
     assert predicted == gt, f"mismatch in text {predicted=}, {gt=}"
+
+
+def test_formula_conversion_with_page_range():
+    pdf_path = Path("tests/data/pdf/code_and_formula.pdf")
+    converter = get_converter()
+
+    print(f"converting {pdf_path} with page range")
+
+    doc_result: ConversionResult = converter.convert(pdf_path, page_range=(2, 2))
+
+    results = doc_result.document.texts
+    formula_blocks = [
+        el
+        for el in results
+        if isinstance(el, TextItem) and el.label == DocItemLabel.FORMULA
+    ]
+    assert len(formula_blocks) == 1
+
+    gt = "a ^ { 2 } + 8 = 1 2"
+    predicted = formula_blocks[0].text
+    assert predicted == gt, f"mismatch in text {predicted=}, {gt=}"
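
The new test also exercises the page_range argument of convert: passing (2, 2) converts only page 2 of the PDF. Outside the test harness (where get_converter is test-local), a plain DocumentConverter can be used the same way; a brief sketch with the test's own PDF path:

```python
from pathlib import Path

from docling.document_converter import DocumentConverter

converter = DocumentConverter()

# page_range is 1-based and inclusive, as in the test above, so (2, 2) converts only the second page.
result = converter.convert(Path("tests/data/pdf/code_and_formula.pdf"), page_range=(2, 2))
print(result.document.export_to_markdown())
```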

View File

@ -57,14 +57,14 @@ def test_e2e_conversions():
     pdf_paths = get_pdf_paths()

     engines: List[Tuple[OcrOptions, bool]] = [
-        (EasyOcrOptions(), False),
         (TesseractOcrOptions(), True),
         (TesseractCliOcrOptions(), True),
-        (EasyOcrOptions(force_full_page_ocr=True), False),
+        (EasyOcrOptions(), False),
         (TesseractOcrOptions(force_full_page_ocr=True), True),
         (TesseractOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
         (TesseractCliOcrOptions(force_full_page_ocr=True), True),
         (TesseractCliOcrOptions(force_full_page_ocr=True, lang=["auto"]), True),
+        (EasyOcrOptions(force_full_page_ocr=True), False),
     ]

     # rapidocr is only available for Python >=3.6,<3.13
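
Each (OcrOptions, bool) pair above is one converter configuration that the end-to-end test runs in turn. For reference, a single one of these option sets can be wired into a regular conversion through the PDF pipeline options; a minimal sketch, with document.pdf as a hypothetical input:

```python
from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Force full-page OCR with EasyOCR, one of the option sets exercised by the test.
pipeline_options = PdfPipelineOptions(
    do_ocr=True,
    ocr_options=EasyOcrOptions(force_full_page_ocr=True),
)

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
result = converter.convert(Path("document.pdf"))
print(result.document.export_to_markdown())
```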

2
uv.lock generated
View File

@ -818,7 +818,7 @@ wheels = [
 [[package]]
 name = "docling"
-version = "2.36.1"
+version = "2.37.0"
 source = { editable = "." }
 dependencies = [
     { name = "beautifulsoup4" },