Merge branch 'main' of https://github.com/docling-project/docling into dev/fix_msword_backend_identify_text_after_image

Signed-off-by: Michael Krissgau <michael.krissgau@ibm.com>
2025-07-25 19:44:34 +00:00 · 2025-05-29 15:04:06 +02:00 · 2025-05-29 15:04:06 +02:00 · 84dc120d39
commit 84dc120d39
parent fffa865014 3942923125
23 changed files with 4498 additions and 187 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -0,0 +1,3 @@
+tests/data/** linguist-vendored
+tests/data_scanned/** linguist-vendored
+docs/** linguist-vendored
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,3 +1,15 @@
+## [v2.34.0](https://github.com/docling-project/docling/releases/tag/v2.34.0) - 2025-05-22
+
+### Feature
+
+* **ocr:** Auto-detect rotated pages in Tesseract ([#1167](https://github.com/docling-project/docling/issues/1167)) ([`45265bf`](https://github.com/docling-project/docling/commit/45265bf8b1a6d6ad5367bb3f17fb3fa9d4366a05))
+* Establish confidence estimation for document and pages ([#1313](https://github.com/docling-project/docling/issues/1313)) ([`9087524`](https://github.com/docling-project/docling/commit/90875247e5813da1de17f3cd4475937e8bd45571))
+
+### Fix
+
+* Fix ZeroDivisionError for cell_bbox.area() ([#1636](https://github.com/docling-project/docling/issues/1636)) ([`c2f595d`](https://github.com/docling-project/docling/commit/c2f595d2830ca2e28e68c5da606e89541264f156))
+* **integration:** Update the Apify Actor integration ([#1619](https://github.com/docling-project/docling/issues/1619)) ([`14d4f5b`](https://github.com/docling-project/docling/commit/14d4f5b109fa65d777ab147b3ce9b5174d020a5d))
+
 ## [v2.33.0](https://github.com/docling-project/docling/releases/tag/v2.33.0) - 2025-05-20

 ### Feature
--- a/docling/backend/docling_parse_backend.py
+++ b/docling/backend/docling_parse_backend.py
@ -60,7 +60,7 @@ class DoclingParsePageBackend(PdfPageBackend):
                coord_origin=CoordOrigin.BOTTOMLEFT,
            ).to_top_left_origin(page_height=page_size.height * scale)

-            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
+            overlap_frac = cell_bbox.intersection_over_self(bbox)

            if overlap_frac > 0.5:
                if len(text_piece) > 0:
--- a/docling/backend/docling_parse_v2_backend.py
+++ b/docling/backend/docling_parse_v2_backend.py
@ -71,7 +71,7 @@ class DoclingParseV2PageBackend(PdfPageBackend):
                coord_origin=CoordOrigin.BOTTOMLEFT,
            ).to_top_left_origin(page_height=page_size.height * scale)

-            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
+            overlap_frac = cell_bbox.intersection_over_self(bbox)

            if overlap_frac > 0.5:
                if len(text_piece) > 0:
--- a/docling/backend/docling_parse_v4_backend.py
+++ b/docling/backend/docling_parse_v4_backend.py
@ -46,7 +46,7 @@ class DoclingParseV4PageBackend(PdfPageBackend):
                .scaled(scale)
            )

-            overlap_frac = cell_bbox.intersection_area_with(bbox) / cell_bbox.area()
+            overlap_frac = cell_bbox.intersection_over_self(bbox)

            if overlap_frac > 0.5:
                if len(text_piece) > 0:
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@ -12,6 +12,12 @@ from typing import Annotated, Dict, List, Optional, Type

 import rich.table
 import typer
+from docling_core.transforms.serializer.html import (
+    HTMLDocSerializer,
+    HTMLOutputStyle,
+    HTMLParams,
+)
+from docling_core.transforms.visualizer.layout_visualizer import LayoutVisualizer
 from docling_core.types.doc import ImageRefMode
 from docling_core.utils.file import resolve_source_to_path
 from pydantic import TypeAdapter
@ -156,6 +162,7 @@ def export_documents(
    export_json: bool,
    export_html: bool,
    export_html_split_page: bool,
+    show_layout: bool,
    export_md: bool,
    export_txt: bool,
    export_doctags: bool,
@ -189,9 +196,27 @@ def export_documents(
            if export_html_split_page:
                fname = output_dir / f"{doc_filename}.html"
                _log.info(f"writing HTML output to {fname}")
-                conv_res.document.save_as_html(
-                    filename=fname, image_mode=image_export_mode, split_page_view=True
-                )
+                if show_layout:
+                    ser = HTMLDocSerializer(
+                        doc=conv_res.document,
+                        params=HTMLParams(
+                            image_mode=image_export_mode,
+                            output_style=HTMLOutputStyle.SPLIT_PAGE,
+                        ),
+                    )
+                    visualizer = LayoutVisualizer()
+                    visualizer.params.show_label = False
+                    ser_res = ser.serialize(
+                        visualizer=visualizer,
+                    )
+                    with open(fname, "w") as fw:
+                        fw.write(ser_res.text)
+                else:
+                    conv_res.document.save_as_html(
+                        filename=fname,
+                        image_mode=image_export_mode,
+                        split_page_view=True,
+                    )

            # Export Text format:
            if export_txt:
@ -250,6 +275,13 @@ def convert(  # noqa: C901
    to_formats: List[OutputFormat] = typer.Option(
        None, "--to", help="Specify output formats. Defaults to Markdown."
    ),
+    show_layout: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            help="If enabled, the page images will show the bounding-boxes of the items.",
+        ),
+    ] = False,
    headers: str = typer.Option(
        None,
        "--headers",
@ -596,6 +628,7 @@ def convert(  # noqa: C901
            export_json=export_json,
            export_html=export_html,
            export_html_split_page=export_html_split_page,
+            show_layout=show_layout,
            export_md=export_md,
            export_txt=export_txt,
            export_doctags=export_doctags,
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@ -334,9 +334,9 @@ class _DocumentConversionInput(BaseModel):
    ) -> Optional[InputFormat]:
        """Guess the input format of a document by checking part of its content."""
        input_format: Optional[InputFormat] = None
-        content_str = content.decode("utf-8")

        if mime == "application/xml":
+            content_str = content.decode("utf-8")
            match_doctype = re.search(r"<!DOCTYPE [^>]+>", content_str)
            if match_doctype:
                xml_doctype = match_doctype.group()
@ -358,6 +358,7 @@ class _DocumentConversionInput(BaseModel):
                    input_format = InputFormat.XML_JATS

        elif mime == "text/plain":
+            content_str = content.decode("utf-8")
            if InputFormat.XML_USPTO in formats and content_str.startswith("PATN\r\n"):
                input_format = InputFormat.XML_USPTO

--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@ -185,13 +185,23 @@ class LayoutModel(BasePageModel):
                    ).postprocess()
                    # processed_clusters, processed_cells = clusters, page.cells

-                    conv_res.confidence.pages[page.page_no].layout_score = float(
-                        np.mean([c.confidence for c in processed_clusters])
-                    )
+                    with warnings.catch_warnings():
+                        warnings.filterwarnings(
+                            "ignore",
+                            "Mean of empty slice|invalid value encountered in scalar divide",
+                            RuntimeWarning,
+                            "numpy",
+                        )

-                    conv_res.confidence.pages[page.page_no].ocr_score = float(
-                        np.mean([c.confidence for c in processed_cells if c.from_ocr])
-                    )
+                        conv_res.confidence.pages[page.page_no].layout_score = float(
+                            np.mean([c.confidence for c in processed_clusters])
+                        )
+
+                        conv_res.confidence.pages[page.page_no].ocr_score = float(
+                            np.mean(
+                                [c.confidence for c in processed_cells if c.from_ocr]
+                            )
+                        )

                    page.cells = processed_cells
                    page.predictions.layout = LayoutPrediction(
--- a/docling/models/page_preprocessing_model.py
+++ b/docling/models/page_preprocessing_model.py
@ -1,4 +1,5 @@
 import re
+import warnings
 from collections.abc import Iterable
 from pathlib import Path
 from typing import Optional
@ -7,7 +8,7 @@ import numpy as np
 from PIL import ImageDraw
 from pydantic import BaseModel

-from docling.datamodel.base_models import Page, ScoreValue
+from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
@ -76,11 +77,15 @@ class PagePreprocessingModel(BasePageModel):
            score = self.rate_text_quality(c.text)
            text_scores.append(score)

-        conv_res.confidence.pages[page.page_no].parse_score = float(
-            np.nanquantile(
-                text_scores, q=0.10
-            )  # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
-        )
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore", "Mean of empty slice", RuntimeWarning, "numpy"
+            )
+            conv_res.confidence.pages[page.page_no].parse_score = float(
+                np.nanquantile(
+                    text_scores, q=0.10
+                )  # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
+            )

        # DEBUG code:
        def draw_text_boxes(image, cells, show: bool = False):
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@ -8,7 +8,7 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem

 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
-from docling.datamodel.base_models import AssembledUnit, Page, PageConfidenceScores
+from docling.datamodel.base_models import AssembledUnit, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.datamodel.settings import settings
@ -55,11 +55,13 @@ class StandardPdfPipeline(PaginatedPipeline):
                "When defined, it must point to a folder containing all models required by the pipeline."
            )

-        self.keep_images = (
-            self.pipeline_options.generate_page_images
-            or self.pipeline_options.generate_picture_images
-            or self.pipeline_options.generate_table_images
-        )
+        with warnings.catch_warnings():  # deprecated generate_table_images
+            warnings.filterwarnings("ignore", category=DeprecationWarning)
+            self.keep_images = (
+                self.pipeline_options.generate_page_images
+                or self.pipeline_options.generate_picture_images
+                or self.pipeline_options.generate_table_images
+            )

        self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())

@ -210,64 +212,74 @@ class StandardPdfPipeline(PaginatedPipeline):
                    )

            # Generate images of the requested element types
-            if (
-                self.pipeline_options.generate_picture_images
-                or self.pipeline_options.generate_table_images
-            ):
-                scale = self.pipeline_options.images_scale
-                for element, _level in conv_res.document.iterate_items():
-                    if not isinstance(element, DocItem) or len(element.prov) == 0:
-                        continue
-                    if (
-                        isinstance(element, PictureItem)
-                        and self.pipeline_options.generate_picture_images
-                    ) or (
-                        isinstance(element, TableItem)
-                        and self.pipeline_options.generate_table_images
-                    ):
-                        page_ix = element.prov[0].page_no - 1
-                        page = next(
-                            (p for p in conv_res.pages if p.page_no == page_ix),
-                            cast("Page", None),
-                        )
-                        assert page is not None
-                        assert page.size is not None
-                        assert page.image is not None
+            with warnings.catch_warnings():  # deprecated generate_table_images
+                warnings.filterwarnings("ignore", category=DeprecationWarning)
+                if (
+                    self.pipeline_options.generate_picture_images
+                    or self.pipeline_options.generate_table_images
+                ):
+                    scale = self.pipeline_options.images_scale
+                    for element, _level in conv_res.document.iterate_items():
+                        if not isinstance(element, DocItem) or len(element.prov) == 0:
+                            continue
+                        if (
+                            isinstance(element, PictureItem)
+                            and self.pipeline_options.generate_picture_images
+                        ) or (
+                            isinstance(element, TableItem)
+                            and self.pipeline_options.generate_table_images
+                        ):
+                            page_ix = element.prov[0].page_no - 1
+                            page = next(
+                                (p for p in conv_res.pages if p.page_no == page_ix),
+                                cast("Page", None),
+                            )
+                            assert page is not None
+                            assert page.size is not None
+                            assert page.image is not None

-                        crop_bbox = (
-                            element.prov[0]
-                            .bbox.scaled(scale=scale)
-                            .to_top_left_origin(page_height=page.size.height * scale)
-                        )
+                            crop_bbox = (
+                                element.prov[0]
+                                .bbox.scaled(scale=scale)
+                                .to_top_left_origin(
+                                    page_height=page.size.height * scale
+                                )
+                            )

-                        cropped_im = page.image.crop(crop_bbox.as_tuple())
-                        element.image = ImageRef.from_pil(
-                            cropped_im, dpi=int(72 * scale)
-                        )
+                            cropped_im = page.image.crop(crop_bbox.as_tuple())
+                            element.image = ImageRef.from_pil(
+                                cropped_im, dpi=int(72 * scale)
+                            )

            # Aggregate confidence values for document:
            if len(conv_res.pages) > 0:
-                conv_res.confidence.layout_score = float(
-                    np.nanmean(
-                        [c.layout_score for c in conv_res.confidence.pages.values()]
+                with warnings.catch_warnings():
+                    warnings.filterwarnings(
+                        "ignore",
+                        category=RuntimeWarning,
+                        message="Mean of empty slice|All-NaN slice encountered",
                    )
-                )
-                conv_res.confidence.parse_score = float(
-                    np.nanquantile(
-                        [c.parse_score for c in conv_res.confidence.pages.values()],
-                        q=0.1,  # parse score should relate to worst 10% of pages.
+                    conv_res.confidence.layout_score = float(
+                        np.nanmean(
+                            [c.layout_score for c in conv_res.confidence.pages.values()]
+                        )
                    )
-                )
-                conv_res.confidence.table_score = float(
-                    np.nanmean(
-                        [c.table_score for c in conv_res.confidence.pages.values()]
+                    conv_res.confidence.parse_score = float(
+                        np.nanquantile(
+                            [c.parse_score for c in conv_res.confidence.pages.values()],
+                            q=0.1,  # parse score should relate to worst 10% of pages.
+                        )
                    )
-                )
-                conv_res.confidence.ocr_score = float(
-                    np.nanmean(
-                        [c.ocr_score for c in conv_res.confidence.pages.values()]
+                    conv_res.confidence.table_score = float(
+                        np.nanmean(
+                            [c.table_score for c in conv_res.confidence.pages.values()]
+                        )
+                    )
+                    conv_res.confidence.ocr_score = float(
+                        np.nanmean(
+                            [c.ocr_score for c in conv_res.confidence.pages.values()]
+                        )
                    )
-                )

        return conv_res

--- a/docling/utils/layout_postprocessor.py
+++ b/docling/utils/layout_postprocessor.py
@ -90,17 +90,12 @@ class SpatialClusterIndex:
        containment_threshold: float,
    ) -> bool:
        """Check if two bboxes overlap sufficiently."""
-        area1, area2 = bbox1.area(), bbox2.area()
-        if area1 <= 0 or area2 <= 0:
+        if bbox1.area() <= 0 or bbox2.area() <= 0:
            return False

-        overlap_area = bbox1.intersection_area_with(bbox2)
-        if overlap_area <= 0:
-            return False
-
-        iou = overlap_area / (area1 + area2 - overlap_area)
-        containment1 = overlap_area / area1
-        containment2 = overlap_area / area2
+        iou = bbox1.intersection_over_union(bbox2)
+        containment1 = bbox1.intersection_over_self(bbox2)
+        containment2 = bbox2.intersection_over_self(bbox1)

        return (
            iou > overlap_threshold
@ -321,11 +316,9 @@ class LayoutPostprocessor:
        for special in special_clusters:
            contained = []
            for cluster in self.regular_clusters:
-                overlap = cluster.bbox.intersection_area_with(special.bbox)
-                if overlap > 0:
-                    containment = overlap / cluster.bbox.area()
-                    if containment > 0.8:
-                        contained.append(cluster)
+                containment = cluster.bbox.intersection_over_self(special.bbox)
+                if containment > 0.8:
+                    contained.append(cluster)

            if contained:
                # Sort contained clusters by minimum cell ID:
@ -379,9 +372,7 @@ class LayoutPostprocessor:
            for regular in self.regular_clusters:
                if regular.label == DocItemLabel.TABLE:
                    # Calculate overlap
-                    overlap = regular.bbox.intersection_area_with(wrapper.bbox)
-                    wrapper_area = wrapper.bbox.area()
-                    overlap_ratio = overlap / wrapper_area
+                    overlap_ratio = wrapper.bbox.intersection_over_self(regular.bbox)

                    conf_diff = wrapper.confidence - regular.confidence

@ -421,8 +412,7 @@ class LayoutPostprocessor:
        # Rule 2: CODE vs others
        if candidate.label == DocItemLabel.CODE:
            # Calculate how much of the other cluster is contained within the CODE cluster
-            overlap = other.bbox.intersection_area_with(candidate.bbox)
-            containment = overlap / other.bbox.area()
+            containment = other.bbox.intersection_over_self(candidate.bbox)
            if containment > 0.8:  # other is 80% contained within CODE
                return True

@ -586,11 +576,9 @@ class LayoutPostprocessor:
                if cell.rect.to_bounding_box().area() <= 0:
                    continue

-                overlap = cell.rect.to_bounding_box().intersection_area_with(
+                overlap_ratio = cell.rect.to_bounding_box().intersection_over_self(
                    cluster.bbox
                )
-                overlap_ratio = overlap / cell.rect.to_bounding_box().area()
-
                if overlap_ratio > best_overlap:
                    best_overlap = overlap_ratio
                    best_cluster = cluster
--- a/poetry.lock
+++ b/poetry.lock
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,6 +1,6 @@
 [tool.poetry]
 name = "docling"
-version = "2.33.0"  # DO NOT EDIT, updated automatically
+version = "2.34.0"  # DO NOT EDIT, updated automatically
 description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
 authors = [
  "Christoph Auer <cau@zurich.ibm.com>",
@ -46,7 +46,7 @@ packages = [{ include = "docling" }]
 ######################
 python = "^3.9"
 pydantic = "^2.0.0"
-docling-core = {version = "^2.29.0", extras = ["chunking"]}
+docling-core = {version = "^2.31.2", extras = ["chunking"]}
 docling-ibm-models = "^3.4.0"
 docling-parse = "^4.0.0"
 filetype = "^1.2.0"
--- a/tests/data/groundtruth/docling_v2/example_08.html.itxt
+++ b/tests/data/groundtruth/docling_v2/example_08.html.itxt
@ -0,0 +1,8 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: section: group header-1
+    item-2 at level 2: section_header: Pivot table with with 1 row header
+      item-3 at level 3: table with [6x4]
+    item-4 at level 2: section_header: Pivot table with 2 row headers
+      item-5 at level 3: table with [6x5]
+    item-6 at level 2: section_header: Equivalent pivot table
+      item-7 at level 3: table with [6x5]
--- a/tests/data/groundtruth/docling_v2/example_08.html.json
+++ b/tests/data/groundtruth/docling_v2/example_08.html.json
--- a/tests/data/groundtruth/docling_v2/example_08.html.md
+++ b/tests/data/groundtruth/docling_v2/example_08.html.md
@ -0,0 +1,29 @@
+## Pivot table with with 1 row header
+
+|   Year | Month    | Revenue   | Cost   |
+|--------|----------|-----------|--------|
+|   2025 | January  | $134      | $162   |
+|   2025 | February | $150      | $155   |
+|   2025 | March    | $160      | $143   |
+|   2025 | April    | $210      | $150   |
+|   2025 | May      | $280      | $120   |
+
+## Pivot table with 2 row headers
+
+|   Year | Quarter   | Month    | Revenue   | Cost   |
+|--------|-----------|----------|-----------|--------|
+|   2025 | Q1        | January  | $134      | $162   |
+|   2025 | Q1        | February | $150      | $155   |
+|   2025 | Q1        | March    | $160      | $143   |
+|   2025 | Q2        | April    | $210      | $150   |
+|   2025 | Q2        | May      | $280      | $120   |
+
+## Equivalent pivot table
+
+|   Year | Quarter   | Month    | Revenue   | Cost   |
+|--------|-----------|----------|-----------|--------|
+|   2025 | Q1        | January  | $134      | $162   |
+|   2025 | Q1        | February | $150      | $155   |
+|   2025 | Q1        | March    | $160      | $143   |
+|   2025 | Q2        | April    | $210      | $150   |
+|   2025 | Q2        | May      | $280      | $120   |
--- a/tests/data/groundtruth/docling_v2/textbox.docx.itxt
+++ b/tests/data/groundtruth/docling_v2/textbox.docx.itxt
@ -0,0 +1,94 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: paragraph: Chiayi County Shuishang Township ... mentary School Affiliated Kindergarten
+  item-2 at level 1: paragraph: Infectious Disease Reporting Pro ... r the 113th Academic Year Kindergarten
+  item-3 at level 1: paragraph: 
+  item-4 at level 1: section: group textbox
+    item-5 at level 2: paragraph: Student falls ill
+    item-6 at level 2: paragraph: 
+    item-7 at level 2: paragraph: 
+    item-8 at level 2: list: group list
+      item-9 at level 3: list_item: Suggested Reportable Symptoms:
+＊ ... sh
+＊ Blisters
+＊ Headache
+＊ Sore throat
+  item-10 at level 1: list_item: 
+  item-11 at level 1: paragraph: 
+  item-12 at level 1: paragraph: 
+  item-13 at level 1: section: group textbox
+    item-14 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms
+  item-15 at level 1: paragraph: 
+  item-16 at level 1: paragraph: 
+  item-17 at level 1: paragraph: 
+  item-18 at level 1: paragraph: 
+  item-19 at level 1: section: group textbox
+    item-20 at level 2: paragraph: Yes
+  item-21 at level 1: paragraph: 
+  item-22 at level 1: paragraph: 
+  item-23 at level 1: section: group textbox
+    item-24 at level 2: paragraph:   A report must be submitted wi ... saster Prevention Information Network.
+    item-25 at level 2: paragraph:   A report must also be submitt ... d Infectious Disease Reporting System.
+    item-26 at level 2: paragraph: 
+    item-27 at level 2: paragraph: 
+  item-28 at level 1: paragraph: 
+  item-29 at level 1: paragraph: 
+  item-30 at level 1: paragraph: 
+  item-31 at level 1: paragraph: 
+  item-32 at level 1: paragraph: 
+  item-33 at level 1: paragraph: 
+  item-34 at level 1: section: group textbox
+    item-35 at level 2: paragraph: Health Bureau:
+    item-36 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
+    item-37 at level 2: list: group list
+      item-38 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
+      item-39 at level 3: list_item: Implement appropriate epidemic p ...  the Communicable Disease Control Act.
+    item-40 at level 2: paragraph: 
+    item-41 at level 2: paragraph: 
+  item-42 at level 1: list: group list
+    item-43 at level 2: list_item: 
+  item-44 at level 1: paragraph: 
+  item-45 at level 1: section: group textbox
+    item-46 at level 2: paragraph: Department of Education:
+Collabo ... vention measures at all school levels.
+  item-47 at level 1: paragraph: 
+  item-48 at level 1: paragraph: 
+  item-49 at level 1: paragraph: 
+  item-50 at level 1: paragraph: 
+  item-51 at level 1: paragraph: 
+  item-52 at level 1: paragraph: 
+  item-53 at level 1: paragraph: 
+  item-54 at level 1: section: group textbox
+    item-55 at level 2: inline: group group
+      item-56 at level 3: paragraph: The Health Bureau will handle
+      item-57 at level 3: paragraph: reporting and specimen collection
+      item-58 at level 3: paragraph: .
+    item-59 at level 2: paragraph: 
+    item-60 at level 2: paragraph: 
+  item-61 at level 1: paragraph: 
+  item-62 at level 1: paragraph: 
+  item-63 at level 1: paragraph: 
+  item-64 at level 1: section: group textbox
+    item-65 at level 2: paragraph: Whether the epidemic has eased.
+    item-66 at level 2: paragraph: 
+    item-67 at level 2: paragraph: 
+  item-68 at level 1: paragraph: 
+  item-69 at level 1: section: group textbox
+    item-70 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
+    item-71 at level 2: paragraph: No
+  item-72 at level 1: paragraph: 
+  item-73 at level 1: paragraph: 
+  item-74 at level 1: section: group textbox
+  item-75 at level 1: paragraph: 
+  item-76 at level 1: section: group textbox
+  item-77 at level 1: paragraph: 
+  item-78 at level 1: paragraph: 
+  item-79 at level 1: section: group textbox
+    item-80 at level 2: paragraph: Case closed.
+    item-81 at level 2: paragraph: 
+    item-82 at level 2: paragraph: 
+    item-83 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
+  item-84 at level 1: paragraph: 
+  item-85 at level 1: section: group textbox
+  item-86 at level 1: paragraph: 
+  item-87 at level 1: paragraph: 
+  item-88 at level 1: paragraph: 
--- a/tests/data/groundtruth/docling_v2/textbox.docx.json
+++ b/tests/data/groundtruth/docling_v2/textbox.docx.json
--- a/tests/data/groundtruth/docling_v2/textbox.docx.md
+++ b/tests/data/groundtruth/docling_v2/textbox.docx.md
@ -0,0 +1,46 @@
+**Chiayi County Shuishang Township Nanjing Elementary School Affiliated Kindergarten**
+
+**Infectious Disease Reporting Procedure for the 113th Academic Year Kindergarten**
+
+**Student falls ill**
+
+- Suggested Reportable Symptoms:
+＊ Fever
+＊ Cough
+＊ Diarrhea
+＊ Vomiting
+＊ Rash
+＊ Blisters
+＊ Headache
+＊ Sore throat
+
+If a caregiver suspects that within one week, a fifth of the class (for classes with more than 15 students) or more than three students (for classes with 15 or fewer students)
+show the same suggested reportable symptoms
+
+Yes
+
+  A report must be submitted within 24 hours via the Ministry of Education’s Campus Safety and Disaster Prevention Information Network.
+
+  A report must also be submitted within 48 hours through Chiayi County’s School Suspected Infectious Disease Reporting System.
+
+**Health Bureau:**
+
+Upon receiving a report from the kindergarten, conduct a preliminary assessment of the case, and depending on the situation and type of illness, carry out an epidemiological investigation and report to the Centers for Disease Control.
+
+- If necessary, provide health education and important reminders at the kindergarten, or notify the individual to undergo specimen collection.
+- Implement appropriate epidemic prevention measures in accordance with the Communicable Disease Control Act.
+
+Department of Education:
+Collaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.
+
+The Health Bureau will handle **reporting and specimen collection** .
+
+**Whether the epidemic has eased.**
+
+**Whether the test results are positive for a legally designated infectious disease.**
+
+No
+
+**Case closed.**
+
+The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary.
--- a/tests/data/html/example_08.html
+++ b/tests/data/html/example_08.html
--- a/tests/test_backend_csv.py
+++ b/tests/test_backend_csv.py
@ -39,8 +39,15 @@ def test_e2e_valid_csv_conversions():
        print(f"converting {csv_path}")

        gt_path = csv_path.parent.parent / "groundtruth" / "docling_v2" / csv_path.name
-
-        conv_result: ConversionResult = converter.convert(csv_path)
+        if csv_path.stem in (
+            "csv-too-few-columns",
+            "csv-too-many-columns",
+            "csv-inconsistent-header",
+        ):
+            with warns(UserWarning, match="Inconsistent column lengths"):
+                conv_result: ConversionResult = converter.convert(csv_path)
+        else:
+            conv_result: ConversionResult = converter.convert(csv_path)

        doc: DoclingDocument = conv_result.document

--- a/tests/test_legacy_format_transform.py
+++ b/tests/test_legacy_format_transform.py
@ -38,17 +38,15 @@ def get_converter():

 def test_compare_legacy_output(test_doc_paths):
    converter = get_converter()
-
    res = converter.convert_all(test_doc_paths, raises_on_error=True)
-
    for conv_res in res:
        print(f"Results for {conv_res.input.file}")
-        print(
-            json.dumps(
-                conv_res.legacy_document.model_dump(
-                    mode="json", by_alias=True, exclude_none=True
+        with pytest.warns(DeprecationWarning, match="Use document instead"):
+            print(
+                json.dumps(
+                    conv_res.legacy_document.model_dump(
+                        mode="json", by_alias=True, exclude_none=True
+                    )
                )
            )
-        )
-
    # assert res.legacy_output == res.legacy_output_transformed
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@ -4,6 +4,7 @@ import warnings
 from pathlib import Path
 from typing import List, Optional

+import pytest
 from docling_core.types.doc import (
    DocItem,
    DoclingDocument,
@ -302,9 +303,8 @@ def verify_conversion_result_v1(
    )

    doc_pred_pages: List[Page] = doc_result.pages
-    doc_pred: DsDocument = doc_result.legacy_document
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore", DeprecationWarning)
+    with pytest.warns(DeprecationWarning, match="Use document instead"):
+        doc_pred: DsDocument = doc_result.legacy_document
        doc_pred_md = doc_result.legacy_document.export_to_markdown()
        doc_pred_dt = doc_result.legacy_document.export_to_document_tokens()

@ -391,7 +391,7 @@ def verify_conversion_result_v2(
    doc_pred_pages: List[Page] = doc_result.pages
    doc_pred: DoclingDocument = doc_result.document
    doc_pred_md = doc_result.document.export_to_markdown()
-    doc_pred_dt = doc_result.document.export_to_document_tokens()
+    doc_pred_dt = doc_result.document.export_to_doctags()

    engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"