From 53ffc565ca5205e1287e51a5a2864160685cc9fa Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Date: Mon, 26 May 2025 05:47:57 +0200
Subject: [PATCH] chore: fix or catch deprecation warnings

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
---
 docling/models/layout_model.py             |  22 +++-
 docling/models/page_preprocessing_model.py |  17 ++-
 docling/pipeline/standard_pdf_pipeline.py  | 122 +++++++++++----------
 tests/test_backend_csv.py                  |  11 +-
 tests/test_legacy_format_transform.py      |  14 +--
 tests/verify_utils.py                      |   8 +-
 6 files changed, 113 insertions(+), 81 deletions(-)

diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py
index e2abb373..03a047fa 100644
--- a/docling/models/layout_model.py
+++ b/docling/models/layout_model.py
@@ -185,13 +185,23 @@ class LayoutModel(BasePageModel):
                     ).postprocess()
                     # processed_clusters, processed_cells = clusters, page.cells
 
-                    conv_res.confidence.pages[page.page_no].layout_score = float(
-                        np.mean([c.confidence for c in processed_clusters])
-                    )
+                    with warnings.catch_warnings():
+                        warnings.filterwarnings(
+                            "ignore",
+                            "Mean of empty slice|invalid value encountered in scalar divide",
+                            RuntimeWarning,
+                            "numpy",
+                        )
 
-                    conv_res.confidence.pages[page.page_no].ocr_score = float(
-                        np.mean([c.confidence for c in processed_cells if c.from_ocr])
-                    )
+                        conv_res.confidence.pages[page.page_no].layout_score = float(
+                            np.mean([c.confidence for c in processed_clusters])
+                        )
+
+                        conv_res.confidence.pages[page.page_no].ocr_score = float(
+                            np.mean(
+                                [c.confidence for c in processed_cells if c.from_ocr]
+                            )
+                        )
 
                     page.cells = processed_cells
                     page.predictions.layout = LayoutPrediction(
diff --git a/docling/models/page_preprocessing_model.py b/docling/models/page_preprocessing_model.py
index 6a1dcf19..3cfa6352 100644
--- a/docling/models/page_preprocessing_model.py
+++ b/docling/models/page_preprocessing_model.py
@@ -1,4 +1,5 @@
 import re
+import warnings
 from collections.abc import Iterable
 from pathlib import Path
 from typing import Optional
@@ -7,7 +8,7 @@ import numpy as np
 from PIL import ImageDraw
 from pydantic import BaseModel
 
-from docling.datamodel.base_models import Page, ScoreValue
+from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
@@ -76,11 +77,15 @@ class PagePreprocessingModel(BasePageModel):
             score = self.rate_text_quality(c.text)
             text_scores.append(score)
 
-        conv_res.confidence.pages[page.page_no].parse_score = float(
-            np.nanquantile(
-                text_scores, q=0.10
-            )  # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
-        )
+        with warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore", "Mean of empty slice", RuntimeWarning, "numpy"
+            )
+            conv_res.confidence.pages[page.page_no].parse_score = float(
+                np.nanquantile(
+                    text_scores, q=0.10
+                )  # To emphasise problems in the parse_score, we take the 10th percentile of all text-cell scores.
+            )
 
         # DEBUG code:
         def draw_text_boxes(image, cells, show: bool = False):
diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py
index 4269900c..88317fd3 100644
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@@ -8,7 +8,7 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
 
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
-from docling.datamodel.base_models import AssembledUnit, Page, PageConfidenceScores
+from docling.datamodel.base_models import AssembledUnit, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.datamodel.settings import settings
@@ -55,11 +55,13 @@ class StandardPdfPipeline(PaginatedPipeline):
                 "When defined, it must point to a folder containing all models required by the pipeline."
             )
 
-        self.keep_images = (
-            self.pipeline_options.generate_page_images
-            or self.pipeline_options.generate_picture_images
-            or self.pipeline_options.generate_table_images
-        )
+        with warnings.catch_warnings():  # deprecated generate_table_images
+            warnings.filterwarnings("ignore", category=DeprecationWarning)
+            self.keep_images = (
+                self.pipeline_options.generate_page_images
+                or self.pipeline_options.generate_picture_images
+                or self.pipeline_options.generate_table_images
+            )
 
         self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
 
@@ -210,64 +212,74 @@ class StandardPdfPipeline(PaginatedPipeline):
                     )
 
             # Generate images of the requested element types
-            if (
-                self.pipeline_options.generate_picture_images
-                or self.pipeline_options.generate_table_images
-            ):
-                scale = self.pipeline_options.images_scale
-                for element, _level in conv_res.document.iterate_items():
-                    if not isinstance(element, DocItem) or len(element.prov) == 0:
-                        continue
-                    if (
-                        isinstance(element, PictureItem)
-                        and self.pipeline_options.generate_picture_images
-                    ) or (
-                        isinstance(element, TableItem)
-                        and self.pipeline_options.generate_table_images
-                    ):
-                        page_ix = element.prov[0].page_no - 1
-                        page = next(
-                            (p for p in conv_res.pages if p.page_no == page_ix),
-                            cast("Page", None),
-                        )
-                        assert page is not None
-                        assert page.size is not None
-                        assert page.image is not None
+            with warnings.catch_warnings():  # deprecated generate_table_images
+                warnings.filterwarnings("ignore", category=DeprecationWarning)
+                if (
+                    self.pipeline_options.generate_picture_images
+                    or self.pipeline_options.generate_table_images
+                ):
+                    scale = self.pipeline_options.images_scale
+                    for element, _level in conv_res.document.iterate_items():
+                        if not isinstance(element, DocItem) or len(element.prov) == 0:
+                            continue
+                        if (
+                            isinstance(element, PictureItem)
+                            and self.pipeline_options.generate_picture_images
+                        ) or (
+                            isinstance(element, TableItem)
+                            and self.pipeline_options.generate_table_images
+                        ):
+                            page_ix = element.prov[0].page_no - 1
+                            page = next(
+                                (p for p in conv_res.pages if p.page_no == page_ix),
+                                cast("Page", None),
+                            )
+                            assert page is not None
+                            assert page.size is not None
+                            assert page.image is not None
 
-                        crop_bbox = (
-                            element.prov[0]
-                            .bbox.scaled(scale=scale)
-                            .to_top_left_origin(page_height=page.size.height * scale)
-                        )
+                            crop_bbox = (
+                                element.prov[0]
+                                .bbox.scaled(scale=scale)
+                                .to_top_left_origin(
+                                    page_height=page.size.height * scale
+                                )
+                            )
 
-                        cropped_im = page.image.crop(crop_bbox.as_tuple())
-                        element.image = ImageRef.from_pil(
-                            cropped_im, dpi=int(72 * scale)
-                        )
+                            cropped_im = page.image.crop(crop_bbox.as_tuple())
+                            element.image = ImageRef.from_pil(
+                                cropped_im, dpi=int(72 * scale)
+                            )
 
             # Aggregate confidence values for document:
             if len(conv_res.pages) > 0:
-                conv_res.confidence.layout_score = float(
-                    np.nanmean(
-                        [c.layout_score for c in conv_res.confidence.pages.values()]
+                with warnings.catch_warnings():
+                    warnings.filterwarnings(
+                        "ignore",
+                        category=RuntimeWarning,
+                        message="Mean of empty slice|All-NaN slice encountered",
                     )
-                )
-                conv_res.confidence.parse_score = float(
-                    np.nanquantile(
-                        [c.parse_score for c in conv_res.confidence.pages.values()],
-                        q=0.1,  # parse score should relate to worst 10% of pages.
+                    conv_res.confidence.layout_score = float(
+                        np.nanmean(
+                            [c.layout_score for c in conv_res.confidence.pages.values()]
+                        )
                     )
-                )
-                conv_res.confidence.table_score = float(
-                    np.nanmean(
-                        [c.table_score for c in conv_res.confidence.pages.values()]
+                    conv_res.confidence.parse_score = float(
+                        np.nanquantile(
+                            [c.parse_score for c in conv_res.confidence.pages.values()],
+                            q=0.1,  # parse score should relate to worst 10% of pages.
+                        )
                     )
-                )
-                conv_res.confidence.ocr_score = float(
-                    np.nanmean(
-                        [c.ocr_score for c in conv_res.confidence.pages.values()]
+                    conv_res.confidence.table_score = float(
+                        np.nanmean(
+                            [c.table_score for c in conv_res.confidence.pages.values()]
+                        )
+                    )
+                    conv_res.confidence.ocr_score = float(
+                        np.nanmean(
+                            [c.ocr_score for c in conv_res.confidence.pages.values()]
+                        )
                     )
-                )
 
         return conv_res
 
diff --git a/tests/test_backend_csv.py b/tests/test_backend_csv.py
index d929ae19..f7b5d309 100644
--- a/tests/test_backend_csv.py
+++ b/tests/test_backend_csv.py
@@ -39,8 +39,15 @@ def test_e2e_valid_csv_conversions():
         print(f"converting {csv_path}")
 
         gt_path = csv_path.parent.parent / "groundtruth" / "docling_v2" / csv_path.name
-
-        conv_result: ConversionResult = converter.convert(csv_path)
+        if csv_path.stem in (
+            "csv-too-few-columns",
+            "csv-too-many-columns",
+            "csv-inconsistent-header",
+        ):
+            with warns(UserWarning, match="Inconsistent column lengths"):
+                conv_result: ConversionResult = converter.convert(csv_path)
+        else:
+            conv_result: ConversionResult = converter.convert(csv_path)
 
         doc: DoclingDocument = conv_result.document
 
diff --git a/tests/test_legacy_format_transform.py b/tests/test_legacy_format_transform.py
index caef8ffc..73c73c5b 100644
--- a/tests/test_legacy_format_transform.py
+++ b/tests/test_legacy_format_transform.py
@@ -38,17 +38,15 @@ def get_converter():
 
 def test_compare_legacy_output(test_doc_paths):
     converter = get_converter()
-
     res = converter.convert_all(test_doc_paths, raises_on_error=True)
-
     for conv_res in res:
         print(f"Results for {conv_res.input.file}")
-        print(
-            json.dumps(
-                conv_res.legacy_document.model_dump(
-                    mode="json", by_alias=True, exclude_none=True
+        with pytest.warns(DeprecationWarning, match="Use document instead"):
+            print(
+                json.dumps(
+                    conv_res.legacy_document.model_dump(
+                        mode="json", by_alias=True, exclude_none=True
+                    )
                 )
             )
-        )
-
     # assert res.legacy_output == res.legacy_output_transformed
diff --git a/tests/verify_utils.py b/tests/verify_utils.py
index 46a46ace..0db53502 100644
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@@ -4,6 +4,7 @@ import warnings
 from pathlib import Path
 from typing import List, Optional
 
+import pytest
 from docling_core.types.doc import (
     DocItem,
     DoclingDocument,
@@ -302,9 +303,8 @@ def verify_conversion_result_v1(
     )
 
     doc_pred_pages: List[Page] = doc_result.pages
-    doc_pred: DsDocument = doc_result.legacy_document
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore", DeprecationWarning)
+    with pytest.warns(DeprecationWarning, match="Use document instead"):
+        doc_pred: DsDocument = doc_result.legacy_document
         doc_pred_md = doc_result.legacy_document.export_to_markdown()
         doc_pred_dt = doc_result.legacy_document.export_to_document_tokens()
 
@@ -391,7 +391,7 @@ def verify_conversion_result_v2(
     doc_pred_pages: List[Page] = doc_result.pages
     doc_pred: DoclingDocument = doc_result.document
     doc_pred_md = doc_result.document.export_to_markdown()
-    doc_pred_dt = doc_result.document.export_to_document_tokens()
+    doc_pred_dt = doc_result.document.export_to_doctags()
 
     engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"