chore: fix or catch deprecation warnings

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Cesar Berrospi Ramis 2025-05-26 05:47:57 +02:00
parent 106951e71e
commit 53ffc565ca
6 changed files with 113 additions and 81 deletions


@@ -185,12 +185,22 @@ class LayoutModel(BasePageModel):
                 ).postprocess()
                 # processed_clusters, processed_cells = clusters, page.cells
 
+                with warnings.catch_warnings():
+                    warnings.filterwarnings(
+                        "ignore",
+                        "Mean of empty slice|invalid value encountered in scalar divide",
+                        RuntimeWarning,
+                        "numpy",
+                    )
-                conv_res.confidence.pages[page.page_no].layout_score = float(
-                    np.mean([c.confidence for c in processed_clusters])
-                )
-                conv_res.confidence.pages[page.page_no].ocr_score = float(
-                    np.mean([c.confidence for c in processed_cells if c.from_ocr])
-                )
+                    conv_res.confidence.pages[page.page_no].layout_score = float(
+                        np.mean([c.confidence for c in processed_clusters])
+                    )
+                    conv_res.confidence.pages[page.page_no].ocr_score = float(
+                        np.mean(
+                            [c.confidence for c in processed_cells if c.from_ocr]
+                        )
+                    )
 
                 page.cells = processed_cells
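
Note: the filter exists because a page can end up with no layout clusters or no OCR cells, in which case np.mean runs over an empty sequence, returns nan, and emits RuntimeWarnings instead of raising. Below is a minimal sketch of that behaviour and of the suppression pattern used in the hunk; the wording of the second warning ("invalid value encountered in scalar divide") is what recent NumPy releases emit, so treat the message regex as somewhat version-dependent.

import warnings

import numpy as np

# Mean of an empty sequence: NumPy returns nan and warns instead of raising.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    print(np.mean([]))  # nan
print([str(w.message) for w in caught])
# typically: ['Mean of empty slice.', 'invalid value encountered in scalar divide']

# The pattern from the hunk: ignore exactly those messages, only for warnings
# raised from numpy modules, and only inside this block.
confidences = []  # e.g. no OCR cells were produced for this page
with warnings.catch_warnings():
    warnings.filterwarnings(
        "ignore",
        "Mean of empty slice|invalid value encountered in scalar divide",
        RuntimeWarning,
        "numpy",
    )
    ocr_score = float(np.mean(confidences))  # nan, with the warnings filtered out
print(ocr_score)

The positional arguments to warnings.filterwarnings are action, message, category, and module; message and module are regular expressions matched against the start of the warning text and of the issuing module's name, so "numpy" scopes the filter to warnings coming from NumPy's own modules.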


@@ -1,4 +1,5 @@
 import re
+import warnings
 from collections.abc import Iterable
 from pathlib import Path
 from typing import Optional
@@ -7,7 +8,7 @@ import numpy as np
 from PIL import ImageDraw
 from pydantic import BaseModel
 
-from docling.datamodel.base_models import Page, ScoreValue
+from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
@@ -76,6 +77,10 @@ class PagePreprocessingModel(BasePageModel):
                     score = self.rate_text_quality(c.text)
                     text_scores.append(score)
 
+                with warnings.catch_warnings():
+                    warnings.filterwarnings(
+                        "ignore", "Mean of empty slice", RuntimeWarning, "numpy"
+                    )
-                conv_res.confidence.pages[page.page_no].parse_score = float(
-                    np.nanquantile(
-                        text_scores, q=0.10
+                    conv_res.confidence.pages[page.page_no].parse_score = float(
+                        np.nanquantile(
+                            text_scores, q=0.10


@@ -8,7 +8,7 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
-from docling.datamodel.base_models import AssembledUnit, Page, PageConfidenceScores
+from docling.datamodel.base_models import AssembledUnit, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.datamodel.settings import settings
@@ -55,6 +55,8 @@ class StandardPdfPipeline(PaginatedPipeline):
                 "When defined, it must point to a folder containing all models required by the pipeline."
             )
 
+        with warnings.catch_warnings():  # deprecated generate_table_images
+            warnings.filterwarnings("ignore", category=DeprecationWarning)
-        self.keep_images = (
-            self.pipeline_options.generate_page_images
-            or self.pipeline_options.generate_picture_images
+            self.keep_images = (
+                self.pipeline_options.generate_page_images
+                or self.pipeline_options.generate_picture_images
@@ -210,6 +212,8 @@ class StandardPdfPipeline(PaginatedPipeline):
             )
 
         # Generate images of the requested element types
+        with warnings.catch_warnings():  # deprecated generate_table_images
+            warnings.filterwarnings("ignore", category=DeprecationWarning)
-        if (
-            self.pipeline_options.generate_picture_images
-            or self.pipeline_options.generate_table_images
+            if (
+                self.pipeline_options.generate_picture_images
+                or self.pipeline_options.generate_table_images
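
Note: per their inline comments, both catch_warnings blocks above are there because the wrapped expressions read the deprecated generate_table_images option, and the pipeline should not surface a DeprecationWarning for its own internal reads. A minimal sketch of that pattern, using a hypothetical Options class whose deprecated property warns on access (not docling's actual implementation):

import warnings


class Options:
    """Hypothetical stand-in for a pipeline options object."""

    generate_picture_images: bool = False

    @property
    def generate_table_images(self) -> bool:
        # Deprecated flag: reading it emits a DeprecationWarning.
        warnings.warn(
            "generate_table_images is deprecated", DeprecationWarning, stacklevel=2
        )
        return False


opts = Options()

# Internal read: silence the DeprecationWarning only inside this block.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    keep_images = opts.generate_picture_images or opts.generate_table_images

print(keep_images)  # False, and no warning was shown

Because catch_warnings() restores the previous filter state on exit, code that touches the deprecated option outside such a block still sees the warning, subject to the usual default filters for DeprecationWarning.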
@@ -237,7 +241,9 @@
                     crop_bbox = (
                         element.prov[0]
                         .bbox.scaled(scale=scale)
-                        .to_top_left_origin(page_height=page.size.height * scale)
+                        .to_top_left_origin(
+                            page_height=page.size.height * scale
+                        )
                     )
 
                     cropped_im = page.image.crop(crop_bbox.as_tuple())
@@ -247,6 +253,12 @@
 
         # Aggregate confidence values for document:
         if len(conv_res.pages) > 0:
+            with warnings.catch_warnings():
+                warnings.filterwarnings(
+                    "ignore",
+                    category=RuntimeWarning,
+                    message="Mean of empty slice|All-NaN slice encountered",
+                )
-            conv_res.confidence.layout_score = float(
-                np.nanmean(
-                    [c.layout_score for c in conv_res.confidence.pages.values()]
+                conv_res.confidence.layout_score = float(
+                    np.nanmean(
+                        [c.layout_score for c in conv_res.confidence.pages.values()]
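
Note: the document-level aggregation runs np.nanmean over per-page scores, so the messages to silence differ slightly from the per-page case: an all-NaN input makes np.nanmean warn "Mean of empty slice", and the nan-quantile style reductions warn "All-NaN slice encountered", which is what the keyword-form filter above matches. A small illustration (warning texts as emitted by recent NumPy):

import warnings

import numpy as np

scores = [np.nan, np.nan]  # e.g. pages for which no confidence could be computed

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    print(np.nanmean(scores))           # nan, warns "Mean of empty slice"
    print(np.nanquantile(scores, 0.1))  # nan, warns "All-NaN slice encountered"

print([str(w.message) for w in caught])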


@@ -39,7 +39,14 @@ def test_e2e_valid_csv_conversions():
         print(f"converting {csv_path}")
         gt_path = csv_path.parent.parent / "groundtruth" / "docling_v2" / csv_path.name
 
+        if csv_path.stem in (
+            "csv-too-few-columns",
+            "csv-too-many-columns",
+            "csv-inconsistent-header",
+        ):
+            with warns(UserWarning, match="Inconsistent column lengths"):
+                conv_result: ConversionResult = converter.convert(csv_path)
+        else:
-        conv_result: ConversionResult = converter.convert(csv_path)
+            conv_result: ConversionResult = converter.convert(csv_path)
 
         doc: DoclingDocument = conv_result.document
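
Note: the three malformed CSV fixtures now make the converter emit a UserWarning, and the test asserts it with pytest's warns context manager instead of letting it leak into the test output. A minimal, self-contained sketch of the assertion pattern; pad_rows is a made-up stand-in, not docling's CSV backend:

import warnings

import pytest


def pad_rows(rows):
    """Hypothetical CSV-normalisation step that pads ragged rows and warns about them."""
    width = max(len(r) for r in rows)
    if any(len(r) != width for r in rows):
        warnings.warn("Inconsistent column lengths in the CSV input", UserWarning)
    return [r + [""] * (width - len(r)) for r in rows]


def test_ragged_rows_warn():
    # pytest.warns fails the test if no matching warning is raised;
    # `match` is a regular expression searched in the warning message.
    with pytest.warns(UserWarning, match="Inconsistent column lengths"):
        pad_rows([["a", "b"], ["c"]])

The same context-manager form appears again in the test changes below, asserting the DeprecationWarning raised by legacy_document with match="Use document instead".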


@@ -38,11 +38,10 @@ def get_converter():
 def test_compare_legacy_output(test_doc_paths):
     converter = get_converter()
     res = converter.convert_all(test_doc_paths, raises_on_error=True)
 
     for conv_res in res:
         print(f"Results for {conv_res.input.file}")
-        print(
-            json.dumps(
-                conv_res.legacy_document.model_dump(
+        with pytest.warns(DeprecationWarning, match="Use document instead"):
+            print(
+                json.dumps(
+                    conv_res.legacy_document.model_dump(
@@ -50,5 +49,4 @@ def test_compare_legacy_output(test_doc_paths):
                 )
             )
         )
 
-    # assert res.legacy_output == res.legacy_output_transformed


@@ -4,6 +4,7 @@ import warnings
 from pathlib import Path
 from typing import List, Optional
 
+import pytest
 from docling_core.types.doc import (
     DocItem,
     DoclingDocument,
@@ -302,9 +303,8 @@ def verify_conversion_result_v1(
     )
 
     doc_pred_pages: List[Page] = doc_result.pages
-    doc_pred: DsDocument = doc_result.legacy_document
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore", DeprecationWarning)
+    with pytest.warns(DeprecationWarning, match="Use document instead"):
+        doc_pred: DsDocument = doc_result.legacy_document
         doc_pred_md = doc_result.legacy_document.export_to_markdown()
         doc_pred_dt = doc_result.legacy_document.export_to_document_tokens()
@@ -391,7 +391,7 @@ def verify_conversion_result_v2(
     doc_pred_pages: List[Page] = doc_result.pages
     doc_pred: DoclingDocument = doc_result.document
     doc_pred_md = doc_result.document.export_to_markdown()
-    doc_pred_dt = doc_result.document.export_to_document_tokens()
+    doc_pred_dt = doc_result.document.export_to_doctags()
 
     engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"