From 53ffc565ca5205e1287e51a5a2864160685cc9fa Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Mon, 26 May 2025 05:47:57 +0200 Subject: [PATCH] chore: fix or catch deprecation warnings Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/models/layout_model.py | 22 +++- docling/models/page_preprocessing_model.py | 17 ++- docling/pipeline/standard_pdf_pipeline.py | 122 +++++++++++---------- tests/test_backend_csv.py | 11 +- tests/test_legacy_format_transform.py | 14 +-- tests/verify_utils.py | 8 +- 6 files changed, 113 insertions(+), 81 deletions(-) diff --git a/docling/models/layout_model.py b/docling/models/layout_model.py index e2abb373..03a047fa 100644 --- a/docling/models/layout_model.py +++ b/docling/models/layout_model.py @@ -185,13 +185,23 @@ class LayoutModel(BasePageModel): ).postprocess() # processed_clusters, processed_cells = clusters, page.cells - conv_res.confidence.pages[page.page_no].layout_score = float( - np.mean([c.confidence for c in processed_clusters]) - ) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + "Mean of empty slice|invalid value encountered in scalar divide", + RuntimeWarning, + "numpy", + ) - conv_res.confidence.pages[page.page_no].ocr_score = float( - np.mean([c.confidence for c in processed_cells if c.from_ocr]) - ) + conv_res.confidence.pages[page.page_no].layout_score = float( + np.mean([c.confidence for c in processed_clusters]) + ) + + conv_res.confidence.pages[page.page_no].ocr_score = float( + np.mean( + [c.confidence for c in processed_cells if c.from_ocr] + ) + ) page.cells = processed_cells page.predictions.layout = LayoutPrediction( diff --git a/docling/models/page_preprocessing_model.py b/docling/models/page_preprocessing_model.py index 6a1dcf19..3cfa6352 100644 --- a/docling/models/page_preprocessing_model.py +++ b/docling/models/page_preprocessing_model.py @@ -1,4 +1,5 @@ import re +import warnings from collections.abc import Iterable from pathlib import Path from typing import Optional @@ -7,7 +8,7 @@ import numpy as np from PIL import ImageDraw from pydantic import BaseModel -from docling.datamodel.base_models import Page, ScoreValue +from docling.datamodel.base_models import Page from docling.datamodel.document import ConversionResult from docling.datamodel.settings import settings from docling.models.base_model import BasePageModel @@ -76,11 +77,15 @@ class PagePreprocessingModel(BasePageModel): score = self.rate_text_quality(c.text) text_scores.append(score) - conv_res.confidence.pages[page.page_no].parse_score = float( - np.nanquantile( - text_scores, q=0.10 - ) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells. - ) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", "Mean of empty slice", RuntimeWarning, "numpy" + ) + conv_res.confidence.pages[page.page_no].parse_score = float( + np.nanquantile( + text_scores, q=0.10 + ) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells. + ) # DEBUG code: def draw_text_boxes(image, cells, show: bool = False): diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py index 4269900c..88317fd3 100644 --- a/docling/pipeline/standard_pdf_pipeline.py +++ b/docling/pipeline/standard_pdf_pipeline.py @@ -8,7 +8,7 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend -from docling.datamodel.base_models import AssembledUnit, Page, PageConfidenceScores +from docling.datamodel.base_models import AssembledUnit, Page from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import PdfPipelineOptions from docling.datamodel.settings import settings @@ -55,11 +55,13 @@ class StandardPdfPipeline(PaginatedPipeline): "When defined, it must point to a folder containing all models required by the pipeline." ) - self.keep_images = ( - self.pipeline_options.generate_page_images - or self.pipeline_options.generate_picture_images - or self.pipeline_options.generate_table_images - ) + with warnings.catch_warnings(): # deprecated generate_table_images + warnings.filterwarnings("ignore", category=DeprecationWarning) + self.keep_images = ( + self.pipeline_options.generate_page_images + or self.pipeline_options.generate_picture_images + or self.pipeline_options.generate_table_images + ) self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions()) @@ -210,64 +212,74 @@ class StandardPdfPipeline(PaginatedPipeline): ) # Generate images of the requested element types - if ( - self.pipeline_options.generate_picture_images - or self.pipeline_options.generate_table_images - ): - scale = self.pipeline_options.images_scale - for element, _level in conv_res.document.iterate_items(): - if not isinstance(element, DocItem) or len(element.prov) == 0: - continue - if ( - isinstance(element, PictureItem) - and self.pipeline_options.generate_picture_images - ) or ( - isinstance(element, TableItem) - and self.pipeline_options.generate_table_images - ): - page_ix = element.prov[0].page_no - 1 - page = next( - (p for p in conv_res.pages if p.page_no == page_ix), - cast("Page", None), - ) - assert page is not None - assert page.size is not None - assert page.image is not None + with warnings.catch_warnings(): # deprecated generate_table_images + warnings.filterwarnings("ignore", category=DeprecationWarning) + if ( + self.pipeline_options.generate_picture_images + or self.pipeline_options.generate_table_images + ): + scale = self.pipeline_options.images_scale + for element, _level in conv_res.document.iterate_items(): + if not isinstance(element, DocItem) or len(element.prov) == 0: + continue + if ( + isinstance(element, PictureItem) + and self.pipeline_options.generate_picture_images + ) or ( + isinstance(element, TableItem) + and self.pipeline_options.generate_table_images + ): + page_ix = element.prov[0].page_no - 1 + page = next( + (p for p in conv_res.pages if p.page_no == page_ix), + cast("Page", None), + ) + assert page is not None + assert page.size is not None + assert page.image is not None - crop_bbox = ( - element.prov[0] - .bbox.scaled(scale=scale) - .to_top_left_origin(page_height=page.size.height * scale) - ) + crop_bbox = ( + element.prov[0] + .bbox.scaled(scale=scale) + .to_top_left_origin( + page_height=page.size.height * scale + ) + ) - cropped_im = page.image.crop(crop_bbox.as_tuple()) - element.image = ImageRef.from_pil( - cropped_im, dpi=int(72 * scale) - ) + cropped_im = page.image.crop(crop_bbox.as_tuple()) + element.image = ImageRef.from_pil( + cropped_im, dpi=int(72 * scale) + ) # Aggregate confidence values for document: if len(conv_res.pages) > 0: - conv_res.confidence.layout_score = float( - np.nanmean( - [c.layout_score for c in conv_res.confidence.pages.values()] + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + category=RuntimeWarning, + message="Mean of empty slice|All-NaN slice encountered", ) - ) - conv_res.confidence.parse_score = float( - np.nanquantile( - [c.parse_score for c in conv_res.confidence.pages.values()], - q=0.1, # parse score should relate to worst 10% of pages. + conv_res.confidence.layout_score = float( + np.nanmean( + [c.layout_score for c in conv_res.confidence.pages.values()] + ) ) - ) - conv_res.confidence.table_score = float( - np.nanmean( - [c.table_score for c in conv_res.confidence.pages.values()] + conv_res.confidence.parse_score = float( + np.nanquantile( + [c.parse_score for c in conv_res.confidence.pages.values()], + q=0.1, # parse score should relate to worst 10% of pages. + ) ) - ) - conv_res.confidence.ocr_score = float( - np.nanmean( - [c.ocr_score for c in conv_res.confidence.pages.values()] + conv_res.confidence.table_score = float( + np.nanmean( + [c.table_score for c in conv_res.confidence.pages.values()] + ) + ) + conv_res.confidence.ocr_score = float( + np.nanmean( + [c.ocr_score for c in conv_res.confidence.pages.values()] + ) ) - ) return conv_res diff --git a/tests/test_backend_csv.py b/tests/test_backend_csv.py index d929ae19..f7b5d309 100644 --- a/tests/test_backend_csv.py +++ b/tests/test_backend_csv.py @@ -39,8 +39,15 @@ def test_e2e_valid_csv_conversions(): print(f"converting {csv_path}") gt_path = csv_path.parent.parent / "groundtruth" / "docling_v2" / csv_path.name - - conv_result: ConversionResult = converter.convert(csv_path) + if csv_path.stem in ( + "csv-too-few-columns", + "csv-too-many-columns", + "csv-inconsistent-header", + ): + with warns(UserWarning, match="Inconsistent column lengths"): + conv_result: ConversionResult = converter.convert(csv_path) + else: + conv_result: ConversionResult = converter.convert(csv_path) doc: DoclingDocument = conv_result.document diff --git a/tests/test_legacy_format_transform.py b/tests/test_legacy_format_transform.py index caef8ffc..73c73c5b 100644 --- a/tests/test_legacy_format_transform.py +++ b/tests/test_legacy_format_transform.py @@ -38,17 +38,15 @@ def get_converter(): def test_compare_legacy_output(test_doc_paths): converter = get_converter() - res = converter.convert_all(test_doc_paths, raises_on_error=True) - for conv_res in res: print(f"Results for {conv_res.input.file}") - print( - json.dumps( - conv_res.legacy_document.model_dump( - mode="json", by_alias=True, exclude_none=True + with pytest.warns(DeprecationWarning, match="Use document instead"): + print( + json.dumps( + conv_res.legacy_document.model_dump( + mode="json", by_alias=True, exclude_none=True + ) ) ) - ) - # assert res.legacy_output == res.legacy_output_transformed diff --git a/tests/verify_utils.py b/tests/verify_utils.py index 46a46ace..0db53502 100644 --- a/tests/verify_utils.py +++ b/tests/verify_utils.py @@ -4,6 +4,7 @@ import warnings from pathlib import Path from typing import List, Optional +import pytest from docling_core.types.doc import ( DocItem, DoclingDocument, @@ -302,9 +303,8 @@ def verify_conversion_result_v1( ) doc_pred_pages: List[Page] = doc_result.pages - doc_pred: DsDocument = doc_result.legacy_document - with warnings.catch_warnings(): - warnings.simplefilter("ignore", DeprecationWarning) + with pytest.warns(DeprecationWarning, match="Use document instead"): + doc_pred: DsDocument = doc_result.legacy_document doc_pred_md = doc_result.legacy_document.export_to_markdown() doc_pred_dt = doc_result.legacy_document.export_to_document_tokens() @@ -391,7 +391,7 @@ def verify_conversion_result_v2( doc_pred_pages: List[Page] = doc_result.pages doc_pred: DoclingDocument = doc_result.document doc_pred_md = doc_result.document.export_to_markdown() - doc_pred_dt = doc_result.document.export_to_document_tokens() + doc_pred_dt = doc_result.document.export_to_doctags() engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"