chore: fix or catch deprecation warnings

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Cesar Berrospi Ramis 2025-05-26 05:47:57 +02:00
parent 106951e71e
commit 53ffc565ca
6 changed files with 113 additions and 81 deletions


@@ -185,12 +185,22 @@ class LayoutModel(BasePageModel):
                 ).postprocess()
                 # processed_clusters, processed_cells = clusters, page.cells
 
+                with warnings.catch_warnings():
+                    warnings.filterwarnings(
+                        "ignore",
+                        "Mean of empty slice|invalid value encountered in scalar divide",
+                        RuntimeWarning,
+                        "numpy",
+                    )
-                conv_res.confidence.pages[page.page_no].layout_score = float(
-                    np.mean([c.confidence for c in processed_clusters])
-                )
-                conv_res.confidence.pages[page.page_no].ocr_score = float(
-                    np.mean([c.confidence for c in processed_cells if c.from_ocr])
-                )
+                    conv_res.confidence.pages[page.page_no].layout_score = float(
+                        np.mean([c.confidence for c in processed_clusters])
+                    )
+                    conv_res.confidence.pages[page.page_no].ocr_score = float(
+                        np.mean(
+                            [c.confidence for c in processed_cells if c.from_ocr]
+                        )
+                    )
 
                 page.cells = processed_cells
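
Note: the filter exists because a page can end up with no layout clusters or no OCR cells, in which case np.mean runs over an empty sequence, returns nan, and emits RuntimeWarnings instead of raising. Below is a minimal sketch of that behaviour and of the suppression pattern used in the hunk; the wording of the second warning ("invalid value encountered in scalar divide") is what recent NumPy releases emit, so treat the message regex as somewhat version-dependent.

import warnings

import numpy as np

# Mean of an empty sequence: NumPy returns nan and warns instead of raising.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    print(np.mean([]))  # nan
print([str(w.message) for w in caught])
# typically: ['Mean of empty slice.', 'invalid value encountered in scalar divide']

# The pattern from the hunk: ignore exactly those messages, only for warnings
# raised from numpy modules, and only inside this block.
confidences = []  # e.g. no OCR cells were produced for this page
with warnings.catch_warnings():
    warnings.filterwarnings(
        "ignore",
        "Mean of empty slice|invalid value encountered in scalar divide",
        RuntimeWarning,
        "numpy",
    )
    ocr_score = float(np.mean(confidences))  # nan, with the warnings filtered out
print(ocr_score)

The positional arguments to warnings.filterwarnings are action, message, category, and module; message and module are regular expressions matched against the start of the warning text and of the issuing module's name, so "numpy" scopes the filter to warnings coming from NumPy's own modules.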


@@ -1,4 +1,5 @@
 import re
+import warnings
 from collections.abc import Iterable
 from pathlib import Path
 from typing import Optional
@@ -7,7 +8,7 @@ import numpy as np
 from PIL import ImageDraw
 from pydantic import BaseModel
 
-from docling.datamodel.base_models import Page, ScoreValue
+from docling.datamodel.base_models import Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.settings import settings
 from docling.models.base_model import BasePageModel
@@ -76,6 +77,10 @@ class PagePreprocessingModel(BasePageModel):
                     score = self.rate_text_quality(c.text)
                     text_scores.append(score)
 
+                with warnings.catch_warnings():
+                    warnings.filterwarnings(
+                        "ignore", "Mean of empty slice", RuntimeWarning, "numpy"
+                    )
-                conv_res.confidence.pages[page.page_no].parse_score = float(
-                    np.nanquantile(
-                        text_scores, q=0.10
+                    conv_res.confidence.pages[page.page_no].parse_score = float(
+                        np.nanquantile(
+                            text_scores, q=0.10


@@ -8,7 +8,7 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
-from docling.datamodel.base_models import AssembledUnit, Page, PageConfidenceScores
+from docling.datamodel.base_models import AssembledUnit, Page
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import PdfPipelineOptions
 from docling.datamodel.settings import settings
@@ -55,6 +55,8 @@ class StandardPdfPipeline(PaginatedPipeline):
                 "When defined, it must point to a folder containing all models required by the pipeline."
             )
 
+        with warnings.catch_warnings():  # deprecated generate_table_images
+            warnings.filterwarnings("ignore", category=DeprecationWarning)
-        self.keep_images = (
-            self.pipeline_options.generate_page_images
-            or self.pipeline_options.generate_picture_images
+            self.keep_images = (
+                self.pipeline_options.generate_page_images
+                or self.pipeline_options.generate_picture_images
@@ -210,6 +212,8 @@ class StandardPdfPipeline(PaginatedPipeline):
             )
 
         # Generate images of the requested element types
+        with warnings.catch_warnings():  # deprecated generate_table_images
+            warnings.filterwarnings("ignore", category=DeprecationWarning)
-        if (
-            self.pipeline_options.generate_picture_images
-            or self.pipeline_options.generate_table_images
+            if (
+                self.pipeline_options.generate_picture_images
+                or self.pipeline_options.generate_table_images
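
Note: per their inline comments, both catch_warnings blocks above are there because the wrapped expressions read the deprecated generate_table_images option, and the pipeline should not surface a DeprecationWarning for its own internal reads. A minimal sketch of that pattern, using a hypothetical Options class whose deprecated property warns on access (not docling's actual implementation):

import warnings


class Options:
    """Hypothetical stand-in for a pipeline options object."""

    generate_picture_images: bool = False

    @property
    def generate_table_images(self) -> bool:
        # Deprecated flag: reading it emits a DeprecationWarning.
        warnings.warn(
            "generate_table_images is deprecated", DeprecationWarning, stacklevel=2
        )
        return False


opts = Options()

# Internal read: silence the DeprecationWarning only inside this block.
with warnings.catch_warnings():
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    keep_images = opts.generate_picture_images or opts.generate_table_images

print(keep_images)  # False, and no warning was shown

Because catch_warnings() restores the previous filter state on exit, code that touches the deprecated option outside such a block still sees the warning, subject to the usual default filters for DeprecationWarning.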
@@ -237,7 +241,9 @@
                     crop_bbox = (
                         element.prov[0]
                         .bbox.scaled(scale=scale)
-                        .to_top_left_origin(page_height=page.size.height * scale)
+                        .to_top_left_origin(
+                            page_height=page.size.height * scale
+                        )
                     )
 
                     cropped_im = page.image.crop(crop_bbox.as_tuple())
@@ -247,6 +253,12 @@
 
         # Aggregate confidence values for document:
         if len(conv_res.pages) > 0:
+            with warnings.catch_warnings():
+                warnings.filterwarnings(
+                    "ignore",
+                    category=RuntimeWarning,
+                    message="Mean of empty slice|All-NaN slice encountered",
+                )
-            conv_res.confidence.layout_score = float(
-                np.nanmean(
-                    [c.layout_score for c in conv_res.confidence.pages.values()]
+                conv_res.confidence.layout_score = float(
+                    np.nanmean(
+                        [c.layout_score for c in conv_res.confidence.pages.values()]
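
Note: the document-level aggregation runs np.nanmean over per-page scores, so the messages to silence differ slightly from the per-page case: an all-NaN input makes np.nanmean warn "Mean of empty slice", and the nan-quantile style reductions warn "All-NaN slice encountered", which is what the keyword-form filter above matches. A small illustration (warning texts as emitted by recent NumPy):

import warnings

import numpy as np

scores = [np.nan, np.nan]  # e.g. pages for which no confidence could be computed

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    print(np.nanmean(scores))           # nan, warns "Mean of empty slice"
    print(np.nanquantile(scores, 0.1))  # nan, warns "All-NaN slice encountered"

print([str(w.message) for w in caught])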


@@ -39,7 +39,14 @@ def test_e2e_valid_csv_conversions():
         print(f"converting {csv_path}")
         gt_path = csv_path.parent.parent / "groundtruth" / "docling_v2" / csv_path.name
 
+        if csv_path.stem in (
+            "csv-too-few-columns",
+            "csv-too-many-columns",
+            "csv-inconsistent-header",
+        ):
+            with warns(UserWarning, match="Inconsistent column lengths"):
+                conv_result: ConversionResult = converter.convert(csv_path)
+        else:
-        conv_result: ConversionResult = converter.convert(csv_path)
+            conv_result: ConversionResult = converter.convert(csv_path)
 
         doc: DoclingDocument = conv_result.document
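
Note: the three malformed CSV fixtures now make the converter emit a UserWarning, and the test asserts it with pytest's warns context manager instead of letting it leak into the test output. A minimal, self-contained sketch of the assertion pattern; pad_rows is a made-up stand-in, not docling's CSV backend:

import warnings

import pytest


def pad_rows(rows):
    """Hypothetical CSV-normalisation step that pads ragged rows and warns about them."""
    width = max(len(r) for r in rows)
    if any(len(r) != width for r in rows):
        warnings.warn("Inconsistent column lengths in the CSV input", UserWarning)
    return [r + [""] * (width - len(r)) for r in rows]


def test_ragged_rows_warn():
    # pytest.warns fails the test if no matching warning is raised;
    # `match` is a regular expression searched in the warning message.
    with pytest.warns(UserWarning, match="Inconsistent column lengths"):
        pad_rows([["a", "b"], ["c"]])

The same context-manager form appears again in the test changes below, asserting the DeprecationWarning raised by legacy_document with match="Use document instead".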


@@ -38,11 +38,10 @@ def get_converter():
 def test_compare_legacy_output(test_doc_paths):
     converter = get_converter()
     res = converter.convert_all(test_doc_paths, raises_on_error=True)
 
     for conv_res in res:
         print(f"Results for {conv_res.input.file}")
-        print(
-            json.dumps(
-                conv_res.legacy_document.model_dump(
+        with pytest.warns(DeprecationWarning, match="Use document instead"):
+            print(
+                json.dumps(
+                    conv_res.legacy_document.model_dump(
@@ -50,5 +49,4 @@ def test_compare_legacy_output(test_doc_paths):
                 )
             )
         )
 
-    # assert res.legacy_output == res.legacy_output_transformed


@@ -4,6 +4,7 @@ import warnings
 from pathlib import Path
 from typing import List, Optional
 
+import pytest
 from docling_core.types.doc import (
     DocItem,
     DoclingDocument,
@@ -302,9 +303,8 @@ def verify_conversion_result_v1(
     )
 
     doc_pred_pages: List[Page] = doc_result.pages
-    doc_pred: DsDocument = doc_result.legacy_document
-    with warnings.catch_warnings():
-        warnings.simplefilter("ignore", DeprecationWarning)
+    with pytest.warns(DeprecationWarning, match="Use document instead"):
+        doc_pred: DsDocument = doc_result.legacy_document
         doc_pred_md = doc_result.legacy_document.export_to_markdown()
         doc_pred_dt = doc_result.legacy_document.export_to_document_tokens()
@@ -391,7 +391,7 @@ def verify_conversion_result_v2(
     doc_pred_pages: List[Page] = doc_result.pages
     doc_pred: DoclingDocument = doc_result.document
     doc_pred_md = doc_result.document.export_to_markdown()
-    doc_pred_dt = doc_result.document.export_to_document_tokens()
+    doc_pred_dt = doc_result.document.export_to_doctags()
 
     engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"