mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
chore: fix or catch deprecation warnings
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
parent
106951e71e
commit
53ffc565ca
@ -185,13 +185,23 @@ class LayoutModel(BasePageModel):
|
|||||||
).postprocess()
|
).postprocess()
|
||||||
# processed_clusters, processed_cells = clusters, page.cells
|
# processed_clusters, processed_cells = clusters, page.cells
|
||||||
|
|
||||||
conv_res.confidence.pages[page.page_no].layout_score = float(
|
with warnings.catch_warnings():
|
||||||
np.mean([c.confidence for c in processed_clusters])
|
warnings.filterwarnings(
|
||||||
)
|
"ignore",
|
||||||
|
"Mean of empty slice|invalid value encountered in scalar divide",
|
||||||
|
RuntimeWarning,
|
||||||
|
"numpy",
|
||||||
|
)
|
||||||
|
|
||||||
conv_res.confidence.pages[page.page_no].ocr_score = float(
|
conv_res.confidence.pages[page.page_no].layout_score = float(
|
||||||
np.mean([c.confidence for c in processed_cells if c.from_ocr])
|
np.mean([c.confidence for c in processed_clusters])
|
||||||
)
|
)
|
||||||
|
|
||||||
|
conv_res.confidence.pages[page.page_no].ocr_score = float(
|
||||||
|
np.mean(
|
||||||
|
[c.confidence for c in processed_cells if c.from_ocr]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
page.cells = processed_cells
|
page.cells = processed_cells
|
||||||
page.predictions.layout = LayoutPrediction(
|
page.predictions.layout = LayoutPrediction(
|
||||||
|
@ -1,4 +1,5 @@
|
|||||||
import re
|
import re
|
||||||
|
import warnings
|
||||||
from collections.abc import Iterable
|
from collections.abc import Iterable
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
@ -7,7 +8,7 @@ import numpy as np
|
|||||||
from PIL import ImageDraw
|
from PIL import ImageDraw
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page, ScoreValue
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_model import BasePageModel
|
from docling.models.base_model import BasePageModel
|
||||||
@ -76,11 +77,15 @@ class PagePreprocessingModel(BasePageModel):
|
|||||||
score = self.rate_text_quality(c.text)
|
score = self.rate_text_quality(c.text)
|
||||||
text_scores.append(score)
|
text_scores.append(score)
|
||||||
|
|
||||||
conv_res.confidence.pages[page.page_no].parse_score = float(
|
with warnings.catch_warnings():
|
||||||
np.nanquantile(
|
warnings.filterwarnings(
|
||||||
text_scores, q=0.10
|
"ignore", "Mean of empty slice", RuntimeWarning, "numpy"
|
||||||
) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
|
)
|
||||||
)
|
conv_res.confidence.pages[page.page_no].parse_score = float(
|
||||||
|
np.nanquantile(
|
||||||
|
text_scores, q=0.10
|
||||||
|
) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
|
||||||
|
)
|
||||||
|
|
||||||
# DEBUG code:
|
# DEBUG code:
|
||||||
def draw_text_boxes(image, cells, show: bool = False):
|
def draw_text_boxes(image, cells, show: bool = False):
|
||||||
|
@ -8,7 +8,7 @@ from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
|||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
from docling.datamodel.base_models import AssembledUnit, Page, PageConfidenceScores
|
from docling.datamodel.base_models import AssembledUnit, Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
@ -55,11 +55,13 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
"When defined, it must point to a folder containing all models required by the pipeline."
|
"When defined, it must point to a folder containing all models required by the pipeline."
|
||||||
)
|
)
|
||||||
|
|
||||||
self.keep_images = (
|
with warnings.catch_warnings(): # deprecated generate_table_images
|
||||||
self.pipeline_options.generate_page_images
|
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
||||||
or self.pipeline_options.generate_picture_images
|
self.keep_images = (
|
||||||
or self.pipeline_options.generate_table_images
|
self.pipeline_options.generate_page_images
|
||||||
)
|
or self.pipeline_options.generate_picture_images
|
||||||
|
or self.pipeline_options.generate_table_images
|
||||||
|
)
|
||||||
|
|
||||||
self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
|
self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
|
||||||
|
|
||||||
@ -210,64 +212,74 @@ class StandardPdfPipeline(PaginatedPipeline):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# Generate images of the requested element types
|
# Generate images of the requested element types
|
||||||
if (
|
with warnings.catch_warnings(): # deprecated generate_table_images
|
||||||
self.pipeline_options.generate_picture_images
|
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
||||||
or self.pipeline_options.generate_table_images
|
if (
|
||||||
):
|
self.pipeline_options.generate_picture_images
|
||||||
scale = self.pipeline_options.images_scale
|
or self.pipeline_options.generate_table_images
|
||||||
for element, _level in conv_res.document.iterate_items():
|
):
|
||||||
if not isinstance(element, DocItem) or len(element.prov) == 0:
|
scale = self.pipeline_options.images_scale
|
||||||
continue
|
for element, _level in conv_res.document.iterate_items():
|
||||||
if (
|
if not isinstance(element, DocItem) or len(element.prov) == 0:
|
||||||
isinstance(element, PictureItem)
|
continue
|
||||||
and self.pipeline_options.generate_picture_images
|
if (
|
||||||
) or (
|
isinstance(element, PictureItem)
|
||||||
isinstance(element, TableItem)
|
and self.pipeline_options.generate_picture_images
|
||||||
and self.pipeline_options.generate_table_images
|
) or (
|
||||||
):
|
isinstance(element, TableItem)
|
||||||
page_ix = element.prov[0].page_no - 1
|
and self.pipeline_options.generate_table_images
|
||||||
page = next(
|
):
|
||||||
(p for p in conv_res.pages if p.page_no == page_ix),
|
page_ix = element.prov[0].page_no - 1
|
||||||
cast("Page", None),
|
page = next(
|
||||||
)
|
(p for p in conv_res.pages if p.page_no == page_ix),
|
||||||
assert page is not None
|
cast("Page", None),
|
||||||
assert page.size is not None
|
)
|
||||||
assert page.image is not None
|
assert page is not None
|
||||||
|
assert page.size is not None
|
||||||
|
assert page.image is not None
|
||||||
|
|
||||||
crop_bbox = (
|
crop_bbox = (
|
||||||
element.prov[0]
|
element.prov[0]
|
||||||
.bbox.scaled(scale=scale)
|
.bbox.scaled(scale=scale)
|
||||||
.to_top_left_origin(page_height=page.size.height * scale)
|
.to_top_left_origin(
|
||||||
)
|
page_height=page.size.height * scale
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
cropped_im = page.image.crop(crop_bbox.as_tuple())
|
cropped_im = page.image.crop(crop_bbox.as_tuple())
|
||||||
element.image = ImageRef.from_pil(
|
element.image = ImageRef.from_pil(
|
||||||
cropped_im, dpi=int(72 * scale)
|
cropped_im, dpi=int(72 * scale)
|
||||||
)
|
)
|
||||||
|
|
||||||
# Aggregate confidence values for document:
|
# Aggregate confidence values for document:
|
||||||
if len(conv_res.pages) > 0:
|
if len(conv_res.pages) > 0:
|
||||||
conv_res.confidence.layout_score = float(
|
with warnings.catch_warnings():
|
||||||
np.nanmean(
|
warnings.filterwarnings(
|
||||||
[c.layout_score for c in conv_res.confidence.pages.values()]
|
"ignore",
|
||||||
|
category=RuntimeWarning,
|
||||||
|
message="Mean of empty slice|All-NaN slice encountered",
|
||||||
)
|
)
|
||||||
)
|
conv_res.confidence.layout_score = float(
|
||||||
conv_res.confidence.parse_score = float(
|
np.nanmean(
|
||||||
np.nanquantile(
|
[c.layout_score for c in conv_res.confidence.pages.values()]
|
||||||
[c.parse_score for c in conv_res.confidence.pages.values()],
|
)
|
||||||
q=0.1, # parse score should relate to worst 10% of pages.
|
|
||||||
)
|
)
|
||||||
)
|
conv_res.confidence.parse_score = float(
|
||||||
conv_res.confidence.table_score = float(
|
np.nanquantile(
|
||||||
np.nanmean(
|
[c.parse_score for c in conv_res.confidence.pages.values()],
|
||||||
[c.table_score for c in conv_res.confidence.pages.values()]
|
q=0.1, # parse score should relate to worst 10% of pages.
|
||||||
|
)
|
||||||
)
|
)
|
||||||
)
|
conv_res.confidence.table_score = float(
|
||||||
conv_res.confidence.ocr_score = float(
|
np.nanmean(
|
||||||
np.nanmean(
|
[c.table_score for c in conv_res.confidence.pages.values()]
|
||||||
[c.ocr_score for c in conv_res.confidence.pages.values()]
|
)
|
||||||
|
)
|
||||||
|
conv_res.confidence.ocr_score = float(
|
||||||
|
np.nanmean(
|
||||||
|
[c.ocr_score for c in conv_res.confidence.pages.values()]
|
||||||
|
)
|
||||||
)
|
)
|
||||||
)
|
|
||||||
|
|
||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
|
@ -39,8 +39,15 @@ def test_e2e_valid_csv_conversions():
|
|||||||
print(f"converting {csv_path}")
|
print(f"converting {csv_path}")
|
||||||
|
|
||||||
gt_path = csv_path.parent.parent / "groundtruth" / "docling_v2" / csv_path.name
|
gt_path = csv_path.parent.parent / "groundtruth" / "docling_v2" / csv_path.name
|
||||||
|
if csv_path.stem in (
|
||||||
conv_result: ConversionResult = converter.convert(csv_path)
|
"csv-too-few-columns",
|
||||||
|
"csv-too-many-columns",
|
||||||
|
"csv-inconsistent-header",
|
||||||
|
):
|
||||||
|
with warns(UserWarning, match="Inconsistent column lengths"):
|
||||||
|
conv_result: ConversionResult = converter.convert(csv_path)
|
||||||
|
else:
|
||||||
|
conv_result: ConversionResult = converter.convert(csv_path)
|
||||||
|
|
||||||
doc: DoclingDocument = conv_result.document
|
doc: DoclingDocument = conv_result.document
|
||||||
|
|
||||||
|
@ -38,17 +38,15 @@ def get_converter():
|
|||||||
|
|
||||||
def test_compare_legacy_output(test_doc_paths):
|
def test_compare_legacy_output(test_doc_paths):
|
||||||
converter = get_converter()
|
converter = get_converter()
|
||||||
|
|
||||||
res = converter.convert_all(test_doc_paths, raises_on_error=True)
|
res = converter.convert_all(test_doc_paths, raises_on_error=True)
|
||||||
|
|
||||||
for conv_res in res:
|
for conv_res in res:
|
||||||
print(f"Results for {conv_res.input.file}")
|
print(f"Results for {conv_res.input.file}")
|
||||||
print(
|
with pytest.warns(DeprecationWarning, match="Use document instead"):
|
||||||
json.dumps(
|
print(
|
||||||
conv_res.legacy_document.model_dump(
|
json.dumps(
|
||||||
mode="json", by_alias=True, exclude_none=True
|
conv_res.legacy_document.model_dump(
|
||||||
|
mode="json", by_alias=True, exclude_none=True
|
||||||
|
)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
)
|
|
||||||
|
|
||||||
# assert res.legacy_output == res.legacy_output_transformed
|
# assert res.legacy_output == res.legacy_output_transformed
|
||||||
|
@ -4,6 +4,7 @@ import warnings
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
|
import pytest
|
||||||
from docling_core.types.doc import (
|
from docling_core.types.doc import (
|
||||||
DocItem,
|
DocItem,
|
||||||
DoclingDocument,
|
DoclingDocument,
|
||||||
@ -302,9 +303,8 @@ def verify_conversion_result_v1(
|
|||||||
)
|
)
|
||||||
|
|
||||||
doc_pred_pages: List[Page] = doc_result.pages
|
doc_pred_pages: List[Page] = doc_result.pages
|
||||||
doc_pred: DsDocument = doc_result.legacy_document
|
with pytest.warns(DeprecationWarning, match="Use document instead"):
|
||||||
with warnings.catch_warnings():
|
doc_pred: DsDocument = doc_result.legacy_document
|
||||||
warnings.simplefilter("ignore", DeprecationWarning)
|
|
||||||
doc_pred_md = doc_result.legacy_document.export_to_markdown()
|
doc_pred_md = doc_result.legacy_document.export_to_markdown()
|
||||||
doc_pred_dt = doc_result.legacy_document.export_to_document_tokens()
|
doc_pred_dt = doc_result.legacy_document.export_to_document_tokens()
|
||||||
|
|
||||||
@ -391,7 +391,7 @@ def verify_conversion_result_v2(
|
|||||||
doc_pred_pages: List[Page] = doc_result.pages
|
doc_pred_pages: List[Page] = doc_result.pages
|
||||||
doc_pred: DoclingDocument = doc_result.document
|
doc_pred: DoclingDocument = doc_result.document
|
||||||
doc_pred_md = doc_result.document.export_to_markdown()
|
doc_pred_md = doc_result.document.export_to_markdown()
|
||||||
doc_pred_dt = doc_result.document.export_to_document_tokens()
|
doc_pred_dt = doc_result.document.export_to_doctags()
|
||||||
|
|
||||||
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
|
engine_suffix = "" if ocr_engine is None else f".{ocr_engine}"
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user