test: update verify_utils to check CodeItem and FormulaItem (#2775)

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
Cesar Berrospi Ramis
2025-12-12 13:10:33 +01:00
committed by GitHub
parent 7c24b014f6
commit 4897092b0b

View File

@@ -1,21 +1,18 @@
import json import json
import os import os
import warnings
from pathlib import Path from pathlib import Path
from typing import List, Optional from typing import Optional
import pytest import pytest
from docling_core.types.doc import ( from docling_core.types.doc import (
CodeItem,
DocItem, DocItem,
DoclingDocument, DoclingDocument,
FormulaItem,
PictureItem, PictureItem,
TableItem, TableItem,
TextItem, TextItem,
) )
from docling_core.types.doc.base import (
BoundingBox,
PydanticSerCtxKey,
)
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
from PIL import Image as PILImage from PIL import Image as PILImage
from pydantic import BaseModel, TypeAdapter from pydantic import BaseModel, TypeAdapter
@@ -71,7 +68,7 @@ def verify_text(gt: str, pred: str, fuzzy: bool, fuzzy_threshold: float = 0.4):
def verify_cells( def verify_cells(
doc_pred_pages: List[_TestPagesMeta], doc_true_pages: List[_TestPagesMeta] doc_pred_pages: list[_TestPagesMeta], doc_true_pages: list[_TestPagesMeta]
): ):
assert len(doc_pred_pages) == len(doc_true_pages), ( assert len(doc_pred_pages) == len(doc_true_pages), (
"pred- and true-doc do not have the same number of pages" "pred- and true-doc do not have the same number of pages"
@@ -247,18 +244,14 @@ def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy:
# Validate text content # Validate text content
if isinstance(true_item, TextItem): if isinstance(true_item, TextItem):
assert isinstance(pred_item, TextItem), ( assert isinstance(pred_item, TextItem), (
"Test item is not a TextItem as the expected one " f"Test item should be a TextItem {true_item=} {pred_item=} "
f"{true_item=} "
f"{pred_item=} "
) )
assert verify_text(true_item.text, pred_item.text, fuzzy=fuzzy) assert verify_text(true_item.text, pred_item.text, fuzzy=fuzzy)
# Validate table content # Validate table content
if isinstance(true_item, TableItem): if isinstance(true_item, TableItem):
assert isinstance(pred_item, TableItem), ( assert isinstance(pred_item, TableItem), "Test item should be a TableItem"
"Test item is not a TableItem as the expected one"
)
assert verify_table_v2(true_item, pred_item, fuzzy=fuzzy), ( assert verify_table_v2(true_item, pred_item, fuzzy=fuzzy), (
"Tables not matching" "Tables not matching"
) )
@@ -266,7 +259,7 @@ def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy:
# Validate picture content # Validate picture content
if isinstance(true_item, PictureItem): if isinstance(true_item, PictureItem):
assert isinstance(pred_item, PictureItem), ( assert isinstance(pred_item, PictureItem), (
"Test item is not a PictureItem as the expected one" "Test item should be a PictureItem"
) )
true_image = true_item.get_image(doc=doc_true) true_image = true_item.get_image(doc=doc_true)
@@ -275,9 +268,19 @@ def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy:
assert verify_picture_image_v2(true_image, pred_image), ( assert verify_picture_image_v2(true_image, pred_image), (
"Picture image mismatch" "Picture image mismatch"
) )
# TODO: check picture annotations # TODO: check picture annotations
# Validate code content
if isinstance(true_item, CodeItem):
assert isinstance(pred_item, CodeItem), "Test item should be a CodeItem"
assert true_item.code_language == pred_item.code_language
# Validate formula content
if isinstance(true_item, FormulaItem):
assert isinstance(pred_item, FormulaItem), (
"Test item should be a FormulaItem"
)
return True return True
@@ -366,14 +369,14 @@ def verify_conversion_result_v2(
verify_doctags: bool = True, verify_doctags: bool = True,
indent: int = 2, indent: int = 2,
): ):
PageMetaList = TypeAdapter(List[_TestPagesMeta]) PageMetaList = TypeAdapter(list[_TestPagesMeta])
assert doc_result.status == ConversionStatus.SUCCESS, ( assert doc_result.status == ConversionStatus.SUCCESS, (
f"Doc {input_path} did not convert successfully." f"Doc {input_path} did not convert successfully."
) )
doc_pred_pages: List[Page] = doc_result.pages doc_pred_pages: list[Page] = doc_result.pages
doc_pred_pages_meta: List[_TestPagesMeta] = [ doc_pred_pages_meta: list[_TestPagesMeta] = [
_TestPagesMeta.from_page(page) for page in doc_pred_pages _TestPagesMeta.from_page(page) for page in doc_pred_pages
] ]
doc_pred: DoclingDocument = doc_result.document doc_pred: DoclingDocument = doc_result.document