ci: update docling-parse and remove pages.json (#2372)

* update docling-parse and remove pages.json

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* ocr gt

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2025-10-03 09:53:13 +02:00
committed by GitHub
parent ca2be7ff3a
commit 9505202e38
57 changed files with 1140 additions and 8850601 deletions

View File

@@ -18,7 +18,7 @@ from docling_core.types.doc.base import (
)
from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
from PIL import Image as PILImage
from pydantic import TypeAdapter
from pydantic import BaseModel, TypeAdapter
from pydantic.json import pydantic_encoder
from docling.datamodel.base_models import ConversionStatus, Page
@@ -28,6 +28,14 @@ COORD_PREC = 2 # decimal places for coordinates
CONFID_PREC = 3 # decimal places for confidence
class _TestPagesMeta(BaseModel):
    """Lightweight per-page summary used by the conversion verification helpers.

    Only the number of cells on a page is recorded.
    """

    # Number of entries in the source page's ``cells`` collection.
    num_cells: int

    @classmethod
    def from_page(cls, page: Page):
        """Build the summary for *page* by counting its ``cells``."""
        cell_count = len(page.cells)
        return cls(num_cells=cell_count)
def levenshtein(str1: str, str2: str) -> int:
# Ensure str1 is the shorter string to optimize memory usage
if len(str1) > len(str2):
@@ -62,36 +70,21 @@ def verify_text(gt: str, pred: str, fuzzy: bool, fuzzy_threshold: float = 0.4):
return True
def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]):
def verify_cells(
doc_pred_pages: List[_TestPagesMeta], doc_true_pages: List[_TestPagesMeta]
):
assert len(doc_pred_pages) == len(doc_true_pages), (
"pred- and true-doc do not have the same number of pages"
)
for pid, page_true_item in enumerate(doc_true_pages):
num_true_cells = len(page_true_item.cells)
num_pred_cells = len(doc_pred_pages[pid].cells)
num_true_cells = page_true_item.num_cells
num_pred_cells = doc_pred_pages[pid].num_cells
assert num_true_cells == num_pred_cells, (
f"num_true_cells!=num_pred_cells {num_true_cells}!={num_pred_cells}"
)
for cid, cell_true_item in enumerate(page_true_item.cells):
cell_pred_item = doc_pred_pages[pid].cells[cid]
true_text = cell_true_item.text
pred_text = cell_pred_item.text
assert true_text == pred_text, f"{true_text}!={pred_text}"
true_bbox = cell_true_item.rect.to_bounding_box().as_tuple()
norm_pred_bbox = BoundingBox.model_validate_json(
cell_pred_item.rect.to_bounding_box().model_dump_json(
context={PydanticSerCtxKey.COORD_PREC.value: COORD_PREC}
)
).as_tuple()
assert true_bbox == norm_pred_bbox, (
f"bbox is not the same: {true_bbox} != {norm_pred_bbox}"
)
return True
@@ -307,13 +300,10 @@ def verify_conversion_result_v1(
fuzzy: bool = False,
indent: int = 2,
):
PageList = TypeAdapter(List[Page])
assert doc_result.status == ConversionStatus.SUCCESS, (
f"Doc {input_path} did not convert successfully."
)
doc_pred_pages: List[Page] = doc_result.pages
with pytest.warns(DeprecationWarning, match="Use document instead"):
doc_pred: DsDocument = doc_result.legacy_document
doc_pred_md = doc_result.legacy_document.export_to_markdown()
@@ -327,18 +317,11 @@ def verify_conversion_result_v1(
input_path.parent.parent / "groundtruth" / "docling_v1" / input_path.name
)
pages_path = gt_subpath.with_suffix(f"{engine_suffix}.pages.json")
json_path = gt_subpath.with_suffix(f"{engine_suffix}.json")
md_path = gt_subpath.with_suffix(f"{engine_suffix}.md")
dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")
if generate: # only used when re-generating truth
pages_path.parent.mkdir(parents=True, exist_ok=True)
with open(pages_path, mode="w", encoding="utf-8") as fw:
fw.write(
json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
)
json_path.parent.mkdir(parents=True, exist_ok=True)
with open(json_path, mode="w", encoding="utf-8") as fw:
fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))
@@ -351,9 +334,6 @@ def verify_conversion_result_v1(
with open(dt_path, mode="w", encoding="utf-8") as fw:
fw.write(doc_pred_dt)
else: # default branch in test
with open(pages_path, encoding="utf-8") as fr:
doc_true_pages = PageList.validate_json(fr.read())
with open(json_path, encoding="utf-8") as fr:
doc_true: DsDocument = DsDocument.model_validate_json(fr.read())
@@ -363,11 +343,6 @@ def verify_conversion_result_v1(
with open(dt_path, encoding="utf-8") as fr:
doc_true_dt = fr.read()
if not fuzzy:
assert verify_cells(doc_pred_pages, doc_true_pages), (
f"Mismatch in PDF cell prediction for {input_path}"
)
# assert verify_output(
# doc_pred, doc_true
# ), f"Mismatch in JSON prediction for {input_path}"
@@ -394,13 +369,16 @@ def verify_conversion_result_v2(
verify_doctags: bool = True,
indent: int = 2,
):
PageList = TypeAdapter(List[Page])
PageMetaList = TypeAdapter(List[_TestPagesMeta])
assert doc_result.status == ConversionStatus.SUCCESS, (
f"Doc {input_path} did not convert successfully."
)
doc_pred_pages: List[Page] = doc_result.pages
doc_pred_pages_meta: List[_TestPagesMeta] = [
_TestPagesMeta.from_page(page) for page in doc_pred_pages
]
doc_pred: DoclingDocument = doc_result.document
doc_pred_md = doc_result.document.export_to_markdown()
doc_pred_dt = doc_result.document.export_to_doctags()
@@ -413,7 +391,7 @@ def verify_conversion_result_v2(
input_path.parent.parent / "groundtruth" / "docling_v2" / input_path.name
)
pages_path = gt_subpath.with_suffix(f"{engine_suffix}.pages.json")
pages_path = gt_subpath.with_suffix(f"{engine_suffix}.pages.meta.json")
json_path = gt_subpath.with_suffix(f"{engine_suffix}.json")
md_path = gt_subpath.with_suffix(f"{engine_suffix}.md")
dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")
@@ -421,18 +399,9 @@ def verify_conversion_result_v2(
if generate: # only used when re-generating truth
pages_path.parent.mkdir(parents=True, exist_ok=True)
pages_data = [
page.model_dump(
mode="json",
context={
PydanticSerCtxKey.COORD_PREC.value: COORD_PREC,
PydanticSerCtxKey.CONFID_PREC.value: CONFID_PREC,
},
)
for page in doc_pred_pages
]
pages_data = PageMetaList.dump_json(doc_pred_pages_meta, indent=2)
with open(pages_path, mode="w", encoding="utf-8") as fw:
fw.write(json.dumps(pages_data, indent=indent))
fw.write(pages_data.decode())
json_path.parent.mkdir(parents=True, exist_ok=True)
doc_pred.save_as_json(
@@ -448,7 +417,7 @@ def verify_conversion_result_v2(
fw.write(doc_pred_dt)
else: # default branch in test
with open(pages_path, encoding="utf-8") as fr:
doc_true_pages = PageList.validate_json(fr.read())
doc_true_pages_meta = PageMetaList.validate_json(fr.read())
with open(json_path, encoding="utf-8") as fr:
doc_true: DoclingDocument = DoclingDocument.model_validate_json(fr.read())
@@ -460,7 +429,7 @@ def verify_conversion_result_v2(
doc_true_dt = fr.read()
if not fuzzy:
assert verify_cells(doc_pred_pages, doc_true_pages), (
assert verify_cells(doc_pred_pages_meta, doc_true_pages_meta), (
f"Mismatch in PDF cell prediction for {input_path}"
)