ci: update docling-parse and remove pages.json (#2372)

* update docling-parse and remove pages.json Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> * ocr gt Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> --------- Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
2025-12-08 20:58:11 +00:00 · 2025-10-03 09:53:13 +02:00
parent ca2be7ff3a
commit 9505202e38
57 changed files with 1140 additions and 8850601 deletions
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@@ -18,7 +18,7 @@ from docling_core.types.doc.base import (
 )
 from docling_core.types.legacy_doc.document import ExportedCCSDocument as DsDocument
 from PIL import Image as PILImage
-from pydantic import TypeAdapter
+from pydantic import BaseModel, TypeAdapter
 from pydantic.json import pydantic_encoder

 from docling.datamodel.base_models import ConversionStatus, Page
@@ -28,6 +28,14 @@ COORD_PREC = 2  # decimal places for coordinates
 CONFID_PREC = 3  # decimal places for confidence


+class _TestPagesMeta(BaseModel):
+    num_cells: int
+
+    @classmethod
+    def from_page(cls, page: Page):
+        return cls(num_cells=len(page.cells))
+
+
 def levenshtein(str1: str, str2: str) -> int:
    # Ensure str1 is the shorter string to optimize memory usage
    if len(str1) > len(str2):
@@ -62,36 +70,21 @@ def verify_text(gt: str, pred: str, fuzzy: bool, fuzzy_threshold: float = 0.4):
    return True


-def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]):
+def verify_cells(
+    doc_pred_pages: List[_TestPagesMeta], doc_true_pages: List[_TestPagesMeta]
+):
    assert len(doc_pred_pages) == len(doc_true_pages), (
        "pred- and true-doc do not have the same number of pages"
    )

    for pid, page_true_item in enumerate(doc_true_pages):
-        num_true_cells = len(page_true_item.cells)
-        num_pred_cells = len(doc_pred_pages[pid].cells)
+        num_true_cells = page_true_item.num_cells
+        num_pred_cells = doc_pred_pages[pid].num_cells

        assert num_true_cells == num_pred_cells, (
            f"num_true_cells!=num_pred_cells {num_true_cells}!={num_pred_cells}"
        )

-        for cid, cell_true_item in enumerate(page_true_item.cells):
-            cell_pred_item = doc_pred_pages[pid].cells[cid]
-
-            true_text = cell_true_item.text
-            pred_text = cell_pred_item.text
-            assert true_text == pred_text, f"{true_text}!={pred_text}"
-
-            true_bbox = cell_true_item.rect.to_bounding_box().as_tuple()
-            norm_pred_bbox = BoundingBox.model_validate_json(
-                cell_pred_item.rect.to_bounding_box().model_dump_json(
-                    context={PydanticSerCtxKey.COORD_PREC.value: COORD_PREC}
-                )
-            ).as_tuple()
-            assert true_bbox == norm_pred_bbox, (
-                f"bbox is not the same: {true_bbox} != {norm_pred_bbox}"
-            )
-
    return True


@@ -307,13 +300,10 @@ def verify_conversion_result_v1(
    fuzzy: bool = False,
    indent: int = 2,
 ):
-    PageList = TypeAdapter(List[Page])
-
    assert doc_result.status == ConversionStatus.SUCCESS, (
        f"Doc {input_path} did not convert successfully."
    )

-    doc_pred_pages: List[Page] = doc_result.pages
    with pytest.warns(DeprecationWarning, match="Use document instead"):
        doc_pred: DsDocument = doc_result.legacy_document
        doc_pred_md = doc_result.legacy_document.export_to_markdown()
@@ -327,18 +317,11 @@ def verify_conversion_result_v1(
            input_path.parent.parent / "groundtruth" / "docling_v1" / input_path.name
        )

-    pages_path = gt_subpath.with_suffix(f"{engine_suffix}.pages.json")
    json_path = gt_subpath.with_suffix(f"{engine_suffix}.json")
    md_path = gt_subpath.with_suffix(f"{engine_suffix}.md")
    dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")

    if generate:  # only used when re-generating truth
-        pages_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(pages_path, mode="w", encoding="utf-8") as fw:
-            fw.write(
-                json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
-            )
-
        json_path.parent.mkdir(parents=True, exist_ok=True)
        with open(json_path, mode="w", encoding="utf-8") as fw:
            fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))
@@ -351,9 +334,6 @@ def verify_conversion_result_v1(
        with open(dt_path, mode="w", encoding="utf-8") as fw:
            fw.write(doc_pred_dt)
    else:  # default branch in test
-        with open(pages_path, encoding="utf-8") as fr:
-            doc_true_pages = PageList.validate_json(fr.read())
-
        with open(json_path, encoding="utf-8") as fr:
            doc_true: DsDocument = DsDocument.model_validate_json(fr.read())

@@ -363,11 +343,6 @@ def verify_conversion_result_v1(
        with open(dt_path, encoding="utf-8") as fr:
            doc_true_dt = fr.read()

-        if not fuzzy:
-            assert verify_cells(doc_pred_pages, doc_true_pages), (
-                f"Mismatch in PDF cell prediction for {input_path}"
-            )
-
        # assert verify_output(
        #    doc_pred, doc_true
        # ), f"Mismatch in JSON prediction for {input_path}"
@@ -394,13 +369,16 @@ def verify_conversion_result_v2(
    verify_doctags: bool = True,
    indent: int = 2,
 ):
-    PageList = TypeAdapter(List[Page])
+    PageMetaList = TypeAdapter(List[_TestPagesMeta])

    assert doc_result.status == ConversionStatus.SUCCESS, (
        f"Doc {input_path} did not convert successfully."
    )

    doc_pred_pages: List[Page] = doc_result.pages
+    doc_pred_pages_meta: List[_TestPagesMeta] = [
+        _TestPagesMeta.from_page(page) for page in doc_pred_pages
+    ]
    doc_pred: DoclingDocument = doc_result.document
    doc_pred_md = doc_result.document.export_to_markdown()
    doc_pred_dt = doc_result.document.export_to_doctags()
@@ -413,7 +391,7 @@ def verify_conversion_result_v2(
            input_path.parent.parent / "groundtruth" / "docling_v2" / input_path.name
        )

-    pages_path = gt_subpath.with_suffix(f"{engine_suffix}.pages.json")
+    pages_path = gt_subpath.with_suffix(f"{engine_suffix}.pages.meta.json")
    json_path = gt_subpath.with_suffix(f"{engine_suffix}.json")
    md_path = gt_subpath.with_suffix(f"{engine_suffix}.md")
    dt_path = gt_subpath.with_suffix(f"{engine_suffix}.doctags.txt")
@@ -421,18 +399,9 @@ def verify_conversion_result_v2(
    if generate:  # only used when re-generating truth
        pages_path.parent.mkdir(parents=True, exist_ok=True)

-        pages_data = [
-            page.model_dump(
-                mode="json",
-                context={
-                    PydanticSerCtxKey.COORD_PREC.value: COORD_PREC,
-                    PydanticSerCtxKey.CONFID_PREC.value: CONFID_PREC,
-                },
-            )
-            for page in doc_pred_pages
-        ]
+        pages_data = PageMetaList.dump_json(doc_pred_pages_meta, indent=2)
        with open(pages_path, mode="w", encoding="utf-8") as fw:
-            fw.write(json.dumps(pages_data, indent=indent))
+            fw.write(pages_data.decode())

        json_path.parent.mkdir(parents=True, exist_ok=True)
        doc_pred.save_as_json(
@@ -448,7 +417,7 @@ def verify_conversion_result_v2(
            fw.write(doc_pred_dt)
    else:  # default branch in test
        with open(pages_path, encoding="utf-8") as fr:
-            doc_true_pages = PageList.validate_json(fr.read())
+            doc_true_pages_meta = PageMetaList.validate_json(fr.read())

        with open(json_path, encoding="utf-8") as fr:
            doc_true: DoclingDocument = DoclingDocument.model_validate_json(fr.read())
@@ -460,7 +429,7 @@ def verify_conversion_result_v2(
            doc_true_dt = fr.read()

        if not fuzzy:
-            assert verify_cells(doc_pred_pages, doc_true_pages), (
+            assert verify_cells(doc_pred_pages_meta, doc_true_pages_meta), (
                f"Mismatch in PDF cell prediction for {input_path}"
            )