package verify utils and add more tests

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Michele Dolfi 2024-08-28 18:50:32 +02:00
parent e44791691f
commit a700411288
8 changed files with 184 additions and 76 deletions

tests/__init__.py (new, empty file)

File diff suppressed because one or more lines are too long


@@ -0,0 +1,22 @@
order to compute the TED score. Inference timing results for all experiments were obtained from the same machine on a single core with AMD EPYC 7763 CPU @2.45 GHz.
## 5.1 Hyper Parameter Optimization
We have chosen the PubTabNet data set to perform HPO, since it includes a highly diverse set of tables. Also, we report TED scores separately for simple and complex tables (tables with cell spans). Results are presented in Table 1. It is evident that with OTSL, our model achieves the same TED score and slightly better mAP scores in comparison to HTML. However, OTSL yields a 2x speed-up in inference runtime over HTML.
Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.
| # enc-layers | # dec-layers | Language | TEDs simple | TEDs complex | TEDs all | mAP (0.75) | Inference time (secs) |
|--------------|--------------|----------|-------------|--------------|----------|------------|------------------------|
| 6 | 6 | OTSL | 0.965 | 0.934 | 0.955 | 0.88 | 2.73 |
| 6 | 6 | HTML | 0.969 | 0.927 | 0.955 | 0.857 | 5.39 |
| 4 | 4 | OTSL | 0.938 | 0.904 | 0.927 | 0.853 | 1.97 |
| 4 | 4 | HTML | 0.952 | 0.909 | 0.938 | 0.843 | 3.77 |
| 2 | 4 | OTSL | 0.923 | 0.897 | 0.915 | 0.859 | 1.91 |
| 2 | 4 | HTML | 0.945 | 0.901 | 0.931 | 0.834 | 3.81 |
| 4 | 2 | OTSL | 0.952 | 0.92 | 0.942 | 0.857 | 1.22 |
| 4 | 2 | HTML | 0.944 | 0.903 | 0.931 | 0.824 | 2 |
## 5.2 Quantitative Results
We picked the model parameter configuration that produced the best prediction quality (enc=6, dec=6, heads=8) with PubTabNet alone, then independently trained and evaluated it on three publicly available data sets: PubTabNet (395k samples), FinTabNet (113k samples) and PubTables-1M (about 1M samples). Performance results are presented in Table 2. It is clearly evident that the model trained on OTSL outperforms HTML across the board, keeping high TEDs and mAP scores even on difficult financial tables (FinTabNet) that contain sparse and large tables.
Additionally, the results show that OTSL has an advantage over HTML when applied to a bigger data set like PubTables-1M, achieving significantly improved scores. Finally, OTSL achieves faster inference due to fewer decoding steps, a direct consequence of its reduced sequence representation.
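To make the speed argument concrete, here is a small illustration (an editorial sketch, not part of the excerpt or this commit): for a 2x2 grid whose first row is a single cell spanning both columns, an OTSL sequence built from the paper's five tags (C, L, U, X, NL) is about half the length of a token-per-tag HTML encoding; the exact HTML tokenization below is an assumption.

# Illustrative sketch only: compare sequence lengths of the two
# representations for one tiny table.
otsl = ["C", "L", "NL", "C", "C", "NL"]  # "L" = merge with the cell to the left

html = [  # assumed token split for the equivalent HTML structure
    "<tr>", "<td", "colspan=2", ">", "</td>", "</tr>",
    "<tr>", "<td>", "</td>", "<td>", "</td>", "</tr>",
]

print(len(otsl), len(html))  # 6 vs 12 -> fewer autoregressive decoding steps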

File diff suppressed because one or more lines are too long

Binary file not shown.


@@ -0,0 +1,51 @@
from pathlib import Path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import PipelineOptions
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter

from .verify_utils import verify_conversion_result

GENERATE = False


def get_pdf_paths():
    # Define the directory you want to search
    directory = Path("./tests/data")

    # List all PDF files in the directory and its subdirectories
    pdf_files = sorted(directory.rglob("*.pdf"))
    return pdf_files


def get_converter():
    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    converter = DocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )

    return converter


def test_e2e_conversions():
    pdf_paths = get_pdf_paths()
    converter = get_converter()

    for pdf_path in pdf_paths:
        print(f"converting {pdf_path}")
        doc_result: ConversionResult = converter.convert_single(pdf_path)

        verify_conversion_result(
            input_path=pdf_path, doc_result=doc_result, generate=GENERATE
        )
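A usage note (editorial, not part of the diff): with GENERATE = True the same loop rewrites the ground-truth files instead of asserting against them. A minimal sketch, reusing the helpers defined in the test above:

# Re-generate the .pages.json/.json/.md ground truth for every test PDF.
converter = get_converter()
for pdf_path in get_pdf_paths():
    result = converter.convert_single(pdf_path)
    verify_conversion_result(input_path=pdf_path, doc_result=result, generate=True)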

tests/test_interfaces.py (new file, 69 additions)

@@ -0,0 +1,69 @@
from io import BytesIO
from pathlib import Path

import pytest

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import DocumentStream, PipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter

from .verify_utils import verify_conversion_result


def get_pdf_path():
    pdf_path = Path("./tests/data/2305.03393v1-pg9.pdf")
    return pdf_path


@pytest.fixture
def converter():
    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    converter = DocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )

    return converter


def test_convert_single(converter: DocumentConverter):
    pdf_path = get_pdf_path()
    print(f"converting {pdf_path}")

    doc_result: ConversionResult = converter.convert_single(pdf_path)
    verify_conversion_result(input_path=pdf_path, doc_result=doc_result)


def test_batch_path(converter: DocumentConverter):
    pdf_path = get_pdf_path()
    print(f"converting {pdf_path}")

    conv_input = DocumentConversionInput.from_paths([pdf_path])

    results = converter.convert(conv_input)
    for doc_result in results:
        verify_conversion_result(input_path=pdf_path, doc_result=doc_result)


def test_batch_bytes(converter: DocumentConverter):
    pdf_path = get_pdf_path()
    print(f"converting {pdf_path}")

    buf = BytesIO(pdf_path.open("rb").read())
    docs = [DocumentStream(filename=pdf_path.name, stream=buf)]
    conv_input = DocumentConversionInput.from_streams(docs)

    results = converter.convert(conv_input)
    for doc_result in results:
        verify_conversion_result(input_path=pdf_path, doc_result=doc_result)
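An aside on test_batch_bytes (an observation, not part of the commit): the DocumentStream path allows fully in-memory conversion. A minimal sketch under the same API exercised above, where data stands in for bytes obtained elsewhere (network, database, ...):

from io import BytesIO

from docling.datamodel.base_models import DocumentStream
from docling.datamodel.document import DocumentConversionInput


def convert_pdf_bytes(converter, name: str, data: bytes):
    # Wrap raw bytes so the converter never needs a file on disk.
    stream = DocumentStream(filename=name, stream=BytesIO(data))
    conv_input = DocumentConversionInput.from_streams([stream])
    return list(converter.convert(conv_input))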


@@ -1,6 +1,5 @@
-import glob
 import json
-from pathlib import Path, PosixPath
+from pathlib import Path
 from typing import List
 
 from docling_core.types import BaseText
@@ -8,41 +7,11 @@ from docling_core.types import Document as DsDocument
 from pydantic import TypeAdapter
 from pydantic.json import pydantic_encoder
 
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, Page, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus, Page
 from docling.datamodel.document import ConversionResult
-from docling.document_converter import DocumentConverter
-
-GENERATE = False
-
-
-def get_pdf_paths():
-    # Define the directory you want to search
-    directory = Path("./tests/data")
-
-    # List all PDF files in the directory and its subdirectories
-    pdf_files = sorted(directory.rglob("*.pdf"))
-    return pdf_files
-
-
-def get_converter():
-    pipeline_options = PipelineOptions()
-    pipeline_options.do_ocr = False
-    pipeline_options.do_table_structure = True
-    pipeline_options.table_structure_options.do_cell_matching = True
-
-    converter = DocumentConverter(
-        pipeline_options=pipeline_options,
-        pdf_backend=DoclingParseDocumentBackend,
-    )
-
-    return converter
 
 
-def verify_cells(doc_pred_pages, doc_true_pages):
+def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]):
     assert len(doc_pred_pages) == len(
         doc_true_pages
@@ -75,7 +44,7 @@ def verify_cells(doc_pred_pages, doc_true_pages):
     return True
 
-def verify_maintext(doc_pred, doc_true):
+def verify_maintext(doc_pred: DsDocument, doc_true: DsDocument):
     assert len(doc_true.main_text) == len(
         doc_pred.main_text
@@ -93,7 +62,7 @@ def verify_maintext(doc_pred, doc_true):
     return True
 
-def verify_tables(doc_pred, doc_true):
+def verify_tables(doc_pred: DsDocument, doc_true: DsDocument):
     assert len(doc_true.tables) == len(
         doc_pred.tables
     ), "document has different count of tables than expected."
@@ -130,29 +99,24 @@ def verify_md(doc_pred_md, doc_true_md):
     return doc_pred_md == doc_true_md
 
-def test_e2e_conversions():
+def verify_conversion_result(
+    input_path: Path, doc_result: ConversionResult, generate=False
+):
     PageList = TypeAdapter(List[Page])
 
-    pdf_paths = get_pdf_paths()
-    converter = get_converter()
-
-    for path in pdf_paths:
-        print(f"converting {path}")
-        doc_result: ConversionResult = converter.convert_single(path)
-
     assert (
         doc_result.status == ConversionStatus.SUCCESS
-    ), f"Doc {path} did not convert successfully."
+    ), f"Doc {input_path} did not convert successfully."
 
-    doc_pred_pages: PageList = doc_result.pages
+    doc_pred_pages: List[Page] = doc_result.pages
     doc_pred: DsDocument = doc_result.output
     doc_pred_md = doc_result.render_as_markdown()
 
-    pages_path = path.with_suffix(".pages.json")
-    json_path = path.with_suffix(".json")
-    md_path = path.with_suffix(".md")
+    pages_path = input_path.with_suffix(".pages.json")
+    json_path = input_path.with_suffix(".json")
+    md_path = input_path.with_suffix(".md")
 
-    if GENERATE:  # only used when re-generating truth
+    if generate:  # only used when re-generating truth
        with open(pages_path, "w") as fw:
            fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
@@ -163,22 +127,22 @@ def test_e2e_conversions():
            fw.write(doc_pred_md)
 
    else:  # default branch in test
        with open(pages_path, "r") as fr:
-           doc_true_pages = PageList.validate_python(json.load(fr))
+           doc_true_pages = PageList.validate_json(fr.read())
 
        with open(json_path, "r") as fr:
-           doc_true = DsDocument.model_validate(json.load(fr))
+           doc_true = DsDocument.model_validate_json(fr.read())
 
        with open(md_path, "r") as fr:
-           doc_true_md = "".join(fr.readlines())
+           doc_true_md = fr.read()
 
        assert verify_cells(
            doc_pred_pages, doc_true_pages
-       ), f"Mismatch in PDF cell prediction for {path}"
+       ), f"Mismatch in PDF cell prediction for {input_path}"
 
        assert verify_output(
            doc_pred, doc_true
-       ), f"Mismatch in JSON prediction for {path}"
+       ), f"Mismatch in JSON prediction for {input_path}"
 
        assert verify_md(
            doc_pred_md, doc_true_md
-       ), f"Mismatch in Markdown prediction for {path}"
+       ), f"Mismatch in Markdown prediction for {input_path}"