package verify utils and add more tests

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Michele Dolfi 2024-08-28 18:50:32 +02:00
parent e44791691f
commit a700411288
8 changed files with 184 additions and 76 deletions
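The gist of the change: the ground-truth comparison helpers that previously lived inside a single end-to-end test are packaged into a shared verify_utils module under tests/, exposed through one entry point, verify_conversion_result(input_path, doc_result, generate=False). A new test then only has to run a conversion and hand the result to that helper. A minimal sketch of the intended usage (the test name is hypothetical; the converter options and the sample PDF are the ones used by the tests in this commit):

from pathlib import Path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import PipelineOptions
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter

from .verify_utils import verify_conversion_result


def test_my_conversion():  # hypothetical test module placed next to verify_utils.py under tests/
    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
    converter = DocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )

    pdf_path = Path("./tests/data/2305.03393v1-pg9.pdf")
    doc_result: ConversionResult = converter.convert_single(pdf_path)

    # Compares pages, JSON output and Markdown against the stored ground truth.
    verify_conversion_result(input_path=pdf_path, doc_result=doc_result)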

tests/__init__.py Normal file

File diff suppressed because one or more lines are too long


@@ -0,0 +1,22 @@
order to compute the TED score. Inference timing results for all experiments were obtained from the same machine on a single core with AMD EPYC 7763 CPU @2.45 GHz.
## 5.1 Hyper Parameter Optimization
We have chosen the PubTabNet data set to perform HPO, since it includes a highly diverse set of tables. Also, we report TED scores separately for simple and complex tables (tables with cell spans). Results are presented in Table 1. It is evident that with OTSL, our model achieves the same TED score and slightly better mAP scores in comparison to HTML. However, OTSL yields a 2x speed-up in the inference runtime over HTML.
Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.
| # enc-layers | # dec-layers | Language | TEDs simple | TEDs complex | TEDs all | mAP (0.75) | Inference time (secs) |
|---------------|---------------|----------|-------------|--------------|----------|------------|------------------------|
| 6 | 6 | OTSL | 0.965 | 0.934 | 0.955 | 0.88 | 2.73 |
| 6 | 6 | HTML | 0.969 | 0.927 | 0.955 | 0.857 | 5.39 |
| 4 | 4 | OTSL | 0.938 | 0.904 | 0.927 | 0.853 | 1.97 |
| 4 | 4 | HTML | 0.952 | 0.909 | 0.938 | 0.843 | 3.77 |
| 2 | 4 | OTSL | 0.923 | 0.897 | 0.915 | 0.859 | 1.91 |
| 2 | 4 | HTML | 0.945 | 0.901 | 0.931 | 0.834 | 3.81 |
| 4 | 2 | OTSL | 0.952 | 0.92 | 0.942 | 0.857 | 1.22 |
| 4 | 2 | HTML | 0.944 | 0.903 | 0.931 | 0.824 | 2 |
## 5.2 Quantitative Results
We picked the model parameter configuration that produced the best prediction quality (enc=6, dec=6, heads=8) with PubTabNet alone, then independently trained and evaluated it on three publicly available data sets: PubTabNet (395k samples), FinTabNet (113k samples) and PubTables-1M (about 1M samples). Performance results are presented in Table 2. It is clearly evident that the model trained on OTSL outperforms HTML across the board, keeping high TEDs and mAP scores even on difficult financial tables (FinTabNet) that contain sparse and large tables.
Additionally, the results show that OTSL has an advantage over HTML when applied on a bigger data set like PubTables-1M and achieves significantly improved scores. Finally, OTSL achieves faster inference due to fewer decoding steps which is a result of the reduced sequence representation.

File diff suppressed because one or more lines are too long

Binary file not shown.


@@ -0,0 +1,51 @@
from pathlib import Path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import PipelineOptions
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter

from .verify_utils import verify_conversion_result

GENERATE = False


def get_pdf_paths():
    # Define the directory you want to search
    directory = Path("./tests/data")

    # List all PDF files in the directory and its subdirectories
    pdf_files = sorted(directory.rglob("*.pdf"))
    return pdf_files


def get_converter():
    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    converter = DocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )
    return converter


def test_e2e_conversions():
    pdf_paths = get_pdf_paths()
    converter = get_converter()

    for pdf_path in pdf_paths:
        print(f"converting {pdf_path}")
        doc_result: ConversionResult = converter.convert_single(pdf_path)

        verify_conversion_result(
            input_path=pdf_path, doc_result=doc_result, generate=GENERATE
        )
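
The GENERATE flag is the knob for refreshing the stored ground truth: when verify_conversion_result is called with generate=True, it rewrites the .pages.json, .json and .md files next to the input PDF instead of asserting against them (see the generate branch in the diff further below). The usual route is to flip GENERATE = True and re-run pytest; an equivalent targeted refresh could reuse the helpers above, for example via this hypothetical function (not part of this commit):

def regenerate_ground_truth(pdf_path: Path) -> None:
    # Re-creates pdf_path's .pages.json / .json / .md ground truth in place;
    # equivalent to setting GENERATE = True and re-running test_e2e_conversions.
    converter = get_converter()
    doc_result: ConversionResult = converter.convert_single(pdf_path)
    verify_conversion_result(
        input_path=pdf_path, doc_result=doc_result, generate=True
    )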

tests/test_interfaces.py Normal file

@@ -0,0 +1,69 @@
from io import BytesIO
from pathlib import Path

import pytest

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import DocumentStream, PipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter

from .verify_utils import verify_conversion_result


def get_pdf_path():
    pdf_path = Path("./tests/data/2305.03393v1-pg9.pdf")
    return pdf_path


@pytest.fixture
def converter():
    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    converter = DocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )
    return converter


def test_convert_single(converter: DocumentConverter):
    pdf_path = get_pdf_path()
    print(f"converting {pdf_path}")

    doc_result: ConversionResult = converter.convert_single(pdf_path)
    verify_conversion_result(input_path=pdf_path, doc_result=doc_result)


def test_batch_path(converter: DocumentConverter):
    pdf_path = get_pdf_path()
    print(f"converting {pdf_path}")

    conv_input = DocumentConversionInput.from_paths([pdf_path])
    results = converter.convert(conv_input)

    for doc_result in results:
        verify_conversion_result(input_path=pdf_path, doc_result=doc_result)


def test_batch_bytes(converter: DocumentConverter):
    pdf_path = get_pdf_path()
    print(f"converting {pdf_path}")

    buf = BytesIO(pdf_path.open("rb").read())
    docs = [DocumentStream(filename=pdf_path.name, stream=buf)]
    conv_input = DocumentConversionInput.from_streams(docs)
    results = converter.convert(conv_input)

    for doc_result in results:
        verify_conversion_result(input_path=pdf_path, doc_result=doc_result)


@@ -1,6 +1,5 @@
 import glob
 import json
-from pathlib import Path, PosixPath
+from pathlib import Path
 from typing import List
 
 from docling_core.types import BaseText
@@ -8,41 +7,11 @@ from docling_core.types import Document as DsDocument
 from pydantic import TypeAdapter
 from pydantic.json import pydantic_encoder
 
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, Page, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus, Page
 from docling.datamodel.document import ConversionResult
-from docling.document_converter import DocumentConverter
-
-GENERATE = False
-
-
-def get_pdf_paths():
-    # Define the directory you want to search
-    directory = Path("./tests/data")
-
-    # List all PDF files in the directory and its subdirectories
-    pdf_files = sorted(directory.rglob("*.pdf"))
-    return pdf_files
-
-
-def get_converter():
-    pipeline_options = PipelineOptions()
-    pipeline_options.do_ocr = False
-    pipeline_options.do_table_structure = True
-    pipeline_options.table_structure_options.do_cell_matching = True
-
-    converter = DocumentConverter(
-        pipeline_options=pipeline_options,
-        pdf_backend=DoclingParseDocumentBackend,
-    )
-    return converter
-
-
-def verify_cells(doc_pred_pages, doc_true_pages):
+def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]):
 
     assert len(doc_pred_pages) == len(
         doc_true_pages
@@ -75,7 +44,7 @@ def verify_cells(doc_pred_pages, doc_true_pages):
     return True
 
-def verify_maintext(doc_pred, doc_true):
+def verify_maintext(doc_pred: DsDocument, doc_true: DsDocument):
 
     assert len(doc_true.main_text) == len(
         doc_pred.main_text
@@ -93,7 +62,7 @@ def verify_maintext(doc_pred, doc_true):
     return True
 
-def verify_tables(doc_pred, doc_true):
+def verify_tables(doc_pred: DsDocument, doc_true: DsDocument):
     assert len(doc_true.tables) == len(
         doc_pred.tables
     ), "document has different count of tables than expected."
@@ -130,29 +99,24 @@ def verify_md(doc_pred_md, doc_true_md):
     return doc_pred_md == doc_true_md
 
-def test_e2e_conversions():
+def verify_conversion_result(
+    input_path: Path, doc_result: ConversionResult, generate=False
+):
 
     PageList = TypeAdapter(List[Page])
 
-    pdf_paths = get_pdf_paths()
-    converter = get_converter()
-
-    for path in pdf_paths:
-        print(f"converting {path}")
-        doc_result: ConversionResult = converter.convert_single(path)
-
     assert (
         doc_result.status == ConversionStatus.SUCCESS
-    ), f"Doc {path} did not convert successfully."
+    ), f"Doc {input_path} did not convert successfully."
 
-    doc_pred_pages: PageList = doc_result.pages
+    doc_pred_pages: List[Page] = doc_result.pages
     doc_pred: DsDocument = doc_result.output
     doc_pred_md = doc_result.render_as_markdown()
 
-    pages_path = path.with_suffix(".pages.json")
-    json_path = path.with_suffix(".json")
-    md_path = path.with_suffix(".md")
+    pages_path = input_path.with_suffix(".pages.json")
+    json_path = input_path.with_suffix(".json")
+    md_path = input_path.with_suffix(".md")
 
-    if GENERATE:  # only used when re-generating truth
+    if generate:  # only used when re-generating truth
         with open(pages_path, "w") as fw:
             fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
@@ -163,22 +127,22 @@ def test_e2e_conversions():
             fw.write(doc_pred_md)
 
     else:  # default branch in test
         with open(pages_path, "r") as fr:
-            doc_true_pages = PageList.validate_python(json.load(fr))
+            doc_true_pages = PageList.validate_json(fr.read())
 
         with open(json_path, "r") as fr:
-            doc_true = DsDocument.model_validate(json.load(fr))
+            doc_true = DsDocument.model_validate_json(fr.read())
 
         with open(md_path, "r") as fr:
-            doc_true_md = "".join(fr.readlines())
+            doc_true_md = fr.read()
 
         assert verify_cells(
             doc_pred_pages, doc_true_pages
-        ), f"Mismatch in PDF cell prediction for {path}"
+        ), f"Mismatch in PDF cell prediction for {input_path}"
 
         assert verify_output(
             doc_pred, doc_true
-        ), f"Mismatch in JSON prediction for {path}"
+        ), f"Mismatch in JSON prediction for {input_path}"
 
         assert verify_md(
             doc_pred_md, doc_true_md
-        ), f"Mismatch in Markdown prediction for {path}"
+        ), f"Mismatch in Markdown prediction for {input_path}"