package verify utils and add more tests

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Michele Dolfi 2024-08-28 18:50:32 +02:00
parent e44791691f
commit a700411288
8 changed files with 184 additions and 76 deletions
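The gist of the change: the ground-truth comparison helpers that previously lived inside a single end-to-end test are packaged into a shared verify_utils module under tests/, exposed through one entry point, verify_conversion_result(input_path, doc_result, generate=False). A new test then only has to run a conversion and hand the result to that helper. A minimal sketch of the intended usage (the test name is hypothetical; the converter options and the sample PDF are the ones used by the tests in this commit):

from pathlib import Path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.datamodel.base_models import PipelineOptions
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter

from .verify_utils import verify_conversion_result


def test_my_conversion():  # hypothetical test module placed next to verify_utils.py under tests/
    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True
    converter = DocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )

    pdf_path = Path("./tests/data/2305.03393v1-pg9.pdf")
    doc_result: ConversionResult = converter.convert_single(pdf_path)

    # Compares pages, JSON output and Markdown against the stored ground truth.
    verify_conversion_result(input_path=pdf_path, doc_result=doc_result)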

tests/__init__.py Normal file

File diff suppressed because one or more lines are too long


@@ -0,0 +1,22 @@
order to compute the TED score. Inference timing results for all experiments were obtained from the same machine on a single core with AMD EPYC 7763 CPU @2.45 GHz.
## 5.1 Hyper Parameter Optimization
We have chosen the PubTabNet data set to perform HPO, since it includes a highly diverse set of tables. Also, we report TED scores separately for simple and complex tables (tables with cell spans). Results are presented in Table 1. It is evident that with OTSL, our model achieves the same TED score and slightly better mAP scores in comparison to HTML. However, OTSL yields a 2x speed-up in the inference runtime over HTML.
Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.
| # enc-layers | # dec-layers | Language | TEDs simple | TEDs complex | TEDs all | mAP (0.75) | Inference time (secs) |
|---------------|---------------|----------|-------------|--------------|----------|------------|------------------------|
| 6 | 6 | OTSL | 0.965 | 0.934 | 0.955 | 0.88 | 2.73 |
| 6 | 6 | HTML | 0.969 | 0.927 | 0.955 | 0.857 | 5.39 |
| 4 | 4 | OTSL | 0.938 | 0.904 | 0.927 | 0.853 | 1.97 |
| 4 | 4 | HTML | 0.952 | 0.909 | 0.938 | 0.843 | 3.77 |
| 2 | 4 | OTSL | 0.923 | 0.897 | 0.915 | 0.859 | 1.91 |
| 2 | 4 | HTML | 0.945 | 0.901 | 0.931 | 0.834 | 3.81 |
| 4 | 2 | OTSL | 0.952 | 0.92 | 0.942 | 0.857 | 1.22 |
| 4 | 2 | HTML | 0.944 | 0.903 | 0.931 | 0.824 | 2 |
## 5.2 Quantitative Results
We picked the model parameter configuration that produced the best prediction quality (enc=6, dec=6, heads=8) with PubTabNet alone, then independently trained and evaluated it on three publicly available data sets: PubTabNet (395k samples), FinTabNet (113k samples) and PubTables-1M (about 1M samples). Performance results are presented in Table 2. It is clearly evident that the model trained on OTSL outperforms HTML across the board, keeping high TEDs and mAP scores even on difficult financial tables (FinTabNet) that contain sparse and large tables.
Additionally, the results show that OTSL has an advantage over HTML when applied on a bigger data set like PubTables-1M and achieves significantly improved scores. Finally, OTSL achieves faster inference due to fewer decoding steps which is a result of the reduced sequence representation.

File diff suppressed because one or more lines are too long

Binary file not shown.


@@ -0,0 +1,51 @@
from pathlib import Path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import PipelineOptions
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter

from .verify_utils import verify_conversion_result

GENERATE = False


def get_pdf_paths():
    # Define the directory you want to search
    directory = Path("./tests/data")

    # List all PDF files in the directory and its subdirectories
    pdf_files = sorted(directory.rglob("*.pdf"))
    return pdf_files


def get_converter():
    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    converter = DocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )
    return converter


def test_e2e_conversions():
    pdf_paths = get_pdf_paths()
    converter = get_converter()

    for pdf_path in pdf_paths:
        print(f"converting {pdf_path}")
        doc_result: ConversionResult = converter.convert_single(pdf_path)

        verify_conversion_result(
            input_path=pdf_path, doc_result=doc_result, generate=GENERATE
        )
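
The GENERATE flag is the knob for refreshing the stored ground truth: when verify_conversion_result is called with generate=True, it rewrites the .pages.json, .json and .md files next to the input PDF instead of asserting against them (see the generate branch in the diff further below). The usual route is to flip GENERATE = True and re-run pytest; an equivalent targeted refresh could reuse the helpers above, for example via this hypothetical function (not part of this commit):

def regenerate_ground_truth(pdf_path: Path) -> None:
    # Re-creates pdf_path's .pages.json / .json / .md ground truth in place;
    # equivalent to setting GENERATE = True and re-running test_e2e_conversions.
    converter = get_converter()
    doc_result: ConversionResult = converter.convert_single(pdf_path)
    verify_conversion_result(
        input_path=pdf_path, doc_result=doc_result, generate=True
    )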

tests/test_interfaces.py Normal file

@@ -0,0 +1,69 @@
from io import BytesIO
from pathlib import Path

import pytest

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import DocumentStream, PipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter

from .verify_utils import verify_conversion_result


def get_pdf_path():
    pdf_path = Path("./tests/data/2305.03393v1-pg9.pdf")
    return pdf_path


@pytest.fixture
def converter():
    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    converter = DocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )
    return converter


def test_convert_single(converter: DocumentConverter):
    pdf_path = get_pdf_path()
    print(f"converting {pdf_path}")

    doc_result: ConversionResult = converter.convert_single(pdf_path)
    verify_conversion_result(input_path=pdf_path, doc_result=doc_result)


def test_batch_path(converter: DocumentConverter):
    pdf_path = get_pdf_path()
    print(f"converting {pdf_path}")

    conv_input = DocumentConversionInput.from_paths([pdf_path])
    results = converter.convert(conv_input)

    for doc_result in results:
        verify_conversion_result(input_path=pdf_path, doc_result=doc_result)


def test_batch_bytes(converter: DocumentConverter):
    pdf_path = get_pdf_path()
    print(f"converting {pdf_path}")

    buf = BytesIO(pdf_path.open("rb").read())
    docs = [DocumentStream(filename=pdf_path.name, stream=buf)]
    conv_input = DocumentConversionInput.from_streams(docs)
    results = converter.convert(conv_input)

    for doc_result in results:
        verify_conversion_result(input_path=pdf_path, doc_result=doc_result)


@@ -1,6 +1,5 @@
 import glob
 import json
-from pathlib import Path, PosixPath
+from pathlib import Path
 from typing import List
 
 from docling_core.types import BaseText
@@ -8,41 +7,11 @@ from docling_core.types import Document as DsDocument
 from pydantic import TypeAdapter
 from pydantic.json import pydantic_encoder
 
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, Page, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus, Page
 from docling.datamodel.document import ConversionResult
-from docling.document_converter import DocumentConverter
-
-GENERATE = False
-
-
-def get_pdf_paths():
-    # Define the directory you want to search
-    directory = Path("./tests/data")
-
-    # List all PDF files in the directory and its subdirectories
-    pdf_files = sorted(directory.rglob("*.pdf"))
-    return pdf_files
-
-
-def get_converter():
-    pipeline_options = PipelineOptions()
-    pipeline_options.do_ocr = False
-    pipeline_options.do_table_structure = True
-    pipeline_options.table_structure_options.do_cell_matching = True
-
-    converter = DocumentConverter(
-        pipeline_options=pipeline_options,
-        pdf_backend=DoclingParseDocumentBackend,
-    )
-    return converter
-
-
-def verify_cells(doc_pred_pages, doc_true_pages):
+def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]):
 
     assert len(doc_pred_pages) == len(
         doc_true_pages
@@ -75,7 +44,7 @@ def verify_cells(doc_pred_pages, doc_true_pages):
     return True
 
-def verify_maintext(doc_pred, doc_true):
+def verify_maintext(doc_pred: DsDocument, doc_true: DsDocument):
 
     assert len(doc_true.main_text) == len(
         doc_pred.main_text
@@ -93,7 +62,7 @@ def verify_maintext(doc_pred, doc_true):
     return True
 
-def verify_tables(doc_pred, doc_true):
+def verify_tables(doc_pred: DsDocument, doc_true: DsDocument):
     assert len(doc_true.tables) == len(
         doc_pred.tables
     ), "document has different count of tables than expected."
@@ -130,29 +99,24 @@ def verify_md(doc_pred_md, doc_true_md):
     return doc_pred_md == doc_true_md
 
-def test_e2e_conversions():
+def verify_conversion_result(
+    input_path: Path, doc_result: ConversionResult, generate=False
+):
 
     PageList = TypeAdapter(List[Page])
 
-    pdf_paths = get_pdf_paths()
-    converter = get_converter()
-
-    for path in pdf_paths:
-        print(f"converting {path}")
-        doc_result: ConversionResult = converter.convert_single(path)
-
     assert (
         doc_result.status == ConversionStatus.SUCCESS
-    ), f"Doc {path} did not convert successfully."
+    ), f"Doc {input_path} did not convert successfully."
 
-    doc_pred_pages: PageList = doc_result.pages
+    doc_pred_pages: List[Page] = doc_result.pages
     doc_pred: DsDocument = doc_result.output
     doc_pred_md = doc_result.render_as_markdown()
 
-    pages_path = path.with_suffix(".pages.json")
-    json_path = path.with_suffix(".json")
-    md_path = path.with_suffix(".md")
+    pages_path = input_path.with_suffix(".pages.json")
+    json_path = input_path.with_suffix(".json")
+    md_path = input_path.with_suffix(".md")
 
-    if GENERATE:  # only used when re-generating truth
+    if generate:  # only used when re-generating truth
         with open(pages_path, "w") as fw:
             fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
@@ -163,22 +127,22 @@ def test_e2e_conversions():
             fw.write(doc_pred_md)
 
     else:  # default branch in test
         with open(pages_path, "r") as fr:
-            doc_true_pages = PageList.validate_python(json.load(fr))
+            doc_true_pages = PageList.validate_json(fr.read())
 
         with open(json_path, "r") as fr:
-            doc_true = DsDocument.model_validate(json.load(fr))
+            doc_true = DsDocument.model_validate_json(fr.read())
 
         with open(md_path, "r") as fr:
-            doc_true_md = "".join(fr.readlines())
+            doc_true_md = fr.read()
 
         assert verify_cells(
             doc_pred_pages, doc_true_pages
-        ), f"Mismatch in PDF cell prediction for {path}"
+        ), f"Mismatch in PDF cell prediction for {input_path}"
 
         assert verify_output(
             doc_pred, doc_true
-        ), f"Mismatch in JSON prediction for {path}"
+        ), f"Mismatch in JSON prediction for {input_path}"
 
         assert verify_md(
             doc_pred_md, doc_true_md
-        ), f"Mismatch in Markdown prediction for {path}"
+        ), f"Mismatch in Markdown prediction for {input_path}"