package verify utils and add more tests

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Michele Dolfi 2024-08-28 18:50:32 +02:00
parent e44791691f
commit a700411288
8 changed files with 184 additions and 76 deletions

tests/__init__.py (new, empty file)

File diff suppressed because one or more lines are too long


@@ -0,0 +1,22 @@
order to compute the TED score. Inference timing results for all experiments were obtained from the same machine on a single core with AMD EPYC 7763 CPU @2.45 GHz.
## 5.1 Hyper Parameter Optimization
We have chosen the PubTabNet data set to perform HPO, since it includes a highly diverse set of tables. Also, we report TED scores separately for simple and complex tables (tables with cell spans). Results are presented in Table 1. It is evident that with OTSL, our model achieves the same TED score and slightly better mAP scores in comparison to HTML. However, OTSL yields a 2x speed-up in inference runtime over HTML.
Table 1. HPO performed in OTSL and HTML representation on the same transformer-based TableFormer [9] architecture, trained only on PubTabNet [22]. Effects of reducing the # of layers in encoder and decoder stages of the model show that smaller models trained on OTSL perform better, especially in recognizing complex table structures, and maintain a much higher mAP score than the HTML counterpart.
| # enc-layers | # dec-layers | Language | TEDs simple | TEDs complex | TEDs all | mAP (0.75) | Inference time (secs) |
|--------------|--------------|----------|-------------|--------------|----------|------------|------------------------|
| 6 | 6 | OTSL | 0.965 | 0.934 | 0.955 | 0.88 | 2.73 |
| 6 | 6 | HTML | 0.969 | 0.927 | 0.955 | 0.857 | 5.39 |
| 4 | 4 | OTSL | 0.938 | 0.904 | 0.927 | 0.853 | 1.97 |
| 4 | 4 | HTML | 0.952 | 0.909 | 0.938 | 0.843 | 3.77 |
| 2 | 4 | OTSL | 0.923 | 0.897 | 0.915 | 0.859 | 1.91 |
| 2 | 4 | HTML | 0.945 | 0.901 | 0.931 | 0.834 | 3.81 |
| 4 | 2 | OTSL | 0.952 | 0.92 | 0.942 | 0.857 | 1.22 |
| 4 | 2 | HTML | 0.944 | 0.903 | 0.931 | 0.824 | 2 |
## 5.2 Quantitative Results
We picked the model parameter configuration that produced the best prediction quality (enc=6, dec=6, heads=8) with PubTabNet alone, then independently trained and evaluated it on three publicly available data sets: PubTabNet (395k samples), FinTabNet (113k samples) and PubTables-1M (about 1M samples). Performance results are presented in Table 2. It is clearly evident that the model trained on OTSL outperforms HTML across the board, keeping high TEDs and mAP scores even on difficult financial tables (FinTabNet) that contain sparse and large tables.
Additionally, the results show that OTSL has an advantage over HTML when applied to a bigger data set like PubTables-1M, achieving significantly improved scores. Finally, OTSL achieves faster inference due to fewer decoding steps, a direct consequence of its reduced sequence representation.
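To make the speed argument concrete, here is a small illustration (an editorial sketch, not part of the excerpt or this commit): for a 2x2 grid whose first row is a single cell spanning both columns, an OTSL sequence built from the paper's five tags (C, L, U, X, NL) is about half the length of a token-per-tag HTML encoding; the exact HTML tokenization below is an assumption.

# Illustrative sketch only: compare sequence lengths of the two
# representations for one tiny table.
otsl = ["C", "L", "NL", "C", "C", "NL"]  # "L" = merge with the cell to the left

html = [  # assumed token split for the equivalent HTML structure
    "<tr>", "<td", "colspan=2", ">", "</td>", "</tr>",
    "<tr>", "<td>", "</td>", "<td>", "</td>", "</tr>",
]

print(len(otsl), len(html))  # 6 vs 12 -> fewer autoregressive decoding steps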

File diff suppressed because one or more lines are too long

Binary file not shown.


@@ -0,0 +1,51 @@
from pathlib import Path

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import PipelineOptions
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter

from .verify_utils import verify_conversion_result

GENERATE = False


def get_pdf_paths():
    # Define the directory you want to search
    directory = Path("./tests/data")

    # List all PDF files in the directory and its subdirectories
    pdf_files = sorted(directory.rglob("*.pdf"))
    return pdf_files


def get_converter():
    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    converter = DocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )

    return converter


def test_e2e_conversions():
    pdf_paths = get_pdf_paths()
    converter = get_converter()

    for pdf_path in pdf_paths:
        print(f"converting {pdf_path}")
        doc_result: ConversionResult = converter.convert_single(pdf_path)

        verify_conversion_result(
            input_path=pdf_path, doc_result=doc_result, generate=GENERATE
        )
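A usage note (editorial, not part of the diff): with GENERATE = True the same loop rewrites the ground-truth files instead of asserting against them. A minimal sketch, reusing the helpers defined in the test above:

# Re-generate the .pages.json/.json/.md ground truth for every test PDF.
converter = get_converter()
for pdf_path in get_pdf_paths():
    result = converter.convert_single(pdf_path)
    verify_conversion_result(input_path=pdf_path, doc_result=result, generate=True)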

tests/test_interfaces.py (new file, 69 additions)

@@ -0,0 +1,69 @@
from io import BytesIO
from pathlib import Path

import pytest

from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import DocumentStream, PipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter

from .verify_utils import verify_conversion_result


def get_pdf_path():
    pdf_path = Path("./tests/data/2305.03393v1-pg9.pdf")
    return pdf_path


@pytest.fixture
def converter():
    pipeline_options = PipelineOptions()
    pipeline_options.do_ocr = False
    pipeline_options.do_table_structure = True
    pipeline_options.table_structure_options.do_cell_matching = True

    converter = DocumentConverter(
        pipeline_options=pipeline_options,
        pdf_backend=DoclingParseDocumentBackend,
    )

    return converter


def test_convert_single(converter: DocumentConverter):
    pdf_path = get_pdf_path()
    print(f"converting {pdf_path}")

    doc_result: ConversionResult = converter.convert_single(pdf_path)
    verify_conversion_result(input_path=pdf_path, doc_result=doc_result)


def test_batch_path(converter: DocumentConverter):
    pdf_path = get_pdf_path()
    print(f"converting {pdf_path}")

    conv_input = DocumentConversionInput.from_paths([pdf_path])

    results = converter.convert(conv_input)
    for doc_result in results:
        verify_conversion_result(input_path=pdf_path, doc_result=doc_result)


def test_batch_bytes(converter: DocumentConverter):
    pdf_path = get_pdf_path()
    print(f"converting {pdf_path}")

    buf = BytesIO(pdf_path.open("rb").read())
    docs = [DocumentStream(filename=pdf_path.name, stream=buf)]
    conv_input = DocumentConversionInput.from_streams(docs)

    results = converter.convert(conv_input)
    for doc_result in results:
        verify_conversion_result(input_path=pdf_path, doc_result=doc_result)
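An aside on test_batch_bytes (an observation, not part of the commit): the DocumentStream path allows fully in-memory conversion. A minimal sketch under the same API exercised above, where data stands in for bytes obtained elsewhere (network, database, ...):

from io import BytesIO

from docling.datamodel.base_models import DocumentStream
from docling.datamodel.document import DocumentConversionInput


def convert_pdf_bytes(converter, name: str, data: bytes):
    # Wrap raw bytes so the converter never needs a file on disk.
    stream = DocumentStream(filename=name, stream=BytesIO(data))
    conv_input = DocumentConversionInput.from_streams([stream])
    return list(converter.convert(conv_input))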


@@ -1,6 +1,5 @@
-import glob
 import json
-from pathlib import Path, PosixPath
+from pathlib import Path
 from typing import List
 
 from docling_core.types import BaseText
@@ -8,41 +7,11 @@ from docling_core.types import Document as DsDocument
 from pydantic import TypeAdapter
 from pydantic.json import pydantic_encoder
 
-from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
-from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, Page, PipelineOptions
+from docling.datamodel.base_models import ConversionStatus, Page
 from docling.datamodel.document import ConversionResult
-from docling.document_converter import DocumentConverter
-
-GENERATE = False
-
-
-def get_pdf_paths():
-    # Define the directory you want to search
-    directory = Path("./tests/data")
-
-    # List all PDF files in the directory and its subdirectories
-    pdf_files = sorted(directory.rglob("*.pdf"))
-    return pdf_files
-
-
-def get_converter():
-    pipeline_options = PipelineOptions()
-    pipeline_options.do_ocr = False
-    pipeline_options.do_table_structure = True
-    pipeline_options.table_structure_options.do_cell_matching = True
-
-    converter = DocumentConverter(
-        pipeline_options=pipeline_options,
-        pdf_backend=DoclingParseDocumentBackend,
-    )
-
-    return converter
 
 
-def verify_cells(doc_pred_pages, doc_true_pages):
+def verify_cells(doc_pred_pages: List[Page], doc_true_pages: List[Page]):
     assert len(doc_pred_pages) == len(
         doc_true_pages
@@ -75,7 +44,7 @@ def verify_cells(doc_pred_pages, doc_true_pages):
     return True
 
-def verify_maintext(doc_pred, doc_true):
+def verify_maintext(doc_pred: DsDocument, doc_true: DsDocument):
     assert len(doc_true.main_text) == len(
         doc_pred.main_text
@@ -93,7 +62,7 @@ def verify_maintext(doc_pred, doc_true):
     return True
 
-def verify_tables(doc_pred, doc_true):
+def verify_tables(doc_pred: DsDocument, doc_true: DsDocument):
     assert len(doc_true.tables) == len(
         doc_pred.tables
     ), "document has different count of tables than expected."
@@ -130,29 +99,24 @@ def verify_md(doc_pred_md, doc_true_md):
     return doc_pred_md == doc_true_md
 
-def test_e2e_conversions():
+def verify_conversion_result(
+    input_path: Path, doc_result: ConversionResult, generate=False
+):
     PageList = TypeAdapter(List[Page])
 
-    pdf_paths = get_pdf_paths()
-    converter = get_converter()
-
-    for path in pdf_paths:
-        print(f"converting {path}")
-        doc_result: ConversionResult = converter.convert_single(path)
-
     assert (
         doc_result.status == ConversionStatus.SUCCESS
-    ), f"Doc {path} did not convert successfully."
+    ), f"Doc {input_path} did not convert successfully."
 
-    doc_pred_pages: PageList = doc_result.pages
+    doc_pred_pages: List[Page] = doc_result.pages
     doc_pred: DsDocument = doc_result.output
     doc_pred_md = doc_result.render_as_markdown()
 
-    pages_path = path.with_suffix(".pages.json")
-    json_path = path.with_suffix(".json")
-    md_path = path.with_suffix(".md")
+    pages_path = input_path.with_suffix(".pages.json")
+    json_path = input_path.with_suffix(".json")
+    md_path = input_path.with_suffix(".md")
 
-    if GENERATE:  # only used when re-generating truth
+    if generate:  # only used when re-generating truth
        with open(pages_path, "w") as fw:
            fw.write(json.dumps(doc_pred_pages, default=pydantic_encoder))
@@ -163,22 +127,22 @@ def test_e2e_conversions():
            fw.write(doc_pred_md)
 
    else:  # default branch in test
        with open(pages_path, "r") as fr:
-           doc_true_pages = PageList.validate_python(json.load(fr))
+           doc_true_pages = PageList.validate_json(fr.read())
 
        with open(json_path, "r") as fr:
-           doc_true = DsDocument.model_validate(json.load(fr))
+           doc_true = DsDocument.model_validate_json(fr.read())
 
        with open(md_path, "r") as fr:
-           doc_true_md = "".join(fr.readlines())
+           doc_true_md = fr.read()
 
        assert verify_cells(
            doc_pred_pages, doc_true_pages
-       ), f"Mismatch in PDF cell prediction for {path}"
+       ), f"Mismatch in PDF cell prediction for {input_path}"
 
        assert verify_output(
            doc_pred, doc_true
-       ), f"Mismatch in JSON prediction for {path}"
+       ), f"Mismatch in JSON prediction for {input_path}"
 
        assert verify_md(
            doc_pred_md, doc_true_md
-       ), f"Mismatch in Markdown prediction for {path}"
+       ), f"Mismatch in Markdown prediction for {input_path}"