fix: Add unit tests (#51)

* add the pytests

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* renamed the test folder and added the toplevel test

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* updated the toplevel function test

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* need to start running all tests successfully

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* added the reference converted documents

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* added first test for json and md output

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* ran pre-commit

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* replaced deprecated json function with model_dump_json

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* replaced deprecated json function with model_dump_json

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* reformatted code

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* Fix backend tests

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* commented out the drawing

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* ci: avoid duplicate runs

Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>

* commented out json verification for now

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* added verification of input cells

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* reformat code

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* added test to verify the cells in the pages

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* added test to verify the cells in the pages (2)

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* added test to verify the cells in the pages (3)

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* run all examples in CI

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* make sure examples return failures

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* raise a failure if examples fail

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* fix examples

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* run examples after tests

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Add tests and update top_level_tests using only datamodels

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Remove unnecessary code

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Validate conversion status on e2e test

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* package verify utils and add more tests

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* reduce docs in example, since they are already in the tests

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* skip batch_convert

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* pin docling-parse 1.1.2

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* updated the error messages

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* commented out the json verification for now

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* bumped GLM version

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* Fix lockfile

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Pin new docling-parse v1.1.3

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Signed-off-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Peter W. J. Staar
2024-08-30 14:08:20 +02:00
committed by GitHub
parent 256f4d504e
commit 48f4d1ba52
43 changed files with 4999 additions and 353 deletions

View File

@@ -49,17 +49,18 @@ def export_documents(
f"of which {failure_count} failed "
f"and {partial_success_count} were partially converted."
)
return success_count, partial_success_count, failure_count
def main():
logging.basicConfig(level=logging.INFO)
input_doc_paths = [
Path("./test/data/2206.01062.pdf"),
Path("./test/data/2203.01017v2.pdf"),
Path("./test/data/2305.03393v1.pdf"),
Path("./test/data/redp5110.pdf"),
Path("./test/data/redp5695.pdf"),
Path("./tests/data/2206.01062.pdf"),
Path("./tests/data/2203.01017v2.pdf"),
Path("./tests/data/2305.03393v1.pdf"),
Path("./tests/data/redp5110.pdf"),
Path("./tests/data/redp5695.pdf"),
]
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
@@ -73,12 +74,19 @@ def main():
start_time = time.time()
conv_results = doc_converter.convert(input)
export_documents(conv_results, output_dir=Path("./scratch"))
success_count, partial_success_count, failure_count = export_documents(
conv_results, output_dir=Path("./scratch")
)
end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
if __name__ == "__main__":
main()

View File

@@ -42,14 +42,14 @@ def export_documents(
f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
)
return success_count, failure_count
def main():
logging.basicConfig(level=logging.INFO)
input_doc_paths = [
Path("./test/data/2206.01062.pdf"),
Path("./test/data/2203.01017v2.pdf"),
Path("./test/data/2305.03393v1.pdf"),
Path("./tests/data/2206.01062.pdf"),
]
###########################################################################
@@ -114,12 +114,19 @@ def main():
start_time = time.time()
conv_results = doc_converter.convert(input)
export_documents(conv_results, output_dir=Path("./scratch"))
success_count, failure_count = export_documents(
conv_results, output_dir=Path("./scratch")
)
end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
if __name__ == "__main__":
main()

View File

@@ -22,7 +22,7 @@ def main():
logging.basicConfig(level=logging.INFO)
input_doc_paths = [
Path("./test/data/2206.01062.pdf"),
Path("./tests/data/2206.01062.pdf"),
]
output_dir = Path("./scratch")
@@ -41,10 +41,13 @@ def main():
conv_results = doc_converter.convert(input_files)
success_count = 0
failure_count = 0
output_dir.mkdir(parents=True, exist_ok=True)
for conv_res in conv_results:
if conv_res.status != ConversionStatus.SUCCESS:
_log.info(f"Document {conv_res.input.file} failed to convert.")
failure_count += 1
continue
doc_filename = conv_res.input.file.stem
@@ -66,10 +69,17 @@ def main():
with element_image_filename.open("wb") as fp:
image.save(fp, "PNG")
success_count += 1
end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
if __name__ == "__main__":
main()