Merge branch 'dev/add-strict-tests' of github.com:DS4SD/docling into dev/add-strict-tests

This commit is contained in:
Christoph Auer 2024-08-28 14:22:32 +02:00
commit c09d2bca47
6 changed files with 51 additions and 18 deletions

View File

@ -16,6 +16,12 @@ jobs:
run: poetry run pre-commit run --all-files run: poetry run pre-commit run --all-files
- name: Install with poetry - name: Install with poetry
run: poetry install --all-extras run: poetry install --all-extras
- name: Run examples
run: |
for file in examples/*.py; do
echo "Running example $file"
poetry run python "$file" || exit 1
done
- name: Testing - name: Testing
run: | run: |
poetry run pytest -v tests poetry run pytest -v tests

View File

@ -49,17 +49,18 @@ def export_documents(
f"of which {failure_count} failed " f"of which {failure_count} failed "
f"and {partial_success_count} were partially converted." f"and {partial_success_count} were partially converted."
) )
return success_count, partial_success_count, failure_count
def main(): def main():
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
input_doc_paths = [ input_doc_paths = [
Path("./test/data/2206.01062.pdf"), Path("./tests/data/2206.01062.pdf"),
Path("./test/data/2203.01017v2.pdf"), Path("./tests/data/2203.01017v2.pdf"),
Path("./test/data/2305.03393v1.pdf"), Path("./tests/data/2305.03393v1.pdf"),
Path("./test/data/redp5110.pdf"), Path("./tests/data/redp5110.pdf"),
Path("./test/data/redp5695.pdf"), Path("./tests/data/redp5695.pdf"),
] ]
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read()) # buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
@ -73,12 +74,19 @@ def main():
start_time = time.time() start_time = time.time()
conv_results = doc_converter.convert(input) conv_results = doc_converter.convert(input)
export_documents(conv_results, output_dir=Path("./scratch")) success_count, partial_success_count, failure_count = export_documents(
conv_results, output_dir=Path("./scratch")
)
end_time = time.time() - start_time end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.") _log.info(f"All documents were converted in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@ -42,14 +42,16 @@ def export_documents(
f"Processed {success_count + failure_count} docs, of which {failure_count} failed" f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
) )
return success_count, failure_count
def main(): def main():
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
input_doc_paths = [ input_doc_paths = [
Path("./test/data/2206.01062.pdf"), Path("./tests/data/2206.01062.pdf"),
Path("./test/data/2203.01017v2.pdf"), Path("./tests/data/2203.01017v2.pdf"),
Path("./test/data/2305.03393v1.pdf"), Path("./tests/data/2305.03393v1.pdf"),
] ]
########################################################################### ###########################################################################
@ -114,12 +116,19 @@ def main():
start_time = time.time() start_time = time.time()
conv_results = doc_converter.convert(input) conv_results = doc_converter.convert(input)
export_documents(conv_results, output_dir=Path("./scratch")) success_count, failure_count = export_documents(
conv_results, output_dir=Path("./scratch")
)
end_time = time.time() - start_time end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.") _log.info(f"All documents were converted in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@ -22,7 +22,7 @@ def main():
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
input_doc_paths = [ input_doc_paths = [
Path("./test/data/2206.01062.pdf"), Path("./tests/data/2206.01062.pdf"),
] ]
output_dir = Path("./scratch") output_dir = Path("./scratch")
@ -41,10 +41,13 @@ def main():
conv_results = doc_converter.convert(input_files) conv_results = doc_converter.convert(input_files)
success_count = 0
failure_count = 0
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
for conv_res in conv_results: for conv_res in conv_results:
if conv_res.status != ConversionStatus.SUCCESS: if conv_res.status != ConversionStatus.SUCCESS:
_log.info(f"Document {conv_res.input.file} failed to convert.") _log.info(f"Document {conv_res.input.file} failed to convert.")
failure_count += 1
continue continue
doc_filename = conv_res.input.file.stem doc_filename = conv_res.input.file.stem
@ -66,10 +69,17 @@ def main():
with element_image_filename.open("wb") as fp: with element_image_filename.open("wb") as fp:
image.save(fp, "PNG") image.save(fp, "PNG")
success_count += 1
end_time = time.time() - start_time end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.") _log.info(f"All documents were converted in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
if __name__ == "__main__": if __name__ == "__main__":
main() main()

12
poetry.lock generated
View File

@ -797,13 +797,13 @@ tabulate = ">=0.9.0,<0.10.0"
[[package]] [[package]]
name = "docling-ibm-models" name = "docling-ibm-models"
version = "1.1.2" version = "1.1.3"
description = "This package contains the AI models used by the Docling PDF conversion package" description = "This package contains the AI models used by the Docling PDF conversion package"
optional = false optional = false
python-versions = "<4.0,>=3.10" python-versions = "<4.0,>=3.10"
files = [ files = [
{file = "docling_ibm_models-1.1.2-py3-none-any.whl", hash = "sha256:5b5cc7bfdd690597a43ca64b4528ec4060d3557cf07d1a6ee01a7b952539a607"}, {file = "docling_ibm_models-1.1.3-py3-none-any.whl", hash = "sha256:d7dfbacf5f6c63a150cb2270d67d0d6e892763af99d2a7fd434b0b240da67df3"},
{file = "docling_ibm_models-1.1.2.tar.gz", hash = "sha256:9926769a7053fd3696238ed9e05798b548701df01e2e2f37c48cafc7bfb62b6f"}, {file = "docling_ibm_models-1.1.3.tar.gz", hash = "sha256:dd89476d152c74c1b0a9c445fe31ebca4390d9b23c568bbd175b92a0ee112e77"},
] ]
[package.dependencies] [package.dependencies]
@ -2696,8 +2696,8 @@ files = [
numpy = [ numpy = [
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""}, {version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""}, {version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
] ]
[[package]] [[package]]
@ -2752,8 +2752,8 @@ files = [
[package.dependencies] [package.dependencies]
numpy = [ numpy = [
{version = ">=1.22.4", markers = "python_version < \"3.11\""}, {version = ">=1.22.4", markers = "python_version < \"3.11\""},
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
{version = ">=1.26.0", markers = "python_version >= \"3.12\""}, {version = ">=1.26.0", markers = "python_version >= \"3.12\""},
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
] ]
python-dateutil = ">=2.8.2" python-dateutil = ">=2.8.2"
pytz = ">=2020.1" pytz = ">=2020.1"
@ -5143,4 +5143,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "e0f8f29e02dcc980287efc0b946df1df4d149bfe498cc16abda897842b45b019" content-hash = "35a2c8652173107818d14f38266af07b689fa80a5f0e8f3e06f5708797511cf6"

View File

@ -24,7 +24,7 @@ packages = [{include = "docling"}]
python = "^3.10" python = "^3.10"
pydantic = "^2.0.0" pydantic = "^2.0.0"
docling-core = "^1.1.2" docling-core = "^1.1.2"
docling-ibm-models = "^1.1.2" docling-ibm-models = "^1.1.3"
deepsearch-glm = ">=0.19.0,<1" deepsearch-glm = ">=0.19.0,<1"
filetype = "^1.2.0" filetype = "^1.2.0"
pypdfium2 = "^4.30.0" pypdfium2 = "^4.30.0"