mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Merge branch 'dev/add-strict-tests' of github.com:DS4SD/docling into dev/add-strict-tests
This commit is contained in:
commit
c09d2bca47
6
.github/workflows/checks.yml
vendored
6
.github/workflows/checks.yml
vendored
@ -16,6 +16,12 @@ jobs:
|
|||||||
run: poetry run pre-commit run --all-files
|
run: poetry run pre-commit run --all-files
|
||||||
- name: Install with poetry
|
- name: Install with poetry
|
||||||
run: poetry install --all-extras
|
run: poetry install --all-extras
|
||||||
|
- name: Run examples
|
||||||
|
run: |
|
||||||
|
for file in examples/*.py; do
|
||||||
|
echo "Running example $file"
|
||||||
|
poetry run python "$file" || exit 1
|
||||||
|
done
|
||||||
- name: Testing
|
- name: Testing
|
||||||
run: |
|
run: |
|
||||||
poetry run pytest -v tests
|
poetry run pytest -v tests
|
||||||
|
@ -49,17 +49,18 @@ def export_documents(
|
|||||||
f"of which {failure_count} failed "
|
f"of which {failure_count} failed "
|
||||||
f"and {partial_success_count} were partially converted."
|
f"and {partial_success_count} were partially converted."
|
||||||
)
|
)
|
||||||
|
return success_count, partial_success_count, failure_count
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
input_doc_paths = [
|
input_doc_paths = [
|
||||||
Path("./test/data/2206.01062.pdf"),
|
Path("./tests/data/2206.01062.pdf"),
|
||||||
Path("./test/data/2203.01017v2.pdf"),
|
Path("./tests/data/2203.01017v2.pdf"),
|
||||||
Path("./test/data/2305.03393v1.pdf"),
|
Path("./tests/data/2305.03393v1.pdf"),
|
||||||
Path("./test/data/redp5110.pdf"),
|
Path("./tests/data/redp5110.pdf"),
|
||||||
Path("./test/data/redp5695.pdf"),
|
Path("./tests/data/redp5695.pdf"),
|
||||||
]
|
]
|
||||||
|
|
||||||
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
|
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
|
||||||
@ -73,12 +74,19 @@ def main():
|
|||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
conv_results = doc_converter.convert(input)
|
conv_results = doc_converter.convert(input)
|
||||||
export_documents(conv_results, output_dir=Path("./scratch"))
|
success_count, partial_success_count, failure_count = export_documents(
|
||||||
|
conv_results, output_dir=Path("./scratch")
|
||||||
|
)
|
||||||
|
|
||||||
end_time = time.time() - start_time
|
end_time = time.time() - start_time
|
||||||
|
|
||||||
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
||||||
|
|
||||||
|
if failure_count > 0:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
@ -42,14 +42,16 @@ def export_documents(
|
|||||||
f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
|
f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
return success_count, failure_count
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
input_doc_paths = [
|
input_doc_paths = [
|
||||||
Path("./test/data/2206.01062.pdf"),
|
Path("./tests/data/2206.01062.pdf"),
|
||||||
Path("./test/data/2203.01017v2.pdf"),
|
Path("./tests/data/2203.01017v2.pdf"),
|
||||||
Path("./test/data/2305.03393v1.pdf"),
|
Path("./tests/data/2305.03393v1.pdf"),
|
||||||
]
|
]
|
||||||
|
|
||||||
###########################################################################
|
###########################################################################
|
||||||
@ -114,12 +116,19 @@ def main():
|
|||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
|
||||||
conv_results = doc_converter.convert(input)
|
conv_results = doc_converter.convert(input)
|
||||||
export_documents(conv_results, output_dir=Path("./scratch"))
|
success_count, failure_count = export_documents(
|
||||||
|
conv_results, output_dir=Path("./scratch")
|
||||||
|
)
|
||||||
|
|
||||||
end_time = time.time() - start_time
|
end_time = time.time() - start_time
|
||||||
|
|
||||||
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
||||||
|
|
||||||
|
if failure_count > 0:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
@ -22,7 +22,7 @@ def main():
|
|||||||
logging.basicConfig(level=logging.INFO)
|
logging.basicConfig(level=logging.INFO)
|
||||||
|
|
||||||
input_doc_paths = [
|
input_doc_paths = [
|
||||||
Path("./test/data/2206.01062.pdf"),
|
Path("./tests/data/2206.01062.pdf"),
|
||||||
]
|
]
|
||||||
output_dir = Path("./scratch")
|
output_dir = Path("./scratch")
|
||||||
|
|
||||||
@ -41,10 +41,13 @@ def main():
|
|||||||
|
|
||||||
conv_results = doc_converter.convert(input_files)
|
conv_results = doc_converter.convert(input_files)
|
||||||
|
|
||||||
|
success_count = 0
|
||||||
|
failure_count = 0
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
for conv_res in conv_results:
|
for conv_res in conv_results:
|
||||||
if conv_res.status != ConversionStatus.SUCCESS:
|
if conv_res.status != ConversionStatus.SUCCESS:
|
||||||
_log.info(f"Document {conv_res.input.file} failed to convert.")
|
_log.info(f"Document {conv_res.input.file} failed to convert.")
|
||||||
|
failure_count += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
doc_filename = conv_res.input.file.stem
|
doc_filename = conv_res.input.file.stem
|
||||||
@ -66,10 +69,17 @@ def main():
|
|||||||
with element_image_filename.open("wb") as fp:
|
with element_image_filename.open("wb") as fp:
|
||||||
image.save(fp, "PNG")
|
image.save(fp, "PNG")
|
||||||
|
|
||||||
|
success_count += 1
|
||||||
|
|
||||||
end_time = time.time() - start_time
|
end_time = time.time() - start_time
|
||||||
|
|
||||||
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
||||||
|
|
||||||
|
if failure_count > 0:
|
||||||
|
raise RuntimeError(
|
||||||
|
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
12
poetry.lock
generated
12
poetry.lock
generated
@ -797,13 +797,13 @@ tabulate = ">=0.9.0,<0.10.0"
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "docling-ibm-models"
|
name = "docling-ibm-models"
|
||||||
version = "1.1.2"
|
version = "1.1.3"
|
||||||
description = "This package contains the AI models used by the Docling PDF conversion package"
|
description = "This package contains the AI models used by the Docling PDF conversion package"
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = "<4.0,>=3.10"
|
python-versions = "<4.0,>=3.10"
|
||||||
files = [
|
files = [
|
||||||
{file = "docling_ibm_models-1.1.2-py3-none-any.whl", hash = "sha256:5b5cc7bfdd690597a43ca64b4528ec4060d3557cf07d1a6ee01a7b952539a607"},
|
{file = "docling_ibm_models-1.1.3-py3-none-any.whl", hash = "sha256:d7dfbacf5f6c63a150cb2270d67d0d6e892763af99d2a7fd434b0b240da67df3"},
|
||||||
{file = "docling_ibm_models-1.1.2.tar.gz", hash = "sha256:9926769a7053fd3696238ed9e05798b548701df01e2e2f37c48cafc7bfb62b6f"},
|
{file = "docling_ibm_models-1.1.3.tar.gz", hash = "sha256:dd89476d152c74c1b0a9c445fe31ebca4390d9b23c568bbd175b92a0ee112e77"},
|
||||||
]
|
]
|
||||||
|
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
@ -2696,8 +2696,8 @@ files = [
|
|||||||
numpy = [
|
numpy = [
|
||||||
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
|
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
|
||||||
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
|
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
|
||||||
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
|
||||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||||
|
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -2752,8 +2752,8 @@ files = [
|
|||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
numpy = [
|
numpy = [
|
||||||
{version = ">=1.22.4", markers = "python_version < \"3.11\""},
|
{version = ">=1.22.4", markers = "python_version < \"3.11\""},
|
||||||
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
|
|
||||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||||
|
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
|
||||||
]
|
]
|
||||||
python-dateutil = ">=2.8.2"
|
python-dateutil = ">=2.8.2"
|
||||||
pytz = ">=2020.1"
|
pytz = ">=2020.1"
|
||||||
@ -5143,4 +5143,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.10"
|
python-versions = "^3.10"
|
||||||
content-hash = "e0f8f29e02dcc980287efc0b946df1df4d149bfe498cc16abda897842b45b019"
|
content-hash = "35a2c8652173107818d14f38266af07b689fa80a5f0e8f3e06f5708797511cf6"
|
||||||
|
@ -24,7 +24,7 @@ packages = [{include = "docling"}]
|
|||||||
python = "^3.10"
|
python = "^3.10"
|
||||||
pydantic = "^2.0.0"
|
pydantic = "^2.0.0"
|
||||||
docling-core = "^1.1.2"
|
docling-core = "^1.1.2"
|
||||||
docling-ibm-models = "^1.1.2"
|
docling-ibm-models = "^1.1.3"
|
||||||
deepsearch-glm = ">=0.19.0,<1"
|
deepsearch-glm = ">=0.19.0,<1"
|
||||||
filetype = "^1.2.0"
|
filetype = "^1.2.0"
|
||||||
pypdfium2 = "^4.30.0"
|
pypdfium2 = "^4.30.0"
|
||||||
|
Loading…
Reference in New Issue
Block a user