mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
Merge branch 'dev/add-strict-tests' of github.com:DS4SD/docling into dev/add-strict-tests
This commit is contained in:
commit
c09d2bca47
6
.github/workflows/checks.yml
vendored
6
.github/workflows/checks.yml
vendored
@ -16,6 +16,12 @@ jobs:
|
||||
run: poetry run pre-commit run --all-files
|
||||
- name: Install with poetry
|
||||
run: poetry install --all-extras
|
||||
- name: Run examples
|
||||
run: |
|
||||
for file in examples/*.py; do
|
||||
echo "Running example $file"
|
||||
poetry run python "$file" || exit 1
|
||||
done
|
||||
- name: Testing
|
||||
run: |
|
||||
poetry run pytest -v tests
|
||||
|
@ -49,17 +49,18 @@ def export_documents(
|
||||
f"of which {failure_count} failed "
|
||||
f"and {partial_success_count} were partially converted."
|
||||
)
|
||||
return success_count, partial_success_count, failure_count
|
||||
|
||||
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_paths = [
|
||||
Path("./test/data/2206.01062.pdf"),
|
||||
Path("./test/data/2203.01017v2.pdf"),
|
||||
Path("./test/data/2305.03393v1.pdf"),
|
||||
Path("./test/data/redp5110.pdf"),
|
||||
Path("./test/data/redp5695.pdf"),
|
||||
Path("./tests/data/2206.01062.pdf"),
|
||||
Path("./tests/data/2203.01017v2.pdf"),
|
||||
Path("./tests/data/2305.03393v1.pdf"),
|
||||
Path("./tests/data/redp5110.pdf"),
|
||||
Path("./tests/data/redp5695.pdf"),
|
||||
]
|
||||
|
||||
# buf = BytesIO(Path("./test/data/2206.01062.pdf").open("rb").read())
|
||||
@ -73,12 +74,19 @@ def main():
|
||||
start_time = time.time()
|
||||
|
||||
conv_results = doc_converter.convert(input)
|
||||
export_documents(conv_results, output_dir=Path("./scratch"))
|
||||
success_count, partial_success_count, failure_count = export_documents(
|
||||
conv_results, output_dir=Path("./scratch")
|
||||
)
|
||||
|
||||
end_time = time.time() - start_time
|
||||
|
||||
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
||||
|
||||
if failure_count > 0:
|
||||
raise RuntimeError(
|
||||
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
@ -42,14 +42,16 @@ def export_documents(
|
||||
f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
|
||||
)
|
||||
|
||||
return success_count, failure_count
|
||||
|
||||
|
||||
def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_paths = [
|
||||
Path("./test/data/2206.01062.pdf"),
|
||||
Path("./test/data/2203.01017v2.pdf"),
|
||||
Path("./test/data/2305.03393v1.pdf"),
|
||||
Path("./tests/data/2206.01062.pdf"),
|
||||
Path("./tests/data/2203.01017v2.pdf"),
|
||||
Path("./tests/data/2305.03393v1.pdf"),
|
||||
]
|
||||
|
||||
###########################################################################
|
||||
@ -114,12 +116,19 @@ def main():
|
||||
start_time = time.time()
|
||||
|
||||
conv_results = doc_converter.convert(input)
|
||||
export_documents(conv_results, output_dir=Path("./scratch"))
|
||||
success_count, failure_count = export_documents(
|
||||
conv_results, output_dir=Path("./scratch")
|
||||
)
|
||||
|
||||
end_time = time.time() - start_time
|
||||
|
||||
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
||||
|
||||
if failure_count > 0:
|
||||
raise RuntimeError(
|
||||
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
@ -22,7 +22,7 @@ def main():
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
input_doc_paths = [
|
||||
Path("./test/data/2206.01062.pdf"),
|
||||
Path("./tests/data/2206.01062.pdf"),
|
||||
]
|
||||
output_dir = Path("./scratch")
|
||||
|
||||
@ -41,10 +41,13 @@ def main():
|
||||
|
||||
conv_results = doc_converter.convert(input_files)
|
||||
|
||||
success_count = 0
|
||||
failure_count = 0
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
for conv_res in conv_results:
|
||||
if conv_res.status != ConversionStatus.SUCCESS:
|
||||
_log.info(f"Document {conv_res.input.file} failed to convert.")
|
||||
failure_count += 1
|
||||
continue
|
||||
|
||||
doc_filename = conv_res.input.file.stem
|
||||
@ -66,10 +69,17 @@ def main():
|
||||
with element_image_filename.open("wb") as fp:
|
||||
image.save(fp, "PNG")
|
||||
|
||||
success_count += 1
|
||||
|
||||
end_time = time.time() - start_time
|
||||
|
||||
_log.info(f"All documents were converted in {end_time:.2f} seconds.")
|
||||
|
||||
if failure_count > 0:
|
||||
raise RuntimeError(
|
||||
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
12
poetry.lock
generated
12
poetry.lock
generated
@ -797,13 +797,13 @@ tabulate = ">=0.9.0,<0.10.0"
|
||||
|
||||
[[package]]
|
||||
name = "docling-ibm-models"
|
||||
version = "1.1.2"
|
||||
version = "1.1.3"
|
||||
description = "This package contains the AI models used by the Docling PDF conversion package"
|
||||
optional = false
|
||||
python-versions = "<4.0,>=3.10"
|
||||
files = [
|
||||
{file = "docling_ibm_models-1.1.2-py3-none-any.whl", hash = "sha256:5b5cc7bfdd690597a43ca64b4528ec4060d3557cf07d1a6ee01a7b952539a607"},
|
||||
{file = "docling_ibm_models-1.1.2.tar.gz", hash = "sha256:9926769a7053fd3696238ed9e05798b548701df01e2e2f37c48cafc7bfb62b6f"},
|
||||
{file = "docling_ibm_models-1.1.3-py3-none-any.whl", hash = "sha256:d7dfbacf5f6c63a150cb2270d67d0d6e892763af99d2a7fd434b0b240da67df3"},
|
||||
{file = "docling_ibm_models-1.1.3.tar.gz", hash = "sha256:dd89476d152c74c1b0a9c445fe31ebca4390d9b23c568bbd175b92a0ee112e77"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
@ -2696,8 +2696,8 @@ files = [
|
||||
numpy = [
|
||||
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
|
||||
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
|
||||
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -2752,8 +2752,8 @@ files = [
|
||||
[package.dependencies]
|
||||
numpy = [
|
||||
{version = ">=1.22.4", markers = "python_version < \"3.11\""},
|
||||
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
|
||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
|
||||
]
|
||||
python-dateutil = ">=2.8.2"
|
||||
pytz = ">=2020.1"
|
||||
@ -5143,4 +5143,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.10"
|
||||
content-hash = "e0f8f29e02dcc980287efc0b946df1df4d149bfe498cc16abda897842b45b019"
|
||||
content-hash = "35a2c8652173107818d14f38266af07b689fa80a5f0e8f3e06f5708797511cf6"
|
||||
|
@ -24,7 +24,7 @@ packages = [{include = "docling"}]
|
||||
python = "^3.10"
|
||||
pydantic = "^2.0.0"
|
||||
docling-core = "^1.1.2"
|
||||
docling-ibm-models = "^1.1.2"
|
||||
docling-ibm-models = "^1.1.3"
|
||||
deepsearch-glm = ">=0.19.0,<1"
|
||||
filetype = "^1.2.0"
|
||||
pypdfium2 = "^4.30.0"
|
||||
|
Loading…
Reference in New Issue
Block a user