diff --git a/.github/workflows/checks.yml b/.github/workflows/checks.yml index 078f68ba..b403da25 100644 --- a/.github/workflows/checks.yml +++ b/.github/workflows/checks.yml @@ -2,72 +2,290 @@ on: workflow_call: inputs: push_coverage: - type: boolean - description: "If true, the coverage results are pushed to codecov.io." - default: true + type: boolean + description: "If true, the coverage results are pushed to codecov.io." + default: true secrets: CODECOV_TOKEN: - required: false + required: false env: - HF_HUB_DOWNLOAD_TIMEOUT: "60" - HF_HUB_ETAG_TIMEOUT: "60" + HF_HUB_DOWNLOAD_TIMEOUT: "90" + HF_HUB_ETAG_TIMEOUT: "90" UV_FROZEN: "1" + PYTEST_ML: |- + tests/test_e2e_conversion.py + tests/test_e2e_ocr_conversion.py + tests/test_backend_webp.py + tests/test_asr_pipeline.py + tests/test_threaded_pipeline.py + PYTEST_TO_SKIP: |- + EXAMPLES_TO_SKIP: '^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|minimal_asr_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model|granitedocling_repetition_stopping)\.py$' jobs: - run-checks: + lint: runs-on: ubuntu-latest strategy: + fail-fast: false matrix: - python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] + python-version: ['3.12'] steps: - - uses: actions/checkout@v4 - - name: Install tesseract and ffmpeg - run: sudo apt-get update && sudo apt-get install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config - - name: Set TESSDATA_PREFIX - run: | - echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV" - - name: Cache Hugging Face models - uses: actions/cache@v4 - with: - path: ~/.cache/huggingface - key: huggingface-cache-py${{ matrix.python-version }} + - uses: actions/checkout@v5 + - name: Install uv and set the python version - uses: astral-sh/setup-uv@v5 + uses: astral-sh/setup-uv@v6 with: python-version: ${{ matrix.python-version }} enable-cache: true - - name: pre-commit cache key + + - name: Set pre-commit cache key run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> "$GITHUB_ENV" - - uses: actions/cache@v4 + + - name: Cache pre-commit environments + uses: actions/cache@v4 with: path: ~/.cache/pre-commit key: pre-commit|${{ env.PY }}|${{ hashFiles('.pre-commit-config.yaml') }} - - name: Install dependencies - run: uv sync --frozen --all-extras - - name: Check style and run tests - run: pre-commit run --all-files - - name: Testing - run: | - uv run --no-sync pytest -v --cov=docling --cov-report=xml tests - - name: Upload coverage to Codecov - if: inputs.push_coverage - uses: codecov/codecov-action@v5 - with: - token: ${{ secrets.CODECOV_TOKEN }} - files: ./coverage.xml - - name: Run examples - run: | - for file in docs/examples/*.py; do - # Skip batch_convert.py - if [[ "$(basename "$file")" =~ ^(batch_convert|granitedocling_repetition_stopping|compare_vlm_models|minimal|minimal_vlm_pipeline|minimal_asr_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model).py ]]; then - echo "Skipping $file" - continue - fi + restore-keys: | + pre-commit|${{ env.PY }}| - echo "Running example $file" - uv run --no-sync python "$file" || exit 1 - done + - name: Install Python Dependencies + run: uv sync --frozen --all-extras + + - name: Check style + run: | + echo "--- Running pre-commit style checks ---" + uv run pre-commit run --all-files + + run-tests-1: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] + steps: + - uses: actions/checkout@v5 + + - name: Grant permissions to APT cache directory # allows restore + run: sudo chown -R $USER:$USER /var/cache/apt/archives + + - name: Cache APT packages + id: apt-cache + uses: actions/cache@v4 + with: + path: /var/cache/apt/archives + key: apt-packages-${{ runner.os }}-${{ hashFiles('.github/workflows/checks.yml') }} + restore-keys: | + apt-packages-${{ runner.os }}- + + - name: Install System Dependencies + run: | + if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then + sudo apt-get -qq update + fi + sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config + + - name: Set TESSDATA_PREFIX + run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV" + + - name: Install uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Python Dependencies + run: uv sync --frozen --all-extras + + - name: Cache Models + uses: actions/cache@v4 + with: + path: | + ~/.cache/huggingface + ~/.cache/modelscope + ~/.EasyOCR/ + key: models-cache + + - name: Pre-download Models + run: uv run python -c "import easyocr; reader = easyocr.Reader(['en', 'fr', 'de', 'es'])" + + - name: Run tests for GROUP1 + run: | + echo "--- Running tests ---" + GROUP1=$(echo "$PYTEST_ML" | sed -e 's/^/--ignore=/' | tr '\n' ' ') + echo "Running tests for GROUP1" + uv run pytest -v --durations=0 --cov=docling --cov-report=xml --cov-context=test $GROUP1 + + - name: Upload coverage to Codecov + if: inputs.push_coverage + uses: codecov/codecov-action@v5 + with: + token: ${{ secrets.CODECOV_TOKEN }} + files: ./coverage.xml + flags: run-tests-1 + + - name: Grant permissions to APT cache directory # allows backup + run: sudo chown -R $USER:$USER /var/cache/apt/archives + + run-tests-2: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] + steps: + - uses: actions/checkout@v5 + + - name: Grant permissions to APT cache directory # allows restore + run: sudo chown -R $USER:$USER /var/cache/apt/archives + + - name: Cache APT packages + id: apt-cache + uses: actions/cache@v4 + with: + path: /var/cache/apt/archives + key: apt-packages-${{ runner.os }}-${{ hashFiles('.github/workflows/checks.yml') }} + restore-keys: | + apt-packages-${{ runner.os }}- + + - name: Install System Dependencies + run: | + if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then + sudo apt-get -qq update + fi + sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config + + - name: Set TESSDATA_PREFIX + run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV" + + - name: Install uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Python Dependencies + run: uv sync --frozen --all-extras + + - name: Cache Models + uses: actions/cache@v4 + with: + path: | + ~/.cache/huggingface + ~/.cache/modelscope + ~/.EasyOCR/ + key: models-cache + + - name: Pre-download Models + run: uv run python -c "import easyocr; reader = easyocr.Reader(['en', 'fr', 'de', 'es'])" + + - name: Run tests for GROUP2 + run: | + echo "--- Running tests ---" + GROUP2=$(echo "$PYTEST_ML" | tr '\n' ' ') + echo "Running tests for GROUP2" + DESELECT_OPT="" + if [ -n "$PYTEST_TO_SKIP" ]; then + DESELECT_OPT="--deselect $PYTEST_TO_SKIP" + fi + echo "Running tests for GROUP2" + uv run pytest -v --durations=0 --cov=docling --cov-report=xml --cov-context=test $GROUP2 $DESELECT_OPT + + - name: Upload coverage to Codecov + if: inputs.push_coverage + uses: codecov/codecov-action@v5 + with: + token: ${{ secrets.CODECOV_TOKEN }} + files: ./coverage.xml + flags: run-tests-2 + + - name: Grant permissions to APT cache directory # allows backup + run: sudo chown -R $USER:$USER /var/cache/apt/archives + + run-examples: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] + steps: + - uses: actions/checkout@v5 + + - name: Grant permissions to APT cache directory # allows restore + run: sudo chown -R $USER:$USER /var/cache/apt/archives + + - name: Cache APT packages + id: apt-cache + uses: actions/cache@v4 + with: + path: /var/cache/apt/archives + key: apt-packages-${{ runner.os }}-${{ hashFiles('.github/workflows/checks.yml') }} + restore-keys: | + apt-packages-${{ runner.os }}- + + - name: Install System Dependencies + run: | + if [[ "${{ steps.apt-cache.outputs.cache-hit }}" != "true" ]]; then + sudo apt-get -qq update + fi + sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev pkg-config + + - name: Set TESSDATA_PREFIX + run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV" + + - name: Install uv and set the python version + uses: astral-sh/setup-uv@v6 + with: + python-version: ${{ matrix.python-version }} + + - name: Install Python Dependencies + run: uv sync --frozen --all-extras + + - name: Cache Models + uses: actions/cache@v4 + with: + path: | + ~/.cache/huggingface + ~/.cache/modelscope + ~/.EasyOCR/ + key: models-cache + + - name: Pre-download Models + run: uv run python -c "import easyocr; reader = easyocr.Reader(['en', 'fr', 'de', 'es'])" + + - name: Run examples + run: | + echo "--- Creating output directory ---" + mkdir -p scratch + + echo "--- Running examples ---" + + summary_file="runtime_summary.log" + echo "--- Example Runtimes ---" > "$summary_file" + + for file in docs/examples/*.py; do + if [[ "$(basename "$file")" =~ ${EXAMPLES_TO_SKIP} ]]; then + echo "Skipping example: $(basename "$file")" + else + echo "--- Running example $(basename "$file") ---" + + start_time=$SECONDS + + uv run --no-sync python "$file" || exit 1 + duration=$((SECONDS - start_time)) + echo "Finished in ${duration}s." + + echo "$(basename "$file"): ${duration}s" >> "$summary_file" + fi + done + + echo + echo "===================================" + echo " Final Runtime Summary " + echo "===================================" + cat "$summary_file" + echo "===================================" + + - name: Grant permissions to APT cache directory # allows backup + run: sudo chown -R $USER:$USER /var/cache/apt/archives build-package: runs-on: ubuntu-latest @@ -75,18 +293,23 @@ jobs: matrix: python-version: ['3.12'] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 + - name: Install uv and set the python version - uses: astral-sh/setup-uv@v5 + uses: astral-sh/setup-uv@v6 with: python-version: ${{ matrix.python-version }} enable-cache: true + - name: Install dependencies run: uv sync --all-extras + - name: Build package run: uv build + - name: Check content of wheel run: unzip -l dist/*.whl + - name: Store the distribution packages uses: actions/upload-artifact@v4 with: @@ -106,12 +329,17 @@ jobs: with: name: python-package-distributions path: dist/ + - name: Install uv and set the python version - uses: astral-sh/setup-uv@v5 + uses: astral-sh/setup-uv@v6 with: python-version: ${{ matrix.python-version }} - enable-cache: true + activate-environment: true + enable-cache: false + - name: Install package - run: uv pip install dist/*.whl + run: | + uv pip install dist/*.whl + - name: Run docling - run: docling --help + run: uv run docling --help diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a36998d9..11e748b7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,6 +22,6 @@ repos: language: system files: '\.py$' - repo: https://github.com/astral-sh/uv-pre-commit - rev: 0.7.8 + rev: 0.8.3 hooks: - id: uv-lock diff --git a/pyproject.toml b/pyproject.toml index befe0003..eb547be6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -123,6 +123,7 @@ dev = [ "pytest~=8.3", "pytest-cov>=6.1.1", "pytest-dependency~=0.6", + "pytest-durations~=1.6.1", "pytest-xdist~=3.3", "ipykernel~=6.29", "ipywidgets~=8.1", diff --git a/uv.lock b/uv.lock index 6c12c4bb..91ff61a7 100644 --- a/uv.lock +++ b/uv.lock @@ -1160,6 +1160,7 @@ dev = [ { name = "pytest" }, { name = "pytest-cov" }, { name = "pytest-dependency" }, + { name = "pytest-durations" }, { name = "pytest-xdist" }, { name = "python-semantic-release" }, { name = "types-openpyxl" }, @@ -1243,6 +1244,7 @@ dev = [ { name = "pytest", specifier = "~=8.3" }, { name = "pytest-cov", specifier = ">=6.1.1" }, { name = "pytest-dependency", specifier = "~=0.6" }, + { name = "pytest-durations", specifier = "~=1.6.1" }, { name = "pytest-xdist", specifier = "~=3.3" }, { name = "python-semantic-release", specifier = "~=7.32" }, { name = "types-openpyxl", specifier = "~=3.1" }, @@ -5542,6 +5544,18 @@ dependencies = [ ] sdist = { url = "https://files.pythonhosted.org/packages/7e/3b/317cc04e77d707d338540ca67b619df8f247f3f4c9f40e67bf5ea503ad94/pytest-dependency-0.6.0.tar.gz", hash = "sha256:934b0e6a39d95995062c193f7eaeed8a8ffa06ff1bcef4b62b0dc74a708bacc1", size = 19499, upload-time = "2023-12-31T20:38:54.991Z" } +[[package]] +name = "pytest-durations" +version = "1.6.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/90/59/1e73dd71c87aa47bcf37e4f5c25596e94d5628bee15bdbeaaae1585a26e4/pytest_durations-1.6.1.tar.gz", hash = "sha256:bc43fbabb1fef6bb75766d35a6bf142c1e153fef01677ccd4705cc376bf2be34", size = 11630, upload-time = "2025-08-29T06:49:54.554Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c8/a2/1d1914f3ff2c85003962b3f07f7b908fc9fd745584e9a51d392a65e261f6/pytest_durations-1.6.1-py3-none-any.whl", hash = "sha256:af7e0f6b883e7897c8dc1cb294016c6144c25f93a8930d48aec4b40ef461c51e", size = 13521, upload-time = "2025-08-29T06:49:53.386Z" }, +] + [[package]] name = "pytest-xdist" version = "3.8.0"