docling/.github/workflows/checks.yml

on:
  workflow_call:
    inputs:
      push_coverage:
        type: boolean
        description: "If true, the coverage results are pushed to codecov.io."
        default: true
    secrets:
      CODECOV_TOKEN:
        required: false

env:
  HF_HUB_DOWNLOAD_TIMEOUT: "90"
  HF_HUB_ETAG_TIMEOUT: "90"
  UV_FROZEN: "1"
  PYTEST_ML: |-
    tests/test_e2e_conversion.py
    tests/test_e2e_ocr_conversion.py
    tests/test_backend_webp.py
    tests/test_asr_pipeline.py
    tests/test_threaded_pipeline.py
  PYTEST_TO_SKIP: |-
  EXAMPLES_TO_SKIP: '^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|minimal_asr_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model|granitedocling_repetition_stopping)\.py$'

jobs:
  lint:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        python-version: ['3.12']
    steps:
      - uses: actions/checkout@v5

      - name: Install uv and set the python version
        uses: astral-sh/setup-uv@v6
        with:
          python-version: ${{ matrix.python-version }}
          enable-cache: true

      - name: Set pre-commit cache key
        run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> "$GITHUB_ENV"

      - name: Cache pre-commit environments
        uses: actions/cache@v4
        with:
          path: ~/.cache/pre-commit
          key: pre-commit|${{ env.PY }}|${{ hashFiles('.pre-commit-config.yaml') }}
          restore-keys: |
            pre-commit|${{ env.PY }}|

      - name: Install Python Dependencies
        run: uv sync --frozen --all-extras

      - name: Check style
        run: |
          echo "--- Running pre-commit style checks ---"
          uv run pre-commit run --all-files

  run-tests-1:
      runs-on: ubuntu-latest
      strategy:
        fail-fast: false
        matrix:
          python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
      steps:
        - uses: actions/checkout@v5

        - name: Grant permissions to APT cache directory # allows restore
          run: sudo chown -R $USER:$USER /var/cache/apt/archives

        - name: Cache APT packages
          id: apt-cache
          uses: actions/cache@v4
          with:
            path: /var/cache/apt/archives
            key: apt-packages-${{ runner.os }}-${{ hashFiles('.github/workflows/checks.yml') }}
            restore-keys: |
              apt-packages-${{ runner.os }}-

        - name: Install System Dependencies
          run: |
            sudo apt-get -qq update
            sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config

        - name: Set TESSDATA_PREFIX
          run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"

        - name: Install uv and set the python version
          uses: astral-sh/setup-uv@v6
          with:
            python-version: ${{ matrix.python-version }}

        - name: Install Python Dependencies
          run: uv sync --frozen --all-extras

        - name: Cache Models
          uses: actions/cache@v4
          with:
            path: |
              ~/.cache/huggingface
              ~/.cache/modelscope
              ~/.EasyOCR/
            key: models-cache

        - name: Pre-download Models
          run: uv run python -c "import easyocr; reader = easyocr.Reader(['en', 'fr', 'de', 'es'])"

        - name: Run tests for GROUP1
          run: |
            echo "--- Running tests ---"
            GROUP1=$(echo "$PYTEST_ML" | sed -e 's/^/--ignore=/' | tr '\n' ' ')
            echo "Running tests for GROUP1"
            uv run pytest -v --durations=0 --cov=docling --cov-report=xml --cov-context=test $GROUP1

        - name: Upload coverage to Codecov
          if: inputs.push_coverage
          uses: codecov/codecov-action@v5
          with:
            token: ${{ secrets.CODECOV_TOKEN }}
            files: ./coverage.xml
            flags: run-tests-1

        - name: Grant permissions to APT cache directory # allows backup
          run: sudo chown -R $USER:$USER /var/cache/apt/archives

  run-tests-2:
      runs-on: ubuntu-latest
      strategy:
        fail-fast: false
        matrix:
          python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
      steps:
        - uses: actions/checkout@v5

        - name: Grant permissions to APT cache directory # allows restore
          run: sudo chown -R $USER:$USER /var/cache/apt/archives

        - name: Cache APT packages
          id: apt-cache
          uses: actions/cache@v4
          with:
            path: /var/cache/apt/archives
            key: apt-packages-${{ runner.os }}-${{ hashFiles('.github/workflows/checks.yml') }}
            restore-keys: |
              apt-packages-${{ runner.os }}-

        - name: Install System Dependencies
          run: |
            sudo apt-get -qq update
            sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config

        - name: Set TESSDATA_PREFIX
          run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"

        - name: Install uv and set the python version
          uses: astral-sh/setup-uv@v6
          with:
            python-version: ${{ matrix.python-version }}

        - name: Install Python Dependencies
          run: uv sync --frozen --all-extras

        - name: Cache Models
          uses: actions/cache@v4
          with:
            path: |
              ~/.cache/huggingface
              ~/.cache/modelscope
              ~/.EasyOCR/
            key: models-cache

        - name: Pre-download Models
          run: uv run python -c "import easyocr; reader = easyocr.Reader(['en', 'fr', 'de', 'es'])"

        - name: Run tests for GROUP2
          run: |
            echo "--- Running tests ---"
            GROUP2=$(echo "$PYTEST_ML" | tr '\n' ' ')
            echo "Running tests for GROUP2"
            DESELECT_OPT=""
            if [ -n "$PYTEST_TO_SKIP" ]; then
              DESELECT_OPT="--deselect $PYTEST_TO_SKIP"
            fi
            echo "Running tests for GROUP2"
            uv run pytest -v --durations=0 --cov=docling --cov-report=xml --cov-context=test $GROUP2 $DESELECT_OPT

        - name: Upload coverage to Codecov
          if: inputs.push_coverage
          uses: codecov/codecov-action@v5
          with:
            token: ${{ secrets.CODECOV_TOKEN }}
            files: ./coverage.xml
            flags: run-tests-2

        - name: Grant permissions to APT cache directory # allows backup
          run: sudo chown -R $USER:$USER /var/cache/apt/archives

  run-examples:
      runs-on: ubuntu-latest
      strategy:
        fail-fast: false
        matrix:
          python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
      steps:
        - uses: actions/checkout@v5

        - name: Grant permissions to APT cache directory # allows restore
          run: sudo chown -R $USER:$USER /var/cache/apt/archives

        - name: Cache APT packages
          id: apt-cache
          uses: actions/cache@v4
          with:
            path: /var/cache/apt/archives
            key: apt-packages-${{ runner.os }}-${{ hashFiles('.github/workflows/checks.yml') }}
            restore-keys: |
              apt-packages-${{ runner.os }}-

        - name: Install System Dependencies
          run: |
            sudo apt-get -qq update
            sudo apt-get -qq install -y ffmpeg tesseract-ocr tesseract-ocr-eng tesseract-ocr-fra tesseract-ocr-deu tesseract-ocr-spa tesseract-ocr-script-latn libleptonica-dev libtesseract-dev libreoffice pkg-config

        - name: Set TESSDATA_PREFIX
          run: echo "TESSDATA_PREFIX=$(dpkg -L tesseract-ocr-eng | grep tessdata$)" >> "$GITHUB_ENV"

        - name: Install uv and set the python version
          uses: astral-sh/setup-uv@v6
          with:
            python-version: ${{ matrix.python-version }}

        - name: Install Python Dependencies
          run: uv sync --frozen --all-extras

        - name: Cache Models
          uses: actions/cache@v4
          with:
            path: |
              ~/.cache/huggingface
              ~/.cache/modelscope
              ~/.EasyOCR/
            key: models-cache

        - name: Free up disk space
          run: |
            df -h
            sudo rm -rf /usr/share/dotnet
            sudo rm -rf /usr/local/lib/android
            sudo rm -rf /opt/ghc
            sudo apt-get clean
            df -h

        - name: Run examples
          run: |
            echo "--- Creating output directory ---"
            mkdir -p scratch

            echo "--- Running examples ---"

            summary_file="runtime_summary.log"
            echo "--- Example Runtimes ---" > "$summary_file"

            for file in docs/examples/*.py; do
              if [[ "$(basename "$file")" =~ ${EXAMPLES_TO_SKIP} ]]; then
                echo "Skipping example: $(basename "$file")"
              else
                echo "--- Running example $(basename "$file") ---"

                start_time=$SECONDS

                uv run --no-sync python "$file" || exit 1
                duration=$((SECONDS - start_time))
                echo "Finished in ${duration}s."

                echo "$(basename "$file"): ${duration}s" >> "$summary_file"
              fi
            done

            echo
            echo "==================================="
            echo "       Final Runtime Summary       "
            echo "==================================="
            cat "$summary_file"
            echo "==================================="

        - name: Grant permissions to APT cache directory # allows backup
          run: sudo chown -R $USER:$USER /var/cache/apt/archives

  build-package:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ['3.12']
    steps:
      - uses: actions/checkout@v5

      - name: Install uv and set the python version
        uses: astral-sh/setup-uv@v6
        with:
          python-version: ${{ matrix.python-version }}
          enable-cache: true

      - name: Install dependencies
        run: uv sync --all-extras

      - name: Build package
        run: uv build

      - name: Check content of wheel
        run: unzip -l dist/*.whl

      - name: Store the distribution packages
        uses: actions/upload-artifact@v4
        with:
          name: python-package-distributions
          path: dist/

  test-package:
    needs:
      - build-package
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ['3.12']
    steps:
      - name: Download all the dists
        uses: actions/download-artifact@v4
        with:
          name: python-package-distributions
          path: dist/

      - name: Install uv and set the python version
        uses: astral-sh/setup-uv@v6
        with:
          python-version: ${{ matrix.python-version }}
          activate-environment: true
          enable-cache: false

      - name: Install package
        run: |
          uv pip install dist/*.whl

      - name: Run docling
        run: uv run docling --help