mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Merge branch 'main' of https://github.com/docling-project/docling into dev/fix_msword_backend_identify_text_after_image
Signed-off-by: Michael Krissgau <michael.krissgau@ibm.com>
This commit is contained in:
commit
2bc564ccef
19
.github/actions/setup-poetry/action.yml
vendored
19
.github/actions/setup-poetry/action.yml
vendored
@ -1,19 +0,0 @@
|
|||||||
name: 'Set up Poetry and install'
|
|
||||||
description: 'Set up a specific version of Poetry and install dependencies using caching.'
|
|
||||||
inputs:
|
|
||||||
python-version:
|
|
||||||
description: "Version range or exact version of Python or PyPy to use, using SemVer's version range syntax."
|
|
||||||
default: '3.11'
|
|
||||||
runs:
|
|
||||||
using: 'composite'
|
|
||||||
steps:
|
|
||||||
- name: Install poetry
|
|
||||||
run: pipx install poetry==1.8.5
|
|
||||||
shell: bash
|
|
||||||
- uses: actions/setup-python@v5
|
|
||||||
with:
|
|
||||||
python-version: ${{ inputs.python-version }}
|
|
||||||
cache: 'poetry'
|
|
||||||
- name: Install dependencies
|
|
||||||
run: poetry install --all-extras
|
|
||||||
shell: bash
|
|
7
.github/scripts/release.sh
vendored
7
.github/scripts/release.sh
vendored
@ -10,11 +10,12 @@ fi
|
|||||||
CHGLOG_FILE="${CHGLOG_FILE:-CHANGELOG.md}"
|
CHGLOG_FILE="${CHGLOG_FILE:-CHANGELOG.md}"
|
||||||
|
|
||||||
# update package version
|
# update package version
|
||||||
poetry version "${TARGET_VERSION}"
|
uvx --from=toml-cli toml set --toml-path=pyproject.toml project.version "${TARGET_VERSION}"
|
||||||
|
UV_FROZEN=0 uv lock --upgrade-package docling
|
||||||
|
|
||||||
# collect release notes
|
# collect release notes
|
||||||
REL_NOTES=$(mktemp)
|
REL_NOTES=$(mktemp)
|
||||||
poetry run semantic-release changelog --unreleased >> "${REL_NOTES}"
|
uv run --no-sync semantic-release changelog --unreleased >> "${REL_NOTES}"
|
||||||
|
|
||||||
# update changelog
|
# update changelog
|
||||||
TMP_CHGLOG=$(mktemp)
|
TMP_CHGLOG=$(mktemp)
|
||||||
@ -30,7 +31,7 @@ mv "${TMP_CHGLOG}" "${CHGLOG_FILE}"
|
|||||||
# push changes
|
# push changes
|
||||||
git config --global user.name 'github-actions[bot]'
|
git config --global user.name 'github-actions[bot]'
|
||||||
git config --global user.email 'github-actions[bot]@users.noreply.github.com'
|
git config --global user.email 'github-actions[bot]@users.noreply.github.com'
|
||||||
git add pyproject.toml "${CHGLOG_FILE}"
|
git add pyproject.toml uv.lock "${CHGLOG_FILE}"
|
||||||
COMMIT_MSG="chore: bump version to ${TARGET_VERSION} [skip ci]"
|
COMMIT_MSG="chore: bump version to ${TARGET_VERSION} [skip ci]"
|
||||||
git commit -m "${COMMIT_MSG}"
|
git commit -m "${COMMIT_MSG}"
|
||||||
git push origin main
|
git push origin main
|
||||||
|
23
.github/workflows/cd.yml
vendored
23
.github/workflows/cd.yml
vendored
@ -4,8 +4,7 @@ on:
|
|||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
|
|
||||||
env:
|
env:
|
||||||
# disable keyring (https://github.com/actions/runner-images/issues/6185):
|
UV_FROZEN: "1"
|
||||||
PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring
|
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
code-checks:
|
code-checks:
|
||||||
@ -20,15 +19,20 @@ jobs:
|
|||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
with:
|
with:
|
||||||
fetch-depth: 0 # for fetching tags, required for semantic-release
|
fetch-depth: 0 # for fetching tags, required for semantic-release
|
||||||
- uses: ./.github/actions/setup-poetry
|
- name: Install uv and set the python version
|
||||||
|
uses: astral-sh/setup-uv@v5
|
||||||
|
with:
|
||||||
|
enable-cache: true
|
||||||
|
- name: Install dependencies
|
||||||
|
run: uv sync --only-dev
|
||||||
- name: Check version of potential release
|
- name: Check version of potential release
|
||||||
id: version_check
|
id: version_check
|
||||||
run: |
|
run: |
|
||||||
TRGT_VERSION=$(poetry run semantic-release print-version)
|
TRGT_VERSION=$(uv run --no-sync semantic-release print-version)
|
||||||
echo "TRGT_VERSION=${TRGT_VERSION}" >> $GITHUB_OUTPUT
|
echo "TRGT_VERSION=${TRGT_VERSION}" >> "$GITHUB_OUTPUT"
|
||||||
echo "${TRGT_VERSION}"
|
echo "${TRGT_VERSION}"
|
||||||
- name: Check notes of potential release
|
- name: Check notes of potential release
|
||||||
run: poetry run semantic-release changelog --unreleased
|
run: uv run --no-sync semantic-release changelog --unreleased
|
||||||
release:
|
release:
|
||||||
needs: [code-checks, pre-release-check]
|
needs: [code-checks, pre-release-check]
|
||||||
if: needs.pre-release-check.outputs.TARGET_TAG_V != ''
|
if: needs.pre-release-check.outputs.TARGET_TAG_V != ''
|
||||||
@ -45,7 +49,12 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
token: ${{ steps.app-token.outputs.token }}
|
token: ${{ steps.app-token.outputs.token }}
|
||||||
fetch-depth: 0 # for fetching tags, required for semantic-release
|
fetch-depth: 0 # for fetching tags, required for semantic-release
|
||||||
- uses: ./.github/actions/setup-poetry
|
- name: Install uv and set the python version
|
||||||
|
uses: astral-sh/setup-uv@v5
|
||||||
|
with:
|
||||||
|
enable-cache: true
|
||||||
|
- name: Install dependencies
|
||||||
|
run: uv sync --only-dev
|
||||||
- name: Run release script
|
- name: Run release script
|
||||||
env:
|
env:
|
||||||
GH_TOKEN: ${{ steps.app-token.outputs.token }}
|
GH_TOKEN: ${{ steps.app-token.outputs.token }}
|
||||||
|
74
.github/workflows/checks.yml
vendored
74
.github/workflows/checks.yml
vendored
@ -12,6 +12,7 @@ on:
|
|||||||
env:
|
env:
|
||||||
HF_HUB_DOWNLOAD_TIMEOUT: "60"
|
HF_HUB_DOWNLOAD_TIMEOUT: "60"
|
||||||
HF_HUB_ETAG_TIMEOUT: "60"
|
HF_HUB_ETAG_TIMEOUT: "60"
|
||||||
|
UV_FROZEN: "1"
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
run-checks:
|
run-checks:
|
||||||
@ -31,16 +32,24 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
path: ~/.cache/huggingface
|
path: ~/.cache/huggingface
|
||||||
key: huggingface-cache-py${{ matrix.python-version }}
|
key: huggingface-cache-py${{ matrix.python-version }}
|
||||||
- uses: ./.github/actions/setup-poetry
|
- name: Install uv and set the python version
|
||||||
|
uses: astral-sh/setup-uv@v5
|
||||||
with:
|
with:
|
||||||
python-version: ${{ matrix.python-version }}
|
python-version: ${{ matrix.python-version }}
|
||||||
- name: Run styling check
|
enable-cache: true
|
||||||
run: poetry run pre-commit run --all-files
|
- name: pre-commit cache key
|
||||||
- name: Install with poetry
|
run: echo "PY=$(python -VV | sha256sum | cut -d' ' -f1)" >> "$GITHUB_ENV"
|
||||||
run: poetry install --all-extras
|
- uses: actions/cache@v4
|
||||||
|
with:
|
||||||
|
path: ~/.cache/pre-commit
|
||||||
|
key: pre-commit|${{ env.PY }}|${{ hashFiles('.pre-commit-config.yaml') }}
|
||||||
|
- name: Install dependencies
|
||||||
|
run: uv sync --frozen --all-extras
|
||||||
|
- name: Check style and run tests
|
||||||
|
run: pre-commit run --all-files
|
||||||
- name: Testing
|
- name: Testing
|
||||||
run: |
|
run: |
|
||||||
poetry run pytest -v --cov=docling --cov-report=xml tests
|
uv run --no-sync pytest -v --cov=docling --cov-report=xml tests
|
||||||
- name: Upload coverage to Codecov
|
- name: Upload coverage to Codecov
|
||||||
if: inputs.push_coverage
|
if: inputs.push_coverage
|
||||||
uses: codecov/codecov-action@v5
|
uses: codecov/codecov-action@v5
|
||||||
@ -51,13 +60,58 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
for file in docs/examples/*.py; do
|
for file in docs/examples/*.py; do
|
||||||
# Skip batch_convert.py
|
# Skip batch_convert.py
|
||||||
if [[ "$(basename "$file")" =~ ^(batch_convert|minimal_vlm_pipeline|minimal|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model).py ]]; then
|
if [[ "$(basename "$file")" =~ ^(batch_convert|compare_vlm_models|minimal|minimal_vlm_pipeline|export_multimodal|custom_convert|develop_picture_enrichment|rapidocr_with_custom_models|offline_convert|pictures_description|pictures_description_api|vlm_pipeline_api_model).py ]]; then
|
||||||
echo "Skipping $file"
|
echo "Skipping $file"
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
echo "Running example $file"
|
echo "Running example $file"
|
||||||
poetry run python "$file" || exit 1
|
uv run --no-sync python "$file" || exit 1
|
||||||
done
|
done
|
||||||
- name: Build with poetry
|
|
||||||
run: poetry build
|
build-package:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
python-version: ['3.12']
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
- name: Install uv and set the python version
|
||||||
|
uses: astral-sh/setup-uv@v5
|
||||||
|
with:
|
||||||
|
python-version: ${{ matrix.python-version }}
|
||||||
|
enable-cache: true
|
||||||
|
- name: Install dependencies
|
||||||
|
run: uv sync --all-extras
|
||||||
|
- name: Build package
|
||||||
|
run: uv build
|
||||||
|
- name: Check content of wheel
|
||||||
|
run: unzip -l dist/*.whl
|
||||||
|
- name: Store the distribution packages
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: python-package-distributions
|
||||||
|
path: dist/
|
||||||
|
|
||||||
|
test-package:
|
||||||
|
needs:
|
||||||
|
- build-package
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
python-version: ['3.12']
|
||||||
|
steps:
|
||||||
|
- name: Download all the dists
|
||||||
|
uses: actions/download-artifact@v4
|
||||||
|
with:
|
||||||
|
name: python-package-distributions
|
||||||
|
path: dist/
|
||||||
|
- name: Install uv and set the python version
|
||||||
|
uses: astral-sh/setup-uv@v5
|
||||||
|
with:
|
||||||
|
python-version: ${{ matrix.python-version }}
|
||||||
|
enable-cache: true
|
||||||
|
- name: Install package
|
||||||
|
run: uv pip install dist/*.whl
|
||||||
|
- name: Run docling
|
||||||
|
run: docling --help
|
||||||
|
3
.github/workflows/ci-docs.yml
vendored
3
.github/workflows/ci-docs.yml
vendored
@ -8,6 +8,9 @@ on:
|
|||||||
- "**"
|
- "**"
|
||||||
- "!gh-pages"
|
- "!gh-pages"
|
||||||
|
|
||||||
|
env:
|
||||||
|
UV_FROZEN: "1"
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
build-docs:
|
build-docs:
|
||||||
if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'docling-project/docling' && github.event.pull_request.head.repo.full_name != 'docling-project/docling') }}
|
if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'docling-project/docling' && github.event.pull_request.head.repo.full_name != 'docling-project/docling') }}
|
||||||
|
4
.github/workflows/ci.yml
vendored
4
.github/workflows/ci.yml
vendored
@ -9,10 +9,6 @@ on:
|
|||||||
- "!main"
|
- "!main"
|
||||||
- "!gh-pages"
|
- "!gh-pages"
|
||||||
|
|
||||||
env:
|
|
||||||
# disable keyring (https://github.com/actions/runner-images/issues/6185):
|
|
||||||
PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring
|
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
code-checks:
|
code-checks:
|
||||||
if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'docling-project/docling' && github.event.pull_request.head.repo.full_name != 'docling-project/docling') }}
|
if: ${{ github.event_name == 'push' || (github.event.pull_request.head.repo.full_name != 'docling-project/docling' && github.event.pull_request.head.repo.full_name != 'docling-project/docling') }}
|
||||||
|
13
.github/workflows/docs.yml
vendored
13
.github/workflows/docs.yml
vendored
@ -6,14 +6,21 @@ on:
|
|||||||
description: "If true, the docs will be deployed."
|
description: "If true, the docs will be deployed."
|
||||||
default: false
|
default: false
|
||||||
|
|
||||||
|
env:
|
||||||
|
UV_FROZEN: "1"
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
run-docs:
|
run-docs:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
- uses: ./.github/actions/setup-poetry
|
- name: Install uv and set the python version
|
||||||
|
uses: astral-sh/setup-uv@v5
|
||||||
|
with:
|
||||||
|
python-version: ${{ matrix.python-version }}
|
||||||
|
enable-cache: true
|
||||||
- name: Build docs
|
- name: Build docs
|
||||||
run: poetry run mkdocs build --verbose --clean
|
run: uv run mkdocs build --verbose --clean
|
||||||
- name: Build and push docs
|
- name: Build and push docs
|
||||||
if: inputs.deploy
|
if: inputs.deploy
|
||||||
run: poetry run mkdocs gh-deploy --force
|
run: uv run --no-sync mkdocs gh-deploy --force
|
||||||
|
22
.github/workflows/pypi.yml
vendored
22
.github/workflows/pypi.yml
vendored
@ -4,16 +4,18 @@ on:
|
|||||||
release:
|
release:
|
||||||
types: [published]
|
types: [published]
|
||||||
|
|
||||||
|
env:
|
||||||
|
UV_FROZEN: "1"
|
||||||
|
|
||||||
permissions:
|
permissions:
|
||||||
contents: read
|
contents: read
|
||||||
|
|
||||||
env:
|
|
||||||
# disable keyring (https://github.com/actions/runner-images/issues/6185):
|
|
||||||
PYTHON_KEYRING_BACKEND: keyring.backends.null.Keyring
|
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
build-and-publish:
|
build-and-publish:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
python-version: ['3.12']
|
||||||
environment:
|
environment:
|
||||||
name: pypi
|
name: pypi
|
||||||
url: https://pypi.org/p/docling
|
url: https://pypi.org/p/docling
|
||||||
@ -21,9 +23,15 @@ jobs:
|
|||||||
id-token: write # IMPORTANT: mandatory for trusted publishing
|
id-token: write # IMPORTANT: mandatory for trusted publishing
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
- uses: ./.github/actions/setup-poetry
|
- name: Install uv and set the python version
|
||||||
- name: Build and publish
|
uses: astral-sh/setup-uv@v5
|
||||||
run: poetry build
|
with:
|
||||||
|
python-version: ${{ matrix.python-version }}
|
||||||
|
enable-cache: true
|
||||||
|
- name: Install dependencies
|
||||||
|
run: uv sync --all-extras
|
||||||
|
- name: Build package
|
||||||
|
run: uv build
|
||||||
- name: Publish distribution 📦 to PyPI
|
- name: Publish distribution 📦 to PyPI
|
||||||
uses: pypa/gh-action-pypi-publish@release/v1
|
uses: pypa/gh-action-pypi-publish@release/v1
|
||||||
with:
|
with:
|
||||||
|
@ -17,12 +17,11 @@ repos:
|
|||||||
hooks:
|
hooks:
|
||||||
- id: mypy
|
- id: mypy
|
||||||
name: MyPy
|
name: MyPy
|
||||||
entry: poetry run mypy docling
|
entry: uv run --no-sync mypy docling
|
||||||
pass_filenames: false
|
pass_filenames: false
|
||||||
language: system
|
language: system
|
||||||
files: '\.py$'
|
files: '\.py$'
|
||||||
- id: poetry
|
- repo: https://github.com/astral-sh/uv-pre-commit
|
||||||
name: Poetry check
|
rev: 0.7.8
|
||||||
entry: poetry check --lock
|
hooks:
|
||||||
pass_filenames: false
|
- id: uv-lock
|
||||||
language: system
|
|
||||||
|
32
CHANGELOG.md
32
CHANGELOG.md
@ -1,3 +1,35 @@
|
|||||||
|
## [v2.36.1](https://github.com/docling-project/docling/releases/tag/v2.36.1) - 2025-06-04
|
||||||
|
|
||||||
|
### Fix
|
||||||
|
|
||||||
|
* Remove typer and click constraints ([#1707](https://github.com/docling-project/docling/issues/1707)) ([`8846f1a`](https://github.com/docling-project/docling/commit/8846f1a393923a6badcca3a78a664a4dd31eae0d))
|
||||||
|
|
||||||
|
### Documentation
|
||||||
|
|
||||||
|
* Flash-attn usage and install ([#1706](https://github.com/docling-project/docling/issues/1706)) ([`be42b03`](https://github.com/docling-project/docling/commit/be42b03f9b366bed33e95c1033b90c63f300b914))
|
||||||
|
|
||||||
|
## [v2.36.0](https://github.com/docling-project/docling/releases/tag/v2.36.0) - 2025-06-03
|
||||||
|
|
||||||
|
### Feature
|
||||||
|
|
||||||
|
* Simplify dependencies, switch to uv ([#1700](https://github.com/docling-project/docling/issues/1700)) ([`cdd4018`](https://github.com/docling-project/docling/commit/cdd401847a35f16d69944eb6dddf57e4e0b65020))
|
||||||
|
* New vlm-models support ([#1570](https://github.com/docling-project/docling/issues/1570)) ([`cfdf4ce`](https://github.com/docling-project/docling/commit/cfdf4cea25e681fc557df310b8bf34f3dd892e15))
|
||||||
|
|
||||||
|
## [v2.35.0](https://github.com/docling-project/docling/releases/tag/v2.35.0) - 2025-06-02
|
||||||
|
|
||||||
|
### Feature
|
||||||
|
|
||||||
|
* Add visualization of bbox on page with html export. ([#1663](https://github.com/docling-project/docling/issues/1663)) ([`b356b33`](https://github.com/docling-project/docling/commit/b356b33059bdeeaf1584d9d189cbf1c4832e367c))
|
||||||
|
|
||||||
|
### Fix
|
||||||
|
|
||||||
|
* Guess HTML content starting with script tag ([#1673](https://github.com/docling-project/docling/issues/1673)) ([`984cb13`](https://github.com/docling-project/docling/commit/984cb137f6a8ae2f3a63623add6c474d97ef8739))
|
||||||
|
* UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd0 in position 0: invalid continuation byte ([#1665](https://github.com/docling-project/docling/issues/1665)) ([`51d3450`](https://github.com/docling-project/docling/commit/51d34509156e2dbec9e697276681d59f9ca7e020))
|
||||||
|
|
||||||
|
### Documentation
|
||||||
|
|
||||||
|
* Fix typo in index.md ([#1676](https://github.com/docling-project/docling/issues/1676)) ([`11ca4f7`](https://github.com/docling-project/docling/commit/11ca4f7a7bd8068bee472510dd71f1cd58f86f17))
|
||||||
|
|
||||||
## [v2.34.0](https://github.com/docling-project/docling/releases/tag/v2.34.0) - 2025-05-22
|
## [v2.34.0](https://github.com/docling-project/docling/releases/tag/v2.34.0) - 2025-05-22
|
||||||
|
|
||||||
### Feature
|
### Feature
|
||||||
|
@ -6,70 +6,52 @@ For more details on the contributing guidelines head to the Docling Project [com
|
|||||||
|
|
||||||
## Developing
|
## Developing
|
||||||
|
|
||||||
### Usage of Poetry
|
### Usage of uv
|
||||||
|
|
||||||
We use Poetry to manage dependencies.
|
We use [uv](https://docs.astral.sh/uv/) as package and project manager.
|
||||||
|
|
||||||
#### Installation
|
#### Installation
|
||||||
|
|
||||||
To install Poetry, follow the documentation here: https://python-poetry.org/docs/master/#installing-with-the-official-installer
|
To install `uv`, check the documentation on [Installing uv](https://docs.astral.sh/uv/getting-started/installation/).
|
||||||
|
|
||||||
1. Install Poetry globally on your machine:
|
#### Create an environment and sync it
|
||||||
```bash
|
|
||||||
curl -sSL https://install.python-poetry.org | python3 -
|
|
||||||
```
|
|
||||||
The installation script will print the installation bin folder `POETRY_BIN` which you need in the next steps.
|
|
||||||
|
|
||||||
2. Make sure Poetry is in your `$PATH`:
|
You can use `uv sync` to create a project virtual environment (if it does not already exist) and sync
|
||||||
- for `zsh`:
|
the project's dependencies with the environment.
|
||||||
```sh
|
|
||||||
echo 'export PATH="POETRY_BIN:$PATH"' >> ~/.zshrc
|
|
||||||
```
|
|
||||||
- for `bash`:
|
|
||||||
```sh
|
|
||||||
echo 'export PATH="POETRY_BIN:$PATH"' >> ~/.bashrc
|
|
||||||
```
|
|
||||||
|
|
||||||
3. The official guidelines linked above include useful details on configuring autocomplete for most shell environments, e.g., Bash and Zsh.
|
|
||||||
|
|
||||||
#### Create a Virtual Environment and Install Dependencies
|
|
||||||
|
|
||||||
To activate the Virtual Environment, run:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
poetry shell
|
uv sync
|
||||||
```
|
```
|
||||||
|
|
||||||
This will spawn a shell with the Virtual Environment activated. If the Virtual Environment doesn't exist, Poetry will create one for you. Then, to install dependencies, run:
|
#### Use a specific Python version (optional)
|
||||||
|
|
||||||
|
If you need to work with a specific version of Python, you can create a new virtual environment for that version
|
||||||
|
and run the sync command:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
poetry install
|
uv venv --python 3.12
|
||||||
|
uv sync
|
||||||
```
|
```
|
||||||
|
|
||||||
**(Advanced) Use a Specific Python Version**
|
More detailed options are described in the [Using Python environments](https://docs.astral.sh/uv/pip/environments/) documentation.
|
||||||
|
|
||||||
If you need to work with a specific (older) version of Python, run:
|
#### Add a new dependency
|
||||||
|
|
||||||
|
Simply use the `uv add` command. The `pyproject.toml` and `uv.lock` files will be updated.
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
poetry env use $(which python3.8)
|
uv add [OPTIONS] <PACKAGES|--requirements <REQUIREMENTS>>
|
||||||
```
|
|
||||||
|
|
||||||
This creates a Virtual Environment with Python 3.8. For other versions, replace `$(which python3.8)` with the path to the interpreter (e.g., `/usr/bin/python3.8`) or use `$(which pythonX.Y)`.
|
|
||||||
|
|
||||||
#### Add a New Dependency
|
|
||||||
|
|
||||||
```bash
|
|
||||||
poetry add NAME
|
|
||||||
```
|
```
|
||||||
|
|
||||||
## Coding Style Guidelines
|
## Coding Style Guidelines
|
||||||
|
|
||||||
We use the following tools to enforce code style:
|
We use the following tools to enforce code style:
|
||||||
|
|
||||||
- iSort, to sort imports
|
- [Ruff](https://docs.astral.sh/ruff/), as linter and code formatter
|
||||||
- Black, to format code
|
- [MyPy](https://mypy.readthedocs.io), as static type checker
|
||||||
|
|
||||||
We run a series of checks on the codebase on every commit using `pre-commit`. To install the hooks, run:
|
A set of styling checks, as well as regression tests, are defined and managed through the [pre-commit](https://pre-commit.com/) framework.
|
||||||
|
To ensure that those scripts run automatically before a commit is finalized, install `pre-commit` on your local repository:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pre-commit install
|
pre-commit install
|
||||||
@ -81,7 +63,7 @@ To run the checks on-demand, run:
|
|||||||
pre-commit run --all-files
|
pre-commit run --all-files
|
||||||
```
|
```
|
||||||
|
|
||||||
Note: Checks like `Black` and `isort` will "fail" if they modify files. This is because `pre-commit` doesn't like to see files modified by its hooks. In these cases, `git add` the modified files and `git commit` again.
|
Note: Checks like `Ruff` will "fail" if they modify files. This is because `pre-commit` doesn't like to see files modified by its hooks. In these cases, `git add` the modified files and `git commit` again.
|
||||||
|
|
||||||
## Tests
|
## Tests
|
||||||
|
|
||||||
@ -94,7 +76,7 @@ When a change improves the conversion results, multiple reference documents must
|
|||||||
The reference data can be regenerated with
|
The reference data can be regenerated with
|
||||||
|
|
||||||
```sh
|
```sh
|
||||||
DOCLING_GEN_TEST_DATA=1 poetry run pytest
|
DOCLING_GEN_TEST_DATA=1 uv run pytest
|
||||||
```
|
```
|
||||||
|
|
||||||
All PRs modifying the reference test data require a double review to guarantee we don't miss edge cases.
|
All PRs modifying the reference test data require a double review to guarantee we don't miss edge cases.
|
||||||
|
@ -14,9 +14,8 @@
|
|||||||
[](https://docling-project.github.io/docling/)
|
[](https://docling-project.github.io/docling/)
|
||||||
[](https://pypi.org/project/docling/)
|
[](https://pypi.org/project/docling/)
|
||||||
[](https://pypi.org/project/docling/)
|
[](https://pypi.org/project/docling/)
|
||||||
[](https://python-poetry.org/)
|
[](https://github.com/astral-sh/uv)
|
||||||
[](https://github.com/psf/black)
|
[](https://github.com/astral-sh/ruff)
|
||||||
[](https://pycqa.github.io/isort/)
|
|
||||||
[](https://pydantic.dev)
|
[](https://pydantic.dev)
|
||||||
[](https://github.com/pre-commit/pre-commit)
|
[](https://github.com/pre-commit/pre-commit)
|
||||||
[](https://opensource.org/licenses/MIT)
|
[](https://opensource.org/licenses/MIT)
|
||||||
@ -36,7 +35,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|||||||
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
||||||
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
||||||
* 🔍 Extensive OCR support for scanned PDFs and images
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
||||||
* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🆕
|
* 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview))
|
||||||
* 💻 Simple and convenient CLI
|
* 💻 Simple and convenient CLI
|
||||||
|
|
||||||
### Coming soon
|
### Coming soon
|
||||||
|
@ -28,6 +28,7 @@ from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBacke
|
|||||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
||||||
from docling.datamodel.base_models import (
|
from docling.datamodel.base_models import (
|
||||||
ConversionStatus,
|
ConversionStatus,
|
||||||
FormatToExtensions,
|
FormatToExtensions,
|
||||||
@ -36,8 +37,6 @@ from docling.datamodel.base_models import (
|
|||||||
)
|
)
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
|
||||||
AcceleratorOptions,
|
|
||||||
EasyOcrOptions,
|
EasyOcrOptions,
|
||||||
OcrOptions,
|
OcrOptions,
|
||||||
PaginatedPipelineOptions,
|
PaginatedPipelineOptions,
|
||||||
@ -45,14 +44,16 @@ from docling.datamodel.pipeline_options import (
|
|||||||
PdfPipeline,
|
PdfPipeline,
|
||||||
PdfPipelineOptions,
|
PdfPipelineOptions,
|
||||||
TableFormerMode,
|
TableFormerMode,
|
||||||
VlmModelType,
|
|
||||||
VlmPipelineOptions,
|
VlmPipelineOptions,
|
||||||
granite_vision_vlm_conversion_options,
|
|
||||||
granite_vision_vlm_ollama_conversion_options,
|
|
||||||
smoldocling_vlm_conversion_options,
|
|
||||||
smoldocling_vlm_mlx_conversion_options,
|
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
|
from docling.datamodel.vlm_model_specs import (
|
||||||
|
GRANITE_VISION_OLLAMA,
|
||||||
|
GRANITE_VISION_TRANSFORMERS,
|
||||||
|
SMOLDOCLING_MLX,
|
||||||
|
SMOLDOCLING_TRANSFORMERS,
|
||||||
|
VlmModelType,
|
||||||
|
)
|
||||||
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
from docling.document_converter import DocumentConverter, FormatOption, PdfFormatOption
|
||||||
from docling.models.factories import get_ocr_factory
|
from docling.models.factories import get_ocr_factory
|
||||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||||
@ -579,20 +580,16 @@ def convert( # noqa: C901
|
|||||||
)
|
)
|
||||||
|
|
||||||
if vlm_model == VlmModelType.GRANITE_VISION:
|
if vlm_model == VlmModelType.GRANITE_VISION:
|
||||||
pipeline_options.vlm_options = granite_vision_vlm_conversion_options
|
pipeline_options.vlm_options = GRANITE_VISION_TRANSFORMERS
|
||||||
elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
|
elif vlm_model == VlmModelType.GRANITE_VISION_OLLAMA:
|
||||||
pipeline_options.vlm_options = (
|
pipeline_options.vlm_options = GRANITE_VISION_OLLAMA
|
||||||
granite_vision_vlm_ollama_conversion_options
|
|
||||||
)
|
|
||||||
elif vlm_model == VlmModelType.SMOLDOCLING:
|
elif vlm_model == VlmModelType.SMOLDOCLING:
|
||||||
pipeline_options.vlm_options = smoldocling_vlm_conversion_options
|
pipeline_options.vlm_options = SMOLDOCLING_TRANSFORMERS
|
||||||
if sys.platform == "darwin":
|
if sys.platform == "darwin":
|
||||||
try:
|
try:
|
||||||
import mlx_vlm
|
import mlx_vlm
|
||||||
|
|
||||||
pipeline_options.vlm_options = (
|
pipeline_options.vlm_options = SMOLDOCLING_MLX
|
||||||
smoldocling_vlm_mlx_conversion_options
|
|
||||||
)
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
_log.warning(
|
_log.warning(
|
||||||
"To run SmolDocling faster, please install mlx-vlm:\n"
|
"To run SmolDocling faster, please install mlx-vlm:\n"
|
||||||
|
68
docling/datamodel/accelerator_options.py
Normal file
68
docling/datamodel/accelerator_options.py
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Any, Union
|
||||||
|
|
||||||
|
from pydantic import field_validator, model_validator
|
||||||
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class AcceleratorDevice(str, Enum):
|
||||||
|
"""Devices to run model inference"""
|
||||||
|
|
||||||
|
AUTO = "auto"
|
||||||
|
CPU = "cpu"
|
||||||
|
CUDA = "cuda"
|
||||||
|
MPS = "mps"
|
||||||
|
|
||||||
|
|
||||||
|
class AcceleratorOptions(BaseSettings):
|
||||||
|
model_config = SettingsConfigDict(
|
||||||
|
env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
|
||||||
|
)
|
||||||
|
|
||||||
|
num_threads: int = 4
|
||||||
|
device: Union[str, AcceleratorDevice] = "auto"
|
||||||
|
cuda_use_flash_attention2: bool = False
|
||||||
|
|
||||||
|
@field_validator("device")
|
||||||
|
def validate_device(cls, value):
|
||||||
|
# "auto", "cpu", "cuda", "mps", or "cuda:N"
|
||||||
|
if value in {d.value for d in AcceleratorDevice} or re.match(
|
||||||
|
r"^cuda(:\d+)?$", value
|
||||||
|
):
|
||||||
|
return value
|
||||||
|
raise ValueError(
|
||||||
|
"Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
|
||||||
|
)
|
||||||
|
|
||||||
|
@model_validator(mode="before")
|
||||||
|
@classmethod
|
||||||
|
def check_alternative_envvars(cls, data: Any) -> Any:
|
||||||
|
r"""
|
||||||
|
Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
|
||||||
|
The alternative envvar is used only if it is valid and the regular envvar is not set.
|
||||||
|
|
||||||
|
Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
|
||||||
|
the same functionality. In case the alias envvar is set and the user tries to override the
|
||||||
|
parameter in settings initialization, Pydantic treats the parameter provided in __init__()
|
||||||
|
as an extra input instead of simply overwriting the envvar value for that parameter.
|
||||||
|
"""
|
||||||
|
if isinstance(data, dict):
|
||||||
|
input_num_threads = data.get("num_threads")
|
||||||
|
# Check if to set the num_threads from the alternative envvar
|
||||||
|
if input_num_threads is None:
|
||||||
|
docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
|
||||||
|
omp_num_threads = os.getenv("OMP_NUM_THREADS")
|
||||||
|
if docling_num_threads is None and omp_num_threads is not None:
|
||||||
|
try:
|
||||||
|
data["num_threads"] = int(omp_num_threads)
|
||||||
|
except ValueError:
|
||||||
|
_log.error(
|
||||||
|
"Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
|
||||||
|
omp_num_threads,
|
||||||
|
)
|
||||||
|
return data
|
@ -13,11 +13,11 @@ from docling_core.types.doc import (
|
|||||||
TableCell,
|
TableCell,
|
||||||
)
|
)
|
||||||
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
from docling_core.types.doc.page import SegmentedPdfPage, TextCell
|
||||||
|
|
||||||
# DO NOT REMOVE; explicitly exposed from this location
|
|
||||||
from docling_core.types.io import (
|
from docling_core.types.io import (
|
||||||
DocumentStream,
|
DocumentStream,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# DO NOT REMOVE; explicitly exposed from this location
|
||||||
from PIL.Image import Image
|
from PIL.Image import Image
|
||||||
from pydantic import BaseModel, ConfigDict, Field, computed_field
|
from pydantic import BaseModel, ConfigDict, Field, computed_field
|
||||||
|
|
||||||
@ -67,7 +67,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
|||||||
InputFormat.MD: ["md"],
|
InputFormat.MD: ["md"],
|
||||||
InputFormat.HTML: ["html", "htm", "xhtml"],
|
InputFormat.HTML: ["html", "htm", "xhtml"],
|
||||||
InputFormat.XML_JATS: ["xml", "nxml"],
|
InputFormat.XML_JATS: ["xml", "nxml"],
|
||||||
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp", "webp"],
|
||||||
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
||||||
InputFormat.CSV: ["csv"],
|
InputFormat.CSV: ["csv"],
|
||||||
InputFormat.XLSX: ["xlsx"],
|
InputFormat.XLSX: ["xlsx"],
|
||||||
@ -131,12 +131,6 @@ class ErrorItem(BaseModel):
|
|||||||
error_message: str
|
error_message: str
|
||||||
|
|
||||||
|
|
||||||
# class Cell(BaseModel):
|
|
||||||
# id: int
|
|
||||||
# text: str
|
|
||||||
# bbox: BoundingBox
|
|
||||||
|
|
||||||
|
|
||||||
class Cluster(BaseModel):
|
class Cluster(BaseModel):
|
||||||
id: int
|
id: int
|
||||||
label: DocItemLabel
|
label: DocItemLabel
|
||||||
@ -158,8 +152,16 @@ class LayoutPrediction(BaseModel):
|
|||||||
clusters: List[Cluster] = []
|
clusters: List[Cluster] = []
|
||||||
|
|
||||||
|
|
||||||
|
class VlmPredictionToken(BaseModel):
|
||||||
|
text: str = ""
|
||||||
|
token: int = -1
|
||||||
|
logprob: float = -1
|
||||||
|
|
||||||
|
|
||||||
class VlmPrediction(BaseModel):
|
class VlmPrediction(BaseModel):
|
||||||
text: str = ""
|
text: str = ""
|
||||||
|
generated_tokens: list[VlmPredictionToken] = []
|
||||||
|
generation_time: float = -1
|
||||||
|
|
||||||
|
|
||||||
class ContainerElement(
|
class ContainerElement(
|
||||||
|
@ -412,7 +412,11 @@ class _DocumentConversionInput(BaseModel):
|
|||||||
else:
|
else:
|
||||||
return "application/xml"
|
return "application/xml"
|
||||||
|
|
||||||
if re.match(r"<!doctype\s+html|<html|<head|<body", content_str):
|
if re.match(
|
||||||
|
r"(<script.*?>.*?</script>\s*)?(<!doctype\s+html|<html|<head|<body)",
|
||||||
|
content_str,
|
||||||
|
re.DOTALL,
|
||||||
|
):
|
||||||
return "text/html"
|
return "text/html"
|
||||||
|
|
||||||
p = re.compile(
|
p = re.compile(
|
||||||
|
@ -1,6 +1,4 @@
|
|||||||
import logging
|
import logging
|
||||||
import os
|
|
||||||
import re
|
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
|
from typing import Any, ClassVar, Dict, List, Literal, Optional, Union
|
||||||
@ -10,73 +8,28 @@ from pydantic import (
|
|||||||
BaseModel,
|
BaseModel,
|
||||||
ConfigDict,
|
ConfigDict,
|
||||||
Field,
|
Field,
|
||||||
field_validator,
|
|
||||||
model_validator,
|
|
||||||
)
|
)
|
||||||
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
||||||
from typing_extensions import deprecated
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
|
# Import the following for backwards compatibility
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
||||||
|
from docling.datamodel.pipeline_options_vlm_model import (
|
||||||
|
ApiVlmOptions,
|
||||||
|
InferenceFramework,
|
||||||
|
InlineVlmOptions,
|
||||||
|
ResponseFormat,
|
||||||
|
)
|
||||||
|
from docling.datamodel.vlm_model_specs import (
|
||||||
|
GRANITE_VISION_OLLAMA as granite_vision_vlm_ollama_conversion_options,
|
||||||
|
GRANITE_VISION_TRANSFORMERS as granite_vision_vlm_conversion_options,
|
||||||
|
SMOLDOCLING_MLX as smoldocling_vlm_mlx_conversion_options,
|
||||||
|
SMOLDOCLING_TRANSFORMERS as smoldocling_vlm_conversion_options,
|
||||||
|
VlmModelType,
|
||||||
|
)
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class AcceleratorDevice(str, Enum):
|
|
||||||
"""Devices to run model inference"""
|
|
||||||
|
|
||||||
AUTO = "auto"
|
|
||||||
CPU = "cpu"
|
|
||||||
CUDA = "cuda"
|
|
||||||
MPS = "mps"
|
|
||||||
|
|
||||||
|
|
||||||
class AcceleratorOptions(BaseSettings):
|
|
||||||
model_config = SettingsConfigDict(
|
|
||||||
env_prefix="DOCLING_", env_nested_delimiter="_", populate_by_name=True
|
|
||||||
)
|
|
||||||
|
|
||||||
num_threads: int = 4
|
|
||||||
device: Union[str, AcceleratorDevice] = "auto"
|
|
||||||
cuda_use_flash_attention2: bool = False
|
|
||||||
|
|
||||||
@field_validator("device")
|
|
||||||
def validate_device(cls, value):
|
|
||||||
# "auto", "cpu", "cuda", "mps", or "cuda:N"
|
|
||||||
if value in {d.value for d in AcceleratorDevice} or re.match(
|
|
||||||
r"^cuda(:\d+)?$", value
|
|
||||||
):
|
|
||||||
return value
|
|
||||||
raise ValueError(
|
|
||||||
"Invalid device option. Use 'auto', 'cpu', 'mps', 'cuda', or 'cuda:N'."
|
|
||||||
)
|
|
||||||
|
|
||||||
@model_validator(mode="before")
|
|
||||||
@classmethod
|
|
||||||
def check_alternative_envvars(cls, data: Any) -> Any:
|
|
||||||
r"""
|
|
||||||
Set num_threads from the "alternative" envvar OMP_NUM_THREADS.
|
|
||||||
The alternative envvar is used only if it is valid and the regular envvar is not set.
|
|
||||||
|
|
||||||
Notice: The standard pydantic settings mechanism with parameter "aliases" does not provide
|
|
||||||
the same functionality. In case the alias envvar is set and the user tries to override the
|
|
||||||
parameter in settings initialization, Pydantic treats the parameter provided in __init__()
|
|
||||||
as an extra input instead of simply overwriting the evvar value for that parameter.
|
|
||||||
"""
|
|
||||||
if isinstance(data, dict):
|
|
||||||
input_num_threads = data.get("num_threads")
|
|
||||||
# Check if to set the num_threads from the alternative envvar
|
|
||||||
if input_num_threads is None:
|
|
||||||
docling_num_threads = os.getenv("DOCLING_NUM_THREADS")
|
|
||||||
omp_num_threads = os.getenv("OMP_NUM_THREADS")
|
|
||||||
if docling_num_threads is None and omp_num_threads is not None:
|
|
||||||
try:
|
|
||||||
data["num_threads"] = int(omp_num_threads)
|
|
||||||
except ValueError:
|
|
||||||
_log.error(
|
|
||||||
"Ignoring misformatted envvar OMP_NUM_THREADS '%s'",
|
|
||||||
omp_num_threads,
|
|
||||||
)
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
class BaseOptions(BaseModel):
|
class BaseOptions(BaseModel):
|
||||||
"""Base class for options."""
|
"""Base class for options."""
|
||||||
|
|
||||||
@ -121,24 +74,22 @@ class RapidOcrOptions(OcrOptions):
|
|||||||
lang: List[str] = [
|
lang: List[str] = [
|
||||||
"english",
|
"english",
|
||||||
"chinese",
|
"chinese",
|
||||||
] # However, language as a parameter is not supported by rapidocr yet and hence changing this options doesn't affect anything.
|
]
|
||||||
# For more details on supported languages by RapidOCR visit https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
|
# However, language as a parameter is not supported by rapidocr yet
|
||||||
|
# and hence changing this options doesn't affect anything.
|
||||||
|
|
||||||
|
# For more details on supported languages by RapidOCR visit
|
||||||
|
# https://rapidai.github.io/RapidOCRDocs/blog/2022/09/28/%E6%94%AF%E6%8C%81%E8%AF%86%E5%88%AB%E8%AF%AD%E8%A8%80/
|
||||||
|
|
||||||
|
# For more details on the following options visit
|
||||||
|
# https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
|
||||||
|
|
||||||
# For more details on the following options visit https://rapidai.github.io/RapidOCRDocs/install_usage/api/RapidOCR/
|
|
||||||
text_score: float = 0.5 # same default as rapidocr
|
text_score: float = 0.5 # same default as rapidocr
|
||||||
|
|
||||||
use_det: Optional[bool] = None # same default as rapidocr
|
use_det: Optional[bool] = None # same default as rapidocr
|
||||||
use_cls: Optional[bool] = None # same default as rapidocr
|
use_cls: Optional[bool] = None # same default as rapidocr
|
||||||
use_rec: Optional[bool] = None # same default as rapidocr
|
use_rec: Optional[bool] = None # same default as rapidocr
|
||||||
|
|
||||||
# class Device(Enum):
|
|
||||||
# CPU = "CPU"
|
|
||||||
# CUDA = "CUDA"
|
|
||||||
# DIRECTML = "DIRECTML"
|
|
||||||
# AUTO = "AUTO"
|
|
||||||
|
|
||||||
# device: Device = Device.AUTO # Default value is AUTO
|
|
||||||
|
|
||||||
print_verbose: bool = False # same default as rapidocr
|
print_verbose: bool = False # same default as rapidocr
|
||||||
|
|
||||||
det_model_path: Optional[str] = None # same default as rapidocr
|
det_model_path: Optional[str] = None # same default as rapidocr
|
||||||
@ -244,101 +195,18 @@ class PictureDescriptionVlmOptions(PictureDescriptionBaseOptions):
|
|||||||
return self.repo_id.replace("/", "--")
|
return self.repo_id.replace("/", "--")
|
||||||
|
|
||||||
|
|
||||||
|
# SmolVLM
|
||||||
smolvlm_picture_description = PictureDescriptionVlmOptions(
|
smolvlm_picture_description = PictureDescriptionVlmOptions(
|
||||||
repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
|
repo_id="HuggingFaceTB/SmolVLM-256M-Instruct"
|
||||||
)
|
)
|
||||||
# phi_picture_description = PictureDescriptionVlmOptions(repo_id="microsoft/Phi-3-vision-128k-instruct")
|
|
||||||
|
# GraniteVision
|
||||||
granite_picture_description = PictureDescriptionVlmOptions(
|
granite_picture_description = PictureDescriptionVlmOptions(
|
||||||
repo_id="ibm-granite/granite-vision-3.1-2b-preview",
|
repo_id="ibm-granite/granite-vision-3.1-2b-preview",
|
||||||
prompt="What is shown in this image?",
|
prompt="What is shown in this image?",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class BaseVlmOptions(BaseModel):
|
|
||||||
kind: str
|
|
||||||
prompt: str
|
|
||||||
|
|
||||||
|
|
||||||
class ResponseFormat(str, Enum):
|
|
||||||
DOCTAGS = "doctags"
|
|
||||||
MARKDOWN = "markdown"
|
|
||||||
|
|
||||||
|
|
||||||
class InferenceFramework(str, Enum):
|
|
||||||
MLX = "mlx"
|
|
||||||
TRANSFORMERS = "transformers"
|
|
||||||
OPENAI = "openai"
|
|
||||||
|
|
||||||
|
|
||||||
class HuggingFaceVlmOptions(BaseVlmOptions):
|
|
||||||
kind: Literal["hf_model_options"] = "hf_model_options"
|
|
||||||
|
|
||||||
repo_id: str
|
|
||||||
load_in_8bit: bool = True
|
|
||||||
llm_int8_threshold: float = 6.0
|
|
||||||
quantized: bool = False
|
|
||||||
|
|
||||||
inference_framework: InferenceFramework
|
|
||||||
response_format: ResponseFormat
|
|
||||||
|
|
||||||
@property
|
|
||||||
def repo_cache_folder(self) -> str:
|
|
||||||
return self.repo_id.replace("/", "--")
|
|
||||||
|
|
||||||
|
|
||||||
class ApiVlmOptions(BaseVlmOptions):
|
|
||||||
kind: Literal["api_model_options"] = "api_model_options"
|
|
||||||
|
|
||||||
url: AnyUrl = AnyUrl(
|
|
||||||
"http://localhost:11434/v1/chat/completions"
|
|
||||||
) # Default to ollama
|
|
||||||
headers: Dict[str, str] = {}
|
|
||||||
params: Dict[str, Any] = {}
|
|
||||||
scale: float = 2.0
|
|
||||||
timeout: float = 60
|
|
||||||
concurrency: int = 1
|
|
||||||
response_format: ResponseFormat
|
|
||||||
|
|
||||||
|
|
||||||
smoldocling_vlm_mlx_conversion_options = HuggingFaceVlmOptions(
|
|
||||||
repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
|
|
||||||
prompt="Convert this page to docling.",
|
|
||||||
response_format=ResponseFormat.DOCTAGS,
|
|
||||||
inference_framework=InferenceFramework.MLX,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
smoldocling_vlm_conversion_options = HuggingFaceVlmOptions(
|
|
||||||
repo_id="ds4sd/SmolDocling-256M-preview",
|
|
||||||
prompt="Convert this page to docling.",
|
|
||||||
response_format=ResponseFormat.DOCTAGS,
|
|
||||||
inference_framework=InferenceFramework.TRANSFORMERS,
|
|
||||||
)
|
|
||||||
|
|
||||||
granite_vision_vlm_conversion_options = HuggingFaceVlmOptions(
|
|
||||||
repo_id="ibm-granite/granite-vision-3.1-2b-preview",
|
|
||||||
# prompt="OCR the full page to markdown.",
|
|
||||||
prompt="OCR this image.",
|
|
||||||
response_format=ResponseFormat.MARKDOWN,
|
|
||||||
inference_framework=InferenceFramework.TRANSFORMERS,
|
|
||||||
)
|
|
||||||
|
|
||||||
granite_vision_vlm_ollama_conversion_options = ApiVlmOptions(
|
|
||||||
url=AnyUrl("http://localhost:11434/v1/chat/completions"),
|
|
||||||
params={"model": "granite3.2-vision:2b"},
|
|
||||||
prompt="OCR the full page to markdown.",
|
|
||||||
scale=1.0,
|
|
||||||
timeout=120,
|
|
||||||
response_format=ResponseFormat.MARKDOWN,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class VlmModelType(str, Enum):
|
|
||||||
SMOLDOCLING = "smoldocling"
|
|
||||||
GRANITE_VISION = "granite_vision"
|
|
||||||
GRANITE_VISION_OLLAMA = "granite_vision_ollama"
|
|
||||||
|
|
||||||
|
|
||||||
# Define an enum for the backend options
|
# Define an enum for the backend options
|
||||||
class PdfBackend(str, Enum):
|
class PdfBackend(str, Enum):
|
||||||
"""Enum of valid PDF backends."""
|
"""Enum of valid PDF backends."""
|
||||||
@ -387,7 +255,7 @@ class VlmPipelineOptions(PaginatedPipelineOptions):
|
|||||||
False # (To be used with vlms, or other generative models)
|
False # (To be used with vlms, or other generative models)
|
||||||
)
|
)
|
||||||
# If True, text from backend will be used instead of generated text
|
# If True, text from backend will be used instead of generated text
|
||||||
vlm_options: Union[HuggingFaceVlmOptions, ApiVlmOptions] = (
|
vlm_options: Union[InlineVlmOptions, ApiVlmOptions] = (
|
||||||
smoldocling_vlm_conversion_options
|
smoldocling_vlm_conversion_options
|
||||||
)
|
)
|
||||||
|
|
||||||
|
81
docling/datamodel/pipeline_options_vlm_model.py
Normal file
81
docling/datamodel/pipeline_options_vlm_model.py
Normal file
@ -0,0 +1,81 @@
|
|||||||
|
from enum import Enum
|
||||||
|
from typing import Any, Dict, List, Literal
|
||||||
|
|
||||||
|
from pydantic import AnyUrl, BaseModel
|
||||||
|
from typing_extensions import deprecated
|
||||||
|
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorDevice
|
||||||
|
|
||||||
|
|
||||||
|
class BaseVlmOptions(BaseModel):
|
||||||
|
kind: str
|
||||||
|
prompt: str
|
||||||
|
|
||||||
|
|
||||||
|
class ResponseFormat(str, Enum):
|
||||||
|
DOCTAGS = "doctags"
|
||||||
|
MARKDOWN = "markdown"
|
||||||
|
HTML = "html"
|
||||||
|
|
||||||
|
|
||||||
|
class InferenceFramework(str, Enum):
|
||||||
|
MLX = "mlx"
|
||||||
|
TRANSFORMERS = "transformers"
|
||||||
|
|
||||||
|
|
||||||
|
class TransformersModelType(str, Enum):
|
||||||
|
AUTOMODEL = "automodel"
|
||||||
|
AUTOMODEL_VISION2SEQ = "automodel-vision2seq"
|
||||||
|
AUTOMODEL_CAUSALLM = "automodel-causallm"
|
||||||
|
|
||||||
|
|
||||||
|
class InlineVlmOptions(BaseVlmOptions):
|
||||||
|
kind: Literal["inline_model_options"] = "inline_model_options"
|
||||||
|
|
||||||
|
repo_id: str
|
||||||
|
trust_remote_code: bool = False
|
||||||
|
load_in_8bit: bool = True
|
||||||
|
llm_int8_threshold: float = 6.0
|
||||||
|
quantized: bool = False
|
||||||
|
|
||||||
|
inference_framework: InferenceFramework
|
||||||
|
transformers_model_type: TransformersModelType = TransformersModelType.AUTOMODEL
|
||||||
|
response_format: ResponseFormat
|
||||||
|
|
||||||
|
supported_devices: List[AcceleratorDevice] = [
|
||||||
|
AcceleratorDevice.CPU,
|
||||||
|
AcceleratorDevice.CUDA,
|
||||||
|
AcceleratorDevice.MPS,
|
||||||
|
]
|
||||||
|
|
||||||
|
scale: float = 2.0
|
||||||
|
|
||||||
|
temperature: float = 0.0
|
||||||
|
stop_strings: List[str] = []
|
||||||
|
extra_generation_config: Dict[str, Any] = {}
|
||||||
|
|
||||||
|
use_kv_cache: bool = True
|
||||||
|
max_new_tokens: int = 4096
|
||||||
|
|
||||||
|
@property
|
||||||
|
def repo_cache_folder(self) -> str:
|
||||||
|
return self.repo_id.replace("/", "--")
|
||||||
|
|
||||||
|
|
||||||
|
@deprecated("Use InlineVlmOptions instead.")
|
||||||
|
class HuggingFaceVlmOptions(InlineVlmOptions):
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class ApiVlmOptions(BaseVlmOptions):
|
||||||
|
kind: Literal["api_model_options"] = "api_model_options"
|
||||||
|
|
||||||
|
url: AnyUrl = AnyUrl(
|
||||||
|
"http://localhost:11434/v1/chat/completions"
|
||||||
|
) # Default to ollama
|
||||||
|
headers: Dict[str, str] = {}
|
||||||
|
params: Dict[str, Any] = {}
|
||||||
|
scale: float = 2.0
|
||||||
|
timeout: float = 60
|
||||||
|
concurrency: int = 1
|
||||||
|
response_format: ResponseFormat
|
144
docling/datamodel/vlm_model_specs.py
Normal file
144
docling/datamodel/vlm_model_specs.py
Normal file
@ -0,0 +1,144 @@
|
|||||||
|
import logging
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
from pydantic import (
|
||||||
|
AnyUrl,
|
||||||
|
)
|
||||||
|
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorDevice
|
||||||
|
from docling.datamodel.pipeline_options_vlm_model import (
|
||||||
|
ApiVlmOptions,
|
||||||
|
InferenceFramework,
|
||||||
|
InlineVlmOptions,
|
||||||
|
ResponseFormat,
|
||||||
|
TransformersModelType,
|
||||||
|
)
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# SmolDocling
|
||||||
|
SMOLDOCLING_MLX = InlineVlmOptions(
|
||||||
|
repo_id="ds4sd/SmolDocling-256M-preview-mlx-bf16",
|
||||||
|
prompt="Convert this page to docling.",
|
||||||
|
response_format=ResponseFormat.DOCTAGS,
|
||||||
|
inference_framework=InferenceFramework.MLX,
|
||||||
|
supported_devices=[AcceleratorDevice.MPS],
|
||||||
|
scale=2.0,
|
||||||
|
temperature=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
SMOLDOCLING_TRANSFORMERS = InlineVlmOptions(
|
||||||
|
repo_id="ds4sd/SmolDocling-256M-preview",
|
||||||
|
prompt="Convert this page to docling.",
|
||||||
|
response_format=ResponseFormat.DOCTAGS,
|
||||||
|
inference_framework=InferenceFramework.TRANSFORMERS,
|
||||||
|
transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
|
||||||
|
supported_devices=[
|
||||||
|
AcceleratorDevice.CPU,
|
||||||
|
AcceleratorDevice.CUDA,
|
||||||
|
AcceleratorDevice.MPS,
|
||||||
|
],
|
||||||
|
scale=2.0,
|
||||||
|
temperature=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
# GraniteVision
|
||||||
|
GRANITE_VISION_TRANSFORMERS = InlineVlmOptions(
|
||||||
|
repo_id="ibm-granite/granite-vision-3.2-2b",
|
||||||
|
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
|
||||||
|
response_format=ResponseFormat.MARKDOWN,
|
||||||
|
inference_framework=InferenceFramework.TRANSFORMERS,
|
||||||
|
transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
|
||||||
|
supported_devices=[
|
||||||
|
AcceleratorDevice.CPU,
|
||||||
|
AcceleratorDevice.CUDA,
|
||||||
|
AcceleratorDevice.MPS,
|
||||||
|
],
|
||||||
|
scale=2.0,
|
||||||
|
temperature=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
GRANITE_VISION_OLLAMA = ApiVlmOptions(
|
||||||
|
url=AnyUrl("http://localhost:11434/v1/chat/completions"),
|
||||||
|
params={"model": "granite3.2-vision:2b"},
|
||||||
|
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
|
||||||
|
scale=1.0,
|
||||||
|
timeout=120,
|
||||||
|
response_format=ResponseFormat.MARKDOWN,
|
||||||
|
temperature=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Pixtral
|
||||||
|
PIXTRAL_12B_TRANSFORMERS = InlineVlmOptions(
|
||||||
|
repo_id="mistral-community/pixtral-12b",
|
||||||
|
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
|
||||||
|
response_format=ResponseFormat.MARKDOWN,
|
||||||
|
inference_framework=InferenceFramework.TRANSFORMERS,
|
||||||
|
transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
|
||||||
|
supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
|
||||||
|
scale=2.0,
|
||||||
|
temperature=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
PIXTRAL_12B_MLX = InlineVlmOptions(
|
||||||
|
repo_id="mlx-community/pixtral-12b-bf16",
|
||||||
|
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
|
||||||
|
response_format=ResponseFormat.MARKDOWN,
|
||||||
|
inference_framework=InferenceFramework.MLX,
|
||||||
|
supported_devices=[AcceleratorDevice.MPS],
|
||||||
|
scale=2.0,
|
||||||
|
temperature=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Phi4
|
||||||
|
PHI4_TRANSFORMERS = InlineVlmOptions(
|
||||||
|
repo_id="microsoft/Phi-4-multimodal-instruct",
|
||||||
|
prompt="Convert this page to MarkDown. Do not miss any text and only output the bare markdown",
|
||||||
|
trust_remote_code=True,
|
||||||
|
response_format=ResponseFormat.MARKDOWN,
|
||||||
|
inference_framework=InferenceFramework.TRANSFORMERS,
|
||||||
|
transformers_model_type=TransformersModelType.AUTOMODEL_CAUSALLM,
|
||||||
|
supported_devices=[AcceleratorDevice.CPU, AcceleratorDevice.CUDA],
|
||||||
|
scale=2.0,
|
||||||
|
temperature=0.0,
|
||||||
|
extra_generation_config=dict(num_logits_to_keep=0),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Qwen
|
||||||
|
QWEN25_VL_3B_MLX = InlineVlmOptions(
|
||||||
|
repo_id="mlx-community/Qwen2.5-VL-3B-Instruct-bf16",
|
||||||
|
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
|
||||||
|
response_format=ResponseFormat.MARKDOWN,
|
||||||
|
inference_framework=InferenceFramework.MLX,
|
||||||
|
supported_devices=[AcceleratorDevice.MPS],
|
||||||
|
scale=2.0,
|
||||||
|
temperature=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Gemma-3
|
||||||
|
GEMMA3_12B_MLX = InlineVlmOptions(
|
||||||
|
repo_id="mlx-community/gemma-3-12b-it-bf16",
|
||||||
|
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
|
||||||
|
response_format=ResponseFormat.MARKDOWN,
|
||||||
|
inference_framework=InferenceFramework.MLX,
|
||||||
|
supported_devices=[AcceleratorDevice.MPS],
|
||||||
|
scale=2.0,
|
||||||
|
temperature=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
GEMMA3_27B_MLX = InlineVlmOptions(
|
||||||
|
repo_id="mlx-community/gemma-3-27b-it-bf16",
|
||||||
|
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
|
||||||
|
response_format=ResponseFormat.MARKDOWN,
|
||||||
|
inference_framework=InferenceFramework.MLX,
|
||||||
|
supported_devices=[AcceleratorDevice.MPS],
|
||||||
|
scale=2.0,
|
||||||
|
temperature=0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class VlmModelType(str, Enum):
|
||||||
|
SMOLDOCLING = "smoldocling"
|
||||||
|
GRANITE_VISION = "granite_vision"
|
||||||
|
GRANITE_VISION_OLLAMA = "granite_vision_ollama"
|
@ -186,6 +186,11 @@ class DocumentConverter:
|
|||||||
Tuple[Type[BasePipeline], str], BasePipeline
|
Tuple[Type[BasePipeline], str], BasePipeline
|
||||||
] = {}
|
] = {}
|
||||||
|
|
||||||
|
def _get_initialized_pipelines(
|
||||||
|
self,
|
||||||
|
) -> dict[tuple[Type[BasePipeline], str], BasePipeline]:
|
||||||
|
return self.initialized_pipelines
|
||||||
|
|
||||||
def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
|
def _get_pipeline_options_hash(self, pipeline_options: PipelineOptions) -> str:
|
||||||
"""Generate a hash of pipeline options to use as part of the cache key."""
|
"""Generate a hash of pipeline options to use as part of the cache key."""
|
||||||
options_str = str(pipeline_options.model_dump())
|
options_str = str(pipeline_options.model_dump())
|
||||||
|
@ -3,7 +3,7 @@ from concurrent.futures import ThreadPoolExecutor
|
|||||||
|
|
||||||
from docling.datamodel.base_models import Page, VlmPrediction
|
from docling.datamodel.base_models import Page, VlmPrediction
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import ApiVlmOptions
|
from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions
|
||||||
from docling.exceptions import OperationNotAllowed
|
from docling.exceptions import OperationNotAllowed
|
||||||
from docling.models.base_model import BasePageModel
|
from docling.models.base_model import BasePageModel
|
||||||
from docling.utils.api_image_request import api_image_request
|
from docling.utils.api_image_request import api_image_request
|
||||||
|
@ -11,9 +11,10 @@ from PIL import Image, ImageDraw
|
|||||||
from rtree import index
|
from rtree import index
|
||||||
from scipy.ndimage import binary_dilation, find_objects, label
|
from scipy.ndimage import binary_dilation, find_objects, label
|
||||||
|
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorOptions
|
||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import AcceleratorOptions, OcrOptions
|
from docling.datamodel.pipeline_options import OcrOptions
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_model import BaseModelWithOptions, BasePageModel
|
from docling.models.base_model import BaseModelWithOptions, BasePageModel
|
||||||
|
|
||||||
|
@ -16,9 +16,10 @@ from docling_core.types.doc.labels import CodeLanguageLabel
|
|||||||
from PIL import Image, ImageOps
|
from PIL import Image, ImageOps
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorOptions
|
||||||
from docling.datamodel.base_models import ItemAndImageEnrichmentElement
|
from docling.datamodel.base_models import ItemAndImageEnrichmentElement
|
||||||
from docling.datamodel.pipeline_options import AcceleratorOptions
|
|
||||||
from docling.models.base_model import BaseItemAndImageEnrichmentModel
|
from docling.models.base_model import BaseItemAndImageEnrichmentModel
|
||||||
|
from docling.models.utils.hf_model_download import download_hf_model
|
||||||
from docling.utils.accelerator_utils import decide_device
|
from docling.utils.accelerator_utils import decide_device
|
||||||
|
|
||||||
|
|
||||||
@ -117,20 +118,14 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
|||||||
force: bool = False,
|
force: bool = False,
|
||||||
progress: bool = False,
|
progress: bool = False,
|
||||||
) -> Path:
|
) -> Path:
|
||||||
from huggingface_hub import snapshot_download
|
return download_hf_model(
|
||||||
from huggingface_hub.utils import disable_progress_bars
|
|
||||||
|
|
||||||
if not progress:
|
|
||||||
disable_progress_bars()
|
|
||||||
download_path = snapshot_download(
|
|
||||||
repo_id="ds4sd/CodeFormula",
|
repo_id="ds4sd/CodeFormula",
|
||||||
force_download=force,
|
|
||||||
local_dir=local_dir,
|
|
||||||
revision="v1.0.2",
|
revision="v1.0.2",
|
||||||
|
local_dir=local_dir,
|
||||||
|
force=force,
|
||||||
|
progress=progress,
|
||||||
)
|
)
|
||||||
|
|
||||||
return Path(download_path)
|
|
||||||
|
|
||||||
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
|
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
|
||||||
"""
|
"""
|
||||||
Determines if a given element in a document can be processed by the model.
|
Determines if a given element in a document can be processed by the model.
|
||||||
|
@ -13,8 +13,9 @@ from docling_core.types.doc import (
|
|||||||
from PIL import Image
|
from PIL import Image
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
from docling.datamodel.pipeline_options import AcceleratorOptions
|
from docling.datamodel.accelerator_options import AcceleratorOptions
|
||||||
from docling.models.base_model import BaseEnrichmentModel
|
from docling.models.base_model import BaseEnrichmentModel
|
||||||
|
from docling.models.utils.hf_model_download import download_hf_model
|
||||||
from docling.utils.accelerator_utils import decide_device
|
from docling.utils.accelerator_utils import decide_device
|
||||||
|
|
||||||
|
|
||||||
@ -105,20 +106,14 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
|
|||||||
def download_models(
|
def download_models(
|
||||||
local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
|
local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
|
||||||
) -> Path:
|
) -> Path:
|
||||||
from huggingface_hub import snapshot_download
|
return download_hf_model(
|
||||||
from huggingface_hub.utils import disable_progress_bars
|
|
||||||
|
|
||||||
if not progress:
|
|
||||||
disable_progress_bars()
|
|
||||||
download_path = snapshot_download(
|
|
||||||
repo_id="ds4sd/DocumentFigureClassifier",
|
repo_id="ds4sd/DocumentFigureClassifier",
|
||||||
force_download=force,
|
|
||||||
local_dir=local_dir,
|
|
||||||
revision="v1.0.1",
|
revision="v1.0.1",
|
||||||
|
local_dir=local_dir,
|
||||||
|
force=force,
|
||||||
|
progress=progress,
|
||||||
)
|
)
|
||||||
|
|
||||||
return Path(download_path)
|
|
||||||
|
|
||||||
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
|
def is_processable(self, doc: DoclingDocument, element: NodeItem) -> bool:
|
||||||
"""
|
"""
|
||||||
Determines if the given element can be processed by the classifier.
|
Determines if the given element can be processed by the classifier.
|
||||||
|
@ -9,11 +9,10 @@ import numpy
|
|||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||||
|
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
|
||||||
AcceleratorOptions,
|
|
||||||
EasyOcrOptions,
|
EasyOcrOptions,
|
||||||
OcrOptions,
|
OcrOptions,
|
||||||
)
|
)
|
||||||
|
@ -1,182 +0,0 @@
|
|||||||
import logging
|
|
||||||
import time
|
|
||||||
from collections.abc import Iterable
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page, VlmPrediction
|
|
||||||
from docling.datamodel.document import ConversionResult
|
|
||||||
from docling.datamodel.pipeline_options import (
|
|
||||||
AcceleratorOptions,
|
|
||||||
HuggingFaceVlmOptions,
|
|
||||||
)
|
|
||||||
from docling.models.base_model import BasePageModel
|
|
||||||
from docling.utils.accelerator_utils import decide_device
|
|
||||||
from docling.utils.profiling import TimeRecorder
|
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class HuggingFaceVlmModel(BasePageModel):
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
enabled: bool,
|
|
||||||
artifacts_path: Optional[Path],
|
|
||||||
accelerator_options: AcceleratorOptions,
|
|
||||||
vlm_options: HuggingFaceVlmOptions,
|
|
||||||
):
|
|
||||||
self.enabled = enabled
|
|
||||||
|
|
||||||
self.vlm_options = vlm_options
|
|
||||||
|
|
||||||
if self.enabled:
|
|
||||||
import torch
|
|
||||||
from transformers import ( # type: ignore
|
|
||||||
AutoModelForVision2Seq,
|
|
||||||
AutoProcessor,
|
|
||||||
BitsAndBytesConfig,
|
|
||||||
)
|
|
||||||
|
|
||||||
device = decide_device(accelerator_options.device)
|
|
||||||
self.device = device
|
|
||||||
|
|
||||||
_log.debug(f"Available device for HuggingFace VLM: {device}")
|
|
||||||
|
|
||||||
repo_cache_folder = vlm_options.repo_id.replace("/", "--")
|
|
||||||
|
|
||||||
# PARAMETERS:
|
|
||||||
if artifacts_path is None:
|
|
||||||
artifacts_path = self.download_models(self.vlm_options.repo_id)
|
|
||||||
elif (artifacts_path / repo_cache_folder).exists():
|
|
||||||
artifacts_path = artifacts_path / repo_cache_folder
|
|
||||||
|
|
||||||
self.param_question = vlm_options.prompt # "Perform Layout Analysis."
|
|
||||||
self.param_quantization_config = BitsAndBytesConfig(
|
|
||||||
load_in_8bit=vlm_options.load_in_8bit, # True,
|
|
||||||
llm_int8_threshold=vlm_options.llm_int8_threshold, # 6.0
|
|
||||||
)
|
|
||||||
self.param_quantized = vlm_options.quantized # False
|
|
||||||
|
|
||||||
self.processor = AutoProcessor.from_pretrained(artifacts_path)
|
|
||||||
if not self.param_quantized:
|
|
||||||
self.vlm_model = AutoModelForVision2Seq.from_pretrained(
|
|
||||||
artifacts_path,
|
|
||||||
device_map=device,
|
|
||||||
torch_dtype=torch.bfloat16,
|
|
||||||
_attn_implementation=(
|
|
||||||
"flash_attention_2"
|
|
||||||
if self.device.startswith("cuda")
|
|
||||||
and accelerator_options.cuda_use_flash_attention2
|
|
||||||
else "eager"
|
|
||||||
),
|
|
||||||
) # .to(self.device)
|
|
||||||
|
|
||||||
else:
|
|
||||||
self.vlm_model = AutoModelForVision2Seq.from_pretrained(
|
|
||||||
artifacts_path,
|
|
||||||
device_map=device,
|
|
||||||
torch_dtype="auto",
|
|
||||||
quantization_config=self.param_quantization_config,
|
|
||||||
_attn_implementation=(
|
|
||||||
"flash_attention_2"
|
|
||||||
if self.device.startswith("cuda")
|
|
||||||
and accelerator_options.cuda_use_flash_attention2
|
|
||||||
else "eager"
|
|
||||||
),
|
|
||||||
) # .to(self.device)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def download_models(
|
|
||||||
repo_id: str,
|
|
||||||
local_dir: Optional[Path] = None,
|
|
||||||
force: bool = False,
|
|
||||||
progress: bool = False,
|
|
||||||
) -> Path:
|
|
||||||
from huggingface_hub import snapshot_download
|
|
||||||
from huggingface_hub.utils import disable_progress_bars
|
|
||||||
|
|
||||||
if not progress:
|
|
||||||
disable_progress_bars()
|
|
||||||
download_path = snapshot_download(
|
|
||||||
repo_id=repo_id,
|
|
||||||
force_download=force,
|
|
||||||
local_dir=local_dir,
|
|
||||||
# revision="v0.0.1",
|
|
||||||
)
|
|
||||||
|
|
||||||
return Path(download_path)
|
|
||||||
|
|
||||||
def __call__(
|
|
||||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
|
||||||
) -> Iterable[Page]:
|
|
||||||
for page in page_batch:
|
|
||||||
assert page._backend is not None
|
|
||||||
if not page._backend.is_valid():
|
|
||||||
yield page
|
|
||||||
else:
|
|
||||||
with TimeRecorder(conv_res, "vlm"):
|
|
||||||
assert page.size is not None
|
|
||||||
|
|
||||||
hi_res_image = page.get_image(scale=2.0) # 144dpi
|
|
||||||
# hi_res_image = page.get_image(scale=1.0) # 72dpi
|
|
||||||
|
|
||||||
if hi_res_image is not None:
|
|
||||||
im_width, im_height = hi_res_image.size
|
|
||||||
|
|
||||||
# populate page_tags with predicted doc tags
|
|
||||||
page_tags = ""
|
|
||||||
|
|
||||||
if hi_res_image:
|
|
||||||
if hi_res_image.mode != "RGB":
|
|
||||||
hi_res_image = hi_res_image.convert("RGB")
|
|
||||||
|
|
||||||
messages = [
|
|
||||||
{
|
|
||||||
"role": "user",
|
|
||||||
"content": [
|
|
||||||
{
|
|
||||||
"type": "text",
|
|
||||||
"text": "This is a page from a document.",
|
|
||||||
},
|
|
||||||
{"type": "image"},
|
|
||||||
{"type": "text", "text": self.param_question},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
]
|
|
||||||
prompt = self.processor.apply_chat_template(
|
|
||||||
messages, add_generation_prompt=False
|
|
||||||
)
|
|
||||||
inputs = self.processor(
|
|
||||||
text=prompt, images=[hi_res_image], return_tensors="pt"
|
|
||||||
)
|
|
||||||
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
|
||||||
|
|
||||||
start_time = time.time()
|
|
||||||
# Call model to generate:
|
|
||||||
generated_ids = self.vlm_model.generate(
|
|
||||||
**inputs, max_new_tokens=4096, use_cache=True
|
|
||||||
)
|
|
||||||
|
|
||||||
generation_time = time.time() - start_time
|
|
||||||
generated_texts = self.processor.batch_decode(
|
|
||||||
generated_ids[:, inputs["input_ids"].shape[1] :],
|
|
||||||
skip_special_tokens=False,
|
|
||||||
)[0]
|
|
||||||
|
|
||||||
num_tokens = len(generated_ids[0])
|
|
||||||
page_tags = generated_texts
|
|
||||||
|
|
||||||
_log.debug(
|
|
||||||
f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
|
|
||||||
)
|
|
||||||
|
|
||||||
# inference_time = time.time() - start_time
|
|
||||||
# tokens_per_second = num_tokens / generation_time
|
|
||||||
# print("")
|
|
||||||
# print(f"Page Inference Time: {inference_time:.2f} seconds")
|
|
||||||
# print(f"Total tokens on page: {num_tokens:.2f}")
|
|
||||||
# print(f"Tokens/sec: {tokens_per_second:.2f}")
|
|
||||||
# print("")
|
|
||||||
page.predictions.vlm_response = VlmPrediction(text=page_tags)
|
|
||||||
|
|
||||||
yield page
|
|
@ -10,11 +10,12 @@ from docling_core.types.doc import DocItemLabel
|
|||||||
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorOptions
|
||||||
from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
|
from docling.datamodel.base_models import BoundingBox, Cluster, LayoutPrediction, Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import AcceleratorOptions
|
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_model import BasePageModel
|
from docling.models.base_model import BasePageModel
|
||||||
|
from docling.models.utils.hf_model_download import download_hf_model
|
||||||
from docling.utils.accelerator_utils import decide_device
|
from docling.utils.accelerator_utils import decide_device
|
||||||
from docling.utils.layout_postprocessor import LayoutPostprocessor
|
from docling.utils.layout_postprocessor import LayoutPostprocessor
|
||||||
from docling.utils.profiling import TimeRecorder
|
from docling.utils.profiling import TimeRecorder
|
||||||
@ -83,20 +84,14 @@ class LayoutModel(BasePageModel):
|
|||||||
force: bool = False,
|
force: bool = False,
|
||||||
progress: bool = False,
|
progress: bool = False,
|
||||||
) -> Path:
|
) -> Path:
|
||||||
from huggingface_hub import snapshot_download
|
return download_hf_model(
|
||||||
from huggingface_hub.utils import disable_progress_bars
|
|
||||||
|
|
||||||
if not progress:
|
|
||||||
disable_progress_bars()
|
|
||||||
download_path = snapshot_download(
|
|
||||||
repo_id="ds4sd/docling-models",
|
repo_id="ds4sd/docling-models",
|
||||||
force_download=force,
|
revision="v2.2.0",
|
||||||
local_dir=local_dir,
|
local_dir=local_dir,
|
||||||
revision="v2.1.0",
|
force=force,
|
||||||
|
progress=progress,
|
||||||
)
|
)
|
||||||
|
|
||||||
return Path(download_path)
|
|
||||||
|
|
||||||
def draw_clusters_and_cells_side_by_side(
|
def draw_clusters_and_cells_side_by_side(
|
||||||
self, conv_res, page, clusters, mode_prefix: str, show: bool = False
|
self, conv_res, page, clusters, mode_prefix: str, show: bool = False
|
||||||
):
|
):
|
||||||
|
@ -8,10 +8,10 @@ from typing import Optional, Type
|
|||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||||
|
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorOptions
|
||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorOptions,
|
|
||||||
OcrMacOptions,
|
OcrMacOptions,
|
||||||
OcrOptions,
|
OcrOptions,
|
||||||
)
|
)
|
||||||
|
@ -5,8 +5,8 @@ from typing import Optional, Type, Union
|
|||||||
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorOptions
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorOptions,
|
|
||||||
PictureDescriptionApiOptions,
|
PictureDescriptionApiOptions,
|
||||||
PictureDescriptionBaseOptions,
|
PictureDescriptionBaseOptions,
|
||||||
)
|
)
|
||||||
|
@ -13,8 +13,8 @@ from docling_core.types.doc.document import ( # TODO: move import to docling_co
|
|||||||
)
|
)
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorOptions
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorOptions,
|
|
||||||
PictureDescriptionBaseOptions,
|
PictureDescriptionBaseOptions,
|
||||||
)
|
)
|
||||||
from docling.models.base_model import (
|
from docling.models.base_model import (
|
||||||
|
@ -4,16 +4,21 @@ from typing import Optional, Type, Union
|
|||||||
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorOptions
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorOptions,
|
|
||||||
PictureDescriptionBaseOptions,
|
PictureDescriptionBaseOptions,
|
||||||
PictureDescriptionVlmOptions,
|
PictureDescriptionVlmOptions,
|
||||||
)
|
)
|
||||||
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
from docling.models.picture_description_base_model import PictureDescriptionBaseModel
|
||||||
|
from docling.models.utils.hf_model_download import (
|
||||||
|
HuggingFaceModelDownloadMixin,
|
||||||
|
)
|
||||||
from docling.utils.accelerator_utils import decide_device
|
from docling.utils.accelerator_utils import decide_device
|
||||||
|
|
||||||
|
|
||||||
class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
|
class PictureDescriptionVlmModel(
|
||||||
|
PictureDescriptionBaseModel, HuggingFaceModelDownloadMixin
|
||||||
|
):
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
|
def get_options_type(cls) -> Type[PictureDescriptionBaseOptions]:
|
||||||
return PictureDescriptionVlmOptions
|
return PictureDescriptionVlmOptions
|
||||||
@ -66,26 +71,6 @@ class PictureDescriptionVlmModel(PictureDescriptionBaseModel):
|
|||||||
|
|
||||||
self.provenance = f"{self.options.repo_id}"
|
self.provenance = f"{self.options.repo_id}"
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def download_models(
|
|
||||||
repo_id: str,
|
|
||||||
local_dir: Optional[Path] = None,
|
|
||||||
force: bool = False,
|
|
||||||
progress: bool = False,
|
|
||||||
) -> Path:
|
|
||||||
from huggingface_hub import snapshot_download
|
|
||||||
from huggingface_hub.utils import disable_progress_bars
|
|
||||||
|
|
||||||
if not progress:
|
|
||||||
disable_progress_bars()
|
|
||||||
download_path = snapshot_download(
|
|
||||||
repo_id=repo_id,
|
|
||||||
force_download=force,
|
|
||||||
local_dir=local_dir,
|
|
||||||
)
|
|
||||||
|
|
||||||
return Path(download_path)
|
|
||||||
|
|
||||||
def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
|
def _annotate_images(self, images: Iterable[Image.Image]) -> Iterable[str]:
|
||||||
from transformers import GenerationConfig
|
from transformers import GenerationConfig
|
||||||
|
|
||||||
|
@ -7,11 +7,10 @@ import numpy
|
|||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
from docling_core.types.doc.page import BoundingRectangle, TextCell
|
||||||
|
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
|
||||||
AcceleratorOptions,
|
|
||||||
OcrOptions,
|
OcrOptions,
|
||||||
RapidOcrOptions,
|
RapidOcrOptions,
|
||||||
)
|
)
|
||||||
|
@ -13,16 +13,16 @@ from docling_core.types.doc.page import (
|
|||||||
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
from docling_ibm_models.tableformer.data_management.tf_predictor import TFPredictor
|
||||||
from PIL import ImageDraw
|
from PIL import ImageDraw
|
||||||
|
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
||||||
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
|
from docling.datamodel.base_models import Page, Table, TableStructurePrediction
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
|
||||||
AcceleratorOptions,
|
|
||||||
TableFormerMode,
|
TableFormerMode,
|
||||||
TableStructureOptions,
|
TableStructureOptions,
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.base_model import BasePageModel
|
from docling.models.base_model import BasePageModel
|
||||||
|
from docling.models.utils.hf_model_download import download_hf_model
|
||||||
from docling.utils.accelerator_utils import decide_device
|
from docling.utils.accelerator_utils import decide_device
|
||||||
from docling.utils.profiling import TimeRecorder
|
from docling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
@ -90,20 +90,14 @@ class TableStructureModel(BasePageModel):
|
|||||||
def download_models(
|
def download_models(
|
||||||
local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
|
local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
|
||||||
) -> Path:
|
) -> Path:
|
||||||
from huggingface_hub import snapshot_download
|
return download_hf_model(
|
||||||
from huggingface_hub.utils import disable_progress_bars
|
|
||||||
|
|
||||||
if not progress:
|
|
||||||
disable_progress_bars()
|
|
||||||
download_path = snapshot_download(
|
|
||||||
repo_id="ds4sd/docling-models",
|
repo_id="ds4sd/docling-models",
|
||||||
force_download=force,
|
|
||||||
local_dir=local_dir,
|
|
||||||
revision="v2.2.0",
|
revision="v2.2.0",
|
||||||
|
local_dir=local_dir,
|
||||||
|
force=force,
|
||||||
|
progress=progress,
|
||||||
)
|
)
|
||||||
|
|
||||||
return Path(download_path)
|
|
||||||
|
|
||||||
def draw_table_and_cells(
|
def draw_table_and_cells(
|
||||||
self,
|
self,
|
||||||
conv_res: ConversionResult,
|
conv_res: ConversionResult,
|
||||||
|
@ -13,10 +13,10 @@ import pandas as pd
|
|||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
from docling_core.types.doc.page import TextCell
|
from docling_core.types.doc.page import TextCell
|
||||||
|
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorOptions
|
||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorOptions,
|
|
||||||
OcrOptions,
|
OcrOptions,
|
||||||
TesseractCliOcrOptions,
|
TesseractCliOcrOptions,
|
||||||
)
|
)
|
||||||
|
@ -7,10 +7,10 @@ from typing import Iterable, Optional, Type
|
|||||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||||
from docling_core.types.doc.page import TextCell
|
from docling_core.types.doc.page import TextCell
|
||||||
|
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorOptions
|
||||||
from docling.datamodel.base_models import Page
|
from docling.datamodel.base_models import Page
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorOptions,
|
|
||||||
OcrOptions,
|
OcrOptions,
|
||||||
TesseractOcrOptions,
|
TesseractOcrOptions,
|
||||||
)
|
)
|
||||||
|
0
docling/models/utils/__init__.py
Normal file
0
docling/models/utils/__init__.py
Normal file
40
docling/models/utils/hf_model_download.py
Normal file
40
docling/models/utils/hf_model_download.py
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def download_hf_model(
|
||||||
|
repo_id: str,
|
||||||
|
local_dir: Optional[Path] = None,
|
||||||
|
force: bool = False,
|
||||||
|
progress: bool = False,
|
||||||
|
revision: Optional[str] = None,
|
||||||
|
) -> Path:
|
||||||
|
from huggingface_hub import snapshot_download
|
||||||
|
from huggingface_hub.utils import disable_progress_bars
|
||||||
|
|
||||||
|
if not progress:
|
||||||
|
disable_progress_bars()
|
||||||
|
download_path = snapshot_download(
|
||||||
|
repo_id=repo_id,
|
||||||
|
force_download=force,
|
||||||
|
local_dir=local_dir,
|
||||||
|
revision=revision,
|
||||||
|
)
|
||||||
|
|
||||||
|
return Path(download_path)
|
||||||
|
|
||||||
|
|
||||||
|
class HuggingFaceModelDownloadMixin:
|
||||||
|
@staticmethod
|
||||||
|
def download_models(
|
||||||
|
repo_id: str,
|
||||||
|
local_dir: Optional[Path] = None,
|
||||||
|
force: bool = False,
|
||||||
|
progress: bool = False,
|
||||||
|
) -> Path:
|
||||||
|
return download_hf_model(
|
||||||
|
repo_id=repo_id, local_dir=local_dir, force=force, progress=progress
|
||||||
|
)
|
0
docling/models/vlm_models_inline/__init__.py
Normal file
0
docling/models/vlm_models_inline/__init__.py
Normal file
194
docling/models/vlm_models_inline/hf_transformers_model.py
Normal file
194
docling/models/vlm_models_inline/hf_transformers_model.py
Normal file
@ -0,0 +1,194 @@
|
|||||||
|
import importlib.metadata
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from collections.abc import Iterable
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
|
from docling.datamodel.accelerator_options import (
|
||||||
|
AcceleratorOptions,
|
||||||
|
)
|
||||||
|
from docling.datamodel.base_models import Page, VlmPrediction
|
||||||
|
from docling.datamodel.document import ConversionResult
|
||||||
|
from docling.datamodel.pipeline_options_vlm_model import (
|
||||||
|
InlineVlmOptions,
|
||||||
|
TransformersModelType,
|
||||||
|
)
|
||||||
|
from docling.models.base_model import BasePageModel
|
||||||
|
from docling.models.utils.hf_model_download import (
|
||||||
|
HuggingFaceModelDownloadMixin,
|
||||||
|
)
|
||||||
|
from docling.utils.accelerator_utils import decide_device
|
||||||
|
from docling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class HuggingFaceTransformersVlmModel(BasePageModel, HuggingFaceModelDownloadMixin):
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
enabled: bool,
|
||||||
|
artifacts_path: Optional[Path],
|
||||||
|
accelerator_options: AcceleratorOptions,
|
||||||
|
vlm_options: InlineVlmOptions,
|
||||||
|
):
|
||||||
|
self.enabled = enabled
|
||||||
|
|
||||||
|
self.vlm_options = vlm_options
|
||||||
|
|
||||||
|
if self.enabled:
|
||||||
|
import torch
|
||||||
|
from transformers import (
|
||||||
|
AutoModel,
|
||||||
|
AutoModelForCausalLM,
|
||||||
|
AutoModelForVision2Seq,
|
||||||
|
AutoProcessor,
|
||||||
|
BitsAndBytesConfig,
|
||||||
|
GenerationConfig,
|
||||||
|
)
|
||||||
|
|
||||||
|
transformers_version = importlib.metadata.version("transformers")
|
||||||
|
if (
|
||||||
|
self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct"
|
||||||
|
and transformers_version >= "4.52.0"
|
||||||
|
):
|
||||||
|
raise NotImplementedError(
|
||||||
|
f"Phi 4 only works with transformers<4.52.0 but you have {transformers_version=}. Please downgrage running pip install -U 'transformers<4.52.0'."
|
||||||
|
)
|
||||||
|
|
||||||
|
self.device = decide_device(
|
||||||
|
accelerator_options.device,
|
||||||
|
supported_devices=vlm_options.supported_devices,
|
||||||
|
)
|
||||||
|
_log.debug(f"Available device for VLM: {self.device}")
|
||||||
|
|
||||||
|
self.use_cache = vlm_options.use_kv_cache
|
||||||
|
self.max_new_tokens = vlm_options.max_new_tokens
|
||||||
|
self.temperature = vlm_options.temperature
|
||||||
|
|
||||||
|
repo_cache_folder = vlm_options.repo_id.replace("/", "--")
|
||||||
|
|
||||||
|
if artifacts_path is None:
|
||||||
|
artifacts_path = self.download_models(self.vlm_options.repo_id)
|
||||||
|
elif (artifacts_path / repo_cache_folder).exists():
|
||||||
|
artifacts_path = artifacts_path / repo_cache_folder
|
||||||
|
|
||||||
|
self.param_quantization_config: Optional[BitsAndBytesConfig] = None
|
||||||
|
if vlm_options.quantized:
|
||||||
|
self.param_quantization_config = BitsAndBytesConfig(
|
||||||
|
load_in_8bit=vlm_options.load_in_8bit,
|
||||||
|
llm_int8_threshold=vlm_options.llm_int8_threshold,
|
||||||
|
)
|
||||||
|
|
||||||
|
model_cls: Any = AutoModel
|
||||||
|
if (
|
||||||
|
self.vlm_options.transformers_model_type
|
||||||
|
== TransformersModelType.AUTOMODEL_CAUSALLM
|
||||||
|
):
|
||||||
|
model_cls = AutoModelForCausalLM
|
||||||
|
elif (
|
||||||
|
self.vlm_options.transformers_model_type
|
||||||
|
== TransformersModelType.AUTOMODEL_VISION2SEQ
|
||||||
|
):
|
||||||
|
model_cls = AutoModelForVision2Seq
|
||||||
|
|
||||||
|
self.processor = AutoProcessor.from_pretrained(
|
||||||
|
artifacts_path,
|
||||||
|
trust_remote_code=vlm_options.trust_remote_code,
|
||||||
|
)
|
||||||
|
self.vlm_model = model_cls.from_pretrained(
|
||||||
|
artifacts_path,
|
||||||
|
device_map=self.device,
|
||||||
|
_attn_implementation=(
|
||||||
|
"flash_attention_2"
|
||||||
|
if self.device.startswith("cuda")
|
||||||
|
and accelerator_options.cuda_use_flash_attention2
|
||||||
|
else "eager"
|
||||||
|
),
|
||||||
|
trust_remote_code=vlm_options.trust_remote_code,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Load generation config
|
||||||
|
self.generation_config = GenerationConfig.from_pretrained(artifacts_path)
|
||||||
|
|
||||||
|
def __call__(
|
||||||
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||||
|
) -> Iterable[Page]:
|
||||||
|
for page in page_batch:
|
||||||
|
assert page._backend is not None
|
||||||
|
if not page._backend.is_valid():
|
||||||
|
yield page
|
||||||
|
else:
|
||||||
|
with TimeRecorder(conv_res, "vlm"):
|
||||||
|
assert page.size is not None
|
||||||
|
|
||||||
|
hi_res_image = page.get_image(scale=self.vlm_options.scale)
|
||||||
|
|
||||||
|
# Define prompt structure
|
||||||
|
prompt = self.formulate_prompt()
|
||||||
|
|
||||||
|
inputs = self.processor(
|
||||||
|
text=prompt, images=[hi_res_image], return_tensors="pt"
|
||||||
|
).to(self.device)
|
||||||
|
|
||||||
|
start_time = time.time()
|
||||||
|
# Call model to generate:
|
||||||
|
generated_ids = self.vlm_model.generate(
|
||||||
|
**inputs,
|
||||||
|
max_new_tokens=self.max_new_tokens,
|
||||||
|
use_cache=self.use_cache,
|
||||||
|
temperature=self.temperature,
|
||||||
|
generation_config=self.generation_config,
|
||||||
|
**self.vlm_options.extra_generation_config,
|
||||||
|
)
|
||||||
|
|
||||||
|
generation_time = time.time() - start_time
|
||||||
|
generated_texts = self.processor.batch_decode(
|
||||||
|
generated_ids[:, inputs["input_ids"].shape[1] :],
|
||||||
|
skip_special_tokens=False,
|
||||||
|
)[0]
|
||||||
|
|
||||||
|
num_tokens = len(generated_ids[0])
|
||||||
|
_log.debug(
|
||||||
|
f"Generated {num_tokens} tokens in time {generation_time:.2f} seconds."
|
||||||
|
)
|
||||||
|
page.predictions.vlm_response = VlmPrediction(
|
||||||
|
text=generated_texts,
|
||||||
|
generation_time=generation_time,
|
||||||
|
)
|
||||||
|
|
||||||
|
yield page
|
||||||
|
|
||||||
|
def formulate_prompt(self) -> str:
|
||||||
|
"""Formulate a prompt for the VLM."""
|
||||||
|
|
||||||
|
if self.vlm_options.repo_id == "microsoft/Phi-4-multimodal-instruct":
|
||||||
|
_log.debug("Using specialized prompt for Phi-4")
|
||||||
|
# more info here: https://huggingface.co/microsoft/Phi-4-multimodal-instruct#loading-the-model-locally
|
||||||
|
|
||||||
|
user_prompt = "<|user|>"
|
||||||
|
assistant_prompt = "<|assistant|>"
|
||||||
|
prompt_suffix = "<|end|>"
|
||||||
|
|
||||||
|
prompt = f"{user_prompt}<|image_1|>{self.vlm_options.prompt}{prompt_suffix}{assistant_prompt}"
|
||||||
|
_log.debug(f"prompt for {self.vlm_options.repo_id}: {prompt}")
|
||||||
|
|
||||||
|
return prompt
|
||||||
|
|
||||||
|
messages = [
|
||||||
|
{
|
||||||
|
"role": "user",
|
||||||
|
"content": [
|
||||||
|
{
|
||||||
|
"type": "text",
|
||||||
|
"text": "This is a page from a document.",
|
||||||
|
},
|
||||||
|
{"type": "image"},
|
||||||
|
{"type": "text", "text": self.vlm_options.prompt},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
]
|
||||||
|
prompt = self.processor.apply_chat_template(
|
||||||
|
messages, add_generation_prompt=False
|
||||||
|
)
|
||||||
|
return prompt
|
@ -4,29 +4,34 @@ from collections.abc import Iterable
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from docling.datamodel.base_models import Page, VlmPrediction
|
from docling.datamodel.accelerator_options import (
|
||||||
from docling.datamodel.document import ConversionResult
|
|
||||||
from docling.datamodel.pipeline_options import (
|
|
||||||
AcceleratorOptions,
|
AcceleratorOptions,
|
||||||
HuggingFaceVlmOptions,
|
|
||||||
)
|
)
|
||||||
|
from docling.datamodel.base_models import Page, VlmPrediction, VlmPredictionToken
|
||||||
|
from docling.datamodel.document import ConversionResult
|
||||||
|
from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions
|
||||||
from docling.models.base_model import BasePageModel
|
from docling.models.base_model import BasePageModel
|
||||||
|
from docling.models.utils.hf_model_download import (
|
||||||
|
HuggingFaceModelDownloadMixin,
|
||||||
|
)
|
||||||
from docling.utils.profiling import TimeRecorder
|
from docling.utils.profiling import TimeRecorder
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class HuggingFaceMlxModel(BasePageModel):
|
class HuggingFaceMlxModel(BasePageModel, HuggingFaceModelDownloadMixin):
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
enabled: bool,
|
enabled: bool,
|
||||||
artifacts_path: Optional[Path],
|
artifacts_path: Optional[Path],
|
||||||
accelerator_options: AcceleratorOptions,
|
accelerator_options: AcceleratorOptions,
|
||||||
vlm_options: HuggingFaceVlmOptions,
|
vlm_options: InlineVlmOptions,
|
||||||
):
|
):
|
||||||
self.enabled = enabled
|
self.enabled = enabled
|
||||||
|
|
||||||
self.vlm_options = vlm_options
|
self.vlm_options = vlm_options
|
||||||
|
self.max_tokens = vlm_options.max_new_tokens
|
||||||
|
self.temperature = vlm_options.temperature
|
||||||
|
|
||||||
if self.enabled:
|
if self.enabled:
|
||||||
try:
|
try:
|
||||||
@ -39,42 +44,24 @@ class HuggingFaceMlxModel(BasePageModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
repo_cache_folder = vlm_options.repo_id.replace("/", "--")
|
repo_cache_folder = vlm_options.repo_id.replace("/", "--")
|
||||||
|
|
||||||
self.apply_chat_template = apply_chat_template
|
self.apply_chat_template = apply_chat_template
|
||||||
self.stream_generate = stream_generate
|
self.stream_generate = stream_generate
|
||||||
|
|
||||||
# PARAMETERS:
|
# PARAMETERS:
|
||||||
if artifacts_path is None:
|
if artifacts_path is None:
|
||||||
artifacts_path = self.download_models(self.vlm_options.repo_id)
|
artifacts_path = self.download_models(
|
||||||
|
self.vlm_options.repo_id,
|
||||||
|
)
|
||||||
elif (artifacts_path / repo_cache_folder).exists():
|
elif (artifacts_path / repo_cache_folder).exists():
|
||||||
artifacts_path = artifacts_path / repo_cache_folder
|
artifacts_path = artifacts_path / repo_cache_folder
|
||||||
|
|
||||||
self.param_question = vlm_options.prompt # "Perform Layout Analysis."
|
self.param_question = vlm_options.prompt
|
||||||
|
|
||||||
## Load the model
|
## Load the model
|
||||||
self.vlm_model, self.processor = load(artifacts_path)
|
self.vlm_model, self.processor = load(artifacts_path)
|
||||||
self.config = load_config(artifacts_path)
|
self.config = load_config(artifacts_path)
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def download_models(
|
|
||||||
repo_id: str,
|
|
||||||
local_dir: Optional[Path] = None,
|
|
||||||
force: bool = False,
|
|
||||||
progress: bool = False,
|
|
||||||
) -> Path:
|
|
||||||
from huggingface_hub import snapshot_download
|
|
||||||
from huggingface_hub.utils import disable_progress_bars
|
|
||||||
|
|
||||||
if not progress:
|
|
||||||
disable_progress_bars()
|
|
||||||
download_path = snapshot_download(
|
|
||||||
repo_id=repo_id,
|
|
||||||
force_download=force,
|
|
||||||
local_dir=local_dir,
|
|
||||||
# revision="v0.0.1",
|
|
||||||
)
|
|
||||||
|
|
||||||
return Path(download_path)
|
|
||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||||
) -> Iterable[Page]:
|
) -> Iterable[Page]:
|
||||||
@ -83,12 +70,10 @@ class HuggingFaceMlxModel(BasePageModel):
|
|||||||
if not page._backend.is_valid():
|
if not page._backend.is_valid():
|
||||||
yield page
|
yield page
|
||||||
else:
|
else:
|
||||||
with TimeRecorder(conv_res, "vlm"):
|
with TimeRecorder(conv_res, f"vlm-mlx-{self.vlm_options.repo_id}"):
|
||||||
assert page.size is not None
|
assert page.size is not None
|
||||||
|
|
||||||
hi_res_image = page.get_image(scale=2.0) # 144dpi
|
hi_res_image = page.get_image(scale=self.vlm_options.scale)
|
||||||
# hi_res_image = page.get_image(scale=1.0) # 72dpi
|
|
||||||
|
|
||||||
if hi_res_image is not None:
|
if hi_res_image is not None:
|
||||||
im_width, im_height = hi_res_image.size
|
im_width, im_height = hi_res_image.size
|
||||||
|
|
||||||
@ -104,16 +89,45 @@ class HuggingFaceMlxModel(BasePageModel):
|
|||||||
)
|
)
|
||||||
|
|
||||||
start_time = time.time()
|
start_time = time.time()
|
||||||
|
_log.debug("start generating ...")
|
||||||
|
|
||||||
# Call model to generate:
|
# Call model to generate:
|
||||||
|
tokens: list[VlmPredictionToken] = []
|
||||||
|
|
||||||
output = ""
|
output = ""
|
||||||
for token in self.stream_generate(
|
for token in self.stream_generate(
|
||||||
self.vlm_model,
|
self.vlm_model,
|
||||||
self.processor,
|
self.processor,
|
||||||
prompt,
|
prompt,
|
||||||
[hi_res_image],
|
[hi_res_image],
|
||||||
max_tokens=4096,
|
max_tokens=self.max_tokens,
|
||||||
verbose=False,
|
verbose=False,
|
||||||
|
temp=self.temperature,
|
||||||
):
|
):
|
||||||
|
if len(token.logprobs.shape) == 1:
|
||||||
|
tokens.append(
|
||||||
|
VlmPredictionToken(
|
||||||
|
text=token.text,
|
||||||
|
token=token.token,
|
||||||
|
logprob=token.logprobs[token.token],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
elif (
|
||||||
|
len(token.logprobs.shape) == 2
|
||||||
|
and token.logprobs.shape[0] == 1
|
||||||
|
):
|
||||||
|
tokens.append(
|
||||||
|
VlmPredictionToken(
|
||||||
|
text=token.text,
|
||||||
|
token=token.token,
|
||||||
|
logprob=token.logprobs[0, token.token],
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
_log.warning(
|
||||||
|
f"incompatible shape for logprobs: {token.logprobs.shape}"
|
||||||
|
)
|
||||||
|
|
||||||
output += token.text
|
output += token.text
|
||||||
if "</doctag>" in token.text:
|
if "</doctag>" in token.text:
|
||||||
break
|
break
|
||||||
@ -121,15 +135,13 @@ class HuggingFaceMlxModel(BasePageModel):
|
|||||||
generation_time = time.time() - start_time
|
generation_time = time.time() - start_time
|
||||||
page_tags = output
|
page_tags = output
|
||||||
|
|
||||||
_log.debug(f"Generation time {generation_time:.2f} seconds.")
|
_log.debug(
|
||||||
|
f"{generation_time:.2f} seconds for {len(tokens)} tokens ({len(tokens) / generation_time} tokens/sec)."
|
||||||
# inference_time = time.time() - start_time
|
)
|
||||||
# tokens_per_second = num_tokens / generation_time
|
page.predictions.vlm_response = VlmPrediction(
|
||||||
# print("")
|
text=page_tags,
|
||||||
# print(f"Page Inference Time: {inference_time:.2f} seconds")
|
generation_time=generation_time,
|
||||||
# print(f"Total tokens on page: {num_tokens:.2f}")
|
generated_tokens=tokens,
|
||||||
# print(f"Tokens/sec: {tokens_per_second:.2f}")
|
)
|
||||||
# print("")
|
|
||||||
page.predictions.vlm_response = VlmPrediction(text=page_tags)
|
|
||||||
|
|
||||||
yield page
|
yield page
|
@ -1,29 +1,46 @@
|
|||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Optional, Union, cast
|
from typing import List, Optional, Union, cast
|
||||||
|
|
||||||
from docling_core.types import DoclingDocument
|
from docling_core.types.doc import (
|
||||||
from docling_core.types.doc import BoundingBox, DocItem, ImageRef, PictureItem, TextItem
|
BoundingBox,
|
||||||
|
DocItem,
|
||||||
|
DoclingDocument,
|
||||||
|
ImageRef,
|
||||||
|
PictureItem,
|
||||||
|
ProvenanceItem,
|
||||||
|
TextItem,
|
||||||
|
)
|
||||||
|
from docling_core.types.doc.base import (
|
||||||
|
BoundingBox,
|
||||||
|
Size,
|
||||||
|
)
|
||||||
from docling_core.types.doc.document import DocTagsDocument
|
from docling_core.types.doc.document import DocTagsDocument
|
||||||
from PIL import Image as PILImage
|
from PIL import Image as PILImage
|
||||||
|
|
||||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||||
|
from docling.backend.html_backend import HTMLDocumentBackend
|
||||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
from docling.backend.md_backend import MarkdownDocumentBackend
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat, Page
|
from docling.datamodel.base_models import InputFormat, Page
|
||||||
from docling.datamodel.document import ConversionResult, InputDocument
|
from docling.datamodel.document import ConversionResult, InputDocument
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
ApiVlmOptions,
|
|
||||||
HuggingFaceVlmOptions,
|
|
||||||
InferenceFramework,
|
|
||||||
ResponseFormat,
|
|
||||||
VlmPipelineOptions,
|
VlmPipelineOptions,
|
||||||
)
|
)
|
||||||
|
from docling.datamodel.pipeline_options_vlm_model import (
|
||||||
|
ApiVlmOptions,
|
||||||
|
InferenceFramework,
|
||||||
|
InlineVlmOptions,
|
||||||
|
ResponseFormat,
|
||||||
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
from docling.models.api_vlm_model import ApiVlmModel
|
from docling.models.api_vlm_model import ApiVlmModel
|
||||||
from docling.models.hf_mlx_model import HuggingFaceMlxModel
|
from docling.models.vlm_models_inline.hf_transformers_model import (
|
||||||
from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
HuggingFaceTransformersVlmModel,
|
||||||
|
)
|
||||||
|
from docling.models.vlm_models_inline.mlx_model import HuggingFaceMlxModel
|
||||||
from docling.pipeline.base_pipeline import PaginatedPipeline
|
from docling.pipeline.base_pipeline import PaginatedPipeline
|
||||||
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
from docling.utils.profiling import ProfilingScope, TimeRecorder
|
||||||
|
|
||||||
@ -66,8 +83,8 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
vlm_options=cast(ApiVlmOptions, self.pipeline_options.vlm_options),
|
vlm_options=cast(ApiVlmOptions, self.pipeline_options.vlm_options),
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
elif isinstance(self.pipeline_options.vlm_options, HuggingFaceVlmOptions):
|
elif isinstance(self.pipeline_options.vlm_options, InlineVlmOptions):
|
||||||
vlm_options = cast(HuggingFaceVlmOptions, self.pipeline_options.vlm_options)
|
vlm_options = cast(InlineVlmOptions, self.pipeline_options.vlm_options)
|
||||||
if vlm_options.inference_framework == InferenceFramework.MLX:
|
if vlm_options.inference_framework == InferenceFramework.MLX:
|
||||||
self.build_pipe = [
|
self.build_pipe = [
|
||||||
HuggingFaceMlxModel(
|
HuggingFaceMlxModel(
|
||||||
@ -77,15 +94,19 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
vlm_options=vlm_options,
|
vlm_options=vlm_options,
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
else:
|
elif vlm_options.inference_framework == InferenceFramework.TRANSFORMERS:
|
||||||
self.build_pipe = [
|
self.build_pipe = [
|
||||||
HuggingFaceVlmModel(
|
HuggingFaceTransformersVlmModel(
|
||||||
enabled=True, # must be always enabled for this pipeline to make sense.
|
enabled=True, # must be always enabled for this pipeline to make sense.
|
||||||
artifacts_path=artifacts_path,
|
artifacts_path=artifacts_path,
|
||||||
accelerator_options=pipeline_options.accelerator_options,
|
accelerator_options=pipeline_options.accelerator_options,
|
||||||
vlm_options=vlm_options,
|
vlm_options=vlm_options,
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
else:
|
||||||
|
raise ValueError(
|
||||||
|
f"Could not instantiate the right type of VLM pipeline: {vlm_options.inference_framework}"
|
||||||
|
)
|
||||||
|
|
||||||
self.enrichment_pipe = [
|
self.enrichment_pipe = [
|
||||||
# Other models working on `NodeItem` elements in the DoclingDocument
|
# Other models working on `NodeItem` elements in the DoclingDocument
|
||||||
@ -116,49 +137,19 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
self.pipeline_options.vlm_options.response_format
|
self.pipeline_options.vlm_options.response_format
|
||||||
== ResponseFormat.DOCTAGS
|
== ResponseFormat.DOCTAGS
|
||||||
):
|
):
|
||||||
doctags_list = []
|
conv_res.document = self._turn_dt_into_doc(conv_res)
|
||||||
image_list = []
|
|
||||||
for page in conv_res.pages:
|
|
||||||
predicted_doctags = ""
|
|
||||||
img = PILImage.new("RGB", (1, 1), "rgb(255,255,255)")
|
|
||||||
if page.predictions.vlm_response:
|
|
||||||
predicted_doctags = page.predictions.vlm_response.text
|
|
||||||
if page.image:
|
|
||||||
img = page.image
|
|
||||||
image_list.append(img)
|
|
||||||
doctags_list.append(predicted_doctags)
|
|
||||||
|
|
||||||
doctags_list_c = cast(List[Union[Path, str]], doctags_list)
|
|
||||||
image_list_c = cast(List[Union[Path, PILImage.Image]], image_list)
|
|
||||||
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
|
|
||||||
doctags_list_c, image_list_c
|
|
||||||
)
|
|
||||||
conv_res.document = DoclingDocument.load_from_doctags(doctags_doc)
|
|
||||||
|
|
||||||
# If forced backend text, replace model predicted text with backend one
|
|
||||||
if self.force_backend_text:
|
|
||||||
scale = self.pipeline_options.images_scale
|
|
||||||
for element, _level in conv_res.document.iterate_items():
|
|
||||||
if not isinstance(element, TextItem) or len(element.prov) == 0:
|
|
||||||
continue
|
|
||||||
page_ix = element.prov[0].page_no - 1
|
|
||||||
page = conv_res.pages[page_ix]
|
|
||||||
if not page.size:
|
|
||||||
continue
|
|
||||||
crop_bbox = (
|
|
||||||
element.prov[0]
|
|
||||||
.bbox.scaled(scale=scale)
|
|
||||||
.to_top_left_origin(page_height=page.size.height * scale)
|
|
||||||
)
|
|
||||||
txt = self.extract_text_from_backend(page, crop_bbox)
|
|
||||||
element.text = txt
|
|
||||||
element.orig = txt
|
|
||||||
elif (
|
elif (
|
||||||
self.pipeline_options.vlm_options.response_format
|
self.pipeline_options.vlm_options.response_format
|
||||||
== ResponseFormat.MARKDOWN
|
== ResponseFormat.MARKDOWN
|
||||||
):
|
):
|
||||||
conv_res.document = self._turn_md_into_doc(conv_res)
|
conv_res.document = self._turn_md_into_doc(conv_res)
|
||||||
|
|
||||||
|
elif (
|
||||||
|
self.pipeline_options.vlm_options.response_format == ResponseFormat.HTML
|
||||||
|
):
|
||||||
|
conv_res.document = self._turn_html_into_doc(conv_res)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"Unsupported VLM response format {self.pipeline_options.vlm_options.response_format}"
|
f"Unsupported VLM response format {self.pipeline_options.vlm_options.response_format}"
|
||||||
@ -192,11 +183,81 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
|
|
||||||
return conv_res
|
return conv_res
|
||||||
|
|
||||||
def _turn_md_into_doc(self, conv_res):
|
def _turn_dt_into_doc(self, conv_res) -> DoclingDocument:
|
||||||
predicted_text = ""
|
doctags_list = []
|
||||||
for pg_idx, page in enumerate(conv_res.pages):
|
image_list = []
|
||||||
|
for page in conv_res.pages:
|
||||||
|
predicted_doctags = ""
|
||||||
|
img = PILImage.new("RGB", (1, 1), "rgb(255,255,255)")
|
||||||
if page.predictions.vlm_response:
|
if page.predictions.vlm_response:
|
||||||
predicted_text += page.predictions.vlm_response.text + "\n\n"
|
predicted_doctags = page.predictions.vlm_response.text
|
||||||
|
if page.image:
|
||||||
|
img = page.image
|
||||||
|
image_list.append(img)
|
||||||
|
doctags_list.append(predicted_doctags)
|
||||||
|
|
||||||
|
doctags_list_c = cast(List[Union[Path, str]], doctags_list)
|
||||||
|
image_list_c = cast(List[Union[Path, PILImage.Image]], image_list)
|
||||||
|
doctags_doc = DocTagsDocument.from_doctags_and_image_pairs(
|
||||||
|
doctags_list_c, image_list_c
|
||||||
|
)
|
||||||
|
conv_res.document = DoclingDocument.load_from_doctags(
|
||||||
|
doctag_document=doctags_doc
|
||||||
|
)
|
||||||
|
|
||||||
|
# If forced backend text, replace model predicted text with backend one
|
||||||
|
if page.size:
|
||||||
|
if self.force_backend_text:
|
||||||
|
scale = self.pipeline_options.images_scale
|
||||||
|
for element, _level in conv_res.document.iterate_items():
|
||||||
|
if not isinstance(element, TextItem) or len(element.prov) == 0:
|
||||||
|
continue
|
||||||
|
crop_bbox = (
|
||||||
|
element.prov[0]
|
||||||
|
.bbox.scaled(scale=scale)
|
||||||
|
.to_top_left_origin(page_height=page.size.height * scale)
|
||||||
|
)
|
||||||
|
txt = self.extract_text_from_backend(page, crop_bbox)
|
||||||
|
element.text = txt
|
||||||
|
element.orig = txt
|
||||||
|
|
||||||
|
return conv_res.document
|
||||||
|
|
||||||
|
def _turn_md_into_doc(self, conv_res):
|
||||||
|
def _extract_markdown_code(text):
|
||||||
|
"""
|
||||||
|
Extracts text from markdown code blocks (enclosed in triple backticks).
|
||||||
|
If no code blocks are found, returns the original text.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text (str): Input text that may contain markdown code blocks
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: Extracted code if code blocks exist, otherwise original text
|
||||||
|
"""
|
||||||
|
# Regex pattern to match content between triple backticks
|
||||||
|
# This handles multiline content and optional language specifier
|
||||||
|
pattern = r"^```(?:\w*\n)?(.*?)```(\n)*$"
|
||||||
|
|
||||||
|
# Search with DOTALL flag to match across multiple lines
|
||||||
|
mtch = re.search(pattern, text, re.DOTALL)
|
||||||
|
|
||||||
|
if mtch:
|
||||||
|
# Return only the content of the first capturing group
|
||||||
|
return mtch.group(1)
|
||||||
|
else:
|
||||||
|
# No code blocks found, return original text
|
||||||
|
return text
|
||||||
|
|
||||||
|
for pg_idx, page in enumerate(conv_res.pages):
|
||||||
|
page_no = pg_idx + 1 # FIXME: might be incorrect
|
||||||
|
|
||||||
|
predicted_text = ""
|
||||||
|
if page.predictions.vlm_response:
|
||||||
|
predicted_text = page.predictions.vlm_response.text + "\n\n"
|
||||||
|
|
||||||
|
predicted_text = _extract_markdown_code(text=predicted_text)
|
||||||
|
|
||||||
response_bytes = BytesIO(predicted_text.encode("utf8"))
|
response_bytes = BytesIO(predicted_text.encode("utf8"))
|
||||||
out_doc = InputDocument(
|
out_doc = InputDocument(
|
||||||
path_or_stream=response_bytes,
|
path_or_stream=response_bytes,
|
||||||
@ -208,7 +269,113 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
in_doc=out_doc,
|
in_doc=out_doc,
|
||||||
path_or_stream=response_bytes,
|
path_or_stream=response_bytes,
|
||||||
)
|
)
|
||||||
return backend.convert()
|
page_doc = backend.convert()
|
||||||
|
|
||||||
|
if page.image is not None:
|
||||||
|
pg_width = page.image.width
|
||||||
|
pg_height = page.image.height
|
||||||
|
else:
|
||||||
|
pg_width = 1
|
||||||
|
pg_height = 1
|
||||||
|
|
||||||
|
conv_res.document.add_page(
|
||||||
|
page_no=page_no,
|
||||||
|
size=Size(width=pg_width, height=pg_height),
|
||||||
|
image=ImageRef.from_pil(image=page.image, dpi=72)
|
||||||
|
if page.image
|
||||||
|
else None,
|
||||||
|
)
|
||||||
|
|
||||||
|
for item, level in page_doc.iterate_items():
|
||||||
|
item.prov = [
|
||||||
|
ProvenanceItem(
|
||||||
|
page_no=pg_idx + 1,
|
||||||
|
bbox=BoundingBox(
|
||||||
|
t=0.0, b=0.0, l=0.0, r=0.0
|
||||||
|
), # FIXME: would be nice not to have to "fake" it
|
||||||
|
charspan=[0, 0],
|
||||||
|
)
|
||||||
|
]
|
||||||
|
conv_res.document.append_child_item(child=item)
|
||||||
|
|
||||||
|
return conv_res.document
|
||||||
|
|
||||||
|
def _turn_html_into_doc(self, conv_res):
|
||||||
|
def _extract_html_code(text):
|
||||||
|
"""
|
||||||
|
Extracts text from markdown code blocks (enclosed in triple backticks).
|
||||||
|
If no code blocks are found, returns the original text.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text (str): Input text that may contain markdown code blocks
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: Extracted code if code blocks exist, otherwise original text
|
||||||
|
"""
|
||||||
|
# Regex pattern to match content between triple backticks
|
||||||
|
# This handles multiline content and optional language specifier
|
||||||
|
pattern = r"^```(?:\w*\n)?(.*?)```(\n)*$"
|
||||||
|
|
||||||
|
# Search with DOTALL flag to match across multiple lines
|
||||||
|
mtch = re.search(pattern, text, re.DOTALL)
|
||||||
|
|
||||||
|
if mtch:
|
||||||
|
# Return only the content of the first capturing group
|
||||||
|
return mtch.group(1)
|
||||||
|
else:
|
||||||
|
# No code blocks found, return original text
|
||||||
|
return text
|
||||||
|
|
||||||
|
for pg_idx, page in enumerate(conv_res.pages):
|
||||||
|
page_no = pg_idx + 1 # FIXME: might be incorrect
|
||||||
|
|
||||||
|
predicted_text = ""
|
||||||
|
if page.predictions.vlm_response:
|
||||||
|
predicted_text = page.predictions.vlm_response.text + "\n\n"
|
||||||
|
|
||||||
|
predicted_text = _extract_html_code(text=predicted_text)
|
||||||
|
|
||||||
|
response_bytes = BytesIO(predicted_text.encode("utf8"))
|
||||||
|
out_doc = InputDocument(
|
||||||
|
path_or_stream=response_bytes,
|
||||||
|
filename=conv_res.input.file.name,
|
||||||
|
format=InputFormat.MD,
|
||||||
|
backend=HTMLDocumentBackend,
|
||||||
|
)
|
||||||
|
backend = HTMLDocumentBackend(
|
||||||
|
in_doc=out_doc,
|
||||||
|
path_or_stream=response_bytes,
|
||||||
|
)
|
||||||
|
page_doc = backend.convert()
|
||||||
|
|
||||||
|
if page.image is not None:
|
||||||
|
pg_width = page.image.width
|
||||||
|
pg_height = page.image.height
|
||||||
|
else:
|
||||||
|
pg_width = 1
|
||||||
|
pg_height = 1
|
||||||
|
|
||||||
|
conv_res.document.add_page(
|
||||||
|
page_no=page_no,
|
||||||
|
size=Size(width=pg_width, height=pg_height),
|
||||||
|
image=ImageRef.from_pil(image=page.image, dpi=72)
|
||||||
|
if page.image
|
||||||
|
else None,
|
||||||
|
)
|
||||||
|
|
||||||
|
for item, level in page_doc.iterate_items():
|
||||||
|
item.prov = [
|
||||||
|
ProvenanceItem(
|
||||||
|
page_no=pg_idx + 1,
|
||||||
|
bbox=BoundingBox(
|
||||||
|
t=0.0, b=0.0, l=0.0, r=0.0
|
||||||
|
), # FIXME: would be nice not to have to "fake" it
|
||||||
|
charspan=[0, 0],
|
||||||
|
)
|
||||||
|
]
|
||||||
|
conv_res.document.append_child_item(child=item)
|
||||||
|
|
||||||
|
return conv_res.document
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_default_options(cls) -> VlmPipelineOptions:
|
def get_default_options(cls) -> VlmPipelineOptions:
|
||||||
|
@ -1,13 +1,16 @@
|
|||||||
import logging
|
import logging
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from docling.datamodel.pipeline_options import AcceleratorDevice
|
from docling.datamodel.accelerator_options import AcceleratorDevice
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def decide_device(accelerator_device: str) -> str:
|
def decide_device(
|
||||||
|
accelerator_device: str, supported_devices: Optional[List[AcceleratorDevice]] = None
|
||||||
|
) -> str:
|
||||||
r"""
|
r"""
|
||||||
Resolve the device based on the acceleration options and the available devices in the system.
|
Resolve the device based on the acceleration options and the available devices in the system.
|
||||||
|
|
||||||
@ -20,6 +23,18 @@ def decide_device(accelerator_device: str) -> str:
|
|||||||
has_cuda = torch.backends.cuda.is_built() and torch.cuda.is_available()
|
has_cuda = torch.backends.cuda.is_built() and torch.cuda.is_available()
|
||||||
has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
|
has_mps = torch.backends.mps.is_built() and torch.backends.mps.is_available()
|
||||||
|
|
||||||
|
if supported_devices is not None:
|
||||||
|
if has_cuda and AcceleratorDevice.CUDA not in supported_devices:
|
||||||
|
_log.info(
|
||||||
|
f"Removing CUDA from available devices because it is not in {supported_devices=}"
|
||||||
|
)
|
||||||
|
has_cuda = False
|
||||||
|
if has_mps and AcceleratorDevice.MPS not in supported_devices:
|
||||||
|
_log.info(
|
||||||
|
f"Removing MPS from available devices because it is not in {supported_devices=}"
|
||||||
|
)
|
||||||
|
has_mps = False
|
||||||
|
|
||||||
if accelerator_device == AcceleratorDevice.AUTO.value: # Handle 'auto'
|
if accelerator_device == AcceleratorDevice.AUTO.value: # Handle 'auto'
|
||||||
if has_cuda:
|
if has_cuda:
|
||||||
device = "cuda:0"
|
device = "cuda:0"
|
||||||
|
@ -4,18 +4,20 @@ from typing import Optional
|
|||||||
|
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
granite_picture_description,
|
granite_picture_description,
|
||||||
smoldocling_vlm_conversion_options,
|
|
||||||
smoldocling_vlm_mlx_conversion_options,
|
|
||||||
smolvlm_picture_description,
|
smolvlm_picture_description,
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
|
from docling.datamodel.vlm_model_specs import (
|
||||||
|
SMOLDOCLING_MLX,
|
||||||
|
SMOLDOCLING_TRANSFORMERS,
|
||||||
|
)
|
||||||
from docling.models.code_formula_model import CodeFormulaModel
|
from docling.models.code_formula_model import CodeFormulaModel
|
||||||
from docling.models.document_picture_classifier import DocumentPictureClassifier
|
from docling.models.document_picture_classifier import DocumentPictureClassifier
|
||||||
from docling.models.easyocr_model import EasyOcrModel
|
from docling.models.easyocr_model import EasyOcrModel
|
||||||
from docling.models.hf_vlm_model import HuggingFaceVlmModel
|
|
||||||
from docling.models.layout_model import LayoutModel
|
from docling.models.layout_model import LayoutModel
|
||||||
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
|
from docling.models.picture_description_vlm_model import PictureDescriptionVlmModel
|
||||||
from docling.models.table_structure_model import TableStructureModel
|
from docling.models.table_structure_model import TableStructureModel
|
||||||
|
from docling.models.utils.hf_model_download import download_hf_model
|
||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
@ -75,7 +77,7 @@ def download_models(
|
|||||||
|
|
||||||
if with_smolvlm:
|
if with_smolvlm:
|
||||||
_log.info("Downloading SmolVlm model...")
|
_log.info("Downloading SmolVlm model...")
|
||||||
PictureDescriptionVlmModel.download_models(
|
download_hf_model(
|
||||||
repo_id=smolvlm_picture_description.repo_id,
|
repo_id=smolvlm_picture_description.repo_id,
|
||||||
local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
|
local_dir=output_dir / smolvlm_picture_description.repo_cache_folder,
|
||||||
force=force,
|
force=force,
|
||||||
@ -84,26 +86,25 @@ def download_models(
|
|||||||
|
|
||||||
if with_smoldocling:
|
if with_smoldocling:
|
||||||
_log.info("Downloading SmolDocling model...")
|
_log.info("Downloading SmolDocling model...")
|
||||||
HuggingFaceVlmModel.download_models(
|
download_hf_model(
|
||||||
repo_id=smoldocling_vlm_conversion_options.repo_id,
|
repo_id=SMOLDOCLING_TRANSFORMERS.repo_id,
|
||||||
local_dir=output_dir / smoldocling_vlm_conversion_options.repo_cache_folder,
|
local_dir=output_dir / SMOLDOCLING_TRANSFORMERS.repo_cache_folder,
|
||||||
force=force,
|
force=force,
|
||||||
progress=progress,
|
progress=progress,
|
||||||
)
|
)
|
||||||
|
|
||||||
if with_smoldocling_mlx:
|
if with_smoldocling_mlx:
|
||||||
_log.info("Downloading SmolDocling MLX model...")
|
_log.info("Downloading SmolDocling MLX model...")
|
||||||
HuggingFaceVlmModel.download_models(
|
download_hf_model(
|
||||||
repo_id=smoldocling_vlm_mlx_conversion_options.repo_id,
|
repo_id=SMOLDOCLING_MLX.repo_id,
|
||||||
local_dir=output_dir
|
local_dir=output_dir / SMOLDOCLING_MLX.repo_cache_folder,
|
||||||
/ smoldocling_vlm_mlx_conversion_options.repo_cache_folder,
|
|
||||||
force=force,
|
force=force,
|
||||||
progress=progress,
|
progress=progress,
|
||||||
)
|
)
|
||||||
|
|
||||||
if with_granite_vision:
|
if with_granite_vision:
|
||||||
_log.info("Downloading Granite Vision model...")
|
_log.info("Downloading Granite Vision model...")
|
||||||
PictureDescriptionVlmModel.download_models(
|
download_hf_model(
|
||||||
repo_id=granite_picture_description.repo_id,
|
repo_id=granite_picture_description.repo_id,
|
||||||
local_dir=output_dir / granite_picture_description.repo_cache_folder,
|
local_dir=output_dir / granite_picture_description.repo_cache_folder,
|
||||||
force=force,
|
force=force,
|
||||||
|
160
docs/examples/compare_vlm_models.py
vendored
Normal file
160
docs/examples/compare_vlm_models.py
vendored
Normal file
@ -0,0 +1,160 @@
|
|||||||
|
# Compare VLM models
|
||||||
|
# ==================
|
||||||
|
#
|
||||||
|
# This example runs the VLM pipeline with different vision-language models.
|
||||||
|
# Their runtime as well as output quality is compared.
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from docling_core.types.doc import DocItemLabel, ImageRefMode
|
||||||
|
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
|
||||||
|
from tabulate import tabulate
|
||||||
|
|
||||||
|
from docling.datamodel import vlm_model_specs
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.datamodel.pipeline_options import (
|
||||||
|
VlmPipelineOptions,
|
||||||
|
)
|
||||||
|
from docling.datamodel.pipeline_options_vlm_model import InferenceFramework
|
||||||
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||||
|
|
||||||
|
|
||||||
|
def convert(sources: list[Path], converter: DocumentConverter):
|
||||||
|
model_id = pipeline_options.vlm_options.repo_id.replace("/", "_")
|
||||||
|
framework = pipeline_options.vlm_options.inference_framework
|
||||||
|
for source in sources:
|
||||||
|
print("================================================")
|
||||||
|
print("Processing...")
|
||||||
|
print(f"Source: {source}")
|
||||||
|
print("---")
|
||||||
|
print(f"Model: {model_id}")
|
||||||
|
print(f"Framework: {framework}")
|
||||||
|
print("================================================")
|
||||||
|
print("")
|
||||||
|
|
||||||
|
res = converter.convert(source)
|
||||||
|
|
||||||
|
print("")
|
||||||
|
|
||||||
|
fname = f"{res.input.file.stem}-{model_id}-{framework}"
|
||||||
|
|
||||||
|
inference_time = 0.0
|
||||||
|
for i, page in enumerate(res.pages):
|
||||||
|
inference_time += page.predictions.vlm_response.generation_time
|
||||||
|
print("")
|
||||||
|
print(
|
||||||
|
f" ---------- Predicted page {i} in {pipeline_options.vlm_options.response_format} in {page.predictions.vlm_response.generation_time} [sec]:"
|
||||||
|
)
|
||||||
|
print(page.predictions.vlm_response.text)
|
||||||
|
print(" ---------- ")
|
||||||
|
|
||||||
|
print("===== Final output of the converted document =======")
|
||||||
|
|
||||||
|
with (out_path / f"{fname}.json").open("w") as fp:
|
||||||
|
fp.write(json.dumps(res.document.export_to_dict()))
|
||||||
|
|
||||||
|
res.document.save_as_json(
|
||||||
|
out_path / f"{fname}.json",
|
||||||
|
image_mode=ImageRefMode.PLACEHOLDER,
|
||||||
|
)
|
||||||
|
print(f" => produced {out_path / fname}.json")
|
||||||
|
|
||||||
|
res.document.save_as_markdown(
|
||||||
|
out_path / f"{fname}.md",
|
||||||
|
image_mode=ImageRefMode.PLACEHOLDER,
|
||||||
|
)
|
||||||
|
print(f" => produced {out_path / fname}.md")
|
||||||
|
|
||||||
|
res.document.save_as_html(
|
||||||
|
out_path / f"{fname}.html",
|
||||||
|
image_mode=ImageRefMode.EMBEDDED,
|
||||||
|
labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
|
||||||
|
split_page_view=True,
|
||||||
|
)
|
||||||
|
print(f" => produced {out_path / fname}.html")
|
||||||
|
|
||||||
|
pg_num = res.document.num_pages()
|
||||||
|
print("")
|
||||||
|
print(
|
||||||
|
f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}"
|
||||||
|
)
|
||||||
|
print("====================================================")
|
||||||
|
|
||||||
|
return [
|
||||||
|
source,
|
||||||
|
model_id,
|
||||||
|
str(framework),
|
||||||
|
pg_num,
|
||||||
|
inference_time,
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sources = [
|
||||||
|
"tests/data/pdf/2305.03393v1-pg9.pdf",
|
||||||
|
]
|
||||||
|
|
||||||
|
out_path = Path("scratch")
|
||||||
|
out_path.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
## Use VlmPipeline
|
||||||
|
pipeline_options = VlmPipelineOptions()
|
||||||
|
pipeline_options.generate_page_images = True
|
||||||
|
|
||||||
|
## On GPU systems, enable flash_attention_2 with CUDA:
|
||||||
|
# pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
|
||||||
|
# pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
|
||||||
|
|
||||||
|
vlm_models = [
|
||||||
|
## DocTags / SmolDocling models
|
||||||
|
vlm_model_specs.SMOLDOCLING_MLX,
|
||||||
|
vlm_model_specs.SMOLDOCLING_TRANSFORMERS,
|
||||||
|
## Markdown models (using MLX framework)
|
||||||
|
vlm_model_specs.QWEN25_VL_3B_MLX,
|
||||||
|
vlm_model_specs.PIXTRAL_12B_MLX,
|
||||||
|
vlm_model_specs.GEMMA3_12B_MLX,
|
||||||
|
## Markdown models (using Transformers framework)
|
||||||
|
vlm_model_specs.GRANITE_VISION_TRANSFORMERS,
|
||||||
|
vlm_model_specs.PHI4_TRANSFORMERS,
|
||||||
|
vlm_model_specs.PIXTRAL_12B_TRANSFORMERS,
|
||||||
|
]
|
||||||
|
|
||||||
|
# Remove MLX models if not on Mac
|
||||||
|
if sys.platform != "darwin":
|
||||||
|
vlm_models = [
|
||||||
|
m for m in vlm_models if m.inference_framework != InferenceFramework.MLX
|
||||||
|
]
|
||||||
|
|
||||||
|
rows = []
|
||||||
|
for vlm_options in vlm_models:
|
||||||
|
pipeline_options.vlm_options = vlm_options
|
||||||
|
|
||||||
|
## Set up pipeline for PDF or image inputs
|
||||||
|
converter = DocumentConverter(
|
||||||
|
format_options={
|
||||||
|
InputFormat.PDF: PdfFormatOption(
|
||||||
|
pipeline_cls=VlmPipeline,
|
||||||
|
pipeline_options=pipeline_options,
|
||||||
|
),
|
||||||
|
InputFormat.IMAGE: PdfFormatOption(
|
||||||
|
pipeline_cls=VlmPipeline,
|
||||||
|
pipeline_options=pipeline_options,
|
||||||
|
),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
row = convert(sources=sources, converter=converter)
|
||||||
|
rows.append(row)
|
||||||
|
|
||||||
|
print(
|
||||||
|
tabulate(
|
||||||
|
rows, headers=["source", "model_id", "framework", "num_pages", "time"]
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
print("see if memory gets released ...")
|
||||||
|
time.sleep(10)
|
3
docs/examples/custom_convert.py
vendored
3
docs/examples/custom_convert.py
vendored
@ -3,10 +3,9 @@ import logging
|
|||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
|
||||||
AcceleratorOptions,
|
|
||||||
PdfPipelineOptions,
|
PdfPipelineOptions,
|
||||||
)
|
)
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
101
docs/examples/minimal_vlm_pipeline.py
vendored
101
docs/examples/minimal_vlm_pipeline.py
vendored
@ -1,101 +1,46 @@
|
|||||||
import json
|
from docling.datamodel import vlm_model_specs
|
||||||
import time
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
from docling_core.types.doc import DocItemLabel, ImageRefMode
|
|
||||||
from docling_core.types.doc.document import DEFAULT_EXPORT_LABELS
|
|
||||||
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
VlmPipelineOptions,
|
VlmPipelineOptions,
|
||||||
smoldocling_vlm_mlx_conversion_options,
|
|
||||||
)
|
)
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||||
|
|
||||||
sources = [
|
source = "https://arxiv.org/pdf/2501.17887"
|
||||||
# "tests/data/2305.03393v1-pg9-img.png",
|
|
||||||
"tests/data/pdf/2305.03393v1-pg9.pdf",
|
|
||||||
]
|
|
||||||
|
|
||||||
## Use experimental VlmPipeline
|
###### USING SIMPLE DEFAULT VALUES
|
||||||
pipeline_options = VlmPipelineOptions()
|
# - SmolDocling model
|
||||||
# If force_backend_text = True, text from backend will be used instead of generated text
|
# - Using the transformers framework
|
||||||
pipeline_options.force_backend_text = False
|
|
||||||
|
|
||||||
## On GPU systems, enable flash_attention_2 with CUDA:
|
converter = DocumentConverter(
|
||||||
# pipeline_options.accelerator_options.device = AcceleratorDevice.CUDA
|
format_options={
|
||||||
# pipeline_options.accelerator_options.cuda_use_flash_attention2 = True
|
InputFormat.PDF: PdfFormatOption(
|
||||||
|
pipeline_cls=VlmPipeline,
|
||||||
|
),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
## Pick a VLM model. We choose SmolDocling-256M by default
|
doc = converter.convert(source=source).document
|
||||||
# pipeline_options.vlm_options = smoldocling_vlm_conversion_options
|
|
||||||
|
|
||||||
## Pick a VLM model. Fast Apple Silicon friendly implementation for SmolDocling-256M via MLX
|
print(doc.export_to_markdown())
|
||||||
pipeline_options.vlm_options = smoldocling_vlm_mlx_conversion_options
|
|
||||||
|
|
||||||
## Alternative VLM models:
|
|
||||||
# pipeline_options.vlm_options = granite_vision_vlm_conversion_options
|
|
||||||
|
|
||||||
## Set up pipeline for PDF or image inputs
|
###### USING MACOS MPS ACCELERATOR
|
||||||
|
# For more options see the compare_vlm_models.py example.
|
||||||
|
|
||||||
|
pipeline_options = VlmPipelineOptions(
|
||||||
|
vlm_options=vlm_model_specs.SMOLDOCLING_MLX,
|
||||||
|
)
|
||||||
|
|
||||||
converter = DocumentConverter(
|
converter = DocumentConverter(
|
||||||
format_options={
|
format_options={
|
||||||
InputFormat.PDF: PdfFormatOption(
|
InputFormat.PDF: PdfFormatOption(
|
||||||
pipeline_cls=VlmPipeline,
|
pipeline_cls=VlmPipeline,
|
||||||
pipeline_options=pipeline_options,
|
pipeline_options=pipeline_options,
|
||||||
),
|
),
|
||||||
InputFormat.IMAGE: PdfFormatOption(
|
|
||||||
pipeline_cls=VlmPipeline,
|
|
||||||
pipeline_options=pipeline_options,
|
|
||||||
),
|
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
out_path = Path("scratch")
|
doc = converter.convert(source=source).document
|
||||||
out_path.mkdir(parents=True, exist_ok=True)
|
|
||||||
|
|
||||||
for source in sources:
|
print(doc.export_to_markdown())
|
||||||
start_time = time.time()
|
|
||||||
print("================================================")
|
|
||||||
print(f"Processing... {source}")
|
|
||||||
print("================================================")
|
|
||||||
print("")
|
|
||||||
|
|
||||||
res = converter.convert(source)
|
|
||||||
|
|
||||||
print("")
|
|
||||||
print(res.document.export_to_markdown())
|
|
||||||
|
|
||||||
for page in res.pages:
|
|
||||||
print("")
|
|
||||||
print("Predicted page in DOCTAGS:")
|
|
||||||
print(page.predictions.vlm_response.text)
|
|
||||||
|
|
||||||
res.document.save_as_html(
|
|
||||||
filename=Path(f"{out_path}/{res.input.file.stem}.html"),
|
|
||||||
image_mode=ImageRefMode.REFERENCED,
|
|
||||||
labels=[*DEFAULT_EXPORT_LABELS, DocItemLabel.FOOTNOTE],
|
|
||||||
)
|
|
||||||
|
|
||||||
with (out_path / f"{res.input.file.stem}.json").open("w") as fp:
|
|
||||||
fp.write(json.dumps(res.document.export_to_dict()))
|
|
||||||
|
|
||||||
res.document.save_as_json(
|
|
||||||
out_path / f"{res.input.file.stem}.json",
|
|
||||||
image_mode=ImageRefMode.PLACEHOLDER,
|
|
||||||
)
|
|
||||||
|
|
||||||
res.document.save_as_markdown(
|
|
||||||
out_path / f"{res.input.file.stem}.md",
|
|
||||||
image_mode=ImageRefMode.PLACEHOLDER,
|
|
||||||
)
|
|
||||||
|
|
||||||
pg_num = res.document.num_pages()
|
|
||||||
print("")
|
|
||||||
inference_time = time.time() - start_time
|
|
||||||
print(
|
|
||||||
f"Total document prediction time: {inference_time:.2f} seconds, pages: {pg_num}"
|
|
||||||
)
|
|
||||||
|
|
||||||
print("================================================")
|
|
||||||
print("done!")
|
|
||||||
print("================================================")
|
|
||||||
|
3
docs/examples/run_with_accelerator.py
vendored
3
docs/examples/run_with_accelerator.py
vendored
@ -1,9 +1,8 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
|
||||||
AcceleratorOptions,
|
|
||||||
PdfPipelineOptions,
|
PdfPipelineOptions,
|
||||||
)
|
)
|
||||||
from docling.datamodel.settings import settings
|
from docling.datamodel.settings import settings
|
||||||
|
1
docs/examples/translate.py
vendored
1
docs/examples/translate.py
vendored
@ -1,5 +1,4 @@
|
|||||||
import logging
|
import logging
|
||||||
import time
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling_core.types.doc import ImageRefMode, TableItem, TextItem
|
from docling_core.types.doc import ImageRefMode, TableItem, TextItem
|
||||||
|
3
docs/examples/vlm_pipeline_api_model.py
vendored
3
docs/examples/vlm_pipeline_api_model.py
vendored
@ -7,10 +7,9 @@ from dotenv import load_dotenv
|
|||||||
|
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
ApiVlmOptions,
|
|
||||||
ResponseFormat,
|
|
||||||
VlmPipelineOptions,
|
VlmPipelineOptions,
|
||||||
)
|
)
|
||||||
|
from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, ResponseFormat
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
from docling.pipeline.vlm_pipeline import VlmPipeline
|
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||||
|
|
||||||
|
52
docs/faq/index.md
vendored
52
docs/faq/index.md
vendored
@ -44,6 +44,23 @@ This is a collection of FAQ collected from the user questions on <https://github
|
|||||||
Source: Issue [#283](https://github.com/docling-project/docling/issues/283#issuecomment-2465035868)
|
Source: Issue [#283](https://github.com/docling-project/docling/issues/283#issuecomment-2465035868)
|
||||||
|
|
||||||
|
|
||||||
|
??? question "Is macOS x86_64 supported?"
|
||||||
|
|
||||||
|
### Is macOS x86_64 supported?
|
||||||
|
|
||||||
|
Yes, Docling (still) supports running the standard pipeline on macOS x86_64.
|
||||||
|
|
||||||
|
However, users might get into a combination of incompatible dependencies on a fresh install.
|
||||||
|
Because Docling depends on PyTorch which dropped support for macOS x86_64 after the 2.2.2 release,
|
||||||
|
and this old version of PyTorch works only with NumPy 1.x, users **must** ensure the correct NumPy version is running.
|
||||||
|
|
||||||
|
```shell
|
||||||
|
pip install docling "numpy<2.0.0"
|
||||||
|
```
|
||||||
|
|
||||||
|
Source: Issue [#1694](https://github.com/docling-project/docling/issues/1694).
|
||||||
|
|
||||||
|
|
||||||
??? question "Are text styles (bold, underline, etc) supported?"
|
??? question "Are text styles (bold, underline, etc) supported?"
|
||||||
|
|
||||||
### Are text styles (bold, underline, etc) supported?
|
### Are text styles (bold, underline, etc) supported?
|
||||||
@ -177,3 +194,38 @@ This is a collection of FAQ collected from the user questions on <https://github
|
|||||||
Also see [docling#725](https://github.com/docling-project/docling/issues/725).
|
Also see [docling#725](https://github.com/docling-project/docling/issues/725).
|
||||||
|
|
||||||
Source: Issue [docling-core#119](https://github.com/docling-project/docling-core/issues/119)
|
Source: Issue [docling-core#119](https://github.com/docling-project/docling-core/issues/119)
|
||||||
|
|
||||||
|
|
||||||
|
??? question "How to use flash attention?"
|
||||||
|
|
||||||
|
### How to use flash attention?
|
||||||
|
|
||||||
|
When running models in Docling on CUDA devices, you can enable the usage of the Flash Attention2 library.
|
||||||
|
|
||||||
|
Using environment variables:
|
||||||
|
|
||||||
|
```
|
||||||
|
DOCLING_CUDA_USE_FLASH_ATTENTION2=1
|
||||||
|
```
|
||||||
|
|
||||||
|
Using code:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from docling.datamodel.accelerator_options import (
|
||||||
|
AcceleratorOptions,
|
||||||
|
)
|
||||||
|
|
||||||
|
pipeline_options = VlmPipelineOptions(
|
||||||
|
accelerator_options=AcceleratorOptions(cuda_use_flash_attention2=True)
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
This requires having the [flash-attn](https://pypi.org/project/flash-attn/) package installed. Below are two alternative ways for installing it:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
# Building from sources (requires the CUDA dev environment)
|
||||||
|
pip install flash-attn
|
||||||
|
|
||||||
|
# Using pre-built wheels (not available in all possible setups)
|
||||||
|
FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn
|
||||||
|
```
|
||||||
|
10
docs/index.md
vendored
10
docs/index.md
vendored
@ -6,13 +6,13 @@
|
|||||||
[](https://arxiv.org/abs/2408.09869)
|
[](https://arxiv.org/abs/2408.09869)
|
||||||
[](https://pypi.org/project/docling/)
|
[](https://pypi.org/project/docling/)
|
||||||
[](https://pypi.org/project/docling/)
|
[](https://pypi.org/project/docling/)
|
||||||
[](https://python-poetry.org/)
|
[](https://github.com/astral-sh/uv)
|
||||||
[](https://github.com/psf/black)
|
[](https://github.com/astral-sh/ruff)
|
||||||
[](https://pycqa.github.io/isort/)
|
|
||||||
[](https://pydantic.dev)
|
[](https://pydantic.dev)
|
||||||
[](https://github.com/pre-commit/pre-commit)
|
[](https://github.com/pre-commit/pre-commit)
|
||||||
[](https://opensource.org/licenses/MIT)
|
[](https://opensource.org/licenses/MIT)
|
||||||
[](https://pepy.tech/projects/docling)
|
[](https://pepy.tech/projects/docling)
|
||||||
|
[](https://apify.com/vancura/docling)
|
||||||
[](https://www.bestpractices.dev/projects/10101)
|
[](https://www.bestpractices.dev/projects/10101)
|
||||||
[](https://lfaidata.foundation/projects/)
|
[](https://lfaidata.foundation/projects/)
|
||||||
|
|
||||||
@ -27,7 +27,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|||||||
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
* 🔒 Local execution capabilities for sensitive data and air-gapped environments
|
||||||
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
* 🤖 Plug-and-play [integrations][integrations] incl. LangChain, LlamaIndex, Crew AI & Haystack for agentic AI
|
||||||
* 🔍 Extensive OCR support for scanned PDFs and images
|
* 🔍 Extensive OCR support for scanned PDFs and images
|
||||||
* 🥚 Support of Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🆕🔥
|
* 🥚 Support of several Visual Language Models ([SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)) 🔥
|
||||||
* 💻 Simple and convenient CLI
|
* 💻 Simple and convenient CLI
|
||||||
|
|
||||||
### Coming soon
|
### Coming soon
|
||||||
@ -39,7 +39,7 @@ Docling simplifies document processing, parsing diverse formats — including ad
|
|||||||
## Get started
|
## Get started
|
||||||
|
|
||||||
<div class="grid">
|
<div class="grid">
|
||||||
<a href="concepts/" class="card"><b>Concepts</b><br />Learn Docling fundamendals</a>
|
<a href="concepts/" class="card"><b>Concepts</b><br />Learn Docling fundamentals</a>
|
||||||
<a href="examples/" class="card"><b>Examples</b><br />Try out recipes for various use cases, including conversion, RAG, and more</a>
|
<a href="examples/" class="card"><b>Examples</b><br />Try out recipes for various use cases, including conversion, RAG, and more</a>
|
||||||
<a href="integrations/" class="card"><b>Integrations</b><br />Check out integrations with popular frameworks and tools</a>
|
<a href="integrations/" class="card"><b>Integrations</b><br />Check out integrations with popular frameworks and tools</a>
|
||||||
<a href="reference/document_converter/" class="card"><b>Reference</b><br />See more API details</a>
|
<a href="reference/document_converter/" class="card"><b>Reference</b><br />See more API details</a>
|
||||||
|
2
docs/installation/index.md
vendored
2
docs/installation/index.md
vendored
@ -129,5 +129,5 @@ Works on macOS, Linux, and Windows, with support for both x86_64 and arm64 archi
|
|||||||
To develop Docling features, bugfixes etc., install as follows from your local clone's root dir:
|
To develop Docling features, bugfixes etc., install as follows from your local clone's root dir:
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
poetry install --all-extras
|
uv sync --all-extras
|
||||||
```
|
```
|
||||||
|
121
docs/usage/vision_models.md
vendored
Normal file
121
docs/usage/vision_models.md
vendored
Normal file
@ -0,0 +1,121 @@
|
|||||||
|
|
||||||
|
The `VlmPipeline` in Docling allows you to convert documents end-to-end using a vision-language model.
|
||||||
|
|
||||||
|
Docling supports vision-language models which output:
|
||||||
|
|
||||||
|
- DocTags (e.g. [SmolDocling](https://huggingface.co/ds4sd/SmolDocling-256M-preview)), the preferred choice
|
||||||
|
- Markdown
|
||||||
|
- HTML
|
||||||
|
|
||||||
|
|
||||||
|
For running Docling using local models with the `VlmPipeline`:
|
||||||
|
|
||||||
|
=== "CLI"
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docling --pipeline vlm FILE
|
||||||
|
```
|
||||||
|
|
||||||
|
=== "Python"
|
||||||
|
|
||||||
|
See also the example [minimal_vlm_pipeline.py](./../examples/minimal_vlm_pipeline.py).
|
||||||
|
|
||||||
|
```python
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||||
|
|
||||||
|
converter = DocumentConverter(
|
||||||
|
format_options={
|
||||||
|
InputFormat.PDF: PdfFormatOption(
|
||||||
|
pipeline_cls=VlmPipeline,
|
||||||
|
),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
doc = converter.convert(source="FILE").document
|
||||||
|
```
|
||||||
|
|
||||||
|
## Available local models
|
||||||
|
|
||||||
|
By default, the vision-language models are running locally.
|
||||||
|
Docling allows you to choose between the Hugging Face [Transformers](https://github.com/huggingface/transformers) framework and the [MLX](https://github.com/Blaizzy/mlx-vlm) framework (for Apple devices with MPS acceleration).
|
||||||
|
|
||||||
|
The following table reports the models currently available out-of-the-box.
|
||||||
|
|
||||||
|
| Model instance | Model | Framework | Device | Num pages | Inference time (sec) |
|
||||||
|
| ---------------|------ | --------- | ------ | --------- | ---------------------|
|
||||||
|
| `vlm_model_specs.SMOLDOCLING_TRANSFORMERS` | [ds4sd/SmolDocling-256M-preview](https://huggingface.co/ds4sd/SmolDocling-256M-preview) | `Transformers/AutoModelForVision2Seq` | MPS | 1 | 102.212 |
|
||||||
|
| `vlm_model_specs.SMOLDOCLING_MLX` | [ds4sd/SmolDocling-256M-preview-mlx-bf16](https://huggingface.co/ds4sd/SmolDocling-256M-preview-mlx-bf16) | `MLX`| MPS | 1 | 6.15453 |
|
||||||
|
| `vlm_model_specs.QWEN25_VL_3B_MLX` | [mlx-community/Qwen2.5-VL-3B-Instruct-bf16](https://huggingface.co/mlx-community/Qwen2.5-VL-3B-Instruct-bf16) | `MLX`| MPS | 1 | 23.4951 |
|
||||||
|
| `vlm_model_specs.PIXTRAL_12B_MLX` | [mlx-community/pixtral-12b-bf16](https://huggingface.co/mlx-community/pixtral-12b-bf16) | `MLX` | MPS | 1 | 308.856 |
|
||||||
|
| `vlm_model_specs.GEMMA3_12B_MLX` | [mlx-community/gemma-3-12b-it-bf16](https://huggingface.co/mlx-community/gemma-3-12b-it-bf16) | `MLX` | MPS | 1 | 378.486 |
|
||||||
|
| `vlm_model_specs.GRANITE_VISION_TRANSFORMERS` | [ibm-granite/granite-vision-3.2-2b](https://huggingface.co/ibm-granite/granite-vision-3.2-2b) | `Transformers/AutoModelForVision2Seq` | MPS | 1 | 104.75 |
|
||||||
|
| `vlm_model_specs.PHI4_TRANSFORMERS` | [microsoft/Phi-4-multimodal-instruct](https://huggingface.co/microsoft/Phi-4-multimodal-instruct) | `Transformers/AutoModelForCausalLM` | CPU | 1 | 1175.67 |
|
||||||
|
| `vlm_model_specs.PIXTRAL_12B_TRANSFORMERS` | [mistral-community/pixtral-12b](https://huggingface.co/mistral-community/pixtral-12b) | `Transformers/AutoModelForVision2Seq` | CPU | 1 | 1828.21 |
|
||||||
|
|
||||||
|
_Inference time is computed on a MacBook M3 Max using the example page `tests/data/pdf/2305.03393v1-pg9.pdf`. The comparison is done with the example [compare_vlm_models.py](./../examples/compare_vlm_models.py)._
|
||||||
|
|
||||||
|
To choose the model, the code snippet above can be extended as follows:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
from docling.pipeline.vlm_pipeline import VlmPipeline
|
||||||
|
from docling.datamodel.pipeline_options import (
|
||||||
|
VlmPipelineOptions,
|
||||||
|
)
|
||||||
|
from docling.datamodel import vlm_model_specs
|
||||||
|
|
||||||
|
pipeline_options = VlmPipelineOptions(
|
||||||
|
vlm_options=vlm_model_specs.SMOLDOCLING_MLX, # <-- change the model here
|
||||||
|
)
|
||||||
|
|
||||||
|
converter = DocumentConverter(
|
||||||
|
format_options={
|
||||||
|
InputFormat.PDF: PdfFormatOption(
|
||||||
|
pipeline_cls=VlmPipeline,
|
||||||
|
pipeline_options=pipeline_options,
|
||||||
|
),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
doc = converter.convert(source="FILE").document
|
||||||
|
```
|
||||||
|
|
||||||
|
### Other models
|
||||||
|
|
||||||
|
Other models can be configured by directly providing the Hugging Face `repo_id`, the prompt and a few more options.
|
||||||
|
|
||||||
|
For example:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from docling.datamodel.pipeline_options_vlm_model import InlineVlmOptions, InferenceFramework, TransformersModelType
|
||||||
|
|
||||||
|
pipeline_options = VlmPipelineOptions(
|
||||||
|
vlm_options=InlineVlmOptions(
|
||||||
|
repo_id="ibm-granite/granite-vision-3.2-2b",
|
||||||
|
prompt="Convert this page to markdown. Do not miss any text and only output the bare markdown!",
|
||||||
|
response_format=ResponseFormat.MARKDOWN,
|
||||||
|
inference_framework=InferenceFramework.TRANSFORMERS,
|
||||||
|
transformers_model_type=TransformersModelType.AUTOMODEL_VISION2SEQ,
|
||||||
|
supported_devices=[
|
||||||
|
AcceleratorDevice.CPU,
|
||||||
|
AcceleratorDevice.CUDA,
|
||||||
|
AcceleratorDevice.MPS,
|
||||||
|
],
|
||||||
|
scale=2.0,
|
||||||
|
temperature=0.0,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
## Remote models
|
||||||
|
|
||||||
|
In addition to local models, the `VlmPipeline` allows offloading the inference to a remote service hosting the models.
|
||||||
|
Many remote inference services can be used; the key requirement is that they offer an OpenAI-compatible API. This includes vLLM, Ollama, etc.
|
||||||
|
|
||||||
|
More examples on how to connect with the remote inference services can be found in the following examples:
|
||||||
|
|
||||||
|
- [vlm_pipeline_api_model.py](./../examples/vlm_pipeline_api_model.py)
|
@ -60,6 +60,7 @@ nav:
|
|||||||
- Usage: usage/index.md
|
- Usage: usage/index.md
|
||||||
- Supported formats: usage/supported_formats.md
|
- Supported formats: usage/supported_formats.md
|
||||||
- Enrichment features: usage/enrichments.md
|
- Enrichment features: usage/enrichments.md
|
||||||
|
- Vision models: usage/vision_models.md
|
||||||
- FAQ:
|
- FAQ:
|
||||||
- FAQ: faq/index.md
|
- FAQ: faq/index.md
|
||||||
- Concepts:
|
- Concepts:
|
||||||
@ -78,6 +79,7 @@ nav:
|
|||||||
- "Multi-format conversion": examples/run_with_formats.py
|
- "Multi-format conversion": examples/run_with_formats.py
|
||||||
- "VLM pipeline with SmolDocling": examples/minimal_vlm_pipeline.py
|
- "VLM pipeline with SmolDocling": examples/minimal_vlm_pipeline.py
|
||||||
- "VLM pipeline with remote model": examples/vlm_pipeline_api_model.py
|
- "VLM pipeline with remote model": examples/vlm_pipeline_api_model.py
|
||||||
|
- "VLM comparison": examples/compare_vlm_models.py
|
||||||
- "Figure export": examples/export_figures.py
|
- "Figure export": examples/export_figures.py
|
||||||
- "Table export": examples/export_tables.py
|
- "Table export": examples/export_tables.py
|
||||||
- "Multimodal export": examples/export_multimodal.py
|
- "Multimodal export": examples/export_multimodal.py
|
||||||
|
8637
poetry.lock
generated
8637
poetry.lock
generated
File diff suppressed because it is too large
Load Diff
250
pyproject.toml
250
pyproject.toml
@ -1,20 +1,8 @@
|
|||||||
[tool.poetry]
|
[project]
|
||||||
name = "docling"
|
name = "docling"
|
||||||
version = "2.34.0" # DO NOT EDIT, updated automatically
|
version = "2.36.1" # DO NOT EDIT, updated automatically
|
||||||
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
|
||||||
authors = [
|
|
||||||
"Christoph Auer <cau@zurich.ibm.com>",
|
|
||||||
"Michele Dolfi <dol@zurich.ibm.com>",
|
|
||||||
"Maxim Lysak <mly@zurich.ibm.com>",
|
|
||||||
"Nikos Livathinos <nli@zurich.ibm.com>",
|
|
||||||
"Ahmed Nassar <ahn@zurich.ibm.com>",
|
|
||||||
"Panos Vagenas <pva@zurich.ibm.com>",
|
|
||||||
"Peter Staar <taa@zurich.ibm.com>",
|
|
||||||
]
|
|
||||||
license = "MIT"
|
license = "MIT"
|
||||||
readme = "README.md"
|
|
||||||
repository = "https://github.com/docling-project/docling"
|
|
||||||
homepage = "https://github.com/docling-project/docling"
|
|
||||||
keywords = [
|
keywords = [
|
||||||
"docling",
|
"docling",
|
||||||
"convert",
|
"convert",
|
||||||
@ -29,144 +17,136 @@ keywords = [
|
|||||||
"table former",
|
"table former",
|
||||||
]
|
]
|
||||||
classifiers = [
|
classifiers = [
|
||||||
"License :: OSI Approved :: MIT License",
|
|
||||||
"Operating System :: MacOS :: MacOS X",
|
"Operating System :: MacOS :: MacOS X",
|
||||||
"Operating System :: POSIX :: Linux",
|
"Operating System :: POSIX :: Linux",
|
||||||
|
"Operating System :: Microsoft :: Windows",
|
||||||
"Development Status :: 5 - Production/Stable",
|
"Development Status :: 5 - Production/Stable",
|
||||||
"Intended Audience :: Developers",
|
"Intended Audience :: Developers",
|
||||||
"Intended Audience :: Science/Research",
|
"Intended Audience :: Science/Research",
|
||||||
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
||||||
"Programming Language :: Python :: 3",
|
"Programming Language :: Python :: 3",
|
||||||
|
"Programming Language :: Python :: 3.9",
|
||||||
|
"Programming Language :: Python :: 3.10",
|
||||||
|
"Programming Language :: Python :: 3.11",
|
||||||
|
"Programming Language :: Python :: 3.12",
|
||||||
|
"Programming Language :: Python :: 3.13",
|
||||||
]
|
]
|
||||||
packages = [{ include = "docling" }]
|
readme = "README.md"
|
||||||
|
authors = [
|
||||||
[tool.poetry.dependencies]
|
{ name = "Christoph Auer", email = "cau@zurich.ibm.com" },
|
||||||
######################
|
{ name = "Michele Dolfi", email = "dol@zurich.ibm.com" },
|
||||||
# actual dependencies:
|
{ name = "Maxim Lysak", email = "mly@zurich.ibm.com" },
|
||||||
######################
|
{ name = "Nikos Livathinos", email = "nli@zurich.ibm.com" },
|
||||||
python = "^3.9"
|
{ name = "Ahmed Nassar", email = "ahn@zurich.ibm.com" },
|
||||||
pydantic = "^2.0.0"
|
{ name = "Panos Vagenas", email = "pva@zurich.ibm.com" },
|
||||||
docling-core = {version = "^2.31.2", extras = ["chunking"]}
|
{ name = "Peter Staar", email = "taa@zurich.ibm.com" },
|
||||||
docling-ibm-models = "^3.4.0"
|
|
||||||
docling-parse = "^4.0.0"
|
|
||||||
filetype = "^1.2.0"
|
|
||||||
pypdfium2 = "^4.30.0"
|
|
||||||
pydantic-settings = "^2.3.0"
|
|
||||||
huggingface_hub = ">=0.23,<1"
|
|
||||||
requests = "^2.32.2"
|
|
||||||
easyocr = "^1.7"
|
|
||||||
tesserocr = { version = "^2.7.1", optional = true }
|
|
||||||
certifi = ">=2024.7.4"
|
|
||||||
rtree = "^1.3.0"
|
|
||||||
scipy = [
|
|
||||||
{ version = "^1.6.0", markers = "python_version >= '3.10'" },
|
|
||||||
{ version = ">=1.6.0,<1.14.0", markers = "python_version < '3.10'" },
|
|
||||||
]
|
]
|
||||||
typer = ">=0.12.5,<0.16.0"
|
requires-python = '>=3.9,<4.0'
|
||||||
python-docx = "^1.1.2"
|
dependencies = [
|
||||||
python-pptx = "^1.0.2"
|
'pydantic (>=2.0.0,<3.0.0)',
|
||||||
beautifulsoup4 = "^4.12.3"
|
'docling-core[chunking] (>=2.29.0,<3.0.0)',
|
||||||
pandas = "^2.1.4"
|
'docling-ibm-models (>=3.4.4,<4.0.0)',
|
||||||
marko = "^2.1.2"
|
'docling-parse (>=4.0.0,<5.0.0)',
|
||||||
openpyxl = "^3.1.5"
|
'filetype (>=1.2.0,<2.0.0)',
|
||||||
lxml = ">=4.0.0,<6.0.0"
|
'pypdfium2 (>=4.30.0,<5.0.0)',
|
||||||
ocrmac = { version = "^1.0.0", markers = "sys_platform == 'darwin'", optional = true }
|
'pydantic-settings (>=2.3.0,<3.0.0)',
|
||||||
rapidocr-onnxruntime = { version = "^1.4.0", optional = true, markers = "python_version < '3.13'" }
|
'huggingface_hub (>=0.23,<1)',
|
||||||
onnxruntime = [
|
'requests (>=2.32.2,<3.0.0)',
|
||||||
# 1.19.2 is the last version with python3.9 support,
|
'easyocr (>=1.7,<2.0)',
|
||||||
# see https://github.com/microsoft/onnxruntime/releases/tag/v1.20.0
|
'certifi (>=2024.7.4)',
|
||||||
{ version = ">=1.7.0,<1.20.0", optional = true, markers = "python_version < '3.10'" },
|
'rtree (>=1.3.0,<2.0.0)',
|
||||||
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" },
|
'typer (>=0.12.5,<0.17.0)',
|
||||||
|
'python-docx (>=1.1.2,<2.0.0)',
|
||||||
|
'python-pptx (>=1.0.2,<2.0.0)',
|
||||||
|
'beautifulsoup4 (>=4.12.3,<5.0.0)',
|
||||||
|
'pandas (>=2.1.4,<3.0.0)',
|
||||||
|
'marko (>=2.1.2,<3.0.0)',
|
||||||
|
'openpyxl (>=3.1.5,<4.0.0)',
|
||||||
|
'lxml (>=4.0.0,<6.0.0)',
|
||||||
|
'pillow (>=10.0.0,<12.0.0)',
|
||||||
|
'tqdm (>=4.65.0,<5.0.0)',
|
||||||
|
'pluggy (>=1.0.0,<2.0.0)',
|
||||||
|
'pylatexenc (>=2.10,<3.0)',
|
||||||
|
'scipy (>=1.6.0,<2.0.0)',
|
||||||
|
# 'scipy (>=1.6.0,<2.0.0) ; python_version >= "3.10"',
|
||||||
|
# 'scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"',
|
||||||
]
|
]
|
||||||
|
|
||||||
transformers = [
|
[project.urls]
|
||||||
{ markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^4.46.0", optional = true },
|
homepage = "https://github.com/docling-project/docling"
|
||||||
{ markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~4.42.0", optional = true },
|
repository = "https://github.com/docling-project/docling"
|
||||||
]
|
issues = "https://github.com/docling-project/docling/issues"
|
||||||
accelerate = [
|
changelog = "https://github.com/docling-project/docling/blob/main/CHANGELOG.md"
|
||||||
{ markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^1.2.1", optional = true },
|
|
||||||
]
|
|
||||||
pillow = ">=10.0.0,<12.0.0"
|
|
||||||
tqdm = "^4.65.0"
|
|
||||||
pluggy = "^1.0.0"
|
|
||||||
pylatexenc = "^2.10"
|
|
||||||
click = "<8.2.0"
|
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[project.entry-points.docling]
|
||||||
python = "^3.9.2"
|
"docling_defaults" = "docling.models.plugins.defaults"
|
||||||
black = { extras = ["jupyter"], version = "^24.4.2" }
|
|
||||||
pytest = "^7.2.2"
|
|
||||||
pre-commit = "^3.7.1"
|
|
||||||
mypy = "^1.10.1"
|
|
||||||
isort = "^5.10.1"
|
|
||||||
python-semantic-release = "^7.32.2"
|
|
||||||
flake8 = "^6.0.0"
|
|
||||||
pyproject-flake8 = "^6.0.0"
|
|
||||||
pytest-xdist = "^3.3.1"
|
|
||||||
types-requests = "^2.31.0.2"
|
|
||||||
flake8-pyproject = "^1.2.3"
|
|
||||||
pylint = "^2.17.5"
|
|
||||||
pandas-stubs = "^2.1.4.231227"
|
|
||||||
ipykernel = "^6.29.5"
|
|
||||||
ipywidgets = "^8.1.5"
|
|
||||||
nbqa = "^1.9.0"
|
|
||||||
types-openpyxl = "^3.1.5.20241114"
|
|
||||||
types-tqdm = "^4.67.0.20241221"
|
|
||||||
coverage = "^7.6.2"
|
|
||||||
pytest-cov = "^6.0.0"
|
|
||||||
|
|
||||||
[tool.poetry.group.docs.dependencies]
|
[project.scripts]
|
||||||
mkdocs-material = "^9.5.40"
|
|
||||||
mkdocs-jupyter = "^0.25.0"
|
|
||||||
mkdocs-click = "^0.8.1"
|
|
||||||
mkdocstrings = { extras = ["python"], version = "^0.27.0" }
|
|
||||||
griffe-pydantic = "^1.1.0"
|
|
||||||
|
|
||||||
[tool.poetry.group.examples.dependencies]
|
|
||||||
datasets = "^2.21.0"
|
|
||||||
python-dotenv = "^1.0.1"
|
|
||||||
langchain-huggingface = "^0.0.3"
|
|
||||||
langchain-milvus = "^0.1.4"
|
|
||||||
langchain-text-splitters = "^0.2.4"
|
|
||||||
|
|
||||||
[tool.poetry.group.constraints]
|
|
||||||
optional = true
|
|
||||||
|
|
||||||
[tool.poetry.group.constraints.dependencies]
|
|
||||||
numpy = [
|
|
||||||
{ version = ">=1.24.4,<3.0.0", markers = 'python_version >= "3.10"' },
|
|
||||||
{ version = ">=1.24.4,<2.1.0", markers = 'python_version < "3.10"' },
|
|
||||||
]
|
|
||||||
|
|
||||||
[tool.poetry.group.mac_intel]
|
|
||||||
optional = true
|
|
||||||
|
|
||||||
[tool.poetry.group.mac_intel.dependencies]
|
|
||||||
torch = [
|
|
||||||
{ markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^2.2.2" },
|
|
||||||
{ markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~2.2.2" },
|
|
||||||
]
|
|
||||||
torchvision = [
|
|
||||||
{ markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'", version = "^0" },
|
|
||||||
{ markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'", version = "~0.17.2" },
|
|
||||||
]
|
|
||||||
|
|
||||||
[tool.poetry.extras]
|
|
||||||
tesserocr = ["tesserocr"]
|
|
||||||
ocrmac = ["ocrmac"]
|
|
||||||
vlm = ["transformers", "accelerate"]
|
|
||||||
rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
|
|
||||||
|
|
||||||
[tool.poetry.scripts]
|
|
||||||
docling = "docling.cli.main:app"
|
docling = "docling.cli.main:app"
|
||||||
docling-tools = "docling.cli.tools:app"
|
docling-tools = "docling.cli.tools:app"
|
||||||
|
|
||||||
[tool.poetry.plugins."docling"]
|
[project.optional-dependencies]
|
||||||
"docling_defaults" = "docling.models.plugins.defaults"
|
tesserocr = ['tesserocr (>=2.7.1,<3.0.0)']
|
||||||
|
ocrmac = ['ocrmac (>=1.0.0,<2.0.0) ; sys_platform == "darwin"']
|
||||||
|
vlm = [
|
||||||
|
'transformers (>=4.46.0,<5.0.0)',
|
||||||
|
'accelerate (>=1.2.1,<2.0.0)',
|
||||||
|
'mlx-vlm >=0.1.22 ; python_version >= "3.10" and sys_platform == "darwin" and platform_machine == "arm64"',
|
||||||
|
]
|
||||||
|
rapidocr = [
|
||||||
|
'rapidocr-onnxruntime (>=1.4.0,<2.0.0) ; python_version < "3.13"',
|
||||||
|
'onnxruntime (>=1.7.0,<2.0.0)',
|
||||||
|
# 'onnxruntime (>=1.7.0,<2.0.0) ; python_version >= "3.10"',
|
||||||
|
# 'onnxruntime (>=1.7.0,<1.20.0) ; python_version < "3.10"',
|
||||||
|
]
|
||||||
|
|
||||||
[build-system]
|
[dependency-groups]
|
||||||
requires = ["poetry-core"]
|
dev = [
|
||||||
build-backend = "poetry.core.masonry.api"
|
"pre-commit~=3.7",
|
||||||
|
"mypy~=1.10",
|
||||||
|
"types-setuptools~=70.3",
|
||||||
|
"pandas-stubs~=2.1",
|
||||||
|
"types-openpyxl~=3.1",
|
||||||
|
"types-requests~=2.31",
|
||||||
|
"boto3-stubs~=1.37",
|
||||||
|
"types-urllib3~=1.26",
|
||||||
|
"types-tqdm~=4.67",
|
||||||
|
"coverage~=7.6",
|
||||||
|
"pytest~=8.3",
|
||||||
|
"pytest-cov>=6.1.1",
|
||||||
|
"pytest-dependency~=0.6",
|
||||||
|
"pytest-xdist~=3.3",
|
||||||
|
"ipykernel~=6.29",
|
||||||
|
"ipywidgets~=8.1",
|
||||||
|
"nbqa~=1.9",
|
||||||
|
"python-semantic-release~=7.32",
|
||||||
|
]
|
||||||
|
docs = [
|
||||||
|
"mkdocs-material~=9.5",
|
||||||
|
"mkdocs-jupyter~=0.25",
|
||||||
|
"mkdocs-click~=0.8",
|
||||||
|
"mkdocstrings[python]~=0.27",
|
||||||
|
"griffe-pydantic~=1.1",
|
||||||
|
]
|
||||||
|
examples = [
|
||||||
|
"datasets~=2.21",
|
||||||
|
"python-dotenv~=1.0",
|
||||||
|
"langchain-huggingface>=0.0.3",
|
||||||
|
"langchain-milvus~=0.1",
|
||||||
|
"langchain-text-splitters~=0.2",
|
||||||
|
]
|
||||||
|
constraints = [
|
||||||
|
'onnxruntime (>=1.7.0,<2.0.0) ; python_version >= "3.10"',
|
||||||
|
'onnxruntime (>=1.7.0,<1.20.0) ; python_version < "3.10"',
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
[tool.uv]
|
||||||
|
package = true
|
||||||
|
default-groups = "all"
|
||||||
|
|
||||||
|
[tool.setuptools.packages.find]
|
||||||
|
include = ["docling*"]
|
||||||
|
|
||||||
[tool.ruff]
|
[tool.ruff]
|
||||||
target-version = "py39"
|
target-version = "py39"
|
||||||
|
BIN
tests/data/docx/textbox.docx
vendored
BIN
tests/data/docx/textbox.docx
vendored
Binary file not shown.
102
tests/data/groundtruth/docling_v2/textbox.docx.itxt
vendored
102
tests/data/groundtruth/docling_v2/textbox.docx.itxt
vendored
@ -26,69 +26,71 @@ item-0 at level 0: unspecified: group _root_
|
|||||||
item-21 at level 1: paragraph:
|
item-21 at level 1: paragraph:
|
||||||
item-22 at level 1: paragraph:
|
item-22 at level 1: paragraph:
|
||||||
item-23 at level 1: section: group textbox
|
item-23 at level 1: section: group textbox
|
||||||
item-24 at level 2: paragraph: A report must be submitted wi ... saster Prevention Information Network.
|
item-24 at level 2: list: group list
|
||||||
item-25 at level 2: paragraph: A report must also be submitt ... d Infectious Disease Reporting System.
|
item-25 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network.
|
||||||
item-26 at level 2: paragraph:
|
item-26 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System.
|
||||||
item-27 at level 2: paragraph:
|
item-27 at level 2: paragraph:
|
||||||
item-28 at level 1: paragraph:
|
item-28 at level 2: paragraph:
|
||||||
item-29 at level 1: paragraph:
|
item-29 at level 1: list: group list
|
||||||
item-30 at level 1: paragraph:
|
item-30 at level 2: list_item:
|
||||||
item-31 at level 1: paragraph:
|
item-31 at level 1: paragraph:
|
||||||
item-32 at level 1: paragraph:
|
item-32 at level 1: paragraph:
|
||||||
item-33 at level 1: paragraph:
|
item-33 at level 1: paragraph:
|
||||||
item-34 at level 1: section: group textbox
|
item-34 at level 1: paragraph:
|
||||||
item-35 at level 2: paragraph: Health Bureau:
|
item-35 at level 1: paragraph:
|
||||||
item-36 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
|
item-36 at level 1: section: group textbox
|
||||||
item-37 at level 2: list: group list
|
item-37 at level 2: paragraph: Health Bureau:
|
||||||
item-38 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
|
item-38 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
|
||||||
item-39 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act.
|
item-39 at level 2: list: group list
|
||||||
item-40 at level 2: paragraph:
|
item-40 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
|
||||||
item-41 at level 2: paragraph:
|
item-41 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act.
|
||||||
item-42 at level 1: list: group list
|
item-42 at level 2: paragraph:
|
||||||
item-43 at level 2: list_item:
|
item-43 at level 2: paragraph:
|
||||||
item-44 at level 1: paragraph:
|
item-44 at level 1: list: group list
|
||||||
item-45 at level 1: section: group textbox
|
item-45 at level 2: list_item:
|
||||||
item-46 at level 2: paragraph: Department of Education:
|
item-46 at level 1: paragraph:
|
||||||
|
item-47 at level 1: section: group textbox
|
||||||
|
item-48 at level 2: paragraph: Department of Education:
|
||||||
Collabo ... vention measures at all school levels.
|
Collabo ... vention measures at all school levels.
|
||||||
item-47 at level 1: paragraph:
|
|
||||||
item-48 at level 1: paragraph:
|
|
||||||
item-49 at level 1: paragraph:
|
item-49 at level 1: paragraph:
|
||||||
item-50 at level 1: paragraph:
|
item-50 at level 1: paragraph:
|
||||||
item-51 at level 1: paragraph:
|
item-51 at level 1: paragraph:
|
||||||
item-52 at level 1: paragraph:
|
item-52 at level 1: paragraph:
|
||||||
item-53 at level 1: paragraph:
|
item-53 at level 1: paragraph:
|
||||||
item-54 at level 1: section: group textbox
|
item-54 at level 1: paragraph:
|
||||||
item-55 at level 2: inline: group group
|
item-55 at level 1: paragraph:
|
||||||
item-56 at level 3: paragraph: The Health Bureau will handle
|
item-56 at level 1: section: group textbox
|
||||||
item-57 at level 3: paragraph: reporting and specimen collection
|
item-57 at level 2: inline: group group
|
||||||
item-58 at level 3: paragraph: .
|
item-58 at level 3: paragraph: The Health Bureau will handle
|
||||||
item-59 at level 2: paragraph:
|
item-59 at level 3: paragraph: reporting and specimen collection
|
||||||
item-60 at level 2: paragraph:
|
item-60 at level 3: paragraph: .
|
||||||
item-61 at level 1: paragraph:
|
item-61 at level 2: paragraph:
|
||||||
item-62 at level 1: paragraph:
|
item-62 at level 2: paragraph:
|
||||||
item-63 at level 1: paragraph:
|
item-63 at level 1: paragraph:
|
||||||
item-64 at level 1: section: group textbox
|
item-64 at level 1: paragraph:
|
||||||
item-65 at level 2: paragraph: Whether the epidemic has eased.
|
item-65 at level 1: paragraph:
|
||||||
item-66 at level 2: paragraph:
|
item-66 at level 1: section: group textbox
|
||||||
item-67 at level 2: paragraph:
|
item-67 at level 2: paragraph: Whether the epidemic has eased.
|
||||||
item-68 at level 1: paragraph:
|
item-68 at level 2: paragraph:
|
||||||
item-69 at level 1: section: group textbox
|
item-69 at level 2: paragraph:
|
||||||
item-70 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
|
item-70 at level 1: paragraph:
|
||||||
item-71 at level 2: paragraph: No
|
item-71 at level 1: section: group textbox
|
||||||
item-72 at level 1: paragraph:
|
item-72 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
|
||||||
item-73 at level 1: paragraph:
|
item-73 at level 2: paragraph: No
|
||||||
item-74 at level 1: section: group textbox
|
item-74 at level 1: paragraph:
|
||||||
item-75 at level 1: paragraph:
|
item-75 at level 1: paragraph:
|
||||||
item-76 at level 1: section: group textbox
|
item-76 at level 1: section: group textbox
|
||||||
item-77 at level 1: paragraph:
|
item-77 at level 1: paragraph:
|
||||||
item-78 at level 1: paragraph:
|
item-78 at level 1: section: group textbox
|
||||||
item-79 at level 1: section: group textbox
|
item-79 at level 1: paragraph:
|
||||||
item-80 at level 2: paragraph: Case closed.
|
item-80 at level 1: paragraph:
|
||||||
item-81 at level 2: paragraph:
|
item-81 at level 1: section: group textbox
|
||||||
item-82 at level 2: paragraph:
|
item-82 at level 2: paragraph: Case closed.
|
||||||
item-83 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
|
item-83 at level 2: paragraph:
|
||||||
item-84 at level 1: paragraph:
|
item-84 at level 2: paragraph:
|
||||||
item-85 at level 1: section: group textbox
|
item-85 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
|
||||||
item-86 at level 1: paragraph:
|
item-86 at level 1: paragraph:
|
||||||
item-87 at level 1: paragraph:
|
item-87 at level 1: section: group textbox
|
||||||
item-88 at level 1: paragraph:
|
item-88 at level 1: paragraph:
|
||||||
|
item-89 at level 1: paragraph:
|
||||||
|
item-90 at level 1: paragraph:
|
200
tests/data/groundtruth/docling_v2/textbox.docx.json
vendored
200
tests/data/groundtruth/docling_v2/textbox.docx.json
vendored
@ -4,7 +4,7 @@
|
|||||||
"name": "textbox",
|
"name": "textbox",
|
||||||
"origin": {
|
"origin": {
|
||||||
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
"mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||||
"binary_hash": 830302052279341882,
|
"binary_hash": 11723995438039370060,
|
||||||
"filename": "textbox.docx"
|
"filename": "textbox.docx"
|
||||||
},
|
},
|
||||||
"furniture": {
|
"furniture": {
|
||||||
@ -66,7 +66,7 @@
|
|||||||
"$ref": "#/groups/4"
|
"$ref": "#/groups/4"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/22"
|
"$ref": "#/groups/6"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/23"
|
"$ref": "#/texts/23"
|
||||||
@ -84,16 +84,16 @@
|
|||||||
"$ref": "#/texts/27"
|
"$ref": "#/texts/27"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/5"
|
"$ref": "#/groups/7"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/7"
|
"$ref": "#/groups/9"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/35"
|
"$ref": "#/texts/35"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/8"
|
"$ref": "#/groups/10"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/37"
|
"$ref": "#/texts/37"
|
||||||
@ -117,7 +117,7 @@
|
|||||||
"$ref": "#/texts/43"
|
"$ref": "#/texts/43"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/9"
|
"$ref": "#/groups/11"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/49"
|
"$ref": "#/texts/49"
|
||||||
@ -129,13 +129,13 @@
|
|||||||
"$ref": "#/texts/51"
|
"$ref": "#/texts/51"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/11"
|
"$ref": "#/groups/13"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/55"
|
"$ref": "#/texts/55"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/12"
|
"$ref": "#/groups/14"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/58"
|
"$ref": "#/texts/58"
|
||||||
@ -144,13 +144,13 @@
|
|||||||
"$ref": "#/texts/59"
|
"$ref": "#/texts/59"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/13"
|
"$ref": "#/groups/15"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/60"
|
"$ref": "#/texts/60"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/14"
|
"$ref": "#/groups/16"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/61"
|
"$ref": "#/texts/61"
|
||||||
@ -159,13 +159,13 @@
|
|||||||
"$ref": "#/texts/62"
|
"$ref": "#/texts/62"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/15"
|
"$ref": "#/groups/17"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/67"
|
"$ref": "#/texts/67"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/16"
|
"$ref": "#/groups/18"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/68"
|
"$ref": "#/texts/68"
|
||||||
@ -254,10 +254,7 @@
|
|||||||
},
|
},
|
||||||
"children": [
|
"children": [
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/18"
|
"$ref": "#/groups/5"
|
||||||
},
|
|
||||||
{
|
|
||||||
"$ref": "#/texts/19"
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/20"
|
"$ref": "#/texts/20"
|
||||||
@ -272,6 +269,37 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/groups/5",
|
"self_ref": "#/groups/5",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/groups/4"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/18"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/19"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"name": "list",
|
||||||
|
"label": "list"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/6",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [
|
||||||
|
{
|
||||||
|
"$ref": "#/texts/22"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"content_layer": "body",
|
||||||
|
"name": "list",
|
||||||
|
"label": "list"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/7",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/body"
|
"$ref": "#/body"
|
||||||
},
|
},
|
||||||
@ -283,7 +311,7 @@
|
|||||||
"$ref": "#/texts/29"
|
"$ref": "#/texts/29"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/6"
|
"$ref": "#/groups/8"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/32"
|
"$ref": "#/texts/32"
|
||||||
@ -297,9 +325,9 @@
|
|||||||
"label": "section"
|
"label": "section"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/groups/6",
|
"self_ref": "#/groups/8",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/5"
|
"$ref": "#/groups/7"
|
||||||
},
|
},
|
||||||
"children": [
|
"children": [
|
||||||
{
|
{
|
||||||
@ -314,7 +342,7 @@
|
|||||||
"label": "list"
|
"label": "list"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/groups/7",
|
"self_ref": "#/groups/9",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/body"
|
"$ref": "#/body"
|
||||||
},
|
},
|
||||||
@ -328,7 +356,7 @@
|
|||||||
"label": "list"
|
"label": "list"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/groups/8",
|
"self_ref": "#/groups/10",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/body"
|
"$ref": "#/body"
|
||||||
},
|
},
|
||||||
@ -342,13 +370,13 @@
|
|||||||
"label": "section"
|
"label": "section"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/groups/9",
|
"self_ref": "#/groups/11",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/body"
|
"$ref": "#/body"
|
||||||
},
|
},
|
||||||
"children": [
|
"children": [
|
||||||
{
|
{
|
||||||
"$ref": "#/groups/10"
|
"$ref": "#/groups/12"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/47"
|
"$ref": "#/texts/47"
|
||||||
@ -362,9 +390,9 @@
|
|||||||
"label": "section"
|
"label": "section"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/groups/10",
|
"self_ref": "#/groups/12",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/9"
|
"$ref": "#/groups/11"
|
||||||
},
|
},
|
||||||
"children": [
|
"children": [
|
||||||
{
|
{
|
||||||
@ -382,7 +410,7 @@
|
|||||||
"label": "inline"
|
"label": "inline"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/groups/11",
|
"self_ref": "#/groups/13",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/body"
|
"$ref": "#/body"
|
||||||
},
|
},
|
||||||
@ -402,7 +430,7 @@
|
|||||||
"label": "section"
|
"label": "section"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/groups/12",
|
"self_ref": "#/groups/14",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/body"
|
"$ref": "#/body"
|
||||||
},
|
},
|
||||||
@ -418,31 +446,31 @@
|
|||||||
"name": "textbox",
|
"name": "textbox",
|
||||||
"label": "section"
|
"label": "section"
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"self_ref": "#/groups/13",
|
|
||||||
"parent": {
|
|
||||||
"$ref": "#/body"
|
|
||||||
},
|
|
||||||
"children": [],
|
|
||||||
"content_layer": "body",
|
|
||||||
"name": "textbox",
|
|
||||||
"label": "section"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"self_ref": "#/groups/14",
|
|
||||||
"parent": {
|
|
||||||
"$ref": "#/body"
|
|
||||||
},
|
|
||||||
"children": [],
|
|
||||||
"content_layer": "body",
|
|
||||||
"name": "textbox",
|
|
||||||
"label": "section"
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"self_ref": "#/groups/15",
|
"self_ref": "#/groups/15",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/body"
|
"$ref": "#/body"
|
||||||
},
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"name": "textbox",
|
||||||
|
"label": "section"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/16",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
|
"children": [],
|
||||||
|
"content_layer": "body",
|
||||||
|
"name": "textbox",
|
||||||
|
"label": "section"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"self_ref": "#/groups/17",
|
||||||
|
"parent": {
|
||||||
|
"$ref": "#/body"
|
||||||
|
},
|
||||||
"children": [
|
"children": [
|
||||||
{
|
{
|
||||||
"$ref": "#/texts/63"
|
"$ref": "#/texts/63"
|
||||||
@ -462,7 +490,7 @@
|
|||||||
"label": "section"
|
"label": "section"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/groups/16",
|
"self_ref": "#/groups/18",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/body"
|
"$ref": "#/body"
|
||||||
},
|
},
|
||||||
@ -732,38 +760,42 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/18",
|
"self_ref": "#/texts/18",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/4"
|
"$ref": "#/groups/5"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "paragraph",
|
"label": "list_item",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": " A report must be submitted within 24 hours via the Ministry of Education’s Campus Safety and Disaster Prevention Information Network.",
|
"orig": "A report must be submitted within 24 hours via the Ministry of Education’s Campus Safety and Disaster Prevention Information Network.",
|
||||||
"text": " A report must be submitted within 24 hours via the Ministry of Education’s Campus Safety and Disaster Prevention Information Network.",
|
"text": "A report must be submitted within 24 hours via the Ministry of Education’s Campus Safety and Disaster Prevention Information Network.",
|
||||||
"formatting": {
|
"formatting": {
|
||||||
"bold": false,
|
"bold": false,
|
||||||
"italic": false,
|
"italic": false,
|
||||||
"underline": false,
|
"underline": false,
|
||||||
"strikethrough": false
|
"strikethrough": false
|
||||||
}
|
},
|
||||||
|
"enumerated": false,
|
||||||
|
"marker": "-"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/19",
|
"self_ref": "#/texts/19",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/4"
|
"$ref": "#/groups/5"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "paragraph",
|
"label": "list_item",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": " A report must also be submitted within 48 hours through Chiayi County’s School Suspected Infectious Disease Reporting System.",
|
"orig": "A report must also be submitted within 48 hours through Chiayi County’s School Suspected Infectious Disease Reporting System.",
|
||||||
"text": " A report must also be submitted within 48 hours through Chiayi County’s School Suspected Infectious Disease Reporting System.",
|
"text": "A report must also be submitted within 48 hours through Chiayi County’s School Suspected Infectious Disease Reporting System.",
|
||||||
"formatting": {
|
"formatting": {
|
||||||
"bold": false,
|
"bold": false,
|
||||||
"italic": false,
|
"italic": false,
|
||||||
"underline": false,
|
"underline": false,
|
||||||
"strikethrough": false
|
"strikethrough": false
|
||||||
}
|
},
|
||||||
|
"enumerated": false,
|
||||||
|
"marker": "-"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/20",
|
"self_ref": "#/texts/20",
|
||||||
@ -792,14 +824,16 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/22",
|
"self_ref": "#/texts/22",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/body"
|
"$ref": "#/groups/6"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
"label": "paragraph",
|
"label": "list_item",
|
||||||
"prov": [],
|
"prov": [],
|
||||||
"orig": "",
|
"orig": "",
|
||||||
"text": ""
|
"text": "",
|
||||||
|
"enumerated": false,
|
||||||
|
"marker": "-"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"self_ref": "#/texts/23",
|
"self_ref": "#/texts/23",
|
||||||
@ -864,7 +898,7 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/28",
|
"self_ref": "#/texts/28",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/5"
|
"$ref": "#/groups/7"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -882,7 +916,7 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/29",
|
"self_ref": "#/texts/29",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/5"
|
"$ref": "#/groups/7"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -900,7 +934,7 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/30",
|
"self_ref": "#/texts/30",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/6"
|
"$ref": "#/groups/8"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -920,7 +954,7 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/31",
|
"self_ref": "#/texts/31",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/6"
|
"$ref": "#/groups/8"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -940,7 +974,7 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/32",
|
"self_ref": "#/texts/32",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/5"
|
"$ref": "#/groups/7"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -952,7 +986,7 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/33",
|
"self_ref": "#/texts/33",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/5"
|
"$ref": "#/groups/7"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -964,7 +998,7 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/34",
|
"self_ref": "#/texts/34",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/7"
|
"$ref": "#/groups/9"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -990,7 +1024,7 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/36",
|
"self_ref": "#/texts/36",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/8"
|
"$ref": "#/groups/10"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -1092,7 +1126,7 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/44",
|
"self_ref": "#/texts/44",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/10"
|
"$ref": "#/groups/12"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -1110,7 +1144,7 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/45",
|
"self_ref": "#/texts/45",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/10"
|
"$ref": "#/groups/12"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -1128,7 +1162,7 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/46",
|
"self_ref": "#/texts/46",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/10"
|
"$ref": "#/groups/12"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -1146,7 +1180,7 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/47",
|
"self_ref": "#/texts/47",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/9"
|
"$ref": "#/groups/11"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -1158,7 +1192,7 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/48",
|
"self_ref": "#/texts/48",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/9"
|
"$ref": "#/groups/11"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -1206,7 +1240,7 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/52",
|
"self_ref": "#/texts/52",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/11"
|
"$ref": "#/groups/13"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -1224,7 +1258,7 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/53",
|
"self_ref": "#/texts/53",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/11"
|
"$ref": "#/groups/13"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -1236,7 +1270,7 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/54",
|
"self_ref": "#/texts/54",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/11"
|
"$ref": "#/groups/13"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -1260,7 +1294,7 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/56",
|
"self_ref": "#/texts/56",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/12"
|
"$ref": "#/groups/14"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -1278,7 +1312,7 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/57",
|
"self_ref": "#/texts/57",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/12"
|
"$ref": "#/groups/14"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -1356,7 +1390,7 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/63",
|
"self_ref": "#/texts/63",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/15"
|
"$ref": "#/groups/17"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -1374,7 +1408,7 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/64",
|
"self_ref": "#/texts/64",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/15"
|
"$ref": "#/groups/17"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -1386,7 +1420,7 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/65",
|
"self_ref": "#/texts/65",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/15"
|
"$ref": "#/groups/17"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
@ -1398,7 +1432,7 @@
|
|||||||
{
|
{
|
||||||
"self_ref": "#/texts/66",
|
"self_ref": "#/texts/66",
|
||||||
"parent": {
|
"parent": {
|
||||||
"$ref": "#/groups/15"
|
"$ref": "#/groups/17"
|
||||||
},
|
},
|
||||||
"children": [],
|
"children": [],
|
||||||
"content_layer": "body",
|
"content_layer": "body",
|
||||||
|
@ -19,9 +19,8 @@ show the same suggested reportable symptoms
|
|||||||
|
|
||||||
Yes
|
Yes
|
||||||
|
|
||||||
A report must be submitted within 24 hours via the Ministry of Education’s Campus Safety and Disaster Prevention Information Network.
|
- A report must be submitted within 24 hours via the Ministry of Education’s Campus Safety and Disaster Prevention Information Network.
|
||||||
|
- A report must also be submitted within 48 hours through Chiayi County’s School Suspected Infectious Disease Reporting System.
|
||||||
A report must also be submitted within 48 hours through Chiayi County’s School Suspected Infectious Disease Reporting System.
|
|
||||||
|
|
||||||
**Health Bureau:**
|
**Health Bureau:**
|
||||||
|
|
||||||
|
@ -1,5 +1,7 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import (
|
from docling.datamodel.document import (
|
||||||
@ -17,6 +19,7 @@ from .verify_utils import verify_document, verify_export
|
|||||||
GENERATE = GEN_TEST_DATA
|
GENERATE = GEN_TEST_DATA
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail(strict=False)
|
||||||
def test_textbox_extraction():
|
def test_textbox_extraction():
|
||||||
in_path = Path("tests/data/docx/textbox.docx")
|
in_path = Path("tests/data/docx/textbox.docx")
|
||||||
in_doc = InputDocument(
|
in_doc = InputDocument(
|
||||||
@ -78,8 +81,7 @@ def get_converter():
|
|||||||
return converter
|
return converter
|
||||||
|
|
||||||
|
|
||||||
def test_e2e_docx_conversions():
|
def _test_e2e_docx_conversions_impl(docx_paths: list[Path]):
|
||||||
docx_paths = get_docx_paths()
|
|
||||||
converter = get_converter()
|
converter = get_converter()
|
||||||
|
|
||||||
for docx_path in docx_paths:
|
for docx_path in docx_paths:
|
||||||
@ -118,6 +120,20 @@ def test_e2e_docx_conversions():
|
|||||||
), "export to html"
|
), "export to html"
|
||||||
|
|
||||||
|
|
||||||
|
flaky_path = Path("tests/data/docx/textbox.docx")
|
||||||
|
|
||||||
|
|
||||||
|
def test_e2e_docx_conversions():
|
||||||
|
_test_e2e_docx_conversions_impl(
|
||||||
|
docx_paths=[path for path in get_docx_paths() if path != flaky_path]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.xfail(strict=False)
|
||||||
|
def test_textbox_conversion():
|
||||||
|
_test_e2e_docx_conversions_impl(docx_paths=[flaky_path])
|
||||||
|
|
||||||
|
|
||||||
def test_text_after_image_anchors():
|
def test_text_after_image_anchors():
|
||||||
"""
|
"""
|
||||||
Test to analyse whether text gets parsed after image anchors.
|
Test to analyse whether text gets parsed after image anchors.
|
||||||
|
@ -1,9 +1,10 @@
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorDevice
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import AcceleratorDevice, PdfPipelineOptions
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||||
|
|
||||||
from .test_data_gen_flag import GEN_TEST_DATA
|
from .test_data_gen_flag import GEN_TEST_DATA
|
||||||
|
@ -3,10 +3,10 @@ from pathlib import Path
|
|||||||
from typing import List, Tuple
|
from typing import List, Tuple
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorDevice
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
|
||||||
EasyOcrOptions,
|
EasyOcrOptions,
|
||||||
OcrMacOptions,
|
OcrMacOptions,
|
||||||
OcrOptions,
|
OcrOptions,
|
||||||
|
@ -132,6 +132,13 @@ def test_guess_format(tmp_path):
|
|||||||
doc_path = Path("./tests/data/html/wiki_duck.html")
|
doc_path = Path("./tests/data/html/wiki_duck.html")
|
||||||
assert dci._guess_format(doc_path) == InputFormat.HTML
|
assert dci._guess_format(doc_path) == InputFormat.HTML
|
||||||
|
|
||||||
|
html_str = ( # HTML starting with a script
|
||||||
|
"<script>\nconsole.log('foo');\n</script>"
|
||||||
|
'<!doctype html>\n<html lang="en-us class="no-js"></html>'
|
||||||
|
)
|
||||||
|
stream = DocumentStream(name="lorem_ipsum", stream=BytesIO(f"{html_str}".encode()))
|
||||||
|
assert dci._guess_format(stream) == InputFormat.HTML
|
||||||
|
|
||||||
# Valid MD
|
# Valid MD
|
||||||
buf = BytesIO(Path("./tests/data/md/wiki.md").open("rb").read())
|
buf = BytesIO(Path("./tests/data/md/wiki.md").open("rb").read())
|
||||||
stream = DocumentStream(name="wiki.md", stream=buf)
|
stream = DocumentStream(name="wiki.md", stream=buf)
|
||||||
|
@ -7,11 +7,10 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
|||||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
|
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
|
||||||
from docling.datamodel.base_models import ConversionStatus, InputFormat, QualityGrade
|
from docling.datamodel.base_models import ConversionStatus, InputFormat, QualityGrade
|
||||||
from docling.datamodel.document import ConversionResult
|
from docling.datamodel.document import ConversionResult
|
||||||
from docling.datamodel.pipeline_options import (
|
from docling.datamodel.pipeline_options import (
|
||||||
AcceleratorDevice,
|
|
||||||
AcceleratorOptions,
|
|
||||||
PdfPipelineOptions,
|
PdfPipelineOptions,
|
||||||
TableFormerMode,
|
TableFormerMode,
|
||||||
)
|
)
|
||||||
|
@ -323,33 +323,33 @@ def verify_conversion_result_v1(
|
|||||||
|
|
||||||
if generate: # only used when re-generating truth
|
if generate: # only used when re-generating truth
|
||||||
pages_path.parent.mkdir(parents=True, exist_ok=True)
|
pages_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with open(pages_path, "w") as fw:
|
with open(pages_path, mode="w", encoding="utf-8") as fw:
|
||||||
fw.write(
|
fw.write(
|
||||||
json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
|
json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
|
||||||
)
|
)
|
||||||
|
|
||||||
json_path.parent.mkdir(parents=True, exist_ok=True)
|
json_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with open(json_path, "w") as fw:
|
with open(json_path, mode="w", encoding="utf-8") as fw:
|
||||||
fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))
|
fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))
|
||||||
|
|
||||||
md_path.parent.mkdir(parents=True, exist_ok=True)
|
md_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with open(md_path, "w") as fw:
|
with open(md_path, mode="w", encoding="utf-8") as fw:
|
||||||
fw.write(doc_pred_md)
|
fw.write(doc_pred_md)
|
||||||
|
|
||||||
dt_path.parent.mkdir(parents=True, exist_ok=True)
|
dt_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with open(dt_path, "w") as fw:
|
with open(dt_path, mode="w", encoding="utf-8") as fw:
|
||||||
fw.write(doc_pred_dt)
|
fw.write(doc_pred_dt)
|
||||||
else: # default branch in test
|
else: # default branch in test
|
||||||
with open(pages_path) as fr:
|
with open(pages_path, encoding="utf-8") as fr:
|
||||||
doc_true_pages = PageList.validate_json(fr.read())
|
doc_true_pages = PageList.validate_json(fr.read())
|
||||||
|
|
||||||
with open(json_path) as fr:
|
with open(json_path, encoding="utf-8") as fr:
|
||||||
doc_true: DsDocument = DsDocument.model_validate_json(fr.read())
|
doc_true: DsDocument = DsDocument.model_validate_json(fr.read())
|
||||||
|
|
||||||
with open(md_path) as fr:
|
with open(md_path, encoding="utf-8") as fr:
|
||||||
doc_true_md = fr.read()
|
doc_true_md = fr.read()
|
||||||
|
|
||||||
with open(dt_path) as fr:
|
with open(dt_path, encoding="utf-8") as fr:
|
||||||
doc_true_dt = fr.read()
|
doc_true_dt = fr.read()
|
||||||
|
|
||||||
if not fuzzy:
|
if not fuzzy:
|
||||||
@ -408,33 +408,33 @@ def verify_conversion_result_v2(
|
|||||||
|
|
||||||
if generate: # only used when re-generating truth
|
if generate: # only used when re-generating truth
|
||||||
pages_path.parent.mkdir(parents=True, exist_ok=True)
|
pages_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with open(pages_path, "w") as fw:
|
with open(pages_path, mode="w", encoding="utf-8") as fw:
|
||||||
fw.write(
|
fw.write(
|
||||||
json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
|
json.dumps(doc_pred_pages, default=pydantic_encoder, indent=indent)
|
||||||
)
|
)
|
||||||
|
|
||||||
json_path.parent.mkdir(parents=True, exist_ok=True)
|
json_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with open(json_path, "w") as fw:
|
with open(json_path, mode="w", encoding="utf-8") as fw:
|
||||||
fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))
|
fw.write(json.dumps(doc_pred, default=pydantic_encoder, indent=indent))
|
||||||
|
|
||||||
md_path.parent.mkdir(parents=True, exist_ok=True)
|
md_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with open(md_path, "w") as fw:
|
with open(md_path, mode="w", encoding="utf-8") as fw:
|
||||||
fw.write(doc_pred_md)
|
fw.write(doc_pred_md)
|
||||||
|
|
||||||
dt_path.parent.mkdir(parents=True, exist_ok=True)
|
dt_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
with open(dt_path, "w") as fw:
|
with open(dt_path, mode="w", encoding="utf-8") as fw:
|
||||||
fw.write(doc_pred_dt)
|
fw.write(doc_pred_dt)
|
||||||
else: # default branch in test
|
else: # default branch in test
|
||||||
with open(pages_path) as fr:
|
with open(pages_path, encoding="utf-8") as fr:
|
||||||
doc_true_pages = PageList.validate_json(fr.read())
|
doc_true_pages = PageList.validate_json(fr.read())
|
||||||
|
|
||||||
with open(json_path) as fr:
|
with open(json_path, encoding="utf-8") as fr:
|
||||||
doc_true: DoclingDocument = DoclingDocument.model_validate_json(fr.read())
|
doc_true: DoclingDocument = DoclingDocument.model_validate_json(fr.read())
|
||||||
|
|
||||||
with open(md_path) as fr:
|
with open(md_path, encoding="utf-8") as fr:
|
||||||
doc_true_md = fr.read()
|
doc_true_md = fr.read()
|
||||||
|
|
||||||
with open(dt_path) as fr:
|
with open(dt_path, encoding="utf-8") as fr:
|
||||||
doc_true_dt = fr.read()
|
doc_true_dt = fr.read()
|
||||||
|
|
||||||
if not fuzzy:
|
if not fuzzy:
|
||||||
@ -461,12 +461,12 @@ def verify_conversion_result_v2(
|
|||||||
|
|
||||||
def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = False):
|
def verify_document(pred_doc: DoclingDocument, gtfile: str, generate: bool = False):
|
||||||
if not os.path.exists(gtfile) or generate:
|
if not os.path.exists(gtfile) or generate:
|
||||||
with open(gtfile, "w") as fw:
|
with open(gtfile, mode="w", encoding="utf-8") as fw:
|
||||||
json.dump(pred_doc.export_to_dict(), fw, ensure_ascii=False, indent=2)
|
json.dump(pred_doc.export_to_dict(), fw, ensure_ascii=False, indent=2)
|
||||||
|
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
with open(gtfile) as fr:
|
with open(gtfile, encoding="utf-8") as fr:
|
||||||
true_doc = DoclingDocument.model_validate_json(fr.read())
|
true_doc = DoclingDocument.model_validate_json(fr.read())
|
||||||
|
|
||||||
return verify_docitems(pred_doc, true_doc, fuzzy=False)
|
return verify_docitems(pred_doc, true_doc, fuzzy=False)
|
||||||
@ -476,11 +476,11 @@ def verify_export(pred_text: str, gtfile: str, generate: bool = False) -> bool:
|
|||||||
file = Path(gtfile)
|
file = Path(gtfile)
|
||||||
|
|
||||||
if not file.exists() or generate:
|
if not file.exists() or generate:
|
||||||
with file.open("w") as fw:
|
with file.open(mode="w", encoding="utf-8") as fw:
|
||||||
fw.write(pred_text)
|
fw.write(pred_text)
|
||||||
return True
|
return True
|
||||||
|
|
||||||
with file.open("r") as fr:
|
with file.open(encoding="utf-8") as fr:
|
||||||
true_text = fr.read()
|
true_text = fr.read()
|
||||||
|
|
||||||
return pred_text == true_text
|
return pred_text == true_text
|
||||||
|
Loading…
Reference in New Issue
Block a user