add docling-models utility

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2025-02-04 15:35:19 +01:00
parent 18aad34d67
commit dc9e759354
12 changed files with 315 additions and 34 deletions

View File

@ -28,7 +28,7 @@ jobs:
run: |
for file in docs/examples/*.py; do
# Skip batch_convert.py
if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment).py ]]; then
if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|offline_convert).py ]]; then
echo "Skipping $file"
continue
fi

View File

@ -0,0 +1,160 @@
import logging
import warnings
from pathlib import Path
from typing import Annotated
import typer
from docling.datamodel.settings import settings
from docling.models.code_formula_model import CodeFormulaModel
from docling.models.document_picture_classifier import DocumentPictureClassifier
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.rapid_ocr_model import RapidOcrModel
from docling.models.table_structure_model import TableStructureModel
# Suppress noisy third-party warnings: UserWarnings from modules matching
# "pydantic" or "torch", and FutureWarnings from "easyocr".
warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
# Module-level logger named after this module.
_log = logging.getLogger(__name__)
from rich.console import Console
# Rich console used for the regular (non-error) output of the commands below.
console = Console()
# Rich console bound to stderr.
# NOTE(review): not referenced anywhere in the visible part of this module.
err_console = Console(stderr=True)
# Typer application that the commands below register themselves on; shell
# completion installation and Typer's pretty exception rendering are disabled.
app = typer.Typer(
name="Docling model helper",
add_completion=False,
pretty_exceptions_enable=False,
)
@app.command("download")
def download(
    output_dir: Annotated[
        Path,
        typer.Option(
            ...,
            "-o",
            "--output-dir",
            help="The directory where all the models are downloaded.",
        ),
    ] = settings.cache_dir
    / "models",
    force: Annotated[
        bool, typer.Option(..., help="If true, the download will be forced")
    ] = False,
    quite: Annotated[
        bool,
        typer.Option(
            ...,
            "-q",
            help="No extra output is generated, the CLI print only the directory with the cached models.",
        ),
    ] = False,
    layout: Annotated[
        bool,
        typer.Option(..., help="If true, the layout model weights are downloaded."),
    ] = True,
    tableformer: Annotated[
        bool,
        typer.Option(
            ..., help="If true, the tableformer model weights are downloaded."
        ),
    ] = True,
    code_formula: Annotated[
        bool,
        typer.Option(
            ..., help="If true, the code formula model weights are downloaded."
        ),
    ] = True,
    picture_classifier: Annotated[
        bool,
        typer.Option(
            ..., help="If true, the picture classifier model weights are downloaded."
        ),
    ] = True,
    easyocr: Annotated[
        bool,
        typer.Option(..., help="If true, the easyocr model weights are downloaded."),
    ] = True,
    rapidocr: Annotated[
        bool,
        typer.Option(..., help="If true, the rapidocr model weights are downloaded."),
    ] = True,
):
    """Download the model artifacts needed to run Docling offline."""
    # The docstring above is kept to one line on purpose: Typer shows it as the
    # command's help text.
    #
    # Each selected model family is stored in its own sub-folder of
    # `output_dir`, named after the model class' `_model_repo_folder`.
    #
    # NOTE(review): `quite` is presumably a typo for "quiet"; the parameter name
    # is kept unchanged so existing keyword callers keep working.
    # NOTE(review): `rapidocr` is accepted but no RapidOCR download is performed
    # in this function -- confirm whether a download helper exists for it.

    # Make sure the folder exists
    output_dir.mkdir(exist_ok=True, parents=True)

    # Progress bars are only shown when the user did not ask for quiet output.
    show_progress = not quite

    if layout:
        if not quite:
            typer.secho("Downloading layout model...", fg="blue")
        LayoutModel.download_models_hf(
            local_dir=output_dir / LayoutModel._model_repo_folder,
            force=force,
            progress=show_progress,
        )

    if tableformer:
        if not quite:
            typer.secho("Downloading tableformer model...", fg="blue")
        TableStructureModel.download_models_hf(
            local_dir=output_dir / TableStructureModel._model_repo_folder,
            force=force,
            progress=show_progress,
        )

    if picture_classifier:
        if not quite:
            typer.secho("Downloading picture classifier model...", fg="blue")
        DocumentPictureClassifier.download_models_hf(
            local_dir=output_dir / DocumentPictureClassifier._model_repo_folder,
            force=force,
            progress=show_progress,
        )

    if code_formula:
        if not quite:
            typer.secho("Downloading code formula model...", fg="blue")
        CodeFormulaModel.download_models_hf(
            local_dir=output_dir / CodeFormulaModel._model_repo_folder,
            force=force,
            progress=show_progress,
        )

    if easyocr:
        if not quite:
            typer.secho("Downloading easyocr models...", fg="blue")
        EasyOcrModel.download_models(
            local_dir=output_dir / EasyOcrModel._model_repo_folder,
            force=force,
            progress=show_progress,
        )

    if quite:
        # Quiet mode: print only the target directory so that the output can be
        # consumed by scripts.
        typer.echo(output_dir)
    else:
        typer.secho(f"All models downloaded in the directory {output_dir}.", fg="green")

        console.print(
            "\n",
            "Docling can now be configured for running offline using the local artifacts.\n\n",
            "Using the CLI:",
            # f-string so that the actual directory is shown; previously the
            # literal text "{output_dir}" was printed.
            f"`docling --artifacts-path={output_dir} FILE`",
            "\n",
            "Using Python: see the documentation at <https://ds4sd.github.io/docling/usage>.",
        )
# Hidden placeholder command that is not meant to be invoked.
# NOTE(review): presumably registered so that the app has more than one
# command and `download` stays an explicit sub-command -- confirm.
@app.command(hidden=True)
def other():
    raise NotImplementedError
# Command object obtained from the Typer app via `typer.main.get_command`.
# NOTE(review): presumably exposed for tooling that expects a Click command
# (e.g. documentation generators) -- confirm against its consumers.
click_app = typer.main.get_command(app)
# Run the CLI when this module is executed as a script.
if __name__ == "__main__":
app()

View File

@ -109,12 +109,15 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
@staticmethod
def download_models_hf(
local_dir: Optional[Path] = None, force: bool = False
local_dir: Optional[Path] = None,
force: bool = False,
progress: bool = False,
) -> Path:
from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars
disable_progress_bars()
if not progress:
disable_progress_bars()
download_path = snapshot_download(
repo_id="ds4sd/CodeFormula",
force_download=force,

View File

@ -101,12 +101,13 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
@staticmethod
def download_models_hf(
local_dir: Optional[Path] = None, force: bool = False
local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
) -> Path:
from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars
disable_progress_bars()
if not progress:
disable_progress_bars()
download_path = snapshot_download(
repo_id="ds4sd/DocumentFigureClassifier",
force_download=force,

View File

@ -1,7 +1,10 @@
import logging
import warnings
from typing import Iterable
import zipfile
from pathlib import Path
from typing import Iterable, List, Optional
import httpx
import numpy
import torch
from docling_core.types.doc import BoundingBox, CoordOrigin
@ -17,11 +20,14 @@ from docling.datamodel.settings import settings
from docling.models.base_ocr_model import BaseOcrModel
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import TimeRecorder
from docling.utils.utils import download_url_with_progress
_log = logging.getLogger(__name__)
class EasyOcrModel(BaseOcrModel):
_model_repo_folder = "EasyOcr"
def __init__(
self,
enabled: bool,
@ -71,6 +77,40 @@ class EasyOcrModel(BaseOcrModel):
verbose=False,
)
@staticmethod
def download_models(
    detection_models: List[str] = ["craft"],
    recognition_models: List[str] = ["english_g2", "latin_g2"],
    local_dir: Optional[Path] = None,
    force: bool = False,
    progress: bool = False,
) -> Path:
    """Download and unpack EasyOCR model archives into a local directory.

    Args:
        detection_models: Names of the detection models to fetch; looked up
            in `easyocr.config.detection_models`.
        recognition_models: Names of the recognition models to fetch; looked
            up in the "gen2" section of `easyocr.config.recognition_models`.
        local_dir: Target directory. When None, the `EasyOcr` folder below
            `settings.cache_dir / "models"` is used.
        force: When True, models are downloaded even if their file is
            already present in `local_dir`.
        progress: When True, a progress bar is displayed while downloading.

    Returns:
        The directory containing the extracted model files.
    """
    # The default lists above are never mutated in this function, so sharing
    # them across calls is harmless; they are kept to preserve the signature.

    # Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
    from easyocr.config import detection_models as det_models_dict
    from easyocr.config import recognition_models as rec_models_dict

    if local_dir is None:
        local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder

    local_dir.mkdir(parents=True, exist_ok=True)

    # Collect models to download
    download_list = []
    for model_name in detection_models:
        if model_name in det_models_dict:
            download_list.append(det_models_dict[model_name])
        else:
            # Make typos visible instead of silently skipping the model.
            _log.warning("Unknown EasyOCR detection model '%s', skipping.", model_name)

    for model_name in recognition_models:
        if model_name in rec_models_dict["gen2"]:
            download_list.append(rec_models_dict["gen2"][model_name])
        else:
            _log.warning(
                "Unknown EasyOCR recognition model '%s', skipping.", model_name
            )

    # Download models
    for model_details in download_list:
        # Honor `force`: skip the download when the model file is already there.
        # Assumes each config entry's "filename" is the file extracted from its
        # archive (as in EasyOCR's config.py) -- entries without it are always
        # downloaded.
        filename = model_details.get("filename")
        if not force and filename and (local_dir / filename).exists():
            continue

        buf = download_url_with_progress(model_details["url"], progress=progress)
        with zipfile.ZipFile(buf, "r") as zip_ref:
            zip_ref.extractall(local_dir)

    return local_dir
def __call__(
self, conv_res: ConversionResult, page_batch: Iterable[Page]
) -> Iterable[Page]:

View File

@ -77,12 +77,15 @@ class LayoutModel(BasePageModel):
@staticmethod
def download_models_hf(
local_dir: Optional[Path] = None, force: bool = False
local_dir: Optional[Path] = None,
force: bool = False,
progress: bool = False,
) -> Path:
from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars
disable_progress_bars()
if not progress:
disable_progress_bars()
download_path = snapshot_download(
repo_id="ds4sd/docling-models",
force_download=force,

View File

@ -84,12 +84,13 @@ class TableStructureModel(BasePageModel):
@staticmethod
def download_models_hf(
local_dir: Optional[Path] = None, force: bool = False
local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
) -> Path:
from huggingface_hub import snapshot_download
from huggingface_hub.utils import disable_progress_bars
disable_progress_bars()
if not progress:
disable_progress_bars()
download_path = snapshot_download(
repo_id="ds4sd/docling-models",
force_download=force,

View File

@ -4,6 +4,9 @@ from itertools import islice
from pathlib import Path
from typing import List, Union
import requests
from tqdm import tqdm
def chunkify(iterator, chunk_size):
"""Yield successive chunks of chunk_size from the iterable."""
@ -39,3 +42,24 @@ def create_hash(string: str):
hasher.update(string.encode("utf-8"))
return hasher.hexdigest()
def download_url_with_progress(url: str, progress: bool = False) -> BytesIO:
    """Download the content of `url` into an in-memory buffer.

    Args:
        url: The location to download; redirects are followed.
        progress: When True, a tqdm progress bar is displayed.

    Returns:
        A BytesIO holding the downloaded bytes, positioned at the start.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    buf = BytesIO()
    with requests.get(url, stream=True, allow_redirects=True) as response:
        # Fail early on 4xx/5xx responses instead of handing an error page to
        # the caller as if it were the requested payload.
        response.raise_for_status()

        # A missing content-length header yields a total of 0.
        total_size = int(response.headers.get("content-length", 0))

        # The context manager closes the bar even if streaming fails midway.
        with tqdm(
            total=total_size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            disable=(not progress),
        ) as progress_bar:
            for chunk in response.iter_content(10 * 1024):
                buf.write(chunk)
                progress_bar.update(len(chunk))

    # Rewind so that callers can read the buffer from the beginning.
    buf.seek(0)
    return buf

View File

@ -0,0 +1,19 @@
# Example: convert a document using only locally available model artifacts.
from pathlib import Path
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
# The location of the local artifacts, e.g. from the `docling-models download` command
artifacts_path = Path("PATH TO MODELS") # <-- fill me
# Point the PDF pipeline at the local artifacts.
pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
# Configure EasyOCR with downloads disabled and its model storage set to the
# `EasyOcr` sub-folder of the artifacts path.
pipeline_options.ocr_options = EasyOcrOptions(
download_enabled=False, model_storage_directory=str(artifacts_path / "EasyOcr")
)
# Build a converter that applies these pipeline options to PDF inputs.
doc_converter = DocumentConverter(
format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)
# Convert the document and print it as Markdown.
result = doc_converter.convert("FILE TO CONVERT") # <-- fill me
print(result.document.export_to_markdown())

View File

@ -26,12 +26,47 @@ To see all available options (export formats etc.) run `docling --help`. More de
### Advanced options
#### Provide specific artifacts path (offline mode)
By default, artifacts such as models are downloaded automatically upon first usage. If you would prefer to use a local path where the artifacts have been explicitly prefetched, you can do that as follows:
```python
from pathlib import Path
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
artifacts_path = Path("/local/path/to/artifacts")
pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
# if you are using EasyOcr
pipeline_options.ocr_options = EasyOcrOptions(
download_enabled=False,
model_storage_directory=str(artifacts_path / "EasyOcr")
)
doc_converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)
```
To download all the artifacts needed to run offline, Docling provides the `docling-models download` utility.
```sh
$ docling-models download
Downloading layout model...
Downloading tableformer model...
Downloading picture classifier model...
Downloading code formula model...
Downloading easyocr models...
All models downloaded in the directory $HOME/.cache/docling/models.
```
#### Adjust pipeline features
The example file [custom_convert.py](./examples/custom_convert.py) contains multiple ways
one can adjust the conversion pipeline and features.
##### Control PDF table extraction options
You can control if table structure recognition should map the recognized structure back to PDF cells (default) or use text cells from the structure prediction itself.
@ -70,28 +105,6 @@ doc_converter = DocumentConverter(
)
```
##### Provide specific artifacts path
By default, artifacts such as models are downloaded automatically upon first usage. If you would prefer to use a local path where the artifacts have been explicitly prefetched, you can do that as follows:
```python
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
# # to explicitly prefetch:
# artifacts_path = StandardPdfPipeline.download_models_hf()
artifacts_path = "/local/path/to/artifacts"
pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
doc_converter = DocumentConverter(
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
}
)
```
#### Impose limits on the document size

16
poetry.lock generated
View File

@ -7248,6 +7248,20 @@ files = [
[package.dependencies]
urllib3 = ">=2"
[[package]]
name = "types-tqdm"
version = "4.67.0.20241221"
description = "Typing stubs for tqdm"
optional = false
python-versions = ">=3.8"
files = [
{file = "types_tqdm-4.67.0.20241221-py3-none-any.whl", hash = "sha256:a1f1c9cda5c2d8482d2c73957a5398bfdedda10f6bc7b3b4e812d5c910486d29"},
{file = "types_tqdm-4.67.0.20241221.tar.gz", hash = "sha256:e56046631056922385abe89aeb18af5611f471eadd7918a0ad7f34d84cd4c8cc"},
]
[package.dependencies]
types-requests = "*"
[[package]]
name = "typing-extensions"
version = "4.12.2"
@ -7837,4 +7851,4 @@ tesserocr = ["tesserocr"]
[metadata]
lock-version = "2.0"
python-versions = "^3.9"
content-hash = "907c7cef6722358ac30193f07f9cc15684daf1b75b6c400104e87f3b22137632"
content-hash = "7d8c8a4c2562f3e88673fb9a32d0a5f85aca0f7c7aeaa67a7a65a0f930a5c6c7"

View File

@ -60,6 +60,7 @@ onnxruntime = [
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
]
pillow = "^10.0.0"
tqdm = "^4.65.0"
[tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "^24.4.2"}
@ -79,6 +80,7 @@ ipykernel = "^6.29.5"
ipywidgets = "^8.1.5"
nbqa = "^1.9.0"
types-openpyxl = "^3.1.5.20241114"
types-tqdm = "^4.67.0.20241221"
[tool.poetry.group.docs.dependencies]
mkdocs-material = "^9.5.40"
@ -123,6 +125,7 @@ rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
[tool.poetry.scripts]
docling = "docling.cli.main:app"
docling-models = "docling.cli.models_download:app"
[build-system]
requires = ["poetry-core"]