mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 07:22:14 +00:00
add docling-models utility
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
18aad34d67
commit
dc9e759354
2
.github/workflows/checks.yml
vendored
2
.github/workflows/checks.yml
vendored
@ -28,7 +28,7 @@ jobs:
|
||||
run: |
|
||||
for file in docs/examples/*.py; do
|
||||
# Skip every example named in the pattern below (not only batch_convert.py)
|
||||
if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment).py ]]; then
|
||||
if [[ "$(basename "$file")" =~ ^(batch_convert|minimal|export_multimodal|custom_convert|develop_picture_enrichment|offline_convert).py ]]; then
|
||||
echo "Skipping $file"
|
||||
continue
|
||||
fi
|
||||
|
160
docling/cli/models_download.py
Normal file
160
docling/cli/models_download.py
Normal file
@ -0,0 +1,160 @@
|
||||
import logging
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
from typing import Annotated
|
||||
|
||||
import typer
|
||||
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.code_formula_model import CodeFormulaModel
|
||||
from docling.models.document_picture_classifier import DocumentPictureClassifier
|
||||
from docling.models.easyocr_model import EasyOcrModel
|
||||
from docling.models.layout_model import LayoutModel
|
||||
from docling.models.rapid_ocr_model import RapidOcrModel
|
||||
from docling.models.table_structure_model import TableStructureModel
|
||||
|
||||
# Suppress UserWarning from pydantic/torch modules and FutureWarning from
# easyocr modules so they do not clutter the CLI output.
warnings.filterwarnings(
    action="ignore",
    category=UserWarning,
    module="pydantic|torch",
)
warnings.filterwarnings(
    action="ignore",
    category=FutureWarning,
    module="easyocr",
)

# Module-level logger.
_log = logging.getLogger(__name__)

from rich.console import Console

# One console for regular output, one bound to stderr.
console = Console()
err_console = Console(stderr=True)

# Typer application that hosts the commands defined in this module.
app = typer.Typer(
    name="Docling model helper",
    add_completion=False,
    pretty_exceptions_enable=False,
)
|
||||
|
||||
|
||||
@app.command("download")
def download(
    output_dir: Annotated[
        Path,
        typer.Option(
            ...,
            "-o",
            "--output-dir",
            help="The directory where all the models are downloaded.",
        ),
    ] = settings.cache_dir
    / "models",
    force: Annotated[
        bool, typer.Option(..., help="If true, the download will be forced")
    ] = False,
    quite: Annotated[
        bool,
        typer.Option(
            ...,
            "-q",
            help="No extra output is generated, the CLI print only the directory with the cached models.",
        ),
    ] = False,
    layout: Annotated[
        bool,
        typer.Option(..., help="If true, the layout model weights are downloaded."),
    ] = True,
    tableformer: Annotated[
        bool,
        typer.Option(
            ..., help="If true, the tableformer model weights are downloaded."
        ),
    ] = True,
    code_formula: Annotated[
        bool,
        typer.Option(
            ..., help="If true, the code formula model weights are downloaded."
        ),
    ] = True,
    picture_classifier: Annotated[
        bool,
        typer.Option(
            ..., help="If true, the picture classifier model weights are downloaded."
        ),
    ] = True,
    easyocr: Annotated[
        bool,
        typer.Option(..., help="If true, the easyocr model weights are downloaded."),
    ] = True,
    rapidocr: Annotated[
        bool,
        typer.Option(..., help="If true, the rapidocr model weights are downloaded."),
    ] = True,
):
    """Download the selected model weights into sub-folders of the output directory."""
    # NOTE: Typer shows the docstring above as the command's help text, so it
    # is kept to a single user-facing sentence.
    #
    # Parameters (all exposed as CLI options):
    #   output_dir  -- root folder; each model goes into its own sub-folder
    #                  named by the model class' `_model_repo_folder`.
    #   force       -- forwarded unchanged to every downloader.
    #   quite       -- (sic) quiet mode: no status lines or progress bars,
    #                  only the output directory is echoed at the end.
    #   layout / tableformer / code_formula / picture_classifier / easyocr
    #               -- toggles for the individual downloads.
    #   rapidocr    -- NOTE(review): accepted but no download step in this
    #                  function uses it -- TODO confirm whether RapidOcrModel
    #                  should be wired in here.

    # Make sure the folder exists
    output_dir.mkdir(exist_ok=True, parents=True)

    show_progress = not quite

    # (enabled, label for the status line, downloader, target sub-folder),
    # listed in the order the downloads are executed.
    download_steps = [
        (
            layout,
            "layout model",
            LayoutModel.download_models_hf,
            LayoutModel._model_repo_folder,
        ),
        (
            tableformer,
            "tableformer model",
            TableStructureModel.download_models_hf,
            TableStructureModel._model_repo_folder,
        ),
        (
            picture_classifier,
            "picture classifier model",
            DocumentPictureClassifier.download_models_hf,
            DocumentPictureClassifier._model_repo_folder,
        ),
        (
            code_formula,
            "code formula model",
            CodeFormulaModel.download_models_hf,
            CodeFormulaModel._model_repo_folder,
        ),
        (
            easyocr,
            "easyocr models",
            EasyOcrModel.download_models,
            EasyOcrModel._model_repo_folder,
        ),
    ]

    for enabled, label, fetch, repo_folder in download_steps:
        if not enabled:
            continue
        if not quite:
            typer.secho(f"Downloading {label}...", fg="blue")
        fetch(
            local_dir=output_dir / repo_folder,
            force=force,
            progress=show_progress,
        )

    if quite:
        # Quiet mode prints nothing but the directory holding the models.
        typer.echo(output_dir)
        return

    typer.secho(f"All models downloaded in the directory {output_dir}.", fg="green")

    console.print(
        "\n",
        "Docling can now be configured for running offline using the local artifacts.\n\n",
        "Using the CLI:",
        # Must be an f-string: without the prefix the literal text
        # "{output_dir}" was printed instead of the actual path.
        f"`docling --artifacts-path={output_dir} FILE`",
        "\n",
        "Using Python: see the documentation at <https://ds4sd.github.io/docling/usage>.",
    )
|
||||
|
||||
|
||||
# Hidden placeholder command that always raises when invoked.
# NOTE(review): presumably registered so that Typer keeps `download` as an
# explicit sub-command instead of collapsing a single-command app -- TODO confirm.
@app.command(hidden=True)
def other():
    raise NotImplementedError
|
||||
|
||||
|
||||
# Click command object derived from the Typer application above.
click_app = typer.main.get_command(app)

# Run the Typer application when the module is executed as a script.
if __name__ == "__main__":
    app()
|
@ -109,12 +109,15 @@ class CodeFormulaModel(BaseItemAndImageEnrichmentModel):
|
||||
|
||||
@staticmethod
|
||||
def download_models_hf(
|
||||
local_dir: Optional[Path] = None, force: bool = False
|
||||
local_dir: Optional[Path] = None,
|
||||
force: bool = False,
|
||||
progress: bool = False,
|
||||
) -> Path:
|
||||
from huggingface_hub import snapshot_download
|
||||
from huggingface_hub.utils import disable_progress_bars
|
||||
|
||||
disable_progress_bars()
|
||||
if not progress:
|
||||
disable_progress_bars()
|
||||
download_path = snapshot_download(
|
||||
repo_id="ds4sd/CodeFormula",
|
||||
force_download=force,
|
||||
|
@ -101,12 +101,13 @@ class DocumentPictureClassifier(BaseEnrichmentModel):
|
||||
|
||||
@staticmethod
|
||||
def download_models_hf(
|
||||
local_dir: Optional[Path] = None, force: bool = False
|
||||
local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
|
||||
) -> Path:
|
||||
from huggingface_hub import snapshot_download
|
||||
from huggingface_hub.utils import disable_progress_bars
|
||||
|
||||
disable_progress_bars()
|
||||
if not progress:
|
||||
disable_progress_bars()
|
||||
download_path = snapshot_download(
|
||||
repo_id="ds4sd/DocumentFigureClassifier",
|
||||
force_download=force,
|
||||
|
@ -1,7 +1,10 @@
|
||||
import logging
|
||||
import warnings
|
||||
from typing import Iterable
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import Iterable, List, Optional
|
||||
|
||||
import httpx
|
||||
import numpy
|
||||
import torch
|
||||
from docling_core.types.doc import BoundingBox, CoordOrigin
|
||||
@ -17,11 +20,14 @@ from docling.datamodel.settings import settings
|
||||
from docling.models.base_ocr_model import BaseOcrModel
|
||||
from docling.utils.accelerator_utils import decide_device
|
||||
from docling.utils.profiling import TimeRecorder
|
||||
from docling.utils.utils import download_url_with_progress
|
||||
|
||||
_log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class EasyOcrModel(BaseOcrModel):
|
||||
_model_repo_folder = "EasyOcr"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
enabled: bool,
|
||||
@ -71,6 +77,40 @@ class EasyOcrModel(BaseOcrModel):
|
||||
verbose=False,
|
||||
)
|
||||
|
||||
@staticmethod
def download_models(
    detection_models: List[str] = ["craft"],
    recognition_models: List[str] = ["english_g2", "latin_g2"],
    local_dir: Optional[Path] = None,
    force: bool = False,
    progress: bool = False,
) -> Path:
    """Download EasyOCR model archives and unpack them into ``local_dir``.

    Args:
        detection_models: Names looked up in ``easyocr.config.detection_models``.
        recognition_models: Names looked up in the ``"gen2"`` section of
            ``easyocr.config.recognition_models``.
        local_dir: Target folder. Defaults to
            ``settings.cache_dir / "models" / EasyOcrModel._model_repo_folder``.
        force: When true, download even if the model file is already present.
        progress: Forwarded to ``download_url_with_progress``.

    Returns:
        The folder the archives were extracted into.
    """
    # Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
    from easyocr.config import detection_models as det_models_dict
    from easyocr.config import recognition_models as rec_models_dict

    if local_dir is None:
        local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder

    local_dir.mkdir(parents=True, exist_ok=True)

    # Collect models to download. The default lists are only read, never
    # mutated, so sharing them between calls is harmless.
    download_list = []
    for model_name in detection_models:
        if model_name in det_models_dict:
            download_list.append(det_models_dict[model_name])
        else:
            # Previously dropped silently; keep going but leave a trace.
            _log.warning("Unknown EasyOCR detection model '%s', skipping.", model_name)
    gen2_models = rec_models_dict["gen2"]
    for model_name in recognition_models:
        if model_name in gen2_models:
            download_list.append(gen2_models[model_name])
        else:
            _log.warning(
                "Unknown EasyOCR recognition model '%s', skipping.", model_name
            )

    # Download models
    for model_details in download_list:
        # Assumes each config entry names its extracted weight file under
        # "filename" (see the config linked above) -- TODO confirm. If the key
        # is missing the archive is always downloaded, as before.
        filename = model_details.get("filename")
        if not force and filename and (local_dir / filename).is_file():
            # Honour `force`: an existing model file is only re-fetched on request.
            continue
        buf = download_url_with_progress(model_details["url"], progress=progress)
        with zipfile.ZipFile(buf, "r") as zip_ref:
            zip_ref.extractall(local_dir)

    return local_dir
|
||||
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
|
@ -77,12 +77,15 @@ class LayoutModel(BasePageModel):
|
||||
|
||||
@staticmethod
|
||||
def download_models_hf(
|
||||
local_dir: Optional[Path] = None, force: bool = False
|
||||
local_dir: Optional[Path] = None,
|
||||
force: bool = False,
|
||||
progress: bool = False,
|
||||
) -> Path:
|
||||
from huggingface_hub import snapshot_download
|
||||
from huggingface_hub.utils import disable_progress_bars
|
||||
|
||||
disable_progress_bars()
|
||||
if not progress:
|
||||
disable_progress_bars()
|
||||
download_path = snapshot_download(
|
||||
repo_id="ds4sd/docling-models",
|
||||
force_download=force,
|
||||
|
@ -84,12 +84,13 @@ class TableStructureModel(BasePageModel):
|
||||
|
||||
@staticmethod
|
||||
def download_models_hf(
|
||||
local_dir: Optional[Path] = None, force: bool = False
|
||||
local_dir: Optional[Path] = None, force: bool = False, progress: bool = False
|
||||
) -> Path:
|
||||
from huggingface_hub import snapshot_download
|
||||
from huggingface_hub.utils import disable_progress_bars
|
||||
|
||||
disable_progress_bars()
|
||||
if not progress:
|
||||
disable_progress_bars()
|
||||
download_path = snapshot_download(
|
||||
repo_id="ds4sd/docling-models",
|
||||
force_download=force,
|
||||
|
@ -4,6 +4,9 @@ from itertools import islice
|
||||
from pathlib import Path
|
||||
from typing import List, Union
|
||||
|
||||
import requests
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
def chunkify(iterator, chunk_size):
|
||||
"""Yield successive chunks of chunk_size from the iterable."""
|
||||
@ -39,3 +42,24 @@ def create_hash(string: str):
|
||||
hasher.update(string.encode("utf-8"))
|
||||
|
||||
return hasher.hexdigest()
|
||||
|
||||
|
||||
def download_url_with_progress(url: str, progress: bool = False) -> BytesIO:
    """Download ``url`` into memory, optionally showing a tqdm progress bar.

    Args:
        url: Address to fetch; redirects are followed.
        progress: When true a byte-based progress bar is displayed.

    Returns:
        A ``BytesIO`` holding the response body, positioned at offset 0.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    buf = BytesIO()
    with requests.get(url, stream=True, allow_redirects=True) as response:
        # Fail here on an HTTP error instead of handing the error page back
        # to the caller as if it were the requested payload.
        response.raise_for_status()

        # A missing content-length header yields total=0.
        total_size = int(response.headers.get("content-length", 0))
        # The context manager closes the bar even if the transfer raises.
        with tqdm(
            total=total_size,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            disable=(not progress),
        ) as progress_bar:
            for chunk in response.iter_content(10 * 1024):
                buf.write(chunk)
                progress_bar.update(len(chunk))

    # Rewind so callers can read from the start.
    buf.seek(0)
    return buf
|
||||
|
19
docs/examples/offline_convert.py
Normal file
19
docs/examples/offline_convert.py
Normal file
@ -0,0 +1,19 @@
|
||||
from pathlib import Path
|
||||
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
|
||||
# Folder holding the local artifacts, e.g. as produced by the
# `docling-models download` command.
artifacts_path = Path("PATH TO MODELS")  # <-- fill me

# EasyOCR reads its weights from the "EasyOcr" sub-folder and must not
# attempt any download.
easyocr_options = EasyOcrOptions(
    download_enabled=False,
    model_storage_directory=str(artifacts_path / "EasyOcr"),
)

pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
pipeline_options.ocr_options = easyocr_options

# Converter whose PDF pipeline uses the options above.
pdf_format_option = PdfFormatOption(pipeline_options=pipeline_options)
doc_converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format_option})

result = doc_converter.convert("FILE TO CONVERT")  # <-- fill me
print(result.document.export_to_markdown())
|
@ -26,12 +26,47 @@ To see all available options (export formats etc.) run `docling --help`. More de
|
||||
|
||||
### Advanced options
|
||||
|
||||
#### Provide specific artifacts path (offline mode)
|
||||
|
||||
By default, artifacts such as models are downloaded automatically upon first usage. If you would prefer to use a local path where the artifacts have been explicitly prefetched, you can do that as follows:
|
||||
|
||||
```python
|
||||
from pathlib import Path

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Must be a Path (not a plain string) because it is joined with "/" below.
artifacts_path = Path("/local/path/to/artifacts")

pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
# if you are using EasyOcr
pipeline_options.ocr_options = EasyOcrOptions(
    download_enabled=False,
    model_storage_directory=str(artifacts_path / "EasyOcr")
)
doc_converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)
|
||||
```
|
||||
|
||||
To download all the artifacts needed to run offline, Docling provides the `docling-models download` utility.
|
||||
|
||||
```sh
|
||||
$ docling-models download
|
||||
Downloading layout model...
|
||||
Downloading tableformer model...
|
||||
Downloading picture classifier model...
|
||||
Downloading code formula model...
|
||||
Downloading easyocr models...
|
||||
All models downloaded in the directory $HOME/.cache/docling/models.
|
||||
```
|
||||
|
||||
#### Adjust pipeline features
|
||||
|
||||
The example file [custom_convert.py](./examples/custom_convert.py) contains multiple ways
|
||||
one can adjust the conversion pipeline and features.
|
||||
|
||||
|
||||
##### Control PDF table extraction options
|
||||
|
||||
You can control if table structure recognition should map the recognized structure back to PDF cells (default) or use text cells from the structure prediction itself.
|
||||
@ -70,28 +105,6 @@ doc_converter = DocumentConverter(
|
||||
)
|
||||
```
|
||||
|
||||
##### Provide specific artifacts path
|
||||
|
||||
By default, artifacts such as models are downloaded automatically upon first usage. If you would prefer to use a local path where the artifacts have been explicitly prefetched, you can do that as follows:
|
||||
|
||||
```python
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.document_converter import DocumentConverter, PdfFormatOption
|
||||
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
||||
|
||||
# # to explicitly prefetch:
|
||||
# artifacts_path = StandardPdfPipeline.download_models_hf()
|
||||
|
||||
artifacts_path = "/local/path/to/artifacts"
|
||||
|
||||
pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path)
|
||||
doc_converter = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
#### Impose limits on the document size
|
||||
|
||||
|
16
poetry.lock
generated
16
poetry.lock
generated
@ -7248,6 +7248,20 @@ files = [
|
||||
[package.dependencies]
|
||||
urllib3 = ">=2"
|
||||
|
||||
[[package]]
|
||||
name = "types-tqdm"
|
||||
version = "4.67.0.20241221"
|
||||
description = "Typing stubs for tqdm"
|
||||
optional = false
|
||||
python-versions = ">=3.8"
|
||||
files = [
|
||||
{file = "types_tqdm-4.67.0.20241221-py3-none-any.whl", hash = "sha256:a1f1c9cda5c2d8482d2c73957a5398bfdedda10f6bc7b3b4e812d5c910486d29"},
|
||||
{file = "types_tqdm-4.67.0.20241221.tar.gz", hash = "sha256:e56046631056922385abe89aeb18af5611f471eadd7918a0ad7f34d84cd4c8cc"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
types-requests = "*"
|
||||
|
||||
[[package]]
|
||||
name = "typing-extensions"
|
||||
version = "4.12.2"
|
||||
@ -7837,4 +7851,4 @@ tesserocr = ["tesserocr"]
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.9"
|
||||
content-hash = "907c7cef6722358ac30193f07f9cc15684daf1b75b6c400104e87f3b22137632"
|
||||
content-hash = "7d8c8a4c2562f3e88673fb9a32d0a5f85aca0f7c7aeaa67a7a65a0f930a5c6c7"
|
||||
|
@ -60,6 +60,7 @@ onnxruntime = [
|
||||
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
|
||||
]
|
||||
pillow = "^10.0.0"
|
||||
tqdm = "^4.65.0"
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
black = {extras = ["jupyter"], version = "^24.4.2"}
|
||||
@ -79,6 +80,7 @@ ipykernel = "^6.29.5"
|
||||
ipywidgets = "^8.1.5"
|
||||
nbqa = "^1.9.0"
|
||||
types-openpyxl = "^3.1.5.20241114"
|
||||
types-tqdm = "^4.67.0.20241221"
|
||||
|
||||
[tool.poetry.group.docs.dependencies]
|
||||
mkdocs-material = "^9.5.40"
|
||||
@ -123,6 +125,7 @@ rapidocr = ["rapidocr-onnxruntime", "onnxruntime"]
|
||||
|
||||
[tool.poetry.scripts]
|
||||
docling = "docling.cli.main:app"
|
||||
docling-models = "docling.cli.models_download:app"
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core"]
|
||||
|
Loading…
Reference in New Issue
Block a user