"""Command-line helper that prefetches the model artifacts used by Docling.

The module exposes a Typer application with a single public command,
``download``, which stores the selected model weights below one output
directory so that Docling can later be pointed at it for offline use.
"""

import logging
import warnings
from pathlib import Path
from typing import Annotated

import typer
from rich.console import Console

from docling.datamodel.settings import settings
from docling.models.code_formula_model import CodeFormulaModel
from docling.models.document_picture_classifier import DocumentPictureClassifier
from docling.models.easyocr_model import EasyOcrModel
from docling.models.layout_model import LayoutModel
from docling.models.rapid_ocr_model import RapidOcrModel
from docling.models.table_structure_model import TableStructureModel

warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")

_log = logging.getLogger(__name__)

console = Console()
err_console = Console(stderr=True)


app = typer.Typer(
    name="Docling model helper",
    add_completion=False,
    pretty_exceptions_enable=False,
)


@app.command("download")
def download(
    output_dir: Annotated[
        Path,
        typer.Option(
            ...,
            "-o",
            "--output-dir",
            help="The directory where all the models are downloaded.",
        ),
    ] = settings.cache_dir
    / "models",
    force: Annotated[
        bool, typer.Option(..., help="If true, the download will be forced")
    ] = False,
    # The parameter keeps its historical (misspelled) name so keyword callers
    # keep working; the correctly spelled ``--quiet`` flag is offered next to
    # the existing ``-q``.
    quite: Annotated[
        bool,
        typer.Option(
            ...,
            "-q",
            "--quiet",
            help="No extra output is generated, the CLI prints only the directory with the cached models.",
        ),
    ] = False,
    layout: Annotated[
        bool,
        typer.Option(..., help="If true, the layout model weights are downloaded."),
    ] = True,
    tableformer: Annotated[
        bool,
        typer.Option(
            ..., help="If true, the tableformer model weights are downloaded."
        ),
    ] = True,
    code_formula: Annotated[
        bool,
        typer.Option(
            ..., help="If true, the code formula model weights are downloaded."
        ),
    ] = True,
    picture_classifier: Annotated[
        bool,
        typer.Option(
            ..., help="If true, the picture classifier model weights are downloaded."
        ),
    ] = True,
    easyocr: Annotated[
        bool,
        typer.Option(..., help="If true, the easyocr model weights are downloaded."),
    ] = True,
    rapidocr: Annotated[
        bool,
        typer.Option(..., help="If true, the rapidocr model weights are downloaded."),
    ] = True,
):
    """Download the selected model weights into ``output_dir``.

    Each enabled model family is stored in its own sub-folder of
    ``output_dir`` (the folder name comes from the model class'
    ``_model_repo_folder`` attribute).

    Args:
        output_dir: Root directory for all artifacts; created if missing.
        force: Forwarded to every model downloader as ``force``.
        quite: Quiet mode. Suppresses status messages and progress bars and
            prints only ``output_dir`` at the end.
        layout: Download the layout model.
        tableformer: Download the table structure model.
        code_formula: Download the code/formula model.
        picture_classifier: Download the picture classifier model.
        easyocr: Download the EasyOCR models.
        rapidocr: Accepted for the command-line interface, but this command
            does not perform any RapidOCR download.
            NOTE(review): confirm whether a RapidOCR downloader should be
            wired in here.
    """
    # Make sure the folder exists
    output_dir.mkdir(exist_ok=True, parents=True)

    show_progress = not quite

    # (enabled, label used in the status line, downloader, target sub-folder).
    # The order is the order in which the models are fetched.
    downloads = (
        (
            layout,
            "layout model",
            LayoutModel.download_models_hf,
            LayoutModel._model_repo_folder,
        ),
        (
            tableformer,
            "tableformer model",
            TableStructureModel.download_models_hf,
            TableStructureModel._model_repo_folder,
        ),
        (
            picture_classifier,
            "picture classifier model",
            DocumentPictureClassifier.download_models_hf,
            DocumentPictureClassifier._model_repo_folder,
        ),
        (
            code_formula,
            "code formula model",
            CodeFormulaModel.download_models_hf,
            CodeFormulaModel._model_repo_folder,
        ),
        (
            easyocr,
            "easyocr models",
            EasyOcrModel.download_models,
            EasyOcrModel._model_repo_folder,
        ),
    )

    for enabled, label, fetch, repo_folder in downloads:
        if not enabled:
            continue
        if show_progress:
            typer.secho(f"Downloading {label}...", fg="blue")
        fetch(
            local_dir=output_dir / repo_folder,
            force=force,
            progress=show_progress,
        )

    if quite:
        # Quiet mode: emit only the path so the output can be consumed by scripts.
        typer.echo(output_dir)
        return

    typer.secho(f"All models downloaded in the directory {output_dir}.", fg="green")

    console.print(
        "\n",
        "Docling can now be configured for running offline using the local artifacts.\n\n",
        "Using the CLI:",
        # This must be an f-string, otherwise the literal placeholder is printed.
        f"`docling --artifacts-path={output_dir} FILE`",
        "\n",
        'Using Python: see the "offline mode" section of the Docling usage documentation.',
    )


@app.command(hidden=True)
def other():
    """Hidden placeholder command; always raises ``NotImplementedError``.

    NOTE(review): presumably registered so that the application has more than
    one command and ``download`` stays an explicit sub-command
    (``docling-models download``) - confirm.
    """
    raise NotImplementedError()


click_app = typer.main.get_command(app)

if __name__ == "__main__":
    app()
@staticmethod
def download_models(
    detection_models: Optional[List[str]] = None,
    recognition_models: Optional[List[str]] = None,
    local_dir: Optional[Path] = None,
    force: bool = False,
    progress: bool = False,
) -> Path:
    """Download and unpack EasyOCR model archives into a local directory.

    Model metadata (download URL, file name) is looked up in the registries
    shipped with EasyOCR (``easyocr.config``). Recognition models are looked
    up in the ``"gen2"`` section of the recognition registry.

    Args:
        detection_models: Names of detection models to fetch. Defaults to
            ``["craft"]``.
        recognition_models: Names of recognition models to fetch. Defaults to
            ``["english_g2", "latin_g2"]``.
        local_dir: Target directory. Defaults to
            ``settings.cache_dir / "models" / EasyOcrModel._model_repo_folder``.
            It is created if it does not exist.
        force: If true, download every archive even when the model file is
            already present in ``local_dir``.
        progress: If true, show a progress bar while downloading.

    Returns:
        The directory that holds the extracted model files.
    """
    # Models are located in https://github.com/JaidedAI/EasyOCR/blob/master/easyocr/config.py
    from easyocr.config import detection_models as det_models_dict
    from easyocr.config import recognition_models as rec_models_dict

    # Resolve the defaults here instead of using mutable default arguments.
    if detection_models is None:
        detection_models = ["craft"]
    if recognition_models is None:
        recognition_models = ["english_g2", "latin_g2"]

    if local_dir is None:
        local_dir = settings.cache_dir / "models" / EasyOcrModel._model_repo_folder

    local_dir.mkdir(parents=True, exist_ok=True)

    # Collect models to download; unknown names are reported, not dropped silently.
    download_list = []
    for model_name in detection_models:
        if model_name in det_models_dict:
            download_list.append(det_models_dict[model_name])
        else:
            _log.warning("Unknown EasyOCR detection model '%s', skipping.", model_name)

    gen2_models = rec_models_dict["gen2"]
    for model_name in recognition_models:
        if model_name in gen2_models:
            download_list.append(gen2_models[model_name])
        else:
            _log.warning(
                "Unknown EasyOCR recognition model '%s', skipping.", model_name
            )

    # Download models
    for model_details in download_list:
        # Assumes registry entries carry a "filename" key naming the extracted
        # weight file, as in EasyOCR's config - TODO confirm. Entries without
        # it are always downloaded.
        filename = model_details.get("filename")
        if not force and filename and (local_dir / filename).is_file():
            continue

        buf = download_url_with_progress(model_details["url"], progress=progress)
        with zipfile.ZipFile(buf, "r") as zip_ref:
            zip_ref.extractall(local_dir)

    return local_dir
def download_url_with_progress(url: str, progress: bool = False) -> BytesIO:
    """Download ``url`` into memory, optionally showing a progress bar.

    Args:
        url: Address of the resource to fetch; redirects are followed.
        progress: If true, display a tqdm progress bar while streaming.

    Returns:
        An in-memory buffer with the complete response body, positioned at
        offset 0.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    chunk_size = 10 * 1024
    buf = BytesIO()
    with requests.get(url, stream=True, allow_redirects=True) as response:
        # Fail early on 4xx/5xx so an error page is never returned as payload.
        response.raise_for_status()

        total_size = int(response.headers.get("content-length", 0))
        # The context manager closes the bar even if streaming raises.
        with tqdm(
            # A missing content-length means the size is unknown, not zero.
            total=total_size or None,
            unit="B",
            unit_scale=True,
            unit_divisor=1024,
            disable=(not progress),
        ) as progress_bar:
            for chunk in response.iter_content(chunk_size):
                buf.write(chunk)
                progress_bar.update(len(chunk))

    buf.seek(0)
    return buf
from the `docling-models download` command +artifacts_path = Path("PATH TO MODELS") # <-- fill me +pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path) +pipeline_options.ocr_options = EasyOcrOptions( + download_enabled=False, model_storage_directory=str(artifacts_path / "EasyOcr") +) + +doc_converter = DocumentConverter( + format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)} +) + +result = doc_converter.convert("FILE TO CONVERT") # <-- fill me +print(result.document.export_to_markdown()) diff --git a/docs/usage.md b/docs/usage.md index a577a3e3..8cbf8ba4 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -26,12 +26,47 @@ To see all available options (export formats etc.) run `docling --help`. More de ### Advanced options +#### Provide specific artifacts path (offline mode) + +By default, artifacts such as models are downloaded automatically upon first usage. If you would prefer to use a local path where the artifacts have been explicitly prefetched, you can do that as follows: + +```python +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import EasyOcrOptions, PdfPipelineOptions +from docling.document_converter import DocumentConverter, PdfFormatOption + +artifacts_path = "/local/path/to/artifacts" + +pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path) +# if you are using EasyOcr +pipeline_options.ocr_options = EasyOcrOptions( + download_enabled=False, + model_storage_directory=f"{artifacts_path}/EasyOcr" +) +doc_converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) + } +) +``` + +To download all the artifacts needed to run offline, Docling provides the `docling-models download` utility. + +```sh +$ docling-models download +Downloading layout model... +Downloading tableformer model... +Downloading picture classifier model... +Downloading code formula model... 
+Downloading easyocr models... +All models downloaded in the directory $HOME/.cache/docling/models. +``` + #### Adjust pipeline features The example file [custom_convert.py](./examples/custom_convert.py) contains multiple ways one can adjust the conversion pipeline and features. - ##### Control PDF table extraction options You can control if table structure recognition should map the recognized structure back to PDF cells (default) or use text cells from the structure prediction itself. @@ -70,28 +105,6 @@ doc_converter = DocumentConverter( ) ``` -##### Provide specific artifacts path - -By default, artifacts such as models are downloaded automatically upon first usage. If you would prefer to use a local path where the artifacts have been explicitly prefetched, you can do that as follows: - -```python -from docling.datamodel.base_models import InputFormat -from docling.datamodel.pipeline_options import PdfPipelineOptions -from docling.document_converter import DocumentConverter, PdfFormatOption -from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline - -# # to explicitly prefetch: -# artifacts_path = StandardPdfPipeline.download_models_hf() - -artifacts_path = "/local/path/to/artifacts" - -pipeline_options = PdfPipelineOptions(artifacts_path=artifacts_path) -doc_converter = DocumentConverter( - format_options={ - InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options) - } -) -``` #### Impose limits on the document size diff --git a/poetry.lock b/poetry.lock index 04e4e007..2ff47498 100644 --- a/poetry.lock +++ b/poetry.lock @@ -7248,6 +7248,20 @@ files = [ [package.dependencies] urllib3 = ">=2" +[[package]] +name = "types-tqdm" +version = "4.67.0.20241221" +description = "Typing stubs for tqdm" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types_tqdm-4.67.0.20241221-py3-none-any.whl", hash = "sha256:a1f1c9cda5c2d8482d2c73957a5398bfdedda10f6bc7b3b4e812d5c910486d29"}, + {file = "types_tqdm-4.67.0.20241221.tar.gz", hash 
= "sha256:e56046631056922385abe89aeb18af5611f471eadd7918a0ad7f34d84cd4c8cc"}, +] + +[package.dependencies] +types-requests = "*" + [[package]] name = "typing-extensions" version = "4.12.2" @@ -7837,4 +7851,4 @@ tesserocr = ["tesserocr"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "907c7cef6722358ac30193f07f9cc15684daf1b75b6c400104e87f3b22137632" +content-hash = "7d8c8a4c2562f3e88673fb9a32d0a5f85aca0f7c7aeaa67a7a65a0f930a5c6c7" diff --git a/pyproject.toml b/pyproject.toml index d12b70e2..621e6b1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,6 +60,7 @@ onnxruntime = [ { version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" } ] pillow = "^10.0.0" +tqdm = "^4.65.0" [tool.poetry.group.dev.dependencies] black = {extras = ["jupyter"], version = "^24.4.2"} @@ -79,6 +80,7 @@ ipykernel = "^6.29.5" ipywidgets = "^8.1.5" nbqa = "^1.9.0" types-openpyxl = "^3.1.5.20241114" +types-tqdm = "^4.67.0.20241221" [tool.poetry.group.docs.dependencies] mkdocs-material = "^9.5.40" @@ -123,6 +125,7 @@ rapidocr = ["rapidocr-onnxruntime", "onnxruntime"] [tool.poetry.scripts] docling = "docling.cli.main:app" +docling-models = "docling.cli.models_download:app" [build-system] requires = ["poetry-core"]