use typer for the docling CLI

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Michele Dolfi 2024-09-12 14:47:06 +02:00
parent 7d06d79425
commit faf66aff57
4 changed files with 124 additions and 50 deletions

0
docling/cli/__init__.py Normal file
View File

View File

@ -1,9 +1,14 @@
import argparse import importlib
import json import json
import logging import logging
import time import time
import warnings
from enum import Enum
from pathlib import Path from pathlib import Path
from typing import Iterable from typing import Annotated, Iterable, List, Optional
import typer
from pydantic import AnyUrl
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
@ -11,14 +16,39 @@ from docling.datamodel.base_models import ConversionStatus, PipelineOptions
from docling.datamodel.document import ConversionResult, DocumentConversionInput from docling.datamodel.document import ConversionResult, DocumentConversionInput
from docling.document_converter import DocumentConverter from docling.document_converter import DocumentConverter
_log = logging.getLogger(__name__) warnings.filterwarnings(action="ignore", category=UserWarning, module="pydantic|torch")
warnings.filterwarnings(action="ignore", category=FutureWarning, module="easyocr")
from enum import Enum _log = logging.getLogger(__name__)
from rich.console import Console
err_console = Console(stderr=True)
# Top-level Typer application for the Docling command line interface.
# This object is the target of the `docling = "docling.cli.main:app"` script
# entry, so invoking the installed `docling` command calls it.
app = typer.Typer(
name="Docling",
# Show the help text instead of an error when invoked without arguments.
no_args_is_help=True,
# Do not add Typer's shell-completion options to the CLI.
add_completion=False,
# Keep plain Python tracebacks rather than Typer's Rich-formatted ones.
pretty_exceptions_enable=False,
)
def version_callback(value: bool):
    """Print the installed versions of Docling and its companion packages, then exit.

    Intended to be used as the eager callback of the ``--version`` option.

    Args:
        value: Truthy when ``--version`` was given on the command line. Any
            falsy value (``False`` or ``None``) makes the callback a no-op so
            that normal command execution continues.

    Raises:
        typer.Exit: After the versions have been printed, to stop processing
            before the command body runs.
        importlib.metadata.PackageNotFoundError: If one of the queried
            distributions is not installed.
    """
    if not value:
        return

    # A bare `import importlib` does not guarantee that the `metadata`
    # submodule is loaded; import it explicitly so the attribute access below
    # cannot fail with AttributeError depending on what else was imported.
    import importlib.metadata

    # (label shown to the user, distribution name to look up)
    packages = (
        ("Docling", "docling"),
        ("Docling Core", "docling-core"),
        ("Docling IBM Models", "docling-ibm-models"),
        ("Docling Parse", "docling-parse"),
    )

    # Resolve every version before printing anything, so a missing package
    # fails up front instead of leaving partial output behind.
    resolved = [
        (label, importlib.metadata.version(dist_name))
        for label, dist_name in packages
    ]
    for label, pkg_version in resolved:
        print(f"{label} version: {pkg_version}")

    raise typer.Exit()
# Define an enum for the backend options # Define an enum for the backend options
class Backend(Enum): class Backend(str, Enum):
PDFIUM = "pdfium" PYPDFIUM2 = "pypdfium2"
DOCLING = "docling" DOCLING = "docling"
@ -26,7 +56,6 @@ def export_documents(
conv_results: Iterable[ConversionResult], conv_results: Iterable[ConversionResult],
output_dir: Path, output_dir: Path,
): ):
output_dir.mkdir(parents=True, exist_ok=True)
success_count = 0 success_count = 0
failure_count = 0 failure_count = 0
@ -62,13 +91,52 @@ def export_documents(
f"Processed {success_count + failure_count} docs, of which {failure_count} failed" f"Processed {success_count + failure_count} docs, of which {failure_count} failed"
) )
return success_count, failure_count
@app.command(no_args_is_help=True)
def main(pdf, ocr, backend): def convert(
input_files: Annotated[
List[Path],
typer.Argument(
...,
metavar="file",
help="PDF files to convert. Directories are also accepted.",
),
],
ocr: Annotated[
bool,
typer.Option(
..., help="If enabled, the bitmap content will be processed using OCR."
),
] = True,
backend: Annotated[
Backend, typer.Option(..., help="The PDF backend to use.")
] = Backend.DOCLING,
output: Annotated[
Path, typer.Option(..., help="Output directory where results are saved.")
] = Path("."),
version: Annotated[
Optional[bool],
typer.Option(
"--version",
callback=version_callback,
is_eager=True,
help="Show version information.",
),
] = None,
):
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
input_doc_paths = [Path(pdf)] input_doc_paths: List[Path] = []
for source in input_files:
if not source.exists():
err_console.print(
f"[red]Error: The input file {source} does not exist.[/red]"
)
raise typer.Abort()
elif source.is_dir():
input_doc_paths.extend(list(source.glob("**/*.pdf", case_sensitive=False)))
else:
input_doc_paths.append(source)
########################################################################### ###########################################################################
@ -77,7 +145,7 @@ def main(pdf, ocr, backend):
# Uncomment one section at the time to see the differences in the output. # Uncomment one section at the time to see the differences in the output.
doc_converter = None doc_converter = None
if backend == Backend.PDFIUM.value and not ocr: # PyPdfium without OCR if backend == Backend.PYPDFIUM2 and not ocr: # PyPdfium without OCR
pipeline_options = PipelineOptions() pipeline_options = PipelineOptions()
pipeline_options.do_ocr = False pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True pipeline_options.do_table_structure = True
@ -88,7 +156,7 @@ def main(pdf, ocr, backend):
pdf_backend=PyPdfiumDocumentBackend, pdf_backend=PyPdfiumDocumentBackend,
) )
elif backend == Backend.PDFIUM.value and ocr: # PyPdfium with OCR elif backend == Backend.PYPDFIUM2.value and ocr: # PyPdfium with OCR
pipeline_options = PipelineOptions() pipeline_options = PipelineOptions()
pipeline_options.do_ocr = False pipeline_options.do_ocr = False
pipeline_options.do_table_structure = True pipeline_options.do_table_structure = True
@ -121,8 +189,6 @@ def main(pdf, ocr, backend):
pdf_backend=DoclingParseDocumentBackend, pdf_backend=DoclingParseDocumentBackend,
) )
else:
return
########################################################################### ###########################################################################
# Define input files # Define input files
@ -131,44 +197,14 @@ def main(pdf, ocr, backend):
start_time = time.time() start_time = time.time()
conv_results = doc_converter.convert(input) conv_results = doc_converter.convert(input)
success_count, failure_count = export_documents(
conv_results, output_dir=Path("./scratch") output.mkdir(parents=True, exist_ok=True)
) export_documents(conv_results, output_dir=output)
end_time = time.time() - start_time end_time = time.time() - start_time
_log.info(f"All documents were converted in {end_time:.2f} seconds.") _log.info(f"All documents were converted in {end_time:.2f} seconds.")
if failure_count > 0:
raise RuntimeError(
f"The example failed converting {failure_count} on {len(input_doc_paths)}."
)
if __name__ == "__main__": if __name__ == "__main__":
app()
# Create an argument parser
parser = argparse.ArgumentParser(description="Process PDF files with optional OCR.")
# Add arguments
parser.add_argument(
"--pdf",
type=str,
default="./tests/data/2206.01062.pdf",
help="Path to the PDF file.",
)
parser.add_argument(
"--ocr", type=bool, default=False, help="Enable OCR (True or False)."
)
parser.add_argument(
"--backend",
type=lambda b: Backend[b.upper()],
choices=list(Backend),
default=Backend.DOCLING,
help="Select backend (pdfium or docling). Default is docling.",
)
# Parse the arguments
args = parser.parse_args()
main(args.pdf, args.ocr, args.backend.value)

35
poetry.lock generated
View File

@ -5684,6 +5684,17 @@ numpy = ">=1.14,<3"
docs = ["matplotlib", "numpydoc (==1.1.*)", "sphinx", "sphinx-book-theme", "sphinx-remove-toctrees"] docs = ["matplotlib", "numpydoc (==1.1.*)", "sphinx", "sphinx-book-theme", "sphinx-remove-toctrees"]
test = ["pytest", "pytest-cov"] test = ["pytest", "pytest-cov"]
[[package]]
name = "shellingham"
version = "1.5.4"
description = "Tool to Detect Surrounding Shell"
optional = false
python-versions = ">=3.7"
files = [
{file = "shellingham-1.5.4-py2.py3-none-any.whl", hash = "sha256:7ecfff8f2fd72616f7481040475a65b2bf8af90a56c89140852d1120324e8686"},
{file = "shellingham-1.5.4.tar.gz", hash = "sha256:8dbca0739d487e5bd35ab3ca4b36e11c4078f3a234bfce294b0a0291363404de"},
]
[[package]] [[package]]
name = "simplejson" name = "simplejson"
version = "3.19.3" version = "3.19.3"
@ -6584,6 +6595,11 @@ files = [
{file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"}, {file = "triton-3.0.0-1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:34e509deb77f1c067d8640725ef00c5cbfcb2052a1a3cb6a6d343841f92624eb"},
{file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"}, {file = "triton-3.0.0-1-cp38-cp38-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bcbf3b1c48af6a28011a5c40a5b3b9b5330530c3827716b5fbf6d7adcc1e53e9"},
{file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"}, {file = "triton-3.0.0-1-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6e5727202f7078c56f91ff13ad0c1abab14a0e7f2c87e91b12b6f64f3e8ae609"},
{file = "triton-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39b052da883351fdf6be3d93cedae6db3b8e3988d3b09ed221bccecfa9612230"},
{file = "triton-3.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cd34f19a8582af96e6291d4afce25dac08cb2a5d218c599163761e8e0827208e"},
{file = "triton-3.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0d5e10de8c011adeb7c878c6ce0dd6073b14367749e34467f1cff2bde1b78253"},
{file = "triton-3.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e8903767951bf86ec960b4fe4e21bc970055afc65e9d57e916d79ae3c93665e3"},
{file = "triton-3.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:41004fb1ae9a53fcb3e970745feb87f0e3c94c6ce1ba86e95fa3b8537894bef7"},
] ]
[package.dependencies] [package.dependencies]
@ -6617,6 +6633,23 @@ rfc3986 = ">=1.4.0"
tqdm = ">=4.14" tqdm = ">=4.14"
urllib3 = ">=1.26.0" urllib3 = ">=1.26.0"
[[package]]
name = "typer"
version = "0.12.5"
description = "Typer, build great CLIs. Easy to code. Based on Python type hints."
optional = false
python-versions = ">=3.7"
files = [
{file = "typer-0.12.5-py3-none-any.whl", hash = "sha256:62fe4e471711b147e3365034133904df3e235698399bc4de2b36c8579298d52b"},
{file = "typer-0.12.5.tar.gz", hash = "sha256:f592f089bedcc8ec1b974125d64851029c3b1af145f04aca64d69410f0c9b722"},
]
[package.dependencies]
click = ">=8.0.0"
rich = ">=10.11.0"
shellingham = ">=1.3.0"
typing-extensions = ">=3.7.4.3"
[[package]] [[package]]
name = "types-requests" name = "types-requests"
version = "2.32.0.20240907" version = "2.32.0.20240907"
@ -7169,4 +7202,4 @@ examples = ["langchain-huggingface", "langchain-milvus", "langchain-text-splitte
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.10" python-versions = "^3.10"
content-hash = "5ce8fc1e245442e355b967430e211b1378fed2e9fd20d2ddbea47f0e9f1dfcd5" content-hash = "e9eaa2a2de2ef321d274bbe245a290826c8604a730e9e65149421a6d2cfe2202"

View File

@ -60,6 +60,7 @@ torchvision = [
{version = "^0", optional = true, markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'"}, {version = "^0", optional = true, markers = "sys_platform != 'darwin' or platform_machine != 'x86_64'"},
{version = "~0.17.2", optional = true, markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'"} {version = "~0.17.2", optional = true, markers = "sys_platform == 'darwin' and platform_machine == 'x86_64'"}
] ]
typer = "^0.12.5"
[tool.poetry.group.dev.dependencies] [tool.poetry.group.dev.dependencies]
black = {extras = ["jupyter"], version = "^24.4.2"} black = {extras = ["jupyter"], version = "^24.4.2"}
@ -94,6 +95,10 @@ examples = [
"langchain-text-splitters", "langchain-text-splitters",
] ]
[tool.poetry.scripts]
docling = "docling.cli.main:app"
[build-system] [build-system]
requires = ["poetry-core"] requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api" build-backend = "poetry.core.masonry.api"