From af32a049d4a3104affef8ce33090569299057faa Mon Sep 17 00:00:00 2001
From: Michele Dolfi
Date: Sat, 2 Nov 2024 09:41:33 +0100
Subject: [PATCH] feat: add more options in the CLI

Signed-off-by: Michele Dolfi
---
 docling/cli/main.py                   | 31 +++++++++++++++++++++++----
 docling/datamodel/pipeline_options.py |  6 +++---
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/docling/cli/main.py b/docling/cli/main.py
index 1800ea18..d9187c70 100644
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -5,12 +5,15 @@ import time
 import warnings
 from enum import Enum
 from pathlib import Path
-from typing import Annotated, Dict, Iterable, List, Optional
+from typing import Annotated, Dict, Iterable, List, Optional, Type
 
 import typer
 from docling_core.utils.file import resolve_file_source
 
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
+from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
+from docling.backend.pdf_backend import PdfDocumentBackend
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
 from docling.datamodel.base_models import (
     ConversionStatus,
     FormatToExtensions,
@@ -22,6 +25,7 @@ from docling.datamodel.pipeline_options import (
     EasyOcrOptions,
     OcrOptions,
     PdfPipelineOptions,
+    TableFormerMode,
     TesseractCliOcrOptions,
     TesseractOcrOptions,
 )
@@ -58,9 +62,10 @@ def version_callback(value: bool):
 
 
 # Define an enum for the backend options
-class Backend(str, Enum):
+class PdfBackend(str, Enum):
     PYPDFIUM2 = "pypdfium2"
-    DOCLING = "docling"
+    DLPARSE_V1 = "dlparse_v1"
+    DLPARSE_V2 = "dlparse_v2"
 
 
 # Define an enum for the ocr engines
@@ -151,6 +156,13 @@ def convert(
     ocr_engine: Annotated[
         OcrEngine, typer.Option(..., help="The OCR engine to use.")
     ] = OcrEngine.EASYOCR,
+    pdf_backend: Annotated[
+        PdfBackend, typer.Option(..., help="The PDF backend to use.")
+    ] = PdfBackend.DLPARSE_V1,
+    table_mode: Annotated[
+        TableFormerMode,
+        typer.Option(..., help="The mode to use in the table structure model."),
+    ] = TableFormerMode.FAST,
     abort_on_error: Annotated[
         bool,
         typer.Option(
@@ -217,11 +229,22 @@ def convert(
         do_table_structure=True,
     )
     pipeline_options.table_structure_options.do_cell_matching = True  # do_cell_matching
+    pipeline_options.table_structure_options.mode = table_mode
+
+    match pdf_backend:
+        case PdfBackend.DLPARSE_V1:
+            backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
+        case PdfBackend.DLPARSE_V2:
+            backend = DoclingParseV2DocumentBackend
+        case PdfBackend.PYPDFIUM2:
+            backend = PyPdfiumDocumentBackend
+        case _:
+            raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
 
     format_options: Dict[InputFormat, FormatOption] = {
         InputFormat.PDF: PdfFormatOption(
             pipeline_options=pipeline_options,
-            backend=DoclingParseDocumentBackend,  # pdf_backend
+            backend=backend,  # pdf_backend
         )
     }
     doc_converter = DocumentConverter(
diff --git a/docling/datamodel/pipeline_options.py b/docling/datamodel/pipeline_options.py
index efaa6ff8..d57f1671 100644
--- a/docling/datamodel/pipeline_options.py
+++ b/docling/datamodel/pipeline_options.py
@@ -1,4 +1,4 @@
-from enum import Enum, auto
+from enum import Enum
 from pathlib import Path
 from typing import List, Literal, Optional, Union
 
@@ -6,8 +6,8 @@ from pydantic import BaseModel, ConfigDict, Field
 
 
 class TableFormerMode(str, Enum):
-    FAST = auto()
-    ACCURATE = auto()
+    FAST = "fast"
+    ACCURATE = "accurate"
 
 
 class TableStructureOptions(BaseModel):