mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
feat: add more options in the CLI
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
parent
9d8865856d
commit
af32a049d4
@ -5,12 +5,15 @@ import time
|
|||||||
import warnings
|
import warnings
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Annotated, Dict, Iterable, List, Optional
|
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
||||||
|
|
||||||
import typer
|
import typer
|
||||||
from docling_core.utils.file import resolve_file_source
|
from docling_core.utils.file import resolve_file_source
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||||
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||||
from docling.datamodel.base_models import (
|
from docling.datamodel.base_models import (
|
||||||
ConversionStatus,
|
ConversionStatus,
|
||||||
FormatToExtensions,
|
FormatToExtensions,
|
||||||
@ -22,6 +25,7 @@ from docling.datamodel.pipeline_options import (
|
|||||||
EasyOcrOptions,
|
EasyOcrOptions,
|
||||||
OcrOptions,
|
OcrOptions,
|
||||||
PdfPipelineOptions,
|
PdfPipelineOptions,
|
||||||
|
TableFormerMode,
|
||||||
TesseractCliOcrOptions,
|
TesseractCliOcrOptions,
|
||||||
TesseractOcrOptions,
|
TesseractOcrOptions,
|
||||||
)
|
)
|
||||||
@ -58,9 +62,10 @@ def version_callback(value: bool):
|
|||||||
|
|
||||||
|
|
||||||
# Define an enum for the backend options
|
# Define an enum for the backend options
|
||||||
class Backend(str, Enum):
|
class PdfBackend(str, Enum):
|
||||||
PYPDFIUM2 = "pypdfium2"
|
PYPDFIUM2 = "pypdfium2"
|
||||||
DOCLING = "docling"
|
DLPARSE_V1 = "dlparse_v1"
|
||||||
|
DLPARSE_V2 = "dlparse_v2"
|
||||||
|
|
||||||
|
|
||||||
# Define an enum for the ocr engines
|
# Define an enum for the ocr engines
|
||||||
@ -151,6 +156,13 @@ def convert(
|
|||||||
ocr_engine: Annotated[
|
ocr_engine: Annotated[
|
||||||
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
||||||
] = OcrEngine.EASYOCR,
|
] = OcrEngine.EASYOCR,
|
||||||
|
pdf_backend: Annotated[
|
||||||
|
PdfBackend, typer.Option(..., help="The PDF backend to use.")
|
||||||
|
] = PdfBackend.DLPARSE_V1,
|
||||||
|
table_mode: Annotated[
|
||||||
|
TableFormerMode,
|
||||||
|
typer.Option(..., help="The mode to use in the table structure model."),
|
||||||
|
] = TableFormerMode.FAST,
|
||||||
abort_on_error: Annotated[
|
abort_on_error: Annotated[
|
||||||
bool,
|
bool,
|
||||||
typer.Option(
|
typer.Option(
|
||||||
@ -217,11 +229,22 @@ def convert(
|
|||||||
do_table_structure=True,
|
do_table_structure=True,
|
||||||
)
|
)
|
||||||
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
||||||
|
pipeline_options.table_structure_options.mode = table_mode
|
||||||
|
|
||||||
|
match pdf_backend:
|
||||||
|
case PdfBackend.DLPARSE_V1:
|
||||||
|
backend: Type[PdfDocumentBackend] = DoclingParseDocumentBackend
|
||||||
|
case PdfBackend.DLPARSE_V2:
|
||||||
|
backend = DoclingParseV2DocumentBackend
|
||||||
|
case PdfBackend.PYPDFIUM2:
|
||||||
|
backend = PyPdfiumDocumentBackend
|
||||||
|
case _:
|
||||||
|
raise RuntimeError(f"Unexpected PDF backend type {pdf_backend}")
|
||||||
|
|
||||||
format_options: Dict[InputFormat, FormatOption] = {
|
format_options: Dict[InputFormat, FormatOption] = {
|
||||||
InputFormat.PDF: PdfFormatOption(
|
InputFormat.PDF: PdfFormatOption(
|
||||||
pipeline_options=pipeline_options,
|
pipeline_options=pipeline_options,
|
||||||
backend=DoclingParseDocumentBackend, # pdf_backend
|
backend=backend, # pdf_backend
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
doc_converter = DocumentConverter(
|
doc_converter = DocumentConverter(
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
from enum import Enum, auto
|
from enum import Enum
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List, Literal, Optional, Union
|
from typing import List, Literal, Optional, Union
|
||||||
|
|
||||||
@ -6,8 +6,8 @@ from pydantic import BaseModel, ConfigDict, Field
|
|||||||
|
|
||||||
|
|
||||||
class TableFormerMode(str, Enum):
|
class TableFormerMode(str, Enum):
|
||||||
FAST = auto()
|
FAST = "fast"
|
||||||
ACCURATE = auto()
|
ACCURATE = "accurate"
|
||||||
|
|
||||||
|
|
||||||
class TableStructureOptions(BaseModel):
|
class TableStructureOptions(BaseModel):
|
||||||
|
Loading…
Reference in New Issue
Block a user