Update CLI

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
2025-07-27 04:24:45 +00:00 · 2024-10-15 09:50:06 +02:00 · 2024-10-15 09:50:06 +02:00 · afafb97b87
commit afafb97b87
parent 5b33b12660
8 changed files with 74 additions and 58 deletions
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -12,7 +12,12 @@ from docling_core.utils.file import resolve_file_source
 from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
 from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
-from docling.datamodel.base_models import ConversionStatus, InputFormat
+from docling.datamodel.base_models import (
    ConversionStatus,
    FormatToExtensions,
    InputFormat,
    OutputFormat,
 )
 from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
    EasyOcrOptions,
@@ -108,7 +113,7 @@ def export_documents(
                fname = output_dir / f"{doc_filename}.doctags"
                with fname.open("w") as fp:
                    _log.info(f"writing Doc Tags output to {fname}")
-                    fp.write(conv_res.document.export_to_doctags())
+                    fp.write(conv_res.document.export_to_document_tokens())
        else:
            _log.warning(f"Document {conv_res.input.file} failed to convert.")
@@ -129,41 +134,23 @@ def convert(
            help="PDF files to convert. Can be local file / directory paths or URL.",
        ),
    ],
-    export_json: Annotated[
+    from_formats: List[InputFormat] = typer.Option(
-        bool,
+        None,
-        typer.Option(
+        "--from",
-            ..., "--json/--no-json", help="If enabled the document is exported as JSON."
+        help="Specify input formats " "to convert from. Defaults to all formats.",
-        ),
+    ),
-    ] = False,
+    to_formats: List[OutputFormat] = typer.Option(
-    export_md: Annotated[
+        None, "--to", help="Specify output formats. " "Defaults to Markdown."
-        bool,
+    ),
        typer.Option(
            ..., "--md/--no-md", help="If enabled the document is exported as Markdown."
        ),
    ] = True,
    export_txt: Annotated[
        bool,
        typer.Option(
            ..., "--txt/--no-txt", help="If enabled the document is exported as Text."
        ),
    ] = False,
    export_doctags: Annotated[
        bool,
        typer.Option(
            ...,
            "--doctags/--no-doctags",
            help="If enabled the document is exported as Doc Tags.",
        ),
    ] = False,
    ocr: Annotated[
        bool,
        typer.Option(
            ..., help="If enabled, the bitmap content will be processed using OCR."
        ),
    ] = True,
-    backend: Annotated[
+    # backend: Annotated[
-        Backend, typer.Option(..., help="The PDF backend to use.")
+    #    Backend, typer.Option(..., help="The PDF backend to use.")
-    ] = Backend.DOCLING,
+    # ] = Backend.DOCLING,
    ocr_engine: Annotated[
        OcrEngine, typer.Option(..., help="The OCR engine to use.")
    ] = OcrEngine.EASYOCR,
@@ -182,6 +169,9 @@ def convert(
 ):
    logging.basicConfig(level=logging.INFO)
    if from_formats is None:
        from_formats = [e for e in InputFormat]
    input_doc_paths: List[Path] = []
    for src in input_sources:
        source = resolve_file_source(source=src)
@@ -191,20 +181,30 @@ def convert(
            )
            raise typer.Abort()
        elif source.is_dir():
-            input_doc_paths.extend(list(source.glob("**/*.pdf")))
+            for fmt in from_formats:
-            input_doc_paths.extend(list(source.glob("**/*.PDF")))
+                for ext in FormatToExtensions.get(fmt):
                    input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
                    input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
        else:
            input_doc_paths.append(source)
-    match backend:
+    if to_formats is None:
-        case Backend.PYPDFIUM2:
+        to_formats = [OutputFormat.MARKDOWN]
-            do_cell_matching = ocr  # only do cell matching when OCR enabled
+
-            pdf_backend = PyPdfiumDocumentBackend
+    export_json = OutputFormat.JSON in to_formats
-        case Backend.DOCLING:
+    export_md = OutputFormat.MARKDOWN in to_formats
-            do_cell_matching = True
+    export_txt = OutputFormat.TEXT in to_formats
-            pdf_backend = DoclingParseDocumentBackend
+    export_doctags = OutputFormat.DOCTAGS in to_formats
-        case _:
+
-            raise RuntimeError(f"Unexpected backend type {backend}")
+    # match backend:
    #     case Backend.PYPDFIUM2:
    #         do_cell_matching = ocr  # only do cell matching when OCR enabled
    #         pdf_backend = PyPdfiumDocumentBackend
    #     case Backend.DOCLING:
    #         do_cell_matching = True
    #         pdf_backend = DoclingParseDocumentBackend
    #     case _:
    #         raise RuntimeError(f"Unexpected backend type {backend}")
    match ocr_engine:
        case OcrEngine.EASYOCR:
@@ -214,19 +214,20 @@ def convert(
        case OcrEngine.TESSERACT:
            ocr_options = TesseractOcrOptions()
        case _:
-            raise RuntimeError(f"Unexpected backend type {backend}")
+            raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
    pipeline_options = PdfPipelineOptions(
        do_ocr=ocr,
        ocr_options=ocr_options,
        do_table_structure=True,
    )
-    pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
+    pipeline_options.table_structure_options.do_cell_matching = True  # do_cell_matching
    doc_converter = DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(
-                pipeline_options=pipeline_options, backend=pdf_backend
+                pipeline_options=pipeline_options,
                backend=DoclingParseDocumentBackend,  # pdf_backend
            )
        }
    )
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -1,6 +1,6 @@
 from enum import Enum, auto
 from io import BytesIO
-from typing import TYPE_CHECKING, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
 from docling_core.types.experimental import BoundingBox, Size
 from docling_core.types.experimental.document import PictureData, TableCell
@@ -21,14 +21,29 @@ class ConversionStatus(str, Enum):
 class InputFormat(str, Enum):
-    DOCX = auto()
+    DOCX = "docx"
-    PPTX = auto()
+    PPTX = "pptx"
-    HTML = auto()
+    HTML = "html"
-    IMAGE = auto()
+    IMAGE = "image"
-    PDF = auto()
+    PDF = "pdf"
-FormatToMimeType = {
+class OutputFormat(str, Enum):
    MARKDOWN = "md"
    JSON = "json"
    TEXT = "text"
    DOCTAGS = "doctags"
 FormatToExtensions: Dict[InputFormat, List[str]] = {
    InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
    InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
    InputFormat.PDF: ["pdf"],
    InputFormat.HTML: ["html", "htm", "xhtml"],
    InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
 }
 FormatToMimeType: Dict[InputFormat, Set[str]] = {
    InputFormat.DOCX: {
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -108,7 +108,7 @@ class DocumentConverter:
        else:
            for f in self.allowed_formats:
                if f not in self.format_to_options.keys():
-                    _log.info(f"Requested format {f} will use default options.")
+                    _log.debug(f"Requested format {f} will use default options.")
                    self.format_to_options[f] = _format_to_default_options[f]
        self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
--- a/tests/data/groundtruth/docling_v2/2203.01017v2.json
+++ b/tests/data/groundtruth/docling_v2/2203.01017v2.json
--- a/tests/data/groundtruth/docling_v2/2206.01062.json
+++ b/tests/data/groundtruth/docling_v2/2206.01062.json
--- a/tests/data/groundtruth/docling_v2/2305.03393v1.json
+++ b/tests/data/groundtruth/docling_v2/2305.03393v1.json
--- a/tests/data/groundtruth/docling_v2/redp5110.json
+++ b/tests/data/groundtruth/docling_v2/redp5110.json
--- a/tests/data/groundtruth/docling_v2/redp5695.json
+++ b/tests/data/groundtruth/docling_v2/redp5695.json