Update CLI

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Christoph Auer 2024-10-15 09:50:06 +02:00
parent 5b33b12660
commit afafb97b87
8 changed files with 74 additions and 58 deletions

View File

@ -12,7 +12,12 @@ from docling_core.utils.file import resolve_file_source
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import ConversionStatus, InputFormat from docling.datamodel.base_models import (
ConversionStatus,
FormatToExtensions,
InputFormat,
OutputFormat,
)
from docling.datamodel.document import ConversionResult from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import ( from docling.datamodel.pipeline_options import (
EasyOcrOptions, EasyOcrOptions,
@ -108,7 +113,7 @@ def export_documents(
fname = output_dir / f"{doc_filename}.doctags" fname = output_dir / f"{doc_filename}.doctags"
with fname.open("w") as fp: with fname.open("w") as fp:
_log.info(f"writing Doc Tags output to {fname}") _log.info(f"writing Doc Tags output to {fname}")
fp.write(conv_res.document.export_to_doctags()) fp.write(conv_res.document.export_to_document_tokens())
else: else:
_log.warning(f"Document {conv_res.input.file} failed to convert.") _log.warning(f"Document {conv_res.input.file} failed to convert.")
@ -129,41 +134,23 @@ def convert(
help="PDF files to convert. Can be local file / directory paths or URL.", help="PDF files to convert. Can be local file / directory paths or URL.",
), ),
], ],
export_json: Annotated[ from_formats: List[InputFormat] = typer.Option(
bool, None,
typer.Option( "--from",
..., "--json/--no-json", help="If enabled the document is exported as JSON." help="Specify input formats " "to convert from. Defaults to all formats.",
), ),
] = False, to_formats: List[OutputFormat] = typer.Option(
export_md: Annotated[ None, "--to", help="Specify output formats. " "Defaults to Markdown."
bool, ),
typer.Option(
..., "--md/--no-md", help="If enabled the document is exported as Markdown."
),
] = True,
export_txt: Annotated[
bool,
typer.Option(
..., "--txt/--no-txt", help="If enabled the document is exported as Text."
),
] = False,
export_doctags: Annotated[
bool,
typer.Option(
...,
"--doctags/--no-doctags",
help="If enabled the document is exported as Doc Tags.",
),
] = False,
ocr: Annotated[ ocr: Annotated[
bool, bool,
typer.Option( typer.Option(
..., help="If enabled, the bitmap content will be processed using OCR." ..., help="If enabled, the bitmap content will be processed using OCR."
), ),
] = True, ] = True,
backend: Annotated[ # backend: Annotated[
Backend, typer.Option(..., help="The PDF backend to use.") # Backend, typer.Option(..., help="The PDF backend to use.")
] = Backend.DOCLING, # ] = Backend.DOCLING,
ocr_engine: Annotated[ ocr_engine: Annotated[
OcrEngine, typer.Option(..., help="The OCR engine to use.") OcrEngine, typer.Option(..., help="The OCR engine to use.")
] = OcrEngine.EASYOCR, ] = OcrEngine.EASYOCR,
@ -182,6 +169,9 @@ def convert(
): ):
logging.basicConfig(level=logging.INFO) logging.basicConfig(level=logging.INFO)
if from_formats is None:
from_formats = [e for e in InputFormat]
input_doc_paths: List[Path] = [] input_doc_paths: List[Path] = []
for src in input_sources: for src in input_sources:
source = resolve_file_source(source=src) source = resolve_file_source(source=src)
@ -191,20 +181,30 @@ def convert(
) )
raise typer.Abort() raise typer.Abort()
elif source.is_dir(): elif source.is_dir():
input_doc_paths.extend(list(source.glob("**/*.pdf"))) for fmt in from_formats:
input_doc_paths.extend(list(source.glob("**/*.PDF"))) for ext in FormatToExtensions.get(fmt):
input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
else: else:
input_doc_paths.append(source) input_doc_paths.append(source)
match backend: if to_formats is None:
case Backend.PYPDFIUM2: to_formats = [OutputFormat.MARKDOWN]
do_cell_matching = ocr # only do cell matching when OCR enabled
pdf_backend = PyPdfiumDocumentBackend export_json = OutputFormat.JSON in to_formats
case Backend.DOCLING: export_md = OutputFormat.MARKDOWN in to_formats
do_cell_matching = True export_txt = OutputFormat.TEXT in to_formats
pdf_backend = DoclingParseDocumentBackend export_doctags = OutputFormat.DOCTAGS in to_formats
case _:
raise RuntimeError(f"Unexpected backend type {backend}") # match backend:
# case Backend.PYPDFIUM2:
# do_cell_matching = ocr # only do cell matching when OCR enabled
# pdf_backend = PyPdfiumDocumentBackend
# case Backend.DOCLING:
# do_cell_matching = True
# pdf_backend = DoclingParseDocumentBackend
# case _:
# raise RuntimeError(f"Unexpected backend type {backend}")
match ocr_engine: match ocr_engine:
case OcrEngine.EASYOCR: case OcrEngine.EASYOCR:
@ -214,19 +214,20 @@ def convert(
case OcrEngine.TESSERACT: case OcrEngine.TESSERACT:
ocr_options = TesseractOcrOptions() ocr_options = TesseractOcrOptions()
case _: case _:
raise RuntimeError(f"Unexpected backend type {backend}") raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
pipeline_options = PdfPipelineOptions( pipeline_options = PdfPipelineOptions(
do_ocr=ocr, do_ocr=ocr,
ocr_options=ocr_options, ocr_options=ocr_options,
do_table_structure=True, do_table_structure=True,
) )
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
doc_converter = DocumentConverter( doc_converter = DocumentConverter(
format_options={ format_options={
InputFormat.PDF: PdfFormatOption( InputFormat.PDF: PdfFormatOption(
pipeline_options=pipeline_options, backend=pdf_backend pipeline_options=pipeline_options,
backend=DoclingParseDocumentBackend, # pdf_backend
) )
} }
) )

View File

@ -1,6 +1,6 @@
from enum import Enum, auto from enum import Enum, auto
from io import BytesIO from io import BytesIO
from typing import TYPE_CHECKING, Dict, List, Optional, Union from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
from docling_core.types.experimental import BoundingBox, Size from docling_core.types.experimental import BoundingBox, Size
from docling_core.types.experimental.document import PictureData, TableCell from docling_core.types.experimental.document import PictureData, TableCell
@ -21,14 +21,29 @@ class ConversionStatus(str, Enum):
class InputFormat(str, Enum): class InputFormat(str, Enum):
DOCX = auto() DOCX = "docx"
PPTX = auto() PPTX = "pptx"
HTML = auto() HTML = "html"
IMAGE = auto() IMAGE = "image"
PDF = auto() PDF = "pdf"
FormatToMimeType = { class OutputFormat(str, Enum):
MARKDOWN = "md"
JSON = "json"
TEXT = "text"
DOCTAGS = "doctags"
FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
InputFormat.PDF: ["pdf"],
InputFormat.HTML: ["html", "htm", "xhtml"],
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
}
FormatToMimeType: Dict[InputFormat, Set[str]] = {
InputFormat.DOCX: { InputFormat.DOCX: {
"application/vnd.openxmlformats-officedocument.wordprocessingml.document", "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
"application/vnd.openxmlformats-officedocument.wordprocessingml.template", "application/vnd.openxmlformats-officedocument.wordprocessingml.template",

View File

@ -108,7 +108,7 @@ class DocumentConverter:
else: else:
for f in self.allowed_formats: for f in self.allowed_formats:
if f not in self.format_to_options.keys(): if f not in self.format_to_options.keys():
_log.info(f"Requested format {f} will use default options.") _log.debug(f"Requested format {f} will use default options.")
self.format_to_options[f] = _format_to_default_options[f] self.format_to_options[f] = _format_to_default_options[f]
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {} self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long