mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Update CLI
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
5b33b12660
commit
afafb97b87
@ -12,7 +12,12 @@ from docling_core.utils.file import resolve_file_source
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.base_models import (
|
||||
ConversionStatus,
|
||||
FormatToExtensions,
|
||||
InputFormat,
|
||||
OutputFormat,
|
||||
)
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
EasyOcrOptions,
|
||||
@ -108,7 +113,7 @@ def export_documents(
|
||||
fname = output_dir / f"{doc_filename}.doctags"
|
||||
with fname.open("w") as fp:
|
||||
_log.info(f"writing Doc Tags output to {fname}")
|
||||
fp.write(conv_res.document.export_to_doctags())
|
||||
fp.write(conv_res.document.export_to_document_tokens())
|
||||
|
||||
else:
|
||||
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
||||
@ -129,41 +134,23 @@ def convert(
|
||||
help="PDF files to convert. Can be local file / directory paths or URL.",
|
||||
),
|
||||
],
|
||||
export_json: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
..., "--json/--no-json", help="If enabled the document is exported as JSON."
|
||||
),
|
||||
] = False,
|
||||
export_md: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
..., "--md/--no-md", help="If enabled the document is exported as Markdown."
|
||||
),
|
||||
] = True,
|
||||
export_txt: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
..., "--txt/--no-txt", help="If enabled the document is exported as Text."
|
||||
),
|
||||
] = False,
|
||||
export_doctags: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
...,
|
||||
"--doctags/--no-doctags",
|
||||
help="If enabled the document is exported as Doc Tags.",
|
||||
),
|
||||
] = False,
|
||||
from_formats: List[InputFormat] = typer.Option(
|
||||
None,
|
||||
"--from",
|
||||
help="Specify input formats " "to convert from. Defaults to all formats.",
|
||||
),
|
||||
to_formats: List[OutputFormat] = typer.Option(
|
||||
None, "--to", help="Specify output formats. " "Defaults to Markdown."
|
||||
),
|
||||
ocr: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
..., help="If enabled, the bitmap content will be processed using OCR."
|
||||
),
|
||||
] = True,
|
||||
backend: Annotated[
|
||||
Backend, typer.Option(..., help="The PDF backend to use.")
|
||||
] = Backend.DOCLING,
|
||||
# backend: Annotated[
|
||||
# Backend, typer.Option(..., help="The PDF backend to use.")
|
||||
# ] = Backend.DOCLING,
|
||||
ocr_engine: Annotated[
|
||||
OcrEngine, typer.Option(..., help="The OCR engine to use.")
|
||||
] = OcrEngine.EASYOCR,
|
||||
@ -182,6 +169,9 @@ def convert(
|
||||
):
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
|
||||
if from_formats is None:
|
||||
from_formats = [e for e in InputFormat]
|
||||
|
||||
input_doc_paths: List[Path] = []
|
||||
for src in input_sources:
|
||||
source = resolve_file_source(source=src)
|
||||
@ -191,20 +181,30 @@ def convert(
|
||||
)
|
||||
raise typer.Abort()
|
||||
elif source.is_dir():
|
||||
input_doc_paths.extend(list(source.glob("**/*.pdf")))
|
||||
input_doc_paths.extend(list(source.glob("**/*.PDF")))
|
||||
for fmt in from_formats:
|
||||
for ext in FormatToExtensions.get(fmt):
|
||||
input_doc_paths.extend(list(source.glob(f"**/*.{ext}")))
|
||||
input_doc_paths.extend(list(source.glob(f"**/*.{ext.upper()}")))
|
||||
else:
|
||||
input_doc_paths.append(source)
|
||||
|
||||
match backend:
|
||||
case Backend.PYPDFIUM2:
|
||||
do_cell_matching = ocr # only do cell matching when OCR enabled
|
||||
pdf_backend = PyPdfiumDocumentBackend
|
||||
case Backend.DOCLING:
|
||||
do_cell_matching = True
|
||||
pdf_backend = DoclingParseDocumentBackend
|
||||
case _:
|
||||
raise RuntimeError(f"Unexpected backend type {backend}")
|
||||
if to_formats is None:
|
||||
to_formats = [OutputFormat.MARKDOWN]
|
||||
|
||||
export_json = OutputFormat.JSON in to_formats
|
||||
export_md = OutputFormat.MARKDOWN in to_formats
|
||||
export_txt = OutputFormat.TEXT in to_formats
|
||||
export_doctags = OutputFormat.DOCTAGS in to_formats
|
||||
|
||||
# match backend:
|
||||
# case Backend.PYPDFIUM2:
|
||||
# do_cell_matching = ocr # only do cell matching when OCR enabled
|
||||
# pdf_backend = PyPdfiumDocumentBackend
|
||||
# case Backend.DOCLING:
|
||||
# do_cell_matching = True
|
||||
# pdf_backend = DoclingParseDocumentBackend
|
||||
# case _:
|
||||
# raise RuntimeError(f"Unexpected backend type {backend}")
|
||||
|
||||
match ocr_engine:
|
||||
case OcrEngine.EASYOCR:
|
||||
@ -214,19 +214,20 @@ def convert(
|
||||
case OcrEngine.TESSERACT:
|
||||
ocr_options = TesseractOcrOptions()
|
||||
case _:
|
||||
raise RuntimeError(f"Unexpected backend type {backend}")
|
||||
raise RuntimeError(f"Unexpected OCR engine type {ocr_engine}")
|
||||
|
||||
pipeline_options = PdfPipelineOptions(
|
||||
do_ocr=ocr,
|
||||
ocr_options=ocr_options,
|
||||
do_table_structure=True,
|
||||
)
|
||||
pipeline_options.table_structure_options.do_cell_matching = do_cell_matching
|
||||
pipeline_options.table_structure_options.do_cell_matching = True # do_cell_matching
|
||||
|
||||
doc_converter = DocumentConverter(
|
||||
format_options={
|
||||
InputFormat.PDF: PdfFormatOption(
|
||||
pipeline_options=pipeline_options, backend=pdf_backend
|
||||
pipeline_options=pipeline_options,
|
||||
backend=DoclingParseDocumentBackend, # pdf_backend
|
||||
)
|
||||
}
|
||||
)
|
||||
|
@ -1,6 +1,6 @@
|
||||
from enum import Enum, auto
|
||||
from io import BytesIO
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional, Set, Union
|
||||
|
||||
from docling_core.types.experimental import BoundingBox, Size
|
||||
from docling_core.types.experimental.document import PictureData, TableCell
|
||||
@ -21,14 +21,29 @@ class ConversionStatus(str, Enum):
|
||||
|
||||
|
||||
class InputFormat(str, Enum):
|
||||
DOCX = auto()
|
||||
PPTX = auto()
|
||||
HTML = auto()
|
||||
IMAGE = auto()
|
||||
PDF = auto()
|
||||
DOCX = "docx"
|
||||
PPTX = "pptx"
|
||||
HTML = "html"
|
||||
IMAGE = "image"
|
||||
PDF = "pdf"
|
||||
|
||||
|
||||
FormatToMimeType = {
|
||||
class OutputFormat(str, Enum):
|
||||
MARKDOWN = "md"
|
||||
JSON = "json"
|
||||
TEXT = "text"
|
||||
DOCTAGS = "doctags"
|
||||
|
||||
|
||||
FormatToExtensions: Dict[InputFormat, List[str]] = {
|
||||
InputFormat.DOCX: ["docx", "dotx", "docm", "dotm"],
|
||||
InputFormat.PPTX: ["pptx", "potx", "ppsx", "pptm", "potm", "ppsm"],
|
||||
InputFormat.PDF: ["pdf"],
|
||||
InputFormat.HTML: ["html", "htm", "xhtml"],
|
||||
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
||||
}
|
||||
|
||||
FormatToMimeType: Dict[InputFormat, Set[str]] = {
|
||||
InputFormat.DOCX: {
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
||||
"application/vnd.openxmlformats-officedocument.wordprocessingml.template",
|
||||
|
@ -108,7 +108,7 @@ class DocumentConverter:
|
||||
else:
|
||||
for f in self.allowed_formats:
|
||||
if f not in self.format_to_options.keys():
|
||||
_log.info(f"Requested format {f} will use default options.")
|
||||
_log.debug(f"Requested format {f} will use default options.")
|
||||
self.format_to_options[f] = _format_to_default_options[f]
|
||||
|
||||
self.initialized_pipelines: Dict[Type[BasePipeline], BasePipeline] = {}
|
||||
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user