mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-01 23:12:20 +00:00
Removed the duck emoji and added the `image_export_mode` option to the CLI. Currently, the `referenced` mode seems broken.
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
89487dd76e
commit
d6c314d7f1
@ -13,6 +13,8 @@ import typer
|
||||
from docling_core.utils.file import resolve_source_to_path
|
||||
from pydantic import TypeAdapter, ValidationError
|
||||
|
||||
from docling_core.types.doc import ImageRefMode
|
||||
|
||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
@ -91,6 +93,7 @@ def export_documents(
|
||||
export_md: bool,
|
||||
export_txt: bool,
|
||||
export_doctags: bool,
|
||||
image_export_mode: ImageRefMode
|
||||
):
|
||||
|
||||
success_count = 0
|
||||
@ -103,39 +106,58 @@ def export_documents(
|
||||
|
||||
# Export JSON format:
|
||||
if export_json:
|
||||
fname = output_dir / f"{doc_filename}.json"
|
||||
_log.info(f"writing JSON output to {fname}")
|
||||
conv_res.document.save_as_json(filename=fname, image_mode=image_export_mode)
|
||||
"""
|
||||
fname = output_dir / f"{doc_filename}.json"
|
||||
with fname.open("w", encoding="utf8") as fp:
|
||||
_log.info(f"writing JSON output to {fname}")
|
||||
fp.write(json.dumps(conv_res.document.export_to_dict()))
|
||||
|
||||
"""
|
||||
|
||||
# Export HTML format:
|
||||
if export_html:
|
||||
if export_html:
|
||||
fname = output_dir / f"{doc_filename}.html"
|
||||
_log.info(f"writing HTML output to {fname}")
|
||||
conv_res.document.save_as_html(filename=fname, image_mode=image_export_mode)
|
||||
"""
|
||||
with fname.open("w", encoding="utf8") as fp:
|
||||
_log.info(f"writing HTML output to {fname}")
|
||||
fp.write(conv_res.document.export_to_html())
|
||||
|
||||
"""
|
||||
|
||||
# Export Text format:
|
||||
if export_txt:
|
||||
fname = output_dir / f"{doc_filename}.txt"
|
||||
_log.info(f"writing TXT output to {fname}")
|
||||
conv_res.document.save_as_text(filename=fname)
|
||||
"""
|
||||
with fname.open("w", encoding="utf8") as fp:
|
||||
_log.info(f"writing Text output to {fname}")
|
||||
fp.write(conv_res.document.export_to_markdown(strict_text=True))
|
||||
|
||||
"""
|
||||
|
||||
# Export Markdown format:
|
||||
if export_md:
|
||||
fname = output_dir / f"{doc_filename}.md"
|
||||
_log.info(f"writing Markdown output to {fname}")
|
||||
conv_res.document.save_as_md(filename=fname, image_mode=image_export_mode)
|
||||
"""
|
||||
with fname.open("w", encoding="utf8") as fp:
|
||||
_log.info(f"writing Markdown output to {fname}")
|
||||
fp.write(conv_res.document.export_to_markdown())
|
||||
|
||||
"""
|
||||
|
||||
# Export Document Tags format:
|
||||
if export_doctags:
|
||||
fname = output_dir / f"{doc_filename}.doctags"
|
||||
_log.info(f"writing Doc Tags output to {fname}")
|
||||
conv_res.document.save_as_document_tokens(filename=fname)
|
||||
"""
|
||||
with fname.open("w", encoding="utf8") as fp:
|
||||
_log.info(f"writing Doc Tags output to {fname}")
|
||||
fp.write(conv_res.document.export_to_document_tokens())
|
||||
|
||||
"""
|
||||
else:
|
||||
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
||||
failure_count += 1
|
||||
@ -169,6 +191,13 @@ def convert(
|
||||
to_formats: List[OutputFormat] = typer.Option(
|
||||
None, "--to", help="Specify output formats. Defaults to Markdown."
|
||||
),
|
||||
image_export_mode: Annotated[
|
||||
ImageRefMode,
|
||||
typer.Option(
|
||||
...,
|
||||
help="Image export mode for the document (only in case of Markdown or HTML). In `placeholder`, only the position of the image is returned. In `embedded` mode, the image is contained in base64. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document."
|
||||
),
|
||||
] = ImageRefMode.EMBEDDED,
|
||||
ocr: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
@ -328,17 +357,20 @@ def convert(
|
||||
ocr_lang_list = _split_list(ocr_lang)
|
||||
if ocr_lang_list is not None:
|
||||
ocr_options.lang = ocr_lang_list
|
||||
|
||||
|
||||
pipeline_options = PdfPipelineOptions(
|
||||
do_ocr=ocr,
|
||||
ocr_options=ocr_options,
|
||||
do_table_structure=True,
|
||||
do_table_structure=True,
|
||||
)
|
||||
pipeline_options.table_structure_options.do_cell_matching = (
|
||||
True # do_cell_matching
|
||||
)
|
||||
pipeline_options.table_structure_options.mode = table_mode
|
||||
|
||||
if image_export_mode!=ImageRefMode.PLACEHOLDER:
|
||||
pipeline_options.generate_page_images = True
|
||||
|
||||
if artifacts_path is not None:
|
||||
pipeline_options.artifacts_path = artifacts_path
|
||||
|
||||
@ -361,7 +393,7 @@ def convert(
|
||||
allowed_formats=from_formats,
|
||||
format_options=format_options,
|
||||
)
|
||||
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
conv_results = doc_converter.convert_all(
|
||||
@ -377,6 +409,7 @@ def convert(
|
||||
export_md=export_md,
|
||||
export_txt=export_txt,
|
||||
export_doctags=export_doctags,
|
||||
image_export_mode = image_export_mode,
|
||||
)
|
||||
|
||||
end_time = time.time() - start_time
|
||||
|
@ -53,7 +53,7 @@ theme:
|
||||
- toc.follow
|
||||
nav:
|
||||
- Home:
|
||||
- "🦆 Docling": index.md
|
||||
- "Docling": index.md
|
||||
- Installation: installation.md
|
||||
- Usage: usage.md
|
||||
- CLI: cli.md
|
||||
|
Loading…
Reference in New Issue
Block a user