mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 07:22:14 +00:00
cleaning up the comments
Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
parent
d6c314d7f1
commit
0769cb03bf
@ -10,11 +10,10 @@ from pathlib import Path
|
|||||||
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
from typing import Annotated, Dict, Iterable, List, Optional, Type
|
||||||
|
|
||||||
import typer
|
import typer
|
||||||
|
from docling_core.types.doc import ImageRefMode
|
||||||
from docling_core.utils.file import resolve_source_to_path
|
from docling_core.utils.file import resolve_source_to_path
|
||||||
from pydantic import TypeAdapter, ValidationError
|
from pydantic import TypeAdapter, ValidationError
|
||||||
|
|
||||||
from docling_core.types.doc import ImageRefMode
|
|
||||||
|
|
||||||
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||||
@ -93,7 +92,7 @@ def export_documents(
|
|||||||
export_md: bool,
|
export_md: bool,
|
||||||
export_txt: bool,
|
export_txt: bool,
|
||||||
export_doctags: bool,
|
export_doctags: bool,
|
||||||
image_export_mode: ImageRefMode
|
image_export_mode: ImageRefMode,
|
||||||
):
|
):
|
||||||
|
|
||||||
success_count = 0
|
success_count = 0
|
||||||
@ -108,56 +107,42 @@ def export_documents(
|
|||||||
if export_json:
|
if export_json:
|
||||||
fname = output_dir / f"{doc_filename}.json"
|
fname = output_dir / f"{doc_filename}.json"
|
||||||
_log.info(f"writing JSON output to {fname}")
|
_log.info(f"writing JSON output to {fname}")
|
||||||
conv_res.document.save_as_json(filename=fname, image_mode=image_export_mode)
|
conv_res.document.save_as_json(
|
||||||
"""
|
filename=fname, image_mode=image_export_mode
|
||||||
fname = output_dir / f"{doc_filename}.json"
|
)
|
||||||
with fname.open("w", encoding="utf8") as fp:
|
|
||||||
_log.info(f"writing JSON output to {fname}")
|
|
||||||
fp.write(json.dumps(conv_res.document.export_to_dict()))
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Export HTML format:
|
# Export HTML format:
|
||||||
if export_html:
|
if export_html:
|
||||||
fname = output_dir / f"{doc_filename}.html"
|
fname = output_dir / f"{doc_filename}.html"
|
||||||
_log.info(f"writing HTML output to {fname}")
|
_log.info(f"writing HTML output to {fname}")
|
||||||
conv_res.document.save_as_html(filename=fname, image_mode=image_export_mode)
|
conv_res.document.save_as_html(
|
||||||
"""
|
filename=fname, image_mode=image_export_mode
|
||||||
with fname.open("w", encoding="utf8") as fp:
|
)
|
||||||
_log.info(f"writing HTML output to {fname}")
|
|
||||||
fp.write(conv_res.document.export_to_html())
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Export Text format:
|
# Export Text format:
|
||||||
if export_txt:
|
if export_txt:
|
||||||
fname = output_dir / f"{doc_filename}.txt"
|
fname = output_dir / f"{doc_filename}.txt"
|
||||||
_log.info(f"writing TXT output to {fname}")
|
_log.info(f"writing TXT output to {fname}")
|
||||||
conv_res.document.save_as_text(filename=fname)
|
conv_res.document.save_as_markdown(
|
||||||
"""
|
filename=fname,
|
||||||
with fname.open("w", encoding="utf8") as fp:
|
strict_text=True,
|
||||||
_log.info(f"writing Text output to {fname}")
|
image_mode=ImageRefMode.PLACEHOLDER,
|
||||||
fp.write(conv_res.document.export_to_markdown(strict_text=True))
|
)
|
||||||
"""
|
|
||||||
|
|
||||||
# Export Markdown format:
|
# Export Markdown format:
|
||||||
if export_md:
|
if export_md:
|
||||||
fname = output_dir / f"{doc_filename}.md"
|
fname = output_dir / f"{doc_filename}.md"
|
||||||
_log.info(f"writing Markdown output to {fname}")
|
_log.info(f"writing Markdown output to {fname}")
|
||||||
conv_res.document.save_as_md(filename=fname, image_mode=image_export_mode)
|
conv_res.document.save_as_markdown(
|
||||||
"""
|
filename=fname, image_mode=image_export_mode
|
||||||
with fname.open("w", encoding="utf8") as fp:
|
)
|
||||||
_log.info(f"writing Markdown output to {fname}")
|
|
||||||
fp.write(conv_res.document.export_to_markdown())
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Export Document Tags format:
|
# Export Document Tags format:
|
||||||
if export_doctags:
|
if export_doctags:
|
||||||
fname = output_dir / f"{doc_filename}.doctags"
|
fname = output_dir / f"{doc_filename}.doctags"
|
||||||
_log.info(f"writing Doc Tags output to {fname}")
|
_log.info(f"writing Doc Tags output to {fname}")
|
||||||
conv_res.document.save_as_document_tokens(filename=fname)
|
conv_res.document.save_as_document_tokens(filename=fname)
|
||||||
"""
|
|
||||||
with fname.open("w", encoding="utf8") as fp:
|
|
||||||
fp.write(conv_res.document.export_to_document_tokens())
|
|
||||||
"""
|
|
||||||
else:
|
else:
|
||||||
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
_log.warning(f"Document {conv_res.input.file} failed to convert.")
|
||||||
failure_count += 1
|
failure_count += 1
|
||||||
@ -195,7 +180,7 @@ def convert(
|
|||||||
ImageRefMode,
|
ImageRefMode,
|
||||||
typer.Option(
|
typer.Option(
|
||||||
...,
|
...,
|
||||||
help="Image export mode for the document (only in case of Markdown or HTML). In `placeholder`, only the position of the image is returned. In `embedded` mode, the image is contained in base64. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document."
|
help="Image export mode for the document (only in case of Markdown or HTML). In `placeholder`, only the position of the image is returned. In `embedded` mode, the image is contained in base64. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
|
||||||
),
|
),
|
||||||
] = ImageRefMode.EMBEDDED,
|
] = ImageRefMode.EMBEDDED,
|
||||||
ocr: Annotated[
|
ocr: Annotated[
|
||||||
|
Loading…
Reference in New Issue
Block a user