cleaning up the comments

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2024-12-06 10:45:06 +01:00
parent d6c314d7f1
commit 0769cb03bf

View File

@@ -10,11 +10,10 @@ from pathlib import Path
from typing import Annotated, Dict, Iterable, List, Optional, Type from typing import Annotated, Dict, Iterable, List, Optional, Type
import typer import typer
from docling_core.types.doc import ImageRefMode
from docling_core.utils.file import resolve_source_to_path from docling_core.utils.file import resolve_source_to_path
from pydantic import TypeAdapter, ValidationError from pydantic import TypeAdapter, ValidationError
from docling_core.types.doc import ImageRefMode
from docling.backend.docling_parse_backend import DoclingParseDocumentBackend from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend from docling.backend.pdf_backend import PdfDocumentBackend
@@ -93,7 +92,7 @@ def export_documents(
export_md: bool, export_md: bool,
export_txt: bool, export_txt: bool,
export_doctags: bool, export_doctags: bool,
image_export_mode: ImageRefMode image_export_mode: ImageRefMode,
): ):
success_count = 0 success_count = 0
@@ -107,57 +106,43 @@ def export_documents(
# Export JSON format: # Export JSON format:
if export_json: if export_json:
fname = output_dir / f"{doc_filename}.json" fname = output_dir / f"{doc_filename}.json"
_log.info(f"writing JSON output to {fname}") _log.info(f"writing JSON output to {fname}")
conv_res.document.save_as_json(filename=fname, image_mode=image_export_mode) conv_res.document.save_as_json(
""" filename=fname, image_mode=image_export_mode
fname = output_dir / f"{doc_filename}.json" )
with fname.open("w", encoding="utf8") as fp:
_log.info(f"writing JSON output to {fname}")
fp.write(json.dumps(conv_res.document.export_to_dict()))
"""
# Export HTML format: # Export HTML format:
if export_html: if export_html:
fname = output_dir / f"{doc_filename}.html" fname = output_dir / f"{doc_filename}.html"
_log.info(f"writing HTML output to {fname}") _log.info(f"writing HTML output to {fname}")
conv_res.document.save_as_html(filename=fname, image_mode=image_export_mode) conv_res.document.save_as_html(
""" filename=fname, image_mode=image_export_mode
with fname.open("w", encoding="utf8") as fp: )
_log.info(f"writing HTML output to {fname}")
fp.write(conv_res.document.export_to_html())
"""
# Export Text format: # Export Text format:
if export_txt: if export_txt:
fname = output_dir / f"{doc_filename}.txt" fname = output_dir / f"{doc_filename}.txt"
_log.info(f"writing TXT output to {fname}") _log.info(f"writing TXT output to {fname}")
conv_res.document.save_as_text(filename=fname) conv_res.document.save_as_markdown(
""" filename=fname,
with fname.open("w", encoding="utf8") as fp: strict_text=True,
_log.info(f"writing Text output to {fname}") image_mode=ImageRefMode.PLACEHOLDER,
fp.write(conv_res.document.export_to_markdown(strict_text=True)) )
"""
# Export Markdown format: # Export Markdown format:
if export_md: if export_md:
fname = output_dir / f"{doc_filename}.md" fname = output_dir / f"{doc_filename}.md"
_log.info(f"writing Markdown output to {fname}") _log.info(f"writing Markdown output to {fname}")
conv_res.document.save_as_md(filename=fname, image_mode=image_export_mode) conv_res.document.save_as_markdown(
""" filename=fname, image_mode=image_export_mode
with fname.open("w", encoding="utf8") as fp: )
_log.info(f"writing Markdown output to {fname}")
fp.write(conv_res.document.export_to_markdown())
"""
# Export Document Tags format: # Export Document Tags format:
if export_doctags: if export_doctags:
fname = output_dir / f"{doc_filename}.doctags" fname = output_dir / f"{doc_filename}.doctags"
_log.info(f"writing Doc Tags output to {fname}") _log.info(f"writing Doc Tags output to {fname}")
conv_res.document.save_as_document_tokens(filename=fname) conv_res.document.save_as_document_tokens(filename=fname)
"""
with fname.open("w", encoding="utf8") as fp:
fp.write(conv_res.document.export_to_document_tokens())
"""
else: else:
_log.warning(f"Document {conv_res.input.file} failed to convert.") _log.warning(f"Document {conv_res.input.file} failed to convert.")
failure_count += 1 failure_count += 1
@@ -195,7 +180,7 @@ def convert(
ImageRefMode, ImageRefMode,
typer.Option( typer.Option(
..., ...,
help="Image export mode for the document (only in case of Markdown or HTML). In `placeholder`, only the position of the image is returned. In `embedded` mode, the image is contained in base64. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document." help="Image export mode for the document (only in case of Markdown or HTML). In `placeholder`, only the position of the image is returned. In `embedded` mode, the image is contained in base64. In `referenced` mode, the image is exported in PNG format and referenced from the main exported document.",
), ),
] = ImageRefMode.EMBEDDED, ] = ImageRefMode.EMBEDDED,
ocr: Annotated[ ocr: Annotated[
@@ -357,20 +342,20 @@ def convert(
ocr_lang_list = _split_list(ocr_lang) ocr_lang_list = _split_list(ocr_lang)
if ocr_lang_list is not None: if ocr_lang_list is not None:
ocr_options.lang = ocr_lang_list ocr_options.lang = ocr_lang_list
pipeline_options = PdfPipelineOptions( pipeline_options = PdfPipelineOptions(
do_ocr=ocr, do_ocr=ocr,
ocr_options=ocr_options, ocr_options=ocr_options,
do_table_structure=True, do_table_structure=True,
) )
pipeline_options.table_structure_options.do_cell_matching = ( pipeline_options.table_structure_options.do_cell_matching = (
True # do_cell_matching True # do_cell_matching
) )
pipeline_options.table_structure_options.mode = table_mode pipeline_options.table_structure_options.mode = table_mode
if image_export_mode!=ImageRefMode.PLACEHOLDER: if image_export_mode != ImageRefMode.PLACEHOLDER:
pipeline_options.generate_page_images = True pipeline_options.generate_page_images = True
if artifacts_path is not None: if artifacts_path is not None:
pipeline_options.artifacts_path = artifacts_path pipeline_options.artifacts_path = artifacts_path
@@ -393,7 +378,7 @@ def convert(
allowed_formats=from_formats, allowed_formats=from_formats,
format_options=format_options, format_options=format_options,
) )
start_time = time.time() start_time = time.time()
conv_results = doc_converter.convert_all( conv_results = doc_converter.convert_all(
@@ -409,7 +394,7 @@ def convert(
export_md=export_md, export_md=export_md,
export_txt=export_txt, export_txt=export_txt,
export_doctags=export_doctags, export_doctags=export_doctags,
image_export_mode = image_export_mode, image_export_mode=image_export_mode,
) )
end_time = time.time() - start_time end_time = time.time() - start_time