From 8891c6653615768996b64e463dd26eed4a6bbe18 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Thu, 10 Apr 2025 13:58:59 +0200 Subject: [PATCH] updated the cli to output html in split-page mode Signed-off-by: Peter Staar --- docling/cli/main.py | 13 ++++++++++++- docling/datamodel/base_models.py | 1 + 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/docling/cli/main.py b/docling/cli/main.py index e0f0cbd8..4f266638 100644 --- a/docling/cli/main.py +++ b/docling/cli/main.py @@ -153,6 +153,7 @@ def export_documents( output_dir: Path, export_json: bool, export_html: bool, + export_html_split_page: bool, export_md: bool, export_txt: bool, export_doctags: bool, @@ -180,7 +181,15 @@ def export_documents( fname = output_dir / f"{doc_filename}.html" _log.info(f"writing HTML output to {fname}") conv_res.document.save_as_html( - filename=fname, image_mode=image_export_mode + filename=fname, image_mode=image_export_mode, split_page_view=False + ) + + # Export HTML format: + if export_html_split_page: + fname = output_dir / f"{doc_filename}.html" + _log.info(f"writing HTML output to {fname}") + conv_res.document.save_as_html( + filename=fname, image_mode=image_export_mode, split_page_view=True ) # Export Text format: @@ -471,6 +480,7 @@ def convert( export_json = OutputFormat.JSON in to_formats export_html = OutputFormat.HTML in to_formats + export_html_split_page = OutputFormat.HTML_SPLIT_PAGE in to_formats export_md = OutputFormat.MARKDOWN in to_formats export_txt = OutputFormat.TEXT in to_formats export_doctags = OutputFormat.DOCTAGS in to_formats @@ -578,6 +588,7 @@ def convert( output_dir=output, export_json=export_json, export_html=export_html, + export_html_split_page=export_html_split_page, export_md=export_md, export_txt=export_txt, export_doctags=export_doctags, diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 76827a1b..b3f6c2aa 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -50,6 +50,7 @@ class OutputFormat(str, Enum): MARKDOWN = "md" JSON = "json" HTML = "html" + HTML_SPLIT_PAGE = "html_split_page" TEXT = "text" DOCTAGS = "doctags"