feat(cli): add option for html with split-page mode (#1355)

* update the CLI to output HTML in split-page mode

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* add pin for new docling-core with html split argument

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* relock with fixed html export in docling-core

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update test results

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update more tests

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update example

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update lock with docling-core fixes

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* update test results

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* re-add chunking extras

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Peter W. J. Staar
2025-04-14 08:41:50 +02:00
committed by GitHub
parent 0de70e7991
commit c0ba88edf1
17 changed files with 142 additions and 73 deletions

View File

@@ -154,6 +154,7 @@ def export_documents(
output_dir: Path,
export_json: bool,
export_html: bool,
export_html_split_page: bool,
export_md: bool,
export_txt: bool,
export_doctags: bool,
@@ -181,7 +182,15 @@ def export_documents(
fname = output_dir / f"{doc_filename}.html"
_log.info(f"writing HTML output to {fname}")
conv_res.document.save_as_html(
filename=fname, image_mode=image_export_mode
filename=fname, image_mode=image_export_mode, split_page_view=False
)
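# Export HTML split-page format:
# NOTE(review): this branch writes to the same "{doc_filename}.html" path as the
# plain HTML export, so requesting both formats leaves only one file — confirm intended.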
if export_html_split_page:
fname = output_dir / f"{doc_filename}.html"
_log.info(f"writing HTML output to {fname}")
conv_res.document.save_as_html(
filename=fname, image_mode=image_export_mode, split_page_view=True
)
# Export Text format:
@@ -472,6 +481,7 @@ def convert(
export_json = OutputFormat.JSON in to_formats
export_html = OutputFormat.HTML in to_formats
export_html_split_page = OutputFormat.HTML_SPLIT_PAGE in to_formats
export_md = OutputFormat.MARKDOWN in to_formats
export_txt = OutputFormat.TEXT in to_formats
export_doctags = OutputFormat.DOCTAGS in to_formats
@@ -585,6 +595,7 @@ def convert(
output_dir=output,
export_json=export_json,
export_html=export_html,
export_html_split_page=export_html_split_page,
export_md=export_md,
export_txt=export_txt,
export_doctags=export_doctags,

View File

@@ -50,6 +50,7 @@ class OutputFormat(str, Enum):
MARKDOWN = "md"
JSON = "json"
HTML = "html"
HTML_SPLIT_PAGE = "html_split_page"
TEXT = "text"
DOCTAGS = "doctags"