From 4f89470ab1751a3769906e9117cff03783b7cd62 Mon Sep 17 00:00:00 2001 From: Peter Staar Date: Tue, 10 Sep 2024 15:22:48 +0200 Subject: [PATCH] updated all the examples to deal with new rendering Signed-off-by: Peter Staar --- docling/utils/export.py | 6 +++++- examples/custom_convert.py | 9 +++++++++ examples/export_multimodal.py | 2 ++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/docling/utils/export.py b/docling/utils/export.py index 31b17ea8..f438ed1d 100644 --- a/docling/utils/export.py +++ b/docling/utils/export.py @@ -163,8 +163,12 @@ def generate_multimodal_pages( content_md = doc.export_to_markdown( main_text_start=start_ix, main_text_stop=end_ix ) + # No page-tagging since we only process one page at a time + content_dt = doc.export_to_document_tokens( + main_text_start=start_ix, main_text_stop=end_ix, page_tagging=False + ) - return content_text, content_md, page_cells, page_segments, page + return content_text, content_md, content_dt, page_cells, page_segments, page for ix, orig_item in enumerate(doc.main_text): diff --git a/examples/custom_convert.py b/examples/custom_convert.py index d4ead056..6f0b8f8f 100644 --- a/examples/custom_convert.py +++ b/examples/custom_convert.py @@ -31,9 +31,18 @@ def export_documents( with (output_dir / f"{doc_filename}.json").open("w") as fp: fp.write(json.dumps(conv_res.render_as_dict())) + # Export Text format: + with (output_dir / f"{doc_filename}.txt").open("w") as fp: + fp.write(conv_res.render_as_text()) + # Export Markdown format: with (output_dir / f"{doc_filename}.md").open("w") as fp: fp.write(conv_res.render_as_markdown()) + + # Export Document Tags format: + with (output_dir / f"{doc_filename}.doctags").open("w") as fp: + fp.write(conv_res.render_as_doctags()) + else: _log.info(f"Document {conv_res.input.file} failed to convert.") failure_count += 1 diff --git a/examples/export_multimodal.py b/examples/export_multimodal.py index cc0b9fab..7c016b19 100644 --- a/examples/export_multimodal.py +++ 
b/examples/export_multimodal.py @@ -51,6 +51,7 @@ def main(): for ( content_text, content_md, + content_dt, page_cells, page_segments, page, @@ -71,6 +72,7 @@ def main(): "cells": page_cells, "contents": content_text, "contents_md": content_md, + "contents_dt": content_dt, "segments": page_segments, "extra": { "page_num": page.page_no + 1,