Updated all the examples to deal with the new rendering

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
This commit is contained in:
Peter Staar 2024-09-10 15:22:48 +02:00
parent fd23432f6e
commit 4f89470ab1
3 changed files with 16 additions and 1 deletions

View File

@ -163,8 +163,12 @@ def generate_multimodal_pages(
content_md = doc.export_to_markdown( content_md = doc.export_to_markdown(
main_text_start=start_ix, main_text_stop=end_ix main_text_start=start_ix, main_text_stop=end_ix
) )
# No page-tagging since we only process one page at a time
content_dt = doc.export_to_document_tokens(
main_text_start=start_ix, main_text_stop=end_ix, page_tagging=False
)
return content_text, content_md, page_cells, page_segments, page return content_text, content_md, content_dt, page_cells, page_segments, page
for ix, orig_item in enumerate(doc.main_text): for ix, orig_item in enumerate(doc.main_text):

View File

@ -31,9 +31,18 @@ def export_documents(
with (output_dir / f"{doc_filename}.json").open("w") as fp: with (output_dir / f"{doc_filename}.json").open("w") as fp:
fp.write(json.dumps(conv_res.render_as_dict())) fp.write(json.dumps(conv_res.render_as_dict()))
# Export Text format:
with (output_dir / f"{doc_filename}.txt").open("w") as fp:
fp.write(conv_res.render_as_text())
# Export Markdown format: # Export Markdown format:
with (output_dir / f"{doc_filename}.md").open("w") as fp: with (output_dir / f"{doc_filename}.md").open("w") as fp:
fp.write(conv_res.render_as_markdown()) fp.write(conv_res.render_as_markdown())
# Export Document Tags format:
with (output_dir / f"{doc_filename}.doctags").open("w") as fp:
fp.write(conv_res.render_as_doctags())
else: else:
_log.info(f"Document {conv_res.input.file} failed to convert.") _log.info(f"Document {conv_res.input.file} failed to convert.")
failure_count += 1 failure_count += 1

View File

@ -51,6 +51,7 @@ def main():
for ( for (
content_text, content_text,
content_md, content_md,
content_dt,
page_cells, page_cells,
page_segments, page_segments,
page, page,
@ -71,6 +72,7 @@ def main():
"cells": page_cells, "cells": page_cells,
"contents": content_text, "contents": content_text,
"contents_md": content_md, "contents_md": content_md,
"contents_dt": content_dt,
"segments": page_segments, "segments": page_segments,
"extra": { "extra": {
"page_num": page.page_no + 1, "page_num": page.page_no + 1,