fix: updated the render_as_doctags with the new arguments from docling-core (#93)

* updated the render_as_doctags with the new arguments from docling-core

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* ensuring that docling-core is >1.5.0 to accommodate the latest export-to-doctags parameters

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* added the doctags tests

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* updated the README

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* fix poetry lock

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Fix formatting problems

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* fixed the doctag export in docling/utils/export.py

Signed-off-by: Peter Staar <taa@zurich.ibm.com>

* propagate xsize and ysize

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Peter Staar <taa@zurich.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Peter W. J. Staar
2024-09-23 20:12:18 +02:00
committed by GitHub
parent dce9934a0f
commit 4794ce460a
18 changed files with 3091 additions and 20 deletions

View File

@@ -368,20 +368,30 @@ class ConvertedDocument(BaseModel):
"table",
"figure",
],
page_tagging: bool = True,
location_tagging: bool = True,
location_dimensions: Tuple[int, int] = (100, 100),
add_new_line: bool = True,
xsize: int = 100,
ysize: int = 100,
add_location: bool = True,
add_content: bool = True,
add_page_index: bool = True,
# table specific flags
add_table_cell_location: bool = False,
add_table_cell_label: bool = True,
add_table_cell_text: bool = True,
) -> str:
return self.output.export_to_document_tokens(
delim=delim,
main_text_start=main_text_start,
main_text_stop=main_text_stop,
main_text_labels=main_text_labels,
page_tagging=page_tagging,
location_tagging=location_tagging,
location_dimensions=location_dimensions,
add_new_line=add_new_line,
xsize=xsize,
ysize=ysize,
add_location=add_location,
add_content=add_content,
add_page_index=add_page_index,
# table specific flags
add_table_cell_location=add_table_cell_location,
add_table_cell_label=add_table_cell_label,
add_table_cell_text=add_table_cell_text,
)
def render_element_images(

View File

@@ -111,7 +111,7 @@ def generate_multimodal_pages(
)
# No page-tagging since we only do 1 page at the time
content_dt = doc.export_to_document_tokens(
main_text_start=start_ix, main_text_stop=end_ix, page_tagging=False
main_text_start=start_ix, main_text_stop=end_ix, add_page_index=False
)
return content_text, content_md, content_dt, page_cells, page_segments, page