Fixed issue with group ordering in pptx backend, added debug log into run with formats

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maksym Lysak 2024-10-23 15:33:00 +02:00
parent 82126e3871
commit 76d904164e
2 changed files with 14 additions and 8 deletions

View File

@ -112,6 +112,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
def handle_text_elements(self, shape, parent_slide, slide_ind, doc): def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
is_a_list = False is_a_list = False
is_list_group_created = False
enum_list_item_value = 0 enum_list_item_value = 0
new_list = None new_list = None
bullet_type = "None" bullet_type = "None"
@ -153,6 +154,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
else: else:
_log.debug("No List") _log.debug("No List")
# If there is a list inside of the shape, create a new docling list to assign list items to
# if is_a_list:
# new_list = doc.add_group(
# label=list_label, name=f"list", parent=parent_slide
# )
# Iterate through paragraphs to build up text # Iterate through paragraphs to build up text
for paragraph in shape.text_frame.paragraphs: for paragraph in shape.text_frame.paragraphs:
# p_text = paragraph.text.strip() # p_text = paragraph.text.strip()
@ -219,16 +226,15 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
prov=prov, prov=prov,
) )
# If there is a list inside of the shape, create a new docling list to assign list items to
if is_a_list:
new_list = doc.add_group(
label=list_label, name=f"list", parent=parent_slide
)
if len(inline_list_item_text) > 0: if len(inline_list_item_text) > 0:
enum_marker = "" enum_marker = ""
if is_numbered: if is_numbered:
enum_marker = str(enum_list_item_value) + "." enum_marker = str(enum_list_item_value) + "."
if not is_list_group_created:
new_list = doc.add_group(
label=list_label, name=f"list", parent=parent_slide
)
is_list_group_created = True
doc.add_list_item( doc.add_list_item(
marker=enum_marker, marker=enum_marker,
enumerated=is_numbered, enumerated=is_numbered,
@ -328,7 +334,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
if len(tcells) > 0: if len(tcells) > 0:
# If table is not fully empty... # If table is not fully empty...
# Create Docling table # Create Docling table
doc.add_table(data=data, prov=prov) doc.add_table(parent=parent_slide, data=data, prov=prov)
return return
def walk_linear(self, pptx_obj, doc) -> DoclingDocument: def walk_linear(self, pptx_obj, doc) -> DoclingDocument:

View File

@ -65,7 +65,7 @@ def main():
f"Document {res.input.file.name} converted." f"Document {res.input.file.name} converted."
f"\nSaved markdown output to: {str(out_path)}" f"\nSaved markdown output to: {str(out_path)}"
) )
# print(res.docdocument.export_to_markdown()) _log.debug(res.document._export_to_indented_text(max_text_len=16))
# Export Docling document format to markdowndoc: # Export Docling document format to markdowndoc:
with (out_path / f"{res.input.file.stem}.md").open("w") as fp: with (out_path / f"{res.input.file.stem}.md").open("w") as fp:
fp.write(res.document.export_to_markdown()) fp.write(res.document.export_to_markdown())