From 76d904164eefae195133a98ffbb24b52b9ed21e4 Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Wed, 23 Oct 2024 15:33:00 +0200 Subject: [PATCH] Fixed issue with group ordering in pptx backend, added debug log into run with formats Signed-off-by: Maksym Lysak --- docling/backend/mspowerpoint_backend.py | 20 +++++++++++++------- docs/examples/run_with_formats.py | 2 +- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index 1f939f2e..cbec761c 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -112,6 +112,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB def handle_text_elements(self, shape, parent_slide, slide_ind, doc): is_a_list = False + is_list_group_created = False enum_list_item_value = 0 new_list = None bullet_type = "None" @@ -153,6 +154,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB else: _log.debug("No List") + # If there is a list inside of the shape, create a new docling list to assign list items to + # if is_a_list: + # new_list = doc.add_group( + # label=list_label, name=f"list", parent=parent_slide + # ) + # Iterate through paragraphs to build up text for paragraph in shape.text_frame.paragraphs: # p_text = paragraph.text.strip() @@ -219,16 +226,15 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB prov=prov, ) - # If there is a list inside of the shape, create a new docling list to assign list items to - if is_a_list: - new_list = doc.add_group( - label=list_label, name=f"list", parent=parent_slide - ) - if len(inline_list_item_text) > 0: enum_marker = "" if is_numbered: enum_marker = str(enum_list_item_value) + "." 
+ if not is_list_group_created: + new_list = doc.add_group( + label=list_label, name=f"list", parent=parent_slide + ) + is_list_group_created = True doc.add_list_item( marker=enum_marker, enumerated=is_numbered, @@ -328,7 +334,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB if len(tcells) > 0: # If table is not fully empty... # Create Docling table - doc.add_table(data=data, prov=prov) + doc.add_table(parent=parent_slide, data=data, prov=prov) return def walk_linear(self, pptx_obj, doc) -> DoclingDocument: diff --git a/docs/examples/run_with_formats.py b/docs/examples/run_with_formats.py index 00f649b4..a3b62b2d 100644 --- a/docs/examples/run_with_formats.py +++ b/docs/examples/run_with_formats.py @@ -65,7 +65,7 @@ def main(): f"Document {res.input.file.name} converted." f"\nSaved markdown output to: {str(out_path)}" ) - # print(res.docdocument.export_to_markdown()) + _log.debug(res.document._export_to_indented_text(max_text_len=16)) # Export Docling document format to markdowndoc: with (out_path / f"{res.input.file.stem}.md").open("w") as fp: fp.write(res.document.export_to_markdown())