mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
Fixed issue with group ordering in pptx backend, added debug log into run with formats
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
82126e3871
commit
76d904164e
@ -112,6 +112,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
|
|
||||||
def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
|
def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
|
||||||
is_a_list = False
|
is_a_list = False
|
||||||
|
is_list_group_created = False
|
||||||
enum_list_item_value = 0
|
enum_list_item_value = 0
|
||||||
new_list = None
|
new_list = None
|
||||||
bullet_type = "None"
|
bullet_type = "None"
|
||||||
@ -153,6 +154,12 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
else:
|
else:
|
||||||
_log.debug("No List")
|
_log.debug("No List")
|
||||||
|
|
||||||
|
# If there is a list inside of the shape, create a new docling list to assign list items to
|
||||||
|
# if is_a_list:
|
||||||
|
# new_list = doc.add_group(
|
||||||
|
# label=list_label, name=f"list", parent=parent_slide
|
||||||
|
# )
|
||||||
|
|
||||||
# Iterate through paragraphs to build up text
|
# Iterate through paragraphs to build up text
|
||||||
for paragraph in shape.text_frame.paragraphs:
|
for paragraph in shape.text_frame.paragraphs:
|
||||||
# p_text = paragraph.text.strip()
|
# p_text = paragraph.text.strip()
|
||||||
@ -219,16 +226,15 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
prov=prov,
|
prov=prov,
|
||||||
)
|
)
|
||||||
|
|
||||||
# If there is a list inside of the shape, create a new docling list to assign list items to
|
|
||||||
if is_a_list:
|
|
||||||
new_list = doc.add_group(
|
|
||||||
label=list_label, name=f"list", parent=parent_slide
|
|
||||||
)
|
|
||||||
|
|
||||||
if len(inline_list_item_text) > 0:
|
if len(inline_list_item_text) > 0:
|
||||||
enum_marker = ""
|
enum_marker = ""
|
||||||
if is_numbered:
|
if is_numbered:
|
||||||
enum_marker = str(enum_list_item_value) + "."
|
enum_marker = str(enum_list_item_value) + "."
|
||||||
|
if not is_list_group_created:
|
||||||
|
new_list = doc.add_group(
|
||||||
|
label=list_label, name=f"list", parent=parent_slide
|
||||||
|
)
|
||||||
|
is_list_group_created = True
|
||||||
doc.add_list_item(
|
doc.add_list_item(
|
||||||
marker=enum_marker,
|
marker=enum_marker,
|
||||||
enumerated=is_numbered,
|
enumerated=is_numbered,
|
||||||
@ -328,7 +334,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
|
|||||||
if len(tcells) > 0:
|
if len(tcells) > 0:
|
||||||
# If table is not fully empty...
|
# If table is not fully empty...
|
||||||
# Create Docling table
|
# Create Docling table
|
||||||
doc.add_table(data=data, prov=prov)
|
doc.add_table(parent=parent_slide, data=data, prov=prov)
|
||||||
return
|
return
|
||||||
|
|
||||||
def walk_linear(self, pptx_obj, doc) -> DoclingDocument:
|
def walk_linear(self, pptx_obj, doc) -> DoclingDocument:
|
||||||
|
@ -65,7 +65,7 @@ def main():
|
|||||||
f"Document {res.input.file.name} converted."
|
f"Document {res.input.file.name} converted."
|
||||||
f"\nSaved markdown output to: {str(out_path)}"
|
f"\nSaved markdown output to: {str(out_path)}"
|
||||||
)
|
)
|
||||||
# print(res.docdocument.export_to_markdown())
|
_log.debug(res.document._export_to_indented_text(max_text_len=16))
|
||||||
# Export Docling document format to markdowndoc:
|
# Export Docling document format to markdowndoc:
|
||||||
with (out_path / f"{res.input.file.stem}.md").open("w") as fp:
|
with (out_path / f"{res.input.file.stem}.md").open("w") as fp:
|
||||||
fp.write(res.document.export_to_markdown())
|
fp.write(res.document.export_to_markdown())
|
||||||
|
Loading…
Reference in New Issue
Block a user