Merge branch 'cau/input-format-abstraction' of github.com:DS4SD/docling into cau/input-format-abstraction

This commit is contained in:
Christoph Auer 2024-10-14 16:54:56 +02:00
commit b964c4bb69

View File

@@ -109,7 +109,9 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
def handle_text_elements(self, shape, parent_slide, slide_ind, doc): def handle_text_elements(self, shape, parent_slide, slide_ind, doc):
is_a_list = False is_a_list = False
enum_list_item_value = 0
for paragraph in shape.text_frame.paragraphs: for paragraph in shape.text_frame.paragraphs:
enum_list_item_value += 1
bullet_type = "None" bullet_type = "None"
# Check if paragraph is a bullet point using the `element` XML # Check if paragraph is a bullet point using the `element` XML
p = paragraph._element p = paragraph._element
@@ -157,7 +159,7 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}): for e in p.iterfind(".//a:r", namespaces={"a": self.namespaces["a"]}):
if len(e.text.strip()) > 0: if len(e.text.strip()) > 0:
e_is_a_list_item = False e_is_a_list_item = False
is_numbered = False
if ( if (
p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]}) p.find(".//a:buChar", namespaces={"a": self.namespaces["a"]})
is not None is not None
@@ -169,13 +171,17 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
is not None is not None
): ):
bullet_type = "Numbered" bullet_type = "Numbered"
is_numbered = True
e_is_a_list_item = True e_is_a_list_item = True
else: else:
e_is_a_list_item = False e_is_a_list_item = False
if e_is_a_list_item: if e_is_a_list_item:
# TODO: Set marker and enumerated arguments if this is an enumeration element. # TODO: Set marker and enumerated arguments if this is an enumeration element.
enum_marker = str(enum_list_item_value) + "."
doc.add_list_item( doc.add_list_item(
marker=enum_marker,
enumerated=is_numbered,
parent=new_list, parent=new_list,
text=list_text, text=list_text,
prov=prov, prov=prov,
@@ -195,6 +201,8 @@ class MsPowerpointDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentB
elif placeholder_type == PP_PLACEHOLDER.SUBTITLE: elif placeholder_type == PP_PLACEHOLDER.SUBTITLE:
DocItemLabel.SECTION_HEADER DocItemLabel.SECTION_HEADER
enum_list_item_value = 1
doc.add_text( doc.add_text(
label=doc_label, label=doc_label,
parent=parent_slide, parent=parent_slide,