Proceed with processing the content of a single-cell table as if it's just part of the body

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maksym Lysak 2024-11-12 14:26:10 +01:00
parent f7b58dfa51
commit b46ae1af56

View File

@ -130,7 +130,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument: def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
for element in body: for element in body:
tag_name = etree.QName(element).localname tag_name = etree.QName(element).localname
# Check for Inline Images (drawings or blip elements) # Check for Inline Images (drawings or blip elements)
found_drawing = etree.ElementBase.xpath( found_drawing = etree.ElementBase.xpath(
element, ".//w:drawing", namespaces=self.xml_namespaces element, ".//w:drawing", namespaces=self.xml_namespaces
@ -164,8 +163,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
return default return default
def get_numId_and_ilvl(self, paragraph): def get_numId_and_ilvl(self, paragraph):
if not hasattr(paragraph._element, "find"):
return None, None
# Access the XML element of the paragraph # Access the XML element of the paragraph
numPr = paragraph._element.find( numPr = paragraph._element.find(
".//w:numPr", namespaces=paragraph._element.nsmap ".//w:numPr", namespaces=paragraph._element.nsmap
@ -448,16 +445,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
for row in table.rows: for row in table.rows:
# Calculate the max number of columns # Calculate the max number of columns
num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells)) num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells))
# if row.cells:
# num_cols = max(num_cols, len(row.cells))
print("num_rows = {}, num_cols = {}".format(num_rows, num_cols)) if num_rows == 1 and num_cols == 1:
if num_rows == 1:
if num_cols == 1:
cell_element = table.rows[0].cells[0] cell_element = table.rows[0].cells[0]
for paragraph in cell_element.paragraphs: # In case we have a table of only 1 cell, we consider it furniture
# print(paragraph.text) # And proceed processing the content of the cell as though it's in the document body
self.handle_text_elements(paragraph, docx_obj, doc) self.walk_linear(cell_element._element, docx_obj, doc)
return return
# Initialize the table grid # Initialize the table grid