Handling of single-cell tables in DOCX backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
Maksym Lysak 2024-11-12 09:43:37 +01:00
parent 81c8243a8b
commit 9569214afb

View File

@ -141,10 +141,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# Check for Tables
if element.tag.endswith("tbl"):
try:
self.handle_tables(element, docx_obj, doc)
except Exception:
_log.debug("could not parse a table, broken docx table")
# try:
self.handle_tables(element, docx_obj, doc)
# except Exception:
# _log.debug("could not parse a table, broken docx table")
elif found_drawing or found_pict:
self.handle_pictures(element, docx_obj, doc)
@ -164,6 +164,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
return default
def get_numId_and_ilvl(self, paragraph):
if not hasattr(paragraph._element, "find"):
return None, None
# Access the XML element of the paragraph
numPr = paragraph._element.find(
".//w:numPr", namespaces=paragraph._element.nsmap
@ -201,7 +203,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
label_str = ""
label_level = 0
if parts[0] == "Heading":
# print("{} - {}".format(parts[0], parts[1]))
label_str = parts[0]
label_level = self.str_to_int(parts[1], default=None)
if parts[1] == "Heading":
@ -212,24 +213,25 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
return label, None
def handle_text_elements(self, element, docx_obj, doc):
# if hasattr(element, "text"):
# paragraph = element
# else:
paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
if paragraph.text is None:
# _log.warn(f"paragraph has text==None")
return
text = paragraph.text.strip()
# if len(text)==0 # keep empty paragraphs, they separate adjacent lists!
# Common styles for bullet and numbered lists.
# "List Bullet", "List Number", "List Paragraph"
# TODO: reliably identify whether list is a numbered list or not
# Identify whether list is a numbered list or not
# is_numbered = "List Bullet" not in paragraph.style.name
is_numbered = False
p_style_name, p_level = self.get_label_and_level(paragraph)
numid, ilevel = self.get_numId_and_ilvl(paragraph)
# print("numid: {}, ilevel: {}, text: {}".format(numid, ilevel, text))
if numid == 0:
numid = None
@ -453,6 +455,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
# if row.cells:
# num_cols = max(num_cols, len(row.cells))
print("num_rows = {}, num_cols = {}".format(num_rows, num_cols))
if num_rows == 1:
if num_cols == 1:
cell_element = table.rows[0].cells[0]
for paragraph in cell_element.paragraphs:
# print(paragraph.text)
self.handle_text_elements(paragraph, docx_obj, doc)
return
# Initialize the table grid
table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]