From b46ae1af56e41882544e466305cc369206fca47d Mon Sep 17 00:00:00 2001 From: Maksym Lysak Date: Tue, 12 Nov 2024 14:26:10 +0100 Subject: [PATCH] proceed processing the content of single cell table as if it's just part of the body Signed-off-by: Maksym Lysak --- docling/backend/msword_backend.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 00eeaa5f..eb7b75cb 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -130,7 +130,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): def walk_linear(self, body, docx_obj, doc) -> DoclingDocument: for element in body: tag_name = etree.QName(element).localname - # Check for Inline Images (drawings or blip elements) found_drawing = etree.ElementBase.xpath( element, ".//w:drawing", namespaces=self.xml_namespaces @@ -164,8 +163,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): return default def get_numId_and_ilvl(self, paragraph): - if not hasattr(paragraph._element, "find"): - return None, None # Access the XML element of the paragraph numPr = paragraph._element.find( ".//w:numPr", namespaces=paragraph._element.nsmap @@ -448,17 +445,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): for row in table.rows: # Calculate the max number of columns num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells)) - # if row.cells: - # num_cols = max(num_cols, len(row.cells)) - print("num_rows = {}, num_cols = {}".format(num_rows, num_cols)) - if num_rows == 1: - if num_cols == 1: - cell_element = table.rows[0].cells[0] - for paragraph in cell_element.paragraphs: - # print(paragraph.text) - self.handle_text_elements(paragraph, docx_obj, doc) - return + if num_rows == 1 and num_cols == 1: + cell_element = table.rows[0].cells[0] + # In case we have a table of only 1 cell, we consider it furniture + # and proceed to process the content of the cell as 
though it's in the document body + self.walk_linear(cell_element._element, docx_obj, doc) + return # Initialize the table grid table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]