mirror of
https://github.com/DS4SD/docling.git
synced 2025-08-02 15:32:30 +00:00
Proceed with processing the content of a single-cell table as if it's just part of the body
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
f7b58dfa51
commit
b46ae1af56
@ -130,7 +130,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
||||||
for element in body:
|
for element in body:
|
||||||
tag_name = etree.QName(element).localname
|
tag_name = etree.QName(element).localname
|
||||||
|
|
||||||
# Check for Inline Images (drawings or blip elements)
|
# Check for Inline Images (drawings or blip elements)
|
||||||
found_drawing = etree.ElementBase.xpath(
|
found_drawing = etree.ElementBase.xpath(
|
||||||
element, ".//w:drawing", namespaces=self.xml_namespaces
|
element, ".//w:drawing", namespaces=self.xml_namespaces
|
||||||
@ -164,8 +163,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
return default
|
return default
|
||||||
|
|
||||||
def get_numId_and_ilvl(self, paragraph):
|
def get_numId_and_ilvl(self, paragraph):
|
||||||
if not hasattr(paragraph._element, "find"):
|
|
||||||
return None, None
|
|
||||||
# Access the XML element of the paragraph
|
# Access the XML element of the paragraph
|
||||||
numPr = paragraph._element.find(
|
numPr = paragraph._element.find(
|
||||||
".//w:numPr", namespaces=paragraph._element.nsmap
|
".//w:numPr", namespaces=paragraph._element.nsmap
|
||||||
@ -448,16 +445,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
for row in table.rows:
|
for row in table.rows:
|
||||||
# Calculate the max number of columns
|
# Calculate the max number of columns
|
||||||
num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells))
|
num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells))
|
||||||
# if row.cells:
|
|
||||||
# num_cols = max(num_cols, len(row.cells))
|
|
||||||
|
|
||||||
print("num_rows = {}, num_cols = {}".format(num_rows, num_cols))
|
if num_rows == 1 and num_cols == 1:
|
||||||
if num_rows == 1:
|
|
||||||
if num_cols == 1:
|
|
||||||
cell_element = table.rows[0].cells[0]
|
cell_element = table.rows[0].cells[0]
|
||||||
for paragraph in cell_element.paragraphs:
|
# In case we have a table of only 1 cell, we consider it furniture
|
||||||
# print(paragraph.text)
|
# And proceed processing the content of the cell as though it's in the document body
|
||||||
self.handle_text_elements(paragraph, docx_obj, doc)
|
self.walk_linear(cell_element._element, docx_obj, doc)
|
||||||
return
|
return
|
||||||
|
|
||||||
# Initialize the table grid
|
# Initialize the table grid
|
||||||
|
Loading…
Reference in New Issue
Block a user