mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-31 14:34:40 +00:00
Handling of single-cell tables in DOCX backend
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
81c8243a8b
commit
9569214afb
@ -141,10 +141,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
|
|
||||||
# Check for Tables
|
# Check for Tables
|
||||||
if element.tag.endswith("tbl"):
|
if element.tag.endswith("tbl"):
|
||||||
try:
|
# try:
|
||||||
self.handle_tables(element, docx_obj, doc)
|
self.handle_tables(element, docx_obj, doc)
|
||||||
except Exception:
|
# except Exception:
|
||||||
_log.debug("could not parse a table, broken docx table")
|
# _log.debug("could not parse a table, broken docx table")
|
||||||
|
|
||||||
elif found_drawing or found_pict:
|
elif found_drawing or found_pict:
|
||||||
self.handle_pictures(element, docx_obj, doc)
|
self.handle_pictures(element, docx_obj, doc)
|
||||||
@ -164,6 +164,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
return default
|
return default
|
||||||
|
|
||||||
def get_numId_and_ilvl(self, paragraph):
|
def get_numId_and_ilvl(self, paragraph):
|
||||||
|
if not hasattr(paragraph._element, "find"):
|
||||||
|
return None, None
|
||||||
# Access the XML element of the paragraph
|
# Access the XML element of the paragraph
|
||||||
numPr = paragraph._element.find(
|
numPr = paragraph._element.find(
|
||||||
".//w:numPr", namespaces=paragraph._element.nsmap
|
".//w:numPr", namespaces=paragraph._element.nsmap
|
||||||
@ -201,7 +203,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
label_str = ""
|
label_str = ""
|
||||||
label_level = 0
|
label_level = 0
|
||||||
if parts[0] == "Heading":
|
if parts[0] == "Heading":
|
||||||
# print("{} - {}".format(parts[0], parts[1]))
|
|
||||||
label_str = parts[0]
|
label_str = parts[0]
|
||||||
label_level = self.str_to_int(parts[1], default=None)
|
label_level = self.str_to_int(parts[1], default=None)
|
||||||
if parts[1] == "Heading":
|
if parts[1] == "Heading":
|
||||||
@ -212,24 +213,25 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
return label, None
|
return label, None
|
||||||
|
|
||||||
def handle_text_elements(self, element, docx_obj, doc):
|
def handle_text_elements(self, element, docx_obj, doc):
|
||||||
|
# if hasattr(element, "text"):
|
||||||
|
# paragraph = element
|
||||||
|
# else:
|
||||||
paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
|
paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
|
||||||
|
|
||||||
if paragraph.text is None:
|
if paragraph.text is None:
|
||||||
# _log.warn(f"paragraph has text==None")
|
# _log.warn(f"paragraph has text==None")
|
||||||
return
|
return
|
||||||
|
|
||||||
text = paragraph.text.strip()
|
text = paragraph.text.strip()
|
||||||
|
|
||||||
# if len(text)==0 # keep empty paragraphs, they separate adjacent lists!
|
# if len(text)==0 # keep empty paragraphs, they separate adjacent lists!
|
||||||
|
|
||||||
# Common styles for bullet and numbered lists.
|
# Common styles for bullet and numbered lists.
|
||||||
# "List Bullet", "List Number", "List Paragraph"
|
# "List Bullet", "List Number", "List Paragraph"
|
||||||
# TODO: reliably identify whether list is a numbered list or not
|
# Identify whether list is a numbered list or not
|
||||||
# is_numbered = "List Bullet" not in paragraph.style.name
|
# is_numbered = "List Bullet" not in paragraph.style.name
|
||||||
is_numbered = False
|
is_numbered = False
|
||||||
|
|
||||||
p_style_name, p_level = self.get_label_and_level(paragraph)
|
p_style_name, p_level = self.get_label_and_level(paragraph)
|
||||||
numid, ilevel = self.get_numId_and_ilvl(paragraph)
|
numid, ilevel = self.get_numId_and_ilvl(paragraph)
|
||||||
# print("numid: {}, ilevel: {}, text: {}".format(numid, ilevel, text))
|
|
||||||
|
|
||||||
if numid == 0:
|
if numid == 0:
|
||||||
numid = None
|
numid = None
|
||||||
@ -453,6 +455,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# if row.cells:
|
# if row.cells:
|
||||||
# num_cols = max(num_cols, len(row.cells))
|
# num_cols = max(num_cols, len(row.cells))
|
||||||
|
|
||||||
|
print("num_rows = {}, num_cols = {}".format(num_rows, num_cols))
|
||||||
|
if num_rows == 1:
|
||||||
|
if num_cols == 1:
|
||||||
|
cell_element = table.rows[0].cells[0]
|
||||||
|
for paragraph in cell_element.paragraphs:
|
||||||
|
# print(paragraph.text)
|
||||||
|
self.handle_text_elements(paragraph, docx_obj, doc)
|
||||||
|
return
|
||||||
|
|
||||||
# Initialize the table grid
|
# Initialize the table grid
|
||||||
table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user