mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Improved docx parsing
Signed-off-by: Maxim Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
e613f7bc6c
commit
1346843301
@ -1,3 +1,4 @@
|
|||||||
|
import logging
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Set, Union
|
from typing import Set, Union
|
||||||
@ -21,6 +22,8 @@ from lxml import etree
|
|||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
|
|
||||||
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||||
|
|
||||||
@ -39,6 +42,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
for i in range(-1, self.max_levels):
|
for i in range(-1, self.max_levels):
|
||||||
self.parents[i] = None
|
self.parents[i] = None
|
||||||
|
|
||||||
|
self.level = 0
|
||||||
|
|
||||||
self.history = {
|
self.history = {
|
||||||
"names": [None],
|
"names": [None],
|
||||||
"levels": [None],
|
"levels": [None],
|
||||||
@ -47,7 +52,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
}
|
}
|
||||||
|
|
||||||
def warn(self, message):
|
def warn(self, message):
|
||||||
print(f"WARN: {message}")
|
_log.warn(message)
|
||||||
|
|
||||||
def is_valid(self) -> bool:
|
def is_valid(self) -> bool:
|
||||||
return True
|
return True
|
||||||
@ -94,17 +99,18 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
||||||
for element in body:
|
for element in body:
|
||||||
tag_name = etree.QName(element).localname
|
tag_name = etree.QName(element).localname
|
||||||
# Check for Text
|
|
||||||
if tag_name in ["p"]:
|
|
||||||
self.add_text(element, docx_obj, doc)
|
|
||||||
# Check for Tables
|
# Check for Tables
|
||||||
elif element.tag.endswith("tbl"):
|
if element.tag.endswith("tbl"):
|
||||||
self.add_table(element, docx_obj, doc)
|
self.handle_tables(element, docx_obj, doc)
|
||||||
# Check for Inline Images (drawings or blip elements)
|
# Check for Inline Images (drawings or blip elements)
|
||||||
elif element.tag.endswith("drawing") or element.tag.endswith("blip"):
|
elif element.xpath(".//w:drawing") or element.xpath(".//w:pict"):
|
||||||
self.add_figure(element, docx_obj, doc)
|
self.handle_pictures(element, docx_obj, doc)
|
||||||
|
# Check for Text
|
||||||
|
elif tag_name in ["p"]:
|
||||||
|
self.handle_text_elements(element, docx_obj, doc)
|
||||||
else:
|
else:
|
||||||
self.warn(f"ignoring element in DOCX with tag: {tag_name}")
|
self.warn(f"Ignoring element in DOCX with tag: {tag_name}")
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
def get_numId_and_ilvl(self, paragraph):
|
def get_numId_and_ilvl(self, paragraph):
|
||||||
@ -142,52 +148,43 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
else:
|
else:
|
||||||
return label, None
|
return label, None
|
||||||
|
|
||||||
def add_text(self, element, docx_obj, doc):
|
def handle_text_elements(self, element, docx_obj, doc):
|
||||||
paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
|
paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
|
||||||
|
|
||||||
if paragraph.text is None:
|
if paragraph.text is None:
|
||||||
# self.warn(f"paragraph has text==None")
|
# self.warn(f"paragraph has text==None")
|
||||||
return
|
return
|
||||||
|
|
||||||
text = paragraph.text.strip()
|
text = paragraph.text.strip()
|
||||||
|
# if len(text)==0 # keep empty paragraphs, they separate adjacent lists!
|
||||||
|
|
||||||
# if len(text)==0: # keep empty paragraphs, they separate adjacent lists!
|
p_style_name, p_level = self.get_label_and_level(paragraph)
|
||||||
# self.warn(f"paragraph has len(text)==0")
|
|
||||||
|
|
||||||
pname, plevel = self.get_label_and_level(paragraph)
|
|
||||||
numid, ilevel = self.get_numId_and_ilvl(paragraph)
|
numid, ilevel = self.get_numId_and_ilvl(paragraph)
|
||||||
|
|
||||||
# we detected a list
|
# Handle lists
|
||||||
if numid is not None and ilevel is not None:
|
if numid is not None and ilevel is not None:
|
||||||
self.add_listitem(
|
self.add_listitem(
|
||||||
element, docx_obj, doc, pname, plevel, numid, ilevel, text
|
element, docx_obj, doc, p_style_name, p_level, numid, ilevel, text
|
||||||
)
|
)
|
||||||
self.update_history(pname, plevel, numid, ilevel)
|
self.update_history(p_style_name, p_level, numid, ilevel)
|
||||||
|
|
||||||
return
|
return
|
||||||
|
|
||||||
elif numid is None and self.prev_numid() is not None: # Close list
|
elif numid is None and self.prev_numid() is not None: # Close list
|
||||||
|
|
||||||
for key, val in self.parents.items():
|
for key, val in self.parents.items():
|
||||||
if key >= self.level_at_new_list:
|
if key >= self.level_at_new_list:
|
||||||
self.parents[key] = None
|
self.parents[key] = None
|
||||||
|
|
||||||
self.level = self.level_at_new_list - 1
|
self.level = self.level_at_new_list - 1
|
||||||
self.level_at_new_list = None
|
self.level_at_new_list = None
|
||||||
|
|
||||||
if pname in ["Title"]:
|
if p_style_name in ["Title"]:
|
||||||
|
|
||||||
for key, val in self.parents.items():
|
for key, val in self.parents.items():
|
||||||
self.parents[key] = None
|
self.parents[key] = None
|
||||||
|
|
||||||
self.parents[0] = doc.add_text(
|
self.parents[0] = doc.add_text(
|
||||||
parent=None, label=DocItemLabel.TITLE, text=text
|
parent=None, label=DocItemLabel.TITLE, text=text
|
||||||
)
|
)
|
||||||
|
|
||||||
elif "Heading" in pname:
|
elif "Heading" in p_style_name:
|
||||||
self.add_header(element, docx_obj, doc, pname, plevel, text)
|
self.add_header(element, docx_obj, doc, p_style_name, p_level, text)
|
||||||
|
|
||||||
elif pname in [
|
elif p_style_name in [
|
||||||
"Paragraph",
|
"Paragraph",
|
||||||
"Normal",
|
"Normal",
|
||||||
"Subtitle",
|
"Subtitle",
|
||||||
@ -203,15 +200,16 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
)
|
)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
assert False, f"need to add a new paragraph: {pname}"
|
# Text style names are not limited to the built-in defaults; documents can and will use user-defined ones too,
|
||||||
|
# hence we treat all other labels as pure text
|
||||||
|
doc.add_text(
|
||||||
|
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
|
||||||
|
)
|
||||||
|
|
||||||
self.update_history(pname, plevel, numid, ilevel)
|
self.update_history(p_style_name, p_level, numid, ilevel)
|
||||||
|
|
||||||
def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
|
def add_header(self, element, docx_obj, doc, curr_name, curr_level, text: str):
|
||||||
|
|
||||||
level = self.get_level()
|
level = self.get_level()
|
||||||
# print(f"level: {level} => add_header(self, element, docx_obj, doc, {curr_name}, {curr_level}): {text}")
|
|
||||||
|
|
||||||
if isinstance(curr_level, int):
|
if isinstance(curr_level, int):
|
||||||
|
|
||||||
if curr_level == level:
|
if curr_level == level:
|
||||||
@ -251,12 +249,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def add_listitem(
|
def add_listitem(
|
||||||
self, element, docx_obj, doc, pname, plevel, numid, ilevel, text: str
|
self, element, docx_obj, doc, p_style_name, p_level, numid, ilevel, text: str
|
||||||
):
|
):
|
||||||
|
|
||||||
level = self.get_level()
|
level = self.get_level()
|
||||||
# print(f"level: {level} => add_listitem(self, element, docx_obj, doc, {pname}, {plevel}, {numid}, {ilevel}): {text}")
|
|
||||||
|
|
||||||
if self.prev_numid() is None: # Open new list
|
if self.prev_numid() is None: # Open new list
|
||||||
|
|
||||||
self.level_at_new_list = level
|
self.level_at_new_list = level
|
||||||
@ -305,7 +300,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
label=DocItemLabel.LIST_ITEM, parent=self.parents[level - 1], text=text
|
label=DocItemLabel.LIST_ITEM, parent=self.parents[level - 1], text=text
|
||||||
)
|
)
|
||||||
|
|
||||||
def add_table(self, element, docx_obj, doc):
|
def handle_tables(self, element, docx_obj, doc):
|
||||||
|
|
||||||
# Function to check if a cell has a colspan (gridSpan)
|
# Function to check if a cell has a colspan (gridSpan)
|
||||||
def get_colspan(cell):
|
def get_colspan(cell):
|
||||||
@ -331,8 +326,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# Calculate the max number of columns
|
# Calculate the max number of columns
|
||||||
num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells))
|
num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells))
|
||||||
|
|
||||||
self.warn(f"table: [{num_rows}x{num_cols}]")
|
|
||||||
|
|
||||||
# Initialize the table grid
|
# Initialize the table grid
|
||||||
table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)]
|
||||||
|
|
||||||
@ -348,8 +341,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
while table_grid[row_idx][col_idx] is not None:
|
while table_grid[row_idx][col_idx] is not None:
|
||||||
col_idx += 1
|
col_idx += 1
|
||||||
|
|
||||||
print(f"{row_idx}, {col_idx}, {row_span}, {col_span}")
|
|
||||||
|
|
||||||
# Fill the grid with the cell value, considering rowspan and colspan
|
# Fill the grid with the cell value, considering rowspan and colspan
|
||||||
for i in range(row_span if row_span == "restart" else 1):
|
for i in range(row_span if row_span == "restart" else 1):
|
||||||
for j in range(col_span):
|
for j in range(col_span):
|
||||||
@ -367,13 +358,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
row_header=False, # ((not col_header) and html_cell.name=='th')
|
row_header=False, # ((not col_header) and html_cell.name=='th')
|
||||||
)
|
)
|
||||||
|
|
||||||
# print(row_idx, "\t", col_idx, "\t", row_span, "\t", col_span, "\t", text)
|
|
||||||
data.table_cells.append(cell)
|
data.table_cells.append(cell)
|
||||||
|
|
||||||
level = self.get_level()
|
level = self.get_level()
|
||||||
doc.add_table(data=data, parent=self.parents[level - 1])
|
doc.add_table(data=data, parent=self.parents[level - 1])
|
||||||
|
|
||||||
def add_figure(self, element, docx_obj, doc):
|
def handle_pictures(self, element, docx_obj, doc):
|
||||||
doc.add_picture(
|
doc.add_picture(
|
||||||
data=BasePictureData(), parent=self.parents[self.level], caption=None
|
data=BasePictureData(), parent=self.parents[self.level], caption=None
|
||||||
)
|
)
|
||||||
|
Binary file not shown.
Loading…
Reference in New Issue
Block a user