mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-28 13:04:25 +00:00
feat: Enable markdown text formatting for docx
Signed-off-by: SimJeg <sjegou@nvidia.com>
This commit is contained in:
parent
1418fa1488
commit
7f9464b399
@ -222,12 +222,70 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
else:
|
else:
|
||||||
return label, None
|
return label, None
|
||||||
|
|
||||||
|
def format_text(self, text, bold: bool, italic: bool, underline: bool):
    """
    Apply bold, italic, and underline markdown styles to a text.

    Leading and trailing whitespace is kept outside of the style markers,
    e.g. " hi " in bold becomes " **hi** ". Text that is empty or made only
    of whitespace is returned unchanged, so that no empty markers such as
    "****" or "<u></u>" are produced.

    Args:
        text: The raw text to decorate.
        bold: Wrap the text in "**" when truthy.
        italic: Wrap the text in "*" when truthy.
        underline: Wrap the text in "<u>...</u>" when truthy.

    Returns:
        The decorated text, with its original surrounding whitespace.
    """
    # Exclude leading and trailing spaces from style
    prefix, core, suffix = re.match(
        r"(^\s*)(.*?)(\s*$)", text, re.DOTALL
    ).groups()

    # Nothing to style: wrapping an empty string would only emit markers
    if not core:
        return text

    # Apply style (bold innermost, underline outermost)
    if bold:
        core = f"**{core}**"
    if italic:
        core = f"*{core}*"
    if underline:
        core = f"<u>{core}</u>"

    # Add back leading and trailing spaces
    return prefix + core + suffix
|
||||||
|
|
||||||
|
def format_paragraph(self, paragraph):
    """
    Apply hyperlink, bold, italic, and underline markdown styles to a paragraph.

    The inner content of the paragraph is walked in order. Consecutive items
    sharing the same (bold, italic, underline) style are merged into one
    group, and each group is decorated with a single call to
    ``self.format_text``. Whitespace-only items never start a new group, so
    a differently styled space between two bold words does not split them.

    Args:
        paragraph: Object exposing ``iter_inner_content()``; items that are
            neither a docx ``Run`` nor a docx ``Hyperlink`` are ignored.

    Returns:
        The markdown-formatted paragraph text, stripped of leading and
        trailing whitespace.
    """
    formatted_groups = []
    group_text = ""
    group_style = None

    # Iterate over the runs of the paragraph and group them by style
    for c in paragraph.iter_inner_content():
        if isinstance(c, docx.text.hyperlink.Hyperlink):
            text = f"[{c.text}]({c.address})"
            if c.runs:
                # The first run's style is used for the whole link
                first_run = c.runs[0]
                style = (first_run.bold, first_run.italic, first_run.underline)
            else:
                # A hyperlink without runs carries no style information
                style = (None, None, None)
        elif isinstance(c, docx.text.run.Run):
            text = c.text
            style = (c.bold, c.italic, c.underline)
        else:
            continue

        # Initialize the group style with the first style
        if group_style is None:
            group_style = style

        # If the style changes for a non empty text, flush the group and reset it
        if text.strip() and style != group_style:
            if group_text.strip():
                formatted_groups.append(self.format_text(group_text, *group_style))
            else:
                # Whitespace-only group: keep the spacing, but do not style it
                formatted_groups.append(group_text)
            group_style = style
            group_text = ""

        group_text += text

    # Format the last group with the style of the group itself, not with the
    # style of the last item seen (which may be a trailing whitespace run
    # that was merged into the group despite having a different style)
    if group_text.strip():
        formatted_groups.append(self.format_text(group_text, *group_style))

    return "".join(formatted_groups).strip()
|
||||||
|
|
||||||
def handle_text_elements(self, element, docx_obj, doc):
|
def handle_text_elements(self, element, docx_obj, doc):
|
||||||
paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
|
paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
|
||||||
|
|
||||||
if paragraph.text is None:
|
if paragraph.text is None:
|
||||||
return
|
return
|
||||||
text = paragraph.text.strip()
|
text = self.format_paragraph(paragraph)
|
||||||
|
|
||||||
# Common styles for bullet and numbered lists.
|
# Common styles for bullet and numbered lists.
|
||||||
# "List Bullet", "List Number", "List Paragraph"
|
# "List Bullet", "List Number", "List Paragraph"
|
||||||
|
BIN
tests/data/docx/unit_test_formatting.docx
Normal file
BIN
tests/data/docx/unit_test_formatting.docx
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user