Use Formatting

Signed-off-by: SimJeg <sjegou@nvidia.com>
This commit is contained in:
SimJeg 2025-03-31 12:20:48 +02:00
parent 01b4c12d3b
commit 23fa9b9902

View File

@ -14,6 +14,8 @@ from docling_core.types.doc import (
TableCell, TableCell,
TableData, TableData,
) )
from docling_core.types.doc.document import Formatting
from docx import Document from docx import Document
from docx.document import Document as DocxDocument from docx.document import Document as DocxDocument
from docx.oxml.table import CT_Tc from docx.oxml.table import CT_Tc
@ -264,7 +266,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
else: else:
return label, None return label, None
def format_text(self, text, bold: bool, italic: bool, underline: bool): @classmethod
def get_format_from_run(cls, run: Run) -> Formatting:
    """
    Build a Formatting object from the character styles of a docx run.

    The bold, italic and underline attributes of the run may be None when
    the property is not set directly on the run; None is treated as "not
    applied". python-docx additionally documents that underline can be a
    WD_UNDERLINE member (e.g. double or thick underline) rather than a plain
    boolean, so each flag is collapsed to a strict bool before it is handed
    to Formatting: any truthy value means the style is applied.

    Args:
        run: The docx run whose bold/italic/underline attributes are read.

    Returns:
        A Formatting instance whose bold, italic and underline fields are
        plain booleans.
    """
    # bool() maps None/False -> False and True (or any truthy underline
    # variant) -> True, so no non-boolean value reaches Formatting.
    return Formatting(
        bold=bool(run.bold),
        italic=bool(run.italic),
        underline=bool(run.underline),
    )
def format_text(self, text: str, format: Formatting) -> str:
""" """
Apply bold, italic, and underline markdown styles to a text Apply bold, italic, and underline markdown styles to a text
""" """
@ -273,11 +283,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
prefix, text, suffix = re.match(r"(^\s*)(.*?)(\s*$)", text, re.DOTALL).groups() prefix, text, suffix = re.match(r"(^\s*)(.*?)(\s*$)", text, re.DOTALL).groups()
# Apply style # Apply style
if bold: if format.bold:
text = f"**{text}**" text = f"**{text}**"
if italic: if format.italic:
text = f"*{text}*" text = f"*{text}*"
if underline: if format.underline:
text = f"<u>{text}</u>" text = f"<u>{text}</u>"
# Add back leading and trailing spaces # Add back leading and trailing spaces
@ -292,33 +302,34 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
paragraph_text = "" paragraph_text = ""
group_text = "" group_text = ""
previous_style = None previous_format = None
# Iterate over the runs of the paragraph and group them by style # Iterate over the runs of the paragraph and group them by format
for c in paragraph.iter_inner_content(): for c in paragraph.iter_inner_content():
if isinstance(c, Hyperlink): if isinstance(c, Hyperlink):
text = f"[{c.text}]({c.address})" text = f"[{c.text}]({c.address})"
style = (c.runs[0].bold, c.runs[0].italic, c.runs[0].underline) format = self.get_format_from_run(c.runs[0])
elif isinstance(c, Run): elif isinstance(c, Run):
text = c.text text = c.text
style = (c.bold, c.italic, c.underline) format = self.get_format_from_run(c)
else: else:
continue continue
# Initialize previous_style with the first style # Initialize previous_format with the first format
previous_style = previous_style or style previous_format = previous_format or format
# If the style changes for a non empty text, format the group and reset it # If the style changes for a non empty text, format the group and reset it
if len(text.strip()) and (style != previous_style): if len(text.strip()) and (format != previous_format):
paragraph_text += self.format_text(group_text, *previous_style) previous_text = self.format_text(group_text, previous_format)
previous_style = style paragraph_text += previous_text
previous_format = format
group_text = "" group_text = ""
group_text += text group_text += text
# Format the last group # Format the last group
if len(group_text.strip()) > 0: if len(group_text.strip()) > 0:
paragraph_text += self.format_text(group_text, *style) paragraph_text += self.format_text(group_text, format)
return paragraph_text.strip() return paragraph_text.strip()