From d3362d15538d1412b0c148b5cde1ab7f71865c96 Mon Sep 17 00:00:00 2001 From: SimJeg Date: Mon, 31 Mar 2025 12:32:22 +0200 Subject: [PATCH] Handle hyperlink Signed-off-by: SimJeg --- docling/backend/msword_backend.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 7d4bcdc1..8ae35a35 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -303,12 +303,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): paragraph_text = "" group_text = "" previous_format = None + hyperlink = None # Iterate over the runs of the paragraph and group them by format for c in paragraph.iter_inner_content(): if isinstance(c, Hyperlink): text = f"[{c.text}]({c.address})" format = self.get_format_from_run(c.runs[0]) + hyperlink = c.address elif isinstance(c, Run): text = c.text format = self.get_format_from_run(c) @@ -319,11 +321,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): previous_format = previous_format or format # If the style changes for a non empty text, format the group and reset it - if len(text.strip()) and (format != previous_format): - previous_text = self.format_text(group_text, previous_format) - paragraph_text += previous_text + if (len(text.strip()) and (format != previous_format)) or (hyperlink is not None): + paragraph_text += self.format_text(group_text, previous_format) previous_format = format group_text = "" + hyperlink = None group_text += text @@ -331,6 +333,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if len(group_text.strip()) > 0: paragraph_text += self.format_text(group_text, format) + # TODO: return a list of (text, format, hyperlink) tuples instead of a single string return paragraph_text.strip() def handle_equations_in_text(self, element, text):