Address feedback

Signed-off-by: SimJeg <sjegou@nvidia.com>
This commit is contained in:
SimJeg 2025-04-02 17:20:52 +02:00
parent f40b21e94c
commit da25453155

View File

@ -121,13 +121,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
doc = DoclingDocument(name=self.file.stem or "file", origin=origin) doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
if self.is_valid(): if self.is_valid():
assert self.docx_obj is not None assert self.docx_obj is not None
doc = self.walk_linear(
self.docx_obj.sections[0].header._element, self.docx_obj, doc
)
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc) doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
doc = self.walk_linear(
self.docx_obj.sections[-1].footer._element, self.docx_obj, doc
)
return doc return doc
else: else:
raise RuntimeError( raise RuntimeError(
@ -283,14 +277,14 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
return label, None return label, None
@classmethod @classmethod
def get_format_from_run(cls, run: Run) -> Formatting: def _get_format_from_run(cls, run: Run) -> Formatting:
return Formatting( return Formatting(
bold=run.bold if run.bold is not None else False, bold=run.bold if run.bold is not None else False,
italic=run.italic if run.italic is not None else False, italic=run.italic if run.italic is not None else False,
underline=run.underline if run.underline is not None else False, underline=run.underline if run.underline is not None else False,
) )
def format_paragraph(self, paragraph: Paragraph): def _get_paragraph_elements(self, paragraph: Paragraph):
""" """
Extract paragraph elements along with their formatting and hyperlink Extract paragraph elements along with their formatting and hyperlink
""" """
@ -304,11 +298,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if isinstance(c, Hyperlink): if isinstance(c, Hyperlink):
text = c.text text = c.text
hyperlink = Path(c.address) hyperlink = Path(c.address)
format = self.get_format_from_run(c.runs[0]) format = self._get_format_from_run(c.runs[0])
elif isinstance(c, Run): elif isinstance(c, Run):
text = c.text text = c.text
hyperlink = None hyperlink = None
format = self.get_format_from_run(c) format = self._get_format_from_run(c)
else: else:
continue continue
@ -400,7 +394,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
if text is None: if text is None:
return return
paragraph_elements = self.format_paragraph(paragraph) paragraph_elements = self._get_paragraph_elements(paragraph)
# Common styles for bullet and numbered lists. # Common styles for bullet and numbered lists.
# "List Bullet", "List Number", "List Paragraph" # "List Bullet", "List Number", "List Paragraph"
@ -419,7 +413,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
and ilevel is not None and ilevel is not None
and p_style_id not in ["Title", "Heading"] and p_style_id not in ["Title", "Heading"]
): ):
self.add_listitem( self._add_listitem(
doc, doc,
numid, numid,
ilevel, ilevel,
@ -605,7 +599,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
) )
return return
def add_listitem( def _add_listitem(
self, self,
doc: DoclingDocument, doc: DoclingDocument,
numid: int, numid: int,
@ -634,7 +628,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
label=GroupLabel.INLINE, parent=self.parents[level] label=GroupLabel.INLINE, parent=self.parents[level]
) )
for text, format, hyperlink in elements: for text, format, hyperlink in elements:
doc.add_list_item( doc._add_list_item(
marker=enum_marker, marker=enum_marker,
enumerated=is_numbered, enumerated=is_numbered,
parent=inline_fmt, parent=inline_fmt,
@ -678,7 +672,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
parent=self.parents[self.level_at_new_list + ilevel], parent=self.parents[self.level_at_new_list + ilevel],
) )
for text, format, hyperlink in elements: for text, format, hyperlink in elements:
doc.add_list_item( doc._add_list_item(
marker=enum_marker, marker=enum_marker,
enumerated=is_numbered, enumerated=is_numbered,
parent=inline_fmt, parent=inline_fmt,
@ -706,7 +700,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
parent=self.parents[self.level_at_new_list + ilevel], parent=self.parents[self.level_at_new_list + ilevel],
) )
for text, format, hyperlink in elements: for text, format, hyperlink in elements:
doc.add_list_item( doc._add_list_item(
marker=enum_marker, marker=enum_marker,
enumerated=is_numbered, enumerated=is_numbered,
parent=inline_fmt, parent=inline_fmt,
@ -727,7 +721,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
) )
for text, format, hyperlink in elements: for text, format, hyperlink in elements:
# Add the list item to the parent group # Add the list item to the parent group
doc.add_list_item( doc._add_list_item(
marker=enum_marker, marker=enum_marker,
enumerated=is_numbered, enumerated=is_numbered,
parent=inline_fmt, parent=inline_fmt,