mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-29 05:24:28 +00:00
Run black and mypy
Signed-off-by: SimJeg <sjegou@nvidia.com>
This commit is contained in:
parent
60306f9a83
commit
1033c25435
@ -39,16 +39,10 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||||
@override
|
@override
|
||||||
def __init__(
|
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]) -> None:
|
||||||
self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
|
|
||||||
) -> None:
|
|
||||||
super().__init__(in_doc, path_or_stream)
|
super().__init__(in_doc, path_or_stream)
|
||||||
self.XML_KEY = (
|
self.XML_KEY = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
|
||||||
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
|
self.xml_namespaces = {"w": "http://schemas.microsoft.com/office/word/2003/wordml"}
|
||||||
)
|
|
||||||
self.xml_namespaces = {
|
|
||||||
"w": "http://schemas.microsoft.com/office/word/2003/wordml"
|
|
||||||
}
|
|
||||||
# self.initialise(path_or_stream)
|
# self.initialise(path_or_stream)
|
||||||
# Word file:
|
# Word file:
|
||||||
self.path_or_stream: Union[BytesIO, Path] = path_or_stream
|
self.path_or_stream: Union[BytesIO, Path] = path_or_stream
|
||||||
@ -219,13 +213,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
else:
|
else:
|
||||||
return [input_string]
|
return [input_string]
|
||||||
|
|
||||||
def get_numId_and_ilvl(
|
def get_numId_and_ilvl(self, paragraph: Paragraph) -> tuple[Optional[int], Optional[int]]:
|
||||||
self, paragraph: Paragraph
|
|
||||||
) -> tuple[Optional[int], Optional[int]]:
|
|
||||||
# Access the XML element of the paragraph
|
# Access the XML element of the paragraph
|
||||||
numPr = paragraph._element.find(
|
numPr = paragraph._element.find(".//w:numPr", namespaces=paragraph._element.nsmap)
|
||||||
".//w:numPr", namespaces=paragraph._element.nsmap
|
|
||||||
)
|
|
||||||
|
|
||||||
if numPr is not None:
|
if numPr is not None:
|
||||||
# Get the numId element and extract the value
|
# Get the numId element and extract the value
|
||||||
@ -274,13 +264,12 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
underline=run.underline if run.underline is not None else False,
|
underline=run.underline if run.underline is not None else False,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def format_paragraph(self, paragraph: Paragraph):
|
||||||
def format_paragraph(self, paragraph: Paragraph) -> list[tuple[str, Formatting, str]]:
|
|
||||||
"""
|
"""
|
||||||
Apply hyperlink, bold, italic, and underline markdown styles to a paragraph
|
Extract paragraph elements along with their formatting and hyperlink
|
||||||
"""
|
"""
|
||||||
|
|
||||||
paragraph_elements = []
|
paragraph_elements: list[tuple[str, Formatting, Path | None]] = []
|
||||||
group_text = ""
|
group_text = ""
|
||||||
previous_format = None
|
previous_format = None
|
||||||
|
|
||||||
@ -288,7 +277,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
for c in paragraph.iter_inner_content():
|
for c in paragraph.iter_inner_content():
|
||||||
if isinstance(c, Hyperlink):
|
if isinstance(c, Hyperlink):
|
||||||
text = c.text
|
text = c.text
|
||||||
hyperlink = c.address
|
hyperlink = Path(c.address)
|
||||||
format = self.get_format_from_run(c.runs[0])
|
format = self.get_format_from_run(c.runs[0])
|
||||||
elif isinstance(c, Run):
|
elif isinstance(c, Run):
|
||||||
text = c.text
|
text = c.text
|
||||||
@ -370,11 +359,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
numid = None
|
numid = None
|
||||||
|
|
||||||
# Handle lists
|
# Handle lists
|
||||||
if (
|
if numid is not None and ilevel is not None and p_style_id not in ["Title", "Heading"]:
|
||||||
numid is not None
|
|
||||||
and ilevel is not None
|
|
||||||
and p_style_id not in ["Title", "Heading"]
|
|
||||||
):
|
|
||||||
self.add_listitem(
|
self.add_listitem(
|
||||||
doc,
|
doc,
|
||||||
numid,
|
numid,
|
||||||
@ -403,15 +388,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if p_style_id in ["Title"]:
|
if p_style_id in ["Title"]:
|
||||||
for key in range(len(self.parents)):
|
for key in range(len(self.parents)):
|
||||||
self.parents[key] = None
|
self.parents[key] = None
|
||||||
self.parents[0] = doc.add_text(
|
self.parents[0] = doc.add_text(parent=None, label=DocItemLabel.TITLE, text=text)
|
||||||
parent=None, label=DocItemLabel.TITLE, text=text
|
|
||||||
)
|
|
||||||
elif "Heading" in p_style_id:
|
elif "Heading" in p_style_id:
|
||||||
style_element = getattr(paragraph.style, "element", None)
|
style_element = getattr(paragraph.style, "element", None)
|
||||||
if style_element:
|
if style_element:
|
||||||
is_numbered_style = (
|
is_numbered_style = "<w:numPr>" in style_element.xml or "<w:numPr>" in element.xml
|
||||||
"<w:numPr>" in style_element.xml or "<w:numPr>" in element.xml
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
is_numbered_style = False
|
is_numbered_style = False
|
||||||
self.add_header(doc, p_level, text, is_numbered_style)
|
self.add_header(doc, p_level, text, is_numbered_style)
|
||||||
@ -470,8 +451,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
parent = doc.add_group(label=GroupLabel.INLINE, parent=self.parents[level - 1])
|
parent = doc.add_group(label=GroupLabel.INLINE, parent=self.parents[level - 1])
|
||||||
for text, format, hyperlink in paragraph_elements:
|
for text, format, hyperlink in paragraph_elements:
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH, parent=parent, text=text,
|
label=DocItemLabel.PARAGRAPH,
|
||||||
formatting=format, hyperlink=hyperlink
|
parent=parent,
|
||||||
|
text=text,
|
||||||
|
formatting=format,
|
||||||
|
hyperlink=hyperlink,
|
||||||
)
|
)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
@ -481,8 +465,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
parent = doc.add_group(label=GroupLabel.INLINE, parent=self.parents[level - 1])
|
parent = doc.add_group(label=GroupLabel.INLINE, parent=self.parents[level - 1])
|
||||||
for text, format, hyperlink in paragraph_elements:
|
for text, format, hyperlink in paragraph_elements:
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH, parent=parent, text=text,
|
label=DocItemLabel.PARAGRAPH,
|
||||||
formatting=format, hyperlink=hyperlink
|
parent=parent,
|
||||||
|
text=text,
|
||||||
|
formatting=format,
|
||||||
|
hyperlink=hyperlink,
|
||||||
)
|
)
|
||||||
|
|
||||||
self.update_history(p_style_id, p_level, numid, ilevel)
|
self.update_history(p_style_id, p_level, numid, ilevel)
|
||||||
@ -556,7 +543,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
doc: DoclingDocument,
|
doc: DoclingDocument,
|
||||||
numid: int,
|
numid: int,
|
||||||
ilevel: int,
|
ilevel: int,
|
||||||
elements: list[tuple[str, Formatting, str]],
|
elements: list,
|
||||||
is_numbered: bool = False,
|
is_numbered: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
enum_marker = ""
|
enum_marker = ""
|
||||||
@ -617,7 +604,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
enum_marker = str(self.listIter) + "."
|
enum_marker = str(self.listIter) + "."
|
||||||
is_numbered = True
|
is_numbered = True
|
||||||
|
|
||||||
inline_fmt = doc.add_group(label=GroupLabel.INLINE, parent=self.parents[self.level_at_new_list + ilevel])
|
inline_fmt = doc.add_group(
|
||||||
|
label=GroupLabel.INLINE, parent=self.parents[self.level_at_new_list + ilevel]
|
||||||
|
)
|
||||||
for text, format, hyperlink in elements:
|
for text, format, hyperlink in elements:
|
||||||
doc.add_list_item(
|
doc.add_list_item(
|
||||||
marker=enum_marker,
|
marker=enum_marker,
|
||||||
@ -642,7 +631,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if is_numbered:
|
if is_numbered:
|
||||||
enum_marker = str(self.listIter) + "."
|
enum_marker = str(self.listIter) + "."
|
||||||
is_numbered = True
|
is_numbered = True
|
||||||
inline_fmt = doc.add_group(label=GroupLabel.INLINE, parent=self.parents[self.level_at_new_list + ilevel])
|
inline_fmt = doc.add_group(
|
||||||
|
label=GroupLabel.INLINE, parent=self.parents[self.level_at_new_list + ilevel]
|
||||||
|
)
|
||||||
for text, format, hyperlink in elements:
|
for text, format, hyperlink in elements:
|
||||||
doc.add_list_item(
|
doc.add_list_item(
|
||||||
marker=enum_marker,
|
marker=enum_marker,
|
||||||
|
Loading…
Reference in New Issue
Block a user