mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Run precommit
Signed-off-by: SimJeg <sjegou@nvidia.com>
This commit is contained in:
parent
cd4b214f05
commit
f40b21e94c
@ -15,15 +15,14 @@ from docling_core.types.doc import (
|
|||||||
TableData,
|
TableData,
|
||||||
)
|
)
|
||||||
from docling_core.types.doc.document import Formatting
|
from docling_core.types.doc.document import Formatting
|
||||||
|
|
||||||
from docx import Document
|
from docx import Document
|
||||||
from docx.document import Document as DocxDocument
|
from docx.document import Document as DocxDocument
|
||||||
from docx.oxml.table import CT_Tc
|
from docx.oxml.table import CT_Tc
|
||||||
from docx.oxml.xmlchemy import BaseOxmlElement
|
from docx.oxml.xmlchemy import BaseOxmlElement
|
||||||
from docx.table import Table, _Cell
|
from docx.table import Table, _Cell
|
||||||
|
from docx.text.hyperlink import Hyperlink
|
||||||
from docx.text.paragraph import Paragraph
|
from docx.text.paragraph import Paragraph
|
||||||
from docx.text.run import Run
|
from docx.text.run import Run
|
||||||
from docx.text.hyperlink import Hyperlink
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from lxml.etree import XPath
|
from lxml.etree import XPath
|
||||||
from PIL import Image, UnidentifiedImageError
|
from PIL import Image, UnidentifiedImageError
|
||||||
@ -39,10 +38,16 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||||
@override
|
@override
|
||||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]) -> None:
|
def __init__(
|
||||||
|
self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
|
||||||
|
) -> None:
|
||||||
super().__init__(in_doc, path_or_stream)
|
super().__init__(in_doc, path_or_stream)
|
||||||
self.XML_KEY = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
|
self.XML_KEY = (
|
||||||
self.xml_namespaces = {"w": "http://schemas.microsoft.com/office/word/2003/wordml"}
|
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
|
||||||
|
)
|
||||||
|
self.xml_namespaces = {
|
||||||
|
"w": "http://schemas.microsoft.com/office/word/2003/wordml"
|
||||||
|
}
|
||||||
# self.initialise(path_or_stream)
|
# self.initialise(path_or_stream)
|
||||||
# Word file:
|
# Word file:
|
||||||
self.path_or_stream: Union[BytesIO, Path] = path_or_stream
|
self.path_or_stream: Union[BytesIO, Path] = path_or_stream
|
||||||
@ -116,9 +121,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
|
||||||
if self.is_valid():
|
if self.is_valid():
|
||||||
assert self.docx_obj is not None
|
assert self.docx_obj is not None
|
||||||
doc = self.walk_linear(self.docx_obj.sections[0].header._element, self.docx_obj, doc)
|
doc = self.walk_linear(
|
||||||
|
self.docx_obj.sections[0].header._element, self.docx_obj, doc
|
||||||
|
)
|
||||||
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
doc = self.walk_linear(self.docx_obj.element.body, self.docx_obj, doc)
|
||||||
doc = self.walk_linear(self.docx_obj.sections[-1].footer._element, self.docx_obj, doc)
|
doc = self.walk_linear(
|
||||||
|
self.docx_obj.sections[-1].footer._element, self.docx_obj, doc
|
||||||
|
)
|
||||||
return doc
|
return doc
|
||||||
else:
|
else:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
@ -215,9 +224,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
else:
|
else:
|
||||||
return [input_string]
|
return [input_string]
|
||||||
|
|
||||||
def get_numId_and_ilvl(self, paragraph: Paragraph) -> tuple[Optional[int], Optional[int]]:
|
def get_numId_and_ilvl(
|
||||||
|
self, paragraph: Paragraph
|
||||||
|
) -> tuple[Optional[int], Optional[int]]:
|
||||||
# Access the XML element of the paragraph
|
# Access the XML element of the paragraph
|
||||||
numPr = paragraph._element.find(".//w:numPr", namespaces=paragraph._element.nsmap)
|
numPr = paragraph._element.find(
|
||||||
|
".//w:numPr", namespaces=paragraph._element.nsmap
|
||||||
|
)
|
||||||
|
|
||||||
if numPr is not None:
|
if numPr is not None:
|
||||||
# Get the numId element and extract the value
|
# Get the numId element and extract the value
|
||||||
@ -302,11 +315,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# Initialize previous_format with the first format
|
# Initialize previous_format with the first format
|
||||||
previous_format = previous_format or format
|
previous_format = previous_format or format
|
||||||
|
|
||||||
if (len(text.strip()) and (format != previous_format)) or (hyperlink is not None):
|
if (len(text.strip()) and (format != previous_format)) or (
|
||||||
|
hyperlink is not None
|
||||||
|
):
|
||||||
|
|
||||||
# If the style changes for a non empty text, add the previous group
|
# If the style changes for a non empty text, add the previous group
|
||||||
if len(group_text.strip()) > 0:
|
if len(group_text.strip()) > 0:
|
||||||
paragraph_elements.append((group_text.strip(), previous_format, None))
|
paragraph_elements.append(
|
||||||
|
(group_text.strip(), previous_format, None)
|
||||||
|
)
|
||||||
group_text = ""
|
group_text = ""
|
||||||
|
|
||||||
# If there is a hyperlink, add it immediately
|
# If there is a hyperlink, add it immediately
|
||||||
@ -397,7 +414,11 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
numid = None
|
numid = None
|
||||||
|
|
||||||
# Handle lists
|
# Handle lists
|
||||||
if numid is not None and ilevel is not None and p_style_id not in ["Title", "Heading"]:
|
if (
|
||||||
|
numid is not None
|
||||||
|
and ilevel is not None
|
||||||
|
and p_style_id not in ["Title", "Heading"]
|
||||||
|
):
|
||||||
self.add_listitem(
|
self.add_listitem(
|
||||||
doc,
|
doc,
|
||||||
numid,
|
numid,
|
||||||
@ -426,7 +447,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if p_style_id in ["Title"]:
|
if p_style_id in ["Title"]:
|
||||||
for key in range(len(self.parents)):
|
for key in range(len(self.parents)):
|
||||||
self.parents[key] = None
|
self.parents[key] = None
|
||||||
self.parents[0] = doc.add_text(parent=None, label=DocItemLabel.TITLE, text=text)
|
self.parents[0] = doc.add_text(
|
||||||
|
parent=None, label=DocItemLabel.TITLE, text=text
|
||||||
|
)
|
||||||
elif "Heading" in p_style_id:
|
elif "Heading" in p_style_id:
|
||||||
style_element = getattr(paragraph.style, "element", None)
|
style_element = getattr(paragraph.style, "element", None)
|
||||||
if style_element is not None:
|
if style_element is not None:
|
||||||
@ -488,7 +511,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
"Quote",
|
"Quote",
|
||||||
]:
|
]:
|
||||||
level = self.get_level()
|
level = self.get_level()
|
||||||
inline_fmt = doc.add_group(label=GroupLabel.INLINE, parent=self.parents[level - 1])
|
inline_fmt = doc.add_group(
|
||||||
|
label=GroupLabel.INLINE, parent=self.parents[level - 1]
|
||||||
|
)
|
||||||
for text, format, hyperlink in paragraph_elements:
|
for text, format, hyperlink in paragraph_elements:
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH,
|
label=DocItemLabel.PARAGRAPH,
|
||||||
@ -502,7 +527,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
# Text style names can, and will have, not only default values but user values too
|
# Text style names can, and will have, not only default values but user values too
|
||||||
# hence we treat all other labels as pure text
|
# hence we treat all other labels as pure text
|
||||||
level = self.get_level()
|
level = self.get_level()
|
||||||
inline_fmt = doc.add_group(label=GroupLabel.INLINE, parent=self.parents[level - 1])
|
inline_fmt = doc.add_group(
|
||||||
|
label=GroupLabel.INLINE, parent=self.parents[level - 1]
|
||||||
|
)
|
||||||
for text, format, hyperlink in paragraph_elements:
|
for text, format, hyperlink in paragraph_elements:
|
||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH,
|
label=DocItemLabel.PARAGRAPH,
|
||||||
@ -603,7 +630,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
enum_marker = str(self.listIter) + "."
|
enum_marker = str(self.listIter) + "."
|
||||||
is_numbered = True
|
is_numbered = True
|
||||||
|
|
||||||
inline_fmt = doc.add_group(label=GroupLabel.INLINE, parent=self.parents[level])
|
inline_fmt = doc.add_group(
|
||||||
|
label=GroupLabel.INLINE, parent=self.parents[level]
|
||||||
|
)
|
||||||
for text, format, hyperlink in elements:
|
for text, format, hyperlink in elements:
|
||||||
doc.add_list_item(
|
doc.add_list_item(
|
||||||
marker=enum_marker,
|
marker=enum_marker,
|
||||||
@ -645,7 +674,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
is_numbered = True
|
is_numbered = True
|
||||||
|
|
||||||
inline_fmt = doc.add_group(
|
inline_fmt = doc.add_group(
|
||||||
label=GroupLabel.INLINE, parent=self.parents[self.level_at_new_list + ilevel]
|
label=GroupLabel.INLINE,
|
||||||
|
parent=self.parents[self.level_at_new_list + ilevel],
|
||||||
)
|
)
|
||||||
for text, format, hyperlink in elements:
|
for text, format, hyperlink in elements:
|
||||||
doc.add_list_item(
|
doc.add_list_item(
|
||||||
@ -672,7 +702,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
enum_marker = str(self.listIter) + "."
|
enum_marker = str(self.listIter) + "."
|
||||||
is_numbered = True
|
is_numbered = True
|
||||||
inline_fmt = doc.add_group(
|
inline_fmt = doc.add_group(
|
||||||
label=GroupLabel.INLINE, parent=self.parents[self.level_at_new_list + ilevel]
|
label=GroupLabel.INLINE,
|
||||||
|
parent=self.parents[self.level_at_new_list + ilevel],
|
||||||
)
|
)
|
||||||
for text, format, hyperlink in elements:
|
for text, format, hyperlink in elements:
|
||||||
doc.add_list_item(
|
doc.add_list_item(
|
||||||
@ -691,7 +722,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
if is_numbered:
|
if is_numbered:
|
||||||
enum_marker = str(self.listIter) + "."
|
enum_marker = str(self.listIter) + "."
|
||||||
is_numbered = True
|
is_numbered = True
|
||||||
inline_fmt = doc.add_group(label=GroupLabel.INLINE, parent=self.parents[level - 1])
|
inline_fmt = doc.add_group(
|
||||||
|
label=GroupLabel.INLINE, parent=self.parents[level - 1]
|
||||||
|
)
|
||||||
for text, format, hyperlink in elements:
|
for text, format, hyperlink in elements:
|
||||||
# Add the list item to the parent group
|
# Add the list item to the parent group
|
||||||
doc.add_list_item(
|
doc.add_list_item(
|
||||||
|
Loading…
Reference in New Issue
Block a user