mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-29 05:24:28 +00:00
Add parsing configuration
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
This commit is contained in:
parent
1f240c7763
commit
31e30a2cb7
@ -27,7 +27,12 @@ _log = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
def __init__(
|
||||||
|
self,
|
||||||
|
in_doc: "InputDocument",
|
||||||
|
path_or_stream: Union[BytesIO, Path],
|
||||||
|
get_latex=False,
|
||||||
|
):
|
||||||
super().__init__(in_doc, path_or_stream)
|
super().__init__(in_doc, path_or_stream)
|
||||||
self.XML_KEY = (
|
self.XML_KEY = (
|
||||||
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
|
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
|
||||||
@ -49,6 +54,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
self.level = 0
|
self.level = 0
|
||||||
self.listIter = 0
|
self.listIter = 0
|
||||||
|
|
||||||
|
# Whether to transform MS Word equations to LaTeX (opt-in flag)
|
||||||
|
self.get_latex = get_latex
|
||||||
|
|
||||||
self.history = {
|
self.history = {
|
||||||
"names": [None],
|
"names": [None],
|
||||||
"levels": [None],
|
"levels": [None],
|
||||||
@ -240,9 +248,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
|
paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
|
||||||
|
|
||||||
text = paragraph.text
|
text = paragraph.text
|
||||||
text = self.handle_equations_in_text(element=element, text=text)
|
if self.get_latex:
|
||||||
|
text = self.handle_equations_in_text(element=element, text=text)
|
||||||
|
|
||||||
if paragraph.text is None:
|
if text is None:
|
||||||
return
|
return
|
||||||
text = text.strip()
|
text = text.strip()
|
||||||
|
|
||||||
|
@ -157,7 +157,8 @@ module = [
|
|||||||
"deepsearch_glm.*",
|
"deepsearch_glm.*",
|
||||||
"lxml.*",
|
"lxml.*",
|
||||||
"bs4.*",
|
"bs4.*",
|
||||||
"huggingface_hub.*"
|
"huggingface_hub.*",
|
||||||
|
"pylatexenc.*"
|
||||||
]
|
]
|
||||||
ignore_missing_imports = true
|
ignore_missing_imports = true
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user