Add parsing configuration

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
This commit is contained in:
Rafael Teixeira de Lima 2025-01-28 09:41:56 +01:00
parent 1f240c7763
commit 31e30a2cb7
2 changed files with 14 additions and 4 deletions

View File

@ -27,7 +27,12 @@ _log = logging.getLogger(__name__)
class MsWordDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
def __init__(
self,
in_doc: "InputDocument",
path_or_stream: Union[BytesIO, Path],
get_latex=False,
):
super().__init__(in_doc, path_or_stream)
self.XML_KEY = (
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
@ -49,6 +54,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.level = 0
self.listIter = 0
# Whether to convert MS Word equations found in paragraph text to LaTeX
self.get_latex = get_latex
self.history = {
"names": [None],
"levels": [None],
@ -240,9 +248,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
text = paragraph.text
text = self.handle_equations_in_text(element=element, text=text)
if self.get_latex:
text = self.handle_equations_in_text(element=element, text=text)
if paragraph.text is None:
if text is None:
return
text = text.strip()

View File

@ -157,7 +157,8 @@ module = [
"deepsearch_glm.*",
"lxml.*",
"bs4.*",
"huggingface_hub.*"
"huggingface_hub.*",
"pylatexenc.*"
]
ignore_missing_imports = true