Add parsing configuration

Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
This commit is contained in:
Rafael Teixeira de Lima 2025-01-28 09:41:56 +01:00
parent 1f240c7763
commit 31e30a2cb7
2 changed files with 14 additions and 4 deletions

View File

@ -27,7 +27,12 @@ _log = logging.getLogger(__name__)
class MsWordDocumentBackend(DeclarativeDocumentBackend): class MsWordDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): def __init__(
self,
in_doc: "InputDocument",
path_or_stream: Union[BytesIO, Path],
get_latex=False,
):
super().__init__(in_doc, path_or_stream) super().__init__(in_doc, path_or_stream)
self.XML_KEY = ( self.XML_KEY = (
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val" "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
@ -49,6 +54,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
self.level = 0 self.level = 0
self.listIter = 0 self.listIter = 0
# Flag: when true, paragraph text is passed through handle_equations_in_text to convert MS Word equations to LaTeX
self.get_latex = get_latex
self.history = { self.history = {
"names": [None], "names": [None],
"levels": [None], "levels": [None],
@ -240,9 +248,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
paragraph = docx.text.paragraph.Paragraph(element, docx_obj) paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
text = paragraph.text text = paragraph.text
text = self.handle_equations_in_text(element=element, text=text) if self.get_latex:
text = self.handle_equations_in_text(element=element, text=text)
if paragraph.text is None: if text is None:
return return
text = text.strip() text = text.strip()

View File

@ -157,7 +157,8 @@ module = [
"deepsearch_glm.*", "deepsearch_glm.*",
"lxml.*", "lxml.*",
"bs4.*", "bs4.*",
"huggingface_hub.*" "huggingface_hub.*",
"pylatexenc.*"
] ]
ignore_missing_imports = true ignore_missing_imports = true