diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 609dd66b..4ad59f1c 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -27,7 +27,12 @@ _log = logging.getLogger(__name__) class MsWordDocumentBackend(DeclarativeDocumentBackend): - def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): + def __init__( + self, + in_doc: "InputDocument", + path_or_stream: Union[BytesIO, Path], + get_latex=False, + ): super().__init__(in_doc, path_or_stream) self.XML_KEY = ( "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val" @@ -49,6 +54,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.level = 0 self.listIter = 0 + # Transform MSWord equations to latex + self.get_latex = get_latex + self.history = { "names": [None], "levels": [None], @@ -240,9 +248,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): paragraph = docx.text.paragraph.Paragraph(element, docx_obj) text = paragraph.text - text = self.handle_equations_in_text(element=element, text=text) + if self.get_latex: + text = self.handle_equations_in_text(element=element, text=text) - if paragraph.text is None: + if text is None: return text = text.strip() diff --git a/pyproject.toml b/pyproject.toml index e2d2c236..7d89056e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -157,7 +157,8 @@ module = [ "deepsearch_glm.*", "lxml.*", "bs4.*", - "huggingface_hub.*" + "huggingface_hub.*", + "pylatexenc.*" ] ignore_missing_imports = true