mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Add parsing configuration
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
This commit is contained in:
parent
1f240c7763
commit
31e30a2cb7
@ -27,7 +27,12 @@ _log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
def __init__(
|
||||
self,
|
||||
in_doc: "InputDocument",
|
||||
path_or_stream: Union[BytesIO, Path],
|
||||
get_latex=False,
|
||||
):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
self.XML_KEY = (
|
||||
"{http://schemas.openxmlformats.org/wordprocessingml/2006/main}val"
|
||||
@ -49,6 +54,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
self.level = 0
|
||||
self.listIter = 0
|
||||
|
||||
# Transform MSWord equations to latex
|
||||
self.get_latex = get_latex
|
||||
|
||||
self.history = {
|
||||
"names": [None],
|
||||
"levels": [None],
|
||||
@ -240,9 +248,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
||||
paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
|
||||
|
||||
text = paragraph.text
|
||||
text = self.handle_equations_in_text(element=element, text=text)
|
||||
if self.get_latex:
|
||||
text = self.handle_equations_in_text(element=element, text=text)
|
||||
|
||||
if paragraph.text is None:
|
||||
if text is None:
|
||||
return
|
||||
text = text.strip()
|
||||
|
||||
|
@ -157,7 +157,8 @@ module = [
|
||||
"deepsearch_glm.*",
|
||||
"lxml.*",
|
||||
"bs4.*",
|
||||
"huggingface_hub.*"
|
||||
"huggingface_hub.*",
|
||||
"pylatexenc.*"
|
||||
]
|
||||
ignore_missing_imports = true
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user