mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-29 13:34:21 +00:00
Recommit with fixed history
Signed-off-by: Rafael Teixeira de Lima <Rafael.td.lima@gmail.com>
This commit is contained in:
parent
5139b48e4e
commit
1f240c7763
0
docling/backend/docx_latex/__init__.py
Normal file
0
docling/backend/docx_latex/__init__.py
Normal file
271
docling/backend/docx_latex/latex_dict.py
Normal file
271
docling/backend/docx_latex/latex_dict.py
Normal file
@ -0,0 +1,271 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
"""
|
||||||
|
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/latex_dict.py
|
||||||
|
On 23/01/2025
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
# Characters that escape_latex() prefixes with a backslash.
CHARS = ("{", "}", "_", "^", "#", "&", "$", "%", "~")

BLANK = ""  # joiner used when concatenating LaTeX fragments
BACKSLASH = "\\"
ALN = "&"  # separator placed between the cells of a matrix row

# Unicode accent / group character -> LaTeX template.
#
# Every value is a ``str.format`` template with a single positional slot
# ``{0}`` for the base expression, so literal braces are doubled: the pattern
# is always ``\\name{{{0}}}`` (three opening and three closing braces).
CHR = {
    # Top accents
    "\u0300": "\\grave{{{0}}}",
    "\u0301": "\\acute{{{0}}}",
    "\u0302": "\\hat{{{0}}}",
    "\u0303": "\\tilde{{{0}}}",
    "\u0304": "\\bar{{{0}}}",
    "\u0305": "\\overbar{{{0}}}",
    "\u0306": "\\breve{{{0}}}",
    "\u0307": "\\dot{{{0}}}",
    "\u0308": "\\ddot{{{0}}}",
    "\u0309": "\\ovhook{{{0}}}",
    # The next two previously ended in a fourth "}", which made
    # ``str.format`` raise "Single '}' encountered in format string".
    "\u030a": "\\ocirc{{{0}}}",
    "\u030c": "\\check{{{0}}}",
    "\u0310": "\\candra{{{0}}}",
    "\u0312": "\\oturnedcomma{{{0}}}",
    "\u0315": "\\ocommatopright{{{0}}}",
    "\u031a": "\\droang{{{0}}}",
    "\u0338": "\\not{{{0}}}",
    "\u20d0": "\\leftharpoonaccent{{{0}}}",
    "\u20d1": "\\rightharpoonaccent{{{0}}}",
    "\u20d2": "\\vertoverlay{{{0}}}",
    "\u20d6": "\\overleftarrow{{{0}}}",
    "\u20d7": "\\vec{{{0}}}",
    "\u20db": "\\dddot{{{0}}}",
    "\u20dc": "\\ddddot{{{0}}}",
    "\u20e1": "\\overleftrightarrow{{{0}}}",
    "\u20e7": "\\annuity{{{0}}}",
    "\u20e9": "\\widebridgeabove{{{0}}}",
    "\u20f0": "\\asteraccent{{{0}}}",
    # Bottom accents
    "\u0330": "\\wideutilde{{{0}}}",
    "\u0331": "\\underbar{{{0}}}",
    "\u20e8": "\\threeunderdot{{{0}}}",
    "\u20ec": "\\underrightharpoondown{{{0}}}",
    "\u20ed": "\\underleftharpoondown{{{0}}}",
    "\u20ee": "\\underleftarrow{{{0}}}",  # was misspelled "underledtarrow"
    "\u20ef": "\\underrightarrow{{{0}}}",
    # Over | group
    "\u23b4": "\\overbracket{{{0}}}",
    "\u23dc": "\\overparen{{{0}}}",
    "\u23de": "\\overbrace{{{0}}}",
    # Under | group
    "\u23b5": "\\underbracket{{{0}}}",
    "\u23dd": "\\underparen{{{0}}}",
    "\u23df": "\\underbrace{{{0}}}",
}
|
||||||
|
|
||||||
|
# Big (n-ary) operators: Unicode operator character -> LaTeX command.
# Values are plain command names (no format slots), written as raw strings.
CHR_BO = {
    "\u2140": r"\Bbbsum",
    "\u220f": r"\prod",
    "\u2210": r"\coprod",
    "\u2211": r"\sum",
    "\u222b": r"\int",
    "\u22c0": r"\bigwedge",
    "\u22c1": r"\bigvee",
    "\u22c2": r"\bigcap",
    "\u22c3": r"\bigcup",
    "\u2a00": r"\bigodot",
    "\u2a01": r"\bigoplus",
    "\u2a02": r"\bigotimes",
}
|
||||||
|
|
||||||
|
# Unicode math character -> LaTeX replacement text.
#
# Command replacements keep a trailing space so that a following letter is
# not absorbed into the command name.
T = {
    # Greek letters (mathematical italic small, U+1D6FC..U+1D71B)
    "\U0001d6fc": "\\alpha ",
    "\U0001d6fd": "\\beta ",
    "\U0001d6fe": "\\gamma ",
    "\U0001d6ff": "\\delta ",  # was "\\theta ", duplicating U+1D703
    "\U0001d700": "\\epsilon ",
    "\U0001d701": "\\zeta ",
    "\U0001d702": "\\eta ",
    "\U0001d703": "\\theta ",
    "\U0001d704": "\\iota ",
    "\U0001d705": "\\kappa ",
    "\U0001d706": "\\lambda ",
    "\U0001d707": "\\mu ",  # was truncated to "\\m "
    "\U0001d708": "\\nu ",  # was truncated to "\\n "
    "\U0001d709": "\\xi ",
    "\U0001d70a": "\\omicron ",
    "\U0001d70b": "\\pi ",
    "\U0001d70c": "\\rho ",
    "\U0001d70d": "\\varsigma ",
    "\U0001d70e": "\\sigma ",
    "\U0001d70f": "\\tau ",  # was truncated to "\\ta "
    "\U0001d710": "\\upsilon ",
    "\U0001d711": "\\phi ",
    "\U0001d712": "\\chi ",
    "\U0001d713": "\\psi ",
    "\U0001d714": "\\omega ",
    "\U0001d715": "\\partial ",
    "\U0001d716": "\\varepsilon ",
    "\U0001d717": "\\vartheta ",
    "\U0001d718": "\\varkappa ",
    "\U0001d719": "\\varphi ",
    "\U0001d71a": "\\varrho ",
    "\U0001d71b": "\\varpi ",
    # Relation symbols (U+2192 used to be listed twice; one entry suffices)
    "\u2190": "\\leftarrow ",
    "\u2191": "\\uparrow ",
    "\u2192": "\\rightarrow ",
    "\u2193": "\\downarrow ",  # was "\\downright ", not an arrow command
    "\u2194": "\\leftrightarrow ",
    "\u2195": "\\updownarrow ",
    "\u2196": "\\nwarrow ",
    "\u2197": "\\nearrow ",
    "\u2198": "\\searrow ",
    "\u2199": "\\swarrow ",
    "\u22ee": "\\vdots ",
    "\u22ef": "\\cdots ",
    "\u22f0": "\\adots ",
    "\u22f1": "\\ddots ",
    "\u2260": "\\ne ",
    "\u2264": "\\leq ",
    "\u2265": "\\geq ",
    "\u2266": "\\leqq ",
    "\u2267": "\\geqq ",
    "\u2268": "\\lneqq ",
    "\u2269": "\\gneqq ",
    "\u226a": "\\ll ",
    "\u226b": "\\gg ",
    "\u2208": "\\in ",
    "\u2209": "\\notin ",
    "\u220b": "\\ni ",
    "\u220c": "\\nni ",
    # Ordinary symbols
    "\u221e": "\\infty ",
    # Binary relations
    "\u00b1": "\\pm ",
    "\u2213": "\\mp ",
    # Italic, Latin, uppercase: U+1D434..U+1D44D -> "A".."Z"
    **{chr(0x1D434 + i): chr(ord("A") + i) for i in range(26)},
    # Italic, Latin, lowercase: U+1D44E..U+1D467 -> "a".."z".  The slot that
    # would hold "h" (U+1D455, offset 7) has no entry, as in the original
    # table.
    **{chr(0x1D44E + i): chr(ord("a") + i) for i in range(26) if i != 7},
}
|
||||||
|
|
||||||
|
# Function name -> LaTeX template.  The "{fe}" marker is substituted with
# ``str.replace`` (see FUNC_PLACE), not ``str.format``, so braces here are
# single.
FUNC = {
    "sin": "\\sin({fe})",
    "cos": "\\cos({fe})",
    "tan": "\\tan({fe})",
    "arcsin": "\\arcsin({fe})",
    "arccos": "\\arccos({fe})",
    "arctan": "\\arctan({fe})",
    "arccot": "\\arccot({fe})",
    "sinh": "\\sinh({fe})",
    "cosh": "\\cosh({fe})",
    "tanh": "\\tanh({fe})",
    "coth": "\\coth({fe})",
    "sec": "\\sec({fe})",
    "csc": "\\csc({fe})",
}

# Placeholder inside FUNC templates that marks where the argument goes.
FUNC_PLACE = "{fe}"

# LaTeX line break (two backslashes).
BRK = "\\\\"

# Fallback accent template used when no accent character is given.
CHR_DEFAULT = {
    "ACC_VAL": "\\hat{{{0}}}",
}

# Bar position -> template.
POS = {
    "top": "\\overline{{{0}}}",  # not sure
    "bot": "\\underline{{{0}}}",
}

POS_DEFAULT = {
    "BAR_VAL": "\\overline{{{0}}}",
}

# Subscript / superscript templates (one positional slot).
SUB = "_{{{0}}}"

SUP = "^{{{0}}}"

# Fraction type -> template with named slots ``num`` and ``den``.
F = {
    "bar": "\\frac{{{num}}}{{{den}}}",
    "skw": r"^{{{num}}}/_{{{den}}}",
    "noBar": "\\genfrac{{}}{{}}{{0pt}}{{}}{{{num}}}{{{den}}}",
    "lin": "{{{num}}}/{{{den}}}",
}
F_DEFAULT = "\\frac{{{num}}}{{{den}}}"

# Delimiter template with named slots ``left``, ``text`` and ``right``.
D = "\\left{left}{text}\\right{right}"

D_DEFAULT = {
    "left": "(",
    "right": ")",
    "null": ".",
}

# Radical templates, with and without an explicit degree.
RAD = "\\sqrt[{deg}]{{{text}}}"
RAD_DEFAULT = "\\sqrt{{{text}}}"
ARR = "{text}"

# Limit-style function name -> template with a named ``lim`` slot.
LIM_FUNC = {
    "lim": "\\lim_{{{lim}}}",
    "max": "\\max_{{{lim}}}",
    "min": "\\min_{{{lim}}}",
}

# (long form, short form) pair; the first is replaced by the second.
LIM_TO = ("\\rightarrow", "\\to")

LIM_UPP = "\\overset{{{lim}}}{{{text}}}"

# Matrix environment.  The backslash before "end" is now escaped explicitly;
# the old "\e" was an invalid escape sequence that only worked because
# Python leaves unknown escapes untouched (while emitting a warning).
M = "\\begin{{matrix}}{text}\\end{{matrix}}"
|
460
docling/backend/docx_latex/omml.py
Normal file
460
docling/backend/docx_latex/omml.py
Normal file
@ -0,0 +1,460 @@
|
|||||||
|
"""
|
||||||
|
Office Math Markup Language (OMML)
|
||||||
|
|
||||||
|
Adapted from https://github.com/xiilei/dwml/blob/master/dwml/omml.py
|
||||||
|
On 23/01/2025
|
||||||
|
"""
|
||||||
|
|
||||||
|
import lxml.etree as ET
|
||||||
|
from pylatexenc.latexencode import UnicodeToLatexEncoder
|
||||||
|
|
||||||
|
from docling.backend.docx_latex.latex_dict import (
|
||||||
|
ALN,
|
||||||
|
ARR,
|
||||||
|
BACKSLASH,
|
||||||
|
BLANK,
|
||||||
|
BRK,
|
||||||
|
CHARS,
|
||||||
|
CHR,
|
||||||
|
CHR_BO,
|
||||||
|
CHR_DEFAULT,
|
||||||
|
D_DEFAULT,
|
||||||
|
F_DEFAULT,
|
||||||
|
FUNC,
|
||||||
|
FUNC_PLACE,
|
||||||
|
LIM_FUNC,
|
||||||
|
LIM_TO,
|
||||||
|
LIM_UPP,
|
||||||
|
POS,
|
||||||
|
POS_DEFAULT,
|
||||||
|
RAD,
|
||||||
|
RAD_DEFAULT,
|
||||||
|
SUB,
|
||||||
|
SUP,
|
||||||
|
D,
|
||||||
|
F,
|
||||||
|
M,
|
||||||
|
T,
|
||||||
|
)
|
||||||
|
|
||||||
|
OMML_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/math}"
|
||||||
|
|
||||||
|
|
||||||
|
def load(stream):
    """Parse an XML document from *stream* and yield its equations.

    The document is parsed with ``ET.parse``; each ``oMath`` element returned
    by ``findall`` on the parsed tree is wrapped in an ``oMath2Latex``
    converter and yielded lazily.
    """
    document = ET.parse(stream)
    yield from map(oMath2Latex, document.findall(OMML_NS + "oMath"))
|
||||||
|
|
||||||
|
|
||||||
|
def load_string(string):
    """Parse XML from *string* and yield its equations.

    Same as parsing from a stream, except the input is handed to
    ``ET.fromstring``; each ``oMath`` element found on the resulting root is
    yielded as an ``oMath2Latex`` converter.
    """
    math_nodes = ET.fromstring(string).findall(OMML_NS + "oMath")
    yield from (oMath2Latex(node) for node in math_nodes)
|
||||||
|
|
||||||
|
|
||||||
|
def escape_latex(strs):
    """Backslash-escape the characters listed in ``CHARS``.

    Doubled backslashes in *strs* are first collapsed to a single one.  Then
    every character found in ``CHARS`` gets a backslash prefix, unless the
    character immediately before it is already a backslash.
    """
    collapsed = strs.replace(r"\\", "\\")
    pieces = []
    previous = None
    for ch in collapsed:
        needs_escape = ch in CHARS and previous != BACKSLASH
        pieces.append(BACKSLASH + ch if needs_escape else ch)
        previous = ch
    return BLANK.join(pieces)
|
||||||
|
|
||||||
|
|
||||||
|
def get_val(key, default=None, store=CHR):
    """Translate *key* through *store*, falling back sensibly.

    Returns *default* when *key* is ``None``.  Otherwise returns the entry of
    *store* for *key*, or *key* itself when *store* is empty/falsy or has no
    such entry.
    """
    if key is None:
        return default
    if not store:
        return key
    return store.get(key, key)
|
||||||
|
|
||||||
|
|
||||||
|
class Tag2Method(object):
    """Dispatch OMML child elements to handler methods by tag name.

    Subclasses are expected to provide a ``tag2meth`` mapping from a tag name
    (with the OMML namespace stripped) to a handler taking ``(self, elm)``.
    """

    def call_method(self, elm, stag=None):
        """Run the handler registered for *elm* and return its result.

        *stag* is the namespace-stripped tag name; it is derived from
        ``elm.tag`` when not supplied.  Returns ``None`` when no handler is
        registered for the tag.
        """
        if stag is None:
            stag = elm.tag.replace(OMML_NS, "")
        method = self.tag2meth.get(stag)
        if method:
            return method(self, elm)
        return None

    def process_children_list(self, elm, include=None):
        """
        process children of the elm,return iterable

        Yields ``(stag, result, child)`` for every child in the OMML
        namespace whose handler (or ``process_unknow`` fallback) returned a
        non-``None`` result.  When *include* is given, only children whose
        stripped tag is in it are considered.
        """
        for _e in list(elm):
            # lxml exposes comments and processing instructions as children
            # whose ``tag`` is not a string; the substring test below would
            # raise TypeError on them, so skip them explicitly.
            if not isinstance(_e.tag, str):
                continue
            if OMML_NS not in _e.tag:
                continue
            stag = _e.tag.replace(OMML_NS, "")
            if include and (stag not in include):
                continue
            t = self.call_method(_e, stag=stag)
            if t is None:
                t = self.process_unknow(_e, stag)
                if t is None:
                    continue
            yield (stag, t, _e)

    def process_children_dict(self, elm, include=None):
        """
        process children of the elm,return dict

        Keys are stripped tag names; when a tag occurs more than once the
        last result wins.
        """
        return {
            stag: t for stag, t, _ in self.process_children_list(elm, include)
        }

    def process_children(self, elm, include=None):
        """
        process children of the elm,return string

        Results that are ``Tag2Method`` instances are converted with
        ``str()`` before everything is concatenated.
        """
        return BLANK.join(
            t if not isinstance(t, Tag2Method) else str(t)
            for _, t, _ in self.process_children_list(elm, include)
        )

    def process_unknow(self, elm, stag):
        """Fallback for tags without a handler; the base class ignores them."""
        return None
|
||||||
|
|
||||||
|
|
||||||
|
class Pr(Tag2Method):
    """Common properties of an element.

    Processing the children stores the ``val`` attribute of each ``chr``,
    ``pos``, ``begChr``, ``endChr`` and ``type`` child; these are then
    readable as attributes (``pr.chr`` etc.), with ``None`` for anything that
    was not present.  ``text`` holds the concatenated output of the children
    (a ``brk`` child contributes a LaTeX line break).
    """

    text = ""

    __val_tags = ("chr", "pos", "begChr", "endChr", "type")

    __innerdict = None  # can't use the __dict__

    def __init__(self, elm):
        self.__innerdict = {}
        self.text = self.process_children(elm)

    def __str__(self):
        return self.text

    def __unicode__(self):
        # Previously ``self.__str__(self)``, which passed ``self`` twice and
        # raised TypeError.
        return self.__str__()

    def __getattr__(self, name):
        # Only reached when normal attribute lookup fails; unknown names
        # resolve to None instead of raising AttributeError.
        return self.__innerdict.get(name, None)

    def do_brk(self, elm):
        """Record the break and emit a LaTeX line break."""
        self.__innerdict["brk"] = BRK
        return BRK

    def do_common(self, elm):
        """Store the element's ``val`` attribute under its tag name.

        Always returns ``None`` so the element contributes no text.
        """
        stag = elm.tag.replace(OMML_NS, "")
        if stag in self.__val_tags:
            t = elm.get("{0}val".format(OMML_NS))
            self.__innerdict[stag] = t
        return None

    tag2meth = {
        "brk": do_brk,
        "chr": do_common,
        "pos": do_common,
        "begChr": do_common,
        "endChr": do_common,
        "type": do_common,
    }
|
||||||
|
|
||||||
|
|
||||||
|
class NotSupport(Exception):
    """Raised when an OMML construct has no LaTeX mapping in this module."""


class oMath2Latex(Tag2Method):
    """
    Convert oMath element of omml to latex

    The conversion runs in ``__init__``; the result is available through the
    ``latex`` property or ``str()``.
    """

    _t_dict = T

    # Container tags whose output is simply the concatenation of their
    # children.
    __direct_tags = ("box", "sSub", "sSup", "sSubSup", "num", "den", "deg", "e")

    u = UnicodeToLatexEncoder(
        replacement_latex_protection="braces-all",
        unknown_char_policy="keep",
        unknown_char_warning=False,
    )

    def __init__(self, element):
        self._latex = self.process_children(element)

    def __str__(self):
        return self.latex

    def __unicode__(self):
        # Previously ``self.__str__(self)``, which raised TypeError.
        return self.__str__()

    def process_unknow(self, elm, stag):
        """Handle tags without a dedicated ``do_*`` method.

        Direct container tags are flattened to their children's output, tags
        ending in ``Pr`` become ``Pr`` objects, anything else is ignored.
        """
        if stag in self.__direct_tags:
            return self.process_children(elm)
        if stag[-2:] == "Pr":
            return Pr(elm)
        return None

    @property
    def latex(self):
        return self._latex

    def do_acc(self, elm):
        """
        the accent function
        """
        c_dict = self.process_children_dict(elm)
        latex_s = get_val(
            c_dict["accPr"].chr, default=CHR_DEFAULT.get("ACC_VAL"), store=CHR
        )
        return latex_s.format(c_dict["e"])

    def do_bar(self, elm):
        """
        the bar function
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict["barPr"]
        latex_s = get_val(pr.pos, default=POS_DEFAULT.get("BAR_VAL"), store=POS)
        return pr.text + latex_s.format(c_dict["e"])

    def do_d(self, elm):
        """
        the delimiter object

        An empty begin/end character is rendered as the null delimiter.
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict["dPr"]
        null = D_DEFAULT.get("null")
        # Leftover debug print() calls were removed here: library code should
        # not write to stdout on every delimiter.
        s_val = get_val(pr.begChr, default=D_DEFAULT.get("left"), store=T)
        e_val = get_val(pr.endChr, default=D_DEFAULT.get("right"), store=T)
        return pr.text + D.format(
            left=null if not s_val else escape_latex(s_val),
            text=c_dict["e"],
            right=null if not e_val else escape_latex(e_val),
        )

    def do_spre(self, elm):
        """
        the Pre-Sub-Superscript object -- Not support yet
        """
        pass

    def do_sub(self, elm):
        """Render the children as a subscript."""
        text = self.process_children(elm)
        return SUB.format(text)

    def do_sup(self, elm):
        """Render the children as a superscript."""
        text = self.process_children(elm)
        return SUP.format(text)

    def do_f(self, elm):
        """
        the fraction object
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict["fPr"]
        latex_s = get_val(pr.type, default=F_DEFAULT, store=F)
        return pr.text + latex_s.format(num=c_dict.get("num"), den=c_dict.get("den"))

    def do_func(self, elm):
        """
        the Function-Apply object (Examples:sin cos)
        """
        c_dict = self.process_children_dict(elm)
        func_name = c_dict.get("fName")
        return func_name.replace(FUNC_PLACE, c_dict.get("e"))

    def do_fname(self, elm):
        """
        the func name

        Raises ``NotSupport`` when a run names a function that is not in
        ``FUNC``.
        """
        latex_chars = []
        for stag, t, e in self.process_children_list(elm):
            if stag == "r":
                if FUNC.get(t):
                    latex_chars.append(FUNC[t])
                else:
                    raise NotSupport("Not support func %s" % t)
            else:
                latex_chars.append(t)
        t = BLANK.join(latex_chars)
        return t if FUNC_PLACE in t else t + FUNC_PLACE  # do_func will replace this

    def do_groupchr(self, elm):
        """
        the Group-Character object
        """
        c_dict = self.process_children_dict(elm)
        pr = c_dict["groupChrPr"]
        latex_s = get_val(pr.chr)
        return pr.text + latex_s.format(c_dict["e"])

    def do_rad(self, elm):
        """
        the radical object
        """
        c_dict = self.process_children_dict(elm)
        text = c_dict.get("e")
        deg_text = c_dict.get("deg")
        if deg_text:
            return RAD.format(deg=deg_text, text=text)
        return RAD_DEFAULT.format(text=text)

    def do_eqarr(self, elm):
        """
        the Array object
        """
        return ARR.format(
            text=BRK.join(
                [t for stag, t, e in self.process_children_list(elm, include=("e",))]
            )
        )

    def do_limlow(self, elm):
        """
        the Lower-Limit object

        Raises ``NotSupport`` when the base is not a key of ``LIM_FUNC``.
        """
        t_dict = self.process_children_dict(elm, include=("e", "lim"))
        latex_s = LIM_FUNC.get(t_dict["e"])
        if not latex_s:
            raise NotSupport("Not support lim %s" % t_dict["e"])
        return latex_s.format(lim=t_dict.get("lim"))

    def do_limupp(self, elm):
        """
        the Upper-Limit object
        """
        t_dict = self.process_children_dict(elm, include=("e", "lim"))
        return LIM_UPP.format(lim=t_dict.get("lim"), text=t_dict.get("e"))

    def do_lim(self, elm):
        """
        the lower limit of the limLow object and the upper limit of the limUpp function
        """
        return self.process_children(elm).replace(LIM_TO[0], LIM_TO[1])

    def do_m(self, elm):
        """
        the Matrix object
        """
        # Only "mr" rows contribute; "mPr" and anything else is ignored.  The
        # old code compared with ``stag is "mPr"``, an identity test on a
        # string literal that is not guaranteed to hold and triggers a
        # SyntaxWarning.
        rows = [t for stag, t, e in self.process_children_list(elm) if stag == "mr"]
        return M.format(text=BRK.join(rows))

    def do_mr(self, elm):
        """
        a single row of the matrix m
        """
        return ALN.join(
            [t for stag, t, e in self.process_children_list(elm, include=("e",))]
        )

    def do_nary(self, elm):
        """
        the n-ary object
        """
        res = []
        bo = ""
        for stag, t, e in self.process_children_list(elm):
            if stag == "naryPr":
                # Without a default, a missing m:chr made ``bo`` None and the
                # concatenation below raised TypeError.
                # NOTE(review): the OOXML spec appears to define the integral
                # as the operator when m:chr is omitted -- confirm.
                bo = get_val(t.chr, default=CHR_BO["\u222b"], store=CHR_BO)
            else:
                res.append(t)
        return bo + BLANK.join(res)

    def process_unicode(self, s):
        """Convert the text *s* to LaTeX with the shared encoder."""
        out_latex_str = self.u.unicode_to_latex(s)

        # If the encoder output is brace-wrapped although the input was not,
        # replace the surrounding braces with spaces.
        if (
            not s.startswith("{")
            and out_latex_str.startswith("{")
            and not s.endswith("}")
            and out_latex_str.endswith("}")
        ):
            out_latex_str = f" {out_latex_str[1:-1]} "

        # Unwrap \ensuremath{...}: the output is already used in math mode.
        if "ensuremath" in out_latex_str:
            out_latex_str = out_latex_str.replace("\\ensuremath{", " ")
            out_latex_str = out_latex_str.replace("}", " ")

        if out_latex_str.strip().startswith("\\text"):
            out_latex_str = f" \\text{{{out_latex_str}}} "

        return out_latex_str

    def do_r(self, elm):
        """
        Get text from 'r' element,And try convert them to latex symbols
        @todo text style support , (sty)
        @todo \\text (latex pure text support)
        """
        # findtext() returns None when the run has no text child; treat that
        # as empty text instead of failing on iteration.
        run_text = elm.findtext("./{0}t".format(OMML_NS)) or ""

        # Characters are converted one at a time.
        _str = [self.process_unicode(s) for s in run_text]
        proc_str = escape_latex(BLANK.join(_str))
        base_proc_str = run_text

        # Braces introduced by the conversion itself (not present in the
        # source text) are LaTeX grouping, so undo their escaping.
        if "{" not in base_proc_str and "\\{" in proc_str:
            proc_str = proc_str.replace("\\{", "{")

        if "}" not in base_proc_str and "\\}" in proc_str:
            proc_str = proc_str.replace("\\}", "}")

        return proc_str

    tag2meth = {
        "acc": do_acc,
        "r": do_r,
        "bar": do_bar,
        "sub": do_sub,
        "sup": do_sup,
        "f": do_f,
        "func": do_func,
        "fName": do_fname,
        "groupChr": do_groupchr,
        "d": do_d,
        "rad": do_rad,
        "eqArr": do_eqarr,
        "limLow": do_limlow,
        "limUpp": do_limupp,
        "lim": do_lim,
        "m": do_m,
        "mr": do_mr,
        "nary": do_nary,
    }
|
@ -19,6 +19,7 @@ from lxml.etree import XPath
|
|||||||
from PIL import Image, UnidentifiedImageError
|
from PIL import Image, UnidentifiedImageError
|
||||||
|
|
||||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||||
|
from docling.backend.docx_latex.omml import oMath2Latex
|
||||||
from docling.datamodel.base_models import InputFormat
|
from docling.datamodel.base_models import InputFormat
|
||||||
from docling.datamodel.document import InputDocument
|
from docling.datamodel.document import InputDocument
|
||||||
|
|
||||||
@ -133,6 +134,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
def walk_linear(self, body, docx_obj, doc) -> DoclingDocument:
|
||||||
for element in body:
|
for element in body:
|
||||||
tag_name = etree.QName(element).localname
|
tag_name = etree.QName(element).localname
|
||||||
|
|
||||||
# Check for Inline Images (blip elements)
|
# Check for Inline Images (blip elements)
|
||||||
namespaces = {
|
namespaces = {
|
||||||
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
"a": "http://schemas.openxmlformats.org/drawingml/2006/main",
|
||||||
@ -221,12 +223,28 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
else:
|
else:
|
||||||
return label, None
|
return label, None
|
||||||
|
|
||||||
|
def handle_equations_in_text(self, element, text):
    """Return *text* with the paragraph's equations inlined as ``$...$``.

    Walks *element* in document order, collecting the text of non-math ``t``
    elements and, for each ``oMath`` element (but not ``oMathPara``), its
    LaTeX conversion wrapped in dollar signs.

    If the collected plain text does not reproduce *text* (ignoring
    leading/trailing whitespace), the equations cannot be placed reliably
    and *text* is returned unchanged.
    """
    only_texts = []
    texts_and_equations = []
    for subt in element.iter():
        tag = subt.tag
        if not isinstance(tag, str):
            # Comments / processing instructions have a non-string tag.
            continue
        # Match the local name exactly: the old ``endswith("t")`` test also
        # hit unrelated tags ending in "t", whose ``.text`` is None and broke
        # the join below.
        if etree.QName(subt).localname == "t" and "math" not in tag:
            if subt.text:
                only_texts.append(subt.text)
                texts_and_equations.append(subt.text)
        elif "oMath" in tag and "oMathPara" not in tag:
            texts_and_equations.append(f"${str(oMath2Latex(subt))}$")

    # Was an ``assert``: that is stripped under -O and otherwise aborts the
    # whole conversion on a mismatch.  Fall back to the raw text instead.
    if "".join(only_texts).strip() != (text or "").strip():
        return text
    return "".join(texts_and_equations)
|
||||||
|
|
||||||
def handle_text_elements(self, element, docx_obj, doc):
|
def handle_text_elements(self, element, docx_obj, doc):
|
||||||
paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
|
paragraph = docx.text.paragraph.Paragraph(element, docx_obj)
|
||||||
|
|
||||||
|
text = paragraph.text
|
||||||
|
text = self.handle_equations_in_text(element=element, text=text)
|
||||||
|
|
||||||
if paragraph.text is None:
|
if paragraph.text is None:
|
||||||
return
|
return
|
||||||
text = paragraph.text.strip()
|
text = text.strip()
|
||||||
|
|
||||||
# Common styles for bullet and numbered lists.
|
# Common styles for bullet and numbered lists.
|
||||||
# "List Bullet", "List Number", "List Paragraph"
|
# "List Bullet", "List Number", "List Paragraph"
|
||||||
@ -291,7 +309,6 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
|
|||||||
doc.add_text(
|
doc.add_text(
|
||||||
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
|
label=DocItemLabel.PARAGRAPH, parent=self.parents[level - 1], text=text
|
||||||
)
|
)
|
||||||
|
|
||||||
self.update_history(p_style_id, p_level, numid, ilevel)
|
self.update_history(p_style_id, p_level, numid, ilevel)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
22
poetry.lock
generated
22
poetry.lock
generated
@ -3823,10 +3823,10 @@ files = [
|
|||||||
numpy = [
|
numpy = [
|
||||||
{version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
|
{version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
|
||||||
{version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
|
{version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
|
||||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
|
||||||
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
|
||||||
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
|
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
|
||||||
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
|
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
|
||||||
|
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
||||||
|
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -3849,10 +3849,10 @@ files = [
|
|||||||
numpy = [
|
numpy = [
|
||||||
{version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
|
{version = ">=1.21.0", markers = "python_version == \"3.9\" and platform_system == \"Darwin\" and platform_machine == \"arm64\""},
|
||||||
{version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
|
{version = ">=1.19.3", markers = "platform_system == \"Linux\" and platform_machine == \"aarch64\" and python_version >= \"3.8\" and python_version < \"3.10\" or python_version > \"3.9\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_system != \"Darwin\" and python_version < \"3.10\" or python_version >= \"3.9\" and platform_machine != \"arm64\" and python_version < \"3.10\""},
|
||||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
|
||||||
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
|
||||||
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
|
{version = ">=1.21.4", markers = "python_version >= \"3.10\" and platform_system == \"Darwin\" and python_version < \"3.11\""},
|
||||||
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
|
{version = ">=1.21.2", markers = "platform_system != \"Darwin\" and python_version >= \"3.10\" and python_version < \"3.11\""},
|
||||||
|
{version = ">=1.23.5", markers = "python_version >= \"3.11\" and python_version < \"3.12\""},
|
||||||
|
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
@ -4037,8 +4037,8 @@ files = [
|
|||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
numpy = [
|
numpy = [
|
||||||
{version = ">=1.22.4", markers = "python_version < \"3.11\""},
|
{version = ">=1.22.4", markers = "python_version < \"3.11\""},
|
||||||
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
|
||||||
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
|
{version = ">=1.23.2", markers = "python_version == \"3.11\""},
|
||||||
|
{version = ">=1.26.0", markers = "python_version >= \"3.12\""},
|
||||||
]
|
]
|
||||||
python-dateutil = ">=2.8.2"
|
python-dateutil = ">=2.8.2"
|
||||||
pytz = ">=2020.1"
|
pytz = ">=2020.1"
|
||||||
@ -4787,6 +4787,16 @@ files = [
|
|||||||
[package.extras]
|
[package.extras]
|
||||||
windows-terminal = ["colorama (>=0.4.6)"]
|
windows-terminal = ["colorama (>=0.4.6)"]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pylatexenc"
|
||||||
|
version = "2.10"
|
||||||
|
description = "Simple LaTeX parser providing latex-to-unicode and unicode-to-latex conversion"
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
files = [
|
||||||
|
{file = "pylatexenc-2.10.tar.gz", hash = "sha256:3dd8fd84eb46dc30bee1e23eaab8d8fb5a7f507347b23e5f38ad9675c84f40d3"},
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pylint"
|
name = "pylint"
|
||||||
version = "2.17.7"
|
version = "2.17.7"
|
||||||
@ -7751,4 +7761,4 @@ tesserocr = ["tesserocr"]
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.9"
|
python-versions = "^3.9"
|
||||||
content-hash = "08d30cee8d77f9beee32d5dbec1643367ecae2b4c4b47b57fcb337711471eb5c"
|
content-hash = "3727fb425795e596dda2c7b5b726eb58fd28ff3c3c3c08e96b6458204ef9f7dc"
|
||||||
|
@ -57,6 +57,7 @@ onnxruntime = [
|
|||||||
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
|
{ version = "^1.7.0", optional = true, markers = "python_version >= '3.10'" }
|
||||||
]
|
]
|
||||||
pillow = "^10.0.0"
|
pillow = "^10.0.0"
|
||||||
|
pylatexenc = "^2.10"
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
black = {extras = ["jupyter"], version = "^24.4.2"}
|
black = {extras = ["jupyter"], version = "^24.4.2"}
|
||||||
|
Loading…
Reference in New Issue
Block a user