From 0b836095319ebf2133c4a3a77602718034915e55 Mon Sep 17 00:00:00 2001 From: Rafael Teixeira de Lima Date: Thu, 24 Jul 2025 11:02:24 +0200 Subject: [PATCH 1/4] fix(docx): Adding plain latex equations to table cells (#1986) * Adding plain latex equations to table cells Signed-off-by: Rafael Teixeira de Lima * Adding test files Signed-off-by: Rafael Teixeira de Lima --------- Signed-off-by: Rafael Teixeira de Lima --- docling/backend/msword_backend.py | 11 +- tests/data/docx/table_with_equations.docx | Bin 0 -> 14228 bytes .../docling_v2/table_with_equations.docx.itxt | 3 + .../docling_v2/table_with_equations.docx.json | 174 ++++++++++++++++++ .../docling_v2/table_with_equations.docx.md | 3 + 5 files changed, 190 insertions(+), 1 deletion(-) create mode 100644 tests/data/docx/table_with_equations.docx create mode 100644 tests/data/groundtruth/docling_v2/table_with_equations.docx.itxt create mode 100644 tests/data/groundtruth/docling_v2/table_with_equations.docx.json create mode 100644 tests/data/groundtruth/docling_v2/table_with_equations.docx.md diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index abbcc6f6..45c53a98 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -1104,8 +1104,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) _log.debug(f" spanned before row {spanned_idx}") + # Detect equations in cell text + text, equations = self._handle_equations_in_text( + element=cell._element, text=cell.text + ) + if len(equations) == 0: + text = cell.text + else: + text = text.replace("<eq>", "$").replace("</eq>", "$") + table_cell = TableCell( - text=cell.text, + text=text, row_span=spanned_idx - row_idx, col_span=cell.grid_span, start_row_offset_idx=row.grid_cols_before + row_idx, diff --git a/tests/data/docx/table_with_equations.docx b/tests/data/docx/table_with_equations.docx new file mode 100644 index 0000000000000000000000000000000000000000..151c03b247dafb0d4e4d65fd576e711f2b24b3aa GIT 
binary patch literal 14228 zcmeHu19v9b+V&IMW_N6K+_7z^W7}58wrwXJ+qP}1!wx%6zRa9CXJ-1$x85J{zO`1} zYt^&&byw|rZq?q`mX!bjMFoHZAOQdX0bnzK#^MVQ0PqnE06+mi0;>yHTR9k7IcO`o z+8Ekv(z;k$e9Q#}CeH={zxV&YY+yzx=VOOh-p;P`<`1$2^_|qNF#p) zf$0vc{1Q9Z_S%60ET;q%18qS_!g#mAs65~|y^?AHg;eiMa)QB?=o6=9%1X7oxJTn< zfJM3;Prt!8%;Igq)YgtA$pS=ARNosrt^h5Qo|o>2rg{&Eq)doXCU%Qp#P@$jXWY7L z-vCM4AhDs4sfWOU*UCnQiK|&;_Oucu#57p8pH($Zko4E7Lk&6hIGW3f_eheBwp;bH z$GeEc#^gzsRf&W+b@<@GBu6zgKv?LkTM}03p{U{7T$N8Py^6baqf|gr_qakMNCy-% zv6h-!u9~x+`9=;=7y;DADYUzahd>dtg$>|1l=erH!q0{x?nsBach~)i#K$z`ebqr0@@am-OjO++qLfRhBM`4;0_!WZ5l?jzBTaNv z&qj@ApW<;`6GMa<#Vs|q1w9=}dW_yTXzYu&>0`IN&Z7R10BWcls1Uj*4*1R&AgbIFtV{VQc#gOJiUH*8@CNdkOtXnuR3Lo=J2Q z-l7v~&KwAo`JG9z4~;6bcbhVASET zHY)o$Dm9+zLv2P|Hn;p$%aU01eU+hZa~CNN{^8nCjetW@ADyF4%$HiM5WXm;aTU3S zFnBR_CQJ>i=+;V#E!vRrHpc^8m$_G(I!#K(c<-R8#Rdtz{^yfm={tZqbZ|x=0qW01 zCQ0OrExT>y(h~B;&-C}0-dZ3!-orjg0o@nOff1v#G&Z>)Q3>60%1CT-C8~#An5tw7 zK#*a1AKfap37`qIx<>@?D2W zc@{K?<^%@!rBn8?pZ<)UQOp#nI^8QDuoG+)UP|+st-v-W7z2&5V=-pN8RMcfmo<52 zvao))%6_7e8d}H^ibMzVxFTKsL+1cLpy%))}n z>^0R{Gv_$AFNFvoX2BLwZ76Jfi&?inhwuE3VD&tzR$q3ZCR zJ#bKZ@09(k)VE|2D0)=t)hU~wH()<^U=RH$a&4y*KAO3SxjCJUP!0gCb|#+NUH0ai zKWwDZE2u((=|!sEYN(eri`j%pxf3@peh3Y0m{b{~!TX6d#&rr0(c9&CH@OpmQO;% z=|V%u_WkE7D8bA1R^#^8xLP%qDryKe zrPM5B6dDR+sT@5qgxlGP#DvSrZW=s0%K|`L;fC0z&W)}fyYqj5)^EM?TNR!=?6htK z4B3_Dlyam$Drl2J@K_ZZx1^A3BRN4YD*V_n+mU%B{87BLmUaWNG1mEP%jKDN0@Y&s ziu=i0p|x^OAmYHoydnPR1#^U~%R@uIBN~C(4chrzVJ!Oa=*)@QERnR|(sO^yv9{Mj zNuDf*nYd`;kKE{F8aCi9rnV+=xmbC%l{bR27vad&F$e8n44zp0&SK2N46|r~dmv-E zL)pYfb||sO*o?z?RISBE#d4Z!sHb37Zn@&gAN9|w&oX@pZ$mH{V^Qv3of)FVT)EH| zh|D-Zug!mYr)#*o68)s(US#*Vug9r_xb^vAW@s>qAPQ-p%GY^Vvfmf9TT{Z_kz^S& znOKg-N1(gY8vi`dmL0E>I_f)GNE&8J?NG_%8uqz{BVW~{8*NEWyn-mwr=ZNqB#maI z^wG4J_HBCq`NMnc|L^RuU{EV}^WO65y|%&yKmz@q9sX59{WVAYyPN`g|FU_{GXJ}e zZ?U8A<-~hsAMomz>`BAnDa*fSAx^Nm@B$FR=P9O!8o$}@tP~YHnbK*wu&YS6pY&!~ zo$L;E4xe)`${w;q{2pHEzDVg(f?s*ghiI;1eu77d8Ka}FrDc&hWhN~#ck3gV%O*ed zDX2c)fE|~~kULv~E+WeBfN4M(+)~Ri%r4>3nL!#$Hj}=;LK)HdmO0oA_WnbIb?U~J 
zAFDM8JGTM>kIh^7*b$q!DxSIlSRB2CVDzDovP9_AhcJDHB3bIvuDt$|F5S$sPR*Lk zS=!fz4+B4{lI@$+W^yhgsoEP+qO!h|6UcD8J-k;A|7|tK-p_Rt-~hn*M*slr-57qi z8V3_YOGCQf<)`243bpYt99Gm$j7Of}W)A$Q&LkH z2xMaw=mPk^xKZ3J2vEQ=O-)I>B?tJkybw{ET1*ZtI9ylfX^ZEb`zd2<$ojK$kRN|9kVcmYFjR!>sM^PKjn$W%xH20NP;x=u z5fKlFhIHOwmm<|Y{bjfZH0(UlNGS@KoM@>73^LeE@pb|Vw+eNtKtvJvU>)-LjZEoF zOrIS2qtlA`yO3uLs0#K0e>{r$q3Efl;cxjJ_rA_hAY{>XxSuEeGvTK}wgJG+(I7N~ z^Ip=ncV@OUf`XA2!dBUDiFOh`!(+2{V;{=F);ku9;};&8y@M?^I&-+qJ+T3U`pq-` z66ors3u}RmeMY>)c|y*$T9j$`;jdQ>uSVs&O0aFJ&l@` zv(z{)aN_vp>i6QEns<-jT7JO3dCM#<%GOL~5;xck1qPGQ)(d7 z{a+xXC~!o*ooG?nfc22_C~ze-IMCz$@DD!uz(DlT=V~~$YLyU!RPMN)L^ZIj@zE1y zI7B=sC+&dTCvU6PrQ^t##rd>(T<)A+Ue^wGW^lRM{k+GhihAKy-TSGtXS}cVc7MCM z=>_!qcpJ1R(^bsI@no)0bXvM)$aCsy6>$0@-V9a(>A_E4)b9L(bfhg z!Ut4fdNu$RJHCR2&Yl}Yzg4*(|2AA=3&IBa1-%>pEl$#HC0IQuPhe!HV6*n z)g?c2P*5oDz=>HBf-#NS;17WVl?UjF)%LhieIoXzeIwxf>E|13fso{iTWF`8iYlR&vA0vqY!Jgs^m`8|j&OX&DBXvQ(r~82mF6Q6lj>tBt*9gW*C3aP_GWRpA;Kd1YQ5zk)iMv% z85s>8iAY)VXtai1jPm8j<_hcMn#Kk6J)6HEfSC9vnAy9x(I#(qvM%e(AKc>^OsS6! 
z^}P->VDaloMuU$b>Qz}wG|Knccv0py#G$9D?-w2vO0CPTk-PEZW+(u?uqH$a?dX}| zb6j|qX0-X&sO|1;vrwhc8lpU3Vz6+rMZw2#N>ERYwj9({JZvs9yFNYCq&9yuqAkN} z9fBw}ngJHp1}Ud?uS=xAJ1`D~`iuV8xI=wL8_~Jpnev(hJnL5j%QK=F8BNk4nmi!1ZnM1N$*_bp zcH;|IJEUgWB8|3oLj@%Haw9=Z${4X+Lv(Q^`y@e*iW32KDmpq`0e4cs>lZg6d}%^2$xEQII})xV|vN+GVzCA)#F`C1-=~-8e{6jz^_ST z$c(gZ(GXCQ2qY&6)x5BXxFf2QEk1%2jj<{!=Fg(T%6~ zp4ME^JmTnD)c5oO63gt!0GzphiJ#_-xkOWkFFM5vFV)q!PbdpKO|A?+SV^-)U7fD+ z=)0=$R~h9pd%~tdO0RnQCC&CkNd}*DO~WJz;4v(#&j{;gtuwc!>%ObSKVcH=`e~R- z&+KwE%M&GD%!bb#+eBu_2_Yrs4bFX@t7Xtfo$p$_PF_5lDC;Z3O;z(-;AzctUtT-* z_B%TvRKFbPj^i+mj!TzMQ8z*ah*!>%Gqu)B=ct@4xcg}}u9(2%f8G-GqdK!|ls{pG zu9t6AFP_s7>YS#Yqsm;Ym|KnbTJU0GYTkS`e>c*Wx_Hj>p5FdD0oGM=J3@d00533q z`KY!xbZ{`WGPeIM`PC_}TCK7ndTQ&ubx&Qhw=7o>%~4=;Y~%+w3Z3?ID-ar@90+|Z zVLR7*^9+hNDUp)xqq)*kus50r4-67$f4k^g-ijfYDo6&q6heg!CMPb`|?J} zS-6q#T^oZQUDfD{TYSzBEYK&DFPk3p%aUt`lEZ#5|7sNfnzdiA`Ps0Q)BYUb`h*n_UYES zhJ|(K3opDW}cJH{3?}w*Q#LQhlqx80<4Umg<<)MCyHG);PMut=8+%vz3b3;Fu zECd4Q;gomEc1J;f>@j+U#a_mxsv#bln<``#KXB3;HmT1Q7jq<|H`yJJovtBKRPx|J~h&$~7QnRKlp-USU&rb+twdEa=eab;0Uj*SI@c9Ve5 zS4Vw@hsbhXlxiw)ZnDK@#7CMt)5Q__J!=Z*XDt(p+~Qr3XMFWtSsxNk6c#fDjC8`* zr#nQXDJPPGx36*%kQL0SKx+7v=8Gvd2Wq{i&nEhm0pOYR9#_pl9%*%|J5~AJYao=T zX+F-a-z`EKZ=dC@B{)`xR)sSxPc3&RSS}kv>d$n(l4BmYBQ|J%ai_qFgYdQHcupCP zdhqRutc>FPQFtfh_o;L{Uy1&Egd+Y^ukIyzZ%tFJ%53p^;|+sYoY+d2zB3=DGiFXb zLVlZY>K6KS&MxWArVwYlTKG;sJ@)2UkS+Tcc)GmNBOCUN!As(diCL2eZFl%*=coC% ze~cO)W`={KhyVay9RPsv@3Pt6!PUa>_n5J#scD}kitf2t`hpOj>GXlKfm+Q;V|TeG z!zFHocsC?H;}E@$dH}M{d&lML%TEkQ2h%=FD^5H4l2DQE@~4y-a=wybl@_aIL{|Zoo{D{gV)qvp)Lnd z-fSG&j~ISJnUxa=2#j9cNz_jiYL}{~>10Q-`#a`tUQGr^y>5@$HK8crZ0TYl zDBx(gjXjD0UuNhX7sYsi*O+azZ`4D6VDJ@N-(?4C(rHv1iiswOaCe$~i^0%IUIozN zb}7&DK|Q{R!zaByv_7$kEmI?l@+V{>CKh?`zxjT65b@m8o_A8CIq2n#{UBKWLii1b z_|=@-_=J3NZ}EnOS$mxI_K-B>dd!{EIORj2b8g#C_NXlG+nHf{07!LEEp_J@|3Z^J z0T0vpWo=u{10`%vcMj(ch>U*q6ATKP*Ed!}uSA@I{zP-N;+2Nr{=0q_{10m$rRdU@ zkDLirbV^$`9{2%w)Y?UaQNz~Id_}EHP3VV{od@RV!yi7RhE97?&DT?Q z9%)<7my1deib*+Ept?Uix66Vd6gDr}eqxK=N5yEsl*X4@aVu*3xA 
ztazr^@&|hJ?B1QkLSR|-YL$y+#CbNr3!BWF4cXxs@nzNa?1;sZKaEKF_7cVLF}71H zD9x|z;tfqlK%rae5@ur0um{IVpm-)QImI-IjJyL0b)M*I5v zPv)EzR%Wh0{Kt3N8b=SrVh6=0xRyE3A_nw)3LYnzT&{8l6=LIvy{A*gHGc81rv+qpfaf+!kETJ^Ug^e6p6j%l7v(9v?0{KI+F0U zl{uo?5`K$fLRL_Hc>;$n^jhTYzWISgp8SgE2W_mVWH1$MHj<5`l*~s|%33Z{>c(gy zcxyBjd_10gTiMu3AEg*t*}xvSK6qY{Cph7BLp#XS7BzowYE8^&bi7^@zg}cff3bIO z&uDR*UOm_>?o0L~awk=pP_F(YL2$3w9<%aE#RM*5M1urBrd^zq&@aK|h2|$8Q=B2l z8I6>eQA_52M4^&sh9rmK^cPl>r`Cx57qo^?T~f(L1}r9lB(fJG%jpRY@&+AJ6i*ly ze)F5kt7;*2F5t1*@pfkMq;Q>{UoI6LM)BQ<)`{ifIj_9oZBvUv7QW114Ti=_CwkRt z(5Rx9cF4o!J{5E{$@DyRDYwgfXVKG@>|IUsBY`1`ssOwEdEV>(gH~Sl=))Ju}IaCp^gu^FdO(rnT|frz zp>A}}(tjip@R9qJ(^YIDtD}fZPA?DqxS68>bmEmJ4fgG=ogF*87)ll>ZwBjVKg%Ss z2w1p)5$w;icG^5@@-P=k(mAArSKhP&Sj;&m#J)p=ghC=K_~WMFb0!jOv{~IMQ_85Z za;)lRpA>*%u$ZW4pdm1i@WGjzh$^)&ml~rxm^>_(m;hUSRl`Qn;ANv;Q};L}Vne7Y zB6$)+4E18pPWq?nw034ivUT85w}W<(?&RI}3*lX1m|a~bG~?$LEH#HEbKO>iCB;LO zYkCR&B?x@~6GJ)Kd-{HZIWAR-7IY2HkF&^arebr1GVH=V`qEA!R$qqehh`&VE|QJ$ zh!SMXV=TYAjL!rK6QnmB6`k&?^*OH;<}x&0R$1~>5~UHlykDg%wMlFy)q+K5SY=iZ zIx|Lb;qMZ1Ko3Ye#ZPq92-IPnqP!uGP;ro&mhL7v%Xqg{Vozr<99o|UMy)QArnm6S z-AXg1?}WQYu;rbjsMJ&IR}M}b4Zu%-{;@>BPWF*Qut?GXPNaaCL;P^fAn1E~{wd;j z3$RMaCJV4#+hyp1Rs*Wwo!D}>I^a{pj5H7@$T~R!(3N7YFVIeq@p61kadHIEE5(Sw zmWq%*ER+`B+u_i=w!#6Iw88=8znvy|{C`0H=u@h{x`BOepcw4SLg~+ogT|j7QMwYw z^*_Ph`(mN_b6}z6W=B91ObG`dn&cgez^zC854Za#TCnFqM?qB!f4`%U9r4V1W{3-BUTyz`xa;~6KD4SIpWv7xOyq*&(zg*qI z6K0b@jG(pa7eq9_`5hd2B9~rHU@p=0ma?LS1ep2q;l%L?gz63t(MgFlU$Kt+f>aNxUm zDqaqiyOVzf(+ZNT#~0m;zLlIQ*wvi1;y{*O`742bI}bc|D|cE>CYy_?hrG; zt~9g1T_HAqvmBh;ut_PwZ#VKTz$RpqgVR|n36z~s6ewmXohKKMFhlK4u+bp;h8*1b zQ3cp`xEcBTlzOQk$InDc$jEk*A*Jrm6^)7{kRG80aqWTm$41$V5VVR@*goQ&0NtqC-^sE@Y|D^{YN~0u)z) zCNzH0mcIWi?V2d4?XoqUdF(!?hR?l_*crQIHqCqaq%oJ`0phlATfo~1v;Zjm&;c`I zSZYv39b&xjz>ucoGb?$MH~up(kF}DlHzp-f9xm;W6D??pZYu5vRSPC= zR&k2X!~X0S+VX(~SG^Hy0QRC_E2c(a&ewal+NCAtupYG*HK6h7`e1R#_& z$1cSbBT2}u#Ve0e&i*UBSh&>!=g_l$-3koqd3V+d+OPUoi75p?NPQqCR6g`yN(Ede 
zL^Dh}j3*u^4=+}RU}H+fbB3!*ZBy)L!Inx=XspE6I%nDiMUZekq56`dM+zD?j-yiA;UiOb45|k!| z4Vm1&-g+LZO?L)iJ#Z*g(e?Fu)Y^p_=8k^zR_*AKDfenBw;W{%LDx*BNYQMmOX5hH zEvGS5o~<7Eyec%Gb{k$k&z6x9H%VA7b-Arc9dT`N=~k^3r5srccQ0nKspUql&-$HWM85w z=^280T`kn!x9`=?I8ga-JiB`ykNV14;Bh2lad(M3H&GbLe6?g$JJ8M}` zPbG`p;e+dVTiL$&hkIJqd#)8LvV6Lt80O%pHeLzw6JbMTnOx2v9-cPmI zAF(p&Q2?{s-F`IVq8q}qb1$;3ItT^$o??Y22Ttnc>~=jyb5=r1nFZ)hioua+$+7hi z&ZG?4WWXY|Hcac;Bn_)Qfv}Q1zaihw0^2dNJpACPto40X=9=euhrXtFQi*`VPTbNs z$)Awb`4~7nYOj+v*A40J#BrPm2a8#}-Wt5*`+CJx#*tq%iwehRO8xkX*{WkfrsVqR z#=f;G8WB;%S50554cPg;lJv*;N)?ufO8j+yYtyBq-D-J?gNV_6^X$fY;0=@rdlkP3 z+48~l#79zr#_2ss+})EhF~hn#8(BX!EuA|q$TyY|@3igN8(BFaD}Iy{YN%~9%uQti{%Yg8 zsL|hFbei%DrYU{La1pv9;&hX| z@_GJzqY3jDIkmkl4b>=R5gA6)i7^v9FwcaBk9R5dgOTzM3Z&Mx}53n(| z(%Swa!&^{CV~ev4%CJnY2iN#9JhT4=KZK|vWjH`G zj0tzx9q!KR!=Vi8s;O?BALO8UG9^ox#9^svEP$#xcZv~haY+rcthiUCRUEaR?<W zOG%C_ksrEIMxI~T)~`BAR#Faf-~*gGRfQ&ImMU$G2~&KoFNmc(l%=_amRc_xph9gp zwWxSXM3RRCh@$%&kU|qaZDEQW@Ad^Z6b*?sP_LEu4Jf^y$6~}MQ%tBx3wyzh2u5~0 z!Hu0ZYX%zLj&pxeh#=Sdv=1A4eCJm+IOkO&As;00Xei>XLjq2P$G-Hnbmi;PS(5%B zwR8bJPuLXbFVt3EQEiDA?I{hfs>%iYd~4!)MNjp>u&XREMV(=tT4{T*%R5t42*_?7 z!Gg3Est4K_%!KRj&&zk`7Zq-+cpTa3>l_j{^Kln^AuQSTr6jluLTv!fJ}>sy-zV6V zrFup_rbVPWRqc%((|f*x6eHVhl77s@{pv?576oM*+r8=!Chc;nO_#ctfO3#Bcfxad zL5&*p9cNQ`Ga~mfJSIMT$QyQg1aZ|*TG(RC>C@F-04Gh4xET>bayVJ<6AIB{C#$?Y zEvsG7Y^*Vr{*6zJ!3ovX2B6;@tAL6?i0YjmYTKNW zyExC;Cg9^J$Ph~frJ(35RD1eJS-_q%(iJC%9lomrQb}yde~W6FF$*=2XvK%fap8Z{ zU(evnmX!*(;D0^@enB5tC7ZWuP`}S$@-m}@Iq!Qas%~c4Foh*j(JdXN8ikw&9KI}hBO+2YD;7*5ERxI3?u(Qz0l4J!Y~K6R;wh*HQc8l(EM<- zE@hVE!a@3*`lE=Jz0TeKCXrX3Qlk$@<*NkxqF_=X*>xJT57 z-ZpW#RqwV7+B#{REwa#u`NHi=1*$LUc@$%gkvmH-;zSE%lOwY{zY~ zrYIFLzM_sn^zYNYs3?A3ehOPX-V%!v2*_Y1B=9;S8<%XJ1Hw@&)<7NMB`h6!w?Ed5za170in#H{JZyz97+JWnO zu=kCpj=ErU6d+|*;JE{RJ;0_xw^yz0iG1r(!By%wD+59xPlLP@PTH!72M0oLxOWCr z&O0Wz-S=2OFJEdFIh_TZwxfm}wZy@H3lS1}6@eiFin{sOee=jv2mJp-TKrYhuhg|abwRu@$^9dd?N=4Q zF8=?i0uucX6@R4y{0jed74T0uH}=2b|5+3K75r-{{U?}(>JRWQh4rr*ey#QX)UZhX 
zhlamZe!t@Xo}2$f0|3VF#pQox>R;i150!s~V=(*${zu@Hl>mD;kKZ=-zyP}6gKIm} HZ&&{hQRKPA literal 0 HcmV?d00001 diff --git a/tests/data/groundtruth/docling_v2/table_with_equations.docx.itxt b/tests/data/groundtruth/docling_v2/table_with_equations.docx.itxt new file mode 100644 index 00000000..8b54db7c --- /dev/null +++ b/tests/data/groundtruth/docling_v2/table_with_equations.docx.itxt @@ -0,0 +1,3 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: table with [2x2] + item-2 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/table_with_equations.docx.json b/tests/data/groundtruth/docling_v2/table_with_equations.docx.json new file mode 100644 index 00000000..fc8f9780 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/table_with_equations.docx.json @@ -0,0 +1,174 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.5.0", + "name": "table_with_equations", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "binary_hash": 6528760837820727976, + "filename": "table_with_equations.docx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/tables/0" + }, + { + "$ref": "#/texts/0" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + } + ], + "pictures": [], + "tables": [ + { + "self_ref": "#/tables/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + 
"start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "The next cell has an equation", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "$A= \\pi r^{2}$", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "The next cell has another equation", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "$x=\\frac{-b \\pm \\sqrt{b^{2}-4ac}}{2a}$", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 2, + "num_cols": 2, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "The next cell has an equation", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "$A= \\pi r^{2}$", + "column_header": true, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "The next cell has another equation", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + 
"start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "$x=\\frac{-b \\pm \\sqrt{b^{2}-4ac}}{2a}$", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + }, + "annotations": [] + } + ], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/table_with_equations.docx.md b/tests/data/groundtruth/docling_v2/table_with_equations.docx.md new file mode 100644 index 00000000..837e6550 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/table_with_equations.docx.md @@ -0,0 +1,3 @@ +| The next cell has an equation | $A= \pi r^{2}$ | +|------------------------------------|----------------------------------------| +| The next cell has another equation | $x=\frac{-b \pm \sqrt{b^{2}-4ac}}{2a}$ | \ No newline at end of file From 7b5f86098d07b734f2b6aa8c88ae7cafa265246a Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Thu, 24 Jul 2025 11:07:36 +0200 Subject: [PATCH 2/4] docs: add chat with dosu (#1984) add chat with dosu Signed-off-by: Michele Dolfi --- README.md | 1 + docs/index.md | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/README.md b/README.md index c53e7b79..ebc5aeb7 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT) [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling) [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling) +[![Chat with Dosu](https://dosu.dev/dosu-chat-badge.svg)](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github) [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101) [![LF AI & 
Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/) diff --git a/docs/index.md b/docs/index.md index 7ec40bfa..768612ad 100644 --- a/docs/index.md +++ b/docs/index.md @@ -13,6 +13,7 @@ [![License MIT](https://img.shields.io/github/license/docling-project/docling)](https://opensource.org/licenses/MIT) [![PyPI Downloads](https://static.pepy.tech/badge/docling/month)](https://pepy.tech/projects/docling) [![Docling Actor](https://apify.com/actor-badge?actor=vancura/docling?fpr=docling)](https://apify.com/vancura/docling) +[![Chat with Dosu](https://dosu.dev/dosu-chat-badge.svg)](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github) [![OpenSSF Best Practices](https://www.bestpractices.dev/projects/10101/badge)](https://www.bestpractices.dev/projects/10101) [![LF AI & Data](https://img.shields.io/badge/LF%20AI%20%26%20Data-003778?logo=linuxfoundation&logoColor=fff&color=0094ff&labelColor=003778)](https://lfaidata.foundation/projects/) @@ -46,6 +47,13 @@ Docling simplifies document processing, parsing diverse formats — including ad Reference
See more API details
+## Live assistant + +Do you want to leverage the power of AI and get live support on Docling? +Try out the [Chat with Dosu](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github) functionalities provided by our friends at [Dosu](https://dosu.dev/). + +[![Chat with Dosu](https://dosu.dev/dosu-chat-badge.svg)](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github) + ## LF AI & Data Docling is hosted as a project in the [LF AI & Data Foundation](https://lfaidata.foundation/projects/). From 5132f061a8125332ba10a4a30e0dd4973637a11b Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Thu, 24 Jul 2025 11:19:25 +0200 Subject: [PATCH 3/4] fix(HTML): concatenation of child strings in table cells and list items (#1981) fix(HTML): ensure correct concatenation of child strings in table cells and list items Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/backend/html_backend.py | 38 ++++- .../docling_v2/pntd.0008301.nxml.json | 4 +- .../docling_v2/pone.0234687.nxml.json | 144 +++++++++--------- .../docling_v2/wiki_duck.html.json | 102 ++++++------- .../groundtruth/docling_v2/wiki_duck.html.md | 8 +- 5 files changed, 162 insertions(+), 134 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 1d3f5712..dffc095b 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -5,7 +5,7 @@ from io import BytesIO from pathlib import Path from typing import Final, Optional, Union, cast -from bs4 import BeautifulSoup, NavigableString, Tag +from bs4 import BeautifulSoup, NavigableString, PageElement, Tag from bs4.element import PreformattedString from docling_core.types.doc import ( DocItem, @@ -297,7 +297,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): ): parts.append(child) elif isinstance(child, Tag) and child.name not in ("ul", "ol"): - text_part = 
child.get_text() + text_part = HTMLDocumentBackend.get_text(child) if text_part: parts.append(text_part) li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip() @@ -417,6 +417,36 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): content_layer=self.content_layer, ) + @staticmethod + def get_text(item: PageElement) -> str: + """Concatenate all child strings of a PageElement. + + This method is equivalent to `PageElement.get_text()` but also considers + certain tags. When called on a <p> or <li> tags, it returns the text with a + trailing space, otherwise the text is concatenated without separators. + """ + + def _extract_text_recursively(item: PageElement) -> list[str]: + """Recursively extract text from all child nodes.""" + result: list[str] = [] + + if isinstance(item, NavigableString): + result = [item] + elif isinstance(item, Tag): + tag = cast(Tag, item) + parts: list[str] = [] + for child in tag: + parts.extend(_extract_text_recursively(child)) + result.append( + "".join(parts) + " " if tag.name in {"p", "li"} else "".join(parts) + ) + + return result + + parts: list[str] = _extract_text_recursively(item) + + return "".join(parts) + @staticmethod def _get_cell_spans(cell: Tag) -> tuple[int, int]: """Extract colspan and rowspan values from a table cell tag. @@ -510,9 +540,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): formula.replace_with(NavigableString(math_formula)) # TODO: extract content correctly from table-cells with lists - text = html_cell.text - - # label = html_cell.name + text = HTMLDocumentBackend.get_text(html_cell).strip() col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell) if row_header: row_span -= 1 diff --git a/tests/data/groundtruth/docling_v2/pntd.0008301.nxml.json b/tests/data/groundtruth/docling_v2/pntd.0008301.nxml.json index 1fae7c12..7d1e82b4 100644 --- a/tests/data/groundtruth/docling_v2/pntd.0008301.nxml.json +++ b/tests/data/groundtruth/docling_v2/pntd.0008301.nxml.json @@ -5839,7 +5839,7 @@ "end_row_offset_idx": 4, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": " Number of total districts", + "text": "Number of total districts", "column_header": false, "row_header": false, "row_section": false @@ -6642,7 +6642,7 @@ "end_row_offset_idx": 4, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": " Number of total districts", + "text": "Number of total districts", "column_header": false, "row_header": false, "row_section": false diff --git 
a/tests/data/groundtruth/docling_v2/pone.0234687.nxml.json b/tests/data/groundtruth/docling_v2/pone.0234687.nxml.json index 7d4f7ef8..b504fd9c 100644 --- a/tests/data/groundtruth/docling_v2/pone.0234687.nxml.json +++ b/tests/data/groundtruth/docling_v2/pone.0234687.nxml.json @@ -4166,7 +4166,7 @@ "end_row_offset_idx": 6, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Ground corn", + "text": "Ground corn", "column_header": false, "row_header": false, "row_section": false @@ -4298,7 +4298,7 @@ "end_row_offset_idx": 7, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Soybean meal", + "text": "Soybean meal", "column_header": false, "row_header": false, "row_section": false @@ -4430,7 +4430,7 @@ "end_row_offset_idx": 8, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Corn silage", + "text": "Corn silage", "column_header": false, "row_header": false, "row_section": false @@ -4562,7 +4562,7 @@ "end_row_offset_idx": 9, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Ann temperate pasture", + "text": "Ann temperate pasture", "column_header": false, "row_header": false, "row_section": false @@ -4694,7 +4694,7 @@ "end_row_offset_idx": 10, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Ann tropical pasture", + "text": "Ann tropical pasture", "column_header": false, "row_header": false, "row_section": false @@ -4826,7 +4826,7 @@ "end_row_offset_idx": 11, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Perenn tropical pasture", + "text": "Perenn tropical pasture", "column_header": false, "row_header": false, "row_section": false @@ -4970,7 +4970,7 @@ "end_row_offset_idx": 13, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Organic matter", + "text": "Organic matter", "column_header": false, "row_header": false, "row_section": false @@ -5102,7 +5102,7 @@ "end_row_offset_idx": 14, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    
Crude protein", + "text": "Crude protein", "column_header": false, "row_header": false, "row_section": false @@ -5234,7 +5234,7 @@ "end_row_offset_idx": 15, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Neutral detergent fibre", + "text": "Neutral detergent fibre", "column_header": false, "row_header": false, "row_section": false @@ -5366,7 +5366,7 @@ "end_row_offset_idx": 16, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Acid detergent fibre", + "text": "Acid detergent fibre", "column_header": false, "row_header": false, "row_section": false @@ -5498,7 +5498,7 @@ "end_row_offset_idx": 17, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Ether extract", + "text": "Ether extract", "column_header": false, "row_header": false, "row_section": false @@ -5642,7 +5642,7 @@ "end_row_offset_idx": 19, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    OM digestibility, %", + "text": "OM digestibility, %", "column_header": false, "row_header": false, "row_section": false @@ -5774,7 +5774,7 @@ "end_row_offset_idx": 20, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    NEL, Mcal (kg DM)-1", + "text": "NEL, Mcal (kg DM)-1", "column_header": false, "row_header": false, "row_section": false @@ -5906,7 +5906,7 @@ "end_row_offset_idx": 21, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    MP, g (kg DM)-1", + "text": "MP, g (kg DM)-1", "column_header": false, "row_header": false, "row_section": false @@ -6713,7 +6713,7 @@ "end_row_offset_idx": 6, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Ground corn", + "text": "Ground corn", "column_header": false, "row_header": false, "row_section": false @@ -6847,7 +6847,7 @@ "end_row_offset_idx": 7, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Soybean meal", + "text": "Soybean meal", "column_header": false, "row_header": false, "row_section": false @@ -6981,7 +6981,7 @@ "end_row_offset_idx": 
8, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Corn silage", + "text": "Corn silage", "column_header": false, "row_header": false, "row_section": false @@ -7115,7 +7115,7 @@ "end_row_offset_idx": 9, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Ann temperate pasture", + "text": "Ann temperate pasture", "column_header": false, "row_header": false, "row_section": false @@ -7249,7 +7249,7 @@ "end_row_offset_idx": 10, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Ann tropical pasture", + "text": "Ann tropical pasture", "column_header": false, "row_header": false, "row_section": false @@ -7383,7 +7383,7 @@ "end_row_offset_idx": 11, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Perenn tropical pasture", + "text": "Perenn tropical pasture", "column_header": false, "row_header": false, "row_section": false @@ -7651,7 +7651,7 @@ "end_row_offset_idx": 13, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Organic matter", + "text": "Organic matter", "column_header": false, "row_header": false, "row_section": false @@ -7785,7 +7785,7 @@ "end_row_offset_idx": 14, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Crude protein", + "text": "Crude protein", "column_header": false, "row_header": false, "row_section": false @@ -7919,7 +7919,7 @@ "end_row_offset_idx": 15, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Neutral detergent fibre", + "text": "Neutral detergent fibre", "column_header": false, "row_header": false, "row_section": false @@ -8053,7 +8053,7 @@ "end_row_offset_idx": 16, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Acid detergent fibre", + "text": "Acid detergent fibre", "column_header": false, "row_header": false, "row_section": false @@ -8187,7 +8187,7 @@ "end_row_offset_idx": 17, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Ether extract", + "text": "Ether extract", 
"column_header": false, "row_header": false, "row_section": false @@ -8455,7 +8455,7 @@ "end_row_offset_idx": 19, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    OM digestibility, %", + "text": "OM digestibility, %", "column_header": false, "row_header": false, "row_section": false @@ -8589,7 +8589,7 @@ "end_row_offset_idx": 20, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    NEL, Mcal (kg DM)-1", + "text": "NEL, Mcal (kg DM)-1", "column_header": false, "row_header": false, "row_section": false @@ -8723,7 +8723,7 @@ "end_row_offset_idx": 21, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    MP, g (kg DM)-1", + "text": "MP, g (kg DM)-1", "column_header": false, "row_header": false, "row_section": false @@ -8998,7 +8998,7 @@ "end_row_offset_idx": 3, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Corn grain", + "text": "Corn grain", "column_header": false, "row_header": false, "row_section": false @@ -9058,7 +9058,7 @@ "end_row_offset_idx": 4, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Soybean", + "text": "Soybean", "column_header": false, "row_header": false, "row_section": false @@ -9178,7 +9178,7 @@ "end_row_offset_idx": 6, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Corn silageb", + "text": "Corn silageb", "column_header": false, "row_header": false, "row_section": false @@ -9238,7 +9238,7 @@ "end_row_offset_idx": 7, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Annual ryegrassc", + "text": "Annual ryegrassc", "column_header": false, "row_header": false, "row_section": false @@ -9298,7 +9298,7 @@ "end_row_offset_idx": 8, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Pearl milletd", + "text": "Pearl milletd", "column_header": false, "row_header": false, "row_section": false @@ -9358,7 +9358,7 @@ "end_row_offset_idx": 9, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Kikuyu grasse", + 
"text": "Kikuyu grasse", "column_header": false, "row_header": false, "row_section": false @@ -9547,7 +9547,7 @@ "end_row_offset_idx": 3, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Corn grain", + "text": "Corn grain", "column_header": false, "row_header": false, "row_section": false @@ -9609,7 +9609,7 @@ "end_row_offset_idx": 4, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Soybean", + "text": "Soybean", "column_header": false, "row_header": false, "row_section": false @@ -9733,7 +9733,7 @@ "end_row_offset_idx": 6, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Corn silageb", + "text": "Corn silageb", "column_header": false, "row_header": false, "row_section": false @@ -9795,7 +9795,7 @@ "end_row_offset_idx": 7, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Annual ryegrassc", + "text": "Annual ryegrassc", "column_header": false, "row_header": false, "row_section": false @@ -9857,7 +9857,7 @@ "end_row_offset_idx": 8, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Pearl milletd", + "text": "Pearl milletd", "column_header": false, "row_header": false, "row_section": false @@ -9919,7 +9919,7 @@ "end_row_offset_idx": 9, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Kikuyu grasse", + "text": "Kikuyu grasse", "column_header": false, "row_header": false, "row_section": false @@ -10182,7 +10182,7 @@ "end_row_offset_idx": 4, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    N organic fertilizer, kg ha-1a", + "text": "N organic fertilizer, kg ha-1a", "column_header": false, "row_header": false, "row_section": false @@ -10242,7 +10242,7 @@ "end_row_offset_idx": 5, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    N synthetic fertilizer", + "text": "N synthetic fertilizer", "column_header": false, "row_header": false, "row_section": false @@ -10302,7 +10302,7 @@ "end_row_offset_idx": 6, "start_col_offset_idx": 0, 
"end_col_offset_idx": 1, - "text": "    N from residual DM, kg ha-1b", + "text": "N from residual DM, kg ha-1b", "column_header": false, "row_header": false, "row_section": false @@ -10362,7 +10362,7 @@ "end_row_offset_idx": 7, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Emission fator, kg N2O-N (kg N)-1c", + "text": "Emission fator, kg N2O-N (kg N)-1c", "column_header": false, "row_header": false, "row_section": false @@ -10422,7 +10422,7 @@ "end_row_offset_idx": 8, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg N2O ha-1 from direct emissions", + "text": "kg N2O ha-1 from direct emissions", "column_header": false, "row_header": false, "row_section": false @@ -10542,7 +10542,7 @@ "end_row_offset_idx": 10, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg NH3-N+NOx-N (kg organic N)-1b", + "text": "kg NH3-N+NOx-N (kg organic N)-1b", "column_header": false, "row_header": false, "row_section": false @@ -10602,7 +10602,7 @@ "end_row_offset_idx": 11, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg NH3-N+NOx-N (kg synthetic N)-1b", + "text": "kg NH3-N+NOx-N (kg synthetic N)-1b", "column_header": false, "row_header": false, "row_section": false @@ -10662,7 +10662,7 @@ "end_row_offset_idx": 12, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg N2O-N (kg NH3-N+NOx-N)-1b", + "text": "kg N2O-N (kg NH3-N+NOx-N)-1b", "column_header": false, "row_header": false, "row_section": false @@ -10722,7 +10722,7 @@ "end_row_offset_idx": 13, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg N2O ha-1 from NH3+NOx volatilized", + "text": "kg N2O ha-1 from NH3+NOx volatilized", "column_header": false, "row_header": false, "row_section": false @@ -10842,7 +10842,7 @@ "end_row_offset_idx": 15, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg N losses by leaching (kg N)-1b", + "text": "kg N losses by leaching (kg N)-1b", "column_header": false, 
"row_header": false, "row_section": false @@ -10902,7 +10902,7 @@ "end_row_offset_idx": 16, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg N2O-N (kg N leaching)-1", + "text": "kg N2O-N (kg N leaching)-1", "column_header": false, "row_header": false, "row_section": false @@ -10962,7 +10962,7 @@ "end_row_offset_idx": 17, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg N2O ha-1 from N losses by leaching", + "text": "kg N2O ha-1 from N losses by leaching", "column_header": false, "row_header": false, "row_section": false @@ -11873,7 +11873,7 @@ "end_row_offset_idx": 4, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    N organic fertilizer, kg ha-1a", + "text": "N organic fertilizer, kg ha-1a", "column_header": false, "row_header": false, "row_section": false @@ -11935,7 +11935,7 @@ "end_row_offset_idx": 5, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    N synthetic fertilizer", + "text": "N synthetic fertilizer", "column_header": false, "row_header": false, "row_section": false @@ -11997,7 +11997,7 @@ "end_row_offset_idx": 6, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    N from residual DM, kg ha-1b", + "text": "N from residual DM, kg ha-1b", "column_header": false, "row_header": false, "row_section": false @@ -12059,7 +12059,7 @@ "end_row_offset_idx": 7, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Emission fator, kg N2O-N (kg N)-1c", + "text": "Emission fator, kg N2O-N (kg N)-1c", "column_header": false, "row_header": false, "row_section": false @@ -12121,7 +12121,7 @@ "end_row_offset_idx": 8, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg N2O ha-1 from direct emissions", + "text": "kg N2O ha-1 from direct emissions", "column_header": false, "row_header": false, "row_section": false @@ -12245,7 +12245,7 @@ "end_row_offset_idx": 10, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg NH3-N+NOx-N (kg 
organic N)-1b", + "text": "kg NH3-N+NOx-N (kg organic N)-1b", "column_header": false, "row_header": false, "row_section": false @@ -12307,7 +12307,7 @@ "end_row_offset_idx": 11, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg NH3-N+NOx-N (kg synthetic N)-1b", + "text": "kg NH3-N+NOx-N (kg synthetic N)-1b", "column_header": false, "row_header": false, "row_section": false @@ -12369,7 +12369,7 @@ "end_row_offset_idx": 12, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg N2O-N (kg NH3-N+NOx-N)-1b", + "text": "kg N2O-N (kg NH3-N+NOx-N)-1b", "column_header": false, "row_header": false, "row_section": false @@ -12431,7 +12431,7 @@ "end_row_offset_idx": 13, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg N2O ha-1 from NH3+NOx volatilized", + "text": "kg N2O ha-1 from NH3+NOx volatilized", "column_header": false, "row_header": false, "row_section": false @@ -12555,7 +12555,7 @@ "end_row_offset_idx": 15, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg N losses by leaching (kg N)-1b", + "text": "kg N losses by leaching (kg N)-1b", "column_header": false, "row_header": false, "row_section": false @@ -12617,7 +12617,7 @@ "end_row_offset_idx": 16, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg N2O-N (kg N leaching)-1", + "text": "kg N2O-N (kg N leaching)-1", "column_header": false, "row_header": false, "row_section": false @@ -12679,7 +12679,7 @@ "end_row_offset_idx": 17, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg N2O ha-1 from N losses by leaching", + "text": "kg N2O ha-1 from N losses by leaching", "column_header": false, "row_header": false, "row_section": false @@ -13780,7 +13780,7 @@ "end_row_offset_idx": 8, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Fuel for manure handling", + "text": "Fuel for manure handling", "column_header": false, "row_header": false, "row_section": false @@ -13828,7 +13828,7 @@ 
"end_row_offset_idx": 9, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Machinery for manure handling", + "text": "Machinery for manure handling", "column_header": false, "row_header": false, "row_section": false @@ -13924,7 +13924,7 @@ "end_row_offset_idx": 11, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Electricity for milking", + "text": "Electricity for milking", "column_header": false, "row_header": false, "row_section": false @@ -13972,7 +13972,7 @@ "end_row_offset_idx": 12, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Electricity for lightingd", + "text": "Electricity for lightingd", "column_header": false, "row_header": false, "row_section": false @@ -14375,7 +14375,7 @@ "end_row_offset_idx": 8, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Fuel for manure handling", + "text": "Fuel for manure handling", "column_header": false, "row_header": false, "row_section": false @@ -14425,7 +14425,7 @@ "end_row_offset_idx": 9, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Machinery for manure handling", + "text": "Machinery for manure handling", "column_header": false, "row_header": false, "row_section": false @@ -14525,7 +14525,7 @@ "end_row_offset_idx": 11, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Electricity for milking", + "text": "Electricity for milking", "column_header": false, "row_header": false, "row_section": false @@ -14575,7 +14575,7 @@ "end_row_offset_idx": 12, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Electricity for lightingd", + "text": "Electricity for lightingd", "column_header": false, "row_header": false, "row_section": false diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.json b/tests/data/groundtruth/docling_v2/wiki_duck.html.json index 29799b45..31978f31 100644 --- a/tests/data/groundtruth/docling_v2/wiki_duck.html.json +++ 
b/tests/data/groundtruth/docling_v2/wiki_duck.html.json @@ -8410,7 +8410,7 @@ "end_row_offset_idx": 1, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Duck\n", + "text": "Duck", "column_header": true, "row_header": false, "row_section": false @@ -8422,7 +8422,7 @@ "end_row_offset_idx": 2, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "\n", + "text": "", "column_header": false, "row_header": false, "row_section": false @@ -8434,7 +8434,7 @@ "end_row_offset_idx": 3, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Bufflehead\n(Bucephala albeola)\n", + "text": "Bufflehead\n(Bucephala albeola)", "column_header": false, "row_header": false, "row_section": false @@ -8446,7 +8446,7 @@ "end_row_offset_idx": 4, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Scientific classification \n", + "text": "Scientific classification", "column_header": true, "row_header": false, "row_section": false @@ -8458,7 +8458,7 @@ "end_row_offset_idx": 5, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Domain:\n", + "text": "Domain:", "column_header": false, "row_header": false, "row_section": false @@ -8470,7 +8470,7 @@ "end_row_offset_idx": 5, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Eukaryota\n", + "text": "Eukaryota", "column_header": false, "row_header": false, "row_section": false @@ -8482,7 +8482,7 @@ "end_row_offset_idx": 6, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Kingdom:\n", + "text": "Kingdom:", "column_header": false, "row_header": false, "row_section": false @@ -8494,7 +8494,7 @@ "end_row_offset_idx": 6, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Animalia\n", + "text": "Animalia", "column_header": false, "row_header": false, "row_section": false @@ -8506,7 +8506,7 @@ "end_row_offset_idx": 7, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Phylum:\n", + "text": "Phylum:", "column_header": false, "row_header": false, 
"row_section": false @@ -8518,7 +8518,7 @@ "end_row_offset_idx": 7, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Chordata\n", + "text": "Chordata", "column_header": false, "row_header": false, "row_section": false @@ -8530,7 +8530,7 @@ "end_row_offset_idx": 8, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Class:\n", + "text": "Class:", "column_header": false, "row_header": false, "row_section": false @@ -8542,7 +8542,7 @@ "end_row_offset_idx": 8, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Aves\n", + "text": "Aves", "column_header": false, "row_header": false, "row_section": false @@ -8554,7 +8554,7 @@ "end_row_offset_idx": 9, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Order:\n", + "text": "Order:", "column_header": false, "row_header": false, "row_section": false @@ -8566,7 +8566,7 @@ "end_row_offset_idx": 9, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Anseriformes\n", + "text": "Anseriformes", "column_header": false, "row_header": false, "row_section": false @@ -8578,7 +8578,7 @@ "end_row_offset_idx": 10, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Superfamily:\n", + "text": "Superfamily:", "column_header": false, "row_header": false, "row_section": false @@ -8590,7 +8590,7 @@ "end_row_offset_idx": 10, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Anatoidea\n", + "text": "Anatoidea", "column_header": false, "row_header": false, "row_section": false @@ -8602,7 +8602,7 @@ "end_row_offset_idx": 11, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Family:\n", + "text": "Family:", "column_header": false, "row_header": false, "row_section": false @@ -8614,7 +8614,7 @@ "end_row_offset_idx": 11, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Anatidae\n", + "text": "Anatidae", "column_header": false, "row_header": false, "row_section": false @@ -8626,7 +8626,7 @@ "end_row_offset_idx": 12, "start_col_offset_idx": 0, 
"end_col_offset_idx": 2, - "text": "Subfamilies\n", + "text": "Subfamilies", "column_header": true, "row_header": false, "row_section": false @@ -8638,7 +8638,7 @@ "end_row_offset_idx": 13, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "\nSee text\n\n", + "text": "See text", "column_header": false, "row_header": false, "row_section": false @@ -8655,7 +8655,7 @@ "end_row_offset_idx": 1, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Duck\n", + "text": "Duck", "column_header": true, "row_header": false, "row_section": false @@ -8667,7 +8667,7 @@ "end_row_offset_idx": 1, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Duck\n", + "text": "Duck", "column_header": true, "row_header": false, "row_section": false @@ -8681,7 +8681,7 @@ "end_row_offset_idx": 2, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "\n", + "text": "", "column_header": false, "row_header": false, "row_section": false @@ -8693,7 +8693,7 @@ "end_row_offset_idx": 2, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "\n", + "text": "", "column_header": false, "row_header": false, "row_section": false @@ -8707,7 +8707,7 @@ "end_row_offset_idx": 3, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Bufflehead\n(Bucephala albeola)\n", + "text": "Bufflehead\n(Bucephala albeola)", "column_header": false, "row_header": false, "row_section": false @@ -8719,7 +8719,7 @@ "end_row_offset_idx": 3, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Bufflehead\n(Bucephala albeola)\n", + "text": "Bufflehead\n(Bucephala albeola)", "column_header": false, "row_header": false, "row_section": false @@ -8733,7 +8733,7 @@ "end_row_offset_idx": 4, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Scientific classification \n", + "text": "Scientific classification", "column_header": true, "row_header": false, "row_section": false @@ -8745,7 +8745,7 @@ "end_row_offset_idx": 4, "start_col_offset_idx": 0, 
"end_col_offset_idx": 2, - "text": "Scientific classification \n", + "text": "Scientific classification", "column_header": true, "row_header": false, "row_section": false @@ -8759,7 +8759,7 @@ "end_row_offset_idx": 5, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Domain:\n", + "text": "Domain:", "column_header": false, "row_header": false, "row_section": false @@ -8771,7 +8771,7 @@ "end_row_offset_idx": 5, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Eukaryota\n", + "text": "Eukaryota", "column_header": false, "row_header": false, "row_section": false @@ -8785,7 +8785,7 @@ "end_row_offset_idx": 6, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Kingdom:\n", + "text": "Kingdom:", "column_header": false, "row_header": false, "row_section": false @@ -8797,7 +8797,7 @@ "end_row_offset_idx": 6, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Animalia\n", + "text": "Animalia", "column_header": false, "row_header": false, "row_section": false @@ -8811,7 +8811,7 @@ "end_row_offset_idx": 7, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Phylum:\n", + "text": "Phylum:", "column_header": false, "row_header": false, "row_section": false @@ -8823,7 +8823,7 @@ "end_row_offset_idx": 7, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Chordata\n", + "text": "Chordata", "column_header": false, "row_header": false, "row_section": false @@ -8837,7 +8837,7 @@ "end_row_offset_idx": 8, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Class:\n", + "text": "Class:", "column_header": false, "row_header": false, "row_section": false @@ -8849,7 +8849,7 @@ "end_row_offset_idx": 8, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Aves\n", + "text": "Aves", "column_header": false, "row_header": false, "row_section": false @@ -8863,7 +8863,7 @@ "end_row_offset_idx": 9, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Order:\n", + "text": "Order:", 
"column_header": false, "row_header": false, "row_section": false @@ -8875,7 +8875,7 @@ "end_row_offset_idx": 9, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Anseriformes\n", + "text": "Anseriformes", "column_header": false, "row_header": false, "row_section": false @@ -8889,7 +8889,7 @@ "end_row_offset_idx": 10, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Superfamily:\n", + "text": "Superfamily:", "column_header": false, "row_header": false, "row_section": false @@ -8901,7 +8901,7 @@ "end_row_offset_idx": 10, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Anatoidea\n", + "text": "Anatoidea", "column_header": false, "row_header": false, "row_section": false @@ -8915,7 +8915,7 @@ "end_row_offset_idx": 11, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Family:\n", + "text": "Family:", "column_header": false, "row_header": false, "row_section": false @@ -8927,7 +8927,7 @@ "end_row_offset_idx": 11, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Anatidae\n", + "text": "Anatidae", "column_header": false, "row_header": false, "row_section": false @@ -8941,7 +8941,7 @@ "end_row_offset_idx": 12, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Subfamilies\n", + "text": "Subfamilies", "column_header": true, "row_header": false, "row_section": false @@ -8953,7 +8953,7 @@ "end_row_offset_idx": 12, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Subfamilies\n", + "text": "Subfamilies", "column_header": true, "row_header": false, "row_section": false @@ -8967,7 +8967,7 @@ "end_row_offset_idx": 13, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "\nSee text\n\n", + "text": "See text", "column_header": false, "row_header": false, "row_section": false @@ -8979,7 +8979,7 @@ "end_row_offset_idx": 13, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "\nSee text\n\n", + "text": "See text", "column_header": false, "row_header": false, 
"row_section": false @@ -9010,7 +9010,7 @@ "end_row_offset_idx": 1, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Authority control databases ", + "text": "Authority control databases", "column_header": true, "row_header": false, "row_section": false @@ -9034,7 +9034,7 @@ "end_row_offset_idx": 2, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "United StatesFranceBnF dataJapanLatviaIsrael", + "text": "United States France BnF data Japan Latvia Israel", "column_header": false, "row_header": false, "row_section": false @@ -9075,7 +9075,7 @@ "end_row_offset_idx": 1, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Authority control databases ", + "text": "Authority control databases", "column_header": true, "row_header": false, "row_section": false @@ -9087,7 +9087,7 @@ "end_row_offset_idx": 1, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Authority control databases ", + "text": "Authority control databases", "column_header": true, "row_header": false, "row_section": false @@ -9113,7 +9113,7 @@ "end_row_offset_idx": 2, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "United StatesFranceBnF dataJapanLatviaIsrael", + "text": "United States France BnF data Japan Latvia Israel", "column_header": false, "row_header": false, "row_section": false diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.md b/tests/data/groundtruth/docling_v2/wiki_duck.html.md index fa78a10d..d121e122 100644 --- a/tests/data/groundtruth/docling_v2/wiki_duck.html.md +++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.md @@ -511,10 +511,10 @@ Duck at Wikipedia's sister projects -| Authority control databases | Authority control databases | -|--------------------------------|----------------------------------------------| -| National | United StatesFranceBnF dataJapanLatviaIsrael | -| Other | IdRef | +| Authority control databases | Authority control databases | 
+|-------------------------------|---------------------------------------------------| +| National | United States France BnF data Japan Latvia Israel | +| Other | IdRef | Retrieved from "https://en.wikipedia.org/w/index.php?title=Duck&oldid=1246843351" From 8227841c1b987a3e7b00d7f3e0182598301c5db0 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 24 Jul 2025 10:21:10 +0000 Subject: [PATCH 4/4] chore: bump version to 2.42.2 [skip ci] --- CHANGELOG.md | 13 +++++++++++++ pyproject.toml | 2 +- uv.lock | 2 +- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dd313163..5994ae14 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,16 @@ +## [v2.42.2](https://github.com/docling-project/docling/releases/tag/v2.42.2) - 2025-07-24 + +### Fix + +* **HTML:** Concatenation of child strings in table cells and list items ([#1981](https://github.com/docling-project/docling/issues/1981)) ([`5132f06`](https://github.com/docling-project/docling/commit/5132f061a8125332ba10a4a30e0dd4973637a11b)) +* **docx:** Adding plain latex equations to table cells ([#1986](https://github.com/docling-project/docling/issues/1986)) ([`0b83609`](https://github.com/docling-project/docling/commit/0b836095319ebf2133c4a3a77602718034915e55)) +* Preserve PARTIAL_SUCCESS status when document timeout hits ([#1975](https://github.com/docling-project/docling/issues/1975)) ([`98e2fcf`](https://github.com/docling-project/docling/commit/98e2fcff63660c158bafb9a1b5584c1439d7a533)) +* Multi-page image support (tiff) ([#1928](https://github.com/docling-project/docling/issues/1928)) ([`8d50a59`](https://github.com/docling-project/docling/commit/8d50a59d4887caac1c214add8037ed0b5250f68c)) + +### Documentation + +* Add chat with dosu ([#1984](https://github.com/docling-project/docling/issues/1984)) ([`7b5f860`](https://github.com/docling-project/docling/commit/7b5f86098d07b734f2b6aa8c88ae7cafa265246a)) + ## 
[v2.42.1](https://github.com/docling-project/docling/releases/tag/v2.42.1) - 2025-07-22 ### Fix diff --git a/pyproject.toml b/pyproject.toml index c472e7c6..8d2429b9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "docling" -version = "2.42.1" # DO NOT EDIT, updated automatically +version = "2.42.2" # DO NOT EDIT, updated automatically description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications." license = "MIT" keywords = [ diff --git a/uv.lock b/uv.lock index c790638a..52cdd1a2 100644 --- a/uv.lock +++ b/uv.lock @@ -806,7 +806,7 @@ wheels = [ [[package]] name = "docling" -version = "2.42.1" +version = "2.42.2" source = { editable = "." } dependencies = [ { name = "accelerate" },