From 1350a8d3e5ea3c4b4d506757758880c8f78efd8c Mon Sep 17 00:00:00 2001 From: mkrssg <34207463+mkrssg@users.noreply.github.com> Date: Fri, 20 Jun 2025 10:55:30 +0200 Subject: [PATCH 1/2] fix(msword_backend): Identify text in the same line after an image #1425 (#1610) * fix(msword_backend): Identify text in the same line after an image / image anchor #1425 Signed-off-by: Michael Krissgau * test: add test file and case for fix(msword_backend): Identify text in the same line after an image / image anchor #1425 Signed-off-by: Michael Krissgau * test: added groundtruth test files for fix(msword_backend): Identify text in the same line after an image / image anchor #1425 Signed-off-by: Michael Krissgau * fix: extraneous empty paragraphs for test files Signed-off-by: Michael Krissgau --------- Signed-off-by: Michael Krissgau Co-authored-by: Michael Krissgau --- docling/backend/msword_backend.py | 8 +- tests/data/docx/word_image_anchors.docx | Bin 0 -> 18560 bytes .../docling_v2/word_image_anchors.docx.itxt | 16 + .../docling_v2/word_image_anchors.docx.json | 286 ++++++++++++++++++ .../docling_v2/word_image_anchors.docx.md | 13 + tests/test_backend_msword.py | 40 +++ 6 files changed, 362 insertions(+), 1 deletion(-) create mode 100644 tests/data/docx/word_image_anchors.docx create mode 100644 tests/data/groundtruth/docling_v2/word_image_anchors.docx.itxt create mode 100644 tests/data/groundtruth/docling_v2/word_image_anchors.docx.json create mode 100644 tests/data/groundtruth/docling_v2/word_image_anchors.docx.md diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 44a0f2cf..ec071ef8 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -251,9 +251,15 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self._handle_tables(element, docx_obj, doc) except Exception: _log.debug("could not parse a table, broken docx table") - + # Check for Image elif drawing_blip: self._handle_pictures(docx_obj, drawing_blip, doc) + # Check for Text after the Image + if ( + tag_name in ["p"] + and element.find(".//w:t", namespaces=namespaces) is not None + ): + self._handle_text_elements(element, docx_obj, doc) # Check for the sdt containers, like table of contents elif tag_name in ["sdt"]: sdt_content = element.find(".//w:sdtContent", namespaces=namespaces) diff --git a/tests/data/docx/word_image_anchors.docx b/tests/data/docx/word_image_anchors.docx new file mode 100644 index 0000000000000000000000000000000000000000..c0b030c32e5cb88ebdb913eb1916e23d0c94eb32 GIT binary patch literal 18560 zcmeIab$A^+(mr_1%yG=j%oH;-Gcz+gW;?KGvPFT!+M+)X&{P^TSLUa{mA z)8k;4_-8^Ue+tVV#Fmgo%&i?*QuFWvK^7A zx7UN<8J_XcP7j$!n}9GUfkK3+IW(a`c>j>G_jeG0+`n~H+*r(p^N%YsAN>&Kqoe9N z7+X2g(f&dIw}bv4+$;Zf>t%6W9~~5y@BG{Ax2ZPy6!RRcgGA7=gkD|3(O>CxUe>;`A3}|&F5A(Kw@BPK7-r5-8O9c z{+-F|Fp)UL@V9Vv)UXLm@Ysitl(9~fJxZawy+RmsLW+r5L*k#g-|dwb?o+&(exx$Y z&0C0RdO(zL3%3X1G>>C=L!0ncCo|E+M0c;(X!j}|#WylWT2S0lV_MPEkz~XgltE)& zv`!tl=XMnKg$7bXWkZG1J$ssHA#1!HgzaPSvYr#D*XnNBVr6LaUw^2p|Lh-r!&)e% z5C8xwB>;f*@lafC9gOIVYz>{QKeX8&YV1Tq({`m1*_*fX5y3xe<;Z49BtFpg2;9u| zTm4EO&or_%6jdm&_{!zm6O*7sjZq;-So{nt1RS!t+cP`bxS7cCitgFf=%PHWf{GU0 z3EO<<58>YJ+C?q6w+HNYTsT+)aisVN)Ef)C9|jjAw;k`dF1(@YwPWT4aK7ZdM!F)Z z--L!84AKZC%w&4gxgyz7tY|ibsM(g>Geu@snQ+sij5Wh3Il_w&t2Qz|aT0=~YB?<+ z(Cix_gd0Z>zazV>Qj0 z3Z-7T=jBwGOjpvGQ)}Xo=7+0J5LauJBj`C-U{z|K&9(p_TLokpo6caX6-QvI3^*k^E&-A|@HvDV8T_pXUL82<5gbhycB|&k zPSW+My-2UQE)=#L>q6p$F`)Z#b$J&;)%vTg-j=LPH~Be&Dq?=|!U_H)yi;(w{djDR zXe0tM!V=AGL$!!xC`3~h=@JHBnJ~`LYW?h^D@YfR3-;Ek_MokAGdJ$e6WC4Wx2bTI zS5Hi2fgx@h5+_OSa^Wr&x~{lS^V10Lc(*H4zb3De-(Hk*UF<64*B0i@#8f>cAG++y zR4)!5DkG+Poo#q;1k1N1J(!#1nMr+=P)+C-BW6~|s zm|CsH{v1a3r6$V@L7vpkfPTb4M<=(KmpjLT#t3ev9UF;QXUx|4GO;|RYu+sG>od^s z+Ud+i4p3zl4@XC)U_l~ci)0apx1zEQ;~1L+1^H%#vGNXoYa}8u=jWHR4Z~ta)2hds z_H_;rshF>}g7e7F?kKOim$x<2sMvyi5e802MuI{?`mKZYBq)QQQ7sN`E?mF={J8GN zmHV&7u0TzRq9m_AQc@nMQ4)M3Bb)#n|jz=&kbn%PmB+q=PI0Bh?p{neNO%uc-nuxj{ zG=R#m?j1m4g4$>Ix|Vj}7CgJ#n+)YX08*ikigz`TBuY; zz=R6rVj`Z6)Xugoy%ToF7o#-qiihLx{kr$2FRvIr&;1))_t72q&HV@%J{?}du0F3@ ztBO;H?K_V8eWe8kt%~%c8G#3eavq@${P}5l-M+fwfy^f|47Uzr`RxY9Pp~hAyGnBH z-{cI?`y0@zh^A+8gLnFjlWIth7#_KoI^Gk-Y#{{J4|xU?G&PtJbwKk$EnAinQyheMW8j1-_L+Wx%TuCfR$FE3)Ox z@8Rz8bP$K<_&IQk3lwbmA*kTlTc1l)Z3f_cYxP)e{rKIQ`vG^e8`Skzvm7KhW!EhZ z6`$dbVrx#FkgTlT>kMeac%~0&BQMG-TFmD9Ipzj+m$IK&-uq-UGbbDs#}=0J;%pvc zoKHcuRE4sXxvSDCWLB8!uly}?f)wPF za-4fMFPsUT%5-I3J#RqoM9T4U51wp}shH1u4)IsxNlxUUlS#7cThlJ)lpWV}8Jk0P z{a>{7cs!KGRvaUj5ACZpoDM5T3Rpb0j9eM+;g`s|67}dVnUAaY1Ktxx4;YHhKnT7o z<6Gw6Y(A)p1*}+H^zY~1u{B}?P02=@a^rvS3hxz+={r)dg80f=S{iywa z_R*TqCG+8{37w0*2<~#)BmclxDntvIr3yf>i{qcM2zZKBhhoF1r$5G1Q})*y_m*lMHZH3H$1j01C3L`ceV6FHWkZ>S*eKzDEK zU74?=+*(S~gae-tvdh!P7d*t|>vGxAkuqd7eBtV`76V5JnXSNMfFH*078DCu#-!dY zNLR8?*vGH=!M!=zRU5d*g_ac*!KT>;Bw7~@DrydIfP^ey?hZrb8zeE*meR+a{n};s z&I@~p-J>lQM7P-b>i!w!(vIL|cVX!7A_o0W86f~jwy_dwd^lErB-D@6KSj*i*vMR; z&fHqx)R=+R&c-xUPDT_C8uJenoVb{f!pEo49|;otgW7E;Dt&waIVy+>04m0@4*>w+ zPvS!SN^Ux*D_znc{D?wREW`q^y?n44p7>C4_|QZd=&`;=Hc3m$IM8hKJ5p?8?-iNf zQBOYW=0cu`NE@aE#^~b#3qWFs=q<7%3k=c&_w>gi3Irjcvql9VGDdV(*%JrE=z|7D z5)lNz646^>LlGFE`|0n9K@#*p!eWX3KSKM7ca1xIG|!S2N-rKYco6Ugf=gdce&T(R z9DKcUbK~WCB}tiq&~M;EaS(Y04T|CLS&N5VwhXj-%+9|p!>T1_z5R4<5hA{PF)OZnbzr&x`w2oZS zjB`jYRxU)os`zsK(-!kJI*`|DScN)K?}X*lyqL6z_E# z9pCb4CT6WwB9>}wP;B2ck>Z$2RS*^W(%Hy}O;yVA^TOC{WyHqIH2B3(ZPznmGSut* zV<^{KIS4p@T7VHKw|;dLa4?yJ7ASGM-VfLvEkO-Xy4)Xw8cJ2Bf-K%1OGo+to1qWL z8X>b2IT$9%hp+&|g=K^)1oZs>@bv!@MwTPgce#)3AL+jdBjf)UVT6YMr!Z#vSYeBz z4s_UBXBieM5M^mjh2+t^5+EmhA`t4qgUc88i&;z-BA+BL*C5tPa7-*glrIh`@d5P{ zAd>JzVt|}(lnaxK`zZ+wmeXIb8Iynr5tk&=3)Snl&17T9+0MDa!|dUD_Sov`(ZcD$ zc5ov8NVcBpWy|xtKj6W14H@>%TKjrW+NPpeV5tUtaoW3V;EM)43wr^K$l~8LF87Tb zY9Gp11PB-0EJVBma*{)OK;2q`x2@wkXoRvjV1rOq1QWCo zZ`7+fh=?0!%yzJRL)%&c02LT~4qkf;XaJ}k$&jq%PXJ~?enI5K?iJjq-J~6I=O~HtQxtOJm+ig3bE3s|Q3@RqFor<3z*7g`r}t%@I&MkWDBXtr zfg%)ZCOTtYsbipGyM##$2fPCCuF8OJ_Ca&@tIE~VHNJ)AELEkZqrI_@v)4OPA=X!x!i24P5`e|_xbGk`%Yp#BV)m}47JvZr0 zTe`zZA(jXqW8@(bAy+S+iGgRQSHTKoC<3G7x`kro+y{H}`80~6-q*-PvF#0D14Z=Q z8K@MevYhI*1dI$l5T(rRN5Fk_9 z@Z>|c3Os5?)i-7U)f-|j$jNfxrKsa0Ed+)0q^$xkE{T7)&jKQ5`suC-*6?)2d0cgy{Rl}rX;l|f0TN2QlSvhE zom_Ac91nH#U2L%=Hv-}3_lS)Y&Yd(M;#u5hN_5U)a=9*54lqVbksdxPrGe~q@sszl zJm`|K1ivbpWuzCGJ6=8Rti{bW94K$M z&oNS)#kz0|Kudf9Iys!2?p#p^&Hc*qOhgcsgWxpm!rBsftWo}zyh~eRnk?3{WB|!ua|&y3rq$CYCs3{0xXdS zgN^U(RgD{WZ80S1>%Mk!9G`ENFi{r-BcI6l2=D0;e5ESiCmg|EJ0A2%<~50JY_}2_ zEQbZpE6En1tHCFcve#X%*%az4(aK*BIt%P?o}}Caa)+w%vuFw?Xxo*_$Wnvbp9V#f z<_TJdR`&?!2}Gk(Wx_89|^1fUAH^WXC>j3rp( z>!N#=S4l0vh!>j3F}-vEjn|*I^ve2j(k-MG@YUJBB`dhy4H!NgbSS#OjjbGnB6kZo zp$gPLrfC3C>}q|w^*q|SU97DKl3qv;_EDt3cQ-8*Q5-$*yygq zulj=Zp3)u9xp8|J-k{5=YMrb+Dj>ue61ZJYrxT3zZC~ZeMQR`lFLJ9K23k6pNn1r3 z{z!!nG(lUp12=r#^z0kjw`a%aeufLzd{2#BWzN$&;F_Cy4bQ#UV`%V_gj&SB!Zzh5 z`dZ+craGYGGdk?NX8MHgdE0iwj^z9LfNJ_Oi3M>6e9QC9V2F8Uz4p-fsgz;*i{}ND zpj`QKJZIH%M+P4p!tOc)9KtvCa>4#e#_zoBeyTd*RDC^mv?`^~Yvz*B$LyC2#P#R$ zmhpn_Nd5z*tbBW)LTXB#s=uHCl~}-SErStK@2vJwyj-B63^#8_B38&r2}f5Z9{0k> zxDG7Zb#%Ci?E#cc;sr6qZmmHm!G(k%$w)E2y-Q^K;QQqcpp*>o2YlgJ@tt01dK$<& zP02hlbz~bZZcJ3FZ!cM3Go&&vSW&R}S$70SuHK%SUd{y4qA-9&OjYLQTrhwXu_Bzv zQI+ckBk_4+w=>G^n}p~xv1OT+Ggx#j##)jPm3L$ zqfCtTbo3~352`O;`+Js9eMA=aJpA?)?cJeFRFL1()V@k->7rOVX}<{%JA@22i!GsV zzLxfD3&y(4xqVbP;#HN})#=Xy>62B}&kAQdqzu6J7C>8JZk7HO=TeI9N6otz%Yt>T zh$}4{ICFUQlBXA#oj^ySrUK4ut;3JGC!3yZO5trVbib%XTfOVSM8|t;Q9whP<<0`r zCo2Q%9EWmnm$&e1yLwkR>$aErAZ<|rmv#?>x66y7TDzWy3?@WwYQs?bXH(gBMsl7u!G=vuKa&Yh#owK=7buUAuOHx}5hu#`J1 z5wgXB@KP$B^Vy0 zxKeit6O`HmmUn>}dQXdx{^a)*oLa4(m_cH&!-S+WiE`Pd<0>6kR(kT;fK+VxTUj|o zV?Hnj=J=ns$q8w07K(OKBC-gq?mw-Yt4h8?e&6S=)j~`5?Y`MLi2(y2)oy_>DDv^C z16@T&=>T4f>KYyzT@U!#w`?|4qr3b zv|MKNbdq0rVI1Snk<=1C|89c*wJKbHW12Gk;nob{|IGw-GBdU|ru&mJ{5henE*Xv` zir5Zug%{K`;y(6Ezl>#C?8UbzIXVcw zHY8|5M-;Nq4D|HW{Wy5k$!(dh1X8lu0UzTxoK!oW0Wf2B$tcnmjGNG0BrRf-TWdJo zr@y+-bspR-G+b!wdcn4W=S$nd@M43f!a4q^J+$3=)oEE^yS0>CwzAqvPqf*v#B_rT zBNs2w{1vuQ1W~$HL4YDcr@3XlfyHVm=@vwJkVBOyYb0OR_X~GN5MXvdD2^;jYvSfF z;tPXF5}d)_LL7mAVM4AC97I(&T%{2FezbF#bVvRrT3+T04rXXMLEiupk&H>rjt*G4 z4gA$nUwCr#_YTjS=aGSEU7jcBrn~x8Deg*Ap{znH@gQmt*m^r*5$W9dCQ3 z3qCK8x<~o^$1SFnS-c*%uM0k}FTKF$4o)e-croZO2VZzxY^u>&&C~&SSNHMkA&5hg zPVsU-Kq)X_-0_J(yuAF#Yc|~%Yj<$q54*uznLA7o*P|b_kYO$~K4Eoo_nSrAAAheX zuDeo6At5Tt!S z7{b&=I^86hflH_oj~ydOl*X7nLJZzPZ3Wu35oCRL#?CEbef0b=DAc{gH(P+L*b|XF zsg<)hjNZo_y(-LD6T7N=5vy-U&^1F)Fp3dTjmVmc<*7f}25#PecE{ZuB2AoZWFdG) z*nc(1lMap)W~wqy9D21#WdKbOmXR){Ax?iCP{i{9L=t$`E?Bj7){u3&5)*U+lw-2L z>OxGsIHN}8wc(U4V(DO4@|2pDk4zgz0*y`t) zvQdA@5t$lad0rS+xM3*0fD2x5neW890nsJ395Rtk-s96OY$}kJb2W?3p_xUVLib#D zU3?>ffuT>JhONjTB-I?+otw5@kI%V7Ki~d3#Ng(Al3(xJcs&Q-tQtN{3J9oj5Y3u@ z$kv;b5Dpm)Ys0Ygs9<_icB}kNFY#APuv+=&2fWHVKj@t~1X-HJnPS(I&UmFS!b3(s z+bO!ORak&_46hE+SiG2JZ44SksV9G??UxliZY(ehA3T=Dag-X3Q)09XiWV8wYzy-O zYjFRX(%9}UM{p^avI^8FhWKV&CQx{#J12h@t!7eJva)k;7ynjl^T=G7uLkFjn8Q!?gF0!6(!vnF28 z0%fSxE_hlB_W}C2)*1v05%X~)%EJ)om>c+0z_3zCkoW-3P<}Qq@z2r57%$?SPX&e3 zuLD8`XL87bWe00~@e!JJ^GY5sLPK>_S>N|=RM&^x7`uqn>EoRBX_XB!xe3&q7qP|8 zMRyDjV#Xh7K!dK*m5n3sD`+kq#0~7g=4?m&}A}P0%M@8dbB&PtW1 zwq-&3u!!xFtP1V`yl1CsPut3YhV`|9QibQ$kUryrc5}zV}EaBaObj;!MIAs#S-8J8KTEc4Ni?BdS=;Y@7YFN%M zP%U+CGihMCXhuk`56Y};SKXg$b$R>P{Pdsm&aDKhQ<)!g&ckqjTTym2c5*VeF?IYy z0oJN+IBv5ddY?6oRikl3IYqeSZS5XoDF;v6GUTbbA#- zVU+-6=U{l^w99gGb0&9b|UjzOe1cRle zG*fu3vXtIX4EJ28Q z(nL3>%lJ9YLt8!gqf#HYxngF9PsT`mNYQWY}`QIKw|7< zlw1UHg=ZbISMNw&TsnB6DCY>NJSi;s$4vk}7)Zxul65R!YPW%U4(uYb4tZDc+{mM8 z1#-#{V9eHP`s-S5n&Tzd&tL{I2|vHKY;w%UD*Ly--_`nEo&^@t>T+&ZNrDbt=C()V zf04@&xRAW^Clh`vjtc|irZ&G5_@9IM!kj+BBuwmN_fm+I_%pu}o9>e;NDPA^`G-Ff z2G=~Aj}N*>g0bVvS;COx5k9Q%KEmf>t31eFX^Ov~kqh46;bl|1Dpq_lbq#iaX?nx6 zsd3e4zaWT{^mvGOG*;Gb;B{zYE+Lxx)e=+83zO#W6-d0knyD+`&zu2UkQ_rF5?AB{ zg&~vU*oLdywnH|KztY>hDOL>JdNUz3+DpPq#cOIPo)n4_iDQKSgI$ z<_iKu`{fMhI#l>A**ISJcPB$vHS1XXdSp=N&rd`}g)AxtY$Snws_3K~Jij%sHCa(w?mrAKIbqLhMDBqToqIG;=^!yB z6^bCb%d}KV^Xn+}?lbtRpxU{h;tN^1pVjz5Qo8ParHC-JwVXO=oevF(=Y)`%U3BW# zb{%Itl&GCIHo+)InqO70Y)|BBmMCS@Dcpv)u9vvVSiQeJf?r#Xa{Z9%pf+O*R@v8F z3f0*dek!M~s`-%^sPehKW^kC0W~I9mwd&P{9_mDXWFbSLec(|$yaQ=YJjT6r{Adpn zb-^W&b1c*B$RtyBDw?)vhZmJepZ~ta^GuE?}cYjwqdX_t(5x9p898Tjgrl^;L+~o2Mf*wKX@H zM{|g|D{6UhUD>|Zle0?@3Dtb6tfiu@jdtaIyVyD@Bv~^g{Y(o=MDyZ^TTz>Gy$bOI z(ojm?UNSz$SlDH=L0^R5g`JoH{2)RYa1^A|wnq(PFApi>JB>nF3y!AKKSdZc-mL~q zgV-Np?|F#cZxjNT)QRQ42^;eeJc*t&Iz$d3rVJ9p0z<3>Cw9j*-SNU zPeY#4h={%Ll+6zagFIvXzL)mhF*3GK6lfE$8Q^C0Qx3LL-?wVSW=LG5AaQ%PkdQMXFveq@|@zOc;Db(0_eJXBY3b#4VE2*Ins*H5&S-8k8GiAdtAzVU6@O>-A z1okvkj(FOYfYDAq)At3{j~Z`|s$FS^(K{^}B$iITDTYM81*L(nmX2MJ;5|Pt+*B8V zFtOKKq0=j<4>fM6>#4hu$8MArHkYhPNMTyQ<~S<`l4_=oO{S`j1>e+6-0cb@i~eit zzFK)`rALww?+6Fs!BJhtzEJt3wkzCPXNl>cxD4^OeE8K@86l>Pa@Tc}({d`fL=j_~ zS(ojomfX|Lc4E-$qz7`$jI^iciOSOl*=$Ubhe_%QAH%EIyer$`t;GFq3Gs+&L+2G% z9PJ9QF$1}3oiMg9m+5+@OX;0oVJ>wG!q})SgWMO3gRwXlZ8myuH{Q}pa52YWZ%qo< z9b5S}j%>G|hx|aeZG+rP4?K2s7T7MNyS*KDA*-2YS2CaKygGE2MJ{M#4O1O!<_{Ay zGVMFo8l5j+I*1154~)uJigk|=k1SU_@NUmC%5J~h_9KH|tb1iSoU85BeC9gNi7XMm zrP3_10w^cn<<=Z+IZRfsk!Ur4_FRg?+kH!{x2UuT*Og5P-KX#o5fZ9q-KvKbbv``lSN+KpLXH|6?j`!(V6tMOV zo|ZSL>*F&%{VNtNKj!gqz3ch4E##nDO6bk5{r>gsXf3#{s7&p9mD2u;=XqeVN5}i! zeaGvQlZpx*1sxIC;K)yEo{e>qRjcZ2IE7bi>NMA-Iw3>vUM{P$D*0KB)*WbLZRg)@!2G~0Z02#e67y}P_7(~X0 z4nqDr8D;E*_U!tR_JASeKz3}l)5(t7rXI?8%G4dZ$Y_z}BxS4d;YU*Yr0oz<_-gnn zrG8#*(gC>TOX!C;U@AtvSH(xscSfHp)1TT1;je~CN4&E| zuLXJOgAXWK)y5i$PaE46RqxC1qm@*ts>U}?{ljV@*KHoL6m>p(vQuoS3?xy7Oyk2x zIiJdIdme>#>eKs3R=K8wrPSYk@c0;6?0$Vadp5epFO^n|C1<2sm!_@V8mw=6JH0Wg zeyUuzEM1{AEP7!mo=JI1n6OFcabql=g*fndev-4;xj$|Z7!=4axo*?-uErwcLeUk9 zyI5Uf$O;d5Pq}{a7>d8WT4@*BSS^(Q3eEQIu)3IsJ^(Wd9-%i7q$*GTV@u3{v z=}HCSi&rU#ms=?a#;7H(*lwsqzlo0oE^dTu7%h1XJ;ok z{cD+Jm#gl@sM=Jidvkh;V|ir`>X${^6FuhCs?>_}(-${+12Azi$a+VkCS==PwJE9Z zuA(7G<`14aheuR#Kg~#SlfdX8*fI{Hxe{S;tBK{g1oXN}jkhpjV^U}zBG9;^IgIsY z_!VQX$#qQ>{TH>WAQ(SSplokhh7gPEqfT0>kJX8R^2?c0)9zjrh>IlQ*$8oXmN64Q zHA*SU^gcH!Xxan^`L|BASN1Ek*q|T4D`mrqwlTDK7p9zb{ z<~~P4I?PoS2jX>NO}jurHw1k(D%5Q-RRaKZeTlcp#=PWRUej<6s=(qzR>arB zr~S`CzmjOcU4PLU6Bb^s9Jpejm0B^pywHZDu9Phh@`{=ImL0gN=xQ_WM`}RtTaMw9QL)kZ1hp| z=Pb*-Q{&c10Nm;YiT`Cm8m`j|vXcu4TkB6O_L6a(f->sBdt?jx=#mz==;n@|aNd#l zVrr9-1RDSr{t_fBUkyRRA>)YTFfJCZe()bv0tgJ zCnCT`%mVeCPulDC*v@>A^SNSuAxQv)2&!q`4DXaO?jQew|1IsvB4v%u5~kRM*~cg` z9N-_4rnPP_34AVdBM|VGc?2P1=hv7ygXWmgC5!#)g~A4;kd-R~g|@;HH9W-okWglx zCcH%J&y+(golpFwq5ivF#NK|JWy|Q<0T-5z$VNW(zGEtB>&4BGZRye8kbmDDGk4?a z!g1|Eq+5#Wlu`Sam2(4`J68=>u;-wqGm!eo$07aTB^udYMoUcS4k>%aGg~V{_nrr{ zhD&#nYW=UPGsw8tMh$!JiLLrym;cBJ%1Y+Wz>Uyydm8jj%=59IhGi*79zeCvRFG?6 zG~m`##i%4%6Sys-3S%jR(r&Bi()g|)7GiXG@m@ZhGEiPA(mPx!x*rcm|3Y45dn3OY zjS^4WcPnin<6CD7|D}_9J&|8Fv0|%bcl4fLVOeK4EQwp^(Em!OWo?A<&VDDu<}GyI zx+8ic!{;qjDYeTYnorD|?`m^JTL4s}2t)`YpE}t1N@md?)-J*aA^`I5*FyMUPznF2njq)* zHAapAj7I67Mdck9=(PXuVJv7c5Rjr#BuaX?#7biP z|7bj<;-NN~+^b;;Z*#)k{e1D<=Wv+yDE(R4294z`+CpQ`^!A;$Ta>~_q+FK^JKa4O zPe@9UV2r|I!q_?4(k7Ho`{8A$<{68J)@xHR`NWlAltmH1dt?bUUX1M=*lGzC)|y)( z=1S95a$Q9A+?=F`$#o>rERsAXO=0l`xL@16pMmEh#nsqy02|(3HOd8c3)0otWB?J~ zPL;<6Hkv3iqoF+7BWh;o=uLs|7qn=w-4U5nAjyq#d{j35)0Z*XU&q|M*_BwYpb5Nf z=&gpYkVL zMXSjBqlHF`p<@fJy4k~jkzH4o3JnX#KZ&!O!ySshR%B_?7wvzMmx_L6N$bB6Ty_Ev zhOIveGXEgnsyik5tp7$xDogF>6bk=E;x;~7Egt(f^4phiDwP85BKkp!K~)lR_SIfj zjiYk$A{fPc0!C-}CYB7zTGdvr4V&A#^-bdE*z?QmTOuaYE2z`Kv4r?5s2U72rw&=^ zf|9x6THIiJ^N0;ATz#_TF%f1DPOwJTY27x6uWrZcFt$F9`WUl+EVJ}wBlyFAa~5< za)twM{+Ot8#@(Ylt)LEJ7H{Ee4nX#9F2-3h-c%)b_8^%{+E%@-LF>U)`mwBpmwS;d zx^U^qQ-jxh&QdppqzR3F*-Glw)-_}HT&Maz>}xWf``QxY>VAJ;Hc=$1mDlr{i{Vju zdr|F(qiTqX-Z8~)qp?bP1A(GsD|I3YnS1bQY#CkUlg{mORt%>SS}u993Q!w`82%1;Vk`9QN-IMxIM$a_NyzZSc;__$YfNAgXuT zQpf2c4V=?g^#xrN11=Y_Xmi4pxK4ZW_ol_*dfnJDakl>*g)}bbH5qW^0wXhh=9#|l4k_Ca|ve)DZ^W1^_l}~ zUclBevY#bD_4WRnhBlh65#64dhi()nYfHS!5}O*?RDu!fm^gS)ZcG8~(89rrku>8$ zDw0&TUtC64?AqO<`3x$~32*1~Md66QfmZfcam@1-e}w_8$x-QQFccmYGnQ+8OUsf} zZtN8((KSZ8dj)Zyf@?0_SikCMay8|+9;&Nc@SSH*3w|)9NK2#R3XS`a`E1m3VfgXD zYcuCd=J@^o(a)|iNvkz(9Da8-eObrZAytVTTdhE?d*qZH#w#M+6=)-r4wE6E|Dj|tZN7%y)^k^gP1) zd?fN1@qN^-E!vq<-JNRGOrwc;KkY=N-R$3<=9XO2sLZ7S*ZHZ?j4+axrRJY76cqf% zic5+D#9^^9M!b%wJ!`6yb$P zq=zVHrt9gltrN#7%OP37@2K6zP*R4tNo5fD{E^BdhEFN13e;moyKc}QlQuU1J>I^% z0D8ikYCOYfdcu^Yy{a-i)7pcovYyAfJ}1CZ0U9Aj?XevhTU!rf>e#S%;~j>u4Q+j6 zZ^6s=lxy$aTEQ}qe$=LOMsr^C9&S6bRi;ly?%HDcmuVnS3K)DZPSMJbona;xCV6lS%Tfhy8N z4WHTmncKpO@rwzpZP^jY$@D2js%rooY51I5b@Pk78rM@OyqDl@Np&Z zLwb=uVii6D6yzLi?HuV0Z5{p;+>cn@{|!+17*iwS_$7PjKMkDwZ)ESTlb#ltnK8*N z2}Ag!^l6H%H|xa*5o7Wfr>CIq?QQb zg{yl;6XG?-w%WiZm_gu5FP%ygP@_hEV_Q(O?oa#tM=_<;CJIERtT)Qce*5mUN>s^p7V16hI96z zcdy}|5@O!WQLqT`?DL`D_J%V=tCAm<|K0)GcK5wnCGM2BHhQ+ z{7)S=#`L5!_aTTYA36;2?}DgrXZObd`oD7c@z_3&48<;+6?)h z)Us9pv;4)v>1OC&jXT1-)Kll8Mo12UbCPaqQ+(b(S;>>ZhLI77@$(!{iJhAx8&mbn zaCcC1a$5})=sOPL{f{e?hG8QgQ7t_F+-PM+Uh@Vj6)#n&t5HlBr77}YE6Vi25Nk1X zW6R07yHQcKq{KzfQBpEHb@sR^kS$5q9k|2n!u1P0ljQuRsUD-cD#b6H8<)XH{RsRV zX>?gga1WyHu;m>ibUW5T$z2A=6W|CfKb&;NOK9(Pje4VUCZ_}rGDm^zWpRYkZx72> z1YIj0Ib(HaXKpHo_il3vY&``I(76L1$>KUbe$an(1t4IW57X_xzvSf4-~Dgozr6B9 zPV(Ow{QJun{)Pn + +**This is test 1** 0:08 +Correct, he is not. + + + +**This is test 2** 0:16 +Yeah, exactly. \ No newline at end of file diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index f37b4874..61ddd2a0 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -9,6 +9,7 @@ from docling.datamodel.document import ( DoclingDocument, InputDocument, SectionHeaderItem, + TextItem, ) from docling.document_converter import DocumentConverter @@ -131,3 +132,42 @@ def test_e2e_docx_conversions(): @pytest.mark.xfail(strict=False) def test_textbox_conversion(): _test_e2e_docx_conversions_impl(docx_paths=[flaky_path]) + + +def test_text_after_image_anchors(): + """ + Test to analyse whether text gets parsed after image anchors. + """ + + in_path = Path("tests/data/docx/word_image_anchors.docx") + in_doc = InputDocument( + path_or_stream=in_path, + format=InputFormat.DOCX, + backend=MsWordDocumentBackend, + ) + backend = MsWordDocumentBackend( + in_doc=in_doc, + path_or_stream=in_path, + ) + doc = backend.convert() + + found_text_after_anchor_1 = found_text_after_anchor_2 = ( + found_text_after_anchor_3 + ) = found_text_after_anchor_4 = False + for item, _ in doc.iterate_items(): + if isinstance(item, TextItem): + if item.text == "This is test 1": + found_text_after_anchor_1 = True + elif item.text == "0:08\nCorrect, he is not.": + found_text_after_anchor_2 = True + elif item.text == "This is test 2": + found_text_after_anchor_3 = True + elif item.text == "0:16\nYeah, exactly.": + found_text_after_anchor_4 = True + + assert ( + found_text_after_anchor_1 + and found_text_after_anchor_2 + and found_text_after_anchor_3 + and found_text_after_anchor_4 + ) From d26dac61a86b0af5b16686f78956ba047bcbddba Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Fri, 20 Jun 2025 14:47:25 +0200 Subject: [PATCH 2/2] fix(docx): ensure list items have a list parent (#1827) Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/backend/msword_backend.py | 125 +++++----- .../docling_v2/unit_test_formatting.docx.itxt | 24 +- .../docling_v2/unit_test_formatting.docx.json | 216 ++++++++++-------- tests/test_backend_msword.py | 8 +- 4 files changed, 200 insertions(+), 173 deletions(-) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index ec071ef8..8386082a 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -14,7 +14,7 @@ from docling_core.types.doc import ( TableCell, TableData, ) -from docling_core.types.doc.document import Formatting +from docling_core.types.doc.document import Formatting, OrderedList, UnorderedList from docx import Document from docx.document import Document as DocxDocument from docx.oxml.table import CT_Tc @@ -84,7 +84,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.valid = True except Exception as e: raise RuntimeError( - f"MsPowerpointDocumentBackend could not load document with hash {self.document_hash}" + f"MsWordDocumentBackend could not load document with hash {self.document_hash}" ) from e @override @@ -274,6 +274,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self._handle_text_elements(element, docx_obj, doc) else: _log.debug(f"Ignoring element in DOCX with tag: {tag_name}") + return doc def _str_to_int( @@ -584,7 +585,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): all_paragraphs = [] # Sort paragraphs within each container, then process containers - for container_id, paragraphs in container_paragraphs.items(): + for paragraphs in container_paragraphs.values(): # Sort by vertical position within each container sorted_container_paragraphs = sorted( paragraphs, @@ -695,14 +696,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): doc: DoclingDocument, ) -> None: paragraph = Paragraph(element, docx_obj) - + paragraph_elements = self._get_paragraph_elements(paragraph) text, equations = self._handle_equations_in_text( element=element, text=paragraph.text ) if text is None: return - paragraph_elements = self._get_paragraph_elements(paragraph) text = text.strip() # Common styles for bullet and numbered lists. @@ -918,6 +918,44 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): ) return + def _add_formatted_list_item( + self, + doc: DoclingDocument, + elements: list, + marker: str, + enumerated: bool, + level: int, + ) -> None: + # This should not happen by construction + if not isinstance(self.parents[level], (OrderedList, UnorderedList)): + return + if len(elements) == 1: + text, format, hyperlink = elements[0] + doc.add_list_item( + marker=marker, + enumerated=enumerated, + parent=self.parents[level], + text=text, + formatting=format, + hyperlink=hyperlink, + ) + else: + new_item = doc.add_list_item( + marker=marker, + enumerated=enumerated, + parent=self.parents[level], + text="", + ) + new_parent = doc.add_group(label=GroupLabel.INLINE, parent=new_item) + for text, format, hyperlink in elements: + doc.add_text( + label=DocItemLabel.TEXT, + parent=new_parent, + text=text, + formatting=format, + hyperlink=hyperlink, + ) + def _add_list_item( self, *, @@ -927,6 +965,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): elements: list, is_numbered: bool = False, ) -> None: + # TODO: this method is always called with is_numbered. Numbered lists should be properly addressed. + if not elements: + return None enum_marker = "" level = self._get_level() @@ -943,21 +984,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if is_numbered: enum_marker = str(self.listIter) + "." is_numbered = True - new_parent = self._create_or_reuse_parent( - doc=doc, - prev_parent=self.parents[level], - paragraph_elements=elements, + self._add_formatted_list_item( + doc, elements, enum_marker, is_numbered, level ) - for text, format, hyperlink in elements: - doc.add_list_item( - marker=enum_marker, - enumerated=is_numbered, - parent=new_parent, - text=text, - formatting=format, - hyperlink=hyperlink, - ) - elif ( self._prev_numid() == numid and self.level_at_new_list is not None @@ -987,28 +1016,20 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if is_numbered: enum_marker = str(self.listIter) + "." is_numbered = True - - new_parent = self._create_or_reuse_parent( - doc=doc, - prev_parent=self.parents[self.level_at_new_list + ilevel], - paragraph_elements=elements, + self._add_formatted_list_item( + doc, + elements, + enum_marker, + is_numbered, + self.level_at_new_list + ilevel, ) - for text, format, hyperlink in elements: - doc.add_list_item( - marker=enum_marker, - enumerated=is_numbered, - parent=new_parent, - text=text, - formatting=format, - hyperlink=hyperlink, - ) elif ( self._prev_numid() == numid and self.level_at_new_list is not None and prev_indent is not None and ilevel < prev_indent ): # Close list - for k, v in self.parents.items(): + for k in self.parents: if k > self.level_at_new_list + ilevel: self.parents[k] = None @@ -1017,20 +1038,13 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if is_numbered: enum_marker = str(self.listIter) + "." is_numbered = True - new_parent = self._create_or_reuse_parent( - doc=doc, - prev_parent=self.parents[self.level_at_new_list + ilevel], - paragraph_elements=elements, + self._add_formatted_list_item( + doc, + elements, + enum_marker, + is_numbered, + self.level_at_new_list + ilevel, ) - for text, format, hyperlink in elements: - doc.add_list_item( - marker=enum_marker, - enumerated=is_numbered, - parent=new_parent, - text=text, - formatting=format, - hyperlink=hyperlink, - ) self.listIter = 0 elif self._prev_numid() == numid or prev_indent == ilevel: @@ -1039,21 +1053,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): if is_numbered: enum_marker = str(self.listIter) + "." is_numbered = True - new_parent = self._create_or_reuse_parent( - doc=doc, - prev_parent=self.parents[level - 1], - paragraph_elements=elements, + self._add_formatted_list_item( + doc, elements, enum_marker, is_numbered, level - 1 ) - for text, format, hyperlink in elements: - # Add the list item to the parent group - doc.add_list_item( - marker=enum_marker, - enumerated=is_numbered, - parent=new_parent, - text=text, - formatting=format, - hyperlink=hyperlink, - ) + return def _handle_tables( diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt index 2860c30b..fccb44c6 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt +++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt @@ -17,14 +17,16 @@ item-0 at level 0: unspecified: group _root_ item-16 at level 2: list_item: Italic bullet 1 item-17 at level 2: list_item: Bold bullet 2 item-18 at level 2: list_item: Underline bullet 3 - item-19 at level 2: inline: group group - item-20 at level 3: list_item: Some - item-21 at level 3: list_item: italic - item-22 at level 3: list_item: bold - item-23 at level 3: list_item: underline - item-24 at level 2: list: group list - item-25 at level 3: inline: group group - item-26 at level 4: list_item: Nested - item-27 at level 4: list_item: italic - item-28 at level 4: list_item: bold - item-29 at level 1: paragraph: \ No newline at end of file + item-19 at level 2: list_item: + item-20 at level 3: inline: group group + item-21 at level 4: text: Some + item-22 at level 4: text: italic + item-23 at level 4: text: bold + item-24 at level 4: text: underline + item-25 at level 2: list: group list + item-26 at level 3: list_item: + item-27 at level 4: inline: group group + item-28 at level 5: text: Nested + item-29 at level 5: text: italic + item-30 at level 5: text: bold + item-31 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json index 8b6ee9db..967aff11 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json +++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json @@ -42,7 +42,7 @@ "$ref": "#/groups/1" }, { - "$ref": "#/texts/23" + "$ref": "#/texts/25" } ], "content_layer": "body", @@ -98,7 +98,7 @@ "$ref": "#/texts/15" }, { - "$ref": "#/groups/2" + "$ref": "#/texts/16" }, { "$ref": "#/groups/3" @@ -111,12 +111,9 @@ { "self_ref": "#/groups/2", "parent": { - "$ref": "#/groups/1" + "$ref": "#/texts/16" }, "children": [ - { - "$ref": "#/texts/16" - }, { "$ref": "#/texts/17" }, @@ -125,6 +122,9 @@ }, { "$ref": "#/texts/19" + }, + { + "$ref": "#/texts/20" } ], "content_layer": "body", @@ -138,7 +138,7 @@ }, "children": [ { - "$ref": "#/groups/4" + "$ref": "#/texts/21" } ], "content_layer": "body", @@ -148,17 +148,17 @@ { "self_ref": "#/groups/4", "parent": { - "$ref": "#/groups/3" + "$ref": "#/texts/21" }, "children": [ - { - "$ref": "#/texts/20" - }, - { - "$ref": "#/texts/21" - }, { "$ref": "#/texts/22" + }, + { + "$ref": "#/texts/23" + }, + { + "$ref": "#/texts/24" } ], "content_layer": "body", @@ -461,20 +461,18 @@ { "self_ref": "#/texts/16", "parent": { - "$ref": "#/groups/2" + "$ref": "#/groups/1" }, - "children": [], + "children": [ + { + "$ref": "#/groups/2" + } + ], "content_layer": "body", "label": "list_item", "prov": [], - "orig": "Some", - "text": "Some", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false - }, + "orig": "", + "text": "", "enumerated": false, "marker": "-" }, @@ -485,18 +483,16 @@ }, "children": [], "content_layer": "body", - "label": "list_item", + "label": "text", "prov": [], - "orig": "italic", - "text": "italic", + "orig": "Some", + "text": "Some", "formatting": { "bold": false, - "italic": true, + "italic": false, "underline": false, "strikethrough": false - }, - "enumerated": false, - "marker": "-" + } }, { "self_ref": "#/texts/18", @@ -505,67 +501,7 @@ }, "children": [], "content_layer": "body", - "label": "list_item", - "prov": [], - "orig": "bold", - "text": "bold", - "formatting": { - "bold": true, - "italic": false, - "underline": false, - "strikethrough": false - }, - "enumerated": false, - "marker": "-" - }, - { - "self_ref": "#/texts/19", - "parent": { - "$ref": "#/groups/2" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [], - "orig": "underline", - "text": "underline", - "formatting": { - "bold": false, - "italic": false, - "underline": true, - "strikethrough": false - }, - "enumerated": false, - "marker": "-" - }, - { - "self_ref": "#/texts/20", - "parent": { - "$ref": "#/groups/4" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [], - "orig": "Nested", - "text": "Nested", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false - }, - "enumerated": false, - "marker": "-" - }, - { - "self_ref": "#/texts/21", - "parent": { - "$ref": "#/groups/4" - }, - "children": [], - "content_layer": "body", - "label": "list_item", + "label": "text", "prov": [], "orig": "italic", "text": "italic", @@ -574,7 +510,59 @@ "italic": true, "underline": false, "strikethrough": false + } + }, + { + "self_ref": "#/texts/19", + "parent": { + "$ref": "#/groups/2" }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "bold", + "text": "bold", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/20", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "underline", + "text": "underline", + "formatting": { + "bold": false, + "italic": false, + "underline": true, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/21", + "parent": { + "$ref": "#/groups/3" + }, + "children": [ + { + "$ref": "#/groups/4" + } + ], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "", + "text": "", "enumerated": false, "marker": "-" }, @@ -585,7 +573,43 @@ }, "children": [], "content_layer": "body", - "label": "list_item", + "label": "text", + "prov": [], + "orig": "Nested", + "text": "Nested", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/23", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "italic", + "text": "italic", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/24", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "text", "prov": [], "orig": "bold", "text": "bold", @@ -594,12 +618,10 @@ "italic": false, "underline": false, "strikethrough": false - }, - "enumerated": false, - "marker": "-" + } }, { - "self_ref": "#/texts/23", + "self_ref": "#/texts/25", "parent": { "$ref": "#/body" }, diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index 61ddd2a0..9da0ea25 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -97,18 +97,18 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]): pred_md: str = doc.export_to_markdown() assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), ( - "export to md" + f"export to markdown failed on {docx_path}" ) pred_itxt: str = doc._export_to_indented_text( max_text_len=70, explicit_tables=False ) assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), ( - "export to indented-text" + f"export to indented-text failed on {docx_path}" ) assert verify_document(doc, str(gt_path) + ".json", generate=GENERATE), ( - "document document" + f"DoclingDocument verification failed on {docx_path}" ) if docx_path.name == "word_tables.docx": @@ -117,7 +117,7 @@ def _test_e2e_docx_conversions_impl(docx_paths: list[Path]): pred_text=pred_html, gtfile=str(gt_path) + ".html", generate=GENERATE, - ), "export to html" + ), f"export to html failed on {docx_path}" flaky_path = Path("tests/data/docx/textbox.docx")