From 387dd659c18c93e392c0a48e488cac290ffc7eb1 Mon Sep 17 00:00:00 2001 From: Manuel030 Date: Mon, 28 Apr 2025 13:46:08 +0200 Subject: [PATCH 1/2] fix: find paragraphs in elements with images in docx Signed-off-by: Manuel030 --- docling/backend/msword_backend.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index f1362220..be1c8223 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -123,6 +123,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): doc = DoclingDocument(name=self.file.stem or "file", origin=origin) if self.is_valid(): assert self.docx_obj is not None + doc = self._walk_linear(self.docx_obj.element.body, self.docx_obj, doc) return doc else: @@ -188,6 +189,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): elif drawing_blip: self._handle_pictures(docx_obj, drawing_blip, doc) + self._handle_text_elements(element, docx_obj, doc) # Check for the sdt containers, like table of contents elif tag_name in ["sdt"]: sdt_content = element.find(".//w:sdtContent", namespaces=namespaces) From 50c108c6d32b27de93cafa1376bdbcff7ec7b73c Mon Sep 17 00:00:00 2001 From: Manuel030 Date: Tue, 29 Apr 2025 16:18:09 +0200 Subject: [PATCH 2/2] add test file Signed-off-by: Manuel030 --- tests/data/docx/paragraph_in_image.docx | Bin 0 -> 11882 bytes .../docling_v2/paragraph_in_image.docx.itxt | 9 + .../docling_v2/paragraph_in_image.docx.json | 162 ++++++++++++++++++ .../docling_v2/paragraph_in_image.docx.md | 8 + .../docling_v2/test_emf_docx.docx.itxt | 11 +- .../docling_v2/test_emf_docx.docx.json | 55 +++++- .../docling_v2/word_sample.docx.itxt | 49 +++--- .../docling_v2/word_sample.docx.json | 119 +++++++------ 8 files changed, 328 insertions(+), 85 deletions(-) create mode 100644 tests/data/docx/paragraph_in_image.docx create mode 100644 tests/data/groundtruth/docling_v2/paragraph_in_image.docx.itxt create mode 100644 tests/data/groundtruth/docling_v2/paragraph_in_image.docx.json create mode 100644 tests/data/groundtruth/docling_v2/paragraph_in_image.docx.md diff --git a/tests/data/docx/paragraph_in_image.docx b/tests/data/docx/paragraph_in_image.docx new file mode 100644 index 0000000000000000000000000000000000000000..8795e408341379662605ed42d697e3f4badc688f GIT binary patch literal 11882 zcmch7Wmp{Bvi4xX-Q9z`2MO-3!QI{6-95OwyE_DeI|O%khoG0e&$;JIa?k$z^*p_L zP0zeFQ&p?0YE_l21PCY!004jhe6mVclTcE=S$KcP4GaK415g0kc7_)AbhLksKmdgI z4}L!?;-n-%_+Wz0Ur+_kRxPMPM4jCcbJ$`ZL3$;foK5!qC1)ScaRsLW~BH=C7efXeP1YX6OCn* zlz9Lzg5dcpR7zW;6#=N~?NMUJ3@= znpI1!a- z$`VnwLD`pu#(oMjTz;fe@o2DJy_7h@jw;eFA-XXAhb^#e& ziVY|}sS$kz@lA*FnY)wdr!4Y5gK%+!Ad(pxDFS(l(^L-}N-YB`U&6fhaJF*?)rxdU zUT%EKqZ~cutC_Na6oT-{yY%WVyNU^4ekPsGP#MVF_GACfjR`j>E-To(DzW~hN@r_3 z1G;y8I$9c9Is6klr>d3;%j^iRYuY)N`ygG+Bpap&HtqGJldK??=XFFQHE^oaqT>9I z?vFBO(W~OM8R+Ek{JPC8TPZHi1)lXwnqpMfd#6%kQxRr;`53~7R!D@DA=JsqFnM%Y z6Ha$|O~F-bqu2a#rc!}I*#wf6!E%Nb!`llnVe<m>QTmhm>E__pEGEv$!<{oh3N&m0X75<9v`rmYMT7guX`b+2yLFEI&Z9 zykf6+R+Cl=SB|$_QPcP2RvPDn;^#I>Ukr4%gl1}+D5@)<+Cl4tuNv`|;K#(}KZ-W2v9aFu7GCPAJn>Uto-j^^&x?l3=-Ud_Op{CY+0RBy`r0cl zhP;mCytG-AGm&3C;K&+Kqw63<7MDx6j~Ei7x?e`D%i1byaT2TRB6Jjq(bQM~>uteG zVrRyIyo!Uh9l9gmR9n8EX{5Zac6cmZ`^~~oVV1X?%6x=vU~|EJFdVAY-tMU1l&9KV-t2c3fSsCacw$ zDeQ2=2}ONT97QEDi#lwC&x(89O{gz`vfUU=>1`>rFG81#}0@{(|$pqUBB}|(w0IYf=l45 zYYiNQGa01-j-+cDStMZE^4W_1aY3r#e zYKn4-wD|`=Ogye%+RfWX&WTW%Y;0&TOoyPJ-Qcw2-AsFe)7ij^Awx9~TgKwN!5$#+ zm_YzKXPOOukeee6ea944>>o_|&>#DOVVsQb=3(V1Te`a6q;wG^8->`ZR1?%^SjiYo zHd+|D4w2YOj^l&JrGU4SRoh5(g`H3DcrEmxqkjNbr4Dhvh%g^a>J@|5Io^7${%8tR z8`d53G)KjEtKEa^-2DzHQ%Bu$5NOTx(JxXoB{-^nccc**pB!K>KecX0Kz&?KXmzq4 zOpEN_18*bq-7ct|9_Y-g6^;U!Yp!u{WI7HsYNjVcgF25fV>jbQH#=(g)?Q=re?opa zVDSe=wEX~GlhoX0Bum?F>2=C{F#_2yRr-)gMbSJQyo=Sy)78eBa#db*L)T)y;u&S^ zyuwnu=DGi1k29ff+@MMRMWOO1T}YrEKK|BQ)w1@NpEv!oFVDM2=ONGZL}%^ShD7C< z1r!0sANJAt4}`QR;fmeyF#MHv$-;YjO5DBdlt^0W?zJ+z&R-|%3%XPivbLY5Mi!p4 zrb_5ej#nv=ZOHV68HTCec3~r{2RaFsI<8YKvRk2+Q2W8=lIrtQq4?W)(dxe`z@fur z)XC-sxl3t@n>HyNfP=BL*i|~Q=(-_T^`K6^ykr&?7}@ao(a4&WG1OH|gP+uix4P@o zIm~T-T&mSKt!Nu;;40b`H&0%m$0B)(f;9qXO&2WJF=vQKvI{UTw#&5F=@UWMRrKog4#Ds)^~XVVUg zL?GqQewdLvfMybxG>DAgc{d}K}MJfT$CRFQHa(rqs4duGl7$R&^{)0lM$nRfqDVpGJc!Nycf%q z5hrZ=35&=)@MM`Jxn=p>vSt@dxXljA2|719I0dbk&ea0w9fU0klM-}QZBjAF{0Yj@&kn;!Q7Je zq1>Zk2UlssYrZ@7*`=_H5nr!Ws(yCUp^l7Ig;%zShlB8^00`oft>T2j6&}<(h1e04 zrWuM>Vp^;C-`o2R-b!Tp9n9Hk($y?`cd?M_Cb6A?^K)h{j#QxGe+rIOVlTKbM zMSy?xzQ}%88XQFxN0oxX~R8{OC z4j3qBujRVkZqtF*v*Ejho=rkPkwojj<42L{BU8+aUtJS5{PamlKJ zN5)EhMgDH22U{i~hT-|`GRKZb7cEFO6M96EWl5WnddveRQl3c@tO&-GDD_D*hu~b` zI3Fl40@YV&<-&TQA5>L9DVa1Pfv=(QLxU|Fxc*4yT2ICevr?o~;rsmQbn$gZC~`xc zJQ|kvUd!#4<8$6U0B*PVmJglaedIkOLvW?xlIOy9%|kcn&+D{|72$*5UIr?O&Rbm~ za8Q=eqpe<-6?TpnKYPKZWKO^+pPm=olM?Kp9@$_reQWeVC8wa_kig1c`X&)Y*mo~I znH%LYt9meg7UJ6N3N%NdIf!jI6C36m|71{&ADg5!)r* zM~@JE&l9+tvE;x@ZYEztfT2S$2BMK6p*Q4Bl!ZOh^j&HhG9U)Z;_iiKz394v?R>r( z62$ae+^a1iJx8H)u9D8R&a=H`Kh3O0T4-OB6568?!&NqXY-7ceaV8I7$UdGV(O7vJo%mat+Tzu8*F2)1p@u z-L>_pSP%IwO(&@N>|Obco>s{J^xYnW2n+Ry-%SwEOb<;U9}J7EL)(GAiDUeZ9XTK^ zj%1Z|+ONN4Y@054LlW*pv0Hi|C@YL&LfIrnYZ)G&Fn%iY!31AfJd?3hT~rf+!4?s7 z%zfcDv8*e9p?r-=R}-1FNo3EEGuS!b&)H)j?G>wC79Z z6B7VQh(4^-f(<9SP%G5ZH_V{xd}i_6ZQFzLn^)J{;==;;Czq}saL+Uvqare0aD3*6 zYU}t|XRd3`hw7ea_lJ!eldTM=ix2gNkX=~mJ#I#3qUkm8{ulkZVAitI;T%Hlm3Noq zq*+Wg0G||%Y_g_VOKE*1pfeD|ABv=iH9XU}b9RG3r0W2|D?T548KcyTtSVmO8Pl}y z7*wm~W$d~q{Hu&^A0rzX^@rSm0zc|w%_+oN6RL(CZ?O6$*OM~dTIV*RtH=3c{19Cj ztUZpbzmfzdJn8Vnj*hias_Zd41W7IKs&kY(Y0~6hjU2KxP>-qNbSbCzzS8eq0e%oQ zwPL5H@sLfOsr^U|9ezO5gXA|V_w)zIiZ#m7QJ$1ZeM%p#&*TL{u$aoMs4qcveQ zcF~sLf=3+fChYWB7J6wl{>a#5oP)lkEO^y|>u>F2kSgFEXKfK)4SVu4mh!{HX@iUP zw>n+rmOP!TIZh%w!a2y;<=n)QQdZuzgY!V&#=dKqZr4(@*V&gqHTL_WuLBH`vzc)p zp&feJmZTR~ z6nyr%2@UyD*Pv~*PyeKRN=(Djq%9MkUkt61eO=GOzB)It#8;S?wC=1vaCeX4&g_P% z9}Ln6htJwDqn#^#j5)sJ_Px~N7SkZB8+j`Vm6S19Blb(!E9JVo%TYI!@v&QcW!=ly z8FF~O_02E0AmzEedSYsJdb5|0mUKnIJ5?i{BRv(hx2N(ZR%U0ajL~S2qMwN6X+jB7 ziymbwczQ%e88H-<3oXpkLfdleB7^N2W|!vCbHM4|mJ#_Z!(aLv${@+kV2oNa5Kl)- zGdA09JO?6(vNOcKrX|ac8W4)PfL?`lj4bUEEm|mbBwyua45Nu19)u&NiN%dsRTo|& zox~v78hr}*)*?pY0jnSn!LUqRM+4A~o9oAI=3;{oSAr4D!jFQ9@R7u*=J}R>sc&mo z+H@hCa4}~0Jx`5$0pYrMl1WXtV_%>soV~IclO~ZGJ`7cJsX?U14WKe#>D1jmf?Xu4 zy)q}}iC4;m&m=7+AFCupQr*pwY;GvrKor-DjL(88-~=bt?SNVkW=$GLReDS-2_=38 z^#;kL%+j({AeQP$t7eBQpgV;7GPxk`eI-d3#yWCOHzkde4{}@?kwHrv?D4kanouSQ zhr_PEn_+KMrmsQC(S%6p=NvAU@Al;2WM6{A3tdjfXUH?e7d+ z&Hu8=>07&Z=nP!yOx(@nNsRDzg$?Au;+ zI`w!QSDYfp#{?a%uJZMrfo5P!{%g)ykKk6{Dws!dcs*W*kVZwR#Fbax} zCgvzyxoi-9WEZJV%dhdXrQg0K}S;zQv& zDcuKOO`lM3f!cYInskj*h8W0Ar7i`#b$>)|3Y=PX0{#AUbf_-u5yHMlQ19n|A}C8k z15;f(Q%hZALk3zKE93u`qz1x)-M$Kyl@@`8!h(8_USY*V1?Atb27jFpAKpistwhE5 z3y{6M2tS~59On=KfSDH)qsF%ARvi_2!i0l2BW?#V^~n=QDzUr zPX=`h!`Eq`%$1Wv4axy-8mNsUT;w=3-wC23-dCzu!y!Nza7;?Rp8j}dyz^x62EzOIkj{(!m z3s40x0&aj~PELh6eUJewfTM1k5QsUL1!7+(Uhy@x>hj8unQfz6QtUq->UB#K98j~P zfc29=Fo?bkP$F0Y@^9moY8rKRFRMVWqr)^c09%wtoN**A0U-k*%><=*s#Wb742^Lm z!&iP``q&jJ=<0bR0NY5J5qZ)>Cwf-QZXvxkRE8!R0yWB#qhm0t=pRC(q+wwU;NO%I zQmRDB4Fwf|zO{SCw-V`D)RgP#@(H~t$2$hZ^h*+AAw(E4>VzQ->3<9GA!V$o{Q{)V zN#J27Z{W%bauyv6XK5Pb16G?mTH{VxhZ-}SJNndZQt9k&7M`m&It!~8jjlY~Wuns7 z2-r$ZW{p5G#q-rPPukENO7c}^P|c9LsC=NQv*D$bm_Gkd0|t<9l{aNDX@iY8qtjTs z2pen+0Isbeu}36s70LIx91q8Ecip*qG|k;X0S-A18to~Sl<1~cOVC*jgV)W-uciPy zpHmT`a-Q|MP-yJUcgBdpuApCS;Q@LwlqVB%Uy6o8P_ccSqg-GFbhOnmf(@^c_oskf=Gg(rpCExM($N zVFOEvtT|vU6r8+eEaS>G3=CqJK@AjaAHhS-8XB;D$DdmgA8N0Hj(Z!h(GaRW>+6<- z$sF8;BnDj<1%L`d^T+J2LhfjUp{K9L<))(#Ut=J}MD}CSIXE|&w-X?=BMo^7Z|j%YNvu(r>oHA}85% z*ra^K-#B8w!y$AadjJU5GL$M-5d&X@-%v0Ef8_j#K@=Uw0hEZ)(Q_Ood<@BMY~Z!z z_HAtSH(6=}yM5F04H`3dXkc-W=6KX9wx%_EH6m-|9@K9fLyc{htQU>Ty}4hveP0u* z39nT2%0_;%hDaO@@U>*)yz|NGvFTnPyJ05seJv3-2ftfq9~A`qFQJtPzC$TOO^kGf z`tUvKeSJ`jJ*-4F?3Y5QV%(iE<-uVa-~zO(qRLhMN(13Pq7ig|?X|n_=ksyFmBE(G zG4myVnXbXGTldJn*><+brGkWlAd#oL=ctvAijg8snLwJ#n<}BMX3Z)L;%NcyxTZzy zWt9Wcjd(e$-Mx*rCCvcMuVvqsm`XAn&t$|lMosh^>clqw)O-P!PPeu`$7^Xm2a%tE zCpyjrZ8xz7q@a-t46Ng>+w$~81~gR}J9Gs{m{o%xzd2DZoTVE`xY&qS+1rE)E!%d) zqKVbHck^^9wG>6ovd@L{)4-Idk}xw`RxarJ*l`J>rmr;3sR!QnEJ8(~6898Vq}qE5 zX4cweV%FJaZ-HWIzS-$k8_81c{^q+m*d1`(s|vkSJjGZt$#Am;PEZ2R`Qp}5g1-6> z{rxIy<{=ucl);KpQLV3^L~fkkWjTh&8^&}O+Icz*@7;3$jA&QY*=pA?f6`yMF3mrZ z!~Jz04<&L{1p|eldPr!17{XF=;|FUsxD`U zpJF0|myDD0l=OIfY(0s}?ju=eH+b1G(!{epV>qF<#|9T{ST@9#US4rc)MUgp{i(t$ zeujyL(a-@Fy!MmZZ0$t>fXg;zqn@N?T2V32Yq3fvNa?BNUBeN5Y-~e z{q$UI9?@op>h#$@+?03e7+(MU+1=J+rxuBi?O)%>^qQcaCA4g z(Y~KD?f!U+`$tE0o7W8sEXH~Pt*lB?X|;Nd#c7D4*red=nnC$I&m8?5xlJE08p!V} zjR|domG}7r&Uw^>s`gh67d+m~&%b3a?*(k+<=*vu)_>bd3G&D#m_&0_Z|rz~R)w-Q ziRGG%LJyJI&O>75M;&Ub3bsi$6yrPT?1n#l$9bd@%qHtgygC{Q;v znS|3jccoG+BI6Hqi$w=$YzT~P_w9KB;3SKuJ#^6Tx}GI)*6&miAi@2YSI3+6_y=*d zM|xHlUT$dkSHN^{HfqNlw%!Wt72HR9xNWS}0`uNHu%D^UHJar=yqAiB*d(DjG<_i) z&DVQ7!FqhRg?X>;np55$=5I|Fo*5F9UsI_kjTb}>o;S>*jhPh7F-nSG4q(UnOwBh7 zk=r7|a|-d`qLx-d8qYhyC~(!@c4pT^If${TTSOLKotRtYUWO%cH1F>TK6Mi8b6*L& z*n0;oS#YZPdtMi9F3dY3+R>|w>1)&vJzLeIB_mHohe25VFgx*5djkwoJVif&x#UZb z(-rKp0$tQiY=)uRZd4Qzl~^^53p8>R3xe3?9!Z>JI2WBc?5`iMlo$?TaV}oOm#-Ji3>wC%rUZ>sX&9 zm3S$coalV{$%0N>RIPtJvVW7FX6zdrC(Im(kaK!KI3{N}KsMb{^t>}sG1AfLY^!6? z0hI5d$ZQ|)onU3Cpe;=hOE$;{k}v9k5#z1ic?@3SA4K>w!a|gSyW#U2|7nG+S{nBc z_&W0!u#h1QYg^1^7#yizFqOo43={Z)m^WlUI_vA}11Q{}OXqj^X7Yf=S%sJHL7N^tI%g+!mLP5i;uvlgS|v zlVt+C%=02BIFb9-ggvcod3Uc|=nbM|7Qr}E%@#rEW6BrHBRf2Kw7`($;Dm;fk$BfJ zS2Hdw6G3j4v-lp49+9Y^i|-Gfx$K9OM1cNE2h=6EEfR;CfRIy2O|BvrvfWrQ#SX=a<3V~5)Gd4P$_7rjPeNx^7ZVpPr0O|YdwGO+;K zNhlGzE*g!T+h&a=@7BX*3*pA2ElCpc;@fEi&W-$fX{s7VJ#lvSv%Gm(NjZTdC(cy} zI-sia)F*uj&v=h2!pNyfQJm|O;78@cBsy^vh*uB25265iByr5C*-DSVLEaR>2%4`- z)Dfr%4ETf@sM%n0qSL237PYt>5=75#O+F?i`K49M+$@1{bbG{tG6+_DAH`@1b1!q5 zn3DlUocb1|3W>YQ@l$CVmMJ8(fXgJcHZ3wzO|as;ZVVYbmTz`dy(-1PQy+NM=t^+S zU4?;rp7Zq3oqa&1ERkz#a~k){bow9%^uEW z#*9=Pu|j4*S-qzss?wQ~j^ioLl`a!5IPwFQhr<5Wn__EJ?qV@=Mh{7iv2M%dT4xI1y54hg23u zp%bS%Kp^lbS%>9vX>9L9ag{EQDA?(&3N;x!f@G~PrNk!-`fi}o&y{)(O8w-7wALOO zKL;cOG^zn&rfXkg%vhON3U!v-T?3|?b$MbQe4r&!FT~k<_~~sSgaS$&JZ9`X?epa9 zqgM?}lTzCgg1{lLPOw6sB1!_kAxKDX8$!kOI46Y0WJhEvu6QY9&qC!9Oo$2LPxbicPTk7$T z7O%8p=N7N5WB9srI(yIb;|Xu?oyYp*SL(v9FjHkoZ}gcuy~m=#O!UQYB6lSNGNHSI zLFw>1>+qMh!4UsSj17ZA8SA~rnQh}1Fa6_+j{Ty{my%>GDC8Y~=)2Dbgp^ehU|Kc@ zEh#FJp~jxtt~c%8<{cJ?ng?QnRplbVG4{)i_zfPsHYjCgtHXjhm%^wwc}`;uRF#%3 zlsv>26BIR7@ z_tCS};P3D2J8u3cvJkhO+=IL)(6HZ^F3$fh-G6oW{o4NY&z*kpQdWHg2!Zz;f&HBW z(o6NFesv|3hUzWg+4KX&dm{8Dsmtj7W+$R0b6_KE^yi*1)yv*#IS+ma4RsN-hC)!) zy<(QA#OJ5StH_INWvkdQI5S@t%<;o#^@)kObG-|h!|-6D3$Ar$#P|I+lMQ5N?JtK= zCmoe6O_SYY_J-@&MTL9$=JQQt@Lso1Tcx30Y`09hw<001fEY4}Kg!^t5D<4IGP)wV zVe4if=h&mEzcO@xv!S5jv#1>hk^vblvYD5-yyw_%5R=gnXCq+24|hDQcnsepBolzw z;7=wH)}$NLNxR2oPv4JrV|ZgQ<|$lT#7*D*(oR2F_Ia(`3-G%w_@Z$+T=Xu`E^q(< z`CXvu0`JAX_a-uJMOPa`d(D5USv4e%{FRgxx_=2HSD$UJ=7nf5?9qp$9tLl|Bw_dl zzbF9dJ;9!%sVF^Ndiqh$$m0w*n(XR1N6xVpXun`8{N__y0p!67M9GNIV~tzNS^@mX zkjJfg4_UqPlaLzWlKX&%mhTo16B@H2)HaI9qXTIj*V`6pb=u;|*Ii*^lXSgF(WGlK z@(1V0A$x(%@uUl8*OPg$GNIv$3d8Z(X=(%21bf*hAe+2?EytO+VMd~5md?0&?ORR{ zHS&9$d?@v&dP8_U$8?BG_8MhY@c@@}azjK?elBSpc;Aiy)P7!T0oHMfI?N4ns+`~z zm?*v-W+}NroPeP&R6@*4nW(qHHDR^}YiYZSc_8#oU|uETQ_e3PQ}{diK06tl9=l-< zm<;FRUu3S&3CyEbmz-olxEn}7;Yd$}9VcV+quExLDFP}X5y0KvFvu3*c;>i@W3A!V zW_0C?oH6?g@GAO1h-_b<9W4??B*+IH`hT#bcLinKxZqzj1xYyM%;7dv9jE{~dwRV| zKlm`wyCA+vyUgMrdIg-hx*=AK&c`*}U)o)P%Sr$N^So>IzxT!d#nI!XV8|K5H1D;(+F5B&%J?_SJ5(Z6>){)*msNB`};etr1=g#X^N^DA5x z;@|N9*TM5A`1gYNuV9Y%Eal(7{abnbPyFvy;a~9%aR0{tStI_F#P3CeUr8jtYxeKI z@Jk8dPxS9O?qAVlX#Zj0KN;^o(Z8o;e?>Q<{~P^hg7!}mzegLtlE5SVHwpV+Vvj%J xzgzrY;frMdhW~^8|C7@1rtVitloWvfX82?!!2Ze|17P1@Jnt17G^)Se{SRo + +**User** 0:08 +Ein beispielhafter Paragraph. \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/test_emf_docx.docx.itxt b/tests/data/groundtruth/docling_v2/test_emf_docx.docx.itxt index 220b5533..346093c2 100644 --- a/tests/data/groundtruth/docling_v2/test_emf_docx.docx.itxt +++ b/tests/data/groundtruth/docling_v2/test_emf_docx.docx.itxt @@ -2,7 +2,10 @@ item-0 at level 0: unspecified: group _root_ item-1 at level 1: paragraph: Test with three images in unusual formats item-2 at level 1: paragraph: Raster in emf: item-3 at level 1: picture - item-4 at level 1: paragraph: Vector in emf: - item-5 at level 1: picture - item-6 at level 1: paragraph: Raster in webp: - item-7 at level 1: picture \ No newline at end of file + item-4 at level 1: paragraph: + item-5 at level 1: paragraph: Vector in emf: + item-6 at level 1: picture + item-7 at level 1: paragraph: + item-8 at level 1: paragraph: Raster in webp: + item-9 at level 1: picture + item-10 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/test_emf_docx.docx.json b/tests/data/groundtruth/docling_v2/test_emf_docx.docx.json index bb8807f6..98f2abc1 100644 --- a/tests/data/groundtruth/docling_v2/test_emf_docx.docx.json +++ b/tests/data/groundtruth/docling_v2/test_emf_docx.docx.json @@ -29,14 +29,23 @@ { "$ref": "#/texts/2" }, - { - "$ref": "#/pictures/1" - }, { "$ref": "#/texts/3" }, + { + "$ref": "#/pictures/1" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + }, { "$ref": "#/pictures/2" + }, + { + "$ref": "#/texts/6" } ], "content_layer": "body", @@ -78,8 +87,8 @@ "content_layer": "body", "label": "paragraph", "prov": [], - "orig": "Vector in emf:", - "text": "Vector in emf:" + "orig": "", + "text": "" }, { "self_ref": "#/texts/3", @@ -90,8 +99,44 @@ "content_layer": "body", "label": "paragraph", "prov": [], + "orig": "Vector in emf:", + "text": "Vector in emf:" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], "orig": "Raster in webp:", "text": "Raster in webp:" + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" } ], "pictures": [ diff --git a/tests/data/groundtruth/docling_v2/word_sample.docx.itxt b/tests/data/groundtruth/docling_v2/word_sample.docx.itxt index ce60ad26..b4d98b44 100644 --- a/tests/data/groundtruth/docling_v2/word_sample.docx.itxt +++ b/tests/data/groundtruth/docling_v2/word_sample.docx.itxt @@ -3,27 +3,28 @@ item-0 at level 0: unspecified: group _root_ item-2 at level 1: title: Swimming in the lake item-3 at level 2: paragraph: Duck item-4 at level 2: picture - item-5 at level 2: paragraph: Figure 1: This is a cute duckling - item-6 at level 2: section_header: Let’s swim! - item-7 at level 3: paragraph: To get started with swimming, fi ... down in a water and try not to drown: - item-8 at level 3: list: group list - item-9 at level 4: list_item: You can relax and look around - item-10 at level 4: list_item: Paddle about - item-11 at level 4: list_item: Enjoy summer warmth - item-12 at level 3: paragraph: Also, don’t forget: - item-13 at level 3: list: group list - item-14 at level 4: list_item: Wear sunglasses - item-15 at level 4: list_item: Don’t forget to drink water - item-16 at level 4: list_item: Use sun cream - item-17 at level 3: paragraph: Hmm, what else… - item-18 at level 3: section_header: Let’s eat - item-19 at level 4: paragraph: After we had a good day of swimm ... , it’s important to eat something nice - item-20 at level 4: paragraph: I like to eat leaves - item-21 at level 4: paragraph: Here are some interesting things a respectful duck could eat: - item-22 at level 4: table with [4x3] - item-23 at level 4: paragraph: - item-24 at level 4: paragraph: And let’s add another list in the end: - item-25 at level 4: list: group list - item-26 at level 5: list_item: Leaves - item-27 at level 5: list_item: Berries - item-28 at level 5: list_item: Grain \ No newline at end of file + item-5 at level 2: paragraph: + item-6 at level 2: paragraph: Figure 1: This is a cute duckling + item-7 at level 2: section_header: Let’s swim! + item-8 at level 3: paragraph: To get started with swimming, fi ... down in a water and try not to drown: + item-9 at level 3: list: group list + item-10 at level 4: list_item: You can relax and look around + item-11 at level 4: list_item: Paddle about + item-12 at level 4: list_item: Enjoy summer warmth + item-13 at level 3: paragraph: Also, don’t forget: + item-14 at level 3: list: group list + item-15 at level 4: list_item: Wear sunglasses + item-16 at level 4: list_item: Don’t forget to drink water + item-17 at level 4: list_item: Use sun cream + item-18 at level 3: paragraph: Hmm, what else… + item-19 at level 3: section_header: Let’s eat + item-20 at level 4: paragraph: After we had a good day of swimm ... , it’s important to eat something nice + item-21 at level 4: paragraph: I like to eat leaves + item-22 at level 4: paragraph: Here are some interesting things a respectful duck could eat: + item-23 at level 4: table with [4x3] + item-24 at level 4: paragraph: + item-25 at level 4: paragraph: And let’s add another list in the end: + item-26 at level 4: list: group list + item-27 at level 5: list_item: Leaves + item-28 at level 5: list_item: Berries + item-29 at level 5: list_item: Grain \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_sample.docx.json b/tests/data/groundtruth/docling_v2/word_sample.docx.json index 1d305cbc..355ac741 100644 --- a/tests/data/groundtruth/docling_v2/word_sample.docx.json +++ b/tests/data/groundtruth/docling_v2/word_sample.docx.json @@ -32,17 +32,17 @@ { "self_ref": "#/groups/0", "parent": { - "$ref": "#/texts/4" + "$ref": "#/texts/5" }, "children": [ - { - "$ref": "#/texts/6" - }, { "$ref": "#/texts/7" }, { "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" } ], "content_layer": "body", @@ -52,17 +52,17 @@ { "self_ref": "#/groups/1", "parent": { - "$ref": "#/texts/4" + "$ref": "#/texts/5" }, "children": [ - { - "$ref": "#/texts/10" - }, { "$ref": "#/texts/11" }, { "$ref": "#/texts/12" + }, + { + "$ref": "#/texts/13" } ], "content_layer": "body", @@ -72,17 +72,17 @@ { "self_ref": "#/groups/2", "parent": { - "$ref": "#/texts/14" + "$ref": "#/texts/15" }, "children": [ - { - "$ref": "#/texts/20" - }, { "$ref": "#/texts/21" }, { "$ref": "#/texts/22" + }, + { + "$ref": "#/texts/23" } ], "content_layer": "body", @@ -120,6 +120,9 @@ }, { "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" } ], "content_layer": "body", @@ -149,32 +152,44 @@ "content_layer": "body", "label": "paragraph", "prov": [], - "orig": "Figure 1: This is a cute duckling", - "text": "Figure 1: This is a cute duckling" + "orig": "", + "text": "" }, { "self_ref": "#/texts/4", "parent": { "$ref": "#/texts/1" }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Figure 1: This is a cute duckling", + "text": "Figure 1: This is a cute duckling" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/texts/1" + }, "children": [ { - "$ref": "#/texts/5" + "$ref": "#/texts/6" }, { "$ref": "#/groups/0" }, { - "$ref": "#/texts/9" + "$ref": "#/texts/10" }, { "$ref": "#/groups/1" }, { - "$ref": "#/texts/13" + "$ref": "#/texts/14" }, { - "$ref": "#/texts/14" + "$ref": "#/texts/15" } ], "content_layer": "body", @@ -185,9 +200,9 @@ "level": 1 }, { - "self_ref": "#/texts/5", + "self_ref": "#/texts/6", "parent": { - "$ref": "#/texts/4" + "$ref": "#/texts/5" }, "children": [], "content_layer": "body", @@ -197,7 +212,7 @@ "text": "To get started with swimming, first lay down in a water and try not to drown:" }, { - "self_ref": "#/texts/6", + "self_ref": "#/texts/7", "parent": { "$ref": "#/groups/0" }, @@ -211,7 +226,7 @@ "marker": "-" }, { - "self_ref": "#/texts/7", + "self_ref": "#/texts/8", "parent": { "$ref": "#/groups/0" }, @@ -225,7 +240,7 @@ "marker": "-" }, { - "self_ref": "#/texts/8", + "self_ref": "#/texts/9", "parent": { "$ref": "#/groups/0" }, @@ -239,9 +254,9 @@ "marker": "-" }, { - "self_ref": "#/texts/9", + "self_ref": "#/texts/10", "parent": { - "$ref": "#/texts/4" + "$ref": "#/texts/5" }, "children": [], "content_layer": "body", @@ -251,7 +266,7 @@ "text": "Also, don\u2019t forget:" }, { - "self_ref": "#/texts/10", + "self_ref": "#/texts/11", "parent": { "$ref": "#/groups/1" }, @@ -265,7 +280,7 @@ "marker": "-" }, { - "self_ref": "#/texts/11", + "self_ref": "#/texts/12", "parent": { "$ref": "#/groups/1" }, @@ -279,7 +294,7 @@ "marker": "-" }, { - "self_ref": "#/texts/12", + "self_ref": "#/texts/13", "parent": { "$ref": "#/groups/1" }, @@ -293,9 +308,9 @@ "marker": "-" }, { - "self_ref": "#/texts/13", + "self_ref": "#/texts/14", "parent": { - "$ref": "#/texts/4" + "$ref": "#/texts/5" }, "children": [], "content_layer": "body", @@ -305,29 +320,29 @@ "text": "Hmm, what else\u2026" }, { - "self_ref": "#/texts/14", + "self_ref": "#/texts/15", "parent": { - "$ref": "#/texts/4" + "$ref": "#/texts/5" }, "children": [ - { - "$ref": "#/texts/15" - }, { "$ref": "#/texts/16" }, { "$ref": "#/texts/17" }, - { - "$ref": "#/tables/0" - }, { "$ref": "#/texts/18" }, + { + "$ref": "#/tables/0" + }, { "$ref": "#/texts/19" }, + { + "$ref": "#/texts/20" + }, { "$ref": "#/groups/2" } @@ -340,9 +355,9 @@ "level": 2 }, { - "self_ref": "#/texts/15", + "self_ref": "#/texts/16", "parent": { - "$ref": "#/texts/14" + "$ref": "#/texts/15" }, "children": [], "content_layer": "body", @@ -352,9 +367,9 @@ "text": "After we had a good day of swimming in the lake, it\u2019s important to eat something nice" }, { - "self_ref": "#/texts/16", + "self_ref": "#/texts/17", "parent": { - "$ref": "#/texts/14" + "$ref": "#/texts/15" }, "children": [], "content_layer": "body", @@ -364,9 +379,9 @@ "text": "I like to eat leaves" }, { - "self_ref": "#/texts/17", + "self_ref": "#/texts/18", "parent": { - "$ref": "#/texts/14" + "$ref": "#/texts/15" }, "children": [], "content_layer": "body", @@ -376,9 +391,9 @@ "text": "Here are some interesting things a respectful duck could eat:" }, { - "self_ref": "#/texts/18", + "self_ref": "#/texts/19", "parent": { - "$ref": "#/texts/14" + "$ref": "#/texts/15" }, "children": [], "content_layer": "body", @@ -388,9 +403,9 @@ "text": "" }, { - "self_ref": "#/texts/19", + "self_ref": "#/texts/20", "parent": { - "$ref": "#/texts/14" + "$ref": "#/texts/15" }, "children": [], "content_layer": "body", @@ -400,7 +415,7 @@ "text": "And let\u2019s add another list in the end:" }, { - "self_ref": "#/texts/20", + "self_ref": "#/texts/21", "parent": { "$ref": "#/groups/2" }, @@ -414,7 +429,7 @@ "marker": "-" }, { - "self_ref": "#/texts/21", + "self_ref": "#/texts/22", "parent": { "$ref": "#/groups/2" }, @@ -428,7 +443,7 @@ "marker": "-" }, { - "self_ref": "#/texts/22", + "self_ref": "#/texts/23", "parent": { "$ref": "#/groups/2" }, @@ -471,7 +486,7 @@ { "self_ref": "#/tables/0", "parent": { - "$ref": "#/texts/14" + "$ref": "#/texts/15" }, "children": [], "content_layer": "body",