From e58055465cb3635ff17ee03e0a74126c03c55f49 Mon Sep 17 00:00:00 2001 From: Michele Dolfi <97102151+dolfim-ibm@users.noreply.github.com> Date: Mon, 24 Nov 2025 08:49:21 +0100 Subject: [PATCH] fix(docx): Missing list items after numbered header (#2665) * fix #2250. list items after numbered headers Signed-off-by: Michele Dolfi * add test for new case Signed-off-by: Michele Dolfi * chore(docx): remove unnecessary check Remove 'current_parent is None' check in '_add_list_item' function since it will always be None. Signed-off-by: Cesar Berrospi Ramis --------- Signed-off-by: Michele Dolfi Signed-off-by: Cesar Berrospi Ramis Co-authored-by: Cesar Berrospi Ramis --- docling/backend/msword_backend.py | 9 +- tests/data/docx/list_after_num_headers.docx | Bin 0 -> 15698 bytes .../list_after_num_headers.docx.itxt | 9 + .../list_after_num_headers.docx.json | 173 ++++++++++++++++++ .../docling_v2/list_after_num_headers.docx.md | 6 + 5 files changed, 196 insertions(+), 1 deletion(-) create mode 100644 tests/data/docx/list_after_num_headers.docx create mode 100644 tests/data/groundtruth/docling_v2/list_after_num_headers.docx.itxt create mode 100644 tests/data/groundtruth/docling_v2/list_after_num_headers.docx.json create mode 100644 tests/data/groundtruth/docling_v2/list_after_num_headers.docx.md diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 48d32809..ac6170b8 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -1145,6 +1145,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): elem_ref: list[RefItem] = [] # This should not happen by construction if not isinstance(self.parents[level], ListGroup): + _log.warning( + "Parent element of the list item is not a ListGroup. The list item will be ignored." + ) return elem_ref if not elements: return elem_ref @@ -1197,7 +1200,9 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): level = self._get_level() prev_indent = self._prev_indent() - if self._prev_numid() is None: # Open new list + if self._prev_numid() is None or ( + self._prev_numid() == numid and self.level_at_new_list is None + ): # Open new list self.level_at_new_list = level # Reset counters for the new numbering sequence @@ -1285,6 +1290,8 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self._add_formatted_list_item( doc, elements, enum_marker, is_numbered, level - 1 ) + else: + _log.warning("List item not matching any insert condition.") return elem_ref @staticmethod diff --git a/tests/data/docx/list_after_num_headers.docx b/tests/data/docx/list_after_num_headers.docx new file mode 100644 index 0000000000000000000000000000000000000000..4c9a83aff899de69e187d40b175c5d0e4be7e27a GIT binary patch literal 15698 zcmbWe1yml(vM7wZyF0->xCVE3cY+6Zcemhf!QDN$ySoK@Ok;I{_-`oE5?k%b){?c23%s#6@80Re2oEmJ?) z&8c2V(6FX0cp_EQ!#jH96f9Lz^o#oXnk-bata2wF)5G2#ao#GRhIQ zH=9jQ>7S0ht)|p0%Y19=s0tl3W1imp>vAq@Ce5+5B{+tK%9z4UF(hOHtaURd%-J&YrLK+jWlM{!*n7uu?t(o}2M*59 zLY3eKM)^pB#J)hXIYW6v$hF#|-H=eF+mzcv`2-(CDnBNyNd_3_FrMpE!uE+(`mo9; zK&EGv*`?@N^rDGd5m(4&tSEl@HYN~fFxrbg4%qL%vWbK<-niY!0CTeuES->yL47lZ zQug5;J<aE!uSOR?VCb#mOuZf_ux0Rtpi> zd^2TIfc$9%Vo8h*xazOimF1e_t&w<{%66aoNbrITk@_w|d&1>~2Od#gT-PxiZH%S@ zxqO8gIyaBdu()dCojHS3~aSj-x4K*P@ZZEz~eZDI+6c)J{~05@l#H+ z6vD2ifB*n1|0pMf-{qwD>C>B-(qo3Lx(N`3?w+8ajj#=-Nu-D+XM&nTm&6_7PKlt9 zQ-{DnNW`Dp*_3H2ps`O^Hb>|uZkjA(k=ZK3X=O8v$pLA?6`Lt1Odc4ua(ckaNv*>J zfwaEnGPh)(jj4Y6;3Gry;Sp$-Z~?VXduGBmQ&>Y}PgiqT;B$G`?k8EGj0m>+EZ}!) z^sG9e25u4ZNgz8b6r*7@jps6m ze%H0k1%bC=w8(A8ZPk*{W0VnVAj8g&+IyO}tYVy^8h?L`?O^dZa_dY7Q1M*7zEDX=h6r$cSESk!#N=W+dAQAX+GLu(Xt{C9} z5!4?JF{~pF_R33!Y3wyQ9aWcN)M0U1>p9<-7n{rRmk()yPc?>56=|H-%HbYVu>A;P zT~F;0QZD9NJg`nRuyzr%55$*S#gyy4S2;CtIaSE4G)h8@a)m~U?8eLn%(v*QCX!67 zz_=+GgdrZ}Y`KyYGcZrfDjAZ$GhVv|w+!@emmSV}PnSS#6h<`uMS?dikMOFE`lkAT z8rs(#QTX&RWuQ&@E2poCI-1~ zDPhxmPayR{oZ62h`ZL)dueAomaj(0zd@txYbQ)RA+g}J$yy+e9K7K(@Br62b>8@;O zaGbpWV`-ga{jf?JwwT}UfiYX|zg&YEH%^dJHkms*4eUAwT^nf23ZY@q7EmvQO!g`1 zjQ%}G$3XYy#G)ysROI#>NS~wn&ifF3aXv^bhvLBllP0gFDXpp@o~*t7@`ZYA zY8&pM-C7}>HH?}Z&_T;QaM{zmIbmD-eA9)my{y*W`8KV3#s?q5uw`ruZO!16_{CiC zHh9;X%Q*qh)WeZb)XoN`|H#v~^X4Z{f9K|j+NkX^3qt!H z1sm6me%xn6i8;Zc+MAvm$)m}hAz(fNQGFmF8~~KEj1&c#WU*6r64Q$0mE2q|5BME1 zpmUj8Ly79L5ULXNX%fedRqHAVca}`9Hi!@gK}}s@&M!6AsbD$LO-bKzuY_I}wTIvP zgUo>X%gr!|h$hLj@N$VQ(fZj07rsXULbaS@Fpypb0wV{}r|XGk>f~wyS6eX72qGM! zGd}I)G$t^bsV`kI%@hhQW{$c*5Oo;#pt55hGf-n*=BZ^P;wyF+@+Owcf=%qj56r8S zmpplNvU;}}a5Fy*Gru-gYFy+VRrwM7cuv{P-%`hE^`(d^zqbNlU82V1z&+_S{ zJMT{uXEitDS~itZCfFNs`N}{b^d4sl!ufRj3i07|>3byG# z-bR4jh9T{F5RdF7sm+{vc6y$%;v8>~6IRXR*AjRda+qRh7^&$B_eUf+idXix^!dJcAqNQM(sv5%~_03 zO8rXbC&aKByPRXp+LRlP##W3S=+-6st)&x>?_e>)B0f2Uo=rfQ?<~LEUD^M*I5u=+ z%xFD)YKYScpH15P#D)99q;fUu_kfL74bFDX9_&jxec)bkV4=mDE`ft}yRvX6j$9+r%zQ1`m{k?BgCNxOowyV>f zZ5@INzOT;h`22Q~b>X6csO^x;GdxXY(7S)YMb3lEbF6#t5!V1__Wc>g&TxY>hbQV8 z#!=$SQT*d!{2cR(Sz~~MH{zP=Gb?gmOj@@!X<`21Pj3oOT=w_(1iiS zdZEvB>(V@tj^Q=DxE+0*eQMGj?g^ZI$rihGQ8fBLZpNMF#pQjnMl6NpC`2flWo>81 zg>=4Unj7{pmC+{%Qs<=8>v|@1x%lLCvKIRHc)O+cU&1HhjE^y zN%UU}SBpaOaXzgYSQsH^%m)EB@$78ZKQsh8ZJ z8jtdXIsK(q1se4&%DwJI8uU~>Qm<6g z*6*Y_erJKg{*yO+X<>>1*tRdiSo1U{(ac3)S0~{Cs{r zBX`2>n;4~`I}65>E!(;C@i1S{or#~dQzpDIFr5-LoGQ72CyVpu!A8T~%l*xPR!N5&T{{DAI@&CLsX zeIh)?q)8NSqD?Z!c!`@Zy4-9{+JJFb@zf}|Q5`u|K%x!_A)*8>!z18P4JJ}FB9y~k zMdJ;j=N7uVD#-<+nX=g%%+`5J*u0k*IyJQ7_g={>?{5!MBe(VOLV9?;obYs@l@?j>85Sjl$$;*SCNeS(q-244(aInfI znI4vh`_aA3UiZMsAOZSQ9Jc>M0aI05Bg2a|D0Y0?nqkKlb(ExaZ4{djabHAm?c9$o zA~#F?Y`XU@CD2mqhx{Ve4%yoc%@czfS|4wf(K((L(UEMvW`^+{^La!tz1xxuW>X*< zLqyz53by0#Nk+73cFj-E&4VxSgM>vNK@`-aCq=O7gyHJ}1?E9S4?lbTAr0B4b6=_l z)Q=gkLDkGf$3CgInAMUZKQwUEO9fFZ3>0>dakBc395F;1hrTBfWS_e*3y)?@Qvd_%!LYA2wV2S2)JA%#x6 zbdY{J!7^^LB|OGT=nwRRC?uE&G1rU=J-&u;%&qGW!~E4GS=p=aS{s=KZHTtggpQ&q z>#rN{^ZN09XQMYgOhN|k0$iufI6X}3@53ISL<+DEVk~yuM?WtzzAu@6UL1xie;H}_ zJK`-_Pu6O08qkW&s^V;Xj#%xKW4S&3-c{y0yuBTt7teq(svhBQhAZ_p zb7Y?cjFLn-)7!@|`ChK=Xo2C3C|4&gWnbG9A=G0;O#Pe}?BE)E8qJN5OeDCoYI zf)woB%UB9l>B|W-X!u7+SwFc4%!6y0>t0D%;T;pO-+EjWo(}B|QIb_?5n+*gd0<}Z zNB!R1I!V<_e*YgRmu|w91!@yk$W0?&)&$#I?dG977WJ~9!%l!iNH5{r2};z*h!%zo z^~gyl(dHau7uGXV)9S5r9^6U0y~z?>Yj^Xnn6E%rl#ZXdyW$&lD6HF;Ed1vg`hzOe zl|wEf7mv^_4hyzz!WNm~1<{mH16ULAWSX)4T127>DzL3hj&X-`BNNV3`YSZ7RB|~J z$Z%;knN4tMIz1};EqQ*#kithqewy0lP1PLQawjWWS5xVWNVK%ei!h3V&Zu>nlrD7M zYtOXK6L4tm4PR~7IA1tnkTWdM?HZImz)^7&)bM^9#1~p>^mrUn=}yIl{;JA-!e#p$ zBL>>uiR<*zHjUoK7ou>=Z2NAtV|o$O==f>DrLl^bBZ|{ea1U@Ga{oP&+NhjBZ2FXo zMKv)8yu2fbYppL6kX5bkZQ~@=QcJjU^cJJ~jVkB@VtOL5BM65qArNjMw=$F?2&Jq~ zlR_3h)L%a#_`oSP0(65?*7m=4CrCw6o7esUoKi3g7?o(k2MTeYF95_%EY-vMlY2A zT>3KPVgg^sSzH{SKKsP-1TW9y9MLhDD%mX1LxeJAA=d!_)0qyYLwCsQX~VIZW3Dii z37J$O(uuZPTSB^^kX+y7zKTTRGR-1Wq2a5vS8I7)rr~S)X^CY&rLolNd14xo-U{2; ztdBU%W%;8PQ;^mQi|{;*9(+T@2(!RKr{%WI==s1f_@2j6Zse1FYf}_5!qQ5WFk2P! z(h8E9(dzOUbxQh#5;tddNw_1BxC|8-T`6vW@DGjE$jtemv`z(SUyQQAMeC5yJ^7d3|pWTj0lD0si&KEFLS7 zW-RX_ zof5Cp?O_+SLN&SPhFI%5#PwA#UT^1J>T0pQb9>TcaY@Dw`lW35Np~HTT#{pYg-u-7 z@<90-cbzzVK$=umED@Z&TDiU_u@XDh@K#}AN&lw;9iP5ss|?((*N${Ox3HvPaJRM; zF?EXT%vj^FA>I<}V!N5w2${D(abftr?qT&KK$@Q4b1|_VLdf`@}<%wdv6E& zfwrP#WVWV|noQF><)(Mmbn~Tp--wex87#!J=4OiLu1{vM za3i$i&EIpD38~cm@VAwEmTnSeB3F^SZPt7iKiW|#pjEbw-^;i5oYhQESgWS3#Hq51 zDZhR^vHcPjTNh-(;YLvCr1YHf*!zIkYvkf-T*a=@y=wuVR$*-hHtWZoy7GyuED9^v z=4zzjOk733Gj2<9h&^mNMTWVx#-SX$D39Wn_W7!XwWDz5SiIc-lIK#S?Y7AT(%sC= zFD!zyS+xc)m1hOr&@GV1QxHQ9#Z*l+9LxBNTt)I=8bQ@MrNY8YmX?lL`H8T5s$#iP z8#z9EozC*1oMHvV#QU*?sD@cgXu1s17X`T%G`$fEe;Cl8Z*rf+m;2{fjid}o^r+&1w(%N75oH4L*+iG z%zF!|QP&k^>i$CTN4&Jo7}_;k@tWpy3QenuiX6_giAxUz3ay+RY0Rx1Um2#rtV(A-&}5cb{eW!pHpv1?%<7XgF;Zo&lg5y$o| z98LdY8XN2v3(QBy0(veNe#E0$TLSfM)zt zTb>)eFoFW#5b=pi7~ZYP*~uc_3xxUSEW!9s-^IR#O(ho1EeY?@UpoyMc_Ld{uox%x zekYGY!YVi4bHx5S7eJq`tF5L|CDb%fz)`&&-(&=(TRsU@FOD+*2!aqIPEC1|N&D>z zqtR#S7QI^4>1I(3Ra4XUi!D3>VPCclG)tQA*7KgEBLGMqM#(Nqz(gobq!(P z5qKRNeD#WO;^ZNc8?O0@aDw~JR~N?~Tp(-NNy`bxo-UBjNtaGKP`qK$MeX@w@u2l# zagePfXop%1OYvZ7t}TMocD@-NzGSshg||>x-E`bdH!uD+e2ciO)&vUAG@l!IUzF-3 z`RVP^0`ru(+q;ZOMOQY9O8o)dll59oWRW1Cno6!ucG`}1OIF55nUS+}Z_8-p!R{Fx z7yy6{^Z(Z}GPbs|SJczD_!-t4{oE5>w0xuAJI!H1Bkvl{J}NNcZS=j%DlDAI;I}$u|$4$~irp5;5u;Oz~8> zR^K=~(jnNkkKQDij+r2iK$VZRk7_o<4 zXi((U{hdRH;1v7ppdJ?gP^k>wx%tJZkyN>oNpEBwpKJ%^6 zZNTT}@7fTEg}5z?o@c8qGTNo>7Pz>R-~zM?HKwIR_Ux#AxQunn-=kmYa+%p)?=DBZc-o(@HoU*ihx7W<1ZwMZe;O;V2^49)a#QPBh-bbn!u zwGF+Z%7p;Ea5rg2oTYa~!6@H5T5#p0Ae9={y3J{9)IveCwg&+VsZ$EsL4}ZyloD~< z$!ZZ1HQ(wlo#e33wN}KqTGm|$cr!zik6{;iLm#z0hKG=Pnb|!9uAoe?n+L6?gAjmt zL4lJGt=$wv4~%;5FNX2oSpNBYAM5*9mUI7O*beo7&k_5NMwUi&Kd%gLN6Dy4r|mKU zLfah`#wlbhHG4IgBb*pFxBV9;sqcUggdt#ruoHRDk3^NEDD36a9O>V^^;}xD@Q(F$ z)_;_;JLkw(376_RWAgEVn%ce_Lj7@D*(6FRixq}*B|L$;A`fakhhKgnuTuNvD>b@V zh)U&DvldZaFoS`^Vy$9QQ%(%?oH4n?XNeycAI&3k2$FKKm;jxQb|{Wgb9DXJn7@fg z1vrw#+LlV~7web#@@*{j-1h=?3ZN5co(Vh+m^Mf|fOYL~S6w_(QL}i#w9t_bA16qh z`#(T|-2zX3$GQP1FKF!YbZo>W@Ctv|SQ(72;xdlT-cAuIFaB}iN)6y$&$0d?C+!sd zvYjize7bEM0pu3L+IFVYc^LI@Y%c^a_W1=!CfF5*2(dfYMx5!7cE5MpO^kJNQ?Q(rs9R%a$f z7IdceMwUMp&54GF-7*WB=ekbkB+qoK#ODI>{+KPT@jkIHj?ErN-T)tMq>!KhmoDwU zuiu?}#tX_@9gbNsO)HbM1E-zgq(KM&IN#I!R-Y3KGV1brF&-giugCW?86@o4fjLzSkn%+%^%faw z->hDb4Q@m1@nqDPS>di2%xh-!2K-6f%d1UooL5!i%B)CZ>ASebIJ)`rTB}Zd6)(E| zq1oeXpe>FIXH2`c2nr-6Aw}Le(jdC+WZ;UPEWM7WRiVAjmu)1E&BAht_ilOy$fc1(}=3yNAzH)-aV9JrYc zm!0haQyU5ltg_%}CMDKjHB_oWQ+1NkXh28(3XtMZN~dWt2&bG_ZHm<#UP~IA_#J56 z^6;?UY>dPw2b>bA=$1d#4_z(L zyqLuieWv<486?!02NEjpB~D`)+L0PbUbV~?0ek4fJ%JSz+l}d#FNG&_iD_yFxV|IL zDw(#LGx`&nKxO_T6# zh&06$@`c?O}m)x#-o8;#rSUkZeQqCfT zR&kxkj|gzP<;H4BNO1_qNr$%Cm|u>t7U=Dr3U}{5b_fGmSGaqHj^SfUYgv%rA|uCf z9n@Umhk{~&Kab{ciAh&a;R+$Hc@~LmPcp8#wZjXI#dv2a+K3&1(d4rTT-2b!1`$nt zyV6o%hP)46nUB~E1#lP&Z|m;%cgY@h6tzGug5H4+1re-gR%Az%a5N)hho=IvWvNg@ zu_z4F$G^O2z<9v>rfF>sa`LuY?hbEY)ExPZ?T$7yukSH7}W6lhDaX%(P zml+wh;Oak73Fh}nDi`5_?2~Lq?0EJ|EHO0eEmB3r&TX(Q`H&5g0SWj;Pcz_nmuS;C zR@R&}yRS)xC6KjIW3lkaD^TPXyE5*2az_c!mUm`4Z$uye-cZ!=DN&B;?yRpT?3 zOu2cFh5ReC1jW&9G`IPm9}-4IKt5*1!%iVzZs*B2l!gXlJ-b_RC>aXH@mR%Oi(*wt zxlUMjIV~~U3PmwQW|U_k9@}?X`=&WQ>BJaB5*JW?1Yy@YPlc#@>QOBSF?|h4JS}l} zy=*9)6;g%@D^p1k694!D`NNJ?u|cBr9z(tKdJuzO1pkzpr9Ci;&<(Bti81SjW^nS+ zu8GeLt$Jf8Ys+t!JWrD>$Vuy^2{P-Hjw(HndrC!K2K5_J=DakBbyn2l=+Hb7Sy)Wm z?7*Pyx9_f=*3|oGDVP;su=iR7V~waU+?#3*qgfM2^Gyeb^DPJL+%^&HtoAwT8QI@M zofeAHA5P6lO2RCrEGs%5$j8T#RZHzQ?>5&TEf|M>AY7{^_i2QymYioWw2*}*sDGE+ zt50iS$Z?f#ytF_<0iu2$cw`f-I!;Xl3Tq$(H?8dvq}L$yp`{`S^0t?*Y}du0`E&L+MAV+SoGh2& z(Ea6y!}(0{mN8#DK6%v<2Y07OBOPw?fz(U__ntFq$(2p%?GTn6#r4*@V9X+SQK_hU zIB}883`FGjjrEY6m&^n_P8!@>&}*;#+S*6hqT)JmkOUZ7()$dWNwQWiZ2RGiAA=l^ z?K-6%@Vw3?U%@6#VO6b)3^LW@CRx8u$AnVA;i3V98g9+6NC_c0Z%EjZ5%zwqhr^e4 z_ZcM7)Pte&j;;NQ?)pw9<;V(izp(WfZqOT5>PdK-|MQ$#NTCHsKFrCZyV}FCH=E9t zS`kIgy3>)tGbK3mC|$Y|7S5+|m&`jQc}hMY$f=YEn@<~}op)@4{p*s`lUgbP+K|`@ z7RAsSj=?@g%XwIi)Zhmx7jE#QCjplm{6Jd~Ip#9bO9!+zKR)gLkIdn^JQX=e1DYwbu}XC3YG#IktDrQv=1_ zoZPCn{IprHbacHj#rg}qF`4`juimDn9IcPy%q*tgUFZlr6e10yi)BdGyE0LCqo)%K zzA(A8lE=u}WL7W`z?lo+WN!9f&aNX`G%4KlAyP6$gwnPOMlbkGv8s854&NyAnUuQ& zQ)K057kcX>$=&WKoY1X~?L|a|cZBS`7REcm|MEie1nOSKZ0s zWU4{`ojW@nA!WhLR>-clVU zqWa}MlgL~KF^Z+pIsoC~9~)j1!aj)+fP}W;wq6n3NJOLOIfFw2C__vQQA9(hYUDsb zRr40A<}#9iX)IEv3QZv+ ztFR|w*flhok~&7d%G44zoOyq830lu&@%31#<b!n1_!aoo-qZ*w^+xdO8-5V^ifo zHh)Ia@L)Cot{)5!&+e`Z6xQm}c;kM-7bVzx=PMvs1&Gmg{k+!zd611CA@>q&Kz%&! zF~=U8WFT%^E|Y5ZuFOn6qLfApqHazEc6k)GRXDa)B(~KPU`$aznxE!I7DHcF2#aki zk8LZ1Z7aKNEAuD!+Xs&+W3ep1s2rVc;vq;fI;Z;t>Al0@iHhtV2y5E8Nmf+~iPmo- z>+37-cRFtx-1qeAP#;JDfIIeo(%>-uO@sTH+WS+DgB=-!>LvgHnQ-giZ^f>u4S`e( z7AL$zbA|1rMqnSO)qL>>tLg;{DVlcZePz%j+)e1Qn!4^M*S74K7U;`%6 zHUsyevVjk-)ki0z;4@6&LG66rSi`hVZ}jvY*-bZi zNexga0a&0!v*3haV<}_Kt*}JNrBh74x<~GDK)z2HkT{0-6W(mFaNmea?ZI@5E=fKU z%!-6VBJibD#+mcIF<+f-^%rD>lyZQ%cCP1cM~@0ryz~@U_3v_>vp9pvsmWuV`NDHU z^aPL-{=3AMH9a61w->!%4`JUZKNFnTD}SX6{r`#bpMw5(%2(^b5lxrEAlgC{&#|T~ zu`N7B%OnHf_b*M(C&hP&aUSGG%Ggt1UwZTj~HFX#>9K zJ?hazMQM+s;XV~^m`Q&-8I|7KwoDl_r$$k=5ib?)@#nXN#o9D`zPhu#PYz4K@--(3+u)2RD18an3)15y z=TbI9tykd>W2m zAF&JyRBC_9mcUv`a#F`Tu=C@w=s22|Kl7f(gq{8(yql4leyTs01MDNMk%MMpkd*di z>|Xq5Ms%XyQS;e0E@;mMsE6*Q-o1+RaDm1sXzSbE%p~R|g)Bm}6l#Atm$U(rOby5s zO-Zv0bAHtkV88mnAS7pZ{zvGyb)lD=qqX+BE_nWNUHrS1{&O$LkCl>mEp7=sdk&3t zV`&kqf9yO(qi~i-EZtyEPp|b^H58Ly&a^I31_ed6wtH2B=3_Wf+?N!4Ow zG1Wh{Ri!3ym6m@LLt7PMnvHWp{{E4EBoYz(u!wXSqadXUKm$ifR2X;-w7>w!%23)I zc)&SjY%y5whLZC31EF=20R36klV|3!(AJM#l;MT#(VHjeR4nYAYVW-nkPdTJ`s-7D=Oob)QCXRD9VS%|Q6=nX0 zqEUPfDxI!&&aM$BEXCBQ~V9P5%2hC zK1fk~e!k<#OZ{RvyQ6e^(YojUpf}#z0yXJ8Psoy8C=2Iqy;3DmLB~lwVS7P;wq2Si z*7wbCiOeWW)9!)VasTraJuyj_m>MRTr!1|G8GflGdm!SJIl} zRPJ1&8Y$t^2Hfu`N2}6Y73Am+tW`y}?RQ3(@hKFFq$!y-{NG5T`+Fhdhm zxD`g{6ZY!_)f4B4Ezjzi2V8fjt$;|~-C+R63ovgTW>1x8@=3+=CPhoM=TV0Nkvv|k-Fb1q@G zcEm?Iv+^E6>Ky0=BWEzv(Y@Ggvk~eXoDnu~%pw^=BkbsXyVqd~d4kFeKC`q$&)RsP zfo{SHLNqqWx#ZVI>7+{EHA{OMV>nDoINVW8sp?f zAZ%b6z3$8V0nPF7()@i8+`pf{{{X(6ij3J-<7Vvv$~4mcU*5q*H2deM^62&HT<2`vy6XY`KcuQ`5M2UgMWVw_4j`w p`1ufc{ukoiBJpt2I6uR99>0P5?X%WD|BoAvGP{{u}UISBv& literal 0 HcmV?d00001 diff --git a/tests/data/groundtruth/docling_v2/list_after_num_headers.docx.itxt b/tests/data/groundtruth/docling_v2/list_after_num_headers.docx.itxt new file mode 100644 index 00000000..1f5decb0 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/list_after_num_headers.docx.itxt @@ -0,0 +1,9 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: text: + item-2 at level 1: section: group header-0 + item-3 at level 2: section: group header-1 + item-4 at level 3: section_header: Chính sách bảo hành + item-5 at level 4: section_header: 1 ĐỐI TƯỢNG ÁP DỤNG + item-6 at level 5: list: group list + item-7 at level 6: list_item: Chính sách bảo hành và các điều k + item-8 at level 6: list_item: Pin điện áp cao (sau đây gọi tắt là “ \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/list_after_num_headers.docx.json b/tests/data/groundtruth/docling_v2/list_after_num_headers.docx.json new file mode 100644 index 00000000..f0e81638 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/list_after_num_headers.docx.json @@ -0,0 +1,173 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.8.0", + "name": "list_after_num_headers", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "binary_hash": 8533346410935061397, + "filename": "list_after_num_headers.docx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/groups/0" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/groups/1" + } + ], + "content_layer": "body", + "name": "header-0", + "label": "section" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/texts/1" + } + ], + "content_layer": "body", + "name": "header-1", + "label": "section" + }, + { + "self_ref": "#/groups/2", + "parent": { + "$ref": "#/texts/2" + }, + "children": [ + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/texts/4" + } + ], + "content_layer": "body", + "name": "list", + "label": "list" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/groups/1" + }, + "children": [ + { + "$ref": "#/texts/2" + } + ], + "content_layer": "body", + "label": "section_header", + "prov": [], + "orig": "Chính sách bảo hành", + "text": "Chính sách bảo hành", + "level": 2 + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/texts/1" + }, + "children": [ + { + "$ref": "#/groups/2" + } + ], + "content_layer": "body", + "label": "section_header", + "prov": [], + "orig": "1 ĐỐI TƯỢNG ÁP DỤNG", + "text": "1 ĐỐI TƯỢNG ÁP DỤNG", + "level": 3 + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Chính sách bảo hành và các điều k", + "text": "Chính sách bảo hành và các điều k", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + }, + "enumerated": false, + "marker": "" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "Pin điện áp cao (sau đây gọi tắt là “", + "text": "Pin điện áp cao (sau đây gọi tắt là “", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + }, + "enumerated": false, + "marker": "" + } + ], + "pictures": [], + "tables": [], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/list_after_num_headers.docx.md b/tests/data/groundtruth/docling_v2/list_after_num_headers.docx.md new file mode 100644 index 00000000..6730ab9f --- /dev/null +++ b/tests/data/groundtruth/docling_v2/list_after_num_headers.docx.md @@ -0,0 +1,6 @@ +### Chính sách bảo hành + +#### 1 ĐỐI TƯỢNG ÁP DỤNG + +- Chính sách bảo hành và các điều k +- Pin điện áp cao (sau đây gọi tắt là “ \ No newline at end of file