From 40145b59b3c35e1dd19444b343ee0ab165c49d38 Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Fri, 31 Jan 2025 16:03:28 +0100 Subject: [PATCH] fix(docx): merged cells not properly converted Fix conversion issue of merged cells in Word tables leading to repeated text. Simplify Word table conversion code. Add docx file with several table formats for regression tests. Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/backend/msword_backend.py | 101 +- tests/data/docx/word_tables.docx | Bin 0 -> 16404 bytes .../docling_v2/word_tables.docx.html | 75 + .../docling_v2/word_tables.docx.itxt | 19 + .../docling_v2/word_tables.docx.json | 2356 +++++++++++++++++ .../docling_v2/word_tables.docx.md | 44 + tests/test_backend_msword.py | 5 +- word_tables.html | 75 + 8 files changed, 2612 insertions(+), 63 deletions(-) create mode 100644 tests/data/docx/word_tables.docx create mode 100644 tests/data/groundtruth/docling_v2/word_tables.docx.html create mode 100644 tests/data/groundtruth/docling_v2/word_tables.docx.itxt create mode 100644 tests/data/groundtruth/docling_v2/word_tables.docx.json create mode 100644 tests/data/groundtruth/docling_v2/word_tables.docx.md create mode 100644 word_tables.html diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py index 02f8c86d..18b0418c 100644 --- a/docling/backend/msword_backend.py +++ b/docling/backend/msword_backend.py @@ -2,7 +2,7 @@ import logging import re from io import BytesIO from pathlib import Path -from typing import Set, Union +from typing import Optional, Set, Union import docx from docling_core.types.doc import ( @@ -14,6 +14,8 @@ from docling_core.types.doc import ( TableCell, TableData, ) +from docx.oxml.table import CT_Tc +from docx.table import Table, _Cell from lxml import etree from lxml.etree import XPath from PIL import Image, UnidentifiedImageError @@ -449,30 +451,10 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): return def handle_tables(self, element, docx_obj, doc): - - # Function to check if a cell has a colspan (gridSpan) - def get_colspan(cell): - grid_span = cell._element.xpath("@w:gridSpan") - if grid_span: - return int(grid_span[0]) # Return the number of columns spanned - return 1 # Default is 1 (no colspan) - - # Function to check if a cell has a rowspan (vMerge) - def get_rowspan(cell): - v_merge = cell._element.xpath("@w:vMerge") - if v_merge: - return v_merge[ - 0 - ] # 'restart' indicates the beginning of a rowspan, others are continuation - return 1 - - table = docx.table.Table(element, docx_obj) - + table: Table = Table(element, docx_obj) num_rows = len(table.rows) - num_cols = 0 - for row in table.rows: - # Calculate the max number of columns - num_cols = max(num_cols, sum(get_colspan(cell) for cell in row.cells)) + num_cols = len(table.columns) + _log.debug(f"Table grid with {num_rows} rows and {num_cols} columns") if num_rows == 1 and num_cols == 1: cell_element = table.rows[0].cells[0] @@ -481,52 +463,47 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend): self.walk_linear(cell_element._element, docx_obj, doc) return - # Initialize the table grid - table_grid = [[None for _ in range(num_cols)] for _ in range(num_rows)] - - data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[]) - + data = TableData(num_rows=num_rows, num_cols=num_cols) + cell_set: set[CT_Tc] = set() for row_idx, row in enumerate(table.rows): + _log.debug(f"Row index {row_idx} with {len(row.cells)} populated cells") col_idx = 0 - for c, cell in enumerate(row.cells): - row_span = get_rowspan(cell) - col_span = get_colspan(cell) + while col_idx < num_cols: + cell: _Cell = row.cells[col_idx] + _log.debug( + f" col {col_idx} grid_span {cell.grid_span} grid_cols_before {row.grid_cols_before}" + ) + if cell is None or cell._tc in cell_set: + _log.debug(f" skipped since repeated content") + col_idx += cell.grid_span + continue + else: + cell_set.add(cell._tc) - cell_text = cell.text - # In case cell doesn't return text via docx library: - if len(cell_text) == 0: - cell_xml = cell._element + spanned_idx = row_idx + spanned_tc: Optional[CT_Tc] = cell._tc + while spanned_tc == cell._tc: + spanned_idx += 1 + spanned_tc = ( + table.rows[spanned_idx].cells[col_idx]._tc + if spanned_idx < num_rows + else None + ) + _log.debug(f" spanned before row {spanned_idx}") - texts = [""] - for elem in cell_xml.iter(): - if elem.tag.endswith("t"): # tags that contain text - if elem.text: - texts.append(elem.text) - # Join the collected text - cell_text = " ".join(texts).strip() - - # Find the next available column in the grid - while table_grid[row_idx][col_idx] is not None: - col_idx += 1 - - # Fill the grid with the cell value, considering rowspan and colspan - for i in range(row_span if row_span == "restart" else 1): - for j in range(col_span): - table_grid[row_idx + i][col_idx + j] = "" - - cell = TableCell( - text=cell_text, - row_span=row_span, - col_span=col_span, - start_row_offset_idx=row_idx, - end_row_offset_idx=row_idx + row_span, + table_cell = TableCell( + text=cell.text, + row_span=spanned_idx - row_idx, + col_span=cell.grid_span, + start_row_offset_idx=row.grid_cols_before + row_idx, + end_row_offset_idx=row.grid_cols_before + spanned_idx, start_col_offset_idx=col_idx, - end_col_offset_idx=col_idx + col_span, + end_col_offset_idx=col_idx + cell.grid_span, col_header=False, row_header=False, ) - - data.table_cells.append(cell) + data.table_cells.append(table_cell) + col_idx += cell.grid_span level = self.get_level() doc.add_table(data=data, parent=self.parents[level - 1]) diff --git a/tests/data/docx/word_tables.docx b/tests/data/docx/word_tables.docx new file mode 100644 index 0000000000000000000000000000000000000000..1513796e09a42730b23a5faf788dc96c60995469 GIT binary patch literal 16404 zcmeHuWpE}-uH|QDW^OaIncK|F%*@Qt<~B1kGuv%!Gcz+Y+ihm{dhVOqxpQZBH)8+3 zR}rO%%BrKR%92tkr6eZ_0*VR%2S5S<07AfK!Hks#5CHH63;;j@KmuzD+S)i7+c@be zx!V~#YSX$|TYbp`1*XUW0DnIJKgWOL5vWfXwdtWp6n;p2Mu`2TY;crUMg<(slSHF% z41wtZtnwB&*!JFm0xYi#6bo%dM9TEA!K51%=e$O8OgvJIOCz$DHl!^5Pzi zj}aExb^`qd|1hhs6?0oVmJ}-xJ#j;C+_)mNY({>DKbqPjAc`t6T7~3$BoltXD>~EG zL;D6u@&>6LrECKP4!lkdGE98!B8!)e7!jt?vg53pX`)nsUOj5)iRbZLc7kWJT#Uo2 zza!pd6gDPrikxZ`#Hka36SMr+p#h>ISN+oPYELCC_vW7k)H18MTer%Er1j4$G(vPh zu@h@)d6jCp>scQZ5JizdZCt{;t9S^Mv0K;x&O@01G%4O{k*m=*l!Cmq&=l+@E)mOh zs8}u!0HuTOjlNUEYC>YOBf#^DO|w@vM(_y4Zs0B*dL{4jq`GaEnk-7!PNc`HHt z5<5v5F037D;Yp`g^QFxVkQmrj$l$hMw*#AgaBuQ9LM%=>;vcS#7B-0u9{U)QGTw!< zPbGA}UkqbTL^(NUNK%)dWv{&WkmAjhlgco^U?HaI0a3;++!2J=GJ)j{ZNgiV%tR9t z-Lp}v-KTUC-^3VcL3u}wZADK{r97b=wQ#nVI!S>ydM z>;Q|G^@32nUU%CTCqtY6=F>3!yM7c<|K3T50{}or005-VL2WwyW&u-nx1p{@V`=ZSl}Wct;#eU9fUet*w(`l_Vn8qcMpEsk)scq|3FQ)BRz1 zY^SgW;c@W8H?fDFTy2lZx;|s3{>2pP(!2e|gs+f6{$>nOSUN3y!u05GT2cvJyV&Mxy`cDqrzr8biip}nio7H3U3fkRX>(ko2=BL(+BlC!}p3iKUv4`Z6OPLp>iwvLZc`HBw^}H&sQl zA&tC|NYWy{VlH#D4vtTkAK2!OL4HG@m^}PuhEKLhkW5gpWt<{oTU=-UOk@nK5&~5m zet{=JvZ*1NvMX*0<;fgPJ`F1iO?g$kZCKkhy!r9dTNmw(%!BtstrKZH&G>6woyc^w zQ=>$ZpuDw1&(Zw>G4vD?c^S92k`nGLA)Re&)hepcj$oT`x(1_6UEQ3;y0tV#y*-j&Rn8 zh8a26s*lYRQ0`njOtQhk3>qk;5SPO8Q=Lm^m5G$eqL{e_{gIJQzp0J4GXFiPFM2ex z{@8rxeJSPCR5(jxXoS;q8M0F5>)2wj9VN|^r{~S!8<9-UYASW?X$QzM2n06Johqx* zU2dlcuTzT!?^AMX>NBG|E=4^1E4BERHJ7`5wxqlN8g(zptKdB$QOQ0sUl50K%al?f zqYaFRwP?|XSkJeomsvyrj6n{;SX|o?t!GAIt7}I>oKui4^4k@QSL$%wCRbps_&a`J zSaZ+z(G6Yx9n-Vy^OI)11jOX*t$s||Nc9{_sW(u&Ypbx-US^{ zIrUY4i%z{ z2R-uqor&yu*c;^Yj@Zp5Z$eeKstBiyxAkgl0PD!IinO}-&HdfIa=ufnE7y7ogz#qi zDTa|^AdOfO4u(PQdr3PSQOOA+JUJA4NDud1n!79phw92B~0xUZPZbgsr6iS zIEn`l5iZMC@S!~T1h$pMFTd7{TF)M9SV}JE`2(5lv{!3>@f%N>%#lM|fLLti+31P>@LVmEZumYeoMGrMAiuefb6j+IkxS3QW`3d;Gao1({*U~CM3QtXj>PS zfEQ*}Ft;b!mNy=Y)GY(Dx{JRAHCcn$YKBi;<{OKWCa;)XBa1=or#x{C0(7RtRI5*?3~uy5VZ zTfq#ek5CMAcASc&Z37?QmA*ia5SVIvs!8=nU-p!;B&AUkG2c>SQ>%f#KsN1-&)Y?A zfUPbUHPW({#iNE=i9+xU%|fQ2Fr)_H^A%}o_eF_RzTF~49qRAY>!qUBw6GJ&ea&6~ zy=>P$CArZZi}Q{|SLUc30#RQI5I@`s+{KTM4kfnl=5yUulmlak3-ikgRwBrq5*pe@ zp;0HA@NY=vR9!46>4{@)gOXOV5(N!q5VLgf_hVL{mtvw^QZgsInRhDJYZ+%e&h z_#T87HOK@j;_F~F`*x_UptgVETc@b44Gg5HY0=0}SGveu6$j%LU+8UuvqWC+)HJyy z0^>5Q{;1xka|$X_tl~6|{%m7+4AI}>AVYVR&xpQk>1F>)u!3nSgtwetKAQr5c(W6M zuv@`h=gqYYBZ4I<6bjZ3T~}qwKd}94$CWN5II!kVli9@33f+Oyd65MV2a7(J7;B^kqdE45aCVltuF}$IuN(3i zLv_gcO2=ca^xR4bQs7`u{G|fM5n^0usZ*jEpt+~SP$%ejEL~2|F*PIP9SPw=h z_FZICHJ*VnWR}xwQ^g7Gu3MkdNuDN9-L@1~_v*3=~^PV%0$b>P` zXxDI9QXtu067Dib0JjO}T)-NgerFlsgX8HBk(3+XXN)go;Fss34$Sb)#F|ZO1}sce z1HtoO7MFt=u-RG9Y8$TB1J|OTf9CVR0A;^84^E`ezm%^wEMCga%sn<5r+!P2^lv?@ zPqPv~I<1F927Ye4<0$TdChZiZg4fauYND@c$;WhcwPmgM+ICj^8JjEnW$XNaef77U z?PIMVvk+Ey6p!_qS|1ymO|Qe6$b>yIT|o-s<`|Xk7}(CDliw0D5MO}(hpNaK-RRdnsl}^gprKC7eQp`0D7r6m-{c zy({=UPT{gU|P&0i$+xr>8yajL{LyYdfw> z!8h$_y7@_nHymaaNvw+-I%lKa1i;I-br%lfk%xy0oUDHc?18^QH&X3KY~W}3^cEZp z7sWk2*#4Rio5Po(X=8lFa|>($rIpIeno;W`6AE+-GU=g} zBz@VE9-owJI@^LDUX3BdyR33ul)H?x*e2l0B=5ei=b!gjE?YUs<#!$5J(>amR%p{8vz% zyK*fyqpG0o;UZi4%nLwerMlIAD5Ha`VRd3OPMteie)!&JP3yjjtmE_jg38MHb~R$ZrxXW|;f98bRn%CWtbvW|F?i)}c@=I(&soKD zkB2;pD4Ybrnc=yvA1s(vK*?uph5&`YmrRgnsRo;QNQGUj8&}lhM0wHq9`6braGNkS z?*A$SHk>+k{a!66F11^@LJ`9qS3)D7KDx>-z58_V9eq9VoaZQh)5utUL2ZYfMtg&| z=$UcS26qzGaU{A!dGcsaoJ)ngm}+@e_|`Zr%9xVj>xz_$?b{t|R@)KXb>g6Vy%_HY z!GH4P7L4lTZ$CXahR-+%E&vkf4^QsD!X1D1=l&7;0Q!tVetLxey^o5x(NEX^GYS{@ z?w{gC!|5d_uxBMfxVrEL5XR>%p@tg2-R`Uw6aPJ>*K+AlmEt(*%ep$*9p)M_=TV$9 zFVHeS$ zf5ZUPHN$b(P`&AHc|+V?xtQ)Bk#}f1SfyR!W7Z39UaY%Tb&V1UFaRs?Y$xkah6aey zE&leTY9;IxQcM`C`DV<3T07kH9MP><%0}kA>8yE&xzh&(6;Zg7e)BDh$gV(%!^3jl zkm!T-M|l*bey{`!BMGxhUAE$8tHLHE zl02biT>4R`7A-~!jVH}u&u(N6=RK#iZN4r@$WnhK=SkgkF_0Zc;@9z}gc_YG2Tt!O z8oyf6@N=70%aq5s_kpT+8_O*=xHfR(=<#(HnU-C7y)70B4_KE=uSEhU9B>`~+Q9|* zj&J9`-fH|7b3(R^YQk+AI5SnEHuw$_v>JL^E0b37_b-8!>4a6uvO~($!ne6c$dGd~ zVZda3)e6nvi;)*)!S?k6+$FX~fGDLXlGm#)+!-&^syd&{!h1&{lM3ppC^wH4JOSB^ z8DUwD(iYAVF~yzl-|vqO>qI*!ZW`Ddr8eSgCZoc33 zC%bUb?PkAS9@lMpf3V&4}-K^fc-M3#xP3- z;MFC^vxj}5mAs9Q=>?_2g85EO1ma}~235ZWK2*P}O*R_{_xOID#s5feAn8Zb^;_Nw zfIvYCGE!mLQguM3sh=8fKfGKU(F(U6YJI+)lsq`9G{D_hiYqdd;~p-S+>1ym24E4v zfK>0}=~?>u8z_r$nlGxV6)Sy$KJdvcaMp8tUhuyF-!dl9Ux+Sq1ub|3l2DKm&vkPLOEy@f+&~s0{-r2dI z?)=XhBy}UoMs?#XW&>pwg9bJMr-abvVA=ljDm&`kBtaDtH<5u@jJt+p zZ22@g!%q4I3S)Cc4e?FGhK62=8n(hykYDEy?%lK*dVT0>2Mfm9Rb%++Xoyi>U{((x zkqv(k=|z13e`$33Dh&I8L1e*cUQ?)SV`H<@19w-MM?Ns@NR{(Zbsc-xKWmGx`SVcm zY{{=$@ldHS?(C!p`j_@BEM@xkug(IjZes_SVZ}b+_7XE%sc!YTzSWAM zsN0rnYx&vC{Z=WdR$l~7I310lkT!`kQxmIS%$y|5Rdu1IZb3)KE9{61d>?5iLLdjs z2f9s-VGi5H6LKqhkL)dOaJ$N?V$Etb+N$fQ+H(`g;*dyeo5WEB>XRTupPf72JO8pP z<(Ex0NET_mA@i(^8&a|P61ogr>@dj!TxtS6o5hsOo3dw^A>pNEOeNM+a402gbEG;< zO}T3T>YmVjE7$mr3DXDYu8Fv-#Sv4myLoNY*wvJ%`rYA`G^=Y&p8td^Ei5cMonV-X z{QJoAhq6B{>2a@_3v|?Q+q3UU3iNqTnC3CnX4pV$wn4Z+oCrOy z{Kznu|L;S%%oz^m*)!$TGf#%WT!$IU?3PIk1(3o5b$j0u#n~+0jWt4D1LZw0f90f> zUyjAnU2b})qy#PSOz!g5?>w$k~B zcm4vxk03U9E?4GD0*U+(iu)f{=b^7PvMNuS9_yVq2Ru5-E6GL7Rf^gWiw!@$9WS2d zZO?&{lH9kV+lw|If*ZDnEa%hSOO$`tJ5FfZXmh{TI;Eoumc zTDH99ED$gUHK>?!U6OA!eRCjXmkoNjxs6>lu%2r7`80C>ZU$Q;t#9E$0RRlxKZU^^ zjh&p#ZA=~iW&-O~R&91!5xwZ9eCThrL@`FAh%4$scAJKY&e6-MpQS<*qysSg3U=JW z-#vm8%u1zyhPcLJq^B`CnlN+Wp7aiyPKd|~nN#%>g-qt@2?l&ts2tL3c7u-L zFCK_(f-e_~qwYV8^5cO^mSe;r7?lc6-aS%6uQjQMVz`YU$w$l)&QPJ7@|QvR8 z)0a@-3%u=nc5X{-%F+n7@MecFYnO3!ac{zhEXj6{9BMqo>eU)9rn=zOw>Ob0(Dykr zLBHw9h3t|o4=_Z1CLi!fE?kRRvLWWFB3Fq67lU!L_FM^ZXL5S8-SxQXG9qYXxv3TK zJ;B7aR-R4dfbA^r;0yweYkSJb*w=fxi=0K?U-2>*+j1S34So%}p0P8y z+WE>H>f^JG3gP!+9NAj^@ryCxZorq8Lk`^9vUECv&s^Iz_*u)Gud@59Ds#jeB3i^c z6k#Gb9;3kv`Et>cR9rfUn94>Wd(zw8S508dq7jw3?f7<;+gL>{I#cl>Wi-*kxvbZ=ygr1n+582O*H%&XT#VgTi@W`^6}mGoGhTPk9vh z@wVUjA19B%gPN6<&&{*rCIA58A8TkwCwD93Kc7{<@dYV(Dpz)E{6!shSN`_Kr8R>#?4xIycAW{6R4LZoY2y0wHE!K@fli=NBc3R0 zP+W`aoB$se_rg5I3KCkAdMBnsf7?WOR$%4AlzBUZdOZ1xgKio>D2MwewT3Bmf%^o) z_e6>|w9r{-6Gjlea`rK`3G{x1S3Rd_5>&Qq+#%WW0&w;oAY5SI-5BqF{S1o;FB|in zJ|y_Vrk+q@kikkOaK3teeq5PR?-1UewXTtTB8{CbiKzQP8p@%#y-~6qqHe+oSRph$ zD3}XrsW8VCd9cSOWZbGA)r)h5q*(W!ST`kbuAjhA)bEo|NIrX_&&oC!Hq!*?p^shd z_bACWeLZYW{H)@!;nmB|MV%+M*OR1@;aqNR!q@asw(dUe`Q<6UeC{~Lcey))Q4RuYp17iR+P$WCxW^SbUSPJPc$LzxZRdm3oA(8r%4%o`kF}d4i|BYZ zQ!<%Y$K4f8JxDz4Z#1xFOseA^qfpZ$4jHbbiAtQ3g)v?qj~5LDk*{{W&M@y5?H0ek zq@V7m1=O6Ps-q^DHVj|+9q}WFlYDn`(C~h$N79Ou?sg(Z=6kEudjK-zTE3cd&1aqUs3C4=EZzm^H?o-b`kG%a|?M zc>i4pYt&9?l%C*|m4!P>A#r_lwir_-ys_`0NW}g|0gHU~TUv{acKk!}hcvUb^}VD- z-xi8gBcaiQJYsMU5nEIvo@1T!>9uM}P*JOeSz~!nl*|0oRTVbJva+DUN~@Vz50|l^ zJ@%Eo&^taWh}?@EwS~~GT3^sWjTuy=VJtNEU>+!Q6zSA7dlI|Iu38VOpW@Fc^gxlH zvNY2hk*mBSP22YfP&Q3(#J)H-K0U6zXOkD7SI28`zj!@bx9`a!g~TSLggyfr)6|H9o>GFAk!Gi^o~yZJ&a6gKC@Mc3EEpck%( zv>8e|c@#B!3nRUh22K*jo+a89aV?zMPfW{@ISn>$hOLn8cZQ%;0qXBmm*x>N&eD7n zBs@D*_aY%gC!Py!*5R!K9ovWhHZS0LmJ+qi)5JY36p)j1I}McZ`V2XD(_tDK%#qbw>)iM4A<^bwe^gWVL3r25k+p9IB+zYKS3S0(;5hnmaL=WJ&l`KSo}R`Z}PmIMp88151!}qw{z-BbNHw-z8JtnOlONn{Xp$ zUPGFL%Pjn`A0o?#Yb=Ol!BB#0S8-hw-K`Y%t(Ma0OxrPI1#N_GK?wI298Ai!xjF2p3s zuy*zO?mDuC#<2CJ7RP_&^IKbuk=CfoEjpnbJ*Oh+fkgV{)0 zZQ%mqs~$@i+>*99kHhjtni`g?_{ykP1uhG|hG8o}=>+GpYXX+Bim+PW=&zQ}7^rIV z_4bTJu`0fD3MF^(Ow-k1E;EQLHmYiK16_Ay2qBDQblRkpt+L6m!*!Pa#>-|e{066C zU(_qB#Znuw9_>FHiDzK9Q6mT|!{-vhpE_g_B_ZyT(A_)?&%iFeOgOa+9R=BB1$Jh? z48787^fhEBu2N(S_!QB<7{mp#UXBn{qlDN5+66K}&aXLMju2X-1QD1<3DOUpD#Ld> z0Q$^cAn=}6Adup3r&+GR-ywg0rfhd@!{VGoDdYz_RRA9hnm|q<)nPdC{|ol{EEHM_ zClp#wR!`%S`68 z(Z-0ifON26VtcJ2TqnkMM$Jpk-9T@tmMttu=%M(SQRs?;#(H03vQUnU=extSS8#MF z&vmZkvks*D1gh_3kj9U+k&3T-(vm-i(w&7{%%My&>#9yIHGb&ZEhIV>(vF5JJlcUq zw9Z86bJe7{8e8__wlsYe#R;0~)lOZe4}d=(84TK%{}Ol8csI?*i>JDn3>;8p$BZa* z3aPCR)l4VgjyLPaFWHto|ULw9(GrgB3y1pX^@R*CBPvcyCQD-vlHrhM4(UfpE+tP zjZ#VYTcGv^eX~I}!Q25i>P1Y>fD;U!kJCSE(qKfyRh5h>k8@nJ7$zByqEF!F?2*Gh zgQVc8q<1qolQRoC`+TWg6-jHQR+#xH-$LwP8np_Mnl#S;n=F?OR3@#I{6Sn~CtN;1 z)xNw?<{yNVbG~U_0q0MI7{ci5=U9BUl7Eu8Op7&((#U@y!xAhu4RhlEAV$TXy;|h| z3;D}Scx{o@1dH;mSBR;AzGq>$3SS%Z5AbGOI;Kr$km{PDK61}Jl;p4Zn6^p2^Y%#DpysTQT`WT{)R$F6YvNJm0=m2# ztu%sh?1?_X*oK>Oqr#6^s?`P+YHqhWFc4>K_ys?I&887?SHsmda?MWVN`TvzD!rL; zg3|E3Z~Tx@#e03f&w3SjBk1^4>0Q07#ibbGp;xA3Rix|aQl>mQr# zw}6q;)Zmd)EK1{*;8>#lU0wx)yAPQ8)*5>zMxv++q}=M&YZN5WO;MpC^eo+TOe~NS zw4c@tIjS|@23wD*gI!}0K$ehV>Zi<CKtxlV& zpPf+y%6|N0nF1HpV30+ff9-8m0$uSBi7P9fR?W+Ke1}ORAqaV9(OmUOn+lA9+2s5d zGr7CA!mC=H^mE1Z8aDt*)gJFg9`eL<~+&OBJ%FG3lpC0o9lSw zW4Fr(cEjA6&PirBNy*Kmih$+3A5G*jXM0w z^b>iwnr#J1AhjdhIPDKtf|t`gsfeJomgZwW2~*~+mmQf$uZq7{?GPpX+1 ziJhbFB*%*mJ}Ajcp$J?ZLOii2@iwm1;i{%`*-fN`AYMjZ56U67R28t1nfRj{LntjL z>94~}dmqI~pDwC%(D~AK4q;qYh?h&t#aQfJ{x@s*3Lgem$)w_EIdz9BjrolccLj0U zp_S;1E|S`|_u9ey`9R`CwteR>JAPDMkta#~&E?}h2P|Z)zwTt%tc()~l@m0s4umNV zsl(04E>dw4q{d-Y7fo+TDP$T2<1JgA-LJyp?K2!?XUpvmI-653*w(8%4D#iL(qXCH zB!iy55HI>tU#zVg^NsLtp?I*9n>(TSlt+omwvFi8&9C|}kd;L5cb=Wju_q zcEDl8ZfV*hqI4jUPdBsrqmnmbm52Q1Yo7?nox32Kxc(8`IA`&@WD)a__7gZUChq2w zTqBL0SjZYlqNxH6d6#4tfe63X>7vU%;6+y;ney<1L&%x#Kn1b5W+GQpzW7&o6Mgeg zvN8u8io_i15+?rRqHxBL(dyA);hLJWV9yC*K0KS`C20ZEz19weLAnEH{LFPH`tegW+t+ZDWCTIJvPKSE;bbFmD_v6-7gwmrgu)!~~F1KOEn_2RrnI^m_ zPnd}qCs=rwgfRDqijBN3`a@%4i$x#FCL+bN;fuDDzXIFZ&lN~iZQ1#wk!UNMU~y>A z*ds`2$JxhY)^G$SVlUyIGD+d^OIz{`hzMF19h~a&Z~T9?_XnPR%h?6o?-^d#v5)Zj zq*Os9UMs%O`uxXKYO;-IG272OG@o%~)PID5T#XGB{}e{fm|eH&`GN}AdLi`)4)Q4E zV39OrQAbZ+r7F-8baZosl9A`@OCZzZ z1~R8-czRWVN0ZoRVRfTo4-Et1s?(ld95}1=K7+I&__UBv3>c>!7Tk7>P&l`Nj;|qp zrPr_JD^EqxZp}?wKC8mEn?t*Cph>)i;E0stW$bi;?`mmNeC3;yl z`*Fx1Oi_$&E7|R8{V^8Tky}ujX*G_2Ql)ZCPo-2(zXUeBckfeQ5tROB#Gtjgzd-+& z(-opN-HNukYX}vyZ0ZGg>d0Ox%8JcR9f4pSuIqlf{}`uxwZk5O|EvQchX4SO|M7!N zK1;Ne^bM?x{}$!ni(j?Lr$-FBP(9)NvdM<&V1~_7x?)XyHm*~#0$~>crQ*ehPflXn zX(*^rX4ar@i$Q|=QQ_Mr&T4UhnjNR=6RMI z2jaxrLFQs75srw~AcIi_hbx088JJx|uaf|QJQlfOY?=B4Yz;$gdCpNVzIS3?&`Nk}MJI$4IL{ib+1VsRk+LvrLyI zz4#U#pJ}`cHIV|yGR8mr6NT*DcHRS2VfzVBx|3)rlp&`YlwBR-P4rVlrWDU;gqlqT zehksqnJ#gq-AYkrPU)$7$ckZm@1Z$<0nU4iZFCr}R~n3mjs!JBX}E3UIQBR&GRu|r zTQe|#z-b|IyQUaFbS+S><%?QuQoODc;52?!-i}4#t$T4U_H@|cm~H%kA=oHDk9kFv zz83 zO{@fs^F1162JGkaXFbHfZxK-n5JtNEP4Pb^2{W2jYo}nlkV3o=y#^SHA1NfWCfER*caI_9om*z6C?UAdX29`a z;&EmFR*+?ylRZI1l|)hj+3!F^ti57reTRsL80mWvm}NvzSOw5zFVl+ayi^H)Wb(ncU!3MD^sc+F%r)Wfx6Sy_659hQF^P5;^_}&OT76A zs>#3JlnqNctJz~(P}?qN*yNI(FoGaAvvmAkw%w!z#_;HEwYVXSXw&>0!7C$dQsS0+ zHFN5`!G15W`|+F5!K488WHboyoiJ-%5s!j$JE7@bsrknie8ygeoh#utj2{$F+Q5)Z*7K&ZleAI z_E&t-ZG8WA7mKLz#k$WZu0R@;cL5aiD* z6(vRmHHqh-?YHDhfhLmL#i?;uYqf`{1%&OxyEQYqn3?S(b5vN;0+g>&#>W1qq)!Vdl3+JcGy*MlAbQ$N+dL&dNR?$=o_8GGFZ%{_`xbCw zV|bm?P0F+^{NN}OXrUBO(4XSAa{Vex7_L$DynyIvONLB~Vf4J9ev5z-qL2(uI}kW=TmmmO1#()8cU3e27Lp5SLO38d zg*sAPp@Y#~6fm_Ef!mPDh<*|P7OL)gkcn}qh&&@@JDk#KBdQZOw}uYGWyU99#IYwZ z!fSS?L0tm!h+zB!fP6K-C+cx{uQaffU2yt9Xa`NcJ?z!(*XP_|t+0u71pD;zeh&P9 zi7XH>&F50-KbKGc@qhkA{zEafoaDbU_}603KVbp!pGlE_D*gN`@UK;H|Af|k=EnV7 zP26AM|C)OFPcQ)B^|=uIza?V+mD69-@&3uH4*ma-ocC82e@$ihCyN%mzq9yrV#8nI zf8EUdC;Wl%AMpRZwfR@@UzfoD1iv%=9sHLC@n1Rob&d8<4%sY!=kPBpw!h;4)k^;p w4FIgO0RaDIv;PYJ*Lm~L@Gy=)!T&ys%1MHKipSpyi(vrWpEE5t_usDm7pd?I_5c6? literal 0 HcmV?d00001 diff --git a/tests/data/groundtruth/docling_v2/word_tables.docx.html b/tests/data/groundtruth/docling_v2/word_tables.docx.html new file mode 100644 index 00000000..30f6e8d3 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_tables.docx.html @@ -0,0 +1,75 @@ + + + + + + + Powered by Docling + + + +

Test with tables

+

A uniform table

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Cell 1.1Cell 1.2
Cell 2.0Cell 2.1Cell 2.2
+

+

A non-uniform table with horizontal spans

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 1.2
Cell 2.0Merged Cell 2.1 2.2
+

+

A non-uniform table with horizontal spans in inner columns

+
Header 0.0Header 0.1Header 0.2Header 0.3
Cell 1.0Merged Cell 1.1 1.2Cell 1.3
Cell 2.0Merged Cell 2.1 2.2Cell 2.3
+

+

A non-uniform table with vertical spans

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 2.1Cell 1.2
Cell 2.0Cell 2.2
Cell 3.0Merged Cell 3.1 4.1Cell 3.2
Cell 4.0Cell 4.2
+

+

A non-uniform table with all kinds of spans and empty cells

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 2.1Cell 1.2
Cell 2.0Cell 2.2
Cell 3.0Merged Cell 3.1 4.1Cell 3.2
Cell 4.0Cell 4.2Merged Cell 4.4 5.4
Cell 8.4
+

+

+ \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_tables.docx.itxt b/tests/data/groundtruth/docling_v2/word_tables.docx.itxt new file mode 100644 index 00000000..dd42eb0a --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_tables.docx.itxt @@ -0,0 +1,19 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: section: group header-0 + item-2 at level 2: section_header: Test with tables + item-3 at level 3: paragraph: A uniform table + item-4 at level 3: table with [3x3] + item-5 at level 3: paragraph: + item-6 at level 3: paragraph: A non-uniform table with horizontal spans + item-7 at level 3: table with [3x3] + item-8 at level 3: paragraph: + item-9 at level 3: paragraph: A non-uniform table with horizontal spans in inner columns + item-10 at level 3: table with [3x4] + item-11 at level 3: paragraph: + item-12 at level 3: paragraph: A non-uniform table with vertical spans + item-13 at level 3: table with [5x3] + item-14 at level 3: paragraph: + item-15 at level 3: paragraph: A non-uniform table with all kinds of spans and empty cells + item-16 at level 3: table with [9x5] + item-17 at level 3: paragraph: + item-18 at level 3: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_tables.docx.json b/tests/data/groundtruth/docling_v2/word_tables.docx.json new file mode 100644 index 00000000..957a83c8 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_tables.docx.json @@ -0,0 +1,2356 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.0.0", + "name": "word_tables", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "binary_hash": 8379738677198259833, + "filename": "word_tables.docx" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/groups/0" + } + ], + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/0" + } + ], + "name": "header-0", + "label": "section" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/groups/0" + }, + "children": [ + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/tables/0" + }, + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + }, + { + "$ref": "#/tables/1" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + }, + { + "$ref": "#/tables/2" + }, + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + }, + { + "$ref": "#/tables/3" + }, + { + "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" + }, + { + "$ref": "#/tables/4" + }, + { + "$ref": "#/texts/10" + }, + { + "$ref": "#/texts/11" + } + ], + "label": "section_header", + "prov": [], + "orig": "Test with tables", + "text": "Test with tables", + "level": 1 + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A uniform table", + "text": "A uniform table" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A non-uniform table with horizontal spans", + "text": "A non-uniform table with horizontal spans" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A non-uniform table with horizontal spans in inner columns", + "text": "A non-uniform table with horizontal spans in inner columns" + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A non-uniform table with vertical spans", + "text": "A non-uniform table with vertical spans" + }, + { + "self_ref": "#/texts/8", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "A non-uniform table with all kinds of spans and empty cells", + "text": "A non-uniform table with all kinds of spans and empty cells" + }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/11", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + } + ], + "pictures": [], + "tables": [ + { + "self_ref": "#/tables/0", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Cell 1.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Cell 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 2.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 3, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Cell 1.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 1.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Cell 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 2.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + }, + { + "self_ref": "#/tables/1", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 1.1 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 2.1 2.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 3, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 1.1 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 1.1 1.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 2.1 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 2.1 2.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + }, + { + "self_ref": "#/tables/2", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Header 0.3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 1.1 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Cell 1.3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 2.1 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Cell 2.3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 3, + "num_cols": 4, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Header 0.3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 1.1 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 1.1 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Cell 1.3", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 2.1 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 2, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 3, + "text": "Merged Cell 2.1 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Cell 2.3", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + }, + { + "self_ref": "#/tables/3", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 1.1 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 3.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 3.1 4.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 3.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 4.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 4.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 5, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 1.1 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 1.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 1.1 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 2.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 3.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 3.1 4.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 3.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 4.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 3.1 4.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 4.2", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + }, + { + "self_ref": "#/tables/4", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 1.1 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 3.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 3.1 4.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 3.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 4.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 4.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 6, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Merged Cell 4.4 5.4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 5, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Cell 8.4", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + "num_rows": 9, + "num_cols": 5, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Header 0.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Header 0.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Header 0.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 1.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 1.1 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 1.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 2.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 1.1 2.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 2.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 3.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 3.1 4.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 3.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Cell 4.0", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Merged Cell 3.1 4.1", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Cell 4.2", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 6, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Merged Cell 4.4 5.4", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 3, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 2, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 6, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Merged Cell 4.4 5.4", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 5, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 5, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 5, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 5, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 5, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 5, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 4, + "end_col_offset_idx": 5, + "text": "Cell 8.4", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + } + ], + "key_value_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_tables.docx.md b/tests/data/groundtruth/docling_v2/word_tables.docx.md new file mode 100644 index 00000000..90123c3e --- /dev/null +++ b/tests/data/groundtruth/docling_v2/word_tables.docx.md @@ -0,0 +1,44 @@ +## Test with tables + +A uniform table + +| Header 0.0 | Header 0.1 | Header 0.2 | +|--------------|--------------|--------------| +| Cell 1.0 | Cell 1.1 | Cell 1.2 | +| Cell 2.0 | Cell 2.1 | Cell 2.2 | + +A non-uniform table with horizontal spans + +| Header 0.0 | Header 0.1 | Header 0.2 | +|--------------|---------------------|---------------------| +| Cell 1.0 | Merged Cell 1.1 1.2 | Merged Cell 1.1 1.2 | +| Cell 2.0 | Merged Cell 2.1 2.2 | Merged Cell 2.1 2.2 | + +A non-uniform table with horizontal spans in inner columns + +| Header 0.0 | Header 0.1 | Header 0.2 | Header 0.3 | +|--------------|---------------------|---------------------|--------------| +| Cell 1.0 | Merged Cell 1.1 1.2 | Merged Cell 1.1 1.2 | Cell 1.3 | +| Cell 2.0 | Merged Cell 2.1 2.2 | Merged Cell 2.1 2.2 | Cell 2.3 | + +A non-uniform table with vertical spans + +| Header 0.0 | Header 0.1 | Header 0.2 | +|--------------|---------------------|--------------| +| Cell 1.0 | Merged Cell 1.1 2.1 | Cell 1.2 | +| Cell 2.0 | Merged Cell 1.1 2.1 | Cell 2.2 | +| Cell 3.0 | Merged Cell 3.1 4.1 | Cell 3.2 | +| Cell 4.0 | Merged Cell 3.1 4.1 | Cell 4.2 | + +A non-uniform table with all kinds of spans and empty cells + +| Header 0.0 | Header 0.1 | Header 0.2 | | | +|--------------|---------------------|--------------|----|---------------------| +| Cell 1.0 | Merged Cell 1.1 2.1 | Cell 1.2 | | | +| Cell 2.0 | Merged Cell 1.1 2.1 | Cell 2.2 | | | +| Cell 3.0 | Merged Cell 3.1 4.1 | Cell 3.2 | | | +| Cell 4.0 | Merged Cell 3.1 4.1 | Cell 4.2 | | Merged Cell 4.4 5.4 | +| | | | | Merged Cell 4.4 5.4 | +| | | | | | +| | | | | | +| | | | | Cell 8.4 | \ No newline at end of file diff --git a/tests/test_backend_msword.py b/tests/test_backend_msword.py index 9edcb3e6..86bd837d 100644 --- a/tests/test_backend_msword.py +++ b/tests/test_backend_msword.py @@ -69,7 +69,6 @@ def verify_export(pred_text: str, gtfile: str): with open(gtfile, "r") as fr: true_text = fr.read() - assert pred_text == true_text, "pred_itxt==true_itxt" return pred_text == true_text @@ -101,3 +100,7 @@ def test_e2e_docx_conversions(): pred_json: str = json.dumps(doc.export_to_dict(), indent=2) assert verify_export(pred_json, str(gt_path) + ".json"), "export to json" + + if docx_path.name == "word_tables.docx": + pred_html: str = doc.export_to_html() + assert verify_export(pred_html, str(gt_path) + ".html"), "export to html" diff --git a/word_tables.html b/word_tables.html new file mode 100644 index 00000000..30f6e8d3 --- /dev/null +++ b/word_tables.html @@ -0,0 +1,75 @@ + + + + + + + Powered by Docling + + + +

Test with tables

+

A uniform table

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Cell 1.1Cell 1.2
Cell 2.0Cell 2.1Cell 2.2
+

+

A non-uniform table with horizontal spans

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 1.2
Cell 2.0Merged Cell 2.1 2.2
+

+

A non-uniform table with horizontal spans in inner columns

+
Header 0.0Header 0.1Header 0.2Header 0.3
Cell 1.0Merged Cell 1.1 1.2Cell 1.3
Cell 2.0Merged Cell 2.1 2.2Cell 2.3
+

+

A non-uniform table with vertical spans

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 2.1Cell 1.2
Cell 2.0Cell 2.2
Cell 3.0Merged Cell 3.1 4.1Cell 3.2
Cell 4.0Cell 4.2
+

+

A non-uniform table with all kinds of spans and empty cells

+
Header 0.0Header 0.1Header 0.2
Cell 1.0Merged Cell 1.1 2.1Cell 1.2
Cell 2.0Cell 2.2
Cell 3.0Merged Cell 3.1 4.1Cell 3.2
Cell 4.0Cell 4.2Merged Cell 4.4 5.4
Cell 8.4
+

+

+ \ No newline at end of file