From d9c90eb45e899b2f3b608d959cb664c016e63821 Mon Sep 17 00:00:00 2001 From: glypt <63284048+glypt@users.noreply.github.com> Date: Wed, 29 Oct 2025 11:35:51 +0100 Subject: [PATCH] fix: xlsx cell parsing, now returning values instead of formulas (#2520) * fix: xlsx doc parsing, now returning values instead of formulas Signed-off-by: glypt <8trash-can8@protonmail.ch> * fix: add test for better coverage of xlsx backend Signed-off-by: glypt <8trash-can8@protonmail.ch> * fix: add the total of ducks as a formula in the tests/data This also adds the test that the value 310 is contained in the table. Without the fix from the previous commit, it would return "B7+C7" Signed-off-by: glypt <8trash-can8@protonmail.ch> --------- Signed-off-by: glypt <8trash-can8@protonmail.ch> --- docling/backend/msexcel_backend.py | 8 +- .../docling_v2/xlsx_03_chartsheet.xlsx.itxt | 2 +- .../docling_v2/xlsx_03_chartsheet.xlsx.json | 190 +++++++++++++++++- .../docling_v2/xlsx_03_chartsheet.xlsx.md | 16 +- tests/data/xlsx/xlsx_03_chartsheet.xlsx | Bin 11896 -> 10491 bytes tests/test_backend_msexcel.py | 90 +++++++++ 6 files changed, 291 insertions(+), 15 deletions(-) diff --git a/docling/backend/msexcel_backend.py b/docling/backend/msexcel_backend.py index 443e9d46..aeaed4f1 100644 --- a/docling/backend/msexcel_backend.py +++ b/docling/backend/msexcel_backend.py @@ -139,10 +139,14 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken self.workbook = None try: if isinstance(self.path_or_stream, BytesIO): - self.workbook = load_workbook(filename=self.path_or_stream) + self.workbook = load_workbook( + filename=self.path_or_stream, data_only=True + ) elif isinstance(self.path_or_stream, Path): - self.workbook = load_workbook(filename=str(self.path_or_stream)) + self.workbook = load_workbook( + filename=str(self.path_or_stream), data_only=True + ) self.valid = self.workbook is not None except Exception as e: diff --git a/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.itxt b/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.itxt index 8b435a35..2bb5754e 100644 --- a/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.itxt +++ b/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.itxt @@ -1,4 +1,4 @@ item-0 at level 0: unspecified: group _root_ item-1 at level 1: section: group sheet: Duck Observations - item-2 at level 2: table with [7x3] + item-2 at level 2: table with [7x4] item-3 at level 1: section: group sheet: Duck Chart \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.json b/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.json index 449b5f40..66ede3e3 100644 --- a/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.json +++ b/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.json @@ -71,7 +71,7 @@ "bbox": { "l": 0.0, "t": 0.0, - "r": 3.0, + "r": 4.0, "b": 7.0, "coord_origin": "TOPLEFT" }, @@ -125,6 +125,19 @@ "row_section": false, "fillable": false }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Ducks", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false + }, { "row_span": 1, "col_span": 1, @@ -164,6 +177,19 @@ "row_section": false, "fillable": false }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "200", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, { "row_span": 1, "col_span": 1, @@ -203,6 +229,19 @@ "row_section": false, "fillable": false }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "230", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, { "row_span": 1, "col_span": 1, @@ -242,6 +281,19 @@ "row_section": false, "fillable": false }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "250", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, { "row_span": 1, "col_span": 1, @@ -281,6 +333,19 @@ "row_section": false, "fillable": false }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "280", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, { "row_span": 1, "col_span": 1, @@ -320,6 +385,19 @@ "row_section": false, "fillable": false }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "280", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, { "row_span": 1, "col_span": 1, @@ -358,10 +436,23 @@ "row_header": false, "row_section": false, "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "310", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false } ], "num_rows": 7, - "num_cols": 3, + "num_cols": 4, "grid": [ [ { @@ -402,6 +493,19 @@ "row_header": false, "row_section": false, "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Ducks", + "column_header": true, + "row_header": false, + "row_section": false, + "fillable": false } ], [ @@ -443,6 +547,19 @@ "row_header": false, "row_section": false, "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "200", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false } ], [ @@ -484,6 +601,19 @@ "row_header": false, "row_section": false, "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "230", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false } ], [ @@ -525,6 +655,19 @@ "row_header": false, "row_section": false, "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "250", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false } ], [ @@ -566,6 +709,19 @@ "row_header": false, "row_section": false, "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "280", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false } ], [ @@ -607,6 +763,19 @@ "row_header": false, "row_section": false, "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "280", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false } ], [ @@ -648,6 +817,19 @@ "row_header": false, "row_section": false, "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "310", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false } ] ] @@ -660,7 +842,7 @@ "pages": { "1": { "size": { - "width": 3.0, + "width": 4.0, "height": 7.0 }, "page_no": 1 @@ -673,4 +855,4 @@ "page_no": 2 } } -} \ No newline at end of file +} diff --git a/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.md b/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.md index 3638d491..e0977b8b 100644 --- a/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.md +++ b/tests/data/groundtruth/docling_v2/xlsx_03_chartsheet.xlsx.md @@ -1,8 +1,8 @@ -| Year | Freshwater Ducks | Saltwater Ducks | -|--------|--------------------|-------------------| -| 2019 | 120 | 80 | -| 2020 | 135 | 95 | -| 2021 | 150 | 100 | -| 2022 | 170 | 110 | -| 2023 | 160 | 120 | -| 2024 | 180 | 130 | \ No newline at end of file +| Year | Freshwater Ducks | Saltwater Ducks | Ducks | +|--------|--------------------|-------------------|---------| +| 2019 | 120 | 80 | 200 | +| 2020 | 135 | 95 | 230 | +| 2021 | 150 | 100 | 250 | +| 2022 | 170 | 110 | 280 | +| 2023 | 160 | 120 | 280 | +| 2024 | 180 | 130 | 310 | \ No newline at end of file diff --git a/tests/data/xlsx/xlsx_03_chartsheet.xlsx b/tests/data/xlsx/xlsx_03_chartsheet.xlsx index 366859a27c6db40a021077f95505028a17569f38..fab687878be961d1ef7c8bf8e7befbcbff9430c3 100644 GIT binary patch literal 10491 zcmbVy1yo$i(k|}q?(Ul4?gY2s7TjTQx8NS!5?li$xVyW%Yj6oJ4|2~t;pG4Cd27Ae zYhdf&xO!*3blcB{2zGejB0kh4Z7V)v6{lMq6LQwAIIVtQA7xNpe!tPJ)bxsFF!oMFDh zoOyy%8g5&p-ytHBWe zW3bnzicspaUSL9Mx}pVJwxZWD$4vlLi;M=cQtC9b&nt-ON+AJ{JsZz@T1Ga?Wl%)p zl~||qIA0z#b#*8Ta0arfjb(-o0zMUBiFc?A>zm03Fp*|dC~qrMK{Ipu6TdBX9>^ZQ zdso;Lbd?(-X=c@Q;@KfpP=H;8Lna23OKUzh$-Od(V`?o0qzTOr<5#Gwz#th^KKkVU zCDe~~mTFp=$Od zU?4D>M$v}wC47>os4~?0)}#;zVz*~MAyU5qMS`HozRR{PTgyi29@6or?UndSWy@q$ zq0Y`9+lufZ8pDqmbJY=US;>3p`Irn2j^WUeyB>~w^&7AwRw1Bik;WU#&X68z^GSp4 zt|C0r-+QVRn3KU=N>}IKt95>nSGET#nbDx?ZBg7bp7!S5!>Ko$U3Tw<+ZCR9NMQeX z>_E1r+HElHi<<#WR)&cOddo*xwQA!N8*AY|ng#3|fFRTezpPGgMLJ+CC*7h4=gyYJ94ou+A+XiZvye|u4HA$ToY*Bqt zR}-cwgLz{_8bR5DSj(~ygkl?gL`^bEc*5T#xA)|<6ocwpDfg~%i%+B60}R}|MLD}R zFFOTF@F@u=>T9yMuVAZh!C%&s{T;F^DhLoz)4#4K)c>p}fT@v{5#z6)%rEXYq}FD? zK=iVvu#TW(={UbpIw45%@&fW$Wp06?h(jQV;YV{G??}qY(Kx?OaHZe+=({#*6C4=m zuH2V!x@0L*CxpcW;S%3t5i;^%Zg#2Hl~vPBz?{JFR@?cCg*AYz4X6i^qDbv|=bs&n)1A`yj4C71- z4sJv<>Gp9%(Cf_08<{dEVRQqrT~PeEC#PVU5a(Fpq#iu0ac1|)$;nYCeLk6TRm~cM z8_dgoG{c=6ZKwxwc|mCKPXMG09G80j6X3wc)QxG_hk4$j(7_TRp42SQZyAq@k z>mt$09U@}wT)7SGnHYg8!JFK4V;#1(yWK4G1HKp?m@M|{DogQPOhr~Jf3{Mt$tG>g zBVqcIPJgQCeo&&!w^jn0A;s0e6~P7w+!C2);_MFPK_7GkuPt6(4 z0+t~YEcq$vpsSQRgknyn#yUcYTndVbaVT51azTwYzPY2wUh{ilrME_BD<-)e=KT$n ziJ1$5(1jw92AbH28ePZeAZM)q_Z`S=;TWT>ZGL*6(-3ebFX_V2t6cLm*)Zq6@!eg7 zdpfI%uL3hk+&yG#M}$H>f++?KhDyme(DlP6F|jw1Yq zmnBCxeKPv6H$^7yg|agVP0tARY}B7XUYv&e%L>fJv(tdU|634#e)y8L900DCMh-9T za;UBymCAwUxuWZP-`sVBMNtB!GC)=Asjjk6sBBh+h=64*qp#4ugFM|Zfp}>3>dwRE~y}4w`1eS3^EX`*SiMtsrCHaJR^CYJ^uP#jbFi0J00$R-u@L~6RqfF+}={~sP?#8NoIQ za_}|66mlA98)}?)h-YCOSQ7yginu+YD!8@ga{;R&7V4>~ppZhEBOMXYnM%Ve9Vt)m z3B;o!CbVw=5uso8t5HUmS2W~2{n_AG%E&nOj0bU+owQ}&*>Dt#$c%6`D=AKBIcut# z$fDZsVp--G=tfv1wHB?A=mkyCLB`=T!musvwd7DthmBo`#%zya1(C5R0DR%6t^oqD@qfZOZ>i^t3QgUmXqmd9fmE69xO}6CDK}fAd1n&8fq^1L21AmeQ6XFF(~o zF7L#X#C)@7yY9Wa{A}XNZmvC57gR(EaN}pjvCNa9ps4ZcWtOkW<)tUo#Bq)?2oIFb zlmW?!NcRy>*b-yV)b_E}QVKE0ki!qqCSD;9C>D0@4bG>_3h}9&SX8IbU0w_v6yKj# z^}jMXa%+6>R8rX=e4yUMs|dn9wPX6o{}v>PF;dPq3e@TCx06W7jX+G`!NZ{*br()O zXHx8s6MKgQvY05T9lYakD{D9YjM?hjn;)7RMUlG|HZQkv5t$b42n;w{7-t^mF4cOs zxK4)la$!y*CUrxiCyoc$iH;iEkh#w>dYU;f9C}WtBD=JDEipu`hbvtfC>ij80R5 z5B((Fx`;1}AXs=nA3H8LOvwlu8#MqoJ+OIMwU|+UXbOhg0(jbj7=1c=3HjjobV>DPj>urvG*V_`at1we zBOY-S>zg)Y@*b)&#I4CwZRa1G!DvqLYad%EA*Gd2Ht55mI2k^~2N;HuePU!$45e65 zI{WGYEpB33RhX}|Uaetv?i#1loYt8JozQPLcvfHOl?ZEcS~Nm=^wjcB2dXq9IOWp{ z*}%pXAer>hg^pj~2u0!$n*2=ttz}sv*OQucZh+KaOF7ubG!h!7lK5Sp_%~I7wgqzN@84Vz{GOa4ZpHI=;6`m`3KD z(~1;-{~q$mcqe#lq!lF6%z2FvZZrjmG6;^Qv7^vZNvg7?2dxK%a%sm&Tv9*oZpUN9 zG?l20noF@z%I6TvW}ULY#RwCIGVoK0vZhFq;#8hhm((5QW~Qv=eqhBtpBNgE>(_<4 zCux3bG4x^Xi@``7>87IP5?;e7nw+zGbBTJ`z#5Ss^d-gR^k8GCUyJ74CagcZO`|;h z2r1}tGN5t1s>dEKwlo)JX}1quTd?L^)vSScknp*v?!r(jVM)`Yz-!4lD1qD3;A!_Hr4Lw(Lwfj?qOTP?En1LiQ^NNw(g z6#|vj{mw+DsbtU!Ev7Kq3cc*ZT1S)ChlID$G!QN7hJ?TIR(}H1l{IeIr<-*z3CCZSBeebF@I5@&MDchs!Wc9hmuJjd_B}Y$K}>*! zIULu70^0*4*Z1d#@MFnXfLoH0`?d%OHebn}Ni2YIwyEj_G*4XQ#LYefC-vsn$>GKR z)+XQ7=Gvx?uDxVr+q8(Qek@AHlIUPDy({LH&k8Gav7Ha#`QJ))k@%5jlg#;dBjX*` z&>w$5(0w|xA0Z1D(>0zqpb^}4b#N)Z*ZXmI1QwhV-nT}cU)9%xvZx+%!|MthRt*%6 zPg!v2V3pT+Tm4h6I#=a($(o%TC4^lN1lOFATJ#qHN6$L`!T2Qwi%T zU0Z88_0d`K>~7EgWJWrHEZ!lgW32Au0F5EKby-7do7G)>-6Ql%JU(13P*VT{0xCrK z{|HOJB5{}cudw8~QigP)vyZU@F(4XKQUg9-|2|(+Z^hK^1M(LM2FV4hE~)QS zetdK*ypcbmA8 ziLjYZ#gXRS^t&Um>(LhN@T~6_ z!2W&-f70olS)UUq9u4dT_@J8&XO?BgeCehk1PuF}!@juyPhrqCur!0=TJ2_%5>1dD zXxbe`cE9HxH1P1q`mTDvlM-?yoRJyVocHOBcQ_Zrac|tByUwlBm0?x#HFILokJJ%RD}vk~x}MOI%r50l9^Mjo80`CTe* z86)B;wlsuoc9AMOtjfVAY28SBbqV+*&bC02D|c>r%wW6o(mJbGTZ<^t39$50$dT>H zO>b4Jm&8c}dKmG%I;g;z2>`lJIYhef2vMj;vd5qnf}O~35J5Eqe0gC(xe-C#2_P;> z;0}ziW{a81)p4NSM#~!c2GGFfD610{_Jf$IU`UjF6EKelk1eK)*4gsCph5iUu0qt* z54xv9AW?QE;1&-#Ud;2?qZ~C+q9z)UJr(31h;0IHDq!QqB+)wPz86JMIci-`rD}bh z;|eLz4XVp!5jaG{rhGR(4RMgyh95wkn6_{V=f0yQoZup1Ty(B+`&5^>c}4?_w@tE~ z;e6|yVJ7B8kMS=pXybL)gcqg4=n)&baigu@j(i)SlRjVP8JS>hRFXD(x}(B|pKJ<6 zQ~3P$0K5Mh#3ngJZZ~~u*>d3a(6p>8W}sr4AfrFK)<&SDLI%&rcs94Dc_0Z@khxF@ zp*l?&kr3=*nTb244teUsdZcRWO@w87h;>>Ewa%zpaZ=pPMduE+qGZ>iM`Ql%>9;d9 zkxx_X_6!H1W)6Nh9TrmF{AoH<_QceOY1$qnaqJ8Y;o2@*a%~A=%XLu~>>bqma+R!{ zJx|hfPrr&9K=jroE6=2D;hCQN_aYhEU;8steS4!1N&tH^Ym;BwwD{->Sx{!UkIoq+ zXR;2WP%0|12%?_Adb_4u?-+k{aT@6Q*4POA82{oo)N=3nNZv4{Mlcjol+#X-5QYF} zN9VcjA5=55^Z*l1Wglgze|91Rl4`pnGRYR-)$y6Dv^1VWUQGQ{NUhbf^iTo)w10dd z$|Vbu8Ns3O4DVccNLwe%cQ+R>R)^apGxxqWNtmM=1A5f%=|yX(DOAB(R8Ed?3p#vkYC}3Lc(z z*0EXXBkiqm{Mkx1h;@i1QG!KDq6DCYqW8>NJ9FUFlWP$-Ls@T4jTlTOv2yCAWNHhG z=(}wo*kCF4A%H7z&GjBwNBJ%JBjd8N)!s@oJh}Mx_a?#0i^-@1?bvJ?PBsDo4{&Ie zSm?Z{t=k-JmiMeS+P|v(7iaxj?biyIDvw$FF+Fe5)z~~U!2$0=OvAg47*i1!AlY&y zP)+-+RPOJp_nMlRbl<3iMB-J-vvjRdwFXP)u@GNTKi<#s$uf_B z><>?nIz!8>>mKOpXzU~6+=C~&Yn}{|QPRRvq6{!qJqi60Q&d+ur6njDA)_zH@|m(& zE1nF-FKHKtV+hbzHwQpJ__>R3j*kA#`$;Ze!Ic4Ph03X*mMtCXBo>DoRy$Z@Lv$gZ zvyh!}ApS!jNGGelDp!{i9>UKwJmo3Tcky9*vbW_u%jX_I4JD;MTtCZU-7Q-$)f2TW z!|P`QHx|6&t~pa44Y)EEo!~o|QWn1*!CwjN3zgH4hB^oHG{cqSU$ZviJNt+v7&Iu2 zV<@XshDiv0=i0p6{_j1LVO7WXfPt^j2h5Rf8-H(%K zuKVFF(yhcYV{FH~_Y&>uEAR-OGi`L+(E8cUcWal8b_ZOF#0f2i*F@i7C`z0F;{`e7t8s1398ZttPZq=$xEqHi`Rei6WLC{__|nlRy2%- z_30Rd3#~a0@OFV~q>`=K3Oq7-43X;8tTJ0^{Ps@W9Cf&z`ANuVCueG>^KgTN0WPnh^D7vgUVlTCtuy2gDxI}=Fmf&6Zds_q9h%+b>7D}U>ZU6~71_yJx64^F!z6ZG? zFq|iGUkSxFNm~{LGm+2A@`iHFc$*E?&RG{uf!O_i+n(U(ndEtWVm~svgd8CWJR=Ik z1D1GD46NDK$+o&ydylBr`*t%$IW~H|fq37Flu=DAmAWFpz_yQhevjZM9xVBWYeXPJ zB!nMUZh_+E2h?Bne*i_uqZJ=Me-sPW#y`L-fQ+<`$127lop0+7KVaSUW>q-bed7rbG|U}gXjdZ;Hh>T)(5Ao$kaPv8>K$ipQRMSiWT8 z94ROFL_bG?z-8eP7xUh>PstJMd{9xQ@4W!K-~P^^%EFFGx;y>K2E_L9a z9YI5WA?Tbf6~Yt!gmVJ>z7f1RLv*n?*epKNkKGHVIVQ9Hcr_naWJ+mV>A?B^G9%;i zqB1|9mzS4@HxOt3BxKL5YO&0JWkFW<(a9dBI&Q+4P;GPa*4Q5B%RQ>({c=Sp-FXS#qa(!=8-SJ zk&xfuxwJI1)Kk##Q)U74hxM3a=oD{v^|(@fr3xIsd1N`-@qW#|)>H$z!o-N+o`iZ;SD_EBo(tOo1tEDM!yn7=G>^{P#45{QMUmYz*b?ZET;j zn2o*BO9e4PvCX=j39jjwwrEj$z8$H^H2O`anQ0u5)jT~J1j{^|6i8r;Zpt_aYLF!7 ztQX^mmtA+9QVgW{sMseKVLL`4UrXdXqIM@NUIEGaaekQEa%B=zL}+YFc)W zT}9t&kaw#AIg-i!qr)2U29~o3Fy?NN#Q3$;j)JAqKQUt%|brT~&Go zT%v%O&5}4}$aHt$VWv%apGt_HCM?}L41hj?&Pgx)ccjabRpKge?#sU5$oDELWk1}D zQu(`e&uao}aA4H=(LqJJ69pCXd^J{mQ6*BH=+EZ7(-};H$rR4;x?{KpOg|x@I5+oP zk349Uhe*qqU!>F~`Ri*??K{o-ID;qdC|e^QwL$mVR-JtMs$9mB-#xrE{ubnVV9t!E zpb8Z|ah{)!yu%13zmMp!=WT)p@QQke!lZW;S+iNJzB=%*BZY= zZ@Wn0fLGTD-S}n=2Z4`EO5XrJqJR^14Z1+R*OIv~9(4|g)d>+Fmrlq&$mV@dSxGa- zCaC}xx-V9s9r9+%f=4{&{2hDA42%>2C@A5=*F(dq)eV>d25d`e($xk7i+kDk0|T@b zI#+3R?2Y_9{N24j9ggk+XnT8$~ZvjNgtd{TYJFSBU?=KPCJ$H-MTuH6jIUr_k z+rB6s(zJ)_y3o)7j7Qf6k z5L;S#hra{9;fSnsVz*t@lDfOUNtGpxW9u|q&5ALhhgX8Ut#*rQZUz3{NP3-x;_*HG z5%|RkNLG7X5uc6O@UJxerRV(H2^<{&HdepvIb88vLq`i-Tgr!;TTq6K8LtfE;8wnh zUakgqu<4Zv^K>WmJBkb<QWcs%EUpASJa+f$j`p__0aJ$PUdNwHtqPYC`#QY=5Rp<57xbtnGE9in^ z=atQ(U09Uzau8}LNK?LX4*6U(ELpUF0uX7h^N91{EOvx5z+m#0oI2%(op96_|7e_K z%G1E~QwZwm%#3Z~01e~?(UN-X&{pJ@M#eU`WTlFwn2FSt1hVRlxmL z2&bCaH`VGRX|N2HA+H}6?{LO z7g7k`ZaA+po+yF2BojV9o5M2N*u}^jCCi55Yqx&gU|bwLS;`9jWVg|pYW_XYvB;8M zckCYH7*67v9^3O*EXWcxb;^5Qx?NB&(+muP2K4*X((B_nFQ=CNDgXTp)1Nr6k7c|Z z=J_qm&;R)u=k?K^KL@{VPrt0T-{SjR6nP%}=Q)l)0bVz~Ub@)7r5fUop7x(ZU$>23 z>I=W+3(VhN{NH+He*(TP_`dAie@n_UoA}ETe*ymPJ^hLDx+3&X6z1ok{lB36Di!~U z^17_@Qa$=Ds?WF2C=P$tlKw<^U7`4=dDPHsatlfFY9qHO$LPgFm|v R5D?Py#r#~$s3d#2`#%la{N(@u literal 11896 zcmeHN1y@{Kvc}zAgS)#12<{djxVyVML4vzO2oT)e3GVLh5Zs-F*GcY7a+y27;LTa9 z`*g3<^{uYGt7?BG@=~B+Xh0A^P(VOHL_i)x8(0CrKtKo(KtQNKP#_vYHr9?t){Z(# zZnj1aTJ)|~mW0`0Ae31^Ab{Wh@A`kd0yXhNvR#ZwBDdmCLfdqb%T)qUY{&k6DAWpk zU*_X_&Gj{2-Co~HFGE5FLQGeDh?{pf9WXQ&jg!$Lqd6-m9avz*tpE$JkxX+NT{V{5 zr;}oY$`Lk#-?oJ9C*K+ho3K0tLJeh4u4V>v>EjwiG51Sx(lyPmXv6S(^(~!;2vZ;7 z$(^xbV=@(pJonRt?$!h-n!Z}#ww+M93+XFr!+~!nx?2e>8=9X#LkSPQ6ZLYCxJAGg z>19zh4}SN)!?jqb`m(x%?<7!htJdF;+S3(-P47&mXKt2ZwN%S6UqqeKO2HI*aL`Bw zyMNpb2}a6S{+*g;wYsUpI^PBXB1zOxg-P`8W40}1FfA3`GxUYo_#2dh3*WxO`+*Q< z{2;74blo*@B$4KHs{JhOfc#g-`0K^akii%FD}-m?M(*9#V80c8b)}717UZ=7- zSS+kZF!Of1d;G``S3u6~L<+`d${ATur^3%YdBoE^ot07sqZGywHRbsP@X+UHP$2og z`EIQ;BgqxOAf*AW2@CLD9eX272L}2d@Bj1J|HqE{hw3GJJT#G8qfynQ8>5o;rINO4!%De#b#Z~{QZd|JJq2A5WNBM$nBZZ=s^3bvkLu8dv0Pk9Yf16m%?f7sNX#4WYhZJbj@#ApIRkE^KWJLPT zXZGB2;bP81LKTKnl!ys$#$c+k6cr}{S1Eop?Ce`~;Iqa>R1?pjh3l6wG%)ts<@S!A z-o?m|y;HSN*?$4MN4!AEM6ndb^U`AxSJx2#9X3M^3UnJLh7JP1cf-7SwYF)iC{UKA zzr$WpGE&-Z_V$hRJ3&8_LYV>lWMdIaBd{?``u8-jDDpTlu~LB=@8??}gcy>T(G4^=bxOgT4Y<8*|;tb)9 z{Dd7y_LbQ0kW`84ZBD6n6N@i7-G1B?Hd#1+g072tz#P%_2V~rR4_95IY>ui>1l0*t ztejI(1Hw{K8YZ;U2-6Ke4c}l)DYcL#m8Ot)GcMW>vglao%S{sG8T}}p2zjfaIf!{> zWfp3_NfTqN-G;8wm&?Aut?kt5jG;VClY@TKH`8l0#Sk9x7D!AFRNrSOp&EDyiNqmQ zGJ8Gs_Lg@F*hTyYA_AYpsWA*o>Wjv0efn88&fg|H2Vc1!uuw!>4HbC#q;Mgz*in=> zvOukxY?aGz>Ue^J2iF-tK5_K0^eZET2Y7!S^DB8>4MxD^PwlGy z)-`n}1ldir5IbC;bRovPo40I=?V%C~*>VvXvTJ@!PRBGT=7Sd}%TTi1>GnG1Uhiuj zV3I>yVpH|C9%;9jQTU6r;Jk?Hw6iWBfPMaFnD3+I3kL?+wge~;5DpL&Fu;O;MfyMO z_+L>U7{Js3zy1GxmBaycSHr-jSfI2sEys7~n-HwF{RFnCN`#BGlaxXR0`tJyWiA2nZmD01CR%*oM6s(>v9tU z!e_95G9byv&5=Vu2=ssi0>T1V#$O@O!PLme(ShOTjp;`$OplYX$zenaI;FYhS8am? zHEV|US5huA_b6^!IbKL6Qay*X;L$01SaTtSR~jUvX70_b{OHpA>Y}+-#SnH>$^s@; zn2AjHeQB;5`qkLsmZ9he#APK-HBj)^zWFX;#RT`GO|>>aAaDH7!L0;MS$bPc1sn(# zFyP$H4_Og8ooIG7AX^SN%z7p{+l*4BgreRLYjsvQ(Sh@-R!-=4B5w+u`<_3F)_oVA z+C)j;wQe<|=sdtc{;b4|=T1qcZYW1%fX{%w+bUPqNaztv*W=i)G0Jb&w#H2S z*K!XxD|znuz7m&{uG%J{+sR{I@eL&MQ)z5_eS1o%xYX$-J-gtbL;{v4`IJjDHo53j ze+DhXgyPknMPiS2d3z3RAncHYYq`-#ZmUyFanBajJsa<a$E0NDqy6n4%2f$f5xlSebg`rV6oiOEaj4O?sx@yJRYQfu(bd82i~35iC)p|V*bx3S z*4zr;;?o5Yx%Y0kauTaZ83E8EQkcy`XeV|gnFWYQF%5T-3auGt@S9>zwE-hQe zuSGC;TT)$J$&v!%0TbVn?sn^G%r>aW{rJt`I1zH}MG3kR!#fb+s%SCRFpZ83@6$n{L15Qx1G)~9Eko-}ZxtEdOXHbNb}#5vOcWF$ z+Y|FhX9*3f1D}1kQ~1a$53H|B?K%NNy^C66sjbz4+e4n0ctm%6m?BqMK93Zk0xyS* zXe?das(v|m6W#{QSQ~Qa+?-+UBsvCUi{+oElE^%wZqvG$IZ5kxG$*o{A=#?va8CAM z0n5y>bnM)!*028oC;FSZ#+UMUIJKq9kTwDsIyztG7Bwj!fu#jh6X( zgZWnR)49ryBOMA-r?MO^*Cawnc3*X+bTSQu2tMEwAa72by!sre1@E-$Ppc#jhy$}p2u9M9FPZJr zqPU@D9mHquUi&2pDJ!PJYEUf2KPpaUjj|o^kx?< z%6^kEyJ^xNrc~N;>2Y=Ew(6MJmimx^O6H1vqND0sf6lo?f^--LIo90tSat&OeBabe zn>40wwFi!oy6vW8*|hCw7RQ?#j$Z&x!tr@Ojtmb&0TDw6)F!WCVLYDVtd{ZwfgWT2 z=#77IRDmP{O1e;#78{8UE$n2aFjo@eQi>6aXDn*X2&crS5%xGetueT|SK=kkZiY#7 zgVh`1;VEbaEIzW>rDh-|Ms?0R$H~g2N>U@cMJjCBDT;dq>~FC>8wzk}M@yi!p!`?C zOO}0+wT2L?p^Zr=o$DPgr|D9!1DnLjIf(5jeILMad*&Kh^ zT>)lZ+y|6uc9sb1J3G~v$TEuoTX8{UXJ@mXWd$q&1<89=WV>i#G1D)-`UC-wP6hsa z$O*PQder{jzQwFoR+ocxqau?D3>9#rWd?~94V3lVBKawn8{|v zJnhuvI}G9p_E^tjN)vm~nwTXbi=afy+c4pdZFL~@N}ZozcA()sK=l>ij8YUBB2|0SFRUb?nW9k% zxgA~6R^|uqRR>AwC(kjHO9=|$!u)-Ck2nCL(%SxeVMdh9?qwZh6Hf(Q;)+kNcCHHb z!{?l={nwAua%?n^_qrKlBWu3M0xa%goKgjM)IgG71C|0G(!qgb_ z+LzknX$4|2DetEiJ4LqhC3L`%92OK6j)(5Jj?uPJt+o%9EWzG1=nWJ3%Cq<3?$tR6 zTGr~tF4D9b!L+e1iXZ4&%ywJD=hR*Y=Mvgs$$rSy+yK_2FMbA&ndYru%9^y7!q>6u zSZV9)6j!-?tep%h1Cu|_hAx%Il#vPNc95R*(Akb@VO~7w)dHrUkm;hiiM5tg6f9;z zA#6vWk`%|%nO;qKM=TSf$8)mNB{FNeLtErnj28ke>;=0!Ch7C;^YeCwkW$P7fvFg{ zE3tHF$)q6ewgC56%~J)XDx%Fqm^S9+U^XTDN`mD|mn(CEP14|14~lbMgs?y%%@EY$ zc~l^=7(dTQY~@Y}{G#p6ho(nk}Nuji^?sR&&^YfKAGuRgE^l(7sufeuy=^Ye^>uavgrI-KRVhj(&C-#+7vV z)4b4!C=;DXu{1w#{u}|1;nxxQ8-1iO0T3>eVu8KdNYHgYRQ`AggnRq}n;id0fO9U^LTdp}E&(sJzw8-6 zfQ<~_I@+6AoBT9u%gCWn5GFwRAN0FWL2I5emT+tX);Z*K4{g{94v%#HWL}ZiQ!8DU zW%o{FFzB3>bs-BTH42MI@)$K^x05yTM_xR{=6#rOO8WpU;|p5~Kbd4tPGmNNq0LXq z$8)oiD~f_CJDIMm{Eej3$U_5?I#9S~A6T876_h`+>lrdSy;6Nm+X06+2I}?A!4@T8 z>P$v8eP;bA?zv8j$gI3TY1p~IXJ_WLa;knYGQaC*`tXuX#=?CWI0x{2Q9zpfS0Zd^ zujleZLx#UT{7QtC6l|6Vk-R_FJo~!3gyWCNlyJ{XcE9~>e$aITiq*@M3JfcL@S`T^ zH-Kj3F+T{($9b}K@`$juDq}A|VO=s_)VtuX+l@X|Dj$sF^E^vR6JC;2^f^GKrI%rO zvw5TsvHq<^kx4TP6i|GruG-dWW2NNVA$R&ht*D$DaJ0)#cT@JPTF*xF7D?I;5n z)jKkxW<_A}BMQzfrnZ;l9`nR;Dq{UD!wj1lW+mbn-aS6$HI60iq(zMmh5}n4fO(jX zSsLvScuD}fsjoDGPchGEQDE_3A|9ED;M~EewZ(Fx+?72Eqkvd~h50IZx43anMt{4< zEfV*lc^_&b+R?K6UL^^Q&-`Y}y#&kEi~Hy~W4@4O@)BZ4(+v9pMp6_$z#aGi56$81 zd}3Ka%;fuXvs20(>Xoi?MK;nAz7-FNh_8>kNOdyLNMtMx2R~U?@HlE6W=>DPQrqLO zP@2dLJvG9?H*ykrM=CibJ0#y@Xl@#(Z+U`Q&)}Bc0Au*i=zds0b72Jz1hfRmIR5g6 zU+Eyj&)cu8Lv_h|i4)07NB8*+4cON>3yC&TWPQxKbTpATa??qSW}A57rDgeCNyW%k zk!P=9%z>otG)IpP{Rv*qNmI^LE?(a4uiHIUS*swh5}deq8<$^h8HZeD=PlRUo4G9T z?=zJ`c0%lfAiu*%J@=f=>Bv@brjdLqlZw|v^k>920re?U!meLpPTK+**lX#A%am!P zjP}=Hve*Mzd2{ApmqYlqUIu9mT;Q2lzz?dpFDyTNZ;EQ*?7fMyUrLo4lynRT*min+ zRb*nI7xGG&YP7WNK>mSSZ&)3?3MP0*r*?CpITYcJ_{JIMS9e%pNbmLf+8AQ?YKUR= z)3Q>9v(Oz*xdvTEeBAFM7*PeC{%;E2vw*Adw1iR+X%pIMB+!Z1zbAex_(46T>e~i% zx9l$7+1cu56gsjh!V~RL8PQj7DxeO%#{&FDw?O?aYucGmeGPdm;ig6BOYOK$>}*z% zh02`m^%MIwzx0&T-U@h1^N<|hK(T!BKpIcBK zbRzR@BIKHxu$voDGBYdv)`l@cXU;%BHF6YIX`C`AsM6Bmqn?CQ)ht!0{fP|yM{LTjJkK$WiSyvGQX9Gnr?HQcY+Okf_Cj1j-8nl^68 zMA8#8N>@)D@iitZ;Wx=k05jfL=F=psl;qsexC)@^kQc&;^K482!Q^t@z%RE_R{T)! zS|lbNJaz(nbyOd4{-&UVk$yd$P#*SJy9jL7*Iv4kDq%3-_8=kWHAWXp*4HNl6@SKg zDST7C3OcssRDLG`4+BLrq$KMyDi&0R+*CrYdYkxk6i_hfB=535To=lW4(Y;oQyi`k zQ!TcJ&dMhJUq*^HKMuXZ7e4R!x?WL;FUnf0@j==(GMIr03MjjG1S-GsWOK>~i$FeW z%SxKH45MpY4+~tOVvIulFS+(kvbi;8)7S63@N+q#}lxm1LUYt5EEva{xL(9%YD z(t##UYFENLXy!A1vW!uMXwnrGn@N5vr+}?oHojxw(y1;}6np|__YBhemdX_GbCt!F zd65UZp8h_Vc6;VKDqEjY@BAezkyh(zoa;^n8u16QjH#)u9EtYFg(EgKIIgrE&3B*P z)y>psnZX_xEk?7OxMkOn=@41Xg%o`pWIywvUW$vSn;1;=G%;FI4t-1;INxf*;4QP* z*lMcNF~HS2eDZOmd*ap_8hcAUV&^`3I(iOQ{lvD)Yw8cu8Fven+8)s>yo8dwnx+33 zl5;$#%4^!*Njyb2V4ZH4kcXxd*Um2FcfnpbB7g6xjHnt8oo)Hb zaYLMud2jzw$S8<1-I)zV6G?n3ebTB7To2Q9zb*v6ZyB$?{@~3J&51g|(C)V$!fmEy zzDVqs$ge)6bDgccLB;XZ?!e1Lbq;g5gVLp;9sj7g9DQ^H@*oxl5g-S2LZtUdK#c|V zWIGU2HmP268ierole*)V9J{+a&v+VwkBemVi z>**Nm_JRW9Jy;lQD-LydE8oW3`z_|j8!-EbWWU?bgj{E+Qd2G8IAvQ4L(#Y?`AX9Y z%B@wN)2Bf4}9VsSHTlkFn8aUDLF~AC? z>O9}ZhTh+Q-O42Cn{nPEf4;&p)x~;7kRk?QKei3zoLK{F-wXyAg(HU8sd$$(+@pJA z!6>W#pxiQ|pGY3(?07z!vt;GInP4Jc^ia}lAN1SA%D2EH=sb#t+4BW=1HKPoRI(~U zybCD8EWt-lACH%u86D4d*<+;mY2zdA`bU^i48$Y?;!pA0gB5rCvkoM^luLB}QGEpA ztF;!u6EcAMB)`=EH;MVB`=3lZR5@l^+H+$XhnvB*UYMTD$1R#Hya|+ph}V%;sK*gp*(FTv$=`mqoO8Ue8-i*MzC3p5ingFiUT0_UpNXn&1N%i3Kye@dze5J}!VGN;EXG#iQYeRf=DV167Qe3xf^@rc+leXM$eyl61SI zP?~Fo#EdK0Jw8EDD|>m+Ldhc=Uf2Qk`gg7gEZr5vL`#UU;K+-hHhBa!N(9W;c00=1 zJn0MQ)XuPFg&)D_Utz&L5|tA9thpK1MA+n#%fF+NzkXIESACpEm?$t~yRV-4XBSqt`54m! zu;O)q6(jv=#d@~3|1)BM_5O90yww6!Qqa5zj`*QkIqaJhMWdBx4RZnlgUF9FmG8MR zD#>z+lMUOw=h*Esz;EOqocR&N_A))lP7#y`aG*?ErS2VLZ6QYQhMTMgKXUMwwb=(L zG)pI~z8Q@2eUkP1PWrB64k;VlPU@ZGgQzelec>Y7xl+tfi<(nuwFhopD$;Uo^enPhDZx$Bjip_Elg z*elqZZ@O9W`fzc%4)DeVmb^v6p?1`BqDhST6iQ`MU{tFx7r>4dYm!U8lI(V~z90A) zYkc~@;Ermu3DVH+L?Ea%i-TyWesr``Gk`m&N93UeB9?sP+cqC)aKu^1VZI1VdB;$( zmx{^l3FcN@_Ayrb^+z8A1L(Wt^2H?jK^B}kJ{b<~h(VZ% zu-ALL$7}W=)}zSN8YmD!l=@gSds5)N%CLY+_*EB%N>=%Ps0p8M4!3QQ1&of# diff --git a/tests/test_backend_msexcel.py b/tests/test_backend_msexcel.py index e3dc8f8c..6084a4b6 100644 --- a/tests/test_backend_msexcel.py +++ b/tests/test_backend_msexcel.py @@ -1,4 +1,5 @@ import logging +from io import BytesIO from pathlib import Path import pytest @@ -116,6 +117,52 @@ def test_chartsheet(documents) -> None: assert doc.pages[2].size.width == 0 +def test_chartsheet_data_values(documents) -> None: + """Test that data values are extracted correctly from xlsx_03_chartsheet. + + This test verifies that calculated values (not formulas) are returned. + The file contains duck observations with year 2024 having a total of 310 ducks. + We need to verify that both 2024 and 310 appear in the parsed data. + + Args: + documents: The paths and converted documents. + """ + doc = next(item for path, item in documents if path.stem == "xlsx_03_chartsheet") + + # Find all tables + tables = list(doc.tables) + assert len(tables) > 0, "Should have at least one table" + + # Look for a table that has the year 2024 in it + table_with_2024 = None + row_index_of_2024 = None + + for table in tables: + for cell in table.data.table_cells: + if cell.text == "2024": + table_with_2024 = table + row_index_of_2024 = cell.start_row_offset_idx + break + if table_with_2024: + break + + assert table_with_2024 is not None, "Should find a table containing year 2024" + assert row_index_of_2024 is not None, "Should find row index for 2024" + + # Now verify that the value 310 exists in the document + # (it may be in the same table or a different table due to how the parser splits tables) + found_310 = False + for table in tables: + for cell in table.data.table_cells: + if cell.text == "310": + found_310 = True + break + if found_310: + break + + assert found_310, "Should find the value 310 (total ducks for 2024) in the document" + + def test_inflated_rows_handling(documents) -> None: """Test that files with inflated max_row are handled correctly. @@ -178,3 +225,46 @@ def test_inflated_rows_handling(documents) -> None: f"reported {reported_max_row:,} rows, " f"correctly processed as {page_count} pages with proper dimensions" ) + + +def test_bytesio_stream(): + """Test that Excel files can be loaded from BytesIO streams. + + This test verifies that the BytesIO code path in the backend is working correctly, + ensuring that data_only=True is applied when loading workbooks from streams. + """ + # Get a test Excel file + path = next(item for item in get_excel_paths() if item.stem == "xlsx_01") + + # Load the file into a BytesIO stream + buf = BytesIO(path.open("rb").read()) + + # Create an InputDocument with the BytesIO stream + in_doc = InputDocument( + path_or_stream=buf, + format=InputFormat.XLSX, + filename=path.stem, + backend=MsExcelDocumentBackend, + ) + + # Initialize the backend with the BytesIO stream + backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=buf) + + # Verify the backend is valid + assert backend.is_valid(), "Backend should be valid when loaded from BytesIO" + + # Verify page count matches expected value + assert backend.page_count() == 4, "Should detect 4 pages from BytesIO stream" + + # Convert the document + doc = backend.convert() + + # Verify the document was converted successfully + assert doc is not None, "Document should be converted from BytesIO stream" + assert len(doc.pages) == 4, "Document should have 4 pages" + + # Verify page sizes match expected dimensions + assert doc.pages.get(1).size.as_tuple() == (3.0, 7.0) + assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0) + assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0) + assert doc.pages.get(4).size.as_tuple() == (0.0, 0.0)