From e6a070234f7ca2b9cb5d5140af74c0a6f1ebcb5f Mon Sep 17 00:00:00 2001 From: ShiroYasha18 <85089952+ShiroYasha18@users.noreply.github.com> Date: Sun, 4 May 2025 01:07:27 +0530 Subject: [PATCH] code for xlsm support --- docling/datamodel/base_models.py | 3 +- docling/document_converter.py | 1 + tests/input/sample_sales_macro (1).xlsm | Bin 0 -> 6039 bytes tests/output/sample_sales_macro (1).json | 2353 ++++++++++++++++++++++ tests/test_backend_msexcel_xlsm.py | 47 + 5 files changed, 2403 insertions(+), 1 deletion(-) create mode 100644 tests/input/sample_sales_macro (1).xlsm create mode 100644 tests/output/sample_sales_macro (1).json create mode 100644 tests/test_backend_msexcel_xlsm.py diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py index 95dcfe75..9fec8838 100644 --- a/docling/datamodel/base_models.py +++ b/docling/datamodel/base_models.py @@ -34,11 +34,12 @@ class ConversionStatus(str, Enum): class InputFormat(str, Enum): """A document format supported by document backend parsers.""" + PDF = "pdf" DOCX = "docx" + XLSM = "xlsm" PPTX = "pptx" HTML = "html" IMAGE = "image" - PDF = "pdf" ASCIIDOC = "asciidoc" MD = "md" CSV = "csv" diff --git a/docling/document_converter.py b/docling/document_converter.py index 08095d43..e1446f0b 100644 --- a/docling/document_converter.py +++ b/docling/document_converter.py @@ -156,6 +156,7 @@ def _get_default_option(format: InputFormat) -> FormatOption: InputFormat.JSON_DOCLING: FormatOption( pipeline_cls=SimplePipeline, backend=DoclingJSONBackend ), + InputFormat.XLSM: InputFormat.XLSX, } if (options := format_to_default_options.get(format)) is not None: return options diff --git a/tests/input/sample_sales_macro (1).xlsm b/tests/input/sample_sales_macro (1).xlsm new file mode 100644 index 0000000000000000000000000000000000000000..0bd6663e21fd428c5a8f8a4014538bcf0ef3ad8a GIT binary patch literal 6039 zcmZ`-by(AF+a4k@7!uN<#6VI;NC-%G4UiT_jb^|ALAn{KlqjIINJ}FP(v2w6D53N~ 
zO8T2V&-?hw_q^M&`*$2We&_ked0qE;1ymLLDmefEz{mWU08`;O@dzvcpzA6Ca2@lH zv8*$~9gc8^-GAf)cQd)|<>XkB)TZ$c0t!cq%O`W$SGr2!mn;Gn@e>2_T&_D4sV1@y zDI6Zu@i0oo5XGt%emK7veweLRI@M)O7h~1P)mcX!33~B;st(Ta@`m#+VZ7Fr{5J21 zyb0#WqSf9dSGp(slwzFnFSP5D67I(mLBC+%jpO!pWZV(D4F^;Qv-VYvt~}sGlhr5X z56PR z(HpoM|0P4CnW@HT)RMs}tUjGc(L`hM_VG6x3pQaK|K; zI#RAjc=!so)J|gi?zrkcYPk#O(#JIpn^FFxH5wS@|24CshYzv^&@KZ{2E?J!>%Nkw zhYScJ(tX*eQ}5Fgnt%iUGTDRSkv$6NAL03)p**#U35)6#0DutlgdyRMZoIdDo+Syt z!XkzCm0vnZMJVBC9jCGuY?jbvrkq}!6}6lg5;8?|`QPn!V04aca; z)Ly)$PYnxZ@iYWlp7OuHB|PE__)`&bS(_>XOu`Ca0|2xbMZ6q&{}RR18R=l*?CkI} zW&ajv5R>(X-3ESkU9I+xWCr z*~pZ?V7oze)cxFCxOlsI|AWppZ?X1>X44gamx&hycS-~F%9fAj0s@^>@GPnKDnEbH zXgC1-_%E;ygCz?Fcbd)%GQzx82Q^ig&qh8EPILzuFr|A9`6qu*tMgy#JYxO`9&I%0 z1B-1Kx$aMye=jvZQ)m~uPaMC$5TgC@*?E>Ju@zNgVQ0nYNv6-6y|{YC6w^YVqDeyX zZjh`Nbs-Umh*PI`kG9}C7hz~&HHgfI#7KrAGKbs6aN8(0dAj%uN4I~<`80c z=jCIUu^e+I$+3r{py5gZXVyZLuV~`%7c~?=s+22mjK`_1+@R(VFD?I+6jw-L5hCMA zpEV)@d3ave7M`19ABNIGO~eCdu`Sc$`0Axs*{6w>f-S(M2Ha}9&)5vC#!6^R&9xqu38p$PHinUOHasO$G@xX=gL&=&)GO}PkfV(VN?&t7uz$*fQ>Du$`Y43V z_U0&&*zRz#1eM3PWO4nXCt?%>2M2=^4^&4%`fJpZgdFmBW=%e1kj))<7VaaNmfXhz zriKIDt_57WB=>8cQwCye#tm~(h<{&_{}@cHR$CnAZZsZpanBnYg3mQ<_9LOH2wJ#C#o`7Jp!@?jN|@ml|Br5K0C~PVRd^aJ=ar zqnGr9F@}7KmVJfG>-f#t8=7LDBg7`5bkh!wlg!Gv>qZf-(HFEM9xlv&-H4Q=UUfqZ zPv#hxvFffyBKWa!7N|+Ic=p0;qF#?;!|7wKS^Ho|5GB^wiGs&VgmCryZC^SRs`P~0 zD8}z&lI@pw(uIJEOd1)nR>{!y29vTqe9GQnGDj@LTE<~N_O~MbXcd10xf93k99Ay< zQSvBHP@PY2C9abh}Ni%EypgSb}kO#zBe)KZt>yFr=-Su7Nfuy~nklb&9=giH3$ilh}|2K0mceoW-OMP2;Rc*uX6=wh%2r;>t6kKOJpE zYRw!Q6D&(iP59Z`U}}P!tvM2IrQ?pYL)f_ebTf~{L6r^&xx(Q+`5u2Z$?_iAP1EB>k}xE7_;?>D`xXR zo@I@_@kc|2PsE>!_lu~``P~wEfcL^pN6v;ou*~wcPFR?f4ytAK?oFO{q>t)&&bxkO zK!m`LDsO+J2X@cPrwvcX_Ms8}-j|48!I-BikkXQ#9H}+TO>1i-xHLzeIC?rf`KGB+~JE;8M!*o1bX7XvLr zF@An2Y*9mOIlLUt%VPPejySs^vukr}UAMi;Wj2R6mG|O!#jsG7kjGU9GRsl2biyne z*=Q+C_wmtvA*Oi&>yUCHgOgOYz5FI~m{Cy3w3zp3@{(5mSWDW>h@-XI9QgbWPo-om zMasd`mdtz^;XQc@ySe1kbhCiYDIlgbM#IWQ$HM-ubG|Q*q8h>a@5BP_ut?E0NQka_3W5*cejN*!T&mc 
z)$UD0^~ck4)XlqG3eB}~*Tn`)Q%i&>@I304WcD!&H%yswzBV_h*=6Ah>q1;5 zAqsqt*p{rOL@eYy?`c?WyK*3nrqXmOnVre>@n_Dt`=pV;E8pc13pErR$%v_%J#)JL zesNad44o^fC5|fTR8VVotC01>COwBKzE_c9Wc1jdrgKR5fwWfKB_%YwH$uwfdL7@{ zEQ}1Zu91rhQHxDJPVc*YeFeX=1_yb?U!0LRDUA9l0ivqJ=)8QRr05_WRX|ZiYa)4( z`WqYW zlzr9Pp}puL^UoPu^`fKjwG8r1*)A~}n=RGf-nxS3S)ZTWdYDGhSlcR`vVF!yr(PR1 zEKQoz>jiw#>2~9}s8z=#5lbV^_Bb~dCA)EdY4AO*A|y}O;oB8#M&jFgg3|V>38V*D zpE`G-&~#vH+EoY7C?42tCwRq^Jsfr2>0|azC@Eo2(KpFP?{o4d@AIXlFO`WAk{ROO z?%%g>VSgb?554=PAARp)Jxy~YL$k&2=!dF+>DrHZ|I@usux1E$@-^R`kBRdc5|4KF zZ328RCIIu4Ua#Nb39^yRGIy^8cAKS118AalwLr{jT;w8GarD>#$Jy6jDWrf_pwA-_Bh(B0 zcR28~N`F|RZ?CynTeQf=6)Lgg52$eii|TgnzGSDTyVJ!t7B5^LrKWO2BMjzRcb{bS z2_()NEB?Klo3R?#+LpzNT-O_Px!nVK0{L#XCG~-~aWh{Q5uuA8UJIx*Gn{16%SKsL z4JT^>O-r6Q61lea_PNaG$-O8mMvp-FZz(;rVeTt6S#9A{AF}U?krDQvkFnWy@)CTm zvfxf=8g_D1XpTW#`2e_deA0ZiC|4vfi`y`QEbl?Pa;s6Vn52d_ z!H8qah9pOU_MS=-G$fA2V$1|`C` z!3)SYwL@U1>@?%48=kt zAxhhcL*)P|T9I2X)18^Df5^SGSmhLJmJr-8g5%HcAoVOvdHTDqU7thrQB~X5DtCJ> zqEeJn*nKer1sq*}hl4e+;R@_d_ zX@r(bOt>TSHOR^)3fD!8KO zz6`g|eEO@aH<8GRg!L;2F=0xW*PdmE!Q} ztUgW;Y}7&p*V*z|kTa{9ldslj!_O|ZmljK(`3~J6rVO6h9bhgU->X(^>zXcb8B%nb zb#$FIKG8~R|AGyNjnoxvPg*oHY-6MDrD`VOlkI31akm_OC2>dCb-|0zC;9A2gU6%1 zU`yKtq7CKklQz*lZZv03F1`6Hxm#$Wwu$oVN^1$9q0-{ETm{7XJ{{o|&fZtk z^H&K}x07@asvDuH(Xj7brXFjGvn>987ot$UiF7R@liTgxQ>rwo1#2pT(ZJn10t-H!l z8YkW6NF!_iEfY>r-j73p@6?6Hi9}>r=Yjhu$bciEP`sW*nU5g7w)BDL%PAG|mOvMj zPUMXQoZ$s-F)Ega@gl1!PfQEst+bKMiXxwe%Fl{Zau`D#veJXCd>LGwCz;M1Yo7eL z8JV)%VTsGvU)PU*`wU^(U@{_oY)WuD6`ncqCSnC!#w{Of|| zGRx(e^*0MO-ajn=nP4x2FV9fF!JU{*@xM(}ml-Y(6u%jcfqyalGhSSVUhZgrL#r@b z&wuG}FB4pL%-;mb7(2ue{OO;U;g{|6H@q7I|9@tB8GhL+e#2=oJN19ye;Wr>6$kg{ R8-$o|3noq4e`Ye^{{R|1MO6R* literal 0 HcmV?d00001 diff --git a/tests/output/sample_sales_macro (1).json b/tests/output/sample_sales_macro (1).json new file mode 100644 index 00000000..d61c438d --- /dev/null +++ b/tests/output/sample_sales_macro (1).json @@ -0,0 +1,2353 @@ +{ + "input": { + "file": "sample_sales_macro (1).xlsm", + "document_hash": 
"bca115316d047aa424d9afc428ad78c28e82408971a72379256e47e40c62506a", + "valid": true, + "limits": { + "max_num_pages": 9223372036854775807, + "max_file_size": 9223372036854775807, + "page_range": [ + 1, + 9223372036854775807 + ] + }, + "format": "xlsx", + "filesize": 6039, + "page_count": 1 + }, + "status": "success", + "errors": [], + "pages": [], + "assembled": { + "elements": [], + "body": [], + "headers": [] + }, + "timings": {}, + "document": { + "schema_name": "DoclingDocument", + "version": "1.3.0", + "name": "sample_sales_macro (1)", + "origin": { + "mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "binary_hash": 2697172271627391082, + "filename": "sample_sales_macro (1).xlsm", + "uri": null + }, + "furniture": { + "self_ref": "#/furniture", + "parent": null, + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "parent": null, + "children": [ + { + "cref": "#/groups/0" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "cref": "#/body" + }, + "children": [ + { + "cref": "#/tables/0" + } + ], + "content_layer": "body", + "name": "sheet: SalesData", + "label": "section" + } + ], + "texts": [], + "pictures": [], + "tables": [ + { + "self_ref": "#/tables/0", + "parent": { + "cref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "table", + "prov": [ + { + "page_no": 1, + "bbox": { + "l": 0.0, + "t": 0.0, + "r": 4.0, + "b": 21.0, + "coord_origin": "TOPLEFT" + }, + "charspan": [ + 0, + 0 + ] + } + ], + "captions": [], + "references": [], + "footnotes": [], + "image": null, + "data": { + "table_cells": [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Product", + "column_header": true, + "row_header": 
false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Date", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Quantity", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Revenue", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget A", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-01", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "5", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "5000", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + 
"start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget B", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-02", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "10", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "12000", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget C", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-03", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "3000", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + 
"bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget D", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-04", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "8", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "8000", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget A", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-05", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "7", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 
4, + "text": "7000", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget B", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-06", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "6000", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget C", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-07", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "12", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + 
"start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "15000", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget D", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-08", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "9", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "9000", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 9, + "end_row_offset_idx": 10, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget A", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 9, + "end_row_offset_idx": 10, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-09", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 9, + "end_row_offset_idx": 10, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "4", + "column_header": false, 
+ "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 9, + "end_row_offset_idx": 10, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "4000", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 10, + "end_row_offset_idx": 11, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget B", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 10, + "end_row_offset_idx": 11, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-10", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 10, + "end_row_offset_idx": 11, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "11", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 10, + "end_row_offset_idx": 11, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "11000", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 11, + "end_row_offset_idx": 12, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget C", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 11, + "end_row_offset_idx": 12, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-11", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 11, + 
"end_row_offset_idx": 12, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "5", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 11, + "end_row_offset_idx": 12, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "5000", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 12, + "end_row_offset_idx": 13, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget D", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 12, + "end_row_offset_idx": 13, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-12", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 12, + "end_row_offset_idx": 13, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "8", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 12, + "end_row_offset_idx": 13, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "8500", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 13, + "end_row_offset_idx": 14, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget A", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 13, + "end_row_offset_idx": 14, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-13", + "column_header": false, + "row_header": 
false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 13, + "end_row_offset_idx": 14, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 13, + "end_row_offset_idx": 14, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "6200", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 14, + "end_row_offset_idx": 15, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget B", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 14, + "end_row_offset_idx": 15, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-14", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 14, + "end_row_offset_idx": 15, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "7", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 14, + "end_row_offset_idx": 15, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "7100", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 15, + "end_row_offset_idx": 16, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget C", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 15, + "end_row_offset_idx": 16, + 
"start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-15", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 15, + "end_row_offset_idx": 16, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "10", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 15, + "end_row_offset_idx": 16, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "10500", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 16, + "end_row_offset_idx": 17, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget D", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 16, + "end_row_offset_idx": 17, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-16", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 16, + "end_row_offset_idx": 17, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 16, + "end_row_offset_idx": 17, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "3200", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 17, + "end_row_offset_idx": 18, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget A", + "column_header": false, + "row_header": false, + "row_section": 
false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 17, + "end_row_offset_idx": 18, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-17", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 17, + "end_row_offset_idx": 18, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "9", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 17, + "end_row_offset_idx": 18, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "9400", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 18, + "end_row_offset_idx": 19, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget B", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 18, + "end_row_offset_idx": 19, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-18", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 18, + "end_row_offset_idx": 19, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "12", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 18, + "end_row_offset_idx": 19, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "12500", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 19, + "end_row_offset_idx": 20, + 
"start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget C", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 19, + "end_row_offset_idx": 20, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-19", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 19, + "end_row_offset_idx": 20, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 19, + "end_row_offset_idx": 20, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "6100", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 20, + "end_row_offset_idx": 21, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget D", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 20, + "end_row_offset_idx": 21, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-20", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 20, + "end_row_offset_idx": 21, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "8", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 20, + "end_row_offset_idx": 21, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "8900", + "column_header": false, + "row_header": false, + "row_section": false 
+ } + ], + "num_rows": 21, + "num_cols": 4, + "grid": [ + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Product", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "Date", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "Quantity", + "column_header": true, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "Revenue", + "column_header": true, + "row_header": false, + "row_section": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget A", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-01", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "5", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + 
"end_row_offset_idx": 2, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "5000", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget B", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-02", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "10", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "12000", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget C", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-03", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "3", + "column_header": false, + "row_header": 
false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "3000", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget D", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-04", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "8", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "8000", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget A", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-05", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + 
"end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "7", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "7000", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget B", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-06", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 6, + "end_row_offset_idx": 7, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "6000", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget C", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-07", + "column_header": false, + "row_header": 
false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "12", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 7, + "end_row_offset_idx": 8, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "15000", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget D", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-08", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "9", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 8, + "end_row_offset_idx": 9, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "9000", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 9, + "end_row_offset_idx": 10, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget A", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 9, + "end_row_offset_idx": 10, 
+ "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-09", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 9, + "end_row_offset_idx": 10, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "4", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 9, + "end_row_offset_idx": 10, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "4000", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 10, + "end_row_offset_idx": 11, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget B", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 10, + "end_row_offset_idx": 11, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-10", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 10, + "end_row_offset_idx": 11, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "11", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 10, + "end_row_offset_idx": 11, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "11000", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 11, + "end_row_offset_idx": 12, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget C", + "column_header": false, + "row_header": false, + 
"row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 11, + "end_row_offset_idx": 12, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-11", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 11, + "end_row_offset_idx": 12, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "5", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 11, + "end_row_offset_idx": 12, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "5000", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 12, + "end_row_offset_idx": 13, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget D", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 12, + "end_row_offset_idx": 13, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-12", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 12, + "end_row_offset_idx": 13, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "8", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 12, + "end_row_offset_idx": 13, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "8500", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 13, + 
"end_row_offset_idx": 14, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget A", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 13, + "end_row_offset_idx": 14, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-13", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 13, + "end_row_offset_idx": 14, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 13, + "end_row_offset_idx": 14, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "6200", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 14, + "end_row_offset_idx": 15, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget B", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 14, + "end_row_offset_idx": 15, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-14", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 14, + "end_row_offset_idx": 15, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "7", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 14, + "end_row_offset_idx": 15, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "7100", + "column_header": false, + 
"row_header": false, + "row_section": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 15, + "end_row_offset_idx": 16, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget C", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 15, + "end_row_offset_idx": 16, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-15", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 15, + "end_row_offset_idx": 16, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "10", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 15, + "end_row_offset_idx": 16, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "10500", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 16, + "end_row_offset_idx": 17, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget D", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 16, + "end_row_offset_idx": 17, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-16", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 16, + "end_row_offset_idx": 17, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + 
"start_row_offset_idx": 16, + "end_row_offset_idx": 17, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "3200", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 17, + "end_row_offset_idx": 18, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget A", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 17, + "end_row_offset_idx": 18, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-17", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 17, + "end_row_offset_idx": 18, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "9", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 17, + "end_row_offset_idx": 18, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "9400", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 18, + "end_row_offset_idx": 19, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget B", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 18, + "end_row_offset_idx": 19, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-18", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 18, + "end_row_offset_idx": 19, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": 
"12", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 18, + "end_row_offset_idx": 19, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "12500", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 19, + "end_row_offset_idx": 20, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget C", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 19, + "end_row_offset_idx": 20, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-19", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 19, + "end_row_offset_idx": 20, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 19, + "end_row_offset_idx": 20, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "6100", + "column_header": false, + "row_header": false, + "row_section": false + } + ], + [ + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 20, + "end_row_offset_idx": 21, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "Widget D", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 20, + "end_row_offset_idx": 21, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2024-01-20", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + 
"col_span": 1, + "start_row_offset_idx": 20, + "end_row_offset_idx": 21, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "8", + "column_header": false, + "row_header": false, + "row_section": false + }, + { + "bbox": null, + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 20, + "end_row_offset_idx": 21, + "start_col_offset_idx": 3, + "end_col_offset_idx": 4, + "text": "8900", + "column_header": false, + "row_header": false, + "row_section": false + } + ] + ] + } + } + ], + "key_value_items": [], + "form_items": [], + "pages": { + "1": { + "size": { + "width": 4.0, + "height": 21.0 + }, + "image": null, + "page_no": 1 + } + } + } +} \ No newline at end of file
diff --git a/tests/test_backend_msexcel_xlsm.py b/tests/test_backend_msexcel_xlsm.py
new file mode 100644
index 00000000..ac4ec405
--- /dev/null
+++ b/tests/test_backend_msexcel_xlsm.py
@@ -0,0 +1,87 @@
+"""End-to-end regression test for macro-enabled Excel (.xlsm) conversion.
+
+Run under pytest to compare conversions against the stored ground truth;
+run as a script to regenerate the ground-truth JSON for the .xlsm fixtures.
+"""
+
+import json
+from pathlib import Path, PurePath
+
+from docling.datamodel.base_models import ConversionStatus
+from docling.document_converter import DocumentConverter
+
+# Resolve fixture directories from this file rather than the current working
+# directory, so the test behaves the same wherever pytest is launched.
+TESTS_DIR = Path(__file__).resolve().parent
+INPUT_DIR = TESTS_DIR / "input"
+OUTPUT_DIR = TESTS_DIR / "output"
+
+
+def convert_paths(obj):
+    """Return a copy of ``obj`` with every pathlib path replaced by ``str``.
+
+    Dicts and lists are rebuilt recursively; any other value is returned
+    unchanged.
+    """
+    if isinstance(obj, dict):
+        return {key: convert_paths(value) for key, value in obj.items()}
+    if isinstance(obj, list):
+        return [convert_paths(item) for item in obj]
+    if isinstance(obj, PurePath):
+        return str(obj)
+    return obj
+
+
+def get_xlsm_paths():
+    """Return the ``.xlsm`` fixtures in ``INPUT_DIR`` (suffix case-insensitive)."""
+    return sorted(
+        path
+        for path in INPUT_DIR.iterdir()
+        if path.is_file() and path.suffix.lower() == ".xlsm"
+    )
+
+
+def ground_truth_path(xlsm_path):
+    """Return the ground-truth JSON path that belongs to ``xlsm_path``."""
+    return OUTPUT_DIR / f"{xlsm_path.stem}.json"
+
+
+def test_e2e_xlsm_conversions():
+    """Convert each ``.xlsm`` fixture and compare its tables to ground truth."""
+    xlsm_paths = get_xlsm_paths()
+    # Without this guard a missing fixture would let the test pass vacuously.
+    assert xlsm_paths, f"no .xlsm fixtures found in {INPUT_DIR}"
+
+    converter = DocumentConverter()
+    for xlsm_path in xlsm_paths:
+        gt_path = ground_truth_path(xlsm_path)
+        expected = json.loads(gt_path.read_text(encoding="utf-8"))
+
+        # Deliberately no try/except: a conversion error must fail the test.
+        result = converter.convert(str(xlsm_path))
+        assert result.status == ConversionStatus.SUCCESS, xlsm_path.name
+
+        # Round-trip through JSON so the live dump is compared in the same
+        # plain-JSON form as the stored file.
+        # NOTE(review): assumes tables sit under "document" -> "tables" in the
+        # dump, as in the stored ground truth; confirm against the model.
+        dumped = convert_paths(result.model_dump())
+        actual_tables = json.loads(json.dumps(dumped["document"]["tables"]))
+        assert actual_tables == expected["document"]["tables"], xlsm_path.name
+
+
+def regenerate_ground_truth():
+    """Rewrite the ground-truth JSON for every ``.xlsm`` fixture."""
+    OUTPUT_DIR.mkdir(exist_ok=True)
+    converter = DocumentConverter()
+    for xlsm_path in get_xlsm_paths():
+        out_path = ground_truth_path(xlsm_path)
+        result = converter.convert(str(xlsm_path))
+        result_dict = convert_paths(result.model_dump())
+        with open(out_path, "w", encoding="utf-8") as f:
+            json.dump(result_dict, f, ensure_ascii=False, indent=2)
+        print(f"Converted {xlsm_path.name} -> {out_path.name}")
+
+
+if __name__ == "__main__":
+    regenerate_ground_truth()