{"input":{"file":"tests/data/2203.01017v2.pdf","document_hash":"4fa8dff93d74a84069210c84a38d14d62a39ec8f4e4c90bf955fdebdebcf6636","valid":true,"limits":{"max_num_pages":9223372036854775807,"max_file_size":9223372036854775807},"filesize":7215177,"page_count":16},"status":"4","errors":[],"pages":[{"page_no":0,"page_hash":"5deca8f7af439d2d968a480d07761ace8f704461e79d8b3d1dce2c394acdeab7","size":{"width":612.0,"height":792.0},"cells":[{"id":0,"text":"TableFormer: Table Structure Understanding with Transformers.","bbox":{"l":96.301003,"t":107.03412000000003,"r":498.92708999999996,"b":119.93133999999998,"coord_origin":"1"}},{"id":1,"text":"Ahmed Nassar, Nikolaos Livathinos, Maksym Lysak, Peter Staar","bbox":{"l":142.47701,"t":146.68535999999995,"r":452.75027,"b":157.37334999999996,"coord_origin":"1"}},{"id":2,"text":"IBM Research","bbox":{"l":262.918,"t":160.63239,"r":332.30597,"b":171.32037000000003,"coord_origin":"1"}},{"id":3,"text":"{","bbox":{"l":208.123,"t":175.96123999999998,"r":212.73083,"b":184.42553999999996,"coord_origin":"1"}},{"id":4,"text":"ahn,nli,mly,taa","bbox":{"l":212.73,"t":177.08203000000003,"r":293.42761,"b":184.00409000000002,"coord_origin":"1"}},{"id":5,"text":"}","bbox":{"l":293.42798,"t":175.96123999999998,"r":298.0358,"b":184.42553999999996,"coord_origin":"1"}},{"id":6,"text":"@zurich.ibm.com","bbox":{"l":298.03497,"t":177.08203000000003,"r":378.73257,"b":184.00409000000002,"coord_origin":"1"}},{"id":7,"text":"Abstract","bbox":{"l":145.99498,"t":215.48297000000002,"r":190.48029,"b":226.23071000000004,"coord_origin":"1"}},{"id":8,"text":"Tables organize valuable content in a concise and com-","bbox":{"l":62.066978,"t":241.39508,"r":286.36493,"b":249.98284999999998,"coord_origin":"1"}},{"id":9,"text":"pact representation. This content is extremely valuable for","bbox":{"l":50.111977,"t":253.3501,"r":286.36508,"b":261.93787,"coord_origin":"1"}},{"id":10,"text":"systems such as search engines, Knowledge Graph’s, etc,","bbox":{"l":50.111977,"t":265.30511,"r":286.36508,"b":273.89288,"coord_origin":"1"}},{"id":11,"text":"since they enhance their predictive capabilities. Unfortu-","bbox":{"l":50.111977,"t":277.26111000000003,"r":286.36505,"b":285.84888,"coord_origin":"1"}},{"id":12,"text":"nately, tables come in a large variety of shapes and sizes.","bbox":{"l":50.111977,"t":289.21609,"r":286.36505,"b":297.80386,"coord_origin":"1"}},{"id":13,"text":"Furthermore, they can have complex column/row-header","bbox":{"l":50.111977,"t":301.17108,"r":286.36505,"b":309.75884999999994,"coord_origin":"1"}},{"id":14,"text":"configurations, multiline rows, different variety of separa-","bbox":{"l":50.111977,"t":313.12607,"r":286.36508,"b":321.71384,"coord_origin":"1"}},{"id":15,"text":"tion lines, missing entries, etc. As such, the correct iden-","bbox":{"l":50.111977,"t":325.08105,"r":286.36508,"b":333.66882,"coord_origin":"1"}},{"id":16,"text":"tification of the table-structure from an image is a non-","bbox":{"l":50.111977,"t":337.03604,"r":286.36505,"b":345.62381,"coord_origin":"1"}},{"id":17,"text":"trivial task. In this paper, we present a new table-structure","bbox":{"l":50.111977,"t":348.99203,"r":286.36508,"b":357.5798,"coord_origin":"1"}},{"id":18,"text":"identification model. The latter improves the latest end-to-","bbox":{"l":50.111977,"t":360.94701999999995,"r":286.36505,"b":369.53479,"coord_origin":"1"}},{"id":19,"text":"end deep learning model (i.e. encoder-dual-decoder from","bbox":{"l":50.111977,"t":372.90201,"r":286.36508,"b":381.48978,"coord_origin":"1"}},{"id":20,"text":"PubTabNet) in two significant ways. First, we introduce a","bbox":{"l":50.111977,"t":384.85699,"r":286.36505,"b":393.44476,"coord_origin":"1"}},{"id":21,"text":"new object detection decoder for table-cells. In this way,","bbox":{"l":50.111977,"t":396.81198,"r":286.36511,"b":405.39975000000004,"coord_origin":"1"}},{"id":22,"text":"we can obtain the content of the table-cells from program-","bbox":{"l":50.111977,"t":408.76697,"r":286.36508,"b":417.35474,"coord_origin":"1"}},{"id":23,"text":"matic PDF’s directly from the PDF source and avoid the","bbox":{"l":50.111977,"t":420.72296000000006,"r":286.36505,"b":429.31073,"coord_origin":"1"}},{"id":24,"text":"training of the custom OCR decoders.","bbox":{"l":50.111977,"t":432.67795,"r":207.23216,"b":441.26572,"coord_origin":"1"}},{"id":25,"text":"This architectural","bbox":{"l":214.09639,"t":432.67795,"r":286.36508,"b":441.26572,"coord_origin":"1"}},{"id":26,"text":"change leads to more accurate table-content extraction and","bbox":{"l":50.111977,"t":444.63293,"r":286.36508,"b":453.2207,"coord_origin":"1"}},{"id":27,"text":"allows us to tackle non-english tables. Second, we replace","bbox":{"l":50.111977,"t":456.58792000000005,"r":286.36505,"b":465.17569,"coord_origin":"1"}},{"id":28,"text":"the LSTM decoders with transformer based decoders. This","bbox":{"l":50.111977,"t":468.54291,"r":286.36505,"b":477.13068,"coord_origin":"1"}},{"id":29,"text":"upgrade improves significantly the previous state-of-the-art","bbox":{"l":50.111977,"t":480.4989,"r":286.36508,"b":489.08667,"coord_origin":"1"}},{"id":30,"text":"tree-editing-distance-score (TEDS) from 91% to 98.5% on","bbox":{"l":50.111977,"t":492.45389,"r":286.36505,"b":501.04166,"coord_origin":"1"}},{"id":31,"text":"simple tables and from 88.7% to 95% on complex tables.","bbox":{"l":50.111977,"t":504.40887,"r":276.65152,"b":512.9966400000001,"coord_origin":"1"}},{"id":32,"text":"1.","bbox":{"l":50.111977,"t":539.94276,"r":58.121296,"b":550.69049,"coord_origin":"1"}},{"id":33,"text":"Introduction","bbox":{"l":68.800385,"t":539.94276,"r":126.94804,"b":550.69049,"coord_origin":"1"}},{"id":34,"text":"The occurrence of tables in documents is ubiquitous.","bbox":{"l":62.066978,"t":560.7832,"r":286.36496,"b":569.68976,"coord_origin":"1"}},{"id":35,"text":"They often summarise quantitative or factual data, which is","bbox":{"l":50.111977,"t":572.7382,"r":286.36508,"b":581.64476,"coord_origin":"1"}},{"id":36,"text":"cumbersome to describe in verbose text but nevertheless ex-","bbox":{"l":50.111977,"t":584.69321,"r":286.36505,"b":593.5997600000001,"coord_origin":"1"}},{"id":37,"text":"tremely valuable. Unfortunately, this compact representa-","bbox":{"l":50.111977,"t":596.6492000000001,"r":286.36505,"b":605.55576,"coord_origin":"1"}},{"id":38,"text":"tion is often not easy to parse by machines. There are many","bbox":{"l":50.111977,"t":608.6042,"r":286.36505,"b":617.51076,"coord_origin":"1"}},{"id":39,"text":"implicit conventions used to obtain a compact table repre-","bbox":{"l":50.111977,"t":620.5592,"r":286.36505,"b":629.46576,"coord_origin":"1"}},{"id":40,"text":"sentation. For example, tables often have complex column-","bbox":{"l":50.111977,"t":632.51421,"r":286.36508,"b":641.42076,"coord_origin":"1"}},{"id":41,"text":"and row-headers in order to reduce duplicated cell content.","bbox":{"l":50.111977,"t":644.46921,"r":286.36508,"b":653.37576,"coord_origin":"1"}},{"id":42,"text":"Lines of different shapes and sizes are leveraged to separate","bbox":{"l":50.111977,"t":656.42421,"r":286.36502,"b":665.33077,"coord_origin":"1"}},{"id":43,"text":"content or indicate a tree structure. Additionally, tables can","bbox":{"l":50.111977,"t":668.3802000000001,"r":286.36505,"b":677.28677,"coord_origin":"1"}},{"id":44,"text":"also have empty/missing table-entries or multi-row textual","bbox":{"l":50.111977,"t":680.33521,"r":286.36505,"b":689.2417800000001,"coord_origin":"1"}},{"id":45,"text":"table-entries. Fig. 1 shows a table which presents all these","bbox":{"l":50.111977,"t":692.290207,"r":286.36505,"b":701.196777,"coord_origin":"1"}},{"id":46,"text":"issues.","bbox":{"l":50.111977,"t":704.245209,"r":76.403275,"b":713.151779,"coord_origin":"1"}},{"id":47,"text":"a.","bbox":{"l":315.56702,"t":218.00684,"r":324.01007,"b":226.75482,"coord_origin":"1"}},{"id":48,"text":"Picture of a table:","bbox":{"l":328.2316,"t":218.00684,"r":408.4407,"b":226.75482,"coord_origin":"1"}},{"id":49,"text":"b.","bbox":{"l":315.56702,"t":313.69478999999995,"r":325.05786,"b":322.44281,"coord_origin":"1"}},{"id":50,"text":"Red-annotation of bounding boxes,","bbox":{"l":329.80325,"t":313.69478999999995,"r":486.40194999999994,"b":322.44281,"coord_origin":"1"}},{"id":51,"text":"Blue-predictions by TableFormer","bbox":{"l":326.46252,"t":324.49478,"r":472.47411999999997,"b":333.2428,"coord_origin":"1"}},{"id":52,"text":"c.","bbox":{"l":315.56702,"t":420.1828,"r":324.81039,"b":428.93082,"coord_origin":"1"}},{"id":53,"text":"Structure predicted by TableFormer:","bbox":{"l":329.4321,"t":420.1828,"r":491.1912500000001,"b":428.93082,"coord_origin":"1"}},{"id":54,"text":"1","bbox":{"l":408.14752,"t":342.82828,"r":412.54001,"b":351.61322,"coord_origin":"1"}},{"id":55,"text":"0","bbox":{"l":356.11011,"t":341.57217,"r":360.50259,"b":350.35712,"coord_origin":"1"}},{"id":56,"text":"2","bbox":{"l":500.6777,"t":340.93768,"r":505.0701900000001,"b":349.7226299999999,"coord_origin":"1"}},{"id":57,"text":"3","bbox":{"l":356.13382,"t":351.74789,"r":360.52631,"b":360.53284,"coord_origin":"1"}},{"id":58,"text":"4","bbox":{"l":402.53992,"t":355.8765,"r":406.9324,"b":364.66144,"coord_origin":"1"}},{"id":59,"text":"5","bbox":{"l":448.58178999999996,"t":352.84018,"r":452.97427,"b":361.62512,"coord_origin":"1"}},{"id":60,"text":"6","bbox":{"l":491.65161000000006,"t":353.70657,"r":496.0441,"b":362.49152,"coord_origin":"1"}},{"id":61,"text":"7","bbox":{"l":535.13843,"t":353.33969,"r":539.53088,"b":362.12463,"coord_origin":"1"}},{"id":62,"text":"8","bbox":{"l":348.82822,"t":387.09781,"r":353.2207,"b":395.88275,"coord_origin":"1"}},{"id":63,"text":"9","bbox":{"l":389.27151,"t":375.37228,"r":393.664,"b":384.15723,"coord_origin":"1"}},{"id":64,"text":"10","bbox":{"l":442.67479999999995,"t":375.64621,"r":451.45889000000005,"b":384.43115,"coord_origin":"1"}},{"id":65,"text":"11","bbox":{"l":477.4382299999999,"t":375.534,"r":485.90167,"b":384.31894000000005,"coord_origin":"1"}},{"id":66,"text":"12","bbox":{"l":522.57263,"t":375.64621,"r":531.35669,"b":384.43115,"coord_origin":"1"}},{"id":67,"text":"13","bbox":{"l":400.22992,"t":387.11429,"r":409.01401,"b":395.89923,"coord_origin":"1"}},{"id":68,"text":"14","bbox":{"l":442.30792,"t":386.98981000000003,"r":451.0920100000001,"b":395.77475000000004,"coord_origin":"1"}},{"id":69,"text":"15","bbox":{"l":478.21941999999996,"t":387.37469,"r":487.00351000000006,"b":396.15964,"coord_origin":"1"}},{"id":70,"text":"16","bbox":{"l":523.2287,"t":386.98981000000003,"r":532.01276,"b":395.77475000000004,"coord_origin":"1"}},{"id":71,"text":"1","bbox":{"l":411.57233,"t":399.42477,"r":415.96481,"b":408.20972,"coord_origin":"1"}},{"id":72,"text":"7","bbox":{"l":415.96393,"t":399.42477,"r":420.35641,"b":408.20972,"coord_origin":"1"}},{"id":73,"text":"18","bbox":{"l":442.30521,"t":399.0371999999999,"r":451.08929,"b":407.82213999999993,"coord_origin":"1"}},{"id":74,"text":"19","bbox":{"l":478.77893,"t":398.99639999999994,"r":487.56302,"b":407.78133999999994,"coord_origin":"1"}},{"id":75,"text":"20","bbox":{"l":523.97241,"t":398.6114799999999,"r":532.75647,"b":407.39642,"coord_origin":"1"}},{"id":76,"text":"1","bbox":{"l":347.24872,"t":437.68588,"r":351.6412,"b":446.47083,"coord_origin":"1"}},{"id":77,"text":"0","bbox":{"l":318.88071,"t":437.68588,"r":323.27319,"b":446.47083,"coord_origin":"1"}},{"id":78,"text":"2","bbox":{"l":394.10422,"t":437.68588,"r":398.4967,"b":446.47083,"coord_origin":"1"}},{"id":79,"text":"3","bbox":{"l":318.77316,"t":449.5455,"r":323.16565,"b":458.33044,"coord_origin":"1"}},{"id":80,"text":"4","bbox":{"l":347.24872,"t":449.5455,"r":351.6412,"b":458.33044,"coord_origin":"1"}},{"id":81,"text":"5","bbox":{"l":394.10422,"t":449.5455,"r":398.4967,"b":458.33044,"coord_origin":"1"}},{"id":82,"text":"6","bbox":{"l":440.95941000000005,"t":449.5455,"r":445.3519,"b":458.33044,"coord_origin":"1"}},{"id":83,"text":"7","bbox":{"l":487.81491,"t":449.5455,"r":492.2074,"b":458.33044,"coord_origin":"1"}},{"id":84,"text":"8","bbox":{"l":318.77316,"t":473.70425,"r":323.16565,"b":482.4892,"coord_origin":"1"}},{"id":85,"text":"9","bbox":{"l":347.24872,"t":461.8446,"r":351.6412,"b":470.62955,"coord_origin":"1"}},{"id":86,"text":"10","bbox":{"l":394.10422,"t":461.8446,"r":402.88831,"b":470.62955,"coord_origin":"1"}},{"id":87,"text":"11","bbox":{"l":440.95941000000005,"t":461.8446,"r":449.42285,"b":470.62955,"coord_origin":"1"}},{"id":88,"text":"12","bbox":{"l":487.81491,"t":461.8446,"r":496.599,"b":470.62955,"coord_origin":"1"}},{"id":89,"text":"13","bbox":{"l":347.24872,"t":473.70425,"r":356.03281,"b":482.4892,"coord_origin":"1"}},{"id":90,"text":"14","bbox":{"l":394.10422,"t":473.70425,"r":402.88831,"b":482.4892,"coord_origin":"1"}},{"id":91,"text":"15","bbox":{"l":440.95941000000005,"t":473.70425,"r":449.7435,"b":482.4892,"coord_origin":"1"}},{"id":92,"text":"16","bbox":{"l":487.81491,"t":473.70425,"r":496.599,"b":482.4892,"coord_origin":"1"}},{"id":93,"text":"17","bbox":{"l":347.24872,"t":485.12469,"r":356.03281,"b":493.90964,"coord_origin":"1"}},{"id":94,"text":"18","bbox":{"l":394.10422,"t":485.12469,"r":402.88831,"b":493.90964,"coord_origin":"1"}},{"id":95,"text":"19","bbox":{"l":440.95941000000005,"t":485.12469,"r":449.7435,"b":493.90964,"coord_origin":"1"}},{"id":96,"text":"20","bbox":{"l":487.81491,"t":485.12469,"r":496.599,"b":493.90964,"coord_origin":"1"}},{"id":97,"text":"1","bbox":{"l":451.9457100000001,"t":235.34704999999997,"r":457.95050000000003,"b":245.47748,"coord_origin":"1"}},{"id":98,"text":"3","bbox":{"l":385.09399,"t":357.76030999999995,"r":391.09879,"b":367.89072,"coord_origin":"1"}},{"id":99,"text":"3","bbox":{"l":366.70102,"t":449.12082,"r":372.70581,"b":459.25122,"coord_origin":"1"}},{"id":100,"text":"2","bbox":{"l":331.19681,"t":269.35266,"r":337.2016,"b":279.48308999999995,"coord_origin":"1"}},{"id":101,"text":"2","bbox":{"l":333.43451,"t":380.7265,"r":339.4393,"b":390.85689999999994,"coord_origin":"1"}},{"id":102,"text":"2","bbox":{"l":331.90424,"t":473.32291,"r":337.90903,"b":483.45331,"coord_origin":"1"}},{"id":103,"text":"1","bbox":{"l":478.07210999999995,"t":341.0368000000001,"r":484.0769,"b":351.16720999999995,"coord_origin":"1"}},{"id":104,"text":"1","bbox":{"l":459.87621999999993,"t":437.5936,"r":465.88101,"b":447.724,"coord_origin":"1"}},{"id":105,"text":"3","bbox":{"l":384.0329,"t":252.67895999999996,"r":390.03769,"b":262.80939,"coord_origin":"1"}},{"id":106,"text":"Figure 1:","bbox":{"l":308.862,"t":514.50037,"r":345.73361,"b":523.40692,"coord_origin":"1"}},{"id":107,"text":"Picture of a table with subtle, complex features","bbox":{"l":353.17566,"t":514.50037,"r":545.11511,"b":523.40692,"coord_origin":"1"}},{"id":108,"text":"such as (1) multi-column headers, (2) cell with multi-row","bbox":{"l":308.862,"t":526.45535,"r":545.11511,"b":535.3619100000001,"coord_origin":"1"}},{"id":109,"text":"text and (3) cells with no content. Image from PubTabNet","bbox":{"l":308.862,"t":538.41035,"r":545.11517,"b":547.31691,"coord_origin":"1"}},{"id":110,"text":"evaluation set, filename: ‘PMC2944238 004 02’.","bbox":{"l":308.862,"t":550.36635,"r":505.6917700000001,"b":559.2729,"coord_origin":"1"}},{"id":111,"text":"Recently, significant progress has been made with vi-","bbox":{"l":320.81699,"t":584.40936,"r":545.11493,"b":593.31592,"coord_origin":"1"}},{"id":112,"text":"sion based approaches to extract tables in documents. For","bbox":{"l":308.862,"t":596.36436,"r":545.11517,"b":605.2709199999999,"coord_origin":"1"}},{"id":113,"text":"the sake of completeness, the issue of table extraction from","bbox":{"l":308.862,"t":608.31937,"r":545.11511,"b":617.22592,"coord_origin":"1"}},{"id":114,"text":"documents is typically decomposed into two separate chal-","bbox":{"l":308.862,"t":620.27437,"r":545.11505,"b":629.18092,"coord_origin":"1"}},{"id":115,"text":"lenges, i.e.","bbox":{"l":308.862,"t":632.23036,"r":353.6937,"b":641.13692,"coord_origin":"1"}},{"id":116,"text":"(1)","bbox":{"l":362.11209,"t":632.23036,"r":374.66617,"b":641.13692,"coord_origin":"1"}},{"id":117,"text":"finding the location of the table(s) on a","bbox":{"l":377.35785,"t":632.23036,"r":545.11505,"b":641.13692,"coord_origin":"1"}},{"id":118,"text":"document-page and (2) finding the structure of a given table","bbox":{"l":308.862,"t":644.18536,"r":545.11517,"b":653.09192,"coord_origin":"1"}},{"id":119,"text":"in the document.","bbox":{"l":308.862,"t":656.14037,"r":375.55167,"b":665.04693,"coord_origin":"1"}},{"id":120,"text":"The first problem is called table-location and has been","bbox":{"l":320.81699,"t":668.38036,"r":545.11493,"b":677.28693,"coord_origin":"1"}},{"id":121,"text":"previously addressed [30, 38, 19, 21, 23, 26, 8] with state-","bbox":{"l":308.862,"t":680.33536,"r":545.11511,"b":689.24193,"coord_origin":"1"}},{"id":122,"text":"of-the-art object-detection networks (e.g. YOLO and later","bbox":{"l":308.862,"t":692.290359,"r":545.11511,"b":701.19693,"coord_origin":"1"}},{"id":123,"text":"on Mask-RCNN [9]). For all practical purposes, it can be","bbox":{"l":308.862,"t":704.245361,"r":545.11499,"b":713.151932,"coord_origin":"1"}},{"id":124,"text":"1","bbox":{"l":295.121,"t":734.133366,"r":300.10229,"b":743.039928,"coord_origin":"1"}},{"id":125,"text":"arXiv:2203.01017v2 [cs.CV] 11 Mar 2022","bbox":{"l":18.340221,"t":207.82001000000002,"r":36.339779,"b":560.00003,"coord_origin":"1"}}],"predictions":{"layout":{"clusters":[{"id":0,"label":"Section-header","bbox":{"l":95.52632689476013,"t":106.14017257690432,"r":498.92708999999996,"b":119.93133999999998,"coord_origin":"1"},"confidence":0.8527692556381226,"cells":[{"id":0,"text":"TableFormer: Table Structure Understanding with Transformers.","bbox":{"l":96.301003,"t":107.03412000000003,"r":498.92708999999996,"b":119.93133999999998,"coord_origin":"1"}}]},{"id":1,"label":"Section-header","bbox":{"l":141.7966592788696,"t":145.70029220581057,"r":453.00207595825196,"b":171.32037000000003,"coord_origin":"1"},"confidence":0.8149818778038025,"cells":[{"id":1,"text":"Ahmed Nassar, Nikolaos Livathinos, Maksym Lysak, Peter Staar","bbox":{"l":142.47701,"t":146.68535999999995,"r":452.75027,"b":157.37334999999996,"coord_origin":"1"}},{"id":2,"text":"IBM Research","bbox":{"l":262.918,"t":160.63239,"r":332.30597,"b":171.32037000000003,"coord_origin":"1"}}]},{"id":2,"label":"Text","bbox":{"l":208.123,"t":175.47459583282466,"r":379.3107976913452,"b":185.46755561828616,"coord_origin":"1"},"confidence":0.8852415084838867,"cells":[{"id":3,"text":"{","bbox":{"l":208.123,"t":175.96123999999998,"r":212.73083,"b":184.42553999999996,"coord_origin":"1"}},{"id":4,"text":"ahn,nli,mly,taa","bbox":{"l":212.73,"t":177.08203000000003,"r":293.42761,"b":184.00409000000002,"coord_origin":"1"}},{"id":5,"text":"}","bbox":{"l":293.42798,"t":175.96123999999998,"r":298.0358,"b":184.42553999999996,"coord_origin":"1"}},{"id":6,"text":"@zurich.ibm.com","bbox":{"l":298.03497,"t":177.08203000000003,"r":378.73257,"b":184.00409000000002,"coord_origin":"1"}}]},{"id":3,"label":"Section-header","bbox":{"l":145.0311819076538,"t":215.15237388610842,"r":190.65908489227294,"b":226.23071000000004,"coord_origin":"1"},"confidence":0.9270482659339905,"cells":[{"id":7,"text":"Abstract","bbox":{"l":145.99498,"t":215.48297000000002,"r":190.48029,"b":226.23071000000004,"coord_origin":"1"}}]},{"id":4,"label":"Text","bbox":{"l":48.88530099391937,"t":240.16773319244385,"r":286.7518209457398,"b":514.1875190734863,"coord_origin":"1"},"confidence":0.987241268157959,"cells":[{"id":8,"text":"Tables organize valuable content in a concise and com-","bbox":{"l":62.066978,"t":241.39508,"r":286.36493,"b":249.98284999999998,"coord_origin":"1"}},{"id":9,"text":"pact representation. This content is extremely valuable for","bbox":{"l":50.111977,"t":253.3501,"r":286.36508,"b":261.93787,"coord_origin":"1"}},{"id":10,"text":"systems such as search engines, Knowledge Graph’s, etc,","bbox":{"l":50.111977,"t":265.30511,"r":286.36508,"b":273.89288,"coord_origin":"1"}},{"id":11,"text":"since they enhance their predictive capabilities. Unfortu-","bbox":{"l":50.111977,"t":277.26111000000003,"r":286.36505,"b":285.84888,"coord_origin":"1"}},{"id":12,"text":"nately, tables come in a large variety of shapes and sizes.","bbox":{"l":50.111977,"t":289.21609,"r":286.36505,"b":297.80386,"coord_origin":"1"}},{"id":13,"text":"Furthermore, they can have complex column/row-header","bbox":{"l":50.111977,"t":301.17108,"r":286.36505,"b":309.75884999999994,"coord_origin":"1"}},{"id":14,"text":"configurations, multiline rows, different variety of separa-","bbox":{"l":50.111977,"t":313.12607,"r":286.36508,"b":321.71384,"coord_origin":"1"}},{"id":15,"text":"tion lines, missing entries, etc. As such, the correct iden-","bbox":{"l":50.111977,"t":325.08105,"r":286.36508,"b":333.66882,"coord_origin":"1"}},{"id":16,"text":"tification of the table-structure from an image is a non-","bbox":{"l":50.111977,"t":337.03604,"r":286.36505,"b":345.62381,"coord_origin":"1"}},{"id":17,"text":"trivial task. In this paper, we present a new table-structure","bbox":{"l":50.111977,"t":348.99203,"r":286.36508,"b":357.5798,"coord_origin":"1"}},{"id":18,"text":"identification model. The latter improves the latest end-to-","bbox":{"l":50.111977,"t":360.94701999999995,"r":286.36505,"b":369.53479,"coord_origin":"1"}},{"id":19,"text":"end deep learning model (i.e. encoder-dual-decoder from","bbox":{"l":50.111977,"t":372.90201,"r":286.36508,"b":381.48978,"coord_origin":"1"}},{"id":20,"text":"PubTabNet) in two significant ways. First, we introduce a","bbox":{"l":50.111977,"t":384.85699,"r":286.36505,"b":393.44476,"coord_origin":"1"}},{"id":21,"text":"new object detection decoder for table-cells. In this way,","bbox":{"l":50.111977,"t":396.81198,"r":286.36511,"b":405.39975000000004,"coord_origin":"1"}},{"id":22,"text":"we can obtain the content of the table-cells from program-","bbox":{"l":50.111977,"t":408.76697,"r":286.36508,"b":417.35474,"coord_origin":"1"}},{"id":23,"text":"matic PDF’s directly from the PDF source and avoid the","bbox":{"l":50.111977,"t":420.72296000000006,"r":286.36505,"b":429.31073,"coord_origin":"1"}},{"id":24,"text":"training of the custom OCR decoders.","bbox":{"l":50.111977,"t":432.67795,"r":207.23216,"b":441.26572,"coord_origin":"1"}},{"id":25,"text":"This architectural","bbox":{"l":214.09639,"t":432.67795,"r":286.36508,"b":441.26572,"coord_origin":"1"}},{"id":26,"text":"change leads to more accurate table-content extraction and","bbox":{"l":50.111977,"t":444.63293,"r":286.36508,"b":453.2207,"coord_origin":"1"}},{"id":27,"text":"allows us to tackle non-english tables. Second, we replace","bbox":{"l":50.111977,"t":456.58792000000005,"r":286.36505,"b":465.17569,"coord_origin":"1"}},{"id":28,"text":"the LSTM decoders with transformer based decoders. This","bbox":{"l":50.111977,"t":468.54291,"r":286.36505,"b":477.13068,"coord_origin":"1"}},{"id":29,"text":"upgrade improves significantly the previous state-of-the-art","bbox":{"l":50.111977,"t":480.4989,"r":286.36508,"b":489.08667,"coord_origin":"1"}},{"id":30,"text":"tree-editing-distance-score (TEDS) from 91% to 98.5% on","bbox":{"l":50.111977,"t":492.45389,"r":286.36505,"b":501.04166,"coord_origin":"1"}},{"id":31,"text":"simple tables and from 88.7% to 95% on complex tables.","bbox":{"l":50.111977,"t":504.40887,"r":276.65152,"b":512.9966400000001,"coord_origin":"1"}}]},{"id":5,"label":"Section-header","bbox":{"l":50.111977,"t":539.1871112823486,"r":126.94804,"b":550.69049,"coord_origin":"1"},"confidence":0.9362080097198486,"cells":[{"id":32,"text":"1.","bbox":{"l":50.111977,"t":539.94276,"r":58.121296,"b":550.69049,"coord_origin":"1"}},{"id":33,"text":"Introduction","bbox":{"l":68.800385,"t":539.94276,"r":126.94804,"b":550.69049,"coord_origin":"1"}}]},{"id":6,"label":"Text","bbox":{"l":49.18265175819397,"t":559.7423080444336,"r":286.36508,"b":713.151779,"coord_origin":"1"},"confidence":0.9877704977989197,"cells":[{"id":34,"text":"The occurrence of tables in documents is ubiquitous.","bbox":{"l":62.066978,"t":560.7832,"r":286.36496,"b":569.68976,"coord_origin":"1"}},{"id":35,"text":"They often summarise quantitative or factual data, which is","bbox":{"l":50.111977,"t":572.7382,"r":286.36508,"b":581.64476,"coord_origin":"1"}},{"id":36,"text":"cumbersome to describe in verbose text but nevertheless ex-","bbox":{"l":50.111977,"t":584.69321,"r":286.36505,"b":593.5997600000001,"coord_origin":"1"}},{"id":37,"text":"tremely valuable. Unfortunately, this compact representa-","bbox":{"l":50.111977,"t":596.6492000000001,"r":286.36505,"b":605.55576,"coord_origin":"1"}},{"id":38,"text":"tion is often not easy to parse by machines. There are many","bbox":{"l":50.111977,"t":608.6042,"r":286.36505,"b":617.51076,"coord_origin":"1"}},{"id":39,"text":"implicit conventions used to obtain a compact table repre-","bbox":{"l":50.111977,"t":620.5592,"r":286.36505,"b":629.46576,"coord_origin":"1"}},{"id":40,"text":"sentation. For example, tables often have complex column-","bbox":{"l":50.111977,"t":632.51421,"r":286.36508,"b":641.42076,"coord_origin":"1"}},{"id":41,"text":"and row-headers in order to reduce duplicated cell content.","bbox":{"l":50.111977,"t":644.46921,"r":286.36508,"b":653.37576,"coord_origin":"1"}},{"id":42,"text":"Lines of different shapes and sizes are leveraged to separate","bbox":{"l":50.111977,"t":656.42421,"r":286.36502,"b":665.33077,"coord_origin":"1"}},{"id":43,"text":"content or indicate a tree structure. Additionally, tables can","bbox":{"l":50.111977,"t":668.3802000000001,"r":286.36505,"b":677.28677,"coord_origin":"1"}},{"id":44,"text":"also have empty/missing table-entries or multi-row textual","bbox":{"l":50.111977,"t":680.33521,"r":286.36505,"b":689.2417800000001,"coord_origin":"1"}},{"id":45,"text":"table-entries. Fig. 1 shows a table which presents all these","bbox":{"l":50.111977,"t":692.290207,"r":286.36505,"b":701.196777,"coord_origin":"1"}},{"id":46,"text":"issues.","bbox":{"l":50.111977,"t":704.245209,"r":76.403275,"b":713.151779,"coord_origin":"1"}}]},{"id":7,"label":"Section-header","bbox":{"l":315.3704212188721,"t":216.8857538223267,"r":408.4407,"b":226.75482,"coord_origin":"1"},"confidence":0.8334906101226807,"cells":[{"id":47,"text":"a.","bbox":{"l":315.56702,"t":218.00684,"r":324.01007,"b":226.75482,"coord_origin":"1"}},{"id":48,"text":"Picture of a table:","bbox":{"l":328.2316,"t":218.00684,"r":408.4407,"b":226.75482,"coord_origin":"1"}}]},{"id":8,"label":"List-item","bbox":{"l":315.2541000366211,"t":312.6587104797363,"r":486.40194999999994,"b":333.5001319885254,"coord_origin":"1"},"confidence":0.6923348307609558,"cells":[{"id":49,"text":"b.","bbox":{"l":315.56702,"t":313.69478999999995,"r":325.05786,"b":322.44281,"coord_origin":"1"}},{"id":50,"text":"Red-annotation of bounding boxes,","bbox":{"l":329.80325,"t":313.69478999999995,"r":486.40194999999994,"b":322.44281,"coord_origin":"1"}},{"id":51,"text":"Blue-predictions by TableFormer","bbox":{"l":326.46252,"t":324.49478,"r":472.47411999999997,"b":333.2428,"coord_origin":"1"}}]},{"id":9,"label":"List-item","bbox":{"l":315.30837936401366,"t":419.3768840789795,"r":491.1912500000001,"b":429.9300762176514,"coord_origin":"1"},"confidence":0.6545922756195068,"cells":[{"id":52,"text":"c.","bbox":{"l":315.56702,"t":420.1828,"r":324.81039,"b":428.93082,"coord_origin":"1"}},{"id":53,"text":"Structure predicted by TableFormer:","bbox":{"l":329.4321,"t":420.1828,"r":491.1912500000001,"b":428.93082,"coord_origin":"1"}}]},{"id":10,"label":"Picture","bbox":{"l":314.3844051361084,"t":338.26564750671383,"r":539.53088,"b":409.7582748413086,"coord_origin":"1"},"confidence":0.9185528755187988,"cells":[{"id":54,"text":"1","bbox":{"l":408.14752,"t":342.82828,"r":412.54001,"b":351.61322,"coord_origin":"1"}},{"id":55,"text":"0","bbox":{"l":356.11011,"t":341.57217,"r":360.50259,"b":350.35712,"coord_origin":"1"}},{"id":56,"text":"2","bbox":{"l":500.6777,"t":340.93768,"r":505.0701900000001,"b":349.7226299999999,"coord_origin":"1"}},{"id":57,"text":"3","bbox":{"l":356.13382,"t":351.74789,"r":360.52631,"b":360.53284,"coord_origin":"1"}},{"id":58,"text":"4","bbox":{"l":402.53992,"t":355.8765,"r":406.9324,"b":364.66144,"coord_origin":"1"}},{"id":59,"text":"5","bbox":{"l":448.58178999999996,"t":352.84018,"r":452.97427,"b":361.62512,"coord_origin":"1"}},{"id":60,"text":"6","bbox":{"l":491.65161000000006,"t":353.70657,"r":496.0441,"b":362.49152,"coord_origin":"1"}},{"id":61,"text":"7","bbox":{"l":535.13843,"t":353.33969,"r":539.53088,"b":362.12463,"coord_origin":"1"}},{"id":62,"text":"8","bbox":{"l":348.82822,"t":387.09781,"r":353.2207,"b":395.88275,"coord_origin":"1"}},{"id":63,"text":"9","bbox":{"l":389.27151,"t":375.37228,"r":393.664,"b":384.15723,"coord_origin":"1"}},{"id":64,"text":"10","bbox":{"l":442.67479999999995,"t":375.64621,"r":451.45889000000005,"b":384.43115,"coord_origin":"1"}},{"id":65,"text":"11","bbox":{"l":477.4382299999999,"t":375.534,"r":485.90167,"b":384.31894000000005,"coord_origin":"1"}},{"id":66,"text":"12","bbox":{"l":522.57263,"t":375.64621,"r":531.35669,"b":384.43115,"coord_origin":"1"}},{"id":67,"text":"13","bbox":{"l":400.22992,"t":387.11429,"r":409.01401,"b":395.89923,"coord_origin":"1"}},{"id":68,"text":"14","bbox":{"l":442.30792,"t":386.98981000000003,"r":451.0920100000001,"b":395.77475000000004,"coord_origin":"1"}},{"id":69,"text":"15","bbox":{"l":478.21941999999996,"t":387.37469,"r":487.00351000000006,"b":396.15964,"coord_origin":"1"}},{"id":70,"text":"16","bbox":{"l":523.2287,"t":386.98981000000003,"r":532.01276,"b":395.77475000000004,"coord_origin":"1"}},{"id":71,"text":"1","bbox":{"l":411.57233,"t":399.42477,"r":415.96481,"b":408.20972,"coord_origin":"1"}},{"id":72,"text":"7","bbox":{"l":415.96393,"t":399.42477,"r":420.35641,"b":408.20972,"coord_origin":"1"}},{"id":73,"text":"18","bbox":{"l":442.30521,"t":399.0371999999999,"r":451.08929,"b":407.82213999999993,"coord_origin":"1"}},{"id":74,"text":"19","bbox":{"l":478.77893,"t":398.99639999999994,"r":487.56302,"b":407.78133999999994,"coord_origin":"1"}},{"id":75,"text":"20","bbox":{"l":523.97241,"t":398.6114799999999,"r":532.75647,"b":407.39642,"coord_origin":"1"}},{"id":98,"text":"3","bbox":{"l":385.09399,"t":357.76030999999995,"r":391.09879,"b":367.89072,"coord_origin":"1"}},{"id":101,"text":"2","bbox":{"l":333.43451,"t":380.7265,"r":339.4393,"b":390.85689999999994,"coord_origin":"1"}},{"id":103,"text":"1","bbox":{"l":478.07210999999995,"t":341.0368000000001,"r":484.0769,"b":351.16720999999995,"coord_origin":"1"}}]},{"id":11,"label":"Table","bbox":{"l":315.6885681152344,"t":434.2295654296875,"r":536.9868450164795,"b":496.1293773651123,"coord_origin":"1"},"confidence":0.9250026941299438,"cells":[{"id":76,"text":"1","bbox":{"l":347.24872,"t":437.68588,"r":351.6412,"b":446.47083,"coord_origin":"1"}},{"id":77,"text":"0","bbox":{"l":318.88071,"t":437.68588,"r":323.27319,"b":446.47083,"coord_origin":"1"}},{"id":78,"text":"2","bbox":{"l":394.10422,"t":437.68588,"r":398.4967,"b":446.47083,"coord_origin":"1"}},{"id":79,"text":"3","bbox":{"l":318.77316,"t":449.5455,"r":323.16565,"b":458.33044,"coord_origin":"1"}},{"id":80,"text":"4","bbox":{"l":347.24872,"t":449.5455,"r":351.6412,"b":458.33044,"coord_origin":"1"}},{"id":81,"text":"5","bbox":{"l":394.10422,"t":449.5455,"r":398.4967,"b":458.33044,"coord_origin":"1"}},{"id":82,"text":"6","bbox":{"l":440.95941000000005,"t":449.5455,"r":445.3519,"b":458.33044,"coord_origin":"1"}},{"id":83,"text":"7","bbox":{"l":487.81491,"t":449.5455,"r":492.2074,"b":458.33044,"coord_origin":"1"}},{"id":84,"text":"8","bbox":{"l":318.77316,"t":473.70425,"r":323.16565,"b":482.4892,"coord_origin":"1"}},{"id":85,"text":"9","bbox":{"l":347.24872,"t":461.8446,"r":351.6412,"b":470.62955,"coord_origin":"1"}},{"id":86,"text":"10","bbox":{"l":394.10422,"t":461.8446,"r":402.88831,"b":470.62955,"coord_origin":"1"}},{"id":87,"text":"11","bbox":{"l":440.95941000000005,"t":461.8446,"r":449.42285,"b":470.62955,"coord_origin":"1"}},{"id":88,"text":"12","bbox":{"l":487.81491,"t":461.8446,"r":496.599,"b":470.62955,"coord_origin":"1"}},{"id":89,"text":"13","bbox":{"l":347.24872,"t":473.70425,"r":356.03281,"b":482.4892,"coord_origin":"1"}},{"id":90,"text":"14","bbox":{"l":394.10422,"t":473.70425,"r":402.88831,"b":482.4892,"coord_origin":"1"}},{"id":91,"text":"15","bbox":{"l":440.95941000000005,"t":473.70425,"r":449.7435,"b":482.4892,"coord_origin":"1"}},{"id":92,"text":"16","bbox":{"l":487.81491,"t":473.70425,"r":496.599,"b":482.4892,"coord_origin":"1"}},{"id":93,"text":"17","bbox":{"l":347.24872,"t":485.12469,"r":356.03281,"b":493.90964,"coord_origin":"1"}},{"id":94,"text":"18","bbox":{"l":394.10422,"t":485.12469,"r":402.88831,"b":493.90964,"coord_origin":"1"}},{"id":95,"text":"19","bbox":{"l":440.95941000000005,"t":485.12469,"r":449.7435,"b":493.90964,"coord_origin":"1"}},{"id":96,"text":"20","bbox":{"l":487.81491,"t":485.12469,"r":496.599,"b":493.90964,"coord_origin":"1"}},{"id":99,"text":"3","bbox":{"l":366.70102,"t":449.12082,"r":372.70581,"b":459.25122,"coord_origin":"1"}},{"id":102,"text":"2","bbox":{"l":331.90424,"t":473.32291,"r":337.90903,"b":483.45331,"coord_origin":"1"}},{"id":104,"text":"1","bbox":{"l":459.87621999999993,"t":437.5936,"r":465.88101,"b":447.724,"coord_origin":"1"}}]},{"id":12,"label":"Table","bbox":{"l":315.6885681152344,"t":230.98197326660159,"r":537.0928356170654,"b":302.49661788940426,"coord_origin":"1"},"confidence":0.8390322327613831,"cells":[{"id":97,"text":"1","bbox":{"l":451.9457100000001,"t":235.34704999999997,"r":457.95050000000003,"b":245.47748,"coord_origin":"1"}},{"id":100,"text":"2","bbox":{"l":331.19681,"t":269.35266,"r":337.2016,"b":279.48308999999995,"coord_origin":"1"}},{"id":105,"text":"3","bbox":{"l":384.0329,"t":252.67895999999996,"r":390.03769,"b":262.80939,"coord_origin":"1"}}]},{"id":13,"label":"Caption","bbox":{"l":307.8611972808838,"t":513.6277587890626,"r":545.11517,"b":559.2729,"coord_origin":"1"},"confidence":0.9685324430465698,"cells":[{"id":106,"text":"Figure 1:","bbox":{"l":308.862,"t":514.50037,"r":345.73361,"b":523.40692,"coord_origin":"1"}},{"id":107,"text":"Picture of a table with subtle, complex features","bbox":{"l":353.17566,"t":514.50037,"r":545.11511,"b":523.40692,"coord_origin":"1"}},{"id":108,"text":"such as (1) multi-column headers, (2) cell with multi-row","bbox":{"l":308.862,"t":526.45535,"r":545.11511,"b":535.3619100000001,"coord_origin":"1"}},{"id":109,"text":"text and (3) cells with no content. Image from PubTabNet","bbox":{"l":308.862,"t":538.41035,"r":545.11517,"b":547.31691,"coord_origin":"1"}},{"id":110,"text":"evaluation set, filename: ‘PMC2944238 004 02’.","bbox":{"l":308.862,"t":550.36635,"r":505.6917700000001,"b":559.2729,"coord_origin":"1"}}]},{"id":14,"label":"Text","bbox":{"l":307.8420244216919,"t":583.5986251831055,"r":545.50438041687,"b":665.04693,"coord_origin":"1"},"confidence":0.9861506223678589,"cells":[{"id":111,"text":"Recently, significant progress has been made with vi-","bbox":{"l":320.81699,"t":584.40936,"r":545.11493,"b":593.31592,"coord_origin":"1"}},{"id":112,"text":"sion based approaches to extract tables in documents. For","bbox":{"l":308.862,"t":596.36436,"r":545.11517,"b":605.2709199999999,"coord_origin":"1"}},{"id":113,"text":"the sake of completeness, the issue of table extraction from","bbox":{"l":308.862,"t":608.31937,"r":545.11511,"b":617.22592,"coord_origin":"1"}},{"id":114,"text":"documents is typically decomposed into two separate chal-","bbox":{"l":308.862,"t":620.27437,"r":545.11505,"b":629.18092,"coord_origin":"1"}},{"id":115,"text":"lenges, i.e.","bbox":{"l":308.862,"t":632.23036,"r":353.6937,"b":641.13692,"coord_origin":"1"}},{"id":116,"text":"(1)","bbox":{"l":362.11209,"t":632.23036,"r":374.66617,"b":641.13692,"coord_origin":"1"}},{"id":117,"text":"finding the location of the table(s) on a","bbox":{"l":377.35785,"t":632.23036,"r":545.11505,"b":641.13692,"coord_origin":"1"}},{"id":118,"text":"document-page and (2) finding the structure of a given table","bbox":{"l":308.862,"t":644.18536,"r":545.11517,"b":653.09192,"coord_origin":"1"}},{"id":119,"text":"in the document.","bbox":{"l":308.862,"t":656.14037,"r":375.55167,"b":665.04693,"coord_origin":"1"}}]},{"id":15,"label":"Text","bbox":{"l":307.9032199859619,"t":667.6026237487794,"r":545.4091873168945,"b":713.8419502258301,"coord_origin":"1"},"confidence":0.9836903810501099,"cells":[{"id":120,"text":"The first problem is called table-location and has been","bbox":{"l":320.81699,"t":668.38036,"r":545.11493,"b":677.28693,"coord_origin":"1"}},{"id":121,"text":"previously addressed [30, 38, 19, 21, 23, 26, 8] with state-","bbox":{"l":308.862,"t":680.33536,"r":545.11511,"b":689.24193,"coord_origin":"1"}},{"id":122,"text":"of-the-art object-detection networks (e.g. YOLO and later","bbox":{"l":308.862,"t":692.290359,"r":545.11511,"b":701.19693,"coord_origin":"1"}},{"id":123,"text":"on Mask-RCNN [9]). For all practical purposes, it can be","bbox":{"l":308.862,"t":704.245361,"r":545.11499,"b":713.151932,"coord_origin":"1"}}]},{"id":16,"label":"Page-footer","bbox":{"l":295.121,"t":733.3784980773926,"r":300.10229,"b":743.039928,"coord_origin":"1"},"confidence":0.8475339412689209,"cells":[{"id":124,"text":"1","bbox":{"l":295.121,"t":734.133366,"r":300.10229,"b":743.039928,"coord_origin":"1"}}]},{"id":17,"label":"Page-header","bbox":{"l":16.783903062343597,"t":207.82001000000002,"r":36.339779,"b":560.00003,"coord_origin":"1"},"confidence":0.8388436436653137,"cells":[{"id":125,"text":"arXiv:2203.01017v2 [cs.CV] 11 Mar 2022","bbox":{"l":18.340221,"t":207.82001000000002,"r":36.339779,"b":560.00003,"coord_origin":"1"}}]}]},"tablestructure":{"table_map":{"11":{"label":"Table","id":11,"page_no":0,"cluster":{"id":11,"label":"Table","bbox":{"l":315.6885681152344,"t":434.2295654296875,"r":536.9868450164795,"b":496.1293773651123,"coord_origin":"1"},"confidence":0.9250026941299438,"cells":[{"id":76,"text":"1","bbox":{"l":347.24872,"t":437.68588,"r":351.6412,"b":446.47083,"coord_origin":"1"}},{"id":77,"text":"0","bbox":{"l":318.88071,"t":437.68588,"r":323.27319,"b":446.47083,"coord_origin":"1"}},{"id":78,"text":"2","bbox":{"l":394.10422,"t":437.68588,"r":398.4967,"b":446.47083,"coord_origin":"1"}},{"id":79,"text":"3","bbox":{"l":318.77316,"t":449.5455,"r":323.16565,"b":458.33044,"coord_origin":"1"}},{"id":80,"text":"4","bbox":{"l":347.24872,"t":449.5455,"r":351.6412,"b":458.33044,"coord_origin":"1"}},{"id":81,"text":"5","bbox":{"l":394.10422,"t":449.5455,"r":398.4967,"b":458.33044,"coord_origin":"1"}},{"id":82,"text":"6","bbox":{"l":440.95941000000005,"t":449.5455,"r":445.3519,"b":458.33044,"coord_origin":"1"}},{"id":83,"text":"7","bbox":{"l":487.81491,"t":449.5455,"r":492.2074,"b":458.33044,"coord_origin":"1"}},{"id":84,"text":"8","bbox":{"l":318.77316,"t":473.70425,"r":323.16565,"b":482.4892,"coord_origin":"1"}},{"id":85,"text":"9","bbox":{"l":347.24872,"t":461.8446,"r":351.6412,"b":470.62955,"coord_origin":"1"}},{"id":86,"text":"10","bbox":{"l":394.10422,"t":461.8446,"r":402.88831,"b":470.62955,"coord_origin":"1"}},{"id":87,"text":"11","bbox":{"l":440.95941000000005,"t":461.8446,"r":449.42285,"b":470.62955,"coord_origin":"1"}},{"id":88,"text":"12","bbox":{"l":487.81491,"t":461.8446,"r":496.599,"b":470.62955,"coord_origin":"1"}},{"id":89,"text":"13","bbox":{"l":347.24872,"t":473.70425,"r":356.03281,"b":482.4892,"coord_origin":"1"}},{"id":90,"text":"14","bbox":{"l":394.10422,"t":473.70425,"r":402.88831,"b":482.4892,"coord_origin":"1"}},{"id":91,"text":"15","bbox":{"l":440.95941000000005,"t":473.70425,"r":449.7435,"b":482.4892,"coord_origin":"1"}},{"id":92,"text":"16","bbox":{"l":487.81491,"t":473.70425,"r":496.599,"b":482.4892,"coord_origin":"1"}},{"id":93,"text":"17","bbox":{"l":347.24872,"t":485.12469,"r":356.03281,"b":493.90964,"coord_origin":"1"}},{"id":94,"text":"18","bbox":{"l":394.10422,"t":485.12469,"r":402.88831,"b":493.90964,"coord_origin":"1"}},{"id":95,"text":"19","bbox":{"l":440.95941000000005,"t":485.12469,"r":449.7435,"b":493.90964,"coord_origin":"1"}},{"id":96,"text":"20","bbox":{"l":487.81491,"t":485.12469,"r":496.599,"b":493.90964,"coord_origin":"1"}},{"id":99,"text":"3","bbox":{"l":366.70102,"t":449.12082,"r":372.70581,"b":459.25122,"coord_origin":"1"}},{"id":102,"text":"2","bbox":{"l":331.90424,"t":473.32291,"r":337.90903,"b":483.45331,"coord_origin":"1"}},{"id":104,"text":"1","bbox":{"l":459.87621999999993,"t":437.5936,"r":465.88101,"b":447.724,"coord_origin":"1"}}]},"text":null,"otsl_seq":["ched","ched","lcel","ched","lcel","ched","nl","fcel","fcel","fcel","fcel","fcel","fcel","nl","fcel","fcel","fcel","fcel","fcel","fcel","nl","fcel","fcel","fcel","fcel","fcel","ucel","nl","fcel","fcel","fcel","fcel","fcel","ucel","nl"],"num_rows":5,"num_cols":6,"table_cells":[{"bbox":{"l":347.24872,"t":437.68588,"r":351.6412,"b":446.47083,"coord_origin":"1"},"row_span":1,"col_span":2,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":1,"end_col_offset_idx":3,"text":"1","column_header":true,"row_header":false,"row_section":false},{"bbox":{"l":318.88071,"t":437.68588,"r":323.27319,"b":446.47083,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"0","column_header":true,"row_header":false,"row_section":false},{"bbox":{"l":394.10422,"t":437.5936,"r":465.88101,"b":447.724,"coord_origin":"1"},"row_span":1,"col_span":2,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":3,"end_col_offset_idx":5,"text":"2 1","column_header":true,"row_header":false,"row_section":false},{"bbox":{"l":318.77316,"t":449.5455,"r":323.16565,"b":458.33044,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":347.24872,"t":449.5455,"r":351.6412,"b":458.33044,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"4","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":366.70102,"t":449.12082,"r":398.4967,"b":459.25122,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"5 3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":440.95941000000005,"t":449.5455,"r":445.3519,"b":458.33044,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"6","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":487.81491,"t":449.5455,"r":492.2074,"b":458.33044,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"7","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":318.77316,"t":473.70425,"r":323.16565,"b":482.4892,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"8","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":347.24872,"t":461.8446,"r":351.6412,"b":470.62955,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"9","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":394.10422,"t":461.8446,"r":402.88831,"b":470.62955,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"10","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":440.95941000000005,"t":461.8446,"r":449.42285,"b":470.62955,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"11","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":487.81491,"t":461.8446,"r":496.599,"b":470.62955,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"12","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":347.24872,"t":473.70425,"r":356.03281,"b":482.4892,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"13","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":394.10422,"t":473.70425,"r":402.88831,"b":482.4892,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"14","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":440.95941000000005,"t":473.70425,"r":449.7435,"b":482.4892,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"15","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":487.81491,"t":473.70425,"r":496.599,"b":482.4892,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"16","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":347.24872,"t":485.12469,"r":356.03281,"b":493.90964,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"17","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":394.10422,"t":485.12469,"r":402.88831,"b":493.90964,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"18","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":440.95941000000005,"t":485.12469,"r":449.7435,"b":493.90964,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"19","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":487.81491,"t":485.12469,"r":496.599,"b":493.90964,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"20","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":331.90424,"t":473.32291,"r":337.90903,"b":483.45331,"coord_origin":"1"},"row_span":3,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":5,"start_col_offset_idx":5,"end_col_offset_idx":6,"text":"2","column_header":false,"row_header":false,"row_section":false}]},"12":{"label":"Table","id":12,"page_no":0,"cluster":{"id":12,"label":"Table","bbox":{"l":315.6885681152344,"t":230.98197326660159,"r":537.0928356170654,"b":302.49661788940426,"coord_origin":"1"},"confidence":0.8390322327613831,"cells":[{"id":97,"text":"1","bbox":{"l":451.9457100000001,"t":235.34704999999997,"r":457.95050000000003,"b":245.47748,"coord_origin":"1"}},{"id":100,"text":"2","bbox":{"l":331.19681,"t":269.35266,"r":337.2016,"b":279.48308999999995,"coord_origin":"1"}},{"id":105,"text":"3","bbox":{"l":384.0329,"t":252.67895999999996,"r":390.03769,"b":262.80939,"coord_origin":"1"}}]},"text":null,"otsl_seq":["ecel","ched","ched","ched","ched","nl","rhed","rhed","fcel","fcel","fcel","nl","rhed","rhed","fcel","fcel","fcel","nl","rhed","rhed","fcel","fcel","fcel","nl"],"num_rows":2,"num_cols":3,"table_cells":[{"bbox":{"l":451.9457100000001,"t":235.34704999999997,"r":457.95050000000003,"b":245.47748,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"1","column_header":true,"row_header":false,"row_section":false},{"bbox":{"l":331.19681,"t":269.35266,"r":337.2016,"b":279.48308999999995,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"2","column_header":false,"row_header":true,"row_section":false},{"bbox":{"l":384.0329,"t":252.67895999999996,"r":390.03769,"b":262.80939,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"3","column_header":true,"row_header":false,"row_section":false}]}}},"figures_classification":null,"equations_prediction":null},"assembled":{"elements":[{"label":"Section-header","id":0,"page_no":0,"cluster":{"id":0,"label":"Section-header","bbox":{"l":95.52632689476013,"t":106.14017257690432,"r":498.92708999999996,"b":119.93133999999998,"coord_origin":"1"},"confidence":0.8527692556381226,"cells":[{"id":0,"text":"TableFormer: Table Structure Understanding with Transformers.","bbox":{"l":96.301003,"t":107.03412000000003,"r":498.92708999999996,"b":119.93133999999998,"coord_origin":"1"}}]},"text":"TableFormer: Table Structure Understanding with Transformers."},{"label":"Section-header","id":1,"page_no":0,"cluster":{"id":1,"label":"Section-header","bbox":{"l":141.7966592788696,"t":145.70029220581057,"r":453.00207595825196,"b":171.32037000000003,"coord_origin":"1"},"confidence":0.8149818778038025,"cells":[{"id":1,"text":"Ahmed Nassar, Nikolaos Livathinos, Maksym Lysak, Peter Staar","bbox":{"l":142.47701,"t":146.68535999999995,"r":452.75027,"b":157.37334999999996,"coord_origin":"1"}},{"id":2,"text":"IBM Research","bbox":{"l":262.918,"t":160.63239,"r":332.30597,"b":171.32037000000003,"coord_origin":"1"}}]},"text":"Ahmed Nassar, Nikolaos Livathinos, Maksym Lysak, Peter Staar IBM Research"},{"label":"Text","id":2,"page_no":0,"cluster":{"id":2,"label":"Text","bbox":{"l":208.123,"t":175.47459583282466,"r":379.3107976913452,"b":185.46755561828616,"coord_origin":"1"},"confidence":0.8852415084838867,"cells":[{"id":3,"text":"{","bbox":{"l":208.123,"t":175.96123999999998,"r":212.73083,"b":184.42553999999996,"coord_origin":"1"}},{"id":4,"text":"ahn,nli,mly,taa","bbox":{"l":212.73,"t":177.08203000000003,"r":293.42761,"b":184.00409000000002,"coord_origin":"1"}},{"id":5,"text":"}","bbox":{"l":293.42798,"t":175.96123999999998,"r":298.0358,"b":184.42553999999996,"coord_origin":"1"}},{"id":6,"text":"@zurich.ibm.com","bbox":{"l":298.03497,"t":177.08203000000003,"r":378.73257,"b":184.00409000000002,"coord_origin":"1"}}]},"text":"{ ahn,nli,mly,taa } @zurich.ibm.com"},{"label":"Section-header","id":3,"page_no":0,"cluster":{"id":3,"label":"Section-header","bbox":{"l":145.0311819076538,"t":215.15237388610842,"r":190.65908489227294,"b":226.23071000000004,"coord_origin":"1"},"confidence":0.9270482659339905,"cells":[{"id":7,"text":"Abstract","bbox":{"l":145.99498,"t":215.48297000000002,"r":190.48029,"b":226.23071000000004,"coord_origin":"1"}}]},"text":"Abstract"},{"label":"Text","id":4,"page_no":0,"cluster":{"id":4,"label":"Text","bbox":{"l":48.88530099391937,"t":240.16773319244385,"r":286.7518209457398,"b":514.1875190734863,"coord_origin":"1"},"confidence":0.987241268157959,"cells":[{"id":8,"text":"Tables organize valuable content in a concise and com-","bbox":{"l":62.066978,"t":241.39508,"r":286.36493,"b":249.98284999999998,"coord_origin":"1"}},{"id":9,"text":"pact representation. This content is extremely valuable for","bbox":{"l":50.111977,"t":253.3501,"r":286.36508,"b":261.93787,"coord_origin":"1"}},{"id":10,"text":"systems such as search engines, Knowledge Graph’s, etc,","bbox":{"l":50.111977,"t":265.30511,"r":286.36508,"b":273.89288,"coord_origin":"1"}},{"id":11,"text":"since they enhance their predictive capabilities. Unfortu-","bbox":{"l":50.111977,"t":277.26111000000003,"r":286.36505,"b":285.84888,"coord_origin":"1"}},{"id":12,"text":"nately, tables come in a large variety of shapes and sizes.","bbox":{"l":50.111977,"t":289.21609,"r":286.36505,"b":297.80386,"coord_origin":"1"}},{"id":13,"text":"Furthermore, they can have complex column/row-header","bbox":{"l":50.111977,"t":301.17108,"r":286.36505,"b":309.75884999999994,"coord_origin":"1"}},{"id":14,"text":"configurations, multiline rows, different variety of separa-","bbox":{"l":50.111977,"t":313.12607,"r":286.36508,"b":321.71384,"coord_origin":"1"}},{"id":15,"text":"tion lines, missing entries, etc. As such, the correct iden-","bbox":{"l":50.111977,"t":325.08105,"r":286.36508,"b":333.66882,"coord_origin":"1"}},{"id":16,"text":"tification of the table-structure from an image is a non-","bbox":{"l":50.111977,"t":337.03604,"r":286.36505,"b":345.62381,"coord_origin":"1"}},{"id":17,"text":"trivial task. In this paper, we present a new table-structure","bbox":{"l":50.111977,"t":348.99203,"r":286.36508,"b":357.5798,"coord_origin":"1"}},{"id":18,"text":"identification model. The latter improves the latest end-to-","bbox":{"l":50.111977,"t":360.94701999999995,"r":286.36505,"b":369.53479,"coord_origin":"1"}},{"id":19,"text":"end deep learning model (i.e. encoder-dual-decoder from","bbox":{"l":50.111977,"t":372.90201,"r":286.36508,"b":381.48978,"coord_origin":"1"}},{"id":20,"text":"PubTabNet) in two significant ways. First, we introduce a","bbox":{"l":50.111977,"t":384.85699,"r":286.36505,"b":393.44476,"coord_origin":"1"}},{"id":21,"text":"new object detection decoder for table-cells. In this way,","bbox":{"l":50.111977,"t":396.81198,"r":286.36511,"b":405.39975000000004,"coord_origin":"1"}},{"id":22,"text":"we can obtain the content of the table-cells from program-","bbox":{"l":50.111977,"t":408.76697,"r":286.36508,"b":417.35474,"coord_origin":"1"}},{"id":23,"text":"matic PDF’s directly from the PDF source and avoid the","bbox":{"l":50.111977,"t":420.72296000000006,"r":286.36505,"b":429.31073,"coord_origin":"1"}},{"id":24,"text":"training of the custom OCR decoders.","bbox":{"l":50.111977,"t":432.67795,"r":207.23216,"b":441.26572,"coord_origin":"1"}},{"id":25,"text":"This architectural","bbox":{"l":214.09639,"t":432.67795,"r":286.36508,"b":441.26572,"coord_origin":"1"}},{"id":26,"text":"change leads to more accurate table-content extraction and","bbox":{"l":50.111977,"t":444.63293,"r":286.36508,"b":453.2207,"coord_origin":"1"}},{"id":27,"text":"allows us to tackle non-english tables. Second, we replace","bbox":{"l":50.111977,"t":456.58792000000005,"r":286.36505,"b":465.17569,"coord_origin":"1"}},{"id":28,"text":"the LSTM decoders with transformer based decoders. This","bbox":{"l":50.111977,"t":468.54291,"r":286.36505,"b":477.13068,"coord_origin":"1"}},{"id":29,"text":"upgrade improves significantly the previous state-of-the-art","bbox":{"l":50.111977,"t":480.4989,"r":286.36508,"b":489.08667,"coord_origin":"1"}},{"id":30,"text":"tree-editing-distance-score (TEDS) from 91% to 98.5% on","bbox":{"l":50.111977,"t":492.45389,"r":286.36505,"b":501.04166,"coord_origin":"1"}},{"id":31,"text":"simple tables and from 88.7% to 95% on complex tables.","bbox":{"l":50.111977,"t":504.40887,"r":276.65152,"b":512.9966400000001,"coord_origin":"1"}}]},"text":"Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph’s, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF’s directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables."},{"label":"Section-header","id":5,"page_no":0,"cluster":{"id":5,"label":"Section-header","bbox":{"l":50.111977,"t":539.1871112823486,"r":126.94804,"b":550.69049,"coord_origin":"1"},"confidence":0.9362080097198486,"cells":[{"id":32,"text":"1.","bbox":{"l":50.111977,"t":539.94276,"r":58.121296,"b":550.69049,"coord_origin":"1"}},{"id":33,"text":"Introduction","bbox":{"l":68.800385,"t":539.94276,"r":126.94804,"b":550.69049,"coord_origin":"1"}}]},"text":"1. Introduction"},{"label":"Text","id":6,"page_no":0,"cluster":{"id":6,"label":"Text","bbox":{"l":49.18265175819397,"t":559.7423080444336,"r":286.36508,"b":713.151779,"coord_origin":"1"},"confidence":0.9877704977989197,"cells":[{"id":34,"text":"The occurrence of tables in documents is ubiquitous.","bbox":{"l":62.066978,"t":560.7832,"r":286.36496,"b":569.68976,"coord_origin":"1"}},{"id":35,"text":"They often summarise quantitative or factual data, which is","bbox":{"l":50.111977,"t":572.7382,"r":286.36508,"b":581.64476,"coord_origin":"1"}},{"id":36,"text":"cumbersome to describe in verbose text but nevertheless ex-","bbox":{"l":50.111977,"t":584.69321,"r":286.36505,"b":593.5997600000001,"coord_origin":"1"}},{"id":37,"text":"tremely valuable. Unfortunately, this compact representa-","bbox":{"l":50.111977,"t":596.6492000000001,"r":286.36505,"b":605.55576,"coord_origin":"1"}},{"id":38,"text":"tion is often not easy to parse by machines. There are many","bbox":{"l":50.111977,"t":608.6042,"r":286.36505,"b":617.51076,"coord_origin":"1"}},{"id":39,"text":"implicit conventions used to obtain a compact table repre-","bbox":{"l":50.111977,"t":620.5592,"r":286.36505,"b":629.46576,"coord_origin":"1"}},{"id":40,"text":"sentation. For example, tables often have complex column-","bbox":{"l":50.111977,"t":632.51421,"r":286.36508,"b":641.42076,"coord_origin":"1"}},{"id":41,"text":"and row-headers in order to reduce duplicated cell content.","bbox":{"l":50.111977,"t":644.46921,"r":286.36508,"b":653.37576,"coord_origin":"1"}},{"id":42,"text":"Lines of different shapes and sizes are leveraged to separate","bbox":{"l":50.111977,"t":656.42421,"r":286.36502,"b":665.33077,"coord_origin":"1"}},{"id":43,"text":"content or indicate a tree structure. Additionally, tables can","bbox":{"l":50.111977,"t":668.3802000000001,"r":286.36505,"b":677.28677,"coord_origin":"1"}},{"id":44,"text":"also have empty/missing table-entries or multi-row textual","bbox":{"l":50.111977,"t":680.33521,"r":286.36505,"b":689.2417800000001,"coord_origin":"1"}},{"id":45,"text":"table-entries. Fig. 1 shows a table which presents all these","bbox":{"l":50.111977,"t":692.290207,"r":286.36505,"b":701.196777,"coord_origin":"1"}},{"id":46,"text":"issues.","bbox":{"l":50.111977,"t":704.245209,"r":76.403275,"b":713.151779,"coord_origin":"1"}}]},"text":"The occurrence of tables in documents is ubiquitous. They often summarise quantitative or factual data, which is cumbersome to describe in verbose text but nevertheless extremely valuable. Unfortunately, this compact representation is often not easy to parse by machines. There are many implicit conventions used to obtain a compact table representation. For example, tables often have complex columnand row-headers in order to reduce duplicated cell content. Lines of different shapes and sizes are leveraged to separate content or indicate a tree structure. Additionally, tables can also have empty/missing table-entries or multi-row textual table-entries. Fig. 1 shows a table which presents all these issues."},{"label":"Section-header","id":7,"page_no":0,"cluster":{"id":7,"label":"Section-header","bbox":{"l":315.3704212188721,"t":216.8857538223267,"r":408.4407,"b":226.75482,"coord_origin":"1"},"confidence":0.8334906101226807,"cells":[{"id":47,"text":"a.","bbox":{"l":315.56702,"t":218.00684,"r":324.01007,"b":226.75482,"coord_origin":"1"}},{"id":48,"text":"Picture of a table:","bbox":{"l":328.2316,"t":218.00684,"r":408.4407,"b":226.75482,"coord_origin":"1"}}]},"text":"a. Picture of a table:"},{"label":"List-item","id":8,"page_no":0,"cluster":{"id":8,"label":"List-item","bbox":{"l":315.2541000366211,"t":312.6587104797363,"r":486.40194999999994,"b":333.5001319885254,"coord_origin":"1"},"confidence":0.6923348307609558,"cells":[{"id":49,"text":"b.","bbox":{"l":315.56702,"t":313.69478999999995,"r":325.05786,"b":322.44281,"coord_origin":"1"}},{"id":50,"text":"Red-annotation of bounding boxes,","bbox":{"l":329.80325,"t":313.69478999999995,"r":486.40194999999994,"b":322.44281,"coord_origin":"1"}},{"id":51,"text":"Blue-predictions by TableFormer","bbox":{"l":326.46252,"t":324.49478,"r":472.47411999999997,"b":333.2428,"coord_origin":"1"}}]},"text":"b. Red-annotation of bounding boxes, Blue-predictions by TableFormer"},{"label":"List-item","id":9,"page_no":0,"cluster":{"id":9,"label":"List-item","bbox":{"l":315.30837936401366,"t":419.3768840789795,"r":491.1912500000001,"b":429.9300762176514,"coord_origin":"1"},"confidence":0.6545922756195068,"cells":[{"id":52,"text":"c.","bbox":{"l":315.56702,"t":420.1828,"r":324.81039,"b":428.93082,"coord_origin":"1"}},{"id":53,"text":"Structure predicted by TableFormer:","bbox":{"l":329.4321,"t":420.1828,"r":491.1912500000001,"b":428.93082,"coord_origin":"1"}}]},"text":"c. Structure predicted by TableFormer:"},{"label":"Picture","id":10,"page_no":0,"cluster":{"id":10,"label":"Picture","bbox":{"l":314.3844051361084,"t":338.26564750671383,"r":539.53088,"b":409.7582748413086,"coord_origin":"1"},"confidence":0.9185528755187988,"cells":[{"id":54,"text":"1","bbox":{"l":408.14752,"t":342.82828,"r":412.54001,"b":351.61322,"coord_origin":"1"}},{"id":55,"text":"0","bbox":{"l":356.11011,"t":341.57217,"r":360.50259,"b":350.35712,"coord_origin":"1"}},{"id":56,"text":"2","bbox":{"l":500.6777,"t":340.93768,"r":505.0701900000001,"b":349.7226299999999,"coord_origin":"1"}},{"id":57,"text":"3","bbox":{"l":356.13382,"t":351.74789,"r":360.52631,"b":360.53284,"coord_origin":"1"}},{"id":58,"text":"4","bbox":{"l":402.53992,"t":355.8765,"r":406.9324,"b":364.66144,"coord_origin":"1"}},{"id":59,"text":"5","bbox":{"l":448.58178999999996,"t":352.84018,"r":452.97427,"b":361.62512,"coord_origin":"1"}},{"id":60,"text":"6","bbox":{"l":491.65161000000006,"t":353.70657,"r":496.0441,"b":362.49152,"coord_origin":"1"}},{"id":61,"text":"7","bbox":{"l":535.13843,"t":353.33969,"r":539.53088,"b":362.12463,"coord_origin":"1"}},{"id":62,"text":"8","bbox":{"l":348.82822,"t":387.09781,"r":353.2207,"b":395.88275,"coord_origin":"1"}},{"id":63,"text":"9","bbox":{"l":389.27151,"t":375.37228,"r":393.664,"b":384.15723,"coord_origin":"1"}},{"id":64,"text":"10","bbox":{"l":442.67479999999995,"t":375.64621,"r":451.45889000000005,"b":384.43115,"coord_origin":"1"}},{"id":65,"text":"11","bbox":{"l":477.4382299999999,"t":375.534,"r":485.90167,"b":384.31894000000005,"coord_origin":"1"}},{"id":66,"text":"12","bbox":{"l":522.57263,"t":375.64621,"r":531.35669,"b":384.43115,"coord_origin":"1"}},{"id":67,"text":"13","bbox":{"l":400.22992,"t":387.11429,"r":409.01401,"b":395.89923,"coord_origin":"1"}},{"id":68,"text":"14","bbox":{"l":442.30792,"t":386.98981000000003,"r":451.0920100000001,"b":395.77475000000004,"coord_origin":"1"}},{"id":69,"text":"15","bbox":{"l":478.21941999999996,"t":387.37469,"r":487.00351000000006,"b":396.15964,"coord_origin":"1"}},{"id":70,"text":"16","bbox":{"l":523.2287,"t":386.98981000000003,"r":532.01276,"b":395.77475000000004,"coord_origin":"1"}},{"id":71,"text":"1","bbox":{"l":411.57233,"t":399.42477,"r":415.96481,"b":408.20972,"coord_origin":"1"}},{"id":72,"text":"7","bbox":{"l":415.96393,"t":399.42477,"r":420.35641,"b":408.20972,"coord_origin":"1"}},{"id":73,"text":"18","bbox":{"l":442.30521,"t":399.0371999999999,"r":451.08929,"b":407.82213999999993,"coord_origin":"1"}},{"id":74,"text":"19","bbox":{"l":478.77893,"t":398.99639999999994,"r":487.56302,"b":407.78133999999994,"coord_origin":"1"}},{"id":75,"text":"20","bbox":{"l":523.97241,"t":398.6114799999999,"r":532.75647,"b":407.39642,"coord_origin":"1"}},{"id":98,"text":"3","bbox":{"l":385.09399,"t":357.76030999999995,"r":391.09879,"b":367.89072,"coord_origin":"1"}},{"id":101,"text":"2","bbox":{"l":333.43451,"t":380.7265,"r":339.4393,"b":390.85689999999994,"coord_origin":"1"}},{"id":103,"text":"1","bbox":{"l":478.07210999999995,"t":341.0368000000001,"r":484.0769,"b":351.16720999999995,"coord_origin":"1"}}]},"text":"","data":null,"provenance":null,"predicted_class":null,"confidence":null},{"label":"Table","id":11,"page_no":0,"cluster":{"id":11,"label":"Table","bbox":{"l":315.6885681152344,"t":434.2295654296875,"r":536.9868450164795,"b":496.1293773651123,"coord_origin":"1"},"confidence":0.9250026941299438,"cells":[{"id":76,"text":"1","bbox":{"l":347.24872,"t":437.68588,"r":351.6412,"b":446.47083,"coord_origin":"1"}},{"id":77,"text":"0","bbox":{"l":318.88071,"t":437.68588,"r":323.27319,"b":446.47083,"coord_origin":"1"}},{"id":78,"text":"2","bbox":{"l":394.10422,"t":437.68588,"r":398.4967,"b":446.47083,"coord_origin":"1"}},{"id":79,"text":"3","bbox":{"l":318.77316,"t":449.5455,"r":323.16565,"b":458.33044,"coord_origin":"1"}},{"id":80,"text":"4","bbox":{"l":347.24872,"t":449.5455,"r":351.6412,"b":458.33044,"coord_origin":"1"}},{"id":81,"text":"5","bbox":{"l":394.10422,"t":449.5455,"r":398.4967,"b":458.33044,"coord_origin":"1"}},{"id":82,"text":"6","bbox":{"l":440.95941000000005,"t":449.5455,"r":445.3519,"b":458.33044,"coord_origin":"1"}},{"id":83,"text":"7","bbox":{"l":487.81491,"t":449.5455,"r":492.2074,"b":458.33044,"coord_origin":"1"}},{"id":84,"text":"8","bbox":{"l":318.77316,"t":473.70425,"r":323.16565,"b":482.4892,"coord_origin":"1"}},{"id":85,"text":"9","bbox":{"l":347.24872,"t":461.8446,"r":351.6412,"b":470.62955,"coord_origin":"1"}},{"id":86,"text":"10","bbox":{"l":394.10422,"t":461.8446,"r":402.88831,"b":470.62955,"coord_origin":"1"}},{"id":87,"text":"11","bbox":{"l":440.95941000000005,"t":461.8446,"r":449.42285,"b":470.62955,"coord_origin":"1"}},{"id":88,"text":"12","bbox":{"l":487.81491,"t":461.8446,"r":496.599,"b":470.62955,"coord_origin":"1"}},{"id":89,"text":"13","bbox":{"l":347.24872,"t":473.70425,"r":356.03281,"b":482.4892,"coord_origin":"1"}},{"id":90,"text":"14","bbox":{"l":394.10422,"t":473.70425,"r":402.88831,"b":482.4892,"coord_origin":"1"}},{"id":91,"text":"15","bbox":{"l":440.95941000000005,"t":473.70425,"r":449.7435,"b":482.4892,"coord_origin":"1"}},{"id":92,"text":"16","bbox":{"l":487.81491,"t":473.70425,"r":496.599,"b":482.4892,"coord_origin":"1"}},{"id":93,"text":"17","bbox":{"l":347.24872,"t":485.12469,"r":356.03281,"b":493.90964,"coord_origin":"1"}},{"id":94,"text":"18","bbox":{"l":394.10422,"t":485.12469,"r":402.88831,"b":493.90964,"coord_origin":"1"}},{"id":95,"text":"19","bbox":{"l":440.95941000000005,"t":485.12469,"r":449.7435,"b":493.90964,"coord_origin":"1"}},{"id":96,"text":"20","bbox":{"l":487.81491,"t":485.12469,"r":496.599,"b":493.90964,"coord_origin":"1"}},{"id":99,"text":"3","bbox":{"l":366.70102,"t":449.12082,"r":372.70581,"b":459.25122,"coord_origin":"1"}},{"id":102,"text":"2","bbox":{"l":331.90424,"t":473.32291,"r":337.90903,"b":483.45331,"coord_origin":"1"}},{"id":104,"text":"1","bbox":{"l":459.87621999999993,"t":437.5936,"r":465.88101,"b":447.724,"coord_origin":"1"}}]},"text":null,"otsl_seq":["ched","ched","lcel","ched","lcel","ched","nl","fcel","fcel","fcel","fcel","fcel","fcel","nl","fcel","fcel","fcel","fcel","fcel","fcel","nl","fcel","fcel","fcel","fcel","fcel","ucel","nl","fcel","fcel","fcel","fcel","fcel","ucel","nl"],"num_rows":5,"num_cols":6,"table_cells":[{"bbox":{"l":347.24872,"t":437.68588,"r":351.6412,"b":446.47083,"coord_origin":"1"},"row_span":1,"col_span":2,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":1,"end_col_offset_idx":3,"text":"1","column_header":true,"row_header":false,"row_section":false},{"bbox":{"l":318.88071,"t":437.68588,"r":323.27319,"b":446.47083,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"0","column_header":true,"row_header":false,"row_section":false},{"bbox":{"l":394.10422,"t":437.5936,"r":465.88101,"b":447.724,"coord_origin":"1"},"row_span":1,"col_span":2,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":3,"end_col_offset_idx":5,"text":"2 1","column_header":true,"row_header":false,"row_section":false},{"bbox":{"l":318.77316,"t":449.5455,"r":323.16565,"b":458.33044,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":347.24872,"t":449.5455,"r":351.6412,"b":458.33044,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"4","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":366.70102,"t":449.12082,"r":398.4967,"b":459.25122,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"5 3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":440.95941000000005,"t":449.5455,"r":445.3519,"b":458.33044,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"6","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":487.81491,"t":449.5455,"r":492.2074,"b":458.33044,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"7","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":318.77316,"t":473.70425,"r":323.16565,"b":482.4892,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"8","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":347.24872,"t":461.8446,"r":351.6412,"b":470.62955,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"9","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":394.10422,"t":461.8446,"r":402.88831,"b":470.62955,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"10","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":440.95941000000005,"t":461.8446,"r":449.42285,"b":470.62955,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"11","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":487.81491,"t":461.8446,"r":496.599,"b":470.62955,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"12","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":347.24872,"t":473.70425,"r":356.03281,"b":482.4892,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"13","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":394.10422,"t":473.70425,"r":402.88831,"b":482.4892,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"14","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":440.95941000000005,"t":473.70425,"r":449.7435,"b":482.4892,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"15","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":487.81491,"t":473.70425,"r":496.599,"b":482.4892,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"16","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":347.24872,"t":485.12469,"r":356.03281,"b":493.90964,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"17","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":394.10422,"t":485.12469,"r":402.88831,"b":493.90964,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"18","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":440.95941000000005,"t":485.12469,"r":449.7435,"b":493.90964,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"19","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":487.81491,"t":485.12469,"r":496.599,"b":493.90964,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"20","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":331.90424,"t":473.32291,"r":337.90903,"b":483.45331,"coord_origin":"1"},"row_span":3,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":5,"start_col_offset_idx":5,"end_col_offset_idx":6,"text":"2","column_header":false,"row_header":false,"row_section":false}]},{"label":"Table","id":12,"page_no":0,"cluster":{"id":12,"label":"Table","bbox":{"l":315.6885681152344,"t":230.98197326660159,"r":537.0928356170654,"b":302.49661788940426,"coord_origin":"1"},"confidence":0.8390322327613831,"cells":[{"id":97,"text":"1","bbox":{"l":451.9457100000001,"t":235.34704999999997,"r":457.95050000000003,"b":245.47748,"coord_origin":"1"}},{"id":100,"text":"2","bbox":{"l":331.19681,"t":269.35266,"r":337.2016,"b":279.48308999999995,"coord_origin":"1"}},{"id":105,"text":"3","bbox":{"l":384.0329,"t":252.67895999999996,"r":390.03769,"b":262.80939,"coord_origin":"1"}}]},"text":null,"otsl_seq":["ecel","ched","ched","ched","ched","nl","rhed","rhed","fcel","fcel","fcel","nl","rhed","rhed","fcel","fcel","fcel","nl","rhed","rhed","fcel","fcel","fcel","nl"],"num_rows":2,"num_cols":3,"table_cells":[{"bbox":{"l":451.9457100000001,"t":235.34704999999997,"r":457.95050000000003,"b":245.47748,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"1","column_header":true,"row_header":false,"row_section":false},{"bbox":{"l":331.19681,"t":269.35266,"r":337.2016,"b":279.48308999999995,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"2","column_header":false,"row_header":true,"row_section":false},{"bbox":{"l":384.0329,"t":252.67895999999996,"r":390.03769,"b":262.80939,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"3","column_header":true,"row_header":false,"row_section":false}]},{"label":"Caption","id":13,"page_no":0,"cluster":{"id":13,"label":"Caption","bbox":{"l":307.8611972808838,"t":513.6277587890626,"r":545.11517,"b":559.2729,"coord_origin":"1"},"confidence":0.9685324430465698,"cells":[{"id":106,"text":"Figure 1:","bbox":{"l":308.862,"t":514.50037,"r":345.73361,"b":523.40692,"coord_origin":"1"}},{"id":107,"text":"Picture of a table with subtle, complex features","bbox":{"l":353.17566,"t":514.50037,"r":545.11511,"b":523.40692,"coord_origin":"1"}},{"id":108,"text":"such as (1) multi-column headers, (2) cell with multi-row","bbox":{"l":308.862,"t":526.45535,"r":545.11511,"b":535.3619100000001,"coord_origin":"1"}},{"id":109,"text":"text and (3) cells with no content. Image from PubTabNet","bbox":{"l":308.862,"t":538.41035,"r":545.11517,"b":547.31691,"coord_origin":"1"}},{"id":110,"text":"evaluation set, filename: ‘PMC2944238 004 02’.","bbox":{"l":308.862,"t":550.36635,"r":505.6917700000001,"b":559.2729,"coord_origin":"1"}}]},"text":"Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: ‘PMC2944238 004 02’."},{"label":"Text","id":14,"page_no":0,"cluster":{"id":14,"label":"Text","bbox":{"l":307.8420244216919,"t":583.5986251831055,"r":545.50438041687,"b":665.04693,"coord_origin":"1"},"confidence":0.9861506223678589,"cells":[{"id":111,"text":"Recently, significant progress has been made with vi-","bbox":{"l":320.81699,"t":584.40936,"r":545.11493,"b":593.31592,"coord_origin":"1"}},{"id":112,"text":"sion based approaches to extract tables in documents. For","bbox":{"l":308.862,"t":596.36436,"r":545.11517,"b":605.2709199999999,"coord_origin":"1"}},{"id":113,"text":"the sake of completeness, the issue of table extraction from","bbox":{"l":308.862,"t":608.31937,"r":545.11511,"b":617.22592,"coord_origin":"1"}},{"id":114,"text":"documents is typically decomposed into two separate chal-","bbox":{"l":308.862,"t":620.27437,"r":545.11505,"b":629.18092,"coord_origin":"1"}},{"id":115,"text":"lenges, i.e.","bbox":{"l":308.862,"t":632.23036,"r":353.6937,"b":641.13692,"coord_origin":"1"}},{"id":116,"text":"(1)","bbox":{"l":362.11209,"t":632.23036,"r":374.66617,"b":641.13692,"coord_origin":"1"}},{"id":117,"text":"finding the location of the table(s) on a","bbox":{"l":377.35785,"t":632.23036,"r":545.11505,"b":641.13692,"coord_origin":"1"}},{"id":118,"text":"document-page and (2) finding the structure of a given table","bbox":{"l":308.862,"t":644.18536,"r":545.11517,"b":653.09192,"coord_origin":"1"}},{"id":119,"text":"in the document.","bbox":{"l":308.862,"t":656.14037,"r":375.55167,"b":665.04693,"coord_origin":"1"}}]},"text":"Recently, significant progress has been made with vision based approaches to extract tables in documents. For the sake of completeness, the issue of table extraction from documents is typically decomposed into two separate challenges, i.e. (1) finding the location of the table(s) on a document-page and (2) finding the structure of a given table in the document."},{"label":"Text","id":15,"page_no":0,"cluster":{"id":15,"label":"Text","bbox":{"l":307.9032199859619,"t":667.6026237487794,"r":545.4091873168945,"b":713.8419502258301,"coord_origin":"1"},"confidence":0.9836903810501099,"cells":[{"id":120,"text":"The first problem is called table-location and has been","bbox":{"l":320.81699,"t":668.38036,"r":545.11493,"b":677.28693,"coord_origin":"1"}},{"id":121,"text":"previously addressed [30, 38, 19, 21, 23, 26, 8] with state-","bbox":{"l":308.862,"t":680.33536,"r":545.11511,"b":689.24193,"coord_origin":"1"}},{"id":122,"text":"of-the-art object-detection networks (e.g. YOLO and later","bbox":{"l":308.862,"t":692.290359,"r":545.11511,"b":701.19693,"coord_origin":"1"}},{"id":123,"text":"on Mask-RCNN [9]). For all practical purposes, it can be","bbox":{"l":308.862,"t":704.245361,"r":545.11499,"b":713.151932,"coord_origin":"1"}}]},"text":"The first problem is called table-location and has been previously addressed [30, 38, 19, 21, 23, 26, 8] with stateof-the-art object-detection networks (e.g. YOLO and later on Mask-RCNN [9]). For all practical purposes, it can be"},{"label":"Page-footer","id":16,"page_no":0,"cluster":{"id":16,"label":"Page-footer","bbox":{"l":295.121,"t":733.3784980773926,"r":300.10229,"b":743.039928,"coord_origin":"1"},"confidence":0.8475339412689209,"cells":[{"id":124,"text":"1","bbox":{"l":295.121,"t":734.133366,"r":300.10229,"b":743.039928,"coord_origin":"1"}}]},"text":"1"},{"label":"Page-header","id":17,"page_no":0,"cluster":{"id":17,"label":"Page-header","bbox":{"l":16.783903062343597,"t":207.82001000000002,"r":36.339779,"b":560.00003,"coord_origin":"1"},"confidence":0.8388436436653137,"cells":[{"id":125,"text":"arXiv:2203.01017v2 [cs.CV] 11 Mar 2022","bbox":{"l":18.340221,"t":207.82001000000002,"r":36.339779,"b":560.00003,"coord_origin":"1"}}]},"text":"arXiv:2203.01017v2 [cs.CV] 11 Mar 2022"}],"body":[{"label":"Section-header","id":0,"page_no":0,"cluster":{"id":0,"label":"Section-header","bbox":{"l":95.52632689476013,"t":106.14017257690432,"r":498.92708999999996,"b":119.93133999999998,"coord_origin":"1"},"confidence":0.8527692556381226,"cells":[{"id":0,"text":"TableFormer: Table Structure Understanding with Transformers.","bbox":{"l":96.301003,"t":107.03412000000003,"r":498.92708999999996,"b":119.93133999999998,"coord_origin":"1"}}]},"text":"TableFormer: Table Structure Understanding with Transformers."},{"label":"Section-header","id":1,"page_no":0,"cluster":{"id":1,"label":"Section-header","bbox":{"l":141.7966592788696,"t":145.70029220581057,"r":453.00207595825196,"b":171.32037000000003,"coord_origin":"1"},"confidence":0.8149818778038025,"cells":[{"id":1,"text":"Ahmed Nassar, Nikolaos Livathinos, Maksym Lysak, Peter Staar","bbox":{"l":142.47701,"t":146.68535999999995,"r":452.75027,"b":157.37334999999996,"coord_origin":"1"}},{"id":2,"text":"IBM Research","bbox":{"l":262.918,"t":160.63239,"r":332.30597,"b":171.32037000000003,"coord_origin":"1"}}]},"text":"Ahmed Nassar, Nikolaos Livathinos, Maksym Lysak, Peter Staar IBM Research"},{"label":"Text","id":2,"page_no":0,"cluster":{"id":2,"label":"Text","bbox":{"l":208.123,"t":175.47459583282466,"r":379.3107976913452,"b":185.46755561828616,"coord_origin":"1"},"confidence":0.8852415084838867,"cells":[{"id":3,"text":"{","bbox":{"l":208.123,"t":175.96123999999998,"r":212.73083,"b":184.42553999999996,"coord_origin":"1"}},{"id":4,"text":"ahn,nli,mly,taa","bbox":{"l":212.73,"t":177.08203000000003,"r":293.42761,"b":184.00409000000002,"coord_origin":"1"}},{"id":5,"text":"}","bbox":{"l":293.42798,"t":175.96123999999998,"r":298.0358,"b":184.42553999999996,"coord_origin":"1"}},{"id":6,"text":"@zurich.ibm.com","bbox":{"l":298.03497,"t":177.08203000000003,"r":378.73257,"b":184.00409000000002,"coord_origin":"1"}}]},"text":"{ ahn,nli,mly,taa } @zurich.ibm.com"},{"label":"Section-header","id":3,"page_no":0,"cluster":{"id":3,"label":"Section-header","bbox":{"l":145.0311819076538,"t":215.15237388610842,"r":190.65908489227294,"b":226.23071000000004,"coord_origin":"1"},"confidence":0.9270482659339905,"cells":[{"id":7,"text":"Abstract","bbox":{"l":145.99498,"t":215.48297000000002,"r":190.48029,"b":226.23071000000004,"coord_origin":"1"}}]},"text":"Abstract"},{"label":"Text","id":4,"page_no":0,"cluster":{"id":4,"label":"Text","bbox":{"l":48.88530099391937,"t":240.16773319244385,"r":286.7518209457398,"b":514.1875190734863,"coord_origin":"1"},"confidence":0.987241268157959,"cells":[{"id":8,"text":"Tables organize valuable content in a concise and com-","bbox":{"l":62.066978,"t":241.39508,"r":286.36493,"b":249.98284999999998,"coord_origin":"1"}},{"id":9,"text":"pact representation. This content is extremely valuable for","bbox":{"l":50.111977,"t":253.3501,"r":286.36508,"b":261.93787,"coord_origin":"1"}},{"id":10,"text":"systems such as search engines, Knowledge Graph’s, etc,","bbox":{"l":50.111977,"t":265.30511,"r":286.36508,"b":273.89288,"coord_origin":"1"}},{"id":11,"text":"since they enhance their predictive capabilities. Unfortu-","bbox":{"l":50.111977,"t":277.26111000000003,"r":286.36505,"b":285.84888,"coord_origin":"1"}},{"id":12,"text":"nately, tables come in a large variety of shapes and sizes.","bbox":{"l":50.111977,"t":289.21609,"r":286.36505,"b":297.80386,"coord_origin":"1"}},{"id":13,"text":"Furthermore, they can have complex column/row-header","bbox":{"l":50.111977,"t":301.17108,"r":286.36505,"b":309.75884999999994,"coord_origin":"1"}},{"id":14,"text":"configurations, multiline rows, different variety of separa-","bbox":{"l":50.111977,"t":313.12607,"r":286.36508,"b":321.71384,"coord_origin":"1"}},{"id":15,"text":"tion lines, missing entries, etc. As such, the correct iden-","bbox":{"l":50.111977,"t":325.08105,"r":286.36508,"b":333.66882,"coord_origin":"1"}},{"id":16,"text":"tification of the table-structure from an image is a non-","bbox":{"l":50.111977,"t":337.03604,"r":286.36505,"b":345.62381,"coord_origin":"1"}},{"id":17,"text":"trivial task. In this paper, we present a new table-structure","bbox":{"l":50.111977,"t":348.99203,"r":286.36508,"b":357.5798,"coord_origin":"1"}},{"id":18,"text":"identification model. The latter improves the latest end-to-","bbox":{"l":50.111977,"t":360.94701999999995,"r":286.36505,"b":369.53479,"coord_origin":"1"}},{"id":19,"text":"end deep learning model (i.e. encoder-dual-decoder from","bbox":{"l":50.111977,"t":372.90201,"r":286.36508,"b":381.48978,"coord_origin":"1"}},{"id":20,"text":"PubTabNet) in two significant ways. First, we introduce a","bbox":{"l":50.111977,"t":384.85699,"r":286.36505,"b":393.44476,"coord_origin":"1"}},{"id":21,"text":"new object detection decoder for table-cells. In this way,","bbox":{"l":50.111977,"t":396.81198,"r":286.36511,"b":405.39975000000004,"coord_origin":"1"}},{"id":22,"text":"we can obtain the content of the table-cells from program-","bbox":{"l":50.111977,"t":408.76697,"r":286.36508,"b":417.35474,"coord_origin":"1"}},{"id":23,"text":"matic PDF’s directly from the PDF source and avoid the","bbox":{"l":50.111977,"t":420.72296000000006,"r":286.36505,"b":429.31073,"coord_origin":"1"}},{"id":24,"text":"training of the custom OCR decoders.","bbox":{"l":50.111977,"t":432.67795,"r":207.23216,"b":441.26572,"coord_origin":"1"}},{"id":25,"text":"This architectural","bbox":{"l":214.09639,"t":432.67795,"r":286.36508,"b":441.26572,"coord_origin":"1"}},{"id":26,"text":"change leads to more accurate table-content extraction and","bbox":{"l":50.111977,"t":444.63293,"r":286.36508,"b":453.2207,"coord_origin":"1"}},{"id":27,"text":"allows us to tackle non-english tables. Second, we replace","bbox":{"l":50.111977,"t":456.58792000000005,"r":286.36505,"b":465.17569,"coord_origin":"1"}},{"id":28,"text":"the LSTM decoders with transformer based decoders. This","bbox":{"l":50.111977,"t":468.54291,"r":286.36505,"b":477.13068,"coord_origin":"1"}},{"id":29,"text":"upgrade improves significantly the previous state-of-the-art","bbox":{"l":50.111977,"t":480.4989,"r":286.36508,"b":489.08667,"coord_origin":"1"}},{"id":30,"text":"tree-editing-distance-score (TEDS) from 91% to 98.5% on","bbox":{"l":50.111977,"t":492.45389,"r":286.36505,"b":501.04166,"coord_origin":"1"}},{"id":31,"text":"simple tables and from 88.7% to 95% on complex tables.","bbox":{"l":50.111977,"t":504.40887,"r":276.65152,"b":512.9966400000001,"coord_origin":"1"}}]},"text":"Tables organize valuable content in a concise and compact representation. This content is extremely valuable for systems such as search engines, Knowledge Graph’s, etc, since they enhance their predictive capabilities. Unfortunately, tables come in a large variety of shapes and sizes. Furthermore, they can have complex column/row-header configurations, multiline rows, different variety of separation lines, missing entries, etc. As such, the correct identification of the table-structure from an image is a nontrivial task. In this paper, we present a new table-structure identification model. The latter improves the latest end-toend deep learning model (i.e. encoder-dual-decoder from PubTabNet) in two significant ways. First, we introduce a new object detection decoder for table-cells. In this way, we can obtain the content of the table-cells from programmatic PDF’s directly from the PDF source and avoid the training of the custom OCR decoders. This architectural change leads to more accurate table-content extraction and allows us to tackle non-english tables. Second, we replace the LSTM decoders with transformer based decoders. This upgrade improves significantly the previous state-of-the-art tree-editing-distance-score (TEDS) from 91% to 98.5% on simple tables and from 88.7% to 95% on complex tables."},{"label":"Section-header","id":5,"page_no":0,"cluster":{"id":5,"label":"Section-header","bbox":{"l":50.111977,"t":539.1871112823486,"r":126.94804,"b":550.69049,"coord_origin":"1"},"confidence":0.9362080097198486,"cells":[{"id":32,"text":"1.","bbox":{"l":50.111977,"t":539.94276,"r":58.121296,"b":550.69049,"coord_origin":"1"}},{"id":33,"text":"Introduction","bbox":{"l":68.800385,"t":539.94276,"r":126.94804,"b":550.69049,"coord_origin":"1"}}]},"text":"1. Introduction"},{"label":"Text","id":6,"page_no":0,"cluster":{"id":6,"label":"Text","bbox":{"l":49.18265175819397,"t":559.7423080444336,"r":286.36508,"b":713.151779,"coord_origin":"1"},"confidence":0.9877704977989197,"cells":[{"id":34,"text":"The occurrence of tables in documents is ubiquitous.","bbox":{"l":62.066978,"t":560.7832,"r":286.36496,"b":569.68976,"coord_origin":"1"}},{"id":35,"text":"They often summarise quantitative or factual data, which is","bbox":{"l":50.111977,"t":572.7382,"r":286.36508,"b":581.64476,"coord_origin":"1"}},{"id":36,"text":"cumbersome to describe in verbose text but nevertheless ex-","bbox":{"l":50.111977,"t":584.69321,"r":286.36505,"b":593.5997600000001,"coord_origin":"1"}},{"id":37,"text":"tremely valuable. Unfortunately, this compact representa-","bbox":{"l":50.111977,"t":596.6492000000001,"r":286.36505,"b":605.55576,"coord_origin":"1"}},{"id":38,"text":"tion is often not easy to parse by machines. There are many","bbox":{"l":50.111977,"t":608.6042,"r":286.36505,"b":617.51076,"coord_origin":"1"}},{"id":39,"text":"implicit conventions used to obtain a compact table repre-","bbox":{"l":50.111977,"t":620.5592,"r":286.36505,"b":629.46576,"coord_origin":"1"}},{"id":40,"text":"sentation. For example, tables often have complex column-","bbox":{"l":50.111977,"t":632.51421,"r":286.36508,"b":641.42076,"coord_origin":"1"}},{"id":41,"text":"and row-headers in order to reduce duplicated cell content.","bbox":{"l":50.111977,"t":644.46921,"r":286.36508,"b":653.37576,"coord_origin":"1"}},{"id":42,"text":"Lines of different shapes and sizes are leveraged to separate","bbox":{"l":50.111977,"t":656.42421,"r":286.36502,"b":665.33077,"coord_origin":"1"}},{"id":43,"text":"content or indicate a tree structure. Additionally, tables can","bbox":{"l":50.111977,"t":668.3802000000001,"r":286.36505,"b":677.28677,"coord_origin":"1"}},{"id":44,"text":"also have empty/missing table-entries or multi-row textual","bbox":{"l":50.111977,"t":680.33521,"r":286.36505,"b":689.2417800000001,"coord_origin":"1"}},{"id":45,"text":"table-entries. Fig. 1 shows a table which presents all these","bbox":{"l":50.111977,"t":692.290207,"r":286.36505,"b":701.196777,"coord_origin":"1"}},{"id":46,"text":"issues.","bbox":{"l":50.111977,"t":704.245209,"r":76.403275,"b":713.151779,"coord_origin":"1"}}]},"text":"The occurrence of tables in documents is ubiquitous. They often summarise quantitative or factual data, which is cumbersome to describe in verbose text but nevertheless extremely valuable. Unfortunately, this compact representation is often not easy to parse by machines. There are many implicit conventions used to obtain a compact table representation. For example, tables often have complex columnand row-headers in order to reduce duplicated cell content. Lines of different shapes and sizes are leveraged to separate content or indicate a tree structure. Additionally, tables can also have empty/missing table-entries or multi-row textual table-entries. Fig. 1 shows a table which presents all these issues."},{"label":"Section-header","id":7,"page_no":0,"cluster":{"id":7,"label":"Section-header","bbox":{"l":315.3704212188721,"t":216.8857538223267,"r":408.4407,"b":226.75482,"coord_origin":"1"},"confidence":0.8334906101226807,"cells":[{"id":47,"text":"a.","bbox":{"l":315.56702,"t":218.00684,"r":324.01007,"b":226.75482,"coord_origin":"1"}},{"id":48,"text":"Picture of a table:","bbox":{"l":328.2316,"t":218.00684,"r":408.4407,"b":226.75482,"coord_origin":"1"}}]},"text":"a. Picture of a table:"},{"label":"List-item","id":8,"page_no":0,"cluster":{"id":8,"label":"List-item","bbox":{"l":315.2541000366211,"t":312.6587104797363,"r":486.40194999999994,"b":333.5001319885254,"coord_origin":"1"},"confidence":0.6923348307609558,"cells":[{"id":49,"text":"b.","bbox":{"l":315.56702,"t":313.69478999999995,"r":325.05786,"b":322.44281,"coord_origin":"1"}},{"id":50,"text":"Red-annotation of bounding boxes,","bbox":{"l":329.80325,"t":313.69478999999995,"r":486.40194999999994,"b":322.44281,"coord_origin":"1"}},{"id":51,"text":"Blue-predictions by TableFormer","bbox":{"l":326.46252,"t":324.49478,"r":472.47411999999997,"b":333.2428,"coord_origin":"1"}}]},"text":"b. Red-annotation of bounding boxes, Blue-predictions by TableFormer"},{"label":"List-item","id":9,"page_no":0,"cluster":{"id":9,"label":"List-item","bbox":{"l":315.30837936401366,"t":419.3768840789795,"r":491.1912500000001,"b":429.9300762176514,"coord_origin":"1"},"confidence":0.6545922756195068,"cells":[{"id":52,"text":"c.","bbox":{"l":315.56702,"t":420.1828,"r":324.81039,"b":428.93082,"coord_origin":"1"}},{"id":53,"text":"Structure predicted by TableFormer:","bbox":{"l":329.4321,"t":420.1828,"r":491.1912500000001,"b":428.93082,"coord_origin":"1"}}]},"text":"c. Structure predicted by TableFormer:"},{"label":"Picture","id":10,"page_no":0,"cluster":{"id":10,"label":"Picture","bbox":{"l":314.3844051361084,"t":338.26564750671383,"r":539.53088,"b":409.7582748413086,"coord_origin":"1"},"confidence":0.9185528755187988,"cells":[{"id":54,"text":"1","bbox":{"l":408.14752,"t":342.82828,"r":412.54001,"b":351.61322,"coord_origin":"1"}},{"id":55,"text":"0","bbox":{"l":356.11011,"t":341.57217,"r":360.50259,"b":350.35712,"coord_origin":"1"}},{"id":56,"text":"2","bbox":{"l":500.6777,"t":340.93768,"r":505.0701900000001,"b":349.7226299999999,"coord_origin":"1"}},{"id":57,"text":"3","bbox":{"l":356.13382,"t":351.74789,"r":360.52631,"b":360.53284,"coord_origin":"1"}},{"id":58,"text":"4","bbox":{"l":402.53992,"t":355.8765,"r":406.9324,"b":364.66144,"coord_origin":"1"}},{"id":59,"text":"5","bbox":{"l":448.58178999999996,"t":352.84018,"r":452.97427,"b":361.62512,"coord_origin":"1"}},{"id":60,"text":"6","bbox":{"l":491.65161000000006,"t":353.70657,"r":496.0441,"b":362.49152,"coord_origin":"1"}},{"id":61,"text":"7","bbox":{"l":535.13843,"t":353.33969,"r":539.53088,"b":362.12463,"coord_origin":"1"}},{"id":62,"text":"8","bbox":{"l":348.82822,"t":387.09781,"r":353.2207,"b":395.88275,"coord_origin":"1"}},{"id":63,"text":"9","bbox":{"l":389.27151,"t":375.37228,"r":393.664,"b":384.15723,"coord_origin":"1"}},{"id":64,"text":"10","bbox":{"l":442.67479999999995,"t":375.64621,"r":451.45889000000005,"b":384.43115,"coord_origin":"1"}},{"id":65,"text":"11","bbox":{"l":477.4382299999999,"t":375.534,"r":485.90167,"b":384.31894000000005,"coord_origin":"1"}},{"id":66,"text":"12","bbox":{"l":522.57263,"t":375.64621,"r":531.35669,"b":384.43115,"coord_origin":"1"}},{"id":67,"text":"13","bbox":{"l":400.22992,"t":387.11429,"r":409.01401,"b":395.89923,"coord_origin":"1"}},{"id":68,"text":"14","bbox":{"l":442.30792,"t":386.98981000000003,"r":451.0920100000001,"b":395.77475000000004,"coord_origin":"1"}},{"id":69,"text":"15","bbox":{"l":478.21941999999996,"t":387.37469,"r":487.00351000000006,"b":396.15964,"coord_origin":"1"}},{"id":70,"text":"16","bbox":{"l":523.2287,"t":386.98981000000003,"r":532.01276,"b":395.77475000000004,"coord_origin":"1"}},{"id":71,"text":"1","bbox":{"l":411.57233,"t":399.42477,"r":415.96481,"b":408.20972,"coord_origin":"1"}},{"id":72,"text":"7","bbox":{"l":415.96393,"t":399.42477,"r":420.35641,"b":408.20972,"coord_origin":"1"}},{"id":73,"text":"18","bbox":{"l":442.30521,"t":399.0371999999999,"r":451.08929,"b":407.82213999999993,"coord_origin":"1"}},{"id":74,"text":"19","bbox":{"l":478.77893,"t":398.99639999999994,"r":487.56302,"b":407.78133999999994,"coord_origin":"1"}},{"id":75,"text":"20","bbox":{"l":523.97241,"t":398.6114799999999,"r":532.75647,"b":407.39642,"coord_origin":"1"}},{"id":98,"text":"3","bbox":{"l":385.09399,"t":357.76030999999995,"r":391.09879,"b":367.89072,"coord_origin":"1"}},{"id":101,"text":"2","bbox":{"l":333.43451,"t":380.7265,"r":339.4393,"b":390.85689999999994,"coord_origin":"1"}},{"id":103,"text":"1","bbox":{"l":478.07210999999995,"t":341.0368000000001,"r":484.0769,"b":351.16720999999995,"coord_origin":"1"}}]},"text":"","data":null,"provenance":null,"predicted_class":null,"confidence":null},{"label":"Table","id":11,"page_no":0,"cluster":{"id":11,"label":"Table","bbox":{"l":315.6885681152344,"t":434.2295654296875,"r":536.9868450164795,"b":496.1293773651123,"coord_origin":"1"},"confidence":0.9250026941299438,"cells":[{"id":76,"text":"1","bbox":{"l":347.24872,"t":437.68588,"r":351.6412,"b":446.47083,"coord_origin":"1"}},{"id":77,"text":"0","bbox":{"l":318.88071,"t":437.68588,"r":323.27319,"b":446.47083,"coord_origin":"1"}},{"id":78,"text":"2","bbox":{"l":394.10422,"t":437.68588,"r":398.4967,"b":446.47083,"coord_origin":"1"}},{"id":79,"text":"3","bbox":{"l":318.77316,"t":449.5455,"r":323.16565,"b":458.33044,"coord_origin":"1"}},{"id":80,"text":"4","bbox":{"l":347.24872,"t":449.5455,"r":351.6412,"b":458.33044,"coord_origin":"1"}},{"id":81,"text":"5","bbox":{"l":394.10422,"t":449.5455,"r":398.4967,"b":458.33044,"coord_origin":"1"}},{"id":82,"text":"6","bbox":{"l":440.95941000000005,"t":449.5455,"r":445.3519,"b":458.33044,"coord_origin":"1"}},{"id":83,"text":"7","bbox":{"l":487.81491,"t":449.5455,"r":492.2074,"b":458.33044,"coord_origin":"1"}},{"id":84,"text":"8","bbox":{"l":318.77316,"t":473.70425,"r":323.16565,"b":482.4892,"coord_origin":"1"}},{"id":85,"text":"9","bbox":{"l":347.24872,"t":461.8446,"r":351.6412,"b":470.62955,"coord_origin":"1"}},{"id":86,"text":"10","bbox":{"l":394.10422,"t":461.8446,"r":402.88831,"b":470.62955,"coord_origin":"1"}},{"id":87,"text":"11","bbox":{"l":440.95941000000005,"t":461.8446,"r":449.42285,"b":470.62955,"coord_origin":"1"}},{"id":88,"text":"12","bbox":{"l":487.81491,"t":461.8446,"r":496.599,"b":470.62955,"coord_origin":"1"}},{"id":89,"text":"13","bbox":{"l":347.24872,"t":473.70425,"r":356.03281,"b":482.4892,"coord_origin":"1"}},{"id":90,"text":"14","bbox":{"l":394.10422,"t":473.70425,"r":402.88831,"b":482.4892,"coord_origin":"1"}},{"id":91,"text":"15","bbox":{"l":440.95941000000005,"t":473.70425,"r":449.7435,"b":482.4892,"coord_origin":"1"}},{"id":92,"text":"16","bbox":{"l":487.81491,"t":473.70425,"r":496.599,"b":482.4892,"coord_origin":"1"}},{"id":93,"text":"17","bbox":{"l":347.24872,"t":485.12469,"r":356.03281,"b":493.90964,"coord_origin":"1"}},{"id":94,"text":"18","bbox":{"l":394.10422,"t":485.12469,"r":402.88831,"b":493.90964,"coord_origin":"1"}},{"id":95,"text":"19","bbox":{"l":440.95941000000005,"t":485.12469,"r":449.7435,"b":493.90964,"coord_origin":"1"}},{"id":96,"text":"20","bbox":{"l":487.81491,"t":485.12469,"r":496.599,"b":493.90964,"coord_origin":"1"}},{"id":99,"text":"3","bbox":{"l":366.70102,"t":449.12082,"r":372.70581,"b":459.25122,"coord_origin":"1"}},{"id":102,"text":"2","bbox":{"l":331.90424,"t":473.32291,"r":337.90903,"b":483.45331,"coord_origin":"1"}},{"id":104,"text":"1","bbox":{"l":459.87621999999993,"t":437.5936,"r":465.88101,"b":447.724,"coord_origin":"1"}}]},"text":null,"otsl_seq":["ched","ched","lcel","ched","lcel","ched","nl","fcel","fcel","fcel","fcel","fcel","fcel","nl","fcel","fcel","fcel","fcel","fcel","fcel","nl","fcel","fcel","fcel","fcel","fcel","ucel","nl","fcel","fcel","fcel","fcel","fcel","ucel","nl"],"num_rows":5,"num_cols":6,"table_cells":[{"bbox":{"l":347.24872,"t":437.68588,"r":351.6412,"b":446.47083,"coord_origin":"1"},"row_span":1,"col_span":2,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":1,"end_col_offset_idx":3,"text":"1","column_header":true,"row_header":false,"row_section":false},{"bbox":{"l":318.88071,"t":437.68588,"r":323.27319,"b":446.47083,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"0","column_header":true,"row_header":false,"row_section":false},{"bbox":{"l":394.10422,"t":437.5936,"r":465.88101,"b":447.724,"coord_origin":"1"},"row_span":1,"col_span":2,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":3,"end_col_offset_idx":5,"text":"2 1","column_header":true,"row_header":false,"row_section":false},{"bbox":{"l":318.77316,"t":449.5455,"r":323.16565,"b":458.33044,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":347.24872,"t":449.5455,"r":351.6412,"b":458.33044,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"4","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":366.70102,"t":449.12082,"r":398.4967,"b":459.25122,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"5 3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":440.95941000000005,"t":449.5455,"r":445.3519,"b":458.33044,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"6","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":487.81491,"t":449.5455,"r":492.2074,"b":458.33044,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"7","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":318.77316,"t":473.70425,"r":323.16565,"b":482.4892,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"8","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":347.24872,"t":461.8446,"r":351.6412,"b":470.62955,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"9","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":394.10422,"t":461.8446,"r":402.88831,"b":470.62955,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"10","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":440.95941000000005,"t":461.8446,"r":449.42285,"b":470.62955,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"11","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":487.81491,"t":461.8446,"r":496.599,"b":470.62955,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"12","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":347.24872,"t":473.70425,"r":356.03281,"b":482.4892,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"13","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":394.10422,"t":473.70425,"r":402.88831,"b":482.4892,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"14","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":440.95941000000005,"t":473.70425,"r":449.7435,"b":482.4892,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"15","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":487.81491,"t":473.70425,"r":496.599,"b":482.4892,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"16","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":347.24872,"t":485.12469,"r":356.03281,"b":493.90964,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"17","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":394.10422,"t":485.12469,"r":402.88831,"b":493.90964,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"18","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":440.95941000000005,"t":485.12469,"r":449.7435,"b":493.90964,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"19","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":487.81491,"t":485.12469,"r":496.599,"b":493.90964,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"20","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":331.90424,"t":473.32291,"r":337.90903,"b":483.45331,"coord_origin":"1"},"row_span":3,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":5,"start_col_offset_idx":5,"end_col_offset_idx":6,"text":"2","column_header":false,"row_header":false,"row_section":false}]},{"label":"Table","id":12,"page_no":0,"cluster":{"id":12,"label":"Table","bbox":{"l":315.6885681152344,"t":230.98197326660159,"r":537.0928356170654,"b":302.49661788940426,"coord_origin":"1"},"confidence":0.8390322327613831,"cells":[{"id":97,"text":"1","bbox":{"l":451.9457100000001,"t":235.34704999999997,"r":457.95050000000003,"b":245.47748,"coord_origin":"1"}},{"id":100,"text":"2","bbox":{"l":331.19681,"t":269.35266,"r":337.2016,"b":279.48308999999995,"coord_origin":"1"}},{"id":105,"text":"3","bbox":{"l":384.0329,"t":252.67895999999996,"r":390.03769,"b":262.80939,"coord_origin":"1"}}]},"text":null,"otsl_seq":["ecel","ched","ched","ched","ched","nl","rhed","rhed","fcel","fcel","fcel","nl","rhed","rhed","fcel","fcel","fcel","nl","rhed","rhed","fcel","fcel","fcel","nl"],"num_rows":2,"num_cols":3,"table_cells":[{"bbox":{"l":451.9457100000001,"t":235.34704999999997,"r":457.95050000000003,"b":245.47748,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"1","column_header":true,"row_header":false,"row_section":false},{"bbox":{"l":331.19681,"t":269.35266,"r":337.2016,"b":279.48308999999995,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"2","column_header":false,"row_header":true,"row_section":false},{"bbox":{"l":384.0329,"t":252.67895999999996,"r":390.03769,"b":262.80939,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"3","column_header":true,"row_header":false,"row_section":false}]},{"label":"Caption","id":13,"page_no":0,"cluster":{"id":13,"label":"Caption","bbox":{"l":307.8611972808838,"t":513.6277587890626,"r":545.11517,"b":559.2729,"coord_origin":"1"},"confidence":0.9685324430465698,"cells":[{"id":106,"text":"Figure 1:","bbox":{"l":308.862,"t":514.50037,"r":345.73361,"b":523.40692,"coord_origin":"1"}},{"id":107,"text":"Picture of a table with subtle, complex features","bbox":{"l":353.17566,"t":514.50037,"r":545.11511,"b":523.40692,"coord_origin":"1"}},{"id":108,"text":"such as (1) multi-column headers, (2) cell with multi-row","bbox":{"l":308.862,"t":526.45535,"r":545.11511,"b":535.3619100000001,"coord_origin":"1"}},{"id":109,"text":"text and (3) cells with no content. Image from PubTabNet","bbox":{"l":308.862,"t":538.41035,"r":545.11517,"b":547.31691,"coord_origin":"1"}},{"id":110,"text":"evaluation set, filename: ‘PMC2944238 004 02’.","bbox":{"l":308.862,"t":550.36635,"r":505.6917700000001,"b":559.2729,"coord_origin":"1"}}]},"text":"Figure 1: Picture of a table with subtle, complex features such as (1) multi-column headers, (2) cell with multi-row text and (3) cells with no content. Image from PubTabNet evaluation set, filename: ‘PMC2944238 004 02’."},{"label":"Text","id":14,"page_no":0,"cluster":{"id":14,"label":"Text","bbox":{"l":307.8420244216919,"t":583.5986251831055,"r":545.50438041687,"b":665.04693,"coord_origin":"1"},"confidence":0.9861506223678589,"cells":[{"id":111,"text":"Recently, significant progress has been made with vi-","bbox":{"l":320.81699,"t":584.40936,"r":545.11493,"b":593.31592,"coord_origin":"1"}},{"id":112,"text":"sion based approaches to extract tables in documents. For","bbox":{"l":308.862,"t":596.36436,"r":545.11517,"b":605.2709199999999,"coord_origin":"1"}},{"id":113,"text":"the sake of completeness, the issue of table extraction from","bbox":{"l":308.862,"t":608.31937,"r":545.11511,"b":617.22592,"coord_origin":"1"}},{"id":114,"text":"documents is typically decomposed into two separate chal-","bbox":{"l":308.862,"t":620.27437,"r":545.11505,"b":629.18092,"coord_origin":"1"}},{"id":115,"text":"lenges, i.e.","bbox":{"l":308.862,"t":632.23036,"r":353.6937,"b":641.13692,"coord_origin":"1"}},{"id":116,"text":"(1)","bbox":{"l":362.11209,"t":632.23036,"r":374.66617,"b":641.13692,"coord_origin":"1"}},{"id":117,"text":"finding the location of the table(s) on a","bbox":{"l":377.35785,"t":632.23036,"r":545.11505,"b":641.13692,"coord_origin":"1"}},{"id":118,"text":"document-page and (2) finding the structure of a given table","bbox":{"l":308.862,"t":644.18536,"r":545.11517,"b":653.09192,"coord_origin":"1"}},{"id":119,"text":"in the document.","bbox":{"l":308.862,"t":656.14037,"r":375.55167,"b":665.04693,"coord_origin":"1"}}]},"text":"Recently, significant progress has been made with vision based approaches to extract tables in documents. For the sake of completeness, the issue of table extraction from documents is typically decomposed into two separate challenges, i.e. (1) finding the location of the table(s) on a document-page and (2) finding the structure of a given table in the document."},{"label":"Text","id":15,"page_no":0,"cluster":{"id":15,"label":"Text","bbox":{"l":307.9032199859619,"t":667.6026237487794,"r":545.4091873168945,"b":713.8419502258301,"coord_origin":"1"},"confidence":0.9836903810501099,"cells":[{"id":120,"text":"The first problem is called table-location and has been","bbox":{"l":320.81699,"t":668.38036,"r":545.11493,"b":677.28693,"coord_origin":"1"}},{"id":121,"text":"previously addressed [30, 38, 19, 21, 23, 26, 8] with state-","bbox":{"l":308.862,"t":680.33536,"r":545.11511,"b":689.24193,"coord_origin":"1"}},{"id":122,"text":"of-the-art object-detection networks (e.g. YOLO and later","bbox":{"l":308.862,"t":692.290359,"r":545.11511,"b":701.19693,"coord_origin":"1"}},{"id":123,"text":"on Mask-RCNN [9]). For all practical purposes, it can be","bbox":{"l":308.862,"t":704.245361,"r":545.11499,"b":713.151932,"coord_origin":"1"}}]},"text":"The first problem is called table-location and has been previously addressed [30, 38, 19, 21, 23, 26, 8] with stateof-the-art object-detection networks (e.g. YOLO and later on Mask-RCNN [9]). For all practical purposes, it can be"}],"headers":[{"label":"Page-footer","id":16,"page_no":0,"cluster":{"id":16,"label":"Page-footer","bbox":{"l":295.121,"t":733.3784980773926,"r":300.10229,"b":743.039928,"coord_origin":"1"},"confidence":0.8475339412689209,"cells":[{"id":124,"text":"1","bbox":{"l":295.121,"t":734.133366,"r":300.10229,"b":743.039928,"coord_origin":"1"}}]},"text":"1"},{"label":"Page-header","id":17,"page_no":0,"cluster":{"id":17,"label":"Page-header","bbox":{"l":16.783903062343597,"t":207.82001000000002,"r":36.339779,"b":560.00003,"coord_origin":"1"},"confidence":0.8388436436653137,"cells":[{"id":125,"text":"arXiv:2203.01017v2 [cs.CV] 11 Mar 2022","bbox":{"l":18.340221,"t":207.82001000000002,"r":36.339779,"b":560.00003,"coord_origin":"1"}}]},"text":"arXiv:2203.01017v2 [cs.CV] 11 Mar 2022"}]}},{"page_no":1,"page_hash":"81bd44713b62df481eaab1ac092cbc8b66359e53c7ecd637bb30d2680b1d2692","size":{"width":612.0,"height":792.0},"cells":[{"id":0,"text":"considered as a solved problem, given enough ground-truth","bbox":{"l":50.112,"t":75.20836999999995,"r":286.36505,"b":84.11492999999996,"coord_origin":"1"}},{"id":1,"text":"data to train on.","bbox":{"l":50.112,"t":87.16339000000005,"r":112.64721999999999,"b":96.06994999999995,"coord_origin":"1"}},{"id":2,"text":"The second problem is called table-structure decompo-","bbox":{"l":62.067001,"t":99.57141000000001,"r":286.36496,"b":108.47797000000003,"coord_origin":"1"}},{"id":3,"text":"sition.","bbox":{"l":50.112,"t":111.52643,"r":74.749512,"b":120.43297999999993,"coord_origin":"1"}},{"id":4,"text":"The latter is a long standing problem in the com-","bbox":{"l":81.334793,"t":111.52643,"r":286.36514,"b":120.43297999999993,"coord_origin":"1"}},{"id":5,"text":"munity of document understanding [6, 4, 14]. Contrary to","bbox":{"l":50.112,"t":123.48145,"r":286.36511,"b":132.38800000000003,"coord_origin":"1"}},{"id":6,"text":"the table-location problem, there are no commonly used ap-","bbox":{"l":50.112,"t":135.43646,"r":286.36511,"b":144.34302000000002,"coord_origin":"1"}},{"id":7,"text":"proaches that can easily be re-purposed to solve this prob-","bbox":{"l":50.112,"t":147.39246000000003,"r":286.36505,"b":156.29900999999995,"coord_origin":"1"}},{"id":8,"text":"lem. Lately, a set of new model-architectures has been pro-","bbox":{"l":50.112,"t":159.34747000000004,"r":286.36511,"b":168.25402999999994,"coord_origin":"1"}},{"id":9,"text":"posed by the community to address table-structure decom-","bbox":{"l":50.112,"t":171.30249000000003,"r":286.36508,"b":180.20905000000005,"coord_origin":"1"}},{"id":10,"text":"position [37, 36, 18, 20]. All these models have some weak-","bbox":{"l":50.112,"t":183.25751000000002,"r":286.36511,"b":192.16405999999995,"coord_origin":"1"}},{"id":11,"text":"nesses (see Sec. 2). The common denominator here is the","bbox":{"l":50.112,"t":195.21252000000004,"r":286.36508,"b":204.11908000000005,"coord_origin":"1"}},{"id":12,"text":"reliance on textual features and/or the inability to provide","bbox":{"l":50.112,"t":207.16754000000003,"r":286.36514,"b":216.07410000000004,"coord_origin":"1"}},{"id":13,"text":"the bounding box of each table-cell in the original image.","bbox":{"l":50.112,"t":219.12354000000005,"r":278.66397,"b":228.03008999999997,"coord_origin":"1"}},{"id":14,"text":"In this paper, we want to address these weaknesses and","bbox":{"l":62.067001,"t":231.53156,"r":286.36493,"b":240.43811000000005,"coord_origin":"1"}},{"id":15,"text":"present a robust table-structure decomposition algorithm.","bbox":{"l":50.112,"t":243.48657000000003,"r":286.36511,"b":252.39313000000004,"coord_origin":"1"}},{"id":16,"text":"The design criteria for our model are the following. First,","bbox":{"l":50.112,"t":255.44159000000002,"r":286.36511,"b":264.34813999999994,"coord_origin":"1"}},{"id":17,"text":"we want our algorithm to be language agnostic. In this way,","bbox":{"l":50.112,"t":267.39661,"r":286.36502,"b":276.30316000000005,"coord_origin":"1"}},{"id":18,"text":"we can obtain the structure of any table, irregardless of the","bbox":{"l":50.112,"t":279.35155999999995,"r":286.36508,"b":288.25815,"coord_origin":"1"}},{"id":19,"text":"language.","bbox":{"l":50.112,"t":291.30759,"r":88.567635,"b":300.21414,"coord_origin":"1"}},{"id":20,"text":"Second, we want our algorithm to leverage as","bbox":{"l":95.501602,"t":291.30759,"r":286.36505,"b":300.21414,"coord_origin":"1"}},{"id":21,"text":"much data as possible from the original PDF document. For","bbox":{"l":50.112,"t":303.26257,"r":286.36508,"b":312.16913,"coord_origin":"1"}},{"id":22,"text":"programmatic PDF documents, the text-cells can often be","bbox":{"l":50.112,"t":315.21756,"r":286.36511,"b":324.12411,"coord_origin":"1"}},{"id":23,"text":"extracted much faster and with higher accuracy compared","bbox":{"l":50.112,"t":327.17255,"r":286.36505,"b":336.0791,"coord_origin":"1"}},{"id":24,"text":"to OCR methods. Last but not least, we want to have a di-","bbox":{"l":50.112,"t":339.12753,"r":286.36511,"b":348.03409,"coord_origin":"1"}},{"id":25,"text":"rect link between the table-cell and its bounding box in the","bbox":{"l":50.112,"t":351.08353,"r":286.36508,"b":359.99008,"coord_origin":"1"}},{"id":26,"text":"image.","bbox":{"l":50.112,"t":363.03851,"r":76.951241,"b":371.94507,"coord_origin":"1"}},{"id":27,"text":"To meet the design criteria listed above, we developed a","bbox":{"l":62.067001,"t":375.4465,"r":286.36499,"b":384.35306,"coord_origin":"1"}},{"id":28,"text":"new model called","bbox":{"l":50.112,"t":387.40149,"r":120.98594,"b":396.30804,"coord_origin":"1"}},{"id":29,"text":"TableFormer","bbox":{"l":123.901,"t":387.28192,"r":179.7314,"b":396.23830999999996,"coord_origin":"1"}},{"id":30,"text":"and a synthetically gener-","bbox":{"l":182.646,"t":387.40149,"r":286.36658,"b":396.30804,"coord_origin":"1"}},{"id":31,"text":"ated table structure dataset called","bbox":{"l":50.112,"t":399.35648,"r":181.75778,"b":408.26302999999996,"coord_origin":"1"}},{"id":32,"text":"SynthTabNet","bbox":{"l":184.104,"t":399.23690999999997,"r":240.2034,"b":408.1933,"coord_origin":"1"}},{"id":33,"text":"$^{1}$. In partic-","bbox":{"l":240.20401,"t":399.35648,"r":286.36069,"b":408.26302999999996,"coord_origin":"1"}},{"id":34,"text":"ular, our contributions in this work can be summarised as","bbox":{"l":50.112015,"t":411.31146,"r":286.36511,"b":420.21802,"coord_origin":"1"}},{"id":35,"text":"follows:","bbox":{"l":50.112015,"t":423.26645,"r":82.520355,"b":432.173,"coord_origin":"1"}},{"id":36,"text":"•","bbox":{"l":61.569016,"t":444.55145,"r":70.741714,"b":453.45801,"coord_origin":"1"}},{"id":37,"text":"We propose","bbox":{"l":73.034889,"t":444.55145,"r":117.10054,"b":453.45801,"coord_origin":"1"}},{"id":38,"text":"TableFormer","bbox":{"l":119.59001,"t":444.43188,"r":175.42041,"b":453.38828,"coord_origin":"1"}},{"id":39,"text":", a transformer based model","bbox":{"l":175.42102,"t":444.55145,"r":286.36453,"b":453.45801,"coord_origin":"1"}},{"id":40,"text":"that predicts tables structure and bounding boxes for","bbox":{"l":70.037018,"t":456.50644000000005,"r":286.3649,"b":465.41299,"coord_origin":"1"}},{"id":41,"text":"the table content simultaneously in an end-to-end ap-","bbox":{"l":70.037018,"t":468.46143,"r":286.3649,"b":477.36798,"coord_origin":"1"}},{"id":42,"text":"proach.","bbox":{"l":70.037018,"t":480.41641,"r":99.635902,"b":489.32297,"coord_origin":"1"}},{"id":43,"text":"•","bbox":{"l":61.569016,"t":502.15341,"r":71.619438,"b":511.05997,"coord_origin":"1"}},{"id":44,"text":"Across all benchmark datasets","bbox":{"l":74.132042,"t":502.15341,"r":196.10396,"b":511.05997,"coord_origin":"1"}},{"id":45,"text":"TableFormer","bbox":{"l":200.31001,"t":502.03384,"r":256.14041,"b":510.99023,"coord_origin":"1"}},{"id":46,"text":"signif-","bbox":{"l":260.35001,"t":502.15341,"r":286.36237,"b":511.05997,"coord_origin":"1"}},{"id":47,"text":"icantly outperforms existing state-of-the-art metrics,","bbox":{"l":70.037003,"t":514.1084000000001,"r":286.3649,"b":523.01495,"coord_origin":"1"}},{"id":48,"text":"while being much more efficient in training and infer-","bbox":{"l":70.037003,"t":526.06439,"r":286.36487,"b":534.97095,"coord_origin":"1"}},{"id":49,"text":"ence to existing works.","bbox":{"l":70.037003,"t":538.0193899999999,"r":161.65305,"b":546.9259500000001,"coord_origin":"1"}},{"id":50,"text":"•","bbox":{"l":61.569,"t":559.75639,"r":71.115913,"b":568.66295,"coord_origin":"1"}},{"id":51,"text":"We present","bbox":{"l":73.502647,"t":559.75639,"r":116.71199,"b":568.66295,"coord_origin":"1"}},{"id":52,"text":"SynthTabNet","bbox":{"l":121.583,"t":559.63684,"r":177.68239,"b":568.59322,"coord_origin":"1"}},{"id":53,"text":"a synthetically generated","bbox":{"l":182.55301,"t":559.75639,"r":286.36328,"b":568.66295,"coord_origin":"1"}},{"id":54,"text":"dataset, with various appearance styles and complex-","bbox":{"l":70.03701,"t":571.7114,"r":286.36493,"b":580.6179500000001,"coord_origin":"1"}},{"id":55,"text":"ity.","bbox":{"l":70.03701,"t":583.6664000000001,"r":82.400597,"b":592.57295,"coord_origin":"1"}},{"id":56,"text":"•","bbox":{"l":61.569008000000004,"t":605.4034,"r":72.332527,"b":614.30995,"coord_origin":"1"}},{"id":57,"text":"An augmented dataset based on PubTabNet [37],","bbox":{"l":75.023399,"t":605.4034,"r":286.36508,"b":614.30995,"coord_origin":"1"}},{"id":58,"text":"FinTabNet [36], and TableBank [17] with generated","bbox":{"l":70.03701,"t":617.3584,"r":286.36487,"b":626.26495,"coord_origin":"1"}},{"id":59,"text":"ground-truth for reproducibility.","bbox":{"l":70.03701,"t":629.31439,"r":198.05641,"b":638.22095,"coord_origin":"1"}},{"id":60,"text":"The paper is structured as follows. In Sec. 2, we give","bbox":{"l":62.067009000000006,"t":650.59839,"r":286.36496,"b":659.50494,"coord_origin":"1"}},{"id":61,"text":"a brief overview of the current state-of-the-art. In Sec. 3,","bbox":{"l":50.112007,"t":662.55339,"r":286.36511,"b":671.45995,"coord_origin":"1"}},{"id":62,"text":"we describe the datasets on which we train. In Sec. 4, we","bbox":{"l":50.112007,"t":674.50839,"r":286.36511,"b":683.41496,"coord_origin":"1"}},{"id":63,"text":"introduce the TableFormer model-architecture and describe","bbox":{"l":50.112007,"t":686.46339,"r":286.36511,"b":695.369957,"coord_origin":"1"}},{"id":64,"text":"$^{1}$https://github.com/IBM/SynthTabNet","bbox":{"l":60.97100100000001,"t":705.596275,"r":183.73055,"b":712.721542,"coord_origin":"1"}},{"id":65,"text":"its results & performance in Sec. 5. As a conclusion, we de-","bbox":{"l":308.862,"t":75.20836999999995,"r":545.11511,"b":84.11492999999996,"coord_origin":"1"}},{"id":66,"text":"scribe how this new model-architecture can be re-purposed","bbox":{"l":308.862,"t":87.16339000000005,"r":545.11505,"b":96.06994999999995,"coord_origin":"1"}},{"id":67,"text":"for other tasks in the computer-vision community.","bbox":{"l":308.862,"t":99.11841000000004,"r":508.08417000000003,"b":108.02495999999985,"coord_origin":"1"}},{"id":68,"text":"2.","bbox":{"l":308.862,"t":121.73193000000003,"r":315.5831,"b":132.47968000000003,"coord_origin":"1"}},{"id":69,"text":"Previous work and State of the Art","bbox":{"l":324.54456,"t":121.73193000000003,"r":498.28021,"b":132.47968000000003,"coord_origin":"1"}},{"id":70,"text":"Identifying the structure of a table has been an outstand-","bbox":{"l":320.81699,"t":142.22136999999998,"r":545.11493,"b":151.12793,"coord_origin":"1"}},{"id":71,"text":"ing problem in the document-parsing community, that mo-","bbox":{"l":308.862,"t":154.17638999999997,"r":545.11505,"b":163.08294999999998,"coord_origin":"1"}},{"id":72,"text":"tivates many organised public challenges [6, 4, 14].","bbox":{"l":308.862,"t":166.13140999999996,"r":522.55975,"b":175.03796,"coord_origin":"1"}},{"id":73,"text":"The","bbox":{"l":529.62323,"t":166.13140999999996,"r":545.11505,"b":175.03796,"coord_origin":"1"}},{"id":74,"text":"difficulty of the problem can be attributed to a number of","bbox":{"l":308.862,"t":178.08642999999995,"r":545.11517,"b":186.99298,"coord_origin":"1"}},{"id":75,"text":"factors. First, there is a large variety in the shapes and sizes","bbox":{"l":308.862,"t":190.04143999999997,"r":545.11511,"b":198.94799999999998,"coord_origin":"1"}},{"id":76,"text":"of tables.","bbox":{"l":308.862,"t":201.99645999999996,"r":346.97891,"b":210.90301999999997,"coord_origin":"1"}},{"id":77,"text":"Such large variety requires a flexible method.","bbox":{"l":354.86929,"t":201.99645999999996,"r":545.11511,"b":210.90301999999997,"coord_origin":"1"}},{"id":78,"text":"This is especially true for complex column- and row head-","bbox":{"l":308.862,"t":213.95245,"r":545.11505,"b":222.85901,"coord_origin":"1"}},{"id":79,"text":"ers, which can be extremely intricate and demanding.","bbox":{"l":308.862,"t":225.90747,"r":530.9184,"b":234.81403,"coord_origin":"1"}},{"id":80,"text":"A","bbox":{"l":537.92212,"t":225.90747,"r":545.11511,"b":234.81403,"coord_origin":"1"}},{"id":81,"text":"second factor of complexity is the lack of data with regard","bbox":{"l":308.862,"t":237.86248999999998,"r":545.11517,"b":246.76904000000002,"coord_origin":"1"}},{"id":82,"text":"to table-structure. Until the publication of PubTabNet [37],","bbox":{"l":308.862,"t":249.8175,"r":545.11511,"b":258.72406,"coord_origin":"1"}},{"id":83,"text":"there were no large datasets (i.e.","bbox":{"l":308.862,"t":261.77252,"r":439.8402699999999,"b":270.67908,"coord_origin":"1"}},{"id":84,"text":">","bbox":{"l":444.43999999999994,"t":261.61310000000003,"r":452.1889,"b":270.45989999999995,"coord_origin":"1"}},{"id":85,"text":"100","bbox":{"l":455.89001,"t":261.61310000000003,"r":470.83392000000003,"b":270.45989999999995,"coord_origin":"1"}},{"id":86,"text":"K tables) that pro-","bbox":{"l":470.83401,"t":261.77252,"r":545.11517,"b":270.67908,"coord_origin":"1"}},{"id":87,"text":"vided structure information. This happens primarily due to","bbox":{"l":308.862,"t":273.72748,"r":545.11511,"b":282.63406,"coord_origin":"1"}},{"id":88,"text":"the fact that tables are notoriously time-consuming to an-","bbox":{"l":308.862,"t":285.6835,"r":545.11511,"b":294.59006,"coord_origin":"1"}},{"id":89,"text":"notate by hand. However, this has definitely changed in re-","bbox":{"l":308.862,"t":297.63849,"r":545.11511,"b":306.54504,"coord_origin":"1"}},{"id":90,"text":"cent years with the deliverance of PubTabNet [37], FinTab-","bbox":{"l":308.862,"t":309.59348,"r":545.11517,"b":318.50003000000004,"coord_origin":"1"}},{"id":91,"text":"Net [36], TableBank [17] etc.","bbox":{"l":308.862,"t":321.54846,"r":425.92255,"b":330.45502,"coord_origin":"1"}},{"id":92,"text":"Before the rising popularity of deep neural networks,","bbox":{"l":320.81699,"t":333.56946,"r":545.11499,"b":342.47601,"coord_origin":"1"}},{"id":93,"text":"the community relied heavily on heuristic and/or statistical","bbox":{"l":308.862,"t":345.52444,"r":545.11499,"b":354.43100000000004,"coord_origin":"1"}},{"id":94,"text":"methods to do table structure identification [3, 7, 11, 5, 13,","bbox":{"l":308.862,"t":357.47943,"r":545.11517,"b":366.38599,"coord_origin":"1"}},{"id":95,"text":"28]. Although such methods work well on constrained ta-","bbox":{"l":308.862,"t":369.43542,"r":545.11511,"b":378.34198,"coord_origin":"1"}},{"id":96,"text":"bles [12], a more data-driven approach can be applied due","bbox":{"l":308.862,"t":381.39041,"r":545.11505,"b":390.29697,"coord_origin":"1"}},{"id":97,"text":"to the advent of convolutional neural networks (CNNs) and","bbox":{"l":308.862,"t":393.3453999999999,"r":545.11505,"b":402.25195,"coord_origin":"1"}},{"id":98,"text":"the availability of large datasets. To the best-of-our knowl-","bbox":{"l":308.862,"t":405.30038,"r":545.11517,"b":414.20694,"coord_origin":"1"}},{"id":99,"text":"edge, there are currently two different types of network ar-","bbox":{"l":308.862,"t":417.25537,"r":545.11523,"b":426.16193,"coord_origin":"1"}},{"id":100,"text":"chitecture that are being pursued for state-of-the-art table-","bbox":{"l":308.862,"t":429.21136000000007,"r":545.11511,"b":438.11792,"coord_origin":"1"}},{"id":101,"text":"structure identification.","bbox":{"l":308.862,"t":441.16635,"r":401.28503,"b":450.0729099999999,"coord_origin":"1"}},{"id":102,"text":"Image-to-Text networks","bbox":{"l":320.81699,"t":453.06778,"r":423.26236,"b":462.02417,"coord_origin":"1"}},{"id":103,"text":": In this type of network, one","bbox":{"l":423.26697,"t":453.18735,"r":545.10956,"b":462.0939,"coord_origin":"1"}},{"id":104,"text":"predicts a sequence of tokens starting from an encoded","bbox":{"l":308.86197,"t":465.14233,"r":545.11511,"b":474.04889,"coord_origin":"1"}},{"id":105,"text":"image.","bbox":{"l":308.86197,"t":477.09732,"r":335.7012,"b":486.00388,"coord_origin":"1"}},{"id":106,"text":"Such sequences of tokens can be HTML table","bbox":{"l":345.85309,"t":477.09732,"r":545.11505,"b":486.00388,"coord_origin":"1"}},{"id":107,"text":"tags [37, 17] or LaTeX symbols[10]. The choice of sym-","bbox":{"l":308.86197,"t":489.05231,"r":545.11493,"b":497.95886,"coord_origin":"1"}},{"id":108,"text":"bols is ultimately not very important, since one can be trans-","bbox":{"l":308.86197,"t":501.00729,"r":545.11499,"b":509.91385,"coord_origin":"1"}},{"id":109,"text":"formed into the other. There are however subtle variations","bbox":{"l":308.86197,"t":512.9632899999999,"r":545.11505,"b":521.8698400000001,"coord_origin":"1"}},{"id":110,"text":"in the Image-to-Text networks. The easiest network archi-","bbox":{"l":308.86197,"t":524.91827,"r":545.11505,"b":533.82483,"coord_origin":"1"}},{"id":111,"text":"tectures are “image-encoder","bbox":{"l":308.86197,"t":536.87328,"r":420.94119,"b":545.77983,"coord_origin":"1"}},{"id":112,"text":"→","bbox":{"l":423.59497,"t":536.1559599999999,"r":433.5575600000001,"b":545.56065,"coord_origin":"1"}},{"id":113,"text":"text-decoder” (IETD), sim-","bbox":{"l":436.21198,"t":536.87328,"r":545.11316,"b":545.77983,"coord_origin":"1"}},{"id":114,"text":"ilar to network architectures that try to provide captions to","bbox":{"l":308.86197,"t":548.82828,"r":545.11511,"b":557.73483,"coord_origin":"1"}},{"id":115,"text":"images [32]. In these IETD networks, one expects as output","bbox":{"l":308.86197,"t":560.78328,"r":545.11493,"b":569.68983,"coord_origin":"1"}},{"id":116,"text":"the LaTeX/HTML string of the entire table, i.e. the sym-","bbox":{"l":308.86197,"t":572.73828,"r":545.11499,"b":581.6448399999999,"coord_origin":"1"}},{"id":117,"text":"bols necessary for creating the table with the content of the","bbox":{"l":308.86197,"t":584.69427,"r":545.11505,"b":593.60083,"coord_origin":"1"}},{"id":118,"text":"table. Another approach is the “image-encoder","bbox":{"l":308.86197,"t":596.6492800000001,"r":497.07541,"b":605.55583,"coord_origin":"1"}},{"id":119,"text":"→","bbox":{"l":499.80496,"t":595.93196,"r":509.76755,"b":605.33665,"coord_origin":"1"}},{"id":120,"text":"dual de-","bbox":{"l":512.50098,"t":596.6492800000001,"r":545.10852,"b":605.55583,"coord_origin":"1"}},{"id":121,"text":"coder” (IEDD) networks. In these type of networks, one has","bbox":{"l":308.86197,"t":608.60428,"r":545.11511,"b":617.5108299999999,"coord_origin":"1"}},{"id":122,"text":"two consecutive decoders with different purposes. The first","bbox":{"l":308.86197,"t":620.55928,"r":545.11505,"b":629.46584,"coord_origin":"1"}},{"id":123,"text":"decoder is the","bbox":{"l":308.86197,"t":632.51428,"r":364.78201,"b":641.42084,"coord_origin":"1"}},{"id":124,"text":"tag-decoder","bbox":{"l":367.57397,"t":632.60394,"r":415.61362,"b":641.1917,"coord_origin":"1"}},{"id":125,"text":", i.e. it only produces the HTM-","bbox":{"l":415.61298,"t":632.51428,"r":545.11688,"b":641.42084,"coord_origin":"1"}},{"id":126,"text":"L/LaTeX tags which construct an empty table. The second","bbox":{"l":308.86197,"t":644.46928,"r":545.11511,"b":653.37584,"coord_origin":"1"}},{"id":127,"text":"content-decoder","bbox":{"l":308.86197,"t":656.51494,"r":373.59894,"b":665.1027,"coord_origin":"1"}},{"id":128,"text":"uses the encoding of the image in combi-","bbox":{"l":376.90698,"t":656.4252799999999,"r":545.11548,"b":665.33184,"coord_origin":"1"}},{"id":129,"text":"nation with the output encoding of each cell-tag (from the","bbox":{"l":308.862,"t":668.38028,"r":545.11517,"b":677.28684,"coord_origin":"1"}},{"id":130,"text":"tag-decoder","bbox":{"l":308.862,"t":680.42494,"r":356.90164,"b":689.0127,"coord_origin":"1"}},{"id":131,"text":") to generate the textual content of each table","bbox":{"l":357.13101,"t":680.33528,"r":545.1153,"b":689.24184,"coord_origin":"1"}},{"id":132,"text":"cell. The network architecture of IEDD is certainly more","bbox":{"l":308.862,"t":692.290283,"r":545.11511,"b":701.196846,"coord_origin":"1"}},{"id":133,"text":"elaborate, but it has the advantage that one can pre-train the","bbox":{"l":308.862,"t":704.245285,"r":545.11517,"b":713.151848,"coord_origin":"1"}},{"id":134,"text":"2","bbox":{"l":295.121,"t":734.133282,"r":300.10229,"b":743.039845,"coord_origin":"1"}}],"predictions":{"layout":{"clusters":[{"id":0,"label":"Text","bbox":{"l":49.3856703042984,"t":74.23337373733523,"r":286.36505,"b":96.06994999999995,"coord_origin":"1"},"confidence":0.9677466154098511,"cells":[{"id":0,"text":"considered as a solved problem, given enough ground-truth","bbox":{"l":50.112,"t":75.20836999999995,"r":286.36505,"b":84.11492999999996,"coord_origin":"1"}},{"id":1,"text":"data to train on.","bbox":{"l":50.112,"t":87.16339000000005,"r":112.64721999999999,"b":96.06994999999995,"coord_origin":"1"}}]},{"id":1,"label":"Text","bbox":{"l":49.32149448394775,"t":98.4006237030029,"r":286.36514,"b":228.18181228637695,"coord_origin":"1"},"confidence":0.9865381717681885,"cells":[{"id":2,"text":"The second problem is called table-structure decompo-","bbox":{"l":62.067001,"t":99.57141000000001,"r":286.36496,"b":108.47797000000003,"coord_origin":"1"}},{"id":3,"text":"sition.","bbox":{"l":50.112,"t":111.52643,"r":74.749512,"b":120.43297999999993,"coord_origin":"1"}},{"id":4,"text":"The latter is a long standing problem in the com-","bbox":{"l":81.334793,"t":111.52643,"r":286.36514,"b":120.43297999999993,"coord_origin":"1"}},{"id":5,"text":"munity of document understanding [6, 4, 14]. Contrary to","bbox":{"l":50.112,"t":123.48145,"r":286.36511,"b":132.38800000000003,"coord_origin":"1"}},{"id":6,"text":"the table-location problem, there are no commonly used ap-","bbox":{"l":50.112,"t":135.43646,"r":286.36511,"b":144.34302000000002,"coord_origin":"1"}},{"id":7,"text":"proaches that can easily be re-purposed to solve this prob-","bbox":{"l":50.112,"t":147.39246000000003,"r":286.36505,"b":156.29900999999995,"coord_origin":"1"}},{"id":8,"text":"lem. Lately, a set of new model-architectures has been pro-","bbox":{"l":50.112,"t":159.34747000000004,"r":286.36511,"b":168.25402999999994,"coord_origin":"1"}},{"id":9,"text":"posed by the community to address table-structure decom-","bbox":{"l":50.112,"t":171.30249000000003,"r":286.36508,"b":180.20905000000005,"coord_origin":"1"}},{"id":10,"text":"position [37, 36, 18, 20]. All these models have some weak-","bbox":{"l":50.112,"t":183.25751000000002,"r":286.36511,"b":192.16405999999995,"coord_origin":"1"}},{"id":11,"text":"nesses (see Sec. 2). The common denominator here is the","bbox":{"l":50.112,"t":195.21252000000004,"r":286.36508,"b":204.11908000000005,"coord_origin":"1"}},{"id":12,"text":"reliance on textual features and/or the inability to provide","bbox":{"l":50.112,"t":207.16754000000003,"r":286.36514,"b":216.07410000000004,"coord_origin":"1"}},{"id":13,"text":"the bounding box of each table-cell in the original image.","bbox":{"l":50.112,"t":219.12354000000005,"r":278.66397,"b":228.03008999999997,"coord_origin":"1"}}]},{"id":2,"label":"Text","bbox":{"l":49.250409722328186,"t":230.44071807861326,"r":286.436358833313,"b":371.94507,"coord_origin":"1"},"confidence":0.9869692921638489,"cells":[{"id":14,"text":"In this paper, we want to address these weaknesses and","bbox":{"l":62.067001,"t":231.53156,"r":286.36493,"b":240.43811000000005,"coord_origin":"1"}},{"id":15,"text":"present a robust table-structure decomposition algorithm.","bbox":{"l":50.112,"t":243.48657000000003,"r":286.36511,"b":252.39313000000004,"coord_origin":"1"}},{"id":16,"text":"The design criteria for our model are the following. First,","bbox":{"l":50.112,"t":255.44159000000002,"r":286.36511,"b":264.34813999999994,"coord_origin":"1"}},{"id":17,"text":"we want our algorithm to be language agnostic. In this way,","bbox":{"l":50.112,"t":267.39661,"r":286.36502,"b":276.30316000000005,"coord_origin":"1"}},{"id":18,"text":"we can obtain the structure of any table, irregardless of the","bbox":{"l":50.112,"t":279.35155999999995,"r":286.36508,"b":288.25815,"coord_origin":"1"}},{"id":19,"text":"language.","bbox":{"l":50.112,"t":291.30759,"r":88.567635,"b":300.21414,"coord_origin":"1"}},{"id":20,"text":"Second, we want our algorithm to leverage as","bbox":{"l":95.501602,"t":291.30759,"r":286.36505,"b":300.21414,"coord_origin":"1"}},{"id":21,"text":"much data as possible from the original PDF document. For","bbox":{"l":50.112,"t":303.26257,"r":286.36508,"b":312.16913,"coord_origin":"1"}},{"id":22,"text":"programmatic PDF documents, the text-cells can often be","bbox":{"l":50.112,"t":315.21756,"r":286.36511,"b":324.12411,"coord_origin":"1"}},{"id":23,"text":"extracted much faster and with higher accuracy compared","bbox":{"l":50.112,"t":327.17255,"r":286.36505,"b":336.0791,"coord_origin":"1"}},{"id":24,"text":"to OCR methods. Last but not least, we want to have a di-","bbox":{"l":50.112,"t":339.12753,"r":286.36511,"b":348.03409,"coord_origin":"1"}},{"id":25,"text":"rect link between the table-cell and its bounding box in the","bbox":{"l":50.112,"t":351.08353,"r":286.36508,"b":359.99008,"coord_origin":"1"}},{"id":26,"text":"image.","bbox":{"l":50.112,"t":363.03851,"r":76.951241,"b":371.94507,"coord_origin":"1"}}]},{"id":3,"label":"Text","bbox":{"l":49.56145799160004,"t":374.04506263732907,"r":286.36658,"b":432.173,"coord_origin":"1"},"confidence":0.9837217330932617,"cells":[{"id":27,"text":"To meet the design criteria listed above, we developed a","bbox":{"l":62.067001,"t":375.4465,"r":286.36499,"b":384.35306,"coord_origin":"1"}},{"id":28,"text":"new model called","bbox":{"l":50.112,"t":387.40149,"r":120.98594,"b":396.30804,"coord_origin":"1"}},{"id":29,"text":"TableFormer","bbox":{"l":123.901,"t":387.28192,"r":179.7314,"b":396.23830999999996,"coord_origin":"1"}},{"id":30,"text":"and a synthetically gener-","bbox":{"l":182.646,"t":387.40149,"r":286.36658,"b":396.30804,"coord_origin":"1"}},{"id":31,"text":"ated table structure dataset called","bbox":{"l":50.112,"t":399.35648,"r":181.75778,"b":408.26302999999996,"coord_origin":"1"}},{"id":32,"text":"SynthTabNet","bbox":{"l":184.104,"t":399.23690999999997,"r":240.2034,"b":408.1933,"coord_origin":"1"}},{"id":33,"text":"$^{1}$. In partic-","bbox":{"l":240.20401,"t":399.35648,"r":286.36069,"b":408.26302999999996,"coord_origin":"1"}},{"id":34,"text":"ular, our contributions in this work can be summarised as","bbox":{"l":50.112015,"t":411.31146,"r":286.36511,"b":420.21802,"coord_origin":"1"}},{"id":35,"text":"follows:","bbox":{"l":50.112015,"t":423.26645,"r":82.520355,"b":432.173,"coord_origin":"1"}}]},{"id":4,"label":"List-item","bbox":{"l":61.25934247970581,"t":443.2520462036133,"r":286.6215797424316,"b":489.61486587524416,"coord_origin":"1"},"confidence":0.9750838279724121,"cells":[{"id":36,"text":"•","bbox":{"l":61.569016,"t":444.55145,"r":70.741714,"b":453.45801,"coord_origin":"1"}},{"id":37,"text":"We propose","bbox":{"l":73.034889,"t":444.55145,"r":117.10054,"b":453.45801,"coord_origin":"1"}},{"id":38,"text":"TableFormer","bbox":{"l":119.59001,"t":444.43188,"r":175.42041,"b":453.38828,"coord_origin":"1"}},{"id":39,"text":", a transformer based model","bbox":{"l":175.42102,"t":444.55145,"r":286.36453,"b":453.45801,"coord_origin":"1"}},{"id":40,"text":"that predicts tables structure and bounding boxes for","bbox":{"l":70.037018,"t":456.50644000000005,"r":286.3649,"b":465.41299,"coord_origin":"1"}},{"id":41,"text":"the table content simultaneously in an end-to-end ap-","bbox":{"l":70.037018,"t":468.46143,"r":286.3649,"b":477.36798,"coord_origin":"1"}},{"id":42,"text":"proach.","bbox":{"l":70.037018,"t":480.41641,"r":99.635902,"b":489.32297,"coord_origin":"1"}}]},{"id":5,"label":"List-item","bbox":{"l":61.222047328948975,"t":500.8945152282715,"r":286.3649,"b":547.1242492675782,"coord_origin":"1"},"confidence":0.9774568676948547,"cells":[{"id":43,"text":"•","bbox":{"l":61.569016,"t":502.15341,"r":71.619438,"b":511.05997,"coord_origin":"1"}},{"id":44,"text":"Across all benchmark datasets","bbox":{"l":74.132042,"t":502.15341,"r":196.10396,"b":511.05997,"coord_origin":"1"}},{"id":45,"text":"TableFormer","bbox":{"l":200.31001,"t":502.03384,"r":256.14041,"b":510.99023,"coord_origin":"1"}},{"id":46,"text":"signif-","bbox":{"l":260.35001,"t":502.15341,"r":286.36237,"b":511.05997,"coord_origin":"1"}},{"id":47,"text":"icantly outperforms existing state-of-the-art metrics,","bbox":{"l":70.037003,"t":514.1084000000001,"r":286.3649,"b":523.01495,"coord_origin":"1"}},{"id":48,"text":"while being much more efficient in training and infer-","bbox":{"l":70.037003,"t":526.06439,"r":286.36487,"b":534.97095,"coord_origin":"1"}},{"id":49,"text":"ence to existing works.","bbox":{"l":70.037003,"t":538.0193899999999,"r":161.65305,"b":546.9259500000001,"coord_origin":"1"}}]},{"id":6,"label":"List-item","bbox":{"l":61.282082891464235,"t":558.6514892578125,"r":286.36493,"b":592.8100090026855,"coord_origin":"1"},"confidence":0.9757609367370605,"cells":[{"id":50,"text":"•","bbox":{"l":61.569,"t":559.75639,"r":71.115913,"b":568.66295,"coord_origin":"1"}},{"id":51,"text":"We present","bbox":{"l":73.502647,"t":559.75639,"r":116.71199,"b":568.66295,"coord_origin":"1"}},{"id":52,"text":"SynthTabNet","bbox":{"l":121.583,"t":559.63684,"r":177.68239,"b":568.59322,"coord_origin":"1"}},{"id":53,"text":"a synthetically generated","bbox":{"l":182.55301,"t":559.75639,"r":286.36328,"b":568.66295,"coord_origin":"1"}},{"id":54,"text":"dataset, with various appearance styles and complex-","bbox":{"l":70.03701,"t":571.7114,"r":286.36493,"b":580.6179500000001,"coord_origin":"1"}},{"id":55,"text":"ity.","bbox":{"l":70.03701,"t":583.6664000000001,"r":82.400597,"b":592.57295,"coord_origin":"1"}}]},{"id":7,"label":"List-item","bbox":{"l":61.13663399219513,"t":604.1612617492675,"r":286.36508,"b":638.7555198669434,"coord_origin":"1"},"confidence":0.975217878818512,"cells":[{"id":56,"text":"•","bbox":{"l":61.569008000000004,"t":605.4034,"r":72.332527,"b":614.30995,"coord_origin":"1"}},{"id":57,"text":"An augmented dataset based on PubTabNet [37],","bbox":{"l":75.023399,"t":605.4034,"r":286.36508,"b":614.30995,"coord_origin":"1"}},{"id":58,"text":"FinTabNet [36], and TableBank [17] with generated","bbox":{"l":70.03701,"t":617.3584,"r":286.36487,"b":626.26495,"coord_origin":"1"}},{"id":59,"text":"ground-truth for reproducibility.","bbox":{"l":70.03701,"t":629.31439,"r":198.05641,"b":638.22095,"coord_origin":"1"}}]},{"id":8,"label":"Text","bbox":{"l":49.23523485660553,"t":649.6770011901855,"r":286.6603340148926,"b":695.579761505127,"coord_origin":"1"},"confidence":0.963869571685791,"cells":[{"id":60,"text":"The paper is structured as follows. In Sec. 2, we give","bbox":{"l":62.067009000000006,"t":650.59839,"r":286.36496,"b":659.50494,"coord_origin":"1"}},{"id":61,"text":"a brief overview of the current state-of-the-art. In Sec. 3,","bbox":{"l":50.112007,"t":662.55339,"r":286.36511,"b":671.45995,"coord_origin":"1"}},{"id":62,"text":"we describe the datasets on which we train. In Sec. 4, we","bbox":{"l":50.112007,"t":674.50839,"r":286.36511,"b":683.41496,"coord_origin":"1"}},{"id":63,"text":"introduce the TableFormer model-architecture and describe","bbox":{"l":50.112007,"t":686.46339,"r":286.36511,"b":695.369957,"coord_origin":"1"}}]},{"id":9,"label":"Footnote","bbox":{"l":60.97100100000001,"t":704.329801940918,"r":183.73055,"b":713.4629356384277,"coord_origin":"1"},"confidence":0.8912795782089233,"cells":[{"id":64,"text":"$^{1}$https://github.com/IBM/SynthTabNet","bbox":{"l":60.97100100000001,"t":705.596275,"r":183.73055,"b":712.721542,"coord_origin":"1"}}]},{"id":10,"label":"Text","bbox":{"l":308.2300924301147,"t":74.18210706710818,"r":545.4613071441651,"b":108.43959131240842,"coord_origin":"1"},"confidence":0.9728982448577881,"cells":[{"id":65,"text":"its results & performance in Sec. 5. As a conclusion, we de-","bbox":{"l":308.862,"t":75.20836999999995,"r":545.11511,"b":84.11492999999996,"coord_origin":"1"}},{"id":66,"text":"scribe how this new model-architecture can be re-purposed","bbox":{"l":308.862,"t":87.16339000000005,"r":545.11505,"b":96.06994999999995,"coord_origin":"1"}},{"id":67,"text":"for other tasks in the computer-vision community.","bbox":{"l":308.862,"t":99.11841000000004,"r":508.08417000000003,"b":108.02495999999985,"coord_origin":"1"}}]},{"id":11,"label":"Section-header","bbox":{"l":307.962664604187,"t":120.99532756805422,"r":498.4370830535889,"b":132.47968000000003,"coord_origin":"1"},"confidence":0.9394794702529907,"cells":[{"id":68,"text":"2.","bbox":{"l":308.862,"t":121.73193000000003,"r":315.5831,"b":132.47968000000003,"coord_origin":"1"}},{"id":69,"text":"Previous work and State of the Art","bbox":{"l":324.54456,"t":121.73193000000003,"r":498.28021,"b":132.47968000000003,"coord_origin":"1"}}]},{"id":12,"label":"Text","bbox":{"l":307.7052463531494,"t":140.9155918121338,"r":545.4184089660645,"b":330.45502,"coord_origin":"1"},"confidence":0.9880366921424866,"cells":[{"id":70,"text":"Identifying the structure of a table has been an outstand-","bbox":{"l":320.81699,"t":142.22136999999998,"r":545.11493,"b":151.12793,"coord_origin":"1"}},{"id":71,"text":"ing problem in the document-parsing community, that mo-","bbox":{"l":308.862,"t":154.17638999999997,"r":545.11505,"b":163.08294999999998,"coord_origin":"1"}},{"id":72,"text":"tivates many organised public challenges [6, 4, 14].","bbox":{"l":308.862,"t":166.13140999999996,"r":522.55975,"b":175.03796,"coord_origin":"1"}},{"id":73,"text":"The","bbox":{"l":529.62323,"t":166.13140999999996,"r":545.11505,"b":175.03796,"coord_origin":"1"}},{"id":74,"text":"difficulty of the problem can be attributed to a number of","bbox":{"l":308.862,"t":178.08642999999995,"r":545.11517,"b":186.99298,"coord_origin":"1"}},{"id":75,"text":"factors. First, there is a large variety in the shapes and sizes","bbox":{"l":308.862,"t":190.04143999999997,"r":545.11511,"b":198.94799999999998,"coord_origin":"1"}},{"id":76,"text":"of tables.","bbox":{"l":308.862,"t":201.99645999999996,"r":346.97891,"b":210.90301999999997,"coord_origin":"1"}},{"id":77,"text":"Such large variety requires a flexible method.","bbox":{"l":354.86929,"t":201.99645999999996,"r":545.11511,"b":210.90301999999997,"coord_origin":"1"}},{"id":78,"text":"This is especially true for complex column- and row head-","bbox":{"l":308.862,"t":213.95245,"r":545.11505,"b":222.85901,"coord_origin":"1"}},{"id":79,"text":"ers, which can be extremely intricate and demanding.","bbox":{"l":308.862,"t":225.90747,"r":530.9184,"b":234.81403,"coord_origin":"1"}},{"id":80,"text":"A","bbox":{"l":537.92212,"t":225.90747,"r":545.11511,"b":234.81403,"coord_origin":"1"}},{"id":81,"text":"second factor of complexity is the lack of data with regard","bbox":{"l":308.862,"t":237.86248999999998,"r":545.11517,"b":246.76904000000002,"coord_origin":"1"}},{"id":82,"text":"to table-structure. Until the publication of PubTabNet [37],","bbox":{"l":308.862,"t":249.8175,"r":545.11511,"b":258.72406,"coord_origin":"1"}},{"id":83,"text":"there were no large datasets (i.e.","bbox":{"l":308.862,"t":261.77252,"r":439.8402699999999,"b":270.67908,"coord_origin":"1"}},{"id":84,"text":">","bbox":{"l":444.43999999999994,"t":261.61310000000003,"r":452.1889,"b":270.45989999999995,"coord_origin":"1"}},{"id":85,"text":"100","bbox":{"l":455.89001,"t":261.61310000000003,"r":470.83392000000003,"b":270.45989999999995,"coord_origin":"1"}},{"id":86,"text":"K tables) that pro-","bbox":{"l":470.83401,"t":261.77252,"r":545.11517,"b":270.67908,"coord_origin":"1"}},{"id":87,"text":"vided structure information. This happens primarily due to","bbox":{"l":308.862,"t":273.72748,"r":545.11511,"b":282.63406,"coord_origin":"1"}},{"id":88,"text":"the fact that tables are notoriously time-consuming to an-","bbox":{"l":308.862,"t":285.6835,"r":545.11511,"b":294.59006,"coord_origin":"1"}},{"id":89,"text":"notate by hand. However, this has definitely changed in re-","bbox":{"l":308.862,"t":297.63849,"r":545.11511,"b":306.54504,"coord_origin":"1"}},{"id":90,"text":"cent years with the deliverance of PubTabNet [37], FinTab-","bbox":{"l":308.862,"t":309.59348,"r":545.11517,"b":318.50003000000004,"coord_origin":"1"}},{"id":91,"text":"Net [36], TableBank [17] etc.","bbox":{"l":308.862,"t":321.54846,"r":425.92255,"b":330.45502,"coord_origin":"1"}}]},{"id":13,"label":"Text","bbox":{"l":307.6594591140747,"t":332.2529640197754,"r":545.287671661377,"b":450.0729099999999,"coord_origin":"1"},"confidence":0.988025426864624,"cells":[{"id":92,"text":"Before the rising popularity of deep neural networks,","bbox":{"l":320.81699,"t":333.56946,"r":545.11499,"b":342.47601,"coord_origin":"1"}},{"id":93,"text":"the community relied heavily on heuristic and/or statistical","bbox":{"l":308.862,"t":345.52444,"r":545.11499,"b":354.43100000000004,"coord_origin":"1"}},{"id":94,"text":"methods to do table structure identification [3, 7, 11, 5, 13,","bbox":{"l":308.862,"t":357.47943,"r":545.11517,"b":366.38599,"coord_origin":"1"}},{"id":95,"text":"28]. Although such methods work well on constrained ta-","bbox":{"l":308.862,"t":369.43542,"r":545.11511,"b":378.34198,"coord_origin":"1"}},{"id":96,"text":"bles [12], a more data-driven approach can be applied due","bbox":{"l":308.862,"t":381.39041,"r":545.11505,"b":390.29697,"coord_origin":"1"}},{"id":97,"text":"to the advent of convolutional neural networks (CNNs) and","bbox":{"l":308.862,"t":393.3453999999999,"r":545.11505,"b":402.25195,"coord_origin":"1"}},{"id":98,"text":"the availability of large datasets. To the best-of-our knowl-","bbox":{"l":308.862,"t":405.30038,"r":545.11517,"b":414.20694,"coord_origin":"1"}},{"id":99,"text":"edge, there are currently two different types of network ar-","bbox":{"l":308.862,"t":417.25537,"r":545.11523,"b":426.16193,"coord_origin":"1"}},{"id":100,"text":"chitecture that are being pursued for state-of-the-art table-","bbox":{"l":308.862,"t":429.21136000000007,"r":545.11511,"b":438.11792,"coord_origin":"1"}},{"id":101,"text":"structure identification.","bbox":{"l":308.862,"t":441.16635,"r":401.28503,"b":450.0729099999999,"coord_origin":"1"}}]},{"id":14,"label":"Text","bbox":{"l":307.7598466873169,"t":451.91541481018066,"r":545.4887969970704,"b":713.2906494140625,"coord_origin":"1"},"confidence":0.9885419607162476,"cells":[{"id":102,"text":"Image-to-Text networks","bbox":{"l":320.81699,"t":453.06778,"r":423.26236,"b":462.02417,"coord_origin":"1"}},{"id":103,"text":": In this type of network, one","bbox":{"l":423.26697,"t":453.18735,"r":545.10956,"b":462.0939,"coord_origin":"1"}},{"id":104,"text":"predicts a sequence of tokens starting from an encoded","bbox":{"l":308.86197,"t":465.14233,"r":545.11511,"b":474.04889,"coord_origin":"1"}},{"id":105,"text":"image.","bbox":{"l":308.86197,"t":477.09732,"r":335.7012,"b":486.00388,"coord_origin":"1"}},{"id":106,"text":"Such sequences of tokens can be HTML table","bbox":{"l":345.85309,"t":477.09732,"r":545.11505,"b":486.00388,"coord_origin":"1"}},{"id":107,"text":"tags [37, 17] or LaTeX symbols[10]. The choice of sym-","bbox":{"l":308.86197,"t":489.05231,"r":545.11493,"b":497.95886,"coord_origin":"1"}},{"id":108,"text":"bols is ultimately not very important, since one can be trans-","bbox":{"l":308.86197,"t":501.00729,"r":545.11499,"b":509.91385,"coord_origin":"1"}},{"id":109,"text":"formed into the other. There are however subtle variations","bbox":{"l":308.86197,"t":512.9632899999999,"r":545.11505,"b":521.8698400000001,"coord_origin":"1"}},{"id":110,"text":"in the Image-to-Text networks. The easiest network archi-","bbox":{"l":308.86197,"t":524.91827,"r":545.11505,"b":533.82483,"coord_origin":"1"}},{"id":111,"text":"tectures are “image-encoder","bbox":{"l":308.86197,"t":536.87328,"r":420.94119,"b":545.77983,"coord_origin":"1"}},{"id":112,"text":"→","bbox":{"l":423.59497,"t":536.1559599999999,"r":433.5575600000001,"b":545.56065,"coord_origin":"1"}},{"id":113,"text":"text-decoder” (IETD), sim-","bbox":{"l":436.21198,"t":536.87328,"r":545.11316,"b":545.77983,"coord_origin":"1"}},{"id":114,"text":"ilar to network architectures that try to provide captions to","bbox":{"l":308.86197,"t":548.82828,"r":545.11511,"b":557.73483,"coord_origin":"1"}},{"id":115,"text":"images [32]. In these IETD networks, one expects as output","bbox":{"l":308.86197,"t":560.78328,"r":545.11493,"b":569.68983,"coord_origin":"1"}},{"id":116,"text":"the LaTeX/HTML string of the entire table, i.e. the sym-","bbox":{"l":308.86197,"t":572.73828,"r":545.11499,"b":581.6448399999999,"coord_origin":"1"}},{"id":117,"text":"bols necessary for creating the table with the content of the","bbox":{"l":308.86197,"t":584.69427,"r":545.11505,"b":593.60083,"coord_origin":"1"}},{"id":118,"text":"table. Another approach is the “image-encoder","bbox":{"l":308.86197,"t":596.6492800000001,"r":497.07541,"b":605.55583,"coord_origin":"1"}},{"id":119,"text":"→","bbox":{"l":499.80496,"t":595.93196,"r":509.76755,"b":605.33665,"coord_origin":"1"}},{"id":120,"text":"dual de-","bbox":{"l":512.50098,"t":596.6492800000001,"r":545.10852,"b":605.55583,"coord_origin":"1"}},{"id":121,"text":"coder” (IEDD) networks. In these type of networks, one has","bbox":{"l":308.86197,"t":608.60428,"r":545.11511,"b":617.5108299999999,"coord_origin":"1"}},{"id":122,"text":"two consecutive decoders with different purposes. The first","bbox":{"l":308.86197,"t":620.55928,"r":545.11505,"b":629.46584,"coord_origin":"1"}},{"id":123,"text":"decoder is the","bbox":{"l":308.86197,"t":632.51428,"r":364.78201,"b":641.42084,"coord_origin":"1"}},{"id":124,"text":"tag-decoder","bbox":{"l":367.57397,"t":632.60394,"r":415.61362,"b":641.1917,"coord_origin":"1"}},{"id":125,"text":", i.e. it only produces the HTM-","bbox":{"l":415.61298,"t":632.51428,"r":545.11688,"b":641.42084,"coord_origin":"1"}},{"id":126,"text":"L/LaTeX tags which construct an empty table. The second","bbox":{"l":308.86197,"t":644.46928,"r":545.11511,"b":653.37584,"coord_origin":"1"}},{"id":127,"text":"content-decoder","bbox":{"l":308.86197,"t":656.51494,"r":373.59894,"b":665.1027,"coord_origin":"1"}},{"id":128,"text":"uses the encoding of the image in combi-","bbox":{"l":376.90698,"t":656.4252799999999,"r":545.11548,"b":665.33184,"coord_origin":"1"}},{"id":129,"text":"nation with the output encoding of each cell-tag (from the","bbox":{"l":308.862,"t":668.38028,"r":545.11517,"b":677.28684,"coord_origin":"1"}},{"id":130,"text":"tag-decoder","bbox":{"l":308.862,"t":680.42494,"r":356.90164,"b":689.0127,"coord_origin":"1"}},{"id":131,"text":") to generate the textual content of each table","bbox":{"l":357.13101,"t":680.33528,"r":545.1153,"b":689.24184,"coord_origin":"1"}},{"id":132,"text":"cell. The network architecture of IEDD is certainly more","bbox":{"l":308.862,"t":692.290283,"r":545.11511,"b":701.196846,"coord_origin":"1"}},{"id":133,"text":"elaborate, but it has the advantage that one can pre-train the","bbox":{"l":308.862,"t":704.245285,"r":545.11517,"b":713.151848,"coord_origin":"1"}}]},{"id":15,"label":"Page-footer","bbox":{"l":294.621068572998,"t":733.353044128418,"r":300.12242431640624,"b":743.039845,"coord_origin":"1"},"confidence":0.8898882269859314,"cells":[{"id":134,"text":"2","bbox":{"l":295.121,"t":734.133282,"r":300.10229,"b":743.039845,"coord_origin":"1"}}]}]},"tablestructure":{"table_map":{}},"figures_classification":null,"equations_prediction":null},"assembled":{"elements":[{"label":"Text","id":0,"page_no":1,"cluster":{"id":0,"label":"Text","bbox":{"l":49.3856703042984,"t":74.23337373733523,"r":286.36505,"b":96.06994999999995,"coord_origin":"1"},"confidence":0.9677466154098511,"cells":[{"id":0,"text":"considered as a solved problem, given enough ground-truth","bbox":{"l":50.112,"t":75.20836999999995,"r":286.36505,"b":84.11492999999996,"coord_origin":"1"}},{"id":1,"text":"data to train on.","bbox":{"l":50.112,"t":87.16339000000005,"r":112.64721999999999,"b":96.06994999999995,"coord_origin":"1"}}]},"text":"considered as a solved problem, given enough ground-truth data to train on."},{"label":"Text","id":1,"page_no":1,"cluster":{"id":1,"label":"Text","bbox":{"l":49.32149448394775,"t":98.4006237030029,"r":286.36514,"b":228.18181228637695,"coord_origin":"1"},"confidence":0.9865381717681885,"cells":[{"id":2,"text":"The second problem is called table-structure decompo-","bbox":{"l":62.067001,"t":99.57141000000001,"r":286.36496,"b":108.47797000000003,"coord_origin":"1"}},{"id":3,"text":"sition.","bbox":{"l":50.112,"t":111.52643,"r":74.749512,"b":120.43297999999993,"coord_origin":"1"}},{"id":4,"text":"The latter is a long standing problem in the com-","bbox":{"l":81.334793,"t":111.52643,"r":286.36514,"b":120.43297999999993,"coord_origin":"1"}},{"id":5,"text":"munity of document understanding [6, 4, 14]. Contrary to","bbox":{"l":50.112,"t":123.48145,"r":286.36511,"b":132.38800000000003,"coord_origin":"1"}},{"id":6,"text":"the table-location problem, there are no commonly used ap-","bbox":{"l":50.112,"t":135.43646,"r":286.36511,"b":144.34302000000002,"coord_origin":"1"}},{"id":7,"text":"proaches that can easily be re-purposed to solve this prob-","bbox":{"l":50.112,"t":147.39246000000003,"r":286.36505,"b":156.29900999999995,"coord_origin":"1"}},{"id":8,"text":"lem. Lately, a set of new model-architectures has been pro-","bbox":{"l":50.112,"t":159.34747000000004,"r":286.36511,"b":168.25402999999994,"coord_origin":"1"}},{"id":9,"text":"posed by the community to address table-structure decom-","bbox":{"l":50.112,"t":171.30249000000003,"r":286.36508,"b":180.20905000000005,"coord_origin":"1"}},{"id":10,"text":"position [37, 36, 18, 20]. All these models have some weak-","bbox":{"l":50.112,"t":183.25751000000002,"r":286.36511,"b":192.16405999999995,"coord_origin":"1"}},{"id":11,"text":"nesses (see Sec. 2). The common denominator here is the","bbox":{"l":50.112,"t":195.21252000000004,"r":286.36508,"b":204.11908000000005,"coord_origin":"1"}},{"id":12,"text":"reliance on textual features and/or the inability to provide","bbox":{"l":50.112,"t":207.16754000000003,"r":286.36514,"b":216.07410000000004,"coord_origin":"1"}},{"id":13,"text":"the bounding box of each table-cell in the original image.","bbox":{"l":50.112,"t":219.12354000000005,"r":278.66397,"b":228.03008999999997,"coord_origin":"1"}}]},"text":"The second problem is called table-structure decomposition. The latter is a long standing problem in the community of document understanding [6, 4, 14]. Contrary to the table-location problem, there are no commonly used approaches that can easily be re-purposed to solve this problem. Lately, a set of new model-architectures has been proposed by the community to address table-structure decomposition [37, 36, 18, 20]. All these models have some weaknesses (see Sec. 2). The common denominator here is the reliance on textual features and/or the inability to provide the bounding box of each table-cell in the original image."},{"label":"Text","id":2,"page_no":1,"cluster":{"id":2,"label":"Text","bbox":{"l":49.250409722328186,"t":230.44071807861326,"r":286.436358833313,"b":371.94507,"coord_origin":"1"},"confidence":0.9869692921638489,"cells":[{"id":14,"text":"In this paper, we want to address these weaknesses and","bbox":{"l":62.067001,"t":231.53156,"r":286.36493,"b":240.43811000000005,"coord_origin":"1"}},{"id":15,"text":"present a robust table-structure decomposition algorithm.","bbox":{"l":50.112,"t":243.48657000000003,"r":286.36511,"b":252.39313000000004,"coord_origin":"1"}},{"id":16,"text":"The design criteria for our model are the following. First,","bbox":{"l":50.112,"t":255.44159000000002,"r":286.36511,"b":264.34813999999994,"coord_origin":"1"}},{"id":17,"text":"we want our algorithm to be language agnostic. In this way,","bbox":{"l":50.112,"t":267.39661,"r":286.36502,"b":276.30316000000005,"coord_origin":"1"}},{"id":18,"text":"we can obtain the structure of any table, irregardless of the","bbox":{"l":50.112,"t":279.35155999999995,"r":286.36508,"b":288.25815,"coord_origin":"1"}},{"id":19,"text":"language.","bbox":{"l":50.112,"t":291.30759,"r":88.567635,"b":300.21414,"coord_origin":"1"}},{"id":20,"text":"Second, we want our algorithm to leverage as","bbox":{"l":95.501602,"t":291.30759,"r":286.36505,"b":300.21414,"coord_origin":"1"}},{"id":21,"text":"much data as possible from the original PDF document. For","bbox":{"l":50.112,"t":303.26257,"r":286.36508,"b":312.16913,"coord_origin":"1"}},{"id":22,"text":"programmatic PDF documents, the text-cells can often be","bbox":{"l":50.112,"t":315.21756,"r":286.36511,"b":324.12411,"coord_origin":"1"}},{"id":23,"text":"extracted much faster and with higher accuracy compared","bbox":{"l":50.112,"t":327.17255,"r":286.36505,"b":336.0791,"coord_origin":"1"}},{"id":24,"text":"to OCR methods. Last but not least, we want to have a di-","bbox":{"l":50.112,"t":339.12753,"r":286.36511,"b":348.03409,"coord_origin":"1"}},{"id":25,"text":"rect link between the table-cell and its bounding box in the","bbox":{"l":50.112,"t":351.08353,"r":286.36508,"b":359.99008,"coord_origin":"1"}},{"id":26,"text":"image.","bbox":{"l":50.112,"t":363.03851,"r":76.951241,"b":371.94507,"coord_origin":"1"}}]},"text":"In this paper, we want to address these weaknesses and present a robust table-structure decomposition algorithm. The design criteria for our model are the following. First, we want our algorithm to be language agnostic. In this way, we can obtain the structure of any table, irregardless of the language. Second, we want our algorithm to leverage as much data as possible from the original PDF document. For programmatic PDF documents, the text-cells can often be extracted much faster and with higher accuracy compared to OCR methods. Last but not least, we want to have a direct link between the table-cell and its bounding box in the image."},{"label":"Text","id":3,"page_no":1,"cluster":{"id":3,"label":"Text","bbox":{"l":49.56145799160004,"t":374.04506263732907,"r":286.36658,"b":432.173,"coord_origin":"1"},"confidence":0.9837217330932617,"cells":[{"id":27,"text":"To meet the design criteria listed above, we developed a","bbox":{"l":62.067001,"t":375.4465,"r":286.36499,"b":384.35306,"coord_origin":"1"}},{"id":28,"text":"new model called","bbox":{"l":50.112,"t":387.40149,"r":120.98594,"b":396.30804,"coord_origin":"1"}},{"id":29,"text":"TableFormer","bbox":{"l":123.901,"t":387.28192,"r":179.7314,"b":396.23830999999996,"coord_origin":"1"}},{"id":30,"text":"and a synthetically gener-","bbox":{"l":182.646,"t":387.40149,"r":286.36658,"b":396.30804,"coord_origin":"1"}},{"id":31,"text":"ated table structure dataset called","bbox":{"l":50.112,"t":399.35648,"r":181.75778,"b":408.26302999999996,"coord_origin":"1"}},{"id":32,"text":"SynthTabNet","bbox":{"l":184.104,"t":399.23690999999997,"r":240.2034,"b":408.1933,"coord_origin":"1"}},{"id":33,"text":"$^{1}$. In partic-","bbox":{"l":240.20401,"t":399.35648,"r":286.36069,"b":408.26302999999996,"coord_origin":"1"}},{"id":34,"text":"ular, our contributions in this work can be summarised as","bbox":{"l":50.112015,"t":411.31146,"r":286.36511,"b":420.21802,"coord_origin":"1"}},{"id":35,"text":"follows:","bbox":{"l":50.112015,"t":423.26645,"r":82.520355,"b":432.173,"coord_origin":"1"}}]},"text":"To meet the design criteria listed above, we developed a new model called TableFormer and a synthetically generated table structure dataset called SynthTabNet $^{1}$. In particular, our contributions in this work can be summarised as follows:"},{"label":"List-item","id":4,"page_no":1,"cluster":{"id":4,"label":"List-item","bbox":{"l":61.25934247970581,"t":443.2520462036133,"r":286.6215797424316,"b":489.61486587524416,"coord_origin":"1"},"confidence":0.9750838279724121,"cells":[{"id":36,"text":"•","bbox":{"l":61.569016,"t":444.55145,"r":70.741714,"b":453.45801,"coord_origin":"1"}},{"id":37,"text":"We propose","bbox":{"l":73.034889,"t":444.55145,"r":117.10054,"b":453.45801,"coord_origin":"1"}},{"id":38,"text":"TableFormer","bbox":{"l":119.59001,"t":444.43188,"r":175.42041,"b":453.38828,"coord_origin":"1"}},{"id":39,"text":", a transformer based model","bbox":{"l":175.42102,"t":444.55145,"r":286.36453,"b":453.45801,"coord_origin":"1"}},{"id":40,"text":"that predicts tables structure and bounding boxes for","bbox":{"l":70.037018,"t":456.50644000000005,"r":286.3649,"b":465.41299,"coord_origin":"1"}},{"id":41,"text":"the table content simultaneously in an end-to-end ap-","bbox":{"l":70.037018,"t":468.46143,"r":286.3649,"b":477.36798,"coord_origin":"1"}},{"id":42,"text":"proach.","bbox":{"l":70.037018,"t":480.41641,"r":99.635902,"b":489.32297,"coord_origin":"1"}}]},"text":"• We propose TableFormer , a transformer based model that predicts tables structure and bounding boxes for the table content simultaneously in an end-to-end approach."},{"label":"List-item","id":5,"page_no":1,"cluster":{"id":5,"label":"List-item","bbox":{"l":61.222047328948975,"t":500.8945152282715,"r":286.3649,"b":547.1242492675782,"coord_origin":"1"},"confidence":0.9774568676948547,"cells":[{"id":43,"text":"•","bbox":{"l":61.569016,"t":502.15341,"r":71.619438,"b":511.05997,"coord_origin":"1"}},{"id":44,"text":"Across all benchmark datasets","bbox":{"l":74.132042,"t":502.15341,"r":196.10396,"b":511.05997,"coord_origin":"1"}},{"id":45,"text":"TableFormer","bbox":{"l":200.31001,"t":502.03384,"r":256.14041,"b":510.99023,"coord_origin":"1"}},{"id":46,"text":"signif-","bbox":{"l":260.35001,"t":502.15341,"r":286.36237,"b":511.05997,"coord_origin":"1"}},{"id":47,"text":"icantly outperforms existing state-of-the-art metrics,","bbox":{"l":70.037003,"t":514.1084000000001,"r":286.3649,"b":523.01495,"coord_origin":"1"}},{"id":48,"text":"while being much more efficient in training and infer-","bbox":{"l":70.037003,"t":526.06439,"r":286.36487,"b":534.97095,"coord_origin":"1"}},{"id":49,"text":"ence to existing works.","bbox":{"l":70.037003,"t":538.0193899999999,"r":161.65305,"b":546.9259500000001,"coord_origin":"1"}}]},"text":"• Across all benchmark datasets TableFormer significantly outperforms existing state-of-the-art metrics, while being much more efficient in training and inference to existing works."},{"label":"List-item","id":6,"page_no":1,"cluster":{"id":6,"label":"List-item","bbox":{"l":61.282082891464235,"t":558.6514892578125,"r":286.36493,"b":592.8100090026855,"coord_origin":"1"},"confidence":0.9757609367370605,"cells":[{"id":50,"text":"•","bbox":{"l":61.569,"t":559.75639,"r":71.115913,"b":568.66295,"coord_origin":"1"}},{"id":51,"text":"We present","bbox":{"l":73.502647,"t":559.75639,"r":116.71199,"b":568.66295,"coord_origin":"1"}},{"id":52,"text":"SynthTabNet","bbox":{"l":121.583,"t":559.63684,"r":177.68239,"b":568.59322,"coord_origin":"1"}},{"id":53,"text":"a synthetically generated","bbox":{"l":182.55301,"t":559.75639,"r":286.36328,"b":568.66295,"coord_origin":"1"}},{"id":54,"text":"dataset, with various appearance styles and complex-","bbox":{"l":70.03701,"t":571.7114,"r":286.36493,"b":580.6179500000001,"coord_origin":"1"}},{"id":55,"text":"ity.","bbox":{"l":70.03701,"t":583.6664000000001,"r":82.400597,"b":592.57295,"coord_origin":"1"}}]},"text":"• We present SynthTabNet a synthetically generated dataset, with various appearance styles and complexity."},{"label":"List-item","id":7,"page_no":1,"cluster":{"id":7,"label":"List-item","bbox":{"l":61.13663399219513,"t":604.1612617492675,"r":286.36508,"b":638.7555198669434,"coord_origin":"1"},"confidence":0.975217878818512,"cells":[{"id":56,"text":"•","bbox":{"l":61.569008000000004,"t":605.4034,"r":72.332527,"b":614.30995,"coord_origin":"1"}},{"id":57,"text":"An augmented dataset based on PubTabNet [37],","bbox":{"l":75.023399,"t":605.4034,"r":286.36508,"b":614.30995,"coord_origin":"1"}},{"id":58,"text":"FinTabNet [36], and TableBank [17] with generated","bbox":{"l":70.03701,"t":617.3584,"r":286.36487,"b":626.26495,"coord_origin":"1"}},{"id":59,"text":"ground-truth for reproducibility.","bbox":{"l":70.03701,"t":629.31439,"r":198.05641,"b":638.22095,"coord_origin":"1"}}]},"text":"• An augmented dataset based on PubTabNet [37], FinTabNet [36], and TableBank [17] with generated ground-truth for reproducibility."},{"label":"Text","id":8,"page_no":1,"cluster":{"id":8,"label":"Text","bbox":{"l":49.23523485660553,"t":649.6770011901855,"r":286.6603340148926,"b":695.579761505127,"coord_origin":"1"},"confidence":0.963869571685791,"cells":[{"id":60,"text":"The paper is structured as follows. In Sec. 2, we give","bbox":{"l":62.067009000000006,"t":650.59839,"r":286.36496,"b":659.50494,"coord_origin":"1"}},{"id":61,"text":"a brief overview of the current state-of-the-art. In Sec. 3,","bbox":{"l":50.112007,"t":662.55339,"r":286.36511,"b":671.45995,"coord_origin":"1"}},{"id":62,"text":"we describe the datasets on which we train. In Sec. 4, we","bbox":{"l":50.112007,"t":674.50839,"r":286.36511,"b":683.41496,"coord_origin":"1"}},{"id":63,"text":"introduce the TableFormer model-architecture and describe","bbox":{"l":50.112007,"t":686.46339,"r":286.36511,"b":695.369957,"coord_origin":"1"}}]},"text":"The paper is structured as follows. In Sec. 2, we give a brief overview of the current state-of-the-art. In Sec. 3, we describe the datasets on which we train. In Sec. 4, we introduce the TableFormer model-architecture and describe"},{"label":"Footnote","id":9,"page_no":1,"cluster":{"id":9,"label":"Footnote","bbox":{"l":60.97100100000001,"t":704.329801940918,"r":183.73055,"b":713.4629356384277,"coord_origin":"1"},"confidence":0.8912795782089233,"cells":[{"id":64,"text":"$^{1}$https://github.com/IBM/SynthTabNet","bbox":{"l":60.97100100000001,"t":705.596275,"r":183.73055,"b":712.721542,"coord_origin":"1"}}]},"text":"$^{1}$https://github.com/IBM/SynthTabNet"},{"label":"Text","id":10,"page_no":1,"cluster":{"id":10,"label":"Text","bbox":{"l":308.2300924301147,"t":74.18210706710818,"r":545.4613071441651,"b":108.43959131240842,"coord_origin":"1"},"confidence":0.9728982448577881,"cells":[{"id":65,"text":"its results & performance in Sec. 5. As a conclusion, we de-","bbox":{"l":308.862,"t":75.20836999999995,"r":545.11511,"b":84.11492999999996,"coord_origin":"1"}},{"id":66,"text":"scribe how this new model-architecture can be re-purposed","bbox":{"l":308.862,"t":87.16339000000005,"r":545.11505,"b":96.06994999999995,"coord_origin":"1"}},{"id":67,"text":"for other tasks in the computer-vision community.","bbox":{"l":308.862,"t":99.11841000000004,"r":508.08417000000003,"b":108.02495999999985,"coord_origin":"1"}}]},"text":"its results & performance in Sec. 5. As a conclusion, we describe how this new model-architecture can be re-purposed for other tasks in the computer-vision community."},{"label":"Section-header","id":11,"page_no":1,"cluster":{"id":11,"label":"Section-header","bbox":{"l":307.962664604187,"t":120.99532756805422,"r":498.4370830535889,"b":132.47968000000003,"coord_origin":"1"},"confidence":0.9394794702529907,"cells":[{"id":68,"text":"2.","bbox":{"l":308.862,"t":121.73193000000003,"r":315.5831,"b":132.47968000000003,"coord_origin":"1"}},{"id":69,"text":"Previous work and State of the Art","bbox":{"l":324.54456,"t":121.73193000000003,"r":498.28021,"b":132.47968000000003,"coord_origin":"1"}}]},"text":"2. Previous work and State of the Art"},{"label":"Text","id":12,"page_no":1,"cluster":{"id":12,"label":"Text","bbox":{"l":307.7052463531494,"t":140.9155918121338,"r":545.4184089660645,"b":330.45502,"coord_origin":"1"},"confidence":0.9880366921424866,"cells":[{"id":70,"text":"Identifying the structure of a table has been an outstand-","bbox":{"l":320.81699,"t":142.22136999999998,"r":545.11493,"b":151.12793,"coord_origin":"1"}},{"id":71,"text":"ing problem in the document-parsing community, that mo-","bbox":{"l":308.862,"t":154.17638999999997,"r":545.11505,"b":163.08294999999998,"coord_origin":"1"}},{"id":72,"text":"tivates many organised public challenges [6, 4, 14].","bbox":{"l":308.862,"t":166.13140999999996,"r":522.55975,"b":175.03796,"coord_origin":"1"}},{"id":73,"text":"The","bbox":{"l":529.62323,"t":166.13140999999996,"r":545.11505,"b":175.03796,"coord_origin":"1"}},{"id":74,"text":"difficulty of the problem can be attributed to a number of","bbox":{"l":308.862,"t":178.08642999999995,"r":545.11517,"b":186.99298,"coord_origin":"1"}},{"id":75,"text":"factors. First, there is a large variety in the shapes and sizes","bbox":{"l":308.862,"t":190.04143999999997,"r":545.11511,"b":198.94799999999998,"coord_origin":"1"}},{"id":76,"text":"of tables.","bbox":{"l":308.862,"t":201.99645999999996,"r":346.97891,"b":210.90301999999997,"coord_origin":"1"}},{"id":77,"text":"Such large variety requires a flexible method.","bbox":{"l":354.86929,"t":201.99645999999996,"r":545.11511,"b":210.90301999999997,"coord_origin":"1"}},{"id":78,"text":"This is especially true for complex column- and row head-","bbox":{"l":308.862,"t":213.95245,"r":545.11505,"b":222.85901,"coord_origin":"1"}},{"id":79,"text":"ers, which can be extremely intricate and demanding.","bbox":{"l":308.862,"t":225.90747,"r":530.9184,"b":234.81403,"coord_origin":"1"}},{"id":80,"text":"A","bbox":{"l":537.92212,"t":225.90747,"r":545.11511,"b":234.81403,"coord_origin":"1"}},{"id":81,"text":"second factor of complexity is the lack of data with regard","bbox":{"l":308.862,"t":237.86248999999998,"r":545.11517,"b":246.76904000000002,"coord_origin":"1"}},{"id":82,"text":"to table-structure. Until the publication of PubTabNet [37],","bbox":{"l":308.862,"t":249.8175,"r":545.11511,"b":258.72406,"coord_origin":"1"}},{"id":83,"text":"there were no large datasets (i.e.","bbox":{"l":308.862,"t":261.77252,"r":439.8402699999999,"b":270.67908,"coord_origin":"1"}},{"id":84,"text":">","bbox":{"l":444.43999999999994,"t":261.61310000000003,"r":452.1889,"b":270.45989999999995,"coord_origin":"1"}},{"id":85,"text":"100","bbox":{"l":455.89001,"t":261.61310000000003,"r":470.83392000000003,"b":270.45989999999995,"coord_origin":"1"}},{"id":86,"text":"K tables) that pro-","bbox":{"l":470.83401,"t":261.77252,"r":545.11517,"b":270.67908,"coord_origin":"1"}},{"id":87,"text":"vided structure information. This happens primarily due to","bbox":{"l":308.862,"t":273.72748,"r":545.11511,"b":282.63406,"coord_origin":"1"}},{"id":88,"text":"the fact that tables are notoriously time-consuming to an-","bbox":{"l":308.862,"t":285.6835,"r":545.11511,"b":294.59006,"coord_origin":"1"}},{"id":89,"text":"notate by hand. However, this has definitely changed in re-","bbox":{"l":308.862,"t":297.63849,"r":545.11511,"b":306.54504,"coord_origin":"1"}},{"id":90,"text":"cent years with the deliverance of PubTabNet [37], FinTab-","bbox":{"l":308.862,"t":309.59348,"r":545.11517,"b":318.50003000000004,"coord_origin":"1"}},{"id":91,"text":"Net [36], TableBank [17] etc.","bbox":{"l":308.862,"t":321.54846,"r":425.92255,"b":330.45502,"coord_origin":"1"}}]},"text":"Identifying the structure of a table has been an outstanding problem in the document-parsing community, that motivates many organised public challenges [6, 4, 14]. The difficulty of the problem can be attributed to a number of factors. First, there is a large variety in the shapes and sizes of tables. Such large variety requires a flexible method. This is especially true for complex column- and row headers, which can be extremely intricate and demanding. A second factor of complexity is the lack of data with regard to table-structure. Until the publication of PubTabNet [37], there were no large datasets (i.e. > 100 K tables) that provided structure information. This happens primarily due to the fact that tables are notoriously time-consuming to annotate by hand. However, this has definitely changed in recent years with the deliverance of PubTabNet [37], FinTabNet [36], TableBank [17] etc."},{"label":"Text","id":13,"page_no":1,"cluster":{"id":13,"label":"Text","bbox":{"l":307.6594591140747,"t":332.2529640197754,"r":545.287671661377,"b":450.0729099999999,"coord_origin":"1"},"confidence":0.988025426864624,"cells":[{"id":92,"text":"Before the rising popularity of deep neural networks,","bbox":{"l":320.81699,"t":333.56946,"r":545.11499,"b":342.47601,"coord_origin":"1"}},{"id":93,"text":"the community relied heavily on heuristic and/or statistical","bbox":{"l":308.862,"t":345.52444,"r":545.11499,"b":354.43100000000004,"coord_origin":"1"}},{"id":94,"text":"methods to do table structure identification [3, 7, 11, 5, 13,","bbox":{"l":308.862,"t":357.47943,"r":545.11517,"b":366.38599,"coord_origin":"1"}},{"id":95,"text":"28]. Although such methods work well on constrained ta-","bbox":{"l":308.862,"t":369.43542,"r":545.11511,"b":378.34198,"coord_origin":"1"}},{"id":96,"text":"bles [12], a more data-driven approach can be applied due","bbox":{"l":308.862,"t":381.39041,"r":545.11505,"b":390.29697,"coord_origin":"1"}},{"id":97,"text":"to the advent of convolutional neural networks (CNNs) and","bbox":{"l":308.862,"t":393.3453999999999,"r":545.11505,"b":402.25195,"coord_origin":"1"}},{"id":98,"text":"the availability of large datasets. To the best-of-our knowl-","bbox":{"l":308.862,"t":405.30038,"r":545.11517,"b":414.20694,"coord_origin":"1"}},{"id":99,"text":"edge, there are currently two different types of network ar-","bbox":{"l":308.862,"t":417.25537,"r":545.11523,"b":426.16193,"coord_origin":"1"}},{"id":100,"text":"chitecture that are being pursued for state-of-the-art table-","bbox":{"l":308.862,"t":429.21136000000007,"r":545.11511,"b":438.11792,"coord_origin":"1"}},{"id":101,"text":"structure identification.","bbox":{"l":308.862,"t":441.16635,"r":401.28503,"b":450.0729099999999,"coord_origin":"1"}}]},"text":"Before the rising popularity of deep neural networks, the community relied heavily on heuristic and/or statistical methods to do table structure identification [3, 7, 11, 5, 13, 28]. Although such methods work well on constrained tables [12], a more data-driven approach can be applied due to the advent of convolutional neural networks (CNNs) and the availability of large datasets. To the best-of-our knowledge, there are currently two different types of network architecture that are being pursued for state-of-the-art tablestructure identification."},{"label":"Text","id":14,"page_no":1,"cluster":{"id":14,"label":"Text","bbox":{"l":307.7598466873169,"t":451.91541481018066,"r":545.4887969970704,"b":713.2906494140625,"coord_origin":"1"},"confidence":0.9885419607162476,"cells":[{"id":102,"text":"Image-to-Text networks","bbox":{"l":320.81699,"t":453.06778,"r":423.26236,"b":462.02417,"coord_origin":"1"}},{"id":103,"text":": In this type of network, one","bbox":{"l":423.26697,"t":453.18735,"r":545.10956,"b":462.0939,"coord_origin":"1"}},{"id":104,"text":"predicts a sequence of tokens starting from an encoded","bbox":{"l":308.86197,"t":465.14233,"r":545.11511,"b":474.04889,"coord_origin":"1"}},{"id":105,"text":"image.","bbox":{"l":308.86197,"t":477.09732,"r":335.7012,"b":486.00388,"coord_origin":"1"}},{"id":106,"text":"Such sequences of tokens can be HTML table","bbox":{"l":345.85309,"t":477.09732,"r":545.11505,"b":486.00388,"coord_origin":"1"}},{"id":107,"text":"tags [37, 17] or LaTeX symbols[10]. The choice of sym-","bbox":{"l":308.86197,"t":489.05231,"r":545.11493,"b":497.95886,"coord_origin":"1"}},{"id":108,"text":"bols is ultimately not very important, since one can be trans-","bbox":{"l":308.86197,"t":501.00729,"r":545.11499,"b":509.91385,"coord_origin":"1"}},{"id":109,"text":"formed into the other. There are however subtle variations","bbox":{"l":308.86197,"t":512.9632899999999,"r":545.11505,"b":521.8698400000001,"coord_origin":"1"}},{"id":110,"text":"in the Image-to-Text networks. The easiest network archi-","bbox":{"l":308.86197,"t":524.91827,"r":545.11505,"b":533.82483,"coord_origin":"1"}},{"id":111,"text":"tectures are “image-encoder","bbox":{"l":308.86197,"t":536.87328,"r":420.94119,"b":545.77983,"coord_origin":"1"}},{"id":112,"text":"→","bbox":{"l":423.59497,"t":536.1559599999999,"r":433.5575600000001,"b":545.56065,"coord_origin":"1"}},{"id":113,"text":"text-decoder” (IETD), sim-","bbox":{"l":436.21198,"t":536.87328,"r":545.11316,"b":545.77983,"coord_origin":"1"}},{"id":114,"text":"ilar to network architectures that try to provide captions to","bbox":{"l":308.86197,"t":548.82828,"r":545.11511,"b":557.73483,"coord_origin":"1"}},{"id":115,"text":"images [32]. In these IETD networks, one expects as output","bbox":{"l":308.86197,"t":560.78328,"r":545.11493,"b":569.68983,"coord_origin":"1"}},{"id":116,"text":"the LaTeX/HTML string of the entire table, i.e. the sym-","bbox":{"l":308.86197,"t":572.73828,"r":545.11499,"b":581.6448399999999,"coord_origin":"1"}},{"id":117,"text":"bols necessary for creating the table with the content of the","bbox":{"l":308.86197,"t":584.69427,"r":545.11505,"b":593.60083,"coord_origin":"1"}},{"id":118,"text":"table. Another approach is the “image-encoder","bbox":{"l":308.86197,"t":596.6492800000001,"r":497.07541,"b":605.55583,"coord_origin":"1"}},{"id":119,"text":"→","bbox":{"l":499.80496,"t":595.93196,"r":509.76755,"b":605.33665,"coord_origin":"1"}},{"id":120,"text":"dual de-","bbox":{"l":512.50098,"t":596.6492800000001,"r":545.10852,"b":605.55583,"coord_origin":"1"}},{"id":121,"text":"coder” (IEDD) networks. In these type of networks, one has","bbox":{"l":308.86197,"t":608.60428,"r":545.11511,"b":617.5108299999999,"coord_origin":"1"}},{"id":122,"text":"two consecutive decoders with different purposes. The first","bbox":{"l":308.86197,"t":620.55928,"r":545.11505,"b":629.46584,"coord_origin":"1"}},{"id":123,"text":"decoder is the","bbox":{"l":308.86197,"t":632.51428,"r":364.78201,"b":641.42084,"coord_origin":"1"}},{"id":124,"text":"tag-decoder","bbox":{"l":367.57397,"t":632.60394,"r":415.61362,"b":641.1917,"coord_origin":"1"}},{"id":125,"text":", i.e. it only produces the HTM-","bbox":{"l":415.61298,"t":632.51428,"r":545.11688,"b":641.42084,"coord_origin":"1"}},{"id":126,"text":"L/LaTeX tags which construct an empty table. The second","bbox":{"l":308.86197,"t":644.46928,"r":545.11511,"b":653.37584,"coord_origin":"1"}},{"id":127,"text":"content-decoder","bbox":{"l":308.86197,"t":656.51494,"r":373.59894,"b":665.1027,"coord_origin":"1"}},{"id":128,"text":"uses the encoding of the image in combi-","bbox":{"l":376.90698,"t":656.4252799999999,"r":545.11548,"b":665.33184,"coord_origin":"1"}},{"id":129,"text":"nation with the output encoding of each cell-tag (from the","bbox":{"l":308.862,"t":668.38028,"r":545.11517,"b":677.28684,"coord_origin":"1"}},{"id":130,"text":"tag-decoder","bbox":{"l":308.862,"t":680.42494,"r":356.90164,"b":689.0127,"coord_origin":"1"}},{"id":131,"text":") to generate the textual content of each table","bbox":{"l":357.13101,"t":680.33528,"r":545.1153,"b":689.24184,"coord_origin":"1"}},{"id":132,"text":"cell. The network architecture of IEDD is certainly more","bbox":{"l":308.862,"t":692.290283,"r":545.11511,"b":701.196846,"coord_origin":"1"}},{"id":133,"text":"elaborate, but it has the advantage that one can pre-train the","bbox":{"l":308.862,"t":704.245285,"r":545.11517,"b":713.151848,"coord_origin":"1"}}]},"text":"Image-to-Text networks : In this type of network, one predicts a sequence of tokens starting from an encoded image. Such sequences of tokens can be HTML table tags [37, 17] or LaTeX symbols[10]. The choice of symbols is ultimately not very important, since one can be transformed into the other. There are however subtle variations in the Image-to-Text networks. The easiest network architectures are “image-encoder → text-decoder” (IETD), similar to network architectures that try to provide captions to images [32]. In these IETD networks, one expects as output the LaTeX/HTML string of the entire table, i.e. the symbols necessary for creating the table with the content of the table. Another approach is the “image-encoder → dual decoder” (IEDD) networks. In these type of networks, one has two consecutive decoders with different purposes. The first decoder is the tag-decoder , i.e. it only produces the HTML/LaTeX tags which construct an empty table. The second content-decoder uses the encoding of the image in combination with the output encoding of each cell-tag (from the tag-decoder ) to generate the textual content of each table cell. The network architecture of IEDD is certainly more elaborate, but it has the advantage that one can pre-train the"},{"label":"Page-footer","id":15,"page_no":1,"cluster":{"id":15,"label":"Page-footer","bbox":{"l":294.621068572998,"t":733.353044128418,"r":300.12242431640624,"b":743.039845,"coord_origin":"1"},"confidence":0.8898882269859314,"cells":[{"id":134,"text":"2","bbox":{"l":295.121,"t":734.133282,"r":300.10229,"b":743.039845,"coord_origin":"1"}}]},"text":"2"}],"body":[{"label":"Text","id":0,"page_no":1,"cluster":{"id":0,"label":"Text","bbox":{"l":49.3856703042984,"t":74.23337373733523,"r":286.36505,"b":96.06994999999995,"coord_origin":"1"},"confidence":0.9677466154098511,"cells":[{"id":0,"text":"considered as a solved problem, given enough ground-truth","bbox":{"l":50.112,"t":75.20836999999995,"r":286.36505,"b":84.11492999999996,"coord_origin":"1"}},{"id":1,"text":"data to train on.","bbox":{"l":50.112,"t":87.16339000000005,"r":112.64721999999999,"b":96.06994999999995,"coord_origin":"1"}}]},"text":"considered as a solved problem, given enough ground-truth data to train on."},{"label":"Text","id":1,"page_no":1,"cluster":{"id":1,"label":"Text","bbox":{"l":49.32149448394775,"t":98.4006237030029,"r":286.36514,"b":228.18181228637695,"coord_origin":"1"},"confidence":0.9865381717681885,"cells":[{"id":2,"text":"The second problem is called table-structure decompo-","bbox":{"l":62.067001,"t":99.57141000000001,"r":286.36496,"b":108.47797000000003,"coord_origin":"1"}},{"id":3,"text":"sition.","bbox":{"l":50.112,"t":111.52643,"r":74.749512,"b":120.43297999999993,"coord_origin":"1"}},{"id":4,"text":"The latter is a long standing problem in the com-","bbox":{"l":81.334793,"t":111.52643,"r":286.36514,"b":120.43297999999993,"coord_origin":"1"}},{"id":5,"text":"munity of document understanding [6, 4, 14]. Contrary to","bbox":{"l":50.112,"t":123.48145,"r":286.36511,"b":132.38800000000003,"coord_origin":"1"}},{"id":6,"text":"the table-location problem, there are no commonly used ap-","bbox":{"l":50.112,"t":135.43646,"r":286.36511,"b":144.34302000000002,"coord_origin":"1"}},{"id":7,"text":"proaches that can easily be re-purposed to solve this prob-","bbox":{"l":50.112,"t":147.39246000000003,"r":286.36505,"b":156.29900999999995,"coord_origin":"1"}},{"id":8,"text":"lem. Lately, a set of new model-architectures has been pro-","bbox":{"l":50.112,"t":159.34747000000004,"r":286.36511,"b":168.25402999999994,"coord_origin":"1"}},{"id":9,"text":"posed by the community to address table-structure decom-","bbox":{"l":50.112,"t":171.30249000000003,"r":286.36508,"b":180.20905000000005,"coord_origin":"1"}},{"id":10,"text":"position [37, 36, 18, 20]. All these models have some weak-","bbox":{"l":50.112,"t":183.25751000000002,"r":286.36511,"b":192.16405999999995,"coord_origin":"1"}},{"id":11,"text":"nesses (see Sec. 2). The common denominator here is the","bbox":{"l":50.112,"t":195.21252000000004,"r":286.36508,"b":204.11908000000005,"coord_origin":"1"}},{"id":12,"text":"reliance on textual features and/or the inability to provide","bbox":{"l":50.112,"t":207.16754000000003,"r":286.36514,"b":216.07410000000004,"coord_origin":"1"}},{"id":13,"text":"the bounding box of each table-cell in the original image.","bbox":{"l":50.112,"t":219.12354000000005,"r":278.66397,"b":228.03008999999997,"coord_origin":"1"}}]},"text":"The second problem is called table-structure decomposition. The latter is a long standing problem in the community of document understanding [6, 4, 14]. Contrary to the table-location problem, there are no commonly used approaches that can easily be re-purposed to solve this problem. Lately, a set of new model-architectures has been proposed by the community to address table-structure decomposition [37, 36, 18, 20]. All these models have some weaknesses (see Sec. 2). The common denominator here is the reliance on textual features and/or the inability to provide the bounding box of each table-cell in the original image."},{"label":"Text","id":2,"page_no":1,"cluster":{"id":2,"label":"Text","bbox":{"l":49.250409722328186,"t":230.44071807861326,"r":286.436358833313,"b":371.94507,"coord_origin":"1"},"confidence":0.9869692921638489,"cells":[{"id":14,"text":"In this paper, we want to address these weaknesses and","bbox":{"l":62.067001,"t":231.53156,"r":286.36493,"b":240.43811000000005,"coord_origin":"1"}},{"id":15,"text":"present a robust table-structure decomposition algorithm.","bbox":{"l":50.112,"t":243.48657000000003,"r":286.36511,"b":252.39313000000004,"coord_origin":"1"}},{"id":16,"text":"The design criteria for our model are the following. First,","bbox":{"l":50.112,"t":255.44159000000002,"r":286.36511,"b":264.34813999999994,"coord_origin":"1"}},{"id":17,"text":"we want our algorithm to be language agnostic. In this way,","bbox":{"l":50.112,"t":267.39661,"r":286.36502,"b":276.30316000000005,"coord_origin":"1"}},{"id":18,"text":"we can obtain the structure of any table, irregardless of the","bbox":{"l":50.112,"t":279.35155999999995,"r":286.36508,"b":288.25815,"coord_origin":"1"}},{"id":19,"text":"language.","bbox":{"l":50.112,"t":291.30759,"r":88.567635,"b":300.21414,"coord_origin":"1"}},{"id":20,"text":"Second, we want our algorithm to leverage as","bbox":{"l":95.501602,"t":291.30759,"r":286.36505,"b":300.21414,"coord_origin":"1"}},{"id":21,"text":"much data as possible from the original PDF document. For","bbox":{"l":50.112,"t":303.26257,"r":286.36508,"b":312.16913,"coord_origin":"1"}},{"id":22,"text":"programmatic PDF documents, the text-cells can often be","bbox":{"l":50.112,"t":315.21756,"r":286.36511,"b":324.12411,"coord_origin":"1"}},{"id":23,"text":"extracted much faster and with higher accuracy compared","bbox":{"l":50.112,"t":327.17255,"r":286.36505,"b":336.0791,"coord_origin":"1"}},{"id":24,"text":"to OCR methods. Last but not least, we want to have a di-","bbox":{"l":50.112,"t":339.12753,"r":286.36511,"b":348.03409,"coord_origin":"1"}},{"id":25,"text":"rect link between the table-cell and its bounding box in the","bbox":{"l":50.112,"t":351.08353,"r":286.36508,"b":359.99008,"coord_origin":"1"}},{"id":26,"text":"image.","bbox":{"l":50.112,"t":363.03851,"r":76.951241,"b":371.94507,"coord_origin":"1"}}]},"text":"In this paper, we want to address these weaknesses and present a robust table-structure decomposition algorithm. The design criteria for our model are the following. First, we want our algorithm to be language agnostic. In this way, we can obtain the structure of any table, irregardless of the language. Second, we want our algorithm to leverage as much data as possible from the original PDF document. For programmatic PDF documents, the text-cells can often be extracted much faster and with higher accuracy compared to OCR methods. Last but not least, we want to have a direct link between the table-cell and its bounding box in the image."},{"label":"Text","id":3,"page_no":1,"cluster":{"id":3,"label":"Text","bbox":{"l":49.56145799160004,"t":374.04506263732907,"r":286.36658,"b":432.173,"coord_origin":"1"},"confidence":0.9837217330932617,"cells":[{"id":27,"text":"To meet the design criteria listed above, we developed a","bbox":{"l":62.067001,"t":375.4465,"r":286.36499,"b":384.35306,"coord_origin":"1"}},{"id":28,"text":"new model called","bbox":{"l":50.112,"t":387.40149,"r":120.98594,"b":396.30804,"coord_origin":"1"}},{"id":29,"text":"TableFormer","bbox":{"l":123.901,"t":387.28192,"r":179.7314,"b":396.23830999999996,"coord_origin":"1"}},{"id":30,"text":"and a synthetically gener-","bbox":{"l":182.646,"t":387.40149,"r":286.36658,"b":396.30804,"coord_origin":"1"}},{"id":31,"text":"ated table structure dataset called","bbox":{"l":50.112,"t":399.35648,"r":181.75778,"b":408.26302999999996,"coord_origin":"1"}},{"id":32,"text":"SynthTabNet","bbox":{"l":184.104,"t":399.23690999999997,"r":240.2034,"b":408.1933,"coord_origin":"1"}},{"id":33,"text":"$^{1}$. In partic-","bbox":{"l":240.20401,"t":399.35648,"r":286.36069,"b":408.26302999999996,"coord_origin":"1"}},{"id":34,"text":"ular, our contributions in this work can be summarised as","bbox":{"l":50.112015,"t":411.31146,"r":286.36511,"b":420.21802,"coord_origin":"1"}},{"id":35,"text":"follows:","bbox":{"l":50.112015,"t":423.26645,"r":82.520355,"b":432.173,"coord_origin":"1"}}]},"text":"To meet the design criteria listed above, we developed a new model called TableFormer and a synthetically generated table structure dataset called SynthTabNet $^{1}$. In particular, our contributions in this work can be summarised as follows:"},{"label":"List-item","id":4,"page_no":1,"cluster":{"id":4,"label":"List-item","bbox":{"l":61.25934247970581,"t":443.2520462036133,"r":286.6215797424316,"b":489.61486587524416,"coord_origin":"1"},"confidence":0.9750838279724121,"cells":[{"id":36,"text":"•","bbox":{"l":61.569016,"t":444.55145,"r":70.741714,"b":453.45801,"coord_origin":"1"}},{"id":37,"text":"We propose","bbox":{"l":73.034889,"t":444.55145,"r":117.10054,"b":453.45801,"coord_origin":"1"}},{"id":38,"text":"TableFormer","bbox":{"l":119.59001,"t":444.43188,"r":175.42041,"b":453.38828,"coord_origin":"1"}},{"id":39,"text":", a transformer based model","bbox":{"l":175.42102,"t":444.55145,"r":286.36453,"b":453.45801,"coord_origin":"1"}},{"id":40,"text":"that predicts tables structure and bounding boxes for","bbox":{"l":70.037018,"t":456.50644000000005,"r":286.3649,"b":465.41299,"coord_origin":"1"}},{"id":41,"text":"the table content simultaneously in an end-to-end ap-","bbox":{"l":70.037018,"t":468.46143,"r":286.3649,"b":477.36798,"coord_origin":"1"}},{"id":42,"text":"proach.","bbox":{"l":70.037018,"t":480.41641,"r":99.635902,"b":489.32297,"coord_origin":"1"}}]},"text":"• We propose TableFormer , a transformer based model that predicts tables structure and bounding boxes for the table content simultaneously in an end-to-end approach."},{"label":"List-item","id":5,"page_no":1,"cluster":{"id":5,"label":"List-item","bbox":{"l":61.222047328948975,"t":500.8945152282715,"r":286.3649,"b":547.1242492675782,"coord_origin":"1"},"confidence":0.9774568676948547,"cells":[{"id":43,"text":"•","bbox":{"l":61.569016,"t":502.15341,"r":71.619438,"b":511.05997,"coord_origin":"1"}},{"id":44,"text":"Across all benchmark datasets","bbox":{"l":74.132042,"t":502.15341,"r":196.10396,"b":511.05997,"coord_origin":"1"}},{"id":45,"text":"TableFormer","bbox":{"l":200.31001,"t":502.03384,"r":256.14041,"b":510.99023,"coord_origin":"1"}},{"id":46,"text":"signif-","bbox":{"l":260.35001,"t":502.15341,"r":286.36237,"b":511.05997,"coord_origin":"1"}},{"id":47,"text":"icantly outperforms existing state-of-the-art metrics,","bbox":{"l":70.037003,"t":514.1084000000001,"r":286.3649,"b":523.01495,"coord_origin":"1"}},{"id":48,"text":"while being much more efficient in training and infer-","bbox":{"l":70.037003,"t":526.06439,"r":286.36487,"b":534.97095,"coord_origin":"1"}},{"id":49,"text":"ence to existing works.","bbox":{"l":70.037003,"t":538.0193899999999,"r":161.65305,"b":546.9259500000001,"coord_origin":"1"}}]},"text":"• Across all benchmark datasets TableFormer significantly outperforms existing state-of-the-art metrics, while being much more efficient in training and inference to existing works."},{"label":"List-item","id":6,"page_no":1,"cluster":{"id":6,"label":"List-item","bbox":{"l":61.282082891464235,"t":558.6514892578125,"r":286.36493,"b":592.8100090026855,"coord_origin":"1"},"confidence":0.9757609367370605,"cells":[{"id":50,"text":"•","bbox":{"l":61.569,"t":559.75639,"r":71.115913,"b":568.66295,"coord_origin":"1"}},{"id":51,"text":"We present","bbox":{"l":73.502647,"t":559.75639,"r":116.71199,"b":568.66295,"coord_origin":"1"}},{"id":52,"text":"SynthTabNet","bbox":{"l":121.583,"t":559.63684,"r":177.68239,"b":568.59322,"coord_origin":"1"}},{"id":53,"text":"a synthetically generated","bbox":{"l":182.55301,"t":559.75639,"r":286.36328,"b":568.66295,"coord_origin":"1"}},{"id":54,"text":"dataset, with various appearance styles and complex-","bbox":{"l":70.03701,"t":571.7114,"r":286.36493,"b":580.6179500000001,"coord_origin":"1"}},{"id":55,"text":"ity.","bbox":{"l":70.03701,"t":583.6664000000001,"r":82.400597,"b":592.57295,"coord_origin":"1"}}]},"text":"• We present SynthTabNet a synthetically generated dataset, with various appearance styles and complexity."},{"label":"List-item","id":7,"page_no":1,"cluster":{"id":7,"label":"List-item","bbox":{"l":61.13663399219513,"t":604.1612617492675,"r":286.36508,"b":638.7555198669434,"coord_origin":"1"},"confidence":0.975217878818512,"cells":[{"id":56,"text":"•","bbox":{"l":61.569008000000004,"t":605.4034,"r":72.332527,"b":614.30995,"coord_origin":"1"}},{"id":57,"text":"An augmented dataset based on PubTabNet [37],","bbox":{"l":75.023399,"t":605.4034,"r":286.36508,"b":614.30995,"coord_origin":"1"}},{"id":58,"text":"FinTabNet [36], and TableBank [17] with generated","bbox":{"l":70.03701,"t":617.3584,"r":286.36487,"b":626.26495,"coord_origin":"1"}},{"id":59,"text":"ground-truth for reproducibility.","bbox":{"l":70.03701,"t":629.31439,"r":198.05641,"b":638.22095,"coord_origin":"1"}}]},"text":"• An augmented dataset based on PubTabNet [37], FinTabNet [36], and TableBank [17] with generated ground-truth for reproducibility."},{"label":"Text","id":8,"page_no":1,"cluster":{"id":8,"label":"Text","bbox":{"l":49.23523485660553,"t":649.6770011901855,"r":286.6603340148926,"b":695.579761505127,"coord_origin":"1"},"confidence":0.963869571685791,"cells":[{"id":60,"text":"The paper is structured as follows. In Sec. 2, we give","bbox":{"l":62.067009000000006,"t":650.59839,"r":286.36496,"b":659.50494,"coord_origin":"1"}},{"id":61,"text":"a brief overview of the current state-of-the-art. In Sec. 3,","bbox":{"l":50.112007,"t":662.55339,"r":286.36511,"b":671.45995,"coord_origin":"1"}},{"id":62,"text":"we describe the datasets on which we train. In Sec. 4, we","bbox":{"l":50.112007,"t":674.50839,"r":286.36511,"b":683.41496,"coord_origin":"1"}},{"id":63,"text":"introduce the TableFormer model-architecture and describe","bbox":{"l":50.112007,"t":686.46339,"r":286.36511,"b":695.369957,"coord_origin":"1"}}]},"text":"The paper is structured as follows. In Sec. 2, we give a brief overview of the current state-of-the-art. In Sec. 3, we describe the datasets on which we train. In Sec. 4, we introduce the TableFormer model-architecture and describe"},{"label":"Footnote","id":9,"page_no":1,"cluster":{"id":9,"label":"Footnote","bbox":{"l":60.97100100000001,"t":704.329801940918,"r":183.73055,"b":713.4629356384277,"coord_origin":"1"},"confidence":0.8912795782089233,"cells":[{"id":64,"text":"$^{1}$https://github.com/IBM/SynthTabNet","bbox":{"l":60.97100100000001,"t":705.596275,"r":183.73055,"b":712.721542,"coord_origin":"1"}}]},"text":"$^{1}$https://github.com/IBM/SynthTabNet"},{"label":"Text","id":10,"page_no":1,"cluster":{"id":10,"label":"Text","bbox":{"l":308.2300924301147,"t":74.18210706710818,"r":545.4613071441651,"b":108.43959131240842,"coord_origin":"1"},"confidence":0.9728982448577881,"cells":[{"id":65,"text":"its results & performance in Sec. 5. As a conclusion, we de-","bbox":{"l":308.862,"t":75.20836999999995,"r":545.11511,"b":84.11492999999996,"coord_origin":"1"}},{"id":66,"text":"scribe how this new model-architecture can be re-purposed","bbox":{"l":308.862,"t":87.16339000000005,"r":545.11505,"b":96.06994999999995,"coord_origin":"1"}},{"id":67,"text":"for other tasks in the computer-vision community.","bbox":{"l":308.862,"t":99.11841000000004,"r":508.08417000000003,"b":108.02495999999985,"coord_origin":"1"}}]},"text":"its results & performance in Sec. 5. As a conclusion, we describe how this new model-architecture can be re-purposed for other tasks in the computer-vision community."},{"label":"Section-header","id":11,"page_no":1,"cluster":{"id":11,"label":"Section-header","bbox":{"l":307.962664604187,"t":120.99532756805422,"r":498.4370830535889,"b":132.47968000000003,"coord_origin":"1"},"confidence":0.9394794702529907,"cells":[{"id":68,"text":"2.","bbox":{"l":308.862,"t":121.73193000000003,"r":315.5831,"b":132.47968000000003,"coord_origin":"1"}},{"id":69,"text":"Previous work and State of the Art","bbox":{"l":324.54456,"t":121.73193000000003,"r":498.28021,"b":132.47968000000003,"coord_origin":"1"}}]},"text":"2. Previous work and State of the Art"},{"label":"Text","id":12,"page_no":1,"cluster":{"id":12,"label":"Text","bbox":{"l":307.7052463531494,"t":140.9155918121338,"r":545.4184089660645,"b":330.45502,"coord_origin":"1"},"confidence":0.9880366921424866,"cells":[{"id":70,"text":"Identifying the structure of a table has been an outstand-","bbox":{"l":320.81699,"t":142.22136999999998,"r":545.11493,"b":151.12793,"coord_origin":"1"}},{"id":71,"text":"ing problem in the document-parsing community, that mo-","bbox":{"l":308.862,"t":154.17638999999997,"r":545.11505,"b":163.08294999999998,"coord_origin":"1"}},{"id":72,"text":"tivates many organised public challenges [6, 4, 14].","bbox":{"l":308.862,"t":166.13140999999996,"r":522.55975,"b":175.03796,"coord_origin":"1"}},{"id":73,"text":"The","bbox":{"l":529.62323,"t":166.13140999999996,"r":545.11505,"b":175.03796,"coord_origin":"1"}},{"id":74,"text":"difficulty of the problem can be attributed to a number of","bbox":{"l":308.862,"t":178.08642999999995,"r":545.11517,"b":186.99298,"coord_origin":"1"}},{"id":75,"text":"factors. First, there is a large variety in the shapes and sizes","bbox":{"l":308.862,"t":190.04143999999997,"r":545.11511,"b":198.94799999999998,"coord_origin":"1"}},{"id":76,"text":"of tables.","bbox":{"l":308.862,"t":201.99645999999996,"r":346.97891,"b":210.90301999999997,"coord_origin":"1"}},{"id":77,"text":"Such large variety requires a flexible method.","bbox":{"l":354.86929,"t":201.99645999999996,"r":545.11511,"b":210.90301999999997,"coord_origin":"1"}},{"id":78,"text":"This is especially true for complex column- and row head-","bbox":{"l":308.862,"t":213.95245,"r":545.11505,"b":222.85901,"coord_origin":"1"}},{"id":79,"text":"ers, which can be extremely intricate and demanding.","bbox":{"l":308.862,"t":225.90747,"r":530.9184,"b":234.81403,"coord_origin":"1"}},{"id":80,"text":"A","bbox":{"l":537.92212,"t":225.90747,"r":545.11511,"b":234.81403,"coord_origin":"1"}},{"id":81,"text":"second factor of complexity is the lack of data with regard","bbox":{"l":308.862,"t":237.86248999999998,"r":545.11517,"b":246.76904000000002,"coord_origin":"1"}},{"id":82,"text":"to table-structure. Until the publication of PubTabNet [37],","bbox":{"l":308.862,"t":249.8175,"r":545.11511,"b":258.72406,"coord_origin":"1"}},{"id":83,"text":"there were no large datasets (i.e.","bbox":{"l":308.862,"t":261.77252,"r":439.8402699999999,"b":270.67908,"coord_origin":"1"}},{"id":84,"text":">","bbox":{"l":444.43999999999994,"t":261.61310000000003,"r":452.1889,"b":270.45989999999995,"coord_origin":"1"}},{"id":85,"text":"100","bbox":{"l":455.89001,"t":261.61310000000003,"r":470.83392000000003,"b":270.45989999999995,"coord_origin":"1"}},{"id":86,"text":"K tables) that pro-","bbox":{"l":470.83401,"t":261.77252,"r":545.11517,"b":270.67908,"coord_origin":"1"}},{"id":87,"text":"vided structure information. This happens primarily due to","bbox":{"l":308.862,"t":273.72748,"r":545.11511,"b":282.63406,"coord_origin":"1"}},{"id":88,"text":"the fact that tables are notoriously time-consuming to an-","bbox":{"l":308.862,"t":285.6835,"r":545.11511,"b":294.59006,"coord_origin":"1"}},{"id":89,"text":"notate by hand. However, this has definitely changed in re-","bbox":{"l":308.862,"t":297.63849,"r":545.11511,"b":306.54504,"coord_origin":"1"}},{"id":90,"text":"cent years with the deliverance of PubTabNet [37], FinTab-","bbox":{"l":308.862,"t":309.59348,"r":545.11517,"b":318.50003000000004,"coord_origin":"1"}},{"id":91,"text":"Net [36], TableBank [17] etc.","bbox":{"l":308.862,"t":321.54846,"r":425.92255,"b":330.45502,"coord_origin":"1"}}]},"text":"Identifying the structure of a table has been an outstanding problem in the document-parsing community, that motivates many organised public challenges [6, 4, 14]. The difficulty of the problem can be attributed to a number of factors. First, there is a large variety in the shapes and sizes of tables. Such large variety requires a flexible method. This is especially true for complex column- and row headers, which can be extremely intricate and demanding. A second factor of complexity is the lack of data with regard to table-structure. Until the publication of PubTabNet [37], there were no large datasets (i.e. > 100 K tables) that provided structure information. This happens primarily due to the fact that tables are notoriously time-consuming to annotate by hand. However, this has definitely changed in recent years with the deliverance of PubTabNet [37], FinTabNet [36], TableBank [17] etc."},{"label":"Text","id":13,"page_no":1,"cluster":{"id":13,"label":"Text","bbox":{"l":307.6594591140747,"t":332.2529640197754,"r":545.287671661377,"b":450.0729099999999,"coord_origin":"1"},"confidence":0.988025426864624,"cells":[{"id":92,"text":"Before the rising popularity of deep neural networks,","bbox":{"l":320.81699,"t":333.56946,"r":545.11499,"b":342.47601,"coord_origin":"1"}},{"id":93,"text":"the community relied heavily on heuristic and/or statistical","bbox":{"l":308.862,"t":345.52444,"r":545.11499,"b":354.43100000000004,"coord_origin":"1"}},{"id":94,"text":"methods to do table structure identification [3, 7, 11, 5, 13,","bbox":{"l":308.862,"t":357.47943,"r":545.11517,"b":366.38599,"coord_origin":"1"}},{"id":95,"text":"28]. Although such methods work well on constrained ta-","bbox":{"l":308.862,"t":369.43542,"r":545.11511,"b":378.34198,"coord_origin":"1"}},{"id":96,"text":"bles [12], a more data-driven approach can be applied due","bbox":{"l":308.862,"t":381.39041,"r":545.11505,"b":390.29697,"coord_origin":"1"}},{"id":97,"text":"to the advent of convolutional neural networks (CNNs) and","bbox":{"l":308.862,"t":393.3453999999999,"r":545.11505,"b":402.25195,"coord_origin":"1"}},{"id":98,"text":"the availability of large datasets. To the best-of-our knowl-","bbox":{"l":308.862,"t":405.30038,"r":545.11517,"b":414.20694,"coord_origin":"1"}},{"id":99,"text":"edge, there are currently two different types of network ar-","bbox":{"l":308.862,"t":417.25537,"r":545.11523,"b":426.16193,"coord_origin":"1"}},{"id":100,"text":"chitecture that are being pursued for state-of-the-art table-","bbox":{"l":308.862,"t":429.21136000000007,"r":545.11511,"b":438.11792,"coord_origin":"1"}},{"id":101,"text":"structure identification.","bbox":{"l":308.862,"t":441.16635,"r":401.28503,"b":450.0729099999999,"coord_origin":"1"}}]},"text":"Before the rising popularity of deep neural networks, the community relied heavily on heuristic and/or statistical methods to do table structure identification [3, 7, 11, 5, 13, 28]. Although such methods work well on constrained tables [12], a more data-driven approach can be applied due to the advent of convolutional neural networks (CNNs) and the availability of large datasets. To the best-of-our knowledge, there are currently two different types of network architecture that are being pursued for state-of-the-art tablestructure identification."},{"label":"Text","id":14,"page_no":1,"cluster":{"id":14,"label":"Text","bbox":{"l":307.7598466873169,"t":451.91541481018066,"r":545.4887969970704,"b":713.2906494140625,"coord_origin":"1"},"confidence":0.9885419607162476,"cells":[{"id":102,"text":"Image-to-Text networks","bbox":{"l":320.81699,"t":453.06778,"r":423.26236,"b":462.02417,"coord_origin":"1"}},{"id":103,"text":": In this type of network, one","bbox":{"l":423.26697,"t":453.18735,"r":545.10956,"b":462.0939,"coord_origin":"1"}},{"id":104,"text":"predicts a sequence of tokens starting from an encoded","bbox":{"l":308.86197,"t":465.14233,"r":545.11511,"b":474.04889,"coord_origin":"1"}},{"id":105,"text":"image.","bbox":{"l":308.86197,"t":477.09732,"r":335.7012,"b":486.00388,"coord_origin":"1"}},{"id":106,"text":"Such sequences of tokens can be HTML table","bbox":{"l":345.85309,"t":477.09732,"r":545.11505,"b":486.00388,"coord_origin":"1"}},{"id":107,"text":"tags [37, 17] or LaTeX symbols[10]. The choice of sym-","bbox":{"l":308.86197,"t":489.05231,"r":545.11493,"b":497.95886,"coord_origin":"1"}},{"id":108,"text":"bols is ultimately not very important, since one can be trans-","bbox":{"l":308.86197,"t":501.00729,"r":545.11499,"b":509.91385,"coord_origin":"1"}},{"id":109,"text":"formed into the other. There are however subtle variations","bbox":{"l":308.86197,"t":512.9632899999999,"r":545.11505,"b":521.8698400000001,"coord_origin":"1"}},{"id":110,"text":"in the Image-to-Text networks. The easiest network archi-","bbox":{"l":308.86197,"t":524.91827,"r":545.11505,"b":533.82483,"coord_origin":"1"}},{"id":111,"text":"tectures are “image-encoder","bbox":{"l":308.86197,"t":536.87328,"r":420.94119,"b":545.77983,"coord_origin":"1"}},{"id":112,"text":"→","bbox":{"l":423.59497,"t":536.1559599999999,"r":433.5575600000001,"b":545.56065,"coord_origin":"1"}},{"id":113,"text":"text-decoder” (IETD), sim-","bbox":{"l":436.21198,"t":536.87328,"r":545.11316,"b":545.77983,"coord_origin":"1"}},{"id":114,"text":"ilar to network architectures that try to provide captions to","bbox":{"l":308.86197,"t":548.82828,"r":545.11511,"b":557.73483,"coord_origin":"1"}},{"id":115,"text":"images [32]. In these IETD networks, one expects as output","bbox":{"l":308.86197,"t":560.78328,"r":545.11493,"b":569.68983,"coord_origin":"1"}},{"id":116,"text":"the LaTeX/HTML string of the entire table, i.e. the sym-","bbox":{"l":308.86197,"t":572.73828,"r":545.11499,"b":581.6448399999999,"coord_origin":"1"}},{"id":117,"text":"bols necessary for creating the table with the content of the","bbox":{"l":308.86197,"t":584.69427,"r":545.11505,"b":593.60083,"coord_origin":"1"}},{"id":118,"text":"table. Another approach is the “image-encoder","bbox":{"l":308.86197,"t":596.6492800000001,"r":497.07541,"b":605.55583,"coord_origin":"1"}},{"id":119,"text":"→","bbox":{"l":499.80496,"t":595.93196,"r":509.76755,"b":605.33665,"coord_origin":"1"}},{"id":120,"text":"dual de-","bbox":{"l":512.50098,"t":596.6492800000001,"r":545.10852,"b":605.55583,"coord_origin":"1"}},{"id":121,"text":"coder” (IEDD) networks. In these type of networks, one has","bbox":{"l":308.86197,"t":608.60428,"r":545.11511,"b":617.5108299999999,"coord_origin":"1"}},{"id":122,"text":"two consecutive decoders with different purposes. The first","bbox":{"l":308.86197,"t":620.55928,"r":545.11505,"b":629.46584,"coord_origin":"1"}},{"id":123,"text":"decoder is the","bbox":{"l":308.86197,"t":632.51428,"r":364.78201,"b":641.42084,"coord_origin":"1"}},{"id":124,"text":"tag-decoder","bbox":{"l":367.57397,"t":632.60394,"r":415.61362,"b":641.1917,"coord_origin":"1"}},{"id":125,"text":", i.e. it only produces the HTM-","bbox":{"l":415.61298,"t":632.51428,"r":545.11688,"b":641.42084,"coord_origin":"1"}},{"id":126,"text":"L/LaTeX tags which construct an empty table. The second","bbox":{"l":308.86197,"t":644.46928,"r":545.11511,"b":653.37584,"coord_origin":"1"}},{"id":127,"text":"content-decoder","bbox":{"l":308.86197,"t":656.51494,"r":373.59894,"b":665.1027,"coord_origin":"1"}},{"id":128,"text":"uses the encoding of the image in combi-","bbox":{"l":376.90698,"t":656.4252799999999,"r":545.11548,"b":665.33184,"coord_origin":"1"}},{"id":129,"text":"nation with the output encoding of each cell-tag (from the","bbox":{"l":308.862,"t":668.38028,"r":545.11517,"b":677.28684,"coord_origin":"1"}},{"id":130,"text":"tag-decoder","bbox":{"l":308.862,"t":680.42494,"r":356.90164,"b":689.0127,"coord_origin":"1"}},{"id":131,"text":") to generate the textual content of each table","bbox":{"l":357.13101,"t":680.33528,"r":545.1153,"b":689.24184,"coord_origin":"1"}},{"id":132,"text":"cell. The network architecture of IEDD is certainly more","bbox":{"l":308.862,"t":692.290283,"r":545.11511,"b":701.196846,"coord_origin":"1"}},{"id":133,"text":"elaborate, but it has the advantage that one can pre-train the","bbox":{"l":308.862,"t":704.245285,"r":545.11517,"b":713.151848,"coord_origin":"1"}}]},"text":"Image-to-Text networks : In this type of network, one predicts a sequence of tokens starting from an encoded image. Such sequences of tokens can be HTML table tags [37, 17] or LaTeX symbols[10]. The choice of symbols is ultimately not very important, since one can be transformed into the other. There are however subtle variations in the Image-to-Text networks. The easiest network architectures are “image-encoder → text-decoder” (IETD), similar to network architectures that try to provide captions to images [32]. In these IETD networks, one expects as output the LaTeX/HTML string of the entire table, i.e. the symbols necessary for creating the table with the content of the table. Another approach is the “image-encoder → dual decoder” (IEDD) networks. In these type of networks, one has two consecutive decoders with different purposes. The first decoder is the tag-decoder , i.e. it only produces the HTML/LaTeX tags which construct an empty table. The second content-decoder uses the encoding of the image in combination with the output encoding of each cell-tag (from the tag-decoder ) to generate the textual content of each table cell. The network architecture of IEDD is certainly more elaborate, but it has the advantage that one can pre-train the"}],"headers":[{"label":"Page-footer","id":15,"page_no":1,"cluster":{"id":15,"label":"Page-footer","bbox":{"l":294.621068572998,"t":733.353044128418,"r":300.12242431640624,"b":743.039845,"coord_origin":"1"},"confidence":0.8898882269859314,"cells":[{"id":134,"text":"2","bbox":{"l":295.121,"t":734.133282,"r":300.10229,"b":743.039845,"coord_origin":"1"}}]},"text":"2"}]}},{"page_no":2,"page_hash":"95b5360d00f9fbcd6d5c5caa4529279e7f31219fd75e4495a349a1897700a2fe","size":{"width":612.0,"height":792.0},"cells":[{"id":0,"text":"tag-decoder which is constrained to the table-tags.","bbox":{"l":50.112,"t":75.20836999999995,"r":250.15102,"b":84.11492999999996,"coord_origin":"1"}},{"id":1,"text":"In","bbox":{"l":62.067001,"t":87.21935999999994,"r":70.365845,"b":96.12591999999995,"coord_origin":"1"}},{"id":2,"text":"practice,","bbox":{"l":76.931198,"t":87.21935999999994,"r":110.95348000000001,"b":96.12591999999995,"coord_origin":"1"}},{"id":3,"text":"both","bbox":{"l":118.54498,"t":87.21935999999994,"r":136.25848,"b":96.12591999999995,"coord_origin":"1"}},{"id":4,"text":"network","bbox":{"l":142.82384,"t":87.21935999999994,"r":175.37166,"b":96.12591999999995,"coord_origin":"1"}},{"id":5,"text":"architectures","bbox":{"l":181.94698,"t":87.21935999999994,"r":232.83594000000002,"b":96.12591999999995,"coord_origin":"1"}},{"id":6,"text":"(IETD","bbox":{"l":239.41125,"t":87.21935999999994,"r":265.41364,"b":96.12591999999995,"coord_origin":"1"}},{"id":7,"text":"and","bbox":{"l":271.979,"t":87.21935999999994,"r":286.36499,"b":96.12591999999995,"coord_origin":"1"}},{"id":8,"text":"IEDD) require an implicit, custom trained object-character-","bbox":{"l":50.112,"t":99.17437999999993,"r":286.36505,"b":108.08092999999997,"coord_origin":"1"}},{"id":9,"text":"recognition (OCR) to obtain the content of the table-cells.","bbox":{"l":50.112,"t":111.13036999999997,"r":286.36511,"b":120.03692999999998,"coord_origin":"1"}},{"id":10,"text":"In the case of IETD, this OCR engine is implicit in the de-","bbox":{"l":50.112,"t":123.08538999999996,"r":286.36505,"b":131.99194,"coord_origin":"1"}},{"id":11,"text":"coder similar to [24]. For the IEDD, the OCR is solely em-","bbox":{"l":50.112,"t":135.04040999999995,"r":286.36514,"b":143.94696,"coord_origin":"1"}},{"id":12,"text":"bedded in the content-decoder. This reliance on a custom,","bbox":{"l":50.112,"t":146.99541999999997,"r":286.36511,"b":155.90197999999998,"coord_origin":"1"}},{"id":13,"text":"implicit OCR decoder is of course problematic. OCR is a","bbox":{"l":50.112,"t":158.95043999999996,"r":286.36505,"b":167.85699,"coord_origin":"1"}},{"id":14,"text":"well known and extremely tough problem, that often needs","bbox":{"l":50.112,"t":170.90545999999995,"r":286.36508,"b":179.81201,"coord_origin":"1"}},{"id":15,"text":"custom training for each individual language. However, the","bbox":{"l":50.112,"t":182.86145,"r":286.36508,"b":191.76801,"coord_origin":"1"}},{"id":16,"text":"limited availability for non-english content in the current","bbox":{"l":50.112,"t":194.81646999999998,"r":286.36511,"b":203.72302000000002,"coord_origin":"1"}},{"id":17,"text":"datasets, makes it impractical to apply the IETD and IEDD","bbox":{"l":50.112,"t":206.77148,"r":286.36511,"b":215.67804,"coord_origin":"1"}},{"id":18,"text":"methods on tables with other languages. Additionally, OCR","bbox":{"l":50.112,"t":218.7265,"r":286.36505,"b":227.63306,"coord_origin":"1"}},{"id":19,"text":"can be completely omitted if the tables originate from pro-","bbox":{"l":50.112,"t":230.68151999999998,"r":286.36505,"b":239.58807000000002,"coord_origin":"1"}},{"id":20,"text":"grammatic PDF documents with known positions of each","bbox":{"l":50.112,"t":242.63653999999997,"r":286.36511,"b":251.54309,"coord_origin":"1"}},{"id":21,"text":"cell. The latter was the inspiration for the work of this pa-","bbox":{"l":50.112,"t":254.59253,"r":286.36508,"b":263.49908000000005,"coord_origin":"1"}},{"id":22,"text":"per.","bbox":{"l":50.112,"t":266.54755,"r":64.776947,"b":275.45410000000004,"coord_origin":"1"}},{"id":23,"text":"Graph Neural networks","bbox":{"l":62.067001,"t":278.43895999999995,"r":171.56593,"b":287.39536,"coord_origin":"1"}},{"id":24,"text":":","bbox":{"l":171.56799,"t":278.55853,"r":174.3376,"b":287.46509,"coord_origin":"1"}},{"id":25,"text":"Graph Neural networks","bbox":{"l":185.18687,"t":278.55853,"r":286.35709,"b":287.46509,"coord_origin":"1"}},{"id":26,"text":"(GNN’s) take a radically different approach to table-","bbox":{"l":50.111992,"t":290.51453000000004,"r":286.36511,"b":299.42108,"coord_origin":"1"}},{"id":27,"text":"structure extraction.","bbox":{"l":50.111992,"t":302.46950999999996,"r":131.16771,"b":311.37607,"coord_origin":"1"}},{"id":28,"text":"Note that one table cell can consti-","bbox":{"l":138.84888,"t":302.46950999999996,"r":286.36508,"b":311.37607,"coord_origin":"1"}},{"id":29,"text":"tute out of multiple text-cells. To obtain the table-structure,","bbox":{"l":50.111992,"t":314.4245,"r":286.36505,"b":323.33105,"coord_origin":"1"}},{"id":30,"text":"one creates an initial graph, where each of the text-cells","bbox":{"l":50.111992,"t":326.37949000000003,"r":286.36508,"b":335.28604,"coord_origin":"1"}},{"id":31,"text":"becomes a node in the graph similar to [33, 34, 2]. Each","bbox":{"l":50.111992,"t":338.33447,"r":286.36505,"b":347.2410300000001,"coord_origin":"1"}},{"id":32,"text":"node is then associated with en embedding vector coming","bbox":{"l":50.111992,"t":350.28946,"r":286.36505,"b":359.19601,"coord_origin":"1"}},{"id":33,"text":"from the encoded image, its coordinates and the encoded","bbox":{"l":50.111992,"t":362.24545000000006,"r":286.36508,"b":371.15201,"coord_origin":"1"}},{"id":34,"text":"text. Furthermore, nodes that represent adjacent text-cells","bbox":{"l":50.111992,"t":374.20044,"r":286.36508,"b":383.10699,"coord_origin":"1"}},{"id":35,"text":"are linked. Graph Convolutional Networks (GCN’s) based","bbox":{"l":50.111992,"t":386.15542999999997,"r":286.36508,"b":395.06198,"coord_origin":"1"}},{"id":36,"text":"methods take the image as an input, but also the position of","bbox":{"l":50.111992,"t":398.11041000000006,"r":286.36508,"b":407.01697,"coord_origin":"1"}},{"id":37,"text":"the text-cells and their content [18]. The purpose of a GCN","bbox":{"l":50.111992,"t":410.0654,"r":286.36508,"b":418.97195,"coord_origin":"1"}},{"id":38,"text":"is to transform the input graph into a new graph, which re-","bbox":{"l":50.111992,"t":422.02038999999996,"r":286.36505,"b":430.92694,"coord_origin":"1"}},{"id":39,"text":"places the old links with new ones.","bbox":{"l":50.111992,"t":433.97638,"r":198.2359,"b":442.88293,"coord_origin":"1"}},{"id":40,"text":"The new links then","bbox":{"l":205.92703,"t":433.97638,"r":286.36505,"b":442.88293,"coord_origin":"1"}},{"id":41,"text":"represent the table-structure. With this approach, one can","bbox":{"l":50.111992,"t":445.93137,"r":286.36508,"b":454.83792000000005,"coord_origin":"1"}},{"id":42,"text":"avoid the need to build custom OCR decoders. However,","bbox":{"l":50.111992,"t":457.88635,"r":286.36505,"b":466.79291,"coord_origin":"1"}},{"id":43,"text":"the quality of the reconstructed structure is not comparable","bbox":{"l":50.111992,"t":469.84134,"r":286.36505,"b":478.74789,"coord_origin":"1"}},{"id":44,"text":"to the current state-of-the-art [18].","bbox":{"l":50.111992,"t":481.79633,"r":186.49998,"b":490.70288,"coord_origin":"1"}},{"id":45,"text":"Hybrid Deep Learning-Rule-Based approach","bbox":{"l":62.066994,"t":493.68875,"r":252.88068000000004,"b":502.64514,"coord_origin":"1"}},{"id":46,"text":": A pop-","bbox":{"l":252.88199,"t":493.80832,"r":286.36627,"b":502.71487,"coord_origin":"1"}},{"id":47,"text":"ular current model for table-structure identification is the","bbox":{"l":50.111984,"t":505.76331,"r":286.36505,"b":514.66986,"coord_origin":"1"}},{"id":48,"text":"use of a hybrid Deep Learning-Rule-Based approach similar","bbox":{"l":50.111984,"t":517.71829,"r":286.36505,"b":526.6248499999999,"coord_origin":"1"}},{"id":49,"text":"to [27, 29]. In this approach, one first detects the position of","bbox":{"l":50.111984,"t":529.67328,"r":286.36508,"b":538.57985,"coord_origin":"1"}},{"id":50,"text":"the table-cells with object detection (e.g. YoloVx or Mask-","bbox":{"l":50.111984,"t":541.62929,"r":286.36508,"b":550.53584,"coord_origin":"1"}},{"id":51,"text":"RCNN), then classifies the table into different types (from","bbox":{"l":50.111984,"t":553.58429,"r":286.36511,"b":562.4908399999999,"coord_origin":"1"}},{"id":52,"text":"its images) and finally uses different rule-sets to obtain","bbox":{"l":50.111984,"t":565.5392899999999,"r":286.36511,"b":574.44585,"coord_origin":"1"}},{"id":53,"text":"its table-structure. Currently, this approach achieves state-","bbox":{"l":50.111984,"t":577.49429,"r":286.36502,"b":586.40085,"coord_origin":"1"}},{"id":54,"text":"of-the-art results, but is not an end-to-end deep-learning","bbox":{"l":50.111984,"t":589.4493,"r":286.36505,"b":598.35585,"coord_origin":"1"}},{"id":55,"text":"method. As such, new rules need to be written if different","bbox":{"l":50.111984,"t":601.4043,"r":286.36502,"b":610.31085,"coord_origin":"1"}},{"id":56,"text":"types of tables are encountered.","bbox":{"l":50.111984,"t":613.36029,"r":175.98943,"b":622.26685,"coord_origin":"1"}},{"id":57,"text":"3.","bbox":{"l":50.111984,"t":635.94484,"r":57.82375699999999,"b":646.6925699999999,"coord_origin":"1"}},{"id":58,"text":"Datasets","bbox":{"l":68.106125,"t":635.94484,"r":105.22546,"b":646.6925699999999,"coord_origin":"1"}},{"id":59,"text":"We rely on large-scale datasets such as PubTabNet [37],","bbox":{"l":62.06698600000001,"t":656.42529,"r":286.36493,"b":665.33186,"coord_origin":"1"}},{"id":60,"text":"FinTabNet [36], and TableBank [17] datasets to train and","bbox":{"l":50.111984,"t":668.38029,"r":286.36508,"b":677.2868599999999,"coord_origin":"1"}},{"id":61,"text":"evaluate our models. These datasets span over various ap-","bbox":{"l":50.111984,"t":680.3353,"r":286.36502,"b":689.24186,"coord_origin":"1"}},{"id":62,"text":"pearance styles and content.","bbox":{"l":50.111984,"t":692.290298,"r":166.24602,"b":701.196861,"coord_origin":"1"}},{"id":63,"text":"We also introduce our own","bbox":{"l":173.68808,"t":692.290298,"r":286.36508,"b":701.196861,"coord_origin":"1"}},{"id":64,"text":"synthetically generated SynthTabNet dataset to fix an im-","bbox":{"l":50.111984,"t":704.2453,"r":286.36505,"b":713.151863,"coord_origin":"1"}},{"id":65,"text":"PubTabNet + FinTabNet","bbox":{"l":380.79849,"t":79.81176999999991,"r":486.84909,"b":88.55975000000001,"coord_origin":"1"}},{"id":66,"text":"Rows / Columns","bbox":{"l":396.76776,"t":242.02697999999998,"r":469.78748,"b":250.77495999999996,"coord_origin":"1"}},{"id":67,"text":"0","bbox":{"l":320.97653,"t":233.42296999999996,"r":324.79254,"b":239.255,"coord_origin":"1"}},{"id":68,"text":"20","bbox":{"l":410.483,"t":233.42296999999996,"r":418.11319,"b":239.255,"coord_origin":"1"}},{"id":69,"text":"40","bbox":{"l":500.84949,"t":233.42296999999996,"r":508.47968000000003,"b":239.255,"coord_origin":"1"}},{"id":70,"text":"10","bbox":{"l":365.29999,"t":233.42296999999996,"r":372.93018,"b":239.255,"coord_origin":"1"}},{"id":71,"text":"30","bbox":{"l":455.66626,"t":233.42296999999996,"r":463.29645,"b":239.255,"coord_origin":"1"}},{"id":72,"text":"50","bbox":{"l":542.03528,"t":233.42296999999996,"r":549.66547,"b":239.255,"coord_origin":"1"}},{"id":73,"text":"0","bbox":{"l":316.04474,"t":230.44617000000005,"r":319.86075,"b":236.27819999999997,"coord_origin":"1"}},{"id":74,"text":"2","bbox":{"l":312.62521,"t":198.69073000000003,"r":316.44122,"b":204.52277000000004,"coord_origin":"1"}},{"id":75,"text":"0","bbox":{"l":316.43942,"t":198.69073000000003,"r":320.2554,"b":204.52277000000004,"coord_origin":"1"}},{"id":76,"text":"4","bbox":{"l":313.14951,"t":168.09795999999994,"r":316.96552,"b":173.92998999999998,"coord_origin":"1"}},{"id":77,"text":"0","bbox":{"l":316.96371,"t":168.09795999999994,"r":320.77969,"b":173.92998999999998,"coord_origin":"1"}},{"id":78,"text":"6","bbox":{"l":312.92972,"t":136.58771000000002,"r":316.74573,"b":142.41974000000005,"coord_origin":"1"}},{"id":79,"text":"0","bbox":{"l":316.74393,"t":136.58771000000002,"r":320.55991,"b":142.41974000000005,"coord_origin":"1"}},{"id":80,"text":"8","bbox":{"l":312.48227,"t":105.60175000000004,"r":316.29828,"b":111.43377999999996,"coord_origin":"1"}},{"id":81,"text":"0","bbox":{"l":316.29648,"t":105.60175000000004,"r":320.11246,"b":111.43377999999996,"coord_origin":"1"}},{"id":82,"text":"1","bbox":{"l":312.48227,"t":212.25922000000003,"r":316.29828,"b":218.09124999999995,"coord_origin":"1"}},{"id":83,"text":"0","bbox":{"l":316.29648,"t":212.25922000000003,"r":320.11246,"b":218.09124999999995,"coord_origin":"1"}},{"id":84,"text":"3","bbox":{"l":313.07639,"t":183.72198000000003,"r":316.8924,"b":189.55402000000004,"coord_origin":"1"}},{"id":85,"text":"0","bbox":{"l":316.89059,"t":183.72198000000003,"r":320.70657,"b":189.55402000000004,"coord_origin":"1"}},{"id":86,"text":"5","bbox":{"l":312.76321,"t":152.47400000000005,"r":316.57922,"b":158.30602999999996,"coord_origin":"1"}},{"id":87,"text":"0","bbox":{"l":316.57742,"t":152.47400000000005,"r":320.3934,"b":158.30602999999996,"coord_origin":"1"}},{"id":88,"text":"7","bbox":{"l":312.19775,"t":120.57050000000004,"r":316.01376,"b":126.40252999999996,"coord_origin":"1"}},{"id":89,"text":"0","bbox":{"l":316.01196,"t":120.57050000000004,"r":319.82794,"b":126.40252999999996,"coord_origin":"1"}},{"id":90,"text":"9","bbox":{"l":312.8165,"t":90.1087,"r":316.63251,"b":95.94073000000003,"coord_origin":"1"}},{"id":91,"text":"0","bbox":{"l":316.63071,"t":90.1087,"r":320.44669,"b":95.94073000000003,"coord_origin":"1"}},{"id":92,"text":"0","bbox":{"l":532.17426,"t":222.72729000000004,"r":536.94427,"b":230.01727000000005,"coord_origin":"1"}},{"id":93,"text":"10K","bbox":{"l":532.87952,"t":108.26702999999986,"r":547.61249,"b":115.55700999999999,"coord_origin":"1"}},{"id":94,"text":"8K","bbox":{"l":532.7735,"t":130.78101000000004,"r":542.73877,"b":138.07097999999996,"coord_origin":"1"}},{"id":95,"text":"6K","bbox":{"l":532.79901,"t":153.92352000000005,"r":542.76428,"b":161.21349999999995,"coord_origin":"1"}},{"id":96,"text":"4K","bbox":{"l":532.5705,"t":176.75800000000004,"r":542.53577,"b":184.04796999999996,"coord_origin":"1"}},{"id":97,"text":"2K","bbox":{"l":532.14551,"t":199.6463,"r":542.11078,"b":206.93628,"coord_origin":"1"}},{"id":98,"text":"Figure 2:","bbox":{"l":308.862,"t":267.83636,"r":346.06238,"b":276.74292,"coord_origin":"1"}},{"id":99,"text":"Distribution of the tables across different table","bbox":{"l":354.49072,"t":267.83636,"r":545.11511,"b":276.74292,"coord_origin":"1"}},{"id":100,"text":"dimensions in PubTabNet + FinTabNet datasets","bbox":{"l":308.862,"t":279.79132000000004,"r":498.56989,"b":288.6979099999999,"coord_origin":"1"}},{"id":101,"text":"balance in the previous datasets.","bbox":{"l":308.862,"t":317.47336,"r":437.27002,"b":326.37991,"coord_origin":"1"}},{"id":102,"text":"The PubTabNet dataset contains 509k tables delivered as","bbox":{"l":320.81699,"t":331.53137,"r":545.11505,"b":340.43793,"coord_origin":"1"}},{"id":103,"text":"annotated PNG images. The annotations consist of the table","bbox":{"l":308.862,"t":343.48635999999993,"r":545.11517,"b":352.39291,"coord_origin":"1"}},{"id":104,"text":"structure represented in HTML format, the tokenized text","bbox":{"l":308.862,"t":355.44235,"r":545.11505,"b":364.34890999999993,"coord_origin":"1"}},{"id":105,"text":"and its bounding boxes per table cell. Fig. 1 shows the ap-","bbox":{"l":308.862,"t":367.39734,"r":545.11505,"b":376.30389,"coord_origin":"1"}},{"id":106,"text":"pearance style of PubTabNet. Depending on its complexity,","bbox":{"l":308.862,"t":379.35233,"r":545.11511,"b":388.25888,"coord_origin":"1"}},{"id":107,"text":"a table is characterized as “simple” when it does not contain","bbox":{"l":308.862,"t":391.30731,"r":545.11511,"b":400.21386999999993,"coord_origin":"1"}},{"id":108,"text":"row spans or column spans, otherwise it is “complex”. The","bbox":{"l":308.862,"t":403.26230000000004,"r":545.11505,"b":412.16885,"coord_origin":"1"}},{"id":109,"text":"dataset is divided into Train and Val splits (roughly 98% and","bbox":{"l":308.862,"t":415.21729,"r":545.11511,"b":424.12384,"coord_origin":"1"}},{"id":110,"text":"2%). The Train split consists of 54% simple and 46% com-","bbox":{"l":308.862,"t":427.17328,"r":545.11517,"b":436.0798300000001,"coord_origin":"1"}},{"id":111,"text":"plex tables and the Val split of 51% and 49% respectively.","bbox":{"l":308.862,"t":439.12827,"r":545.11517,"b":448.03482,"coord_origin":"1"}},{"id":112,"text":"The FinTabNet dataset contains 112k tables delivered as","bbox":{"l":308.862,"t":451.08325,"r":545.11511,"b":459.98981000000003,"coord_origin":"1"}},{"id":113,"text":"single-page PDF documents with mixed table structures and","bbox":{"l":308.862,"t":463.03824,"r":545.11505,"b":471.94479,"coord_origin":"1"}},{"id":114,"text":"text content. Similarly to the PubTabNet, the annotations","bbox":{"l":308.862,"t":474.99323,"r":545.11511,"b":483.89978,"coord_origin":"1"}},{"id":115,"text":"of FinTabNet include the table structure in HTML, the to-","bbox":{"l":308.862,"t":486.94922,"r":545.11511,"b":495.85577,"coord_origin":"1"}},{"id":116,"text":"kenized text and the bounding boxes on a table cell basis.","bbox":{"l":308.862,"t":498.90421,"r":545.11511,"b":507.81076,"coord_origin":"1"}},{"id":117,"text":"The dataset is divided into Train, Test and Val splits (81%,","bbox":{"l":308.862,"t":510.85919,"r":545.11517,"b":519.76575,"coord_origin":"1"}},{"id":118,"text":"9.5%, 9.5%), and each one is almost equally divided into","bbox":{"l":308.862,"t":522.8141800000001,"r":545.11517,"b":531.72073,"coord_origin":"1"}},{"id":119,"text":"simple and complex tables (Train: 48% simple, 52% com-","bbox":{"l":308.862,"t":534.76917,"r":545.11505,"b":543.67574,"coord_origin":"1"}},{"id":120,"text":"plex, Test: 48% simple, 52% complex, Test: 53% simple,","bbox":{"l":308.862,"t":546.72418,"r":545.11511,"b":555.6307400000001,"coord_origin":"1"}},{"id":121,"text":"47% complex). Finally the TableBank dataset consists of","bbox":{"l":308.862,"t":558.6801800000001,"r":545.11511,"b":567.58673,"coord_origin":"1"}},{"id":122,"text":"145k tables provided as JPEG images. The latter has anno-","bbox":{"l":308.862,"t":570.63518,"r":545.11505,"b":579.54173,"coord_origin":"1"}},{"id":123,"text":"tations for the table structure, but only few with bounding","bbox":{"l":308.862,"t":582.59018,"r":545.11499,"b":591.49673,"coord_origin":"1"}},{"id":124,"text":"boxes of the table cells. The entire dataset consists of sim-","bbox":{"l":308.862,"t":594.54518,"r":545.11517,"b":603.45174,"coord_origin":"1"}},{"id":125,"text":"ple tables and it is divided into 90% Train, 3% Test and 7%","bbox":{"l":308.862,"t":606.50018,"r":545.11511,"b":615.40674,"coord_origin":"1"}},{"id":126,"text":"Val splits.","bbox":{"l":308.862,"t":618.45518,"r":348.16446,"b":627.36174,"coord_origin":"1"}},{"id":127,"text":"Due to the heterogeneity across the dataset formats, it","bbox":{"l":320.81699,"t":632.51419,"r":545.11487,"b":641.42075,"coord_origin":"1"}},{"id":128,"text":"was necessary to combine all available data into one homog-","bbox":{"l":308.862,"t":644.46919,"r":545.11511,"b":653.37575,"coord_origin":"1"}},{"id":129,"text":"enized dataset before we could train our models for practi-","bbox":{"l":308.862,"t":656.42419,"r":545.11511,"b":665.33076,"coord_origin":"1"}},{"id":130,"text":"cal purposes. Given the size of PubTabNet, we adopted its","bbox":{"l":308.862,"t":668.38019,"r":545.11499,"b":677.28676,"coord_origin":"1"}},{"id":131,"text":"annotation format and we extracted and converted all tables","bbox":{"l":308.862,"t":680.33519,"r":545.11505,"b":689.24176,"coord_origin":"1"}},{"id":132,"text":"as PNG images with a resolution of 72 dpi. Additionally,","bbox":{"l":308.862,"t":692.290192,"r":545.11505,"b":701.196762,"coord_origin":"1"}},{"id":133,"text":"we have filtered out tables with extreme sizes due to small","bbox":{"l":308.862,"t":704.245193,"r":545.11511,"b":713.151764,"coord_origin":"1"}},{"id":134,"text":"3","bbox":{"l":295.121,"t":734.133198,"r":300.10229,"b":743.039761,"coord_origin":"1"}}],"predictions":{"layout":{"clusters":[{"id":0,"label":"Text","bbox":{"l":49.48567485809326,"t":74.29503536224365,"r":250.15102,"b":84.11492999999996,"coord_origin":"1"},"confidence":0.8762400150299072,"cells":[{"id":0,"text":"tag-decoder which is constrained to the table-tags.","bbox":{"l":50.112,"t":75.20836999999995,"r":250.15102,"b":84.11492999999996,"coord_origin":"1"}}]},{"id":1,"label":"Text","bbox":{"l":49.11482083797455,"t":85.99874410629275,"r":286.36514,"b":276.05459117889404,"coord_origin":"1"},"confidence":0.9855231046676636,"cells":[{"id":1,"text":"In","bbox":{"l":62.067001,"t":87.21935999999994,"r":70.365845,"b":96.12591999999995,"coord_origin":"1"}},{"id":2,"text":"practice,","bbox":{"l":76.931198,"t":87.21935999999994,"r":110.95348000000001,"b":96.12591999999995,"coord_origin":"1"}},{"id":3,"text":"both","bbox":{"l":118.54498,"t":87.21935999999994,"r":136.25848,"b":96.12591999999995,"coord_origin":"1"}},{"id":4,"text":"network","bbox":{"l":142.82384,"t":87.21935999999994,"r":175.37166,"b":96.12591999999995,"coord_origin":"1"}},{"id":5,"text":"architectures","bbox":{"l":181.94698,"t":87.21935999999994,"r":232.83594000000002,"b":96.12591999999995,"coord_origin":"1"}},{"id":6,"text":"(IETD","bbox":{"l":239.41125,"t":87.21935999999994,"r":265.41364,"b":96.12591999999995,"coord_origin":"1"}},{"id":7,"text":"and","bbox":{"l":271.979,"t":87.21935999999994,"r":286.36499,"b":96.12591999999995,"coord_origin":"1"}},{"id":8,"text":"IEDD) require an implicit, custom trained object-character-","bbox":{"l":50.112,"t":99.17437999999993,"r":286.36505,"b":108.08092999999997,"coord_origin":"1"}},{"id":9,"text":"recognition (OCR) to obtain the content of the table-cells.","bbox":{"l":50.112,"t":111.13036999999997,"r":286.36511,"b":120.03692999999998,"coord_origin":"1"}},{"id":10,"text":"In the case of IETD, this OCR engine is implicit in the de-","bbox":{"l":50.112,"t":123.08538999999996,"r":286.36505,"b":131.99194,"coord_origin":"1"}},{"id":11,"text":"coder similar to [24]. For the IEDD, the OCR is solely em-","bbox":{"l":50.112,"t":135.04040999999995,"r":286.36514,"b":143.94696,"coord_origin":"1"}},{"id":12,"text":"bedded in the content-decoder. This reliance on a custom,","bbox":{"l":50.112,"t":146.99541999999997,"r":286.36511,"b":155.90197999999998,"coord_origin":"1"}},{"id":13,"text":"implicit OCR decoder is of course problematic. OCR is a","bbox":{"l":50.112,"t":158.95043999999996,"r":286.36505,"b":167.85699,"coord_origin":"1"}},{"id":14,"text":"well known and extremely tough problem, that often needs","bbox":{"l":50.112,"t":170.90545999999995,"r":286.36508,"b":179.81201,"coord_origin":"1"}},{"id":15,"text":"custom training for each individual language. However, the","bbox":{"l":50.112,"t":182.86145,"r":286.36508,"b":191.76801,"coord_origin":"1"}},{"id":16,"text":"limited availability for non-english content in the current","bbox":{"l":50.112,"t":194.81646999999998,"r":286.36511,"b":203.72302000000002,"coord_origin":"1"}},{"id":17,"text":"datasets, makes it impractical to apply the IETD and IEDD","bbox":{"l":50.112,"t":206.77148,"r":286.36511,"b":215.67804,"coord_origin":"1"}},{"id":18,"text":"methods on tables with other languages. Additionally, OCR","bbox":{"l":50.112,"t":218.7265,"r":286.36505,"b":227.63306,"coord_origin":"1"}},{"id":19,"text":"can be completely omitted if the tables originate from pro-","bbox":{"l":50.112,"t":230.68151999999998,"r":286.36505,"b":239.58807000000002,"coord_origin":"1"}},{"id":20,"text":"grammatic PDF documents with known positions of each","bbox":{"l":50.112,"t":242.63653999999997,"r":286.36511,"b":251.54309,"coord_origin":"1"}},{"id":21,"text":"cell. The latter was the inspiration for the work of this pa-","bbox":{"l":50.112,"t":254.59253,"r":286.36508,"b":263.49908000000005,"coord_origin":"1"}},{"id":22,"text":"per.","bbox":{"l":50.112,"t":266.54755,"r":64.776947,"b":275.45410000000004,"coord_origin":"1"}}]},{"id":2,"label":"Text","bbox":{"l":49.138185024261475,"t":277.6425739288329,"r":286.5478357315064,"b":490.70288,"coord_origin":"1"},"confidence":0.9872949719429016,"cells":[{"id":23,"text":"Graph Neural networks","bbox":{"l":62.067001,"t":278.43895999999995,"r":171.56593,"b":287.39536,"coord_origin":"1"}},{"id":24,"text":":","bbox":{"l":171.56799,"t":278.55853,"r":174.3376,"b":287.46509,"coord_origin":"1"}},{"id":25,"text":"Graph Neural networks","bbox":{"l":185.18687,"t":278.55853,"r":286.35709,"b":287.46509,"coord_origin":"1"}},{"id":26,"text":"(GNN’s) take a radically different approach to table-","bbox":{"l":50.111992,"t":290.51453000000004,"r":286.36511,"b":299.42108,"coord_origin":"1"}},{"id":27,"text":"structure extraction.","bbox":{"l":50.111992,"t":302.46950999999996,"r":131.16771,"b":311.37607,"coord_origin":"1"}},{"id":28,"text":"Note that one table cell can consti-","bbox":{"l":138.84888,"t":302.46950999999996,"r":286.36508,"b":311.37607,"coord_origin":"1"}},{"id":29,"text":"tute out of multiple text-cells. To obtain the table-structure,","bbox":{"l":50.111992,"t":314.4245,"r":286.36505,"b":323.33105,"coord_origin":"1"}},{"id":30,"text":"one creates an initial graph, where each of the text-cells","bbox":{"l":50.111992,"t":326.37949000000003,"r":286.36508,"b":335.28604,"coord_origin":"1"}},{"id":31,"text":"becomes a node in the graph similar to [33, 34, 2]. Each","bbox":{"l":50.111992,"t":338.33447,"r":286.36505,"b":347.2410300000001,"coord_origin":"1"}},{"id":32,"text":"node is then associated with en embedding vector coming","bbox":{"l":50.111992,"t":350.28946,"r":286.36505,"b":359.19601,"coord_origin":"1"}},{"id":33,"text":"from the encoded image, its coordinates and the encoded","bbox":{"l":50.111992,"t":362.24545000000006,"r":286.36508,"b":371.15201,"coord_origin":"1"}},{"id":34,"text":"text. Furthermore, nodes that represent adjacent text-cells","bbox":{"l":50.111992,"t":374.20044,"r":286.36508,"b":383.10699,"coord_origin":"1"}},{"id":35,"text":"are linked. Graph Convolutional Networks (GCN’s) based","bbox":{"l":50.111992,"t":386.15542999999997,"r":286.36508,"b":395.06198,"coord_origin":"1"}},{"id":36,"text":"methods take the image as an input, but also the position of","bbox":{"l":50.111992,"t":398.11041000000006,"r":286.36508,"b":407.01697,"coord_origin":"1"}},{"id":37,"text":"the text-cells and their content [18]. The purpose of a GCN","bbox":{"l":50.111992,"t":410.0654,"r":286.36508,"b":418.97195,"coord_origin":"1"}},{"id":38,"text":"is to transform the input graph into a new graph, which re-","bbox":{"l":50.111992,"t":422.02038999999996,"r":286.36505,"b":430.92694,"coord_origin":"1"}},{"id":39,"text":"places the old links with new ones.","bbox":{"l":50.111992,"t":433.97638,"r":198.2359,"b":442.88293,"coord_origin":"1"}},{"id":40,"text":"The new links then","bbox":{"l":205.92703,"t":433.97638,"r":286.36505,"b":442.88293,"coord_origin":"1"}},{"id":41,"text":"represent the table-structure. With this approach, one can","bbox":{"l":50.111992,"t":445.93137,"r":286.36508,"b":454.83792000000005,"coord_origin":"1"}},{"id":42,"text":"avoid the need to build custom OCR decoders. However,","bbox":{"l":50.111992,"t":457.88635,"r":286.36505,"b":466.79291,"coord_origin":"1"}},{"id":43,"text":"the quality of the reconstructed structure is not comparable","bbox":{"l":50.111992,"t":469.84134,"r":286.36505,"b":478.74789,"coord_origin":"1"}},{"id":44,"text":"to the current state-of-the-art [18].","bbox":{"l":50.111992,"t":481.79633,"r":186.49998,"b":490.70288,"coord_origin":"1"}}]},{"id":3,"label":"Text","bbox":{"l":49.34700357913971,"t":492.7602035522461,"r":286.6784271240235,"b":622.8181755065917,"coord_origin":"1"},"confidence":0.9880095720291138,"cells":[{"id":45,"text":"Hybrid Deep Learning-Rule-Based approach","bbox":{"l":62.066994,"t":493.68875,"r":252.88068000000004,"b":502.64514,"coord_origin":"1"}},{"id":46,"text":": A pop-","bbox":{"l":252.88199,"t":493.80832,"r":286.36627,"b":502.71487,"coord_origin":"1"}},{"id":47,"text":"ular current model for table-structure identification is the","bbox":{"l":50.111984,"t":505.76331,"r":286.36505,"b":514.66986,"coord_origin":"1"}},{"id":48,"text":"use of a hybrid Deep Learning-Rule-Based approach similar","bbox":{"l":50.111984,"t":517.71829,"r":286.36505,"b":526.6248499999999,"coord_origin":"1"}},{"id":49,"text":"to [27, 29]. In this approach, one first detects the position of","bbox":{"l":50.111984,"t":529.67328,"r":286.36508,"b":538.57985,"coord_origin":"1"}},{"id":50,"text":"the table-cells with object detection (e.g. YoloVx or Mask-","bbox":{"l":50.111984,"t":541.62929,"r":286.36508,"b":550.53584,"coord_origin":"1"}},{"id":51,"text":"RCNN), then classifies the table into different types (from","bbox":{"l":50.111984,"t":553.58429,"r":286.36511,"b":562.4908399999999,"coord_origin":"1"}},{"id":52,"text":"its images) and finally uses different rule-sets to obtain","bbox":{"l":50.111984,"t":565.5392899999999,"r":286.36511,"b":574.44585,"coord_origin":"1"}},{"id":53,"text":"its table-structure. Currently, this approach achieves state-","bbox":{"l":50.111984,"t":577.49429,"r":286.36502,"b":586.40085,"coord_origin":"1"}},{"id":54,"text":"of-the-art results, but is not an end-to-end deep-learning","bbox":{"l":50.111984,"t":589.4493,"r":286.36505,"b":598.35585,"coord_origin":"1"}},{"id":55,"text":"method. As such, new rules need to be written if different","bbox":{"l":50.111984,"t":601.4043,"r":286.36502,"b":610.31085,"coord_origin":"1"}},{"id":56,"text":"types of tables are encountered.","bbox":{"l":50.111984,"t":613.36029,"r":175.98943,"b":622.26685,"coord_origin":"1"}}]},{"id":4,"label":"Section-header","bbox":{"l":49.34483313560486,"t":635.3652351379395,"r":105.30262470245361,"b":646.6925699999999,"coord_origin":"1"},"confidence":0.9314619302749634,"cells":[{"id":57,"text":"3.","bbox":{"l":50.111984,"t":635.94484,"r":57.82375699999999,"b":646.6925699999999,"coord_origin":"1"}},{"id":58,"text":"Datasets","bbox":{"l":68.106125,"t":635.94484,"r":105.22546,"b":646.6925699999999,"coord_origin":"1"}}]},{"id":5,"label":"Text","bbox":{"l":49.34266269207001,"t":655.4480232238769,"r":286.3688890457153,"b":713.1628509521485,"coord_origin":"1"},"confidence":0.9872632026672363,"cells":[{"id":59,"text":"We rely on large-scale datasets such as PubTabNet [37],","bbox":{"l":62.06698600000001,"t":656.42529,"r":286.36493,"b":665.33186,"coord_origin":"1"}},{"id":60,"text":"FinTabNet [36], and TableBank [17] datasets to train and","bbox":{"l":50.111984,"t":668.38029,"r":286.36508,"b":677.2868599999999,"coord_origin":"1"}},{"id":61,"text":"evaluate our models. These datasets span over various ap-","bbox":{"l":50.111984,"t":680.3353,"r":286.36502,"b":689.24186,"coord_origin":"1"}},{"id":62,"text":"pearance styles and content.","bbox":{"l":50.111984,"t":692.290298,"r":166.24602,"b":701.196861,"coord_origin":"1"}},{"id":63,"text":"We also introduce our own","bbox":{"l":173.68808,"t":692.290298,"r":286.36508,"b":701.196861,"coord_origin":"1"}},{"id":64,"text":"synthetically generated SynthTabNet dataset to fix an im-","bbox":{"l":50.111984,"t":704.2453,"r":286.36505,"b":713.151863,"coord_origin":"1"}}]},{"id":6,"label":"Picture","bbox":{"l":311.3420780181885,"t":78.12818670272827,"r":550.2800857543945,"b":251.03088569641113,"coord_origin":"1"},"confidence":0.979541003704071,"cells":[{"id":65,"text":"PubTabNet + FinTabNet","bbox":{"l":380.79849,"t":79.81176999999991,"r":486.84909,"b":88.55975000000001,"coord_origin":"1"}},{"id":66,"text":"Rows / Columns","bbox":{"l":396.76776,"t":242.02697999999998,"r":469.78748,"b":250.77495999999996,"coord_origin":"1"}},{"id":67,"text":"0","bbox":{"l":320.97653,"t":233.42296999999996,"r":324.79254,"b":239.255,"coord_origin":"1"}},{"id":68,"text":"20","bbox":{"l":410.483,"t":233.42296999999996,"r":418.11319,"b":239.255,"coord_origin":"1"}},{"id":69,"text":"40","bbox":{"l":500.84949,"t":233.42296999999996,"r":508.47968000000003,"b":239.255,"coord_origin":"1"}},{"id":70,"text":"10","bbox":{"l":365.29999,"t":233.42296999999996,"r":372.93018,"b":239.255,"coord_origin":"1"}},{"id":71,"text":"30","bbox":{"l":455.66626,"t":233.42296999999996,"r":463.29645,"b":239.255,"coord_origin":"1"}},{"id":72,"text":"50","bbox":{"l":542.03528,"t":233.42296999999996,"r":549.66547,"b":239.255,"coord_origin":"1"}},{"id":73,"text":"0","bbox":{"l":316.04474,"t":230.44617000000005,"r":319.86075,"b":236.27819999999997,"coord_origin":"1"}},{"id":74,"text":"2","bbox":{"l":312.62521,"t":198.69073000000003,"r":316.44122,"b":204.52277000000004,"coord_origin":"1"}},{"id":75,"text":"0","bbox":{"l":316.43942,"t":198.69073000000003,"r":320.2554,"b":204.52277000000004,"coord_origin":"1"}},{"id":76,"text":"4","bbox":{"l":313.14951,"t":168.09795999999994,"r":316.96552,"b":173.92998999999998,"coord_origin":"1"}},{"id":77,"text":"0","bbox":{"l":316.96371,"t":168.09795999999994,"r":320.77969,"b":173.92998999999998,"coord_origin":"1"}},{"id":78,"text":"6","bbox":{"l":312.92972,"t":136.58771000000002,"r":316.74573,"b":142.41974000000005,"coord_origin":"1"}},{"id":79,"text":"0","bbox":{"l":316.74393,"t":136.58771000000002,"r":320.55991,"b":142.41974000000005,"coord_origin":"1"}},{"id":80,"text":"8","bbox":{"l":312.48227,"t":105.60175000000004,"r":316.29828,"b":111.43377999999996,"coord_origin":"1"}},{"id":81,"text":"0","bbox":{"l":316.29648,"t":105.60175000000004,"r":320.11246,"b":111.43377999999996,"coord_origin":"1"}},{"id":82,"text":"1","bbox":{"l":312.48227,"t":212.25922000000003,"r":316.29828,"b":218.09124999999995,"coord_origin":"1"}},{"id":83,"text":"0","bbox":{"l":316.29648,"t":212.25922000000003,"r":320.11246,"b":218.09124999999995,"coord_origin":"1"}},{"id":84,"text":"3","bbox":{"l":313.07639,"t":183.72198000000003,"r":316.8924,"b":189.55402000000004,"coord_origin":"1"}},{"id":85,"text":"0","bbox":{"l":316.89059,"t":183.72198000000003,"r":320.70657,"b":189.55402000000004,"coord_origin":"1"}},{"id":86,"text":"5","bbox":{"l":312.76321,"t":152.47400000000005,"r":316.57922,"b":158.30602999999996,"coord_origin":"1"}},{"id":87,"text":"0","bbox":{"l":316.57742,"t":152.47400000000005,"r":320.3934,"b":158.30602999999996,"coord_origin":"1"}},{"id":88,"text":"7","bbox":{"l":312.19775,"t":120.57050000000004,"r":316.01376,"b":126.40252999999996,"coord_origin":"1"}},{"id":89,"text":"0","bbox":{"l":316.01196,"t":120.57050000000004,"r":319.82794,"b":126.40252999999996,"coord_origin":"1"}},{"id":90,"text":"9","bbox":{"l":312.8165,"t":90.1087,"r":316.63251,"b":95.94073000000003,"coord_origin":"1"}},{"id":91,"text":"0","bbox":{"l":316.63071,"t":90.1087,"r":320.44669,"b":95.94073000000003,"coord_origin":"1"}},{"id":92,"text":"0","bbox":{"l":532.17426,"t":222.72729000000004,"r":536.94427,"b":230.01727000000005,"coord_origin":"1"}},{"id":93,"text":"10K","bbox":{"l":532.87952,"t":108.26702999999986,"r":547.61249,"b":115.55700999999999,"coord_origin":"1"}},{"id":94,"text":"8K","bbox":{"l":532.7735,"t":130.78101000000004,"r":542.73877,"b":138.07097999999996,"coord_origin":"1"}},{"id":95,"text":"6K","bbox":{"l":532.79901,"t":153.92352000000005,"r":542.76428,"b":161.21349999999995,"coord_origin":"1"}},{"id":96,"text":"4K","bbox":{"l":532.5705,"t":176.75800000000004,"r":542.53577,"b":184.04796999999996,"coord_origin":"1"}},{"id":97,"text":"2K","bbox":{"l":532.14551,"t":199.6463,"r":542.11078,"b":206.93628,"coord_origin":"1"}}]},{"id":7,"label":"Caption","bbox":{"l":308.0231597900391,"t":267.0846988677979,"r":545.11511,"b":288.6979099999999,"coord_origin":"1"},"confidence":0.9705182313919067,"cells":[{"id":98,"text":"Figure 2:","bbox":{"l":308.862,"t":267.83636,"r":346.06238,"b":276.74292,"coord_origin":"1"}},{"id":99,"text":"Distribution of the tables across different table","bbox":{"l":354.49072,"t":267.83636,"r":545.11511,"b":276.74292,"coord_origin":"1"}},{"id":100,"text":"dimensions in PubTabNet + FinTabNet datasets","bbox":{"l":308.862,"t":279.79132000000004,"r":498.56989,"b":288.6979099999999,"coord_origin":"1"}}]},{"id":8,"label":"Text","bbox":{"l":307.9623727798462,"t":316.53379096984867,"r":437.27002,"b":326.5220111846924,"coord_origin":"1"},"confidence":0.8795380592346191,"cells":[{"id":101,"text":"balance in the previous datasets.","bbox":{"l":308.862,"t":317.47336,"r":437.27002,"b":326.37991,"coord_origin":"1"}}]},{"id":9,"label":"Text","bbox":{"l":307.9255153656006,"t":330.2963710784912,"r":545.6851364135742,"b":627.7393981933593,"coord_origin":"1"},"confidence":0.9885321259498596,"cells":[{"id":102,"text":"The PubTabNet dataset contains 509k tables delivered as","bbox":{"l":320.81699,"t":331.53137,"r":545.11505,"b":340.43793,"coord_origin":"1"}},{"id":103,"text":"annotated PNG images. The annotations consist of the table","bbox":{"l":308.862,"t":343.48635999999993,"r":545.11517,"b":352.39291,"coord_origin":"1"}},{"id":104,"text":"structure represented in HTML format, the tokenized text","bbox":{"l":308.862,"t":355.44235,"r":545.11505,"b":364.34890999999993,"coord_origin":"1"}},{"id":105,"text":"and its bounding boxes per table cell. Fig. 1 shows the ap-","bbox":{"l":308.862,"t":367.39734,"r":545.11505,"b":376.30389,"coord_origin":"1"}},{"id":106,"text":"pearance style of PubTabNet. Depending on its complexity,","bbox":{"l":308.862,"t":379.35233,"r":545.11511,"b":388.25888,"coord_origin":"1"}},{"id":107,"text":"a table is characterized as “simple” when it does not contain","bbox":{"l":308.862,"t":391.30731,"r":545.11511,"b":400.21386999999993,"coord_origin":"1"}},{"id":108,"text":"row spans or column spans, otherwise it is “complex”. The","bbox":{"l":308.862,"t":403.26230000000004,"r":545.11505,"b":412.16885,"coord_origin":"1"}},{"id":109,"text":"dataset is divided into Train and Val splits (roughly 98% and","bbox":{"l":308.862,"t":415.21729,"r":545.11511,"b":424.12384,"coord_origin":"1"}},{"id":110,"text":"2%). The Train split consists of 54% simple and 46% com-","bbox":{"l":308.862,"t":427.17328,"r":545.11517,"b":436.0798300000001,"coord_origin":"1"}},{"id":111,"text":"plex tables and the Val split of 51% and 49% respectively.","bbox":{"l":308.862,"t":439.12827,"r":545.11517,"b":448.03482,"coord_origin":"1"}},{"id":112,"text":"The FinTabNet dataset contains 112k tables delivered as","bbox":{"l":308.862,"t":451.08325,"r":545.11511,"b":459.98981000000003,"coord_origin":"1"}},{"id":113,"text":"single-page PDF documents with mixed table structures and","bbox":{"l":308.862,"t":463.03824,"r":545.11505,"b":471.94479,"coord_origin":"1"}},{"id":114,"text":"text content. Similarly to the PubTabNet, the annotations","bbox":{"l":308.862,"t":474.99323,"r":545.11511,"b":483.89978,"coord_origin":"1"}},{"id":115,"text":"of FinTabNet include the table structure in HTML, the to-","bbox":{"l":308.862,"t":486.94922,"r":545.11511,"b":495.85577,"coord_origin":"1"}},{"id":116,"text":"kenized text and the bounding boxes on a table cell basis.","bbox":{"l":308.862,"t":498.90421,"r":545.11511,"b":507.81076,"coord_origin":"1"}},{"id":117,"text":"The dataset is divided into Train, Test and Val splits (81%,","bbox":{"l":308.862,"t":510.85919,"r":545.11517,"b":519.76575,"coord_origin":"1"}},{"id":118,"text":"9.5%, 9.5%), and each one is almost equally divided into","bbox":{"l":308.862,"t":522.8141800000001,"r":545.11517,"b":531.72073,"coord_origin":"1"}},{"id":119,"text":"simple and complex tables (Train: 48% simple, 52% com-","bbox":{"l":308.862,"t":534.76917,"r":545.11505,"b":543.67574,"coord_origin":"1"}},{"id":120,"text":"plex, Test: 48% simple, 52% complex, Test: 53% simple,","bbox":{"l":308.862,"t":546.72418,"r":545.11511,"b":555.6307400000001,"coord_origin":"1"}},{"id":121,"text":"47% complex). Finally the TableBank dataset consists of","bbox":{"l":308.862,"t":558.6801800000001,"r":545.11511,"b":567.58673,"coord_origin":"1"}},{"id":122,"text":"145k tables provided as JPEG images. The latter has anno-","bbox":{"l":308.862,"t":570.63518,"r":545.11505,"b":579.54173,"coord_origin":"1"}},{"id":123,"text":"tations for the table structure, but only few with bounding","bbox":{"l":308.862,"t":582.59018,"r":545.11499,"b":591.49673,"coord_origin":"1"}},{"id":124,"text":"boxes of the table cells. The entire dataset consists of sim-","bbox":{"l":308.862,"t":594.54518,"r":545.11517,"b":603.45174,"coord_origin":"1"}},{"id":125,"text":"ple tables and it is divided into 90% Train, 3% Test and 7%","bbox":{"l":308.862,"t":606.50018,"r":545.11511,"b":615.40674,"coord_origin":"1"}},{"id":126,"text":"Val splits.","bbox":{"l":308.862,"t":618.45518,"r":348.16446,"b":627.36174,"coord_origin":"1"}}]},{"id":10,"label":"Text","bbox":{"l":307.7597299575806,"t":631.6698051452637,"r":545.2829441070556,"b":713.151764,"coord_origin":"1"},"confidence":0.9847508668899536,"cells":[{"id":127,"text":"Due to the heterogeneity across the dataset formats, it","bbox":{"l":320.81699,"t":632.51419,"r":545.11487,"b":641.42075,"coord_origin":"1"}},{"id":128,"text":"was necessary to combine all available data into one homog-","bbox":{"l":308.862,"t":644.46919,"r":545.11511,"b":653.37575,"coord_origin":"1"}},{"id":129,"text":"enized dataset before we could train our models for practi-","bbox":{"l":308.862,"t":656.42419,"r":545.11511,"b":665.33076,"coord_origin":"1"}},{"id":130,"text":"cal purposes. Given the size of PubTabNet, we adopted its","bbox":{"l":308.862,"t":668.38019,"r":545.11499,"b":677.28676,"coord_origin":"1"}},{"id":131,"text":"annotation format and we extracted and converted all tables","bbox":{"l":308.862,"t":680.33519,"r":545.11505,"b":689.24176,"coord_origin":"1"}},{"id":132,"text":"as PNG images with a resolution of 72 dpi. Additionally,","bbox":{"l":308.862,"t":692.290192,"r":545.11505,"b":701.196762,"coord_origin":"1"}},{"id":133,"text":"we have filtered out tables with extreme sizes due to small","bbox":{"l":308.862,"t":704.245193,"r":545.11511,"b":713.151764,"coord_origin":"1"}}]},{"id":11,"label":"Page-footer","bbox":{"l":294.43619785308834,"t":733.3885437011719,"r":300.10229,"b":743.039761,"coord_origin":"1"},"confidence":0.893059253692627,"cells":[{"id":134,"text":"3","bbox":{"l":295.121,"t":734.133198,"r":300.10229,"b":743.039761,"coord_origin":"1"}}]}]},"tablestructure":{"table_map":{}},"figures_classification":null,"equations_prediction":null},"assembled":{"elements":[{"label":"Text","id":0,"page_no":2,"cluster":{"id":0,"label":"Text","bbox":{"l":49.48567485809326,"t":74.29503536224365,"r":250.15102,"b":84.11492999999996,"coord_origin":"1"},"confidence":0.8762400150299072,"cells":[{"id":0,"text":"tag-decoder which is constrained to the table-tags.","bbox":{"l":50.112,"t":75.20836999999995,"r":250.15102,"b":84.11492999999996,"coord_origin":"1"}}]},"text":"tag-decoder which is constrained to the table-tags."},{"label":"Text","id":1,"page_no":2,"cluster":{"id":1,"label":"Text","bbox":{"l":49.11482083797455,"t":85.99874410629275,"r":286.36514,"b":276.05459117889404,"coord_origin":"1"},"confidence":0.9855231046676636,"cells":[{"id":1,"text":"In","bbox":{"l":62.067001,"t":87.21935999999994,"r":70.365845,"b":96.12591999999995,"coord_origin":"1"}},{"id":2,"text":"practice,","bbox":{"l":76.931198,"t":87.21935999999994,"r":110.95348000000001,"b":96.12591999999995,"coord_origin":"1"}},{"id":3,"text":"both","bbox":{"l":118.54498,"t":87.21935999999994,"r":136.25848,"b":96.12591999999995,"coord_origin":"1"}},{"id":4,"text":"network","bbox":{"l":142.82384,"t":87.21935999999994,"r":175.37166,"b":96.12591999999995,"coord_origin":"1"}},{"id":5,"text":"architectures","bbox":{"l":181.94698,"t":87.21935999999994,"r":232.83594000000002,"b":96.12591999999995,"coord_origin":"1"}},{"id":6,"text":"(IETD","bbox":{"l":239.41125,"t":87.21935999999994,"r":265.41364,"b":96.12591999999995,"coord_origin":"1"}},{"id":7,"text":"and","bbox":{"l":271.979,"t":87.21935999999994,"r":286.36499,"b":96.12591999999995,"coord_origin":"1"}},{"id":8,"text":"IEDD) require an implicit, custom trained object-character-","bbox":{"l":50.112,"t":99.17437999999993,"r":286.36505,"b":108.08092999999997,"coord_origin":"1"}},{"id":9,"text":"recognition (OCR) to obtain the content of the table-cells.","bbox":{"l":50.112,"t":111.13036999999997,"r":286.36511,"b":120.03692999999998,"coord_origin":"1"}},{"id":10,"text":"In the case of IETD, this OCR engine is implicit in the de-","bbox":{"l":50.112,"t":123.08538999999996,"r":286.36505,"b":131.99194,"coord_origin":"1"}},{"id":11,"text":"coder similar to [24]. For the IEDD, the OCR is solely em-","bbox":{"l":50.112,"t":135.04040999999995,"r":286.36514,"b":143.94696,"coord_origin":"1"}},{"id":12,"text":"bedded in the content-decoder. This reliance on a custom,","bbox":{"l":50.112,"t":146.99541999999997,"r":286.36511,"b":155.90197999999998,"coord_origin":"1"}},{"id":13,"text":"implicit OCR decoder is of course problematic. OCR is a","bbox":{"l":50.112,"t":158.95043999999996,"r":286.36505,"b":167.85699,"coord_origin":"1"}},{"id":14,"text":"well known and extremely tough problem, that often needs","bbox":{"l":50.112,"t":170.90545999999995,"r":286.36508,"b":179.81201,"coord_origin":"1"}},{"id":15,"text":"custom training for each individual language. However, the","bbox":{"l":50.112,"t":182.86145,"r":286.36508,"b":191.76801,"coord_origin":"1"}},{"id":16,"text":"limited availability for non-english content in the current","bbox":{"l":50.112,"t":194.81646999999998,"r":286.36511,"b":203.72302000000002,"coord_origin":"1"}},{"id":17,"text":"datasets, makes it impractical to apply the IETD and IEDD","bbox":{"l":50.112,"t":206.77148,"r":286.36511,"b":215.67804,"coord_origin":"1"}},{"id":18,"text":"methods on tables with other languages. Additionally, OCR","bbox":{"l":50.112,"t":218.7265,"r":286.36505,"b":227.63306,"coord_origin":"1"}},{"id":19,"text":"can be completely omitted if the tables originate from pro-","bbox":{"l":50.112,"t":230.68151999999998,"r":286.36505,"b":239.58807000000002,"coord_origin":"1"}},{"id":20,"text":"grammatic PDF documents with known positions of each","bbox":{"l":50.112,"t":242.63653999999997,"r":286.36511,"b":251.54309,"coord_origin":"1"}},{"id":21,"text":"cell. The latter was the inspiration for the work of this pa-","bbox":{"l":50.112,"t":254.59253,"r":286.36508,"b":263.49908000000005,"coord_origin":"1"}},{"id":22,"text":"per.","bbox":{"l":50.112,"t":266.54755,"r":64.776947,"b":275.45410000000004,"coord_origin":"1"}}]},"text":"In practice, both network architectures (IETD and IEDD) require an implicit, custom trained object-characterrecognition (OCR) to obtain the content of the table-cells. In the case of IETD, this OCR engine is implicit in the decoder similar to [24]. For the IEDD, the OCR is solely embedded in the content-decoder. This reliance on a custom, implicit OCR decoder is of course problematic. OCR is a well known and extremely tough problem, that often needs custom training for each individual language. However, the limited availability for non-english content in the current datasets, makes it impractical to apply the IETD and IEDD methods on tables with other languages. Additionally, OCR can be completely omitted if the tables originate from programmatic PDF documents with known positions of each cell. The latter was the inspiration for the work of this paper."},{"label":"Text","id":2,"page_no":2,"cluster":{"id":2,"label":"Text","bbox":{"l":49.138185024261475,"t":277.6425739288329,"r":286.5478357315064,"b":490.70288,"coord_origin":"1"},"confidence":0.9872949719429016,"cells":[{"id":23,"text":"Graph Neural networks","bbox":{"l":62.067001,"t":278.43895999999995,"r":171.56593,"b":287.39536,"coord_origin":"1"}},{"id":24,"text":":","bbox":{"l":171.56799,"t":278.55853,"r":174.3376,"b":287.46509,"coord_origin":"1"}},{"id":25,"text":"Graph Neural networks","bbox":{"l":185.18687,"t":278.55853,"r":286.35709,"b":287.46509,"coord_origin":"1"}},{"id":26,"text":"(GNN’s) take a radically different approach to table-","bbox":{"l":50.111992,"t":290.51453000000004,"r":286.36511,"b":299.42108,"coord_origin":"1"}},{"id":27,"text":"structure extraction.","bbox":{"l":50.111992,"t":302.46950999999996,"r":131.16771,"b":311.37607,"coord_origin":"1"}},{"id":28,"text":"Note that one table cell can consti-","bbox":{"l":138.84888,"t":302.46950999999996,"r":286.36508,"b":311.37607,"coord_origin":"1"}},{"id":29,"text":"tute out of multiple text-cells. To obtain the table-structure,","bbox":{"l":50.111992,"t":314.4245,"r":286.36505,"b":323.33105,"coord_origin":"1"}},{"id":30,"text":"one creates an initial graph, where each of the text-cells","bbox":{"l":50.111992,"t":326.37949000000003,"r":286.36508,"b":335.28604,"coord_origin":"1"}},{"id":31,"text":"becomes a node in the graph similar to [33, 34, 2]. Each","bbox":{"l":50.111992,"t":338.33447,"r":286.36505,"b":347.2410300000001,"coord_origin":"1"}},{"id":32,"text":"node is then associated with en embedding vector coming","bbox":{"l":50.111992,"t":350.28946,"r":286.36505,"b":359.19601,"coord_origin":"1"}},{"id":33,"text":"from the encoded image, its coordinates and the encoded","bbox":{"l":50.111992,"t":362.24545000000006,"r":286.36508,"b":371.15201,"coord_origin":"1"}},{"id":34,"text":"text. Furthermore, nodes that represent adjacent text-cells","bbox":{"l":50.111992,"t":374.20044,"r":286.36508,"b":383.10699,"coord_origin":"1"}},{"id":35,"text":"are linked. Graph Convolutional Networks (GCN’s) based","bbox":{"l":50.111992,"t":386.15542999999997,"r":286.36508,"b":395.06198,"coord_origin":"1"}},{"id":36,"text":"methods take the image as an input, but also the position of","bbox":{"l":50.111992,"t":398.11041000000006,"r":286.36508,"b":407.01697,"coord_origin":"1"}},{"id":37,"text":"the text-cells and their content [18]. The purpose of a GCN","bbox":{"l":50.111992,"t":410.0654,"r":286.36508,"b":418.97195,"coord_origin":"1"}},{"id":38,"text":"is to transform the input graph into a new graph, which re-","bbox":{"l":50.111992,"t":422.02038999999996,"r":286.36505,"b":430.92694,"coord_origin":"1"}},{"id":39,"text":"places the old links with new ones.","bbox":{"l":50.111992,"t":433.97638,"r":198.2359,"b":442.88293,"coord_origin":"1"}},{"id":40,"text":"The new links then","bbox":{"l":205.92703,"t":433.97638,"r":286.36505,"b":442.88293,"coord_origin":"1"}},{"id":41,"text":"represent the table-structure. With this approach, one can","bbox":{"l":50.111992,"t":445.93137,"r":286.36508,"b":454.83792000000005,"coord_origin":"1"}},{"id":42,"text":"avoid the need to build custom OCR decoders. However,","bbox":{"l":50.111992,"t":457.88635,"r":286.36505,"b":466.79291,"coord_origin":"1"}},{"id":43,"text":"the quality of the reconstructed structure is not comparable","bbox":{"l":50.111992,"t":469.84134,"r":286.36505,"b":478.74789,"coord_origin":"1"}},{"id":44,"text":"to the current state-of-the-art [18].","bbox":{"l":50.111992,"t":481.79633,"r":186.49998,"b":490.70288,"coord_origin":"1"}}]},"text":"Graph Neural networks : Graph Neural networks (GNN’s) take a radically different approach to tablestructure extraction. Note that one table cell can constitute out of multiple text-cells. To obtain the table-structure, one creates an initial graph, where each of the text-cells becomes a node in the graph similar to [33, 34, 2]. Each node is then associated with en embedding vector coming from the encoded image, its coordinates and the encoded text. Furthermore, nodes that represent adjacent text-cells are linked. Graph Convolutional Networks (GCN’s) based methods take the image as an input, but also the position of the text-cells and their content [18]. The purpose of a GCN is to transform the input graph into a new graph, which replaces the old links with new ones. The new links then represent the table-structure. With this approach, one can avoid the need to build custom OCR decoders. However, the quality of the reconstructed structure is not comparable to the current state-of-the-art [18]."},{"label":"Text","id":3,"page_no":2,"cluster":{"id":3,"label":"Text","bbox":{"l":49.34700357913971,"t":492.7602035522461,"r":286.6784271240235,"b":622.8181755065917,"coord_origin":"1"},"confidence":0.9880095720291138,"cells":[{"id":45,"text":"Hybrid Deep Learning-Rule-Based approach","bbox":{"l":62.066994,"t":493.68875,"r":252.88068000000004,"b":502.64514,"coord_origin":"1"}},{"id":46,"text":": A pop-","bbox":{"l":252.88199,"t":493.80832,"r":286.36627,"b":502.71487,"coord_origin":"1"}},{"id":47,"text":"ular current model for table-structure identification is the","bbox":{"l":50.111984,"t":505.76331,"r":286.36505,"b":514.66986,"coord_origin":"1"}},{"id":48,"text":"use of a hybrid Deep Learning-Rule-Based approach similar","bbox":{"l":50.111984,"t":517.71829,"r":286.36505,"b":526.6248499999999,"coord_origin":"1"}},{"id":49,"text":"to [27, 29]. In this approach, one first detects the position of","bbox":{"l":50.111984,"t":529.67328,"r":286.36508,"b":538.57985,"coord_origin":"1"}},{"id":50,"text":"the table-cells with object detection (e.g. YoloVx or Mask-","bbox":{"l":50.111984,"t":541.62929,"r":286.36508,"b":550.53584,"coord_origin":"1"}},{"id":51,"text":"RCNN), then classifies the table into different types (from","bbox":{"l":50.111984,"t":553.58429,"r":286.36511,"b":562.4908399999999,"coord_origin":"1"}},{"id":52,"text":"its images) and finally uses different rule-sets to obtain","bbox":{"l":50.111984,"t":565.5392899999999,"r":286.36511,"b":574.44585,"coord_origin":"1"}},{"id":53,"text":"its table-structure. Currently, this approach achieves state-","bbox":{"l":50.111984,"t":577.49429,"r":286.36502,"b":586.40085,"coord_origin":"1"}},{"id":54,"text":"of-the-art results, but is not an end-to-end deep-learning","bbox":{"l":50.111984,"t":589.4493,"r":286.36505,"b":598.35585,"coord_origin":"1"}},{"id":55,"text":"method. As such, new rules need to be written if different","bbox":{"l":50.111984,"t":601.4043,"r":286.36502,"b":610.31085,"coord_origin":"1"}},{"id":56,"text":"types of tables are encountered.","bbox":{"l":50.111984,"t":613.36029,"r":175.98943,"b":622.26685,"coord_origin":"1"}}]},"text":"Hybrid Deep Learning-Rule-Based approach : A popular current model for table-structure identification is the use of a hybrid Deep Learning-Rule-Based approach similar to [27, 29]. In this approach, one first detects the position of the table-cells with object detection (e.g. YoloVx or MaskRCNN), then classifies the table into different types (from its images) and finally uses different rule-sets to obtain its table-structure. Currently, this approach achieves stateof-the-art results, but is not an end-to-end deep-learning method. As such, new rules need to be written if different types of tables are encountered."},{"label":"Section-header","id":4,"page_no":2,"cluster":{"id":4,"label":"Section-header","bbox":{"l":49.34483313560486,"t":635.3652351379395,"r":105.30262470245361,"b":646.6925699999999,"coord_origin":"1"},"confidence":0.9314619302749634,"cells":[{"id":57,"text":"3.","bbox":{"l":50.111984,"t":635.94484,"r":57.82375699999999,"b":646.6925699999999,"coord_origin":"1"}},{"id":58,"text":"Datasets","bbox":{"l":68.106125,"t":635.94484,"r":105.22546,"b":646.6925699999999,"coord_origin":"1"}}]},"text":"3. Datasets"},{"label":"Text","id":5,"page_no":2,"cluster":{"id":5,"label":"Text","bbox":{"l":49.34266269207001,"t":655.4480232238769,"r":286.3688890457153,"b":713.1628509521485,"coord_origin":"1"},"confidence":0.9872632026672363,"cells":[{"id":59,"text":"We rely on large-scale datasets such as PubTabNet [37],","bbox":{"l":62.06698600000001,"t":656.42529,"r":286.36493,"b":665.33186,"coord_origin":"1"}},{"id":60,"text":"FinTabNet [36], and TableBank [17] datasets to train and","bbox":{"l":50.111984,"t":668.38029,"r":286.36508,"b":677.2868599999999,"coord_origin":"1"}},{"id":61,"text":"evaluate our models. These datasets span over various ap-","bbox":{"l":50.111984,"t":680.3353,"r":286.36502,"b":689.24186,"coord_origin":"1"}},{"id":62,"text":"pearance styles and content.","bbox":{"l":50.111984,"t":692.290298,"r":166.24602,"b":701.196861,"coord_origin":"1"}},{"id":63,"text":"We also introduce our own","bbox":{"l":173.68808,"t":692.290298,"r":286.36508,"b":701.196861,"coord_origin":"1"}},{"id":64,"text":"synthetically generated SynthTabNet dataset to fix an im-","bbox":{"l":50.111984,"t":704.2453,"r":286.36505,"b":713.151863,"coord_origin":"1"}}]},"text":"We rely on large-scale datasets such as PubTabNet [37], FinTabNet [36], and TableBank [17] datasets to train and evaluate our models. These datasets span over various appearance styles and content. We also introduce our own synthetically generated SynthTabNet dataset to fix an im-"},{"label":"Picture","id":6,"page_no":2,"cluster":{"id":6,"label":"Picture","bbox":{"l":311.3420780181885,"t":78.12818670272827,"r":550.2800857543945,"b":251.03088569641113,"coord_origin":"1"},"confidence":0.979541003704071,"cells":[{"id":65,"text":"PubTabNet + FinTabNet","bbox":{"l":380.79849,"t":79.81176999999991,"r":486.84909,"b":88.55975000000001,"coord_origin":"1"}},{"id":66,"text":"Rows / Columns","bbox":{"l":396.76776,"t":242.02697999999998,"r":469.78748,"b":250.77495999999996,"coord_origin":"1"}},{"id":67,"text":"0","bbox":{"l":320.97653,"t":233.42296999999996,"r":324.79254,"b":239.255,"coord_origin":"1"}},{"id":68,"text":"20","bbox":{"l":410.483,"t":233.42296999999996,"r":418.11319,"b":239.255,"coord_origin":"1"}},{"id":69,"text":"40","bbox":{"l":500.84949,"t":233.42296999999996,"r":508.47968000000003,"b":239.255,"coord_origin":"1"}},{"id":70,"text":"10","bbox":{"l":365.29999,"t":233.42296999999996,"r":372.93018,"b":239.255,"coord_origin":"1"}},{"id":71,"text":"30","bbox":{"l":455.66626,"t":233.42296999999996,"r":463.29645,"b":239.255,"coord_origin":"1"}},{"id":72,"text":"50","bbox":{"l":542.03528,"t":233.42296999999996,"r":549.66547,"b":239.255,"coord_origin":"1"}},{"id":73,"text":"0","bbox":{"l":316.04474,"t":230.44617000000005,"r":319.86075,"b":236.27819999999997,"coord_origin":"1"}},{"id":74,"text":"2","bbox":{"l":312.62521,"t":198.69073000000003,"r":316.44122,"b":204.52277000000004,"coord_origin":"1"}},{"id":75,"text":"0","bbox":{"l":316.43942,"t":198.69073000000003,"r":320.2554,"b":204.52277000000004,"coord_origin":"1"}},{"id":76,"text":"4","bbox":{"l":313.14951,"t":168.09795999999994,"r":316.96552,"b":173.92998999999998,"coord_origin":"1"}},{"id":77,"text":"0","bbox":{"l":316.96371,"t":168.09795999999994,"r":320.77969,"b":173.92998999999998,"coord_origin":"1"}},{"id":78,"text":"6","bbox":{"l":312.92972,"t":136.58771000000002,"r":316.74573,"b":142.41974000000005,"coord_origin":"1"}},{"id":79,"text":"0","bbox":{"l":316.74393,"t":136.58771000000002,"r":320.55991,"b":142.41974000000005,"coord_origin":"1"}},{"id":80,"text":"8","bbox":{"l":312.48227,"t":105.60175000000004,"r":316.29828,"b":111.43377999999996,"coord_origin":"1"}},{"id":81,"text":"0","bbox":{"l":316.29648,"t":105.60175000000004,"r":320.11246,"b":111.43377999999996,"coord_origin":"1"}},{"id":82,"text":"1","bbox":{"l":312.48227,"t":212.25922000000003,"r":316.29828,"b":218.09124999999995,"coord_origin":"1"}},{"id":83,"text":"0","bbox":{"l":316.29648,"t":212.25922000000003,"r":320.11246,"b":218.09124999999995,"coord_origin":"1"}},{"id":84,"text":"3","bbox":{"l":313.07639,"t":183.72198000000003,"r":316.8924,"b":189.55402000000004,"coord_origin":"1"}},{"id":85,"text":"0","bbox":{"l":316.89059,"t":183.72198000000003,"r":320.70657,"b":189.55402000000004,"coord_origin":"1"}},{"id":86,"text":"5","bbox":{"l":312.76321,"t":152.47400000000005,"r":316.57922,"b":158.30602999999996,"coord_origin":"1"}},{"id":87,"text":"0","bbox":{"l":316.57742,"t":152.47400000000005,"r":320.3934,"b":158.30602999999996,"coord_origin":"1"}},{"id":88,"text":"7","bbox":{"l":312.19775,"t":120.57050000000004,"r":316.01376,"b":126.40252999999996,"coord_origin":"1"}},{"id":89,"text":"0","bbox":{"l":316.01196,"t":120.57050000000004,"r":319.82794,"b":126.40252999999996,"coord_origin":"1"}},{"id":90,"text":"9","bbox":{"l":312.8165,"t":90.1087,"r":316.63251,"b":95.94073000000003,"coord_origin":"1"}},{"id":91,"text":"0","bbox":{"l":316.63071,"t":90.1087,"r":320.44669,"b":95.94073000000003,"coord_origin":"1"}},{"id":92,"text":"0","bbox":{"l":532.17426,"t":222.72729000000004,"r":536.94427,"b":230.01727000000005,"coord_origin":"1"}},{"id":93,"text":"10K","bbox":{"l":532.87952,"t":108.26702999999986,"r":547.61249,"b":115.55700999999999,"coord_origin":"1"}},{"id":94,"text":"8K","bbox":{"l":532.7735,"t":130.78101000000004,"r":542.73877,"b":138.07097999999996,"coord_origin":"1"}},{"id":95,"text":"6K","bbox":{"l":532.79901,"t":153.92352000000005,"r":542.76428,"b":161.21349999999995,"coord_origin":"1"}},{"id":96,"text":"4K","bbox":{"l":532.5705,"t":176.75800000000004,"r":542.53577,"b":184.04796999999996,"coord_origin":"1"}},{"id":97,"text":"2K","bbox":{"l":532.14551,"t":199.6463,"r":542.11078,"b":206.93628,"coord_origin":"1"}}]},"text":"","data":null,"provenance":null,"predicted_class":null,"confidence":null},{"label":"Caption","id":7,"page_no":2,"cluster":{"id":7,"label":"Caption","bbox":{"l":308.0231597900391,"t":267.0846988677979,"r":545.11511,"b":288.6979099999999,"coord_origin":"1"},"confidence":0.9705182313919067,"cells":[{"id":98,"text":"Figure 2:","bbox":{"l":308.862,"t":267.83636,"r":346.06238,"b":276.74292,"coord_origin":"1"}},{"id":99,"text":"Distribution of the tables across different table","bbox":{"l":354.49072,"t":267.83636,"r":545.11511,"b":276.74292,"coord_origin":"1"}},{"id":100,"text":"dimensions in PubTabNet + FinTabNet datasets","bbox":{"l":308.862,"t":279.79132000000004,"r":498.56989,"b":288.6979099999999,"coord_origin":"1"}}]},"text":"Figure 2: Distribution of the tables across different table dimensions in PubTabNet + FinTabNet datasets"},{"label":"Text","id":8,"page_no":2,"cluster":{"id":8,"label":"Text","bbox":{"l":307.9623727798462,"t":316.53379096984867,"r":437.27002,"b":326.5220111846924,"coord_origin":"1"},"confidence":0.8795380592346191,"cells":[{"id":101,"text":"balance in the previous datasets.","bbox":{"l":308.862,"t":317.47336,"r":437.27002,"b":326.37991,"coord_origin":"1"}}]},"text":"balance in the previous datasets."},{"label":"Text","id":9,"page_no":2,"cluster":{"id":9,"label":"Text","bbox":{"l":307.9255153656006,"t":330.2963710784912,"r":545.6851364135742,"b":627.7393981933593,"coord_origin":"1"},"confidence":0.9885321259498596,"cells":[{"id":102,"text":"The PubTabNet dataset contains 509k tables delivered as","bbox":{"l":320.81699,"t":331.53137,"r":545.11505,"b":340.43793,"coord_origin":"1"}},{"id":103,"text":"annotated PNG images. The annotations consist of the table","bbox":{"l":308.862,"t":343.48635999999993,"r":545.11517,"b":352.39291,"coord_origin":"1"}},{"id":104,"text":"structure represented in HTML format, the tokenized text","bbox":{"l":308.862,"t":355.44235,"r":545.11505,"b":364.34890999999993,"coord_origin":"1"}},{"id":105,"text":"and its bounding boxes per table cell. Fig. 1 shows the ap-","bbox":{"l":308.862,"t":367.39734,"r":545.11505,"b":376.30389,"coord_origin":"1"}},{"id":106,"text":"pearance style of PubTabNet. Depending on its complexity,","bbox":{"l":308.862,"t":379.35233,"r":545.11511,"b":388.25888,"coord_origin":"1"}},{"id":107,"text":"a table is characterized as “simple” when it does not contain","bbox":{"l":308.862,"t":391.30731,"r":545.11511,"b":400.21386999999993,"coord_origin":"1"}},{"id":108,"text":"row spans or column spans, otherwise it is “complex”. The","bbox":{"l":308.862,"t":403.26230000000004,"r":545.11505,"b":412.16885,"coord_origin":"1"}},{"id":109,"text":"dataset is divided into Train and Val splits (roughly 98% and","bbox":{"l":308.862,"t":415.21729,"r":545.11511,"b":424.12384,"coord_origin":"1"}},{"id":110,"text":"2%). The Train split consists of 54% simple and 46% com-","bbox":{"l":308.862,"t":427.17328,"r":545.11517,"b":436.0798300000001,"coord_origin":"1"}},{"id":111,"text":"plex tables and the Val split of 51% and 49% respectively.","bbox":{"l":308.862,"t":439.12827,"r":545.11517,"b":448.03482,"coord_origin":"1"}},{"id":112,"text":"The FinTabNet dataset contains 112k tables delivered as","bbox":{"l":308.862,"t":451.08325,"r":545.11511,"b":459.98981000000003,"coord_origin":"1"}},{"id":113,"text":"single-page PDF documents with mixed table structures and","bbox":{"l":308.862,"t":463.03824,"r":545.11505,"b":471.94479,"coord_origin":"1"}},{"id":114,"text":"text content. Similarly to the PubTabNet, the annotations","bbox":{"l":308.862,"t":474.99323,"r":545.11511,"b":483.89978,"coord_origin":"1"}},{"id":115,"text":"of FinTabNet include the table structure in HTML, the to-","bbox":{"l":308.862,"t":486.94922,"r":545.11511,"b":495.85577,"coord_origin":"1"}},{"id":116,"text":"kenized text and the bounding boxes on a table cell basis.","bbox":{"l":308.862,"t":498.90421,"r":545.11511,"b":507.81076,"coord_origin":"1"}},{"id":117,"text":"The dataset is divided into Train, Test and Val splits (81%,","bbox":{"l":308.862,"t":510.85919,"r":545.11517,"b":519.76575,"coord_origin":"1"}},{"id":118,"text":"9.5%, 9.5%), and each one is almost equally divided into","bbox":{"l":308.862,"t":522.8141800000001,"r":545.11517,"b":531.72073,"coord_origin":"1"}},{"id":119,"text":"simple and complex tables (Train: 48% simple, 52% com-","bbox":{"l":308.862,"t":534.76917,"r":545.11505,"b":543.67574,"coord_origin":"1"}},{"id":120,"text":"plex, Test: 48% simple, 52% complex, Test: 53% simple,","bbox":{"l":308.862,"t":546.72418,"r":545.11511,"b":555.6307400000001,"coord_origin":"1"}},{"id":121,"text":"47% complex). Finally the TableBank dataset consists of","bbox":{"l":308.862,"t":558.6801800000001,"r":545.11511,"b":567.58673,"coord_origin":"1"}},{"id":122,"text":"145k tables provided as JPEG images. The latter has anno-","bbox":{"l":308.862,"t":570.63518,"r":545.11505,"b":579.54173,"coord_origin":"1"}},{"id":123,"text":"tations for the table structure, but only few with bounding","bbox":{"l":308.862,"t":582.59018,"r":545.11499,"b":591.49673,"coord_origin":"1"}},{"id":124,"text":"boxes of the table cells. The entire dataset consists of sim-","bbox":{"l":308.862,"t":594.54518,"r":545.11517,"b":603.45174,"coord_origin":"1"}},{"id":125,"text":"ple tables and it is divided into 90% Train, 3% Test and 7%","bbox":{"l":308.862,"t":606.50018,"r":545.11511,"b":615.40674,"coord_origin":"1"}},{"id":126,"text":"Val splits.","bbox":{"l":308.862,"t":618.45518,"r":348.16446,"b":627.36174,"coord_origin":"1"}}]},"text":"The PubTabNet dataset contains 509k tables delivered as annotated PNG images. The annotations consist of the table structure represented in HTML format, the tokenized text and its bounding boxes per table cell. Fig. 1 shows the appearance style of PubTabNet. Depending on its complexity, a table is characterized as “simple” when it does not contain row spans or column spans, otherwise it is “complex”. The dataset is divided into Train and Val splits (roughly 98% and 2%). The Train split consists of 54% simple and 46% complex tables and the Val split of 51% and 49% respectively. The FinTabNet dataset contains 112k tables delivered as single-page PDF documents with mixed table structures and text content. Similarly to the PubTabNet, the annotations of FinTabNet include the table structure in HTML, the tokenized text and the bounding boxes on a table cell basis. The dataset is divided into Train, Test and Val splits (81%, 9.5%, 9.5%), and each one is almost equally divided into simple and complex tables (Train: 48% simple, 52% complex, Test: 48% simple, 52% complex, Test: 53% simple, 47% complex). Finally the TableBank dataset consists of 145k tables provided as JPEG images. The latter has annotations for the table structure, but only few with bounding boxes of the table cells. The entire dataset consists of simple tables and it is divided into 90% Train, 3% Test and 7% Val splits."},{"label":"Text","id":10,"page_no":2,"cluster":{"id":10,"label":"Text","bbox":{"l":307.7597299575806,"t":631.6698051452637,"r":545.2829441070556,"b":713.151764,"coord_origin":"1"},"confidence":0.9847508668899536,"cells":[{"id":127,"text":"Due to the heterogeneity across the dataset formats, it","bbox":{"l":320.81699,"t":632.51419,"r":545.11487,"b":641.42075,"coord_origin":"1"}},{"id":128,"text":"was necessary to combine all available data into one homog-","bbox":{"l":308.862,"t":644.46919,"r":545.11511,"b":653.37575,"coord_origin":"1"}},{"id":129,"text":"enized dataset before we could train our models for practi-","bbox":{"l":308.862,"t":656.42419,"r":545.11511,"b":665.33076,"coord_origin":"1"}},{"id":130,"text":"cal purposes. Given the size of PubTabNet, we adopted its","bbox":{"l":308.862,"t":668.38019,"r":545.11499,"b":677.28676,"coord_origin":"1"}},{"id":131,"text":"annotation format and we extracted and converted all tables","bbox":{"l":308.862,"t":680.33519,"r":545.11505,"b":689.24176,"coord_origin":"1"}},{"id":132,"text":"as PNG images with a resolution of 72 dpi. Additionally,","bbox":{"l":308.862,"t":692.290192,"r":545.11505,"b":701.196762,"coord_origin":"1"}},{"id":133,"text":"we have filtered out tables with extreme sizes due to small","bbox":{"l":308.862,"t":704.245193,"r":545.11511,"b":713.151764,"coord_origin":"1"}}]},"text":"Due to the heterogeneity across the dataset formats, it was necessary to combine all available data into one homogenized dataset before we could train our models for practical purposes. Given the size of PubTabNet, we adopted its annotation format and we extracted and converted all tables as PNG images with a resolution of 72 dpi. Additionally, we have filtered out tables with extreme sizes due to small"},{"label":"Page-footer","id":11,"page_no":2,"cluster":{"id":11,"label":"Page-footer","bbox":{"l":294.43619785308834,"t":733.3885437011719,"r":300.10229,"b":743.039761,"coord_origin":"1"},"confidence":0.893059253692627,"cells":[{"id":134,"text":"3","bbox":{"l":295.121,"t":734.133198,"r":300.10229,"b":743.039761,"coord_origin":"1"}}]},"text":"3"}],"body":[{"label":"Text","id":0,"page_no":2,"cluster":{"id":0,"label":"Text","bbox":{"l":49.48567485809326,"t":74.29503536224365,"r":250.15102,"b":84.11492999999996,"coord_origin":"1"},"confidence":0.8762400150299072,"cells":[{"id":0,"text":"tag-decoder which is constrained to the table-tags.","bbox":{"l":50.112,"t":75.20836999999995,"r":250.15102,"b":84.11492999999996,"coord_origin":"1"}}]},"text":"tag-decoder which is constrained to the table-tags."},{"label":"Text","id":1,"page_no":2,"cluster":{"id":1,"label":"Text","bbox":{"l":49.11482083797455,"t":85.99874410629275,"r":286.36514,"b":276.05459117889404,"coord_origin":"1"},"confidence":0.9855231046676636,"cells":[{"id":1,"text":"In","bbox":{"l":62.067001,"t":87.21935999999994,"r":70.365845,"b":96.12591999999995,"coord_origin":"1"}},{"id":2,"text":"practice,","bbox":{"l":76.931198,"t":87.21935999999994,"r":110.95348000000001,"b":96.12591999999995,"coord_origin":"1"}},{"id":3,"text":"both","bbox":{"l":118.54498,"t":87.21935999999994,"r":136.25848,"b":96.12591999999995,"coord_origin":"1"}},{"id":4,"text":"network","bbox":{"l":142.82384,"t":87.21935999999994,"r":175.37166,"b":96.12591999999995,"coord_origin":"1"}},{"id":5,"text":"architectures","bbox":{"l":181.94698,"t":87.21935999999994,"r":232.83594000000002,"b":96.12591999999995,"coord_origin":"1"}},{"id":6,"text":"(IETD","bbox":{"l":239.41125,"t":87.21935999999994,"r":265.41364,"b":96.12591999999995,"coord_origin":"1"}},{"id":7,"text":"and","bbox":{"l":271.979,"t":87.21935999999994,"r":286.36499,"b":96.12591999999995,"coord_origin":"1"}},{"id":8,"text":"IEDD) require an implicit, custom trained object-character-","bbox":{"l":50.112,"t":99.17437999999993,"r":286.36505,"b":108.08092999999997,"coord_origin":"1"}},{"id":9,"text":"recognition (OCR) to obtain the content of the table-cells.","bbox":{"l":50.112,"t":111.13036999999997,"r":286.36511,"b":120.03692999999998,"coord_origin":"1"}},{"id":10,"text":"In the case of IETD, this OCR engine is implicit in the de-","bbox":{"l":50.112,"t":123.08538999999996,"r":286.36505,"b":131.99194,"coord_origin":"1"}},{"id":11,"text":"coder similar to [24]. For the IEDD, the OCR is solely em-","bbox":{"l":50.112,"t":135.04040999999995,"r":286.36514,"b":143.94696,"coord_origin":"1"}},{"id":12,"text":"bedded in the content-decoder. This reliance on a custom,","bbox":{"l":50.112,"t":146.99541999999997,"r":286.36511,"b":155.90197999999998,"coord_origin":"1"}},{"id":13,"text":"implicit OCR decoder is of course problematic. OCR is a","bbox":{"l":50.112,"t":158.95043999999996,"r":286.36505,"b":167.85699,"coord_origin":"1"}},{"id":14,"text":"well known and extremely tough problem, that often needs","bbox":{"l":50.112,"t":170.90545999999995,"r":286.36508,"b":179.81201,"coord_origin":"1"}},{"id":15,"text":"custom training for each individual language. However, the","bbox":{"l":50.112,"t":182.86145,"r":286.36508,"b":191.76801,"coord_origin":"1"}},{"id":16,"text":"limited availability for non-english content in the current","bbox":{"l":50.112,"t":194.81646999999998,"r":286.36511,"b":203.72302000000002,"coord_origin":"1"}},{"id":17,"text":"datasets, makes it impractical to apply the IETD and IEDD","bbox":{"l":50.112,"t":206.77148,"r":286.36511,"b":215.67804,"coord_origin":"1"}},{"id":18,"text":"methods on tables with other languages. Additionally, OCR","bbox":{"l":50.112,"t":218.7265,"r":286.36505,"b":227.63306,"coord_origin":"1"}},{"id":19,"text":"can be completely omitted if the tables originate from pro-","bbox":{"l":50.112,"t":230.68151999999998,"r":286.36505,"b":239.58807000000002,"coord_origin":"1"}},{"id":20,"text":"grammatic PDF documents with known positions of each","bbox":{"l":50.112,"t":242.63653999999997,"r":286.36511,"b":251.54309,"coord_origin":"1"}},{"id":21,"text":"cell. The latter was the inspiration for the work of this pa-","bbox":{"l":50.112,"t":254.59253,"r":286.36508,"b":263.49908000000005,"coord_origin":"1"}},{"id":22,"text":"per.","bbox":{"l":50.112,"t":266.54755,"r":64.776947,"b":275.45410000000004,"coord_origin":"1"}}]},"text":"In practice, both network architectures (IETD and IEDD) require an implicit, custom trained object-characterrecognition (OCR) to obtain the content of the table-cells. In the case of IETD, this OCR engine is implicit in the decoder similar to [24]. For the IEDD, the OCR is solely embedded in the content-decoder. This reliance on a custom, implicit OCR decoder is of course problematic. OCR is a well known and extremely tough problem, that often needs custom training for each individual language. However, the limited availability for non-english content in the current datasets, makes it impractical to apply the IETD and IEDD methods on tables with other languages. Additionally, OCR can be completely omitted if the tables originate from programmatic PDF documents with known positions of each cell. The latter was the inspiration for the work of this paper."},{"label":"Text","id":2,"page_no":2,"cluster":{"id":2,"label":"Text","bbox":{"l":49.138185024261475,"t":277.6425739288329,"r":286.5478357315064,"b":490.70288,"coord_origin":"1"},"confidence":0.9872949719429016,"cells":[{"id":23,"text":"Graph Neural networks","bbox":{"l":62.067001,"t":278.43895999999995,"r":171.56593,"b":287.39536,"coord_origin":"1"}},{"id":24,"text":":","bbox":{"l":171.56799,"t":278.55853,"r":174.3376,"b":287.46509,"coord_origin":"1"}},{"id":25,"text":"Graph Neural networks","bbox":{"l":185.18687,"t":278.55853,"r":286.35709,"b":287.46509,"coord_origin":"1"}},{"id":26,"text":"(GNN’s) take a radically different approach to table-","bbox":{"l":50.111992,"t":290.51453000000004,"r":286.36511,"b":299.42108,"coord_origin":"1"}},{"id":27,"text":"structure extraction.","bbox":{"l":50.111992,"t":302.46950999999996,"r":131.16771,"b":311.37607,"coord_origin":"1"}},{"id":28,"text":"Note that one table cell can consti-","bbox":{"l":138.84888,"t":302.46950999999996,"r":286.36508,"b":311.37607,"coord_origin":"1"}},{"id":29,"text":"tute out of multiple text-cells. To obtain the table-structure,","bbox":{"l":50.111992,"t":314.4245,"r":286.36505,"b":323.33105,"coord_origin":"1"}},{"id":30,"text":"one creates an initial graph, where each of the text-cells","bbox":{"l":50.111992,"t":326.37949000000003,"r":286.36508,"b":335.28604,"coord_origin":"1"}},{"id":31,"text":"becomes a node in the graph similar to [33, 34, 2]. Each","bbox":{"l":50.111992,"t":338.33447,"r":286.36505,"b":347.2410300000001,"coord_origin":"1"}},{"id":32,"text":"node is then associated with en embedding vector coming","bbox":{"l":50.111992,"t":350.28946,"r":286.36505,"b":359.19601,"coord_origin":"1"}},{"id":33,"text":"from the encoded image, its coordinates and the encoded","bbox":{"l":50.111992,"t":362.24545000000006,"r":286.36508,"b":371.15201,"coord_origin":"1"}},{"id":34,"text":"text. Furthermore, nodes that represent adjacent text-cells","bbox":{"l":50.111992,"t":374.20044,"r":286.36508,"b":383.10699,"coord_origin":"1"}},{"id":35,"text":"are linked. Graph Convolutional Networks (GCN’s) based","bbox":{"l":50.111992,"t":386.15542999999997,"r":286.36508,"b":395.06198,"coord_origin":"1"}},{"id":36,"text":"methods take the image as an input, but also the position of","bbox":{"l":50.111992,"t":398.11041000000006,"r":286.36508,"b":407.01697,"coord_origin":"1"}},{"id":37,"text":"the text-cells and their content [18]. The purpose of a GCN","bbox":{"l":50.111992,"t":410.0654,"r":286.36508,"b":418.97195,"coord_origin":"1"}},{"id":38,"text":"is to transform the input graph into a new graph, which re-","bbox":{"l":50.111992,"t":422.02038999999996,"r":286.36505,"b":430.92694,"coord_origin":"1"}},{"id":39,"text":"places the old links with new ones.","bbox":{"l":50.111992,"t":433.97638,"r":198.2359,"b":442.88293,"coord_origin":"1"}},{"id":40,"text":"The new links then","bbox":{"l":205.92703,"t":433.97638,"r":286.36505,"b":442.88293,"coord_origin":"1"}},{"id":41,"text":"represent the table-structure. With this approach, one can","bbox":{"l":50.111992,"t":445.93137,"r":286.36508,"b":454.83792000000005,"coord_origin":"1"}},{"id":42,"text":"avoid the need to build custom OCR decoders. However,","bbox":{"l":50.111992,"t":457.88635,"r":286.36505,"b":466.79291,"coord_origin":"1"}},{"id":43,"text":"the quality of the reconstructed structure is not comparable","bbox":{"l":50.111992,"t":469.84134,"r":286.36505,"b":478.74789,"coord_origin":"1"}},{"id":44,"text":"to the current state-of-the-art [18].","bbox":{"l":50.111992,"t":481.79633,"r":186.49998,"b":490.70288,"coord_origin":"1"}}]},"text":"Graph Neural networks : Graph Neural networks (GNN’s) take a radically different approach to tablestructure extraction. Note that one table cell can constitute out of multiple text-cells. To obtain the table-structure, one creates an initial graph, where each of the text-cells becomes a node in the graph similar to [33, 34, 2]. Each node is then associated with en embedding vector coming from the encoded image, its coordinates and the encoded text. Furthermore, nodes that represent adjacent text-cells are linked. Graph Convolutional Networks (GCN’s) based methods take the image as an input, but also the position of the text-cells and their content [18]. The purpose of a GCN is to transform the input graph into a new graph, which replaces the old links with new ones. The new links then represent the table-structure. With this approach, one can avoid the need to build custom OCR decoders. However, the quality of the reconstructed structure is not comparable to the current state-of-the-art [18]."},{"label":"Text","id":3,"page_no":2,"cluster":{"id":3,"label":"Text","bbox":{"l":49.34700357913971,"t":492.7602035522461,"r":286.6784271240235,"b":622.8181755065917,"coord_origin":"1"},"confidence":0.9880095720291138,"cells":[{"id":45,"text":"Hybrid Deep Learning-Rule-Based approach","bbox":{"l":62.066994,"t":493.68875,"r":252.88068000000004,"b":502.64514,"coord_origin":"1"}},{"id":46,"text":": A pop-","bbox":{"l":252.88199,"t":493.80832,"r":286.36627,"b":502.71487,"coord_origin":"1"}},{"id":47,"text":"ular current model for table-structure identification is the","bbox":{"l":50.111984,"t":505.76331,"r":286.36505,"b":514.66986,"coord_origin":"1"}},{"id":48,"text":"use of a hybrid Deep Learning-Rule-Based approach similar","bbox":{"l":50.111984,"t":517.71829,"r":286.36505,"b":526.6248499999999,"coord_origin":"1"}},{"id":49,"text":"to [27, 29]. In this approach, one first detects the position of","bbox":{"l":50.111984,"t":529.67328,"r":286.36508,"b":538.57985,"coord_origin":"1"}},{"id":50,"text":"the table-cells with object detection (e.g. YoloVx or Mask-","bbox":{"l":50.111984,"t":541.62929,"r":286.36508,"b":550.53584,"coord_origin":"1"}},{"id":51,"text":"RCNN), then classifies the table into different types (from","bbox":{"l":50.111984,"t":553.58429,"r":286.36511,"b":562.4908399999999,"coord_origin":"1"}},{"id":52,"text":"its images) and finally uses different rule-sets to obtain","bbox":{"l":50.111984,"t":565.5392899999999,"r":286.36511,"b":574.44585,"coord_origin":"1"}},{"id":53,"text":"its table-structure. Currently, this approach achieves state-","bbox":{"l":50.111984,"t":577.49429,"r":286.36502,"b":586.40085,"coord_origin":"1"}},{"id":54,"text":"of-the-art results, but is not an end-to-end deep-learning","bbox":{"l":50.111984,"t":589.4493,"r":286.36505,"b":598.35585,"coord_origin":"1"}},{"id":55,"text":"method. As such, new rules need to be written if different","bbox":{"l":50.111984,"t":601.4043,"r":286.36502,"b":610.31085,"coord_origin":"1"}},{"id":56,"text":"types of tables are encountered.","bbox":{"l":50.111984,"t":613.36029,"r":175.98943,"b":622.26685,"coord_origin":"1"}}]},"text":"Hybrid Deep Learning-Rule-Based approach : A popular current model for table-structure identification is the use of a hybrid Deep Learning-Rule-Based approach similar to [27, 29]. In this approach, one first detects the position of the table-cells with object detection (e.g. YoloVx or MaskRCNN), then classifies the table into different types (from its images) and finally uses different rule-sets to obtain its table-structure. Currently, this approach achieves stateof-the-art results, but is not an end-to-end deep-learning method. As such, new rules need to be written if different types of tables are encountered."},{"label":"Section-header","id":4,"page_no":2,"cluster":{"id":4,"label":"Section-header","bbox":{"l":49.34483313560486,"t":635.3652351379395,"r":105.30262470245361,"b":646.6925699999999,"coord_origin":"1"},"confidence":0.9314619302749634,"cells":[{"id":57,"text":"3.","bbox":{"l":50.111984,"t":635.94484,"r":57.82375699999999,"b":646.6925699999999,"coord_origin":"1"}},{"id":58,"text":"Datasets","bbox":{"l":68.106125,"t":635.94484,"r":105.22546,"b":646.6925699999999,"coord_origin":"1"}}]},"text":"3. Datasets"},{"label":"Text","id":5,"page_no":2,"cluster":{"id":5,"label":"Text","bbox":{"l":49.34266269207001,"t":655.4480232238769,"r":286.3688890457153,"b":713.1628509521485,"coord_origin":"1"},"confidence":0.9872632026672363,"cells":[{"id":59,"text":"We rely on large-scale datasets such as PubTabNet [37],","bbox":{"l":62.06698600000001,"t":656.42529,"r":286.36493,"b":665.33186,"coord_origin":"1"}},{"id":60,"text":"FinTabNet [36], and TableBank [17] datasets to train and","bbox":{"l":50.111984,"t":668.38029,"r":286.36508,"b":677.2868599999999,"coord_origin":"1"}},{"id":61,"text":"evaluate our models. These datasets span over various ap-","bbox":{"l":50.111984,"t":680.3353,"r":286.36502,"b":689.24186,"coord_origin":"1"}},{"id":62,"text":"pearance styles and content.","bbox":{"l":50.111984,"t":692.290298,"r":166.24602,"b":701.196861,"coord_origin":"1"}},{"id":63,"text":"We also introduce our own","bbox":{"l":173.68808,"t":692.290298,"r":286.36508,"b":701.196861,"coord_origin":"1"}},{"id":64,"text":"synthetically generated SynthTabNet dataset to fix an im-","bbox":{"l":50.111984,"t":704.2453,"r":286.36505,"b":713.151863,"coord_origin":"1"}}]},"text":"We rely on large-scale datasets such as PubTabNet [37], FinTabNet [36], and TableBank [17] datasets to train and evaluate our models. These datasets span over various appearance styles and content. We also introduce our own synthetically generated SynthTabNet dataset to fix an im-"},{"label":"Picture","id":6,"page_no":2,"cluster":{"id":6,"label":"Picture","bbox":{"l":311.3420780181885,"t":78.12818670272827,"r":550.2800857543945,"b":251.03088569641113,"coord_origin":"1"},"confidence":0.979541003704071,"cells":[{"id":65,"text":"PubTabNet + FinTabNet","bbox":{"l":380.79849,"t":79.81176999999991,"r":486.84909,"b":88.55975000000001,"coord_origin":"1"}},{"id":66,"text":"Rows / Columns","bbox":{"l":396.76776,"t":242.02697999999998,"r":469.78748,"b":250.77495999999996,"coord_origin":"1"}},{"id":67,"text":"0","bbox":{"l":320.97653,"t":233.42296999999996,"r":324.79254,"b":239.255,"coord_origin":"1"}},{"id":68,"text":"20","bbox":{"l":410.483,"t":233.42296999999996,"r":418.11319,"b":239.255,"coord_origin":"1"}},{"id":69,"text":"40","bbox":{"l":500.84949,"t":233.42296999999996,"r":508.47968000000003,"b":239.255,"coord_origin":"1"}},{"id":70,"text":"10","bbox":{"l":365.29999,"t":233.42296999999996,"r":372.93018,"b":239.255,"coord_origin":"1"}},{"id":71,"text":"30","bbox":{"l":455.66626,"t":233.42296999999996,"r":463.29645,"b":239.255,"coord_origin":"1"}},{"id":72,"text":"50","bbox":{"l":542.03528,"t":233.42296999999996,"r":549.66547,"b":239.255,"coord_origin":"1"}},{"id":73,"text":"0","bbox":{"l":316.04474,"t":230.44617000000005,"r":319.86075,"b":236.27819999999997,"coord_origin":"1"}},{"id":74,"text":"2","bbox":{"l":312.62521,"t":198.69073000000003,"r":316.44122,"b":204.52277000000004,"coord_origin":"1"}},{"id":75,"text":"0","bbox":{"l":316.43942,"t":198.69073000000003,"r":320.2554,"b":204.52277000000004,"coord_origin":"1"}},{"id":76,"text":"4","bbox":{"l":313.14951,"t":168.09795999999994,"r":316.96552,"b":173.92998999999998,"coord_origin":"1"}},{"id":77,"text":"0","bbox":{"l":316.96371,"t":168.09795999999994,"r":320.77969,"b":173.92998999999998,"coord_origin":"1"}},{"id":78,"text":"6","bbox":{"l":312.92972,"t":136.58771000000002,"r":316.74573,"b":142.41974000000005,"coord_origin":"1"}},{"id":79,"text":"0","bbox":{"l":316.74393,"t":136.58771000000002,"r":320.55991,"b":142.41974000000005,"coord_origin":"1"}},{"id":80,"text":"8","bbox":{"l":312.48227,"t":105.60175000000004,"r":316.29828,"b":111.43377999999996,"coord_origin":"1"}},{"id":81,"text":"0","bbox":{"l":316.29648,"t":105.60175000000004,"r":320.11246,"b":111.43377999999996,"coord_origin":"1"}},{"id":82,"text":"1","bbox":{"l":312.48227,"t":212.25922000000003,"r":316.29828,"b":218.09124999999995,"coord_origin":"1"}},{"id":83,"text":"0","bbox":{"l":316.29648,"t":212.25922000000003,"r":320.11246,"b":218.09124999999995,"coord_origin":"1"}},{"id":84,"text":"3","bbox":{"l":313.07639,"t":183.72198000000003,"r":316.8924,"b":189.55402000000004,"coord_origin":"1"}},{"id":85,"text":"0","bbox":{"l":316.89059,"t":183.72198000000003,"r":320.70657,"b":189.55402000000004,"coord_origin":"1"}},{"id":86,"text":"5","bbox":{"l":312.76321,"t":152.47400000000005,"r":316.57922,"b":158.30602999999996,"coord_origin":"1"}},{"id":87,"text":"0","bbox":{"l":316.57742,"t":152.47400000000005,"r":320.3934,"b":158.30602999999996,"coord_origin":"1"}},{"id":88,"text":"7","bbox":{"l":312.19775,"t":120.57050000000004,"r":316.01376,"b":126.40252999999996,"coord_origin":"1"}},{"id":89,"text":"0","bbox":{"l":316.01196,"t":120.57050000000004,"r":319.82794,"b":126.40252999999996,"coord_origin":"1"}},{"id":90,"text":"9","bbox":{"l":312.8165,"t":90.1087,"r":316.63251,"b":95.94073000000003,"coord_origin":"1"}},{"id":91,"text":"0","bbox":{"l":316.63071,"t":90.1087,"r":320.44669,"b":95.94073000000003,"coord_origin":"1"}},{"id":92,"text":"0","bbox":{"l":532.17426,"t":222.72729000000004,"r":536.94427,"b":230.01727000000005,"coord_origin":"1"}},{"id":93,"text":"10K","bbox":{"l":532.87952,"t":108.26702999999986,"r":547.61249,"b":115.55700999999999,"coord_origin":"1"}},{"id":94,"text":"8K","bbox":{"l":532.7735,"t":130.78101000000004,"r":542.73877,"b":138.07097999999996,"coord_origin":"1"}},{"id":95,"text":"6K","bbox":{"l":532.79901,"t":153.92352000000005,"r":542.76428,"b":161.21349999999995,"coord_origin":"1"}},{"id":96,"text":"4K","bbox":{"l":532.5705,"t":176.75800000000004,"r":542.53577,"b":184.04796999999996,"coord_origin":"1"}},{"id":97,"text":"2K","bbox":{"l":532.14551,"t":199.6463,"r":542.11078,"b":206.93628,"coord_origin":"1"}}]},"text":"","data":null,"provenance":null,"predicted_class":null,"confidence":null},{"label":"Caption","id":7,"page_no":2,"cluster":{"id":7,"label":"Caption","bbox":{"l":308.0231597900391,"t":267.0846988677979,"r":545.11511,"b":288.6979099999999,"coord_origin":"1"},"confidence":0.9705182313919067,"cells":[{"id":98,"text":"Figure 2:","bbox":{"l":308.862,"t":267.83636,"r":346.06238,"b":276.74292,"coord_origin":"1"}},{"id":99,"text":"Distribution of the tables across different table","bbox":{"l":354.49072,"t":267.83636,"r":545.11511,"b":276.74292,"coord_origin":"1"}},{"id":100,"text":"dimensions in PubTabNet + FinTabNet datasets","bbox":{"l":308.862,"t":279.79132000000004,"r":498.56989,"b":288.6979099999999,"coord_origin":"1"}}]},"text":"Figure 2: Distribution of the tables across different table dimensions in PubTabNet + FinTabNet datasets"},{"label":"Text","id":8,"page_no":2,"cluster":{"id":8,"label":"Text","bbox":{"l":307.9623727798462,"t":316.53379096984867,"r":437.27002,"b":326.5220111846924,"coord_origin":"1"},"confidence":0.8795380592346191,"cells":[{"id":101,"text":"balance in the previous datasets.","bbox":{"l":308.862,"t":317.47336,"r":437.27002,"b":326.37991,"coord_origin":"1"}}]},"text":"balance in the previous datasets."},{"label":"Text","id":9,"page_no":2,"cluster":{"id":9,"label":"Text","bbox":{"l":307.9255153656006,"t":330.2963710784912,"r":545.6851364135742,"b":627.7393981933593,"coord_origin":"1"},"confidence":0.9885321259498596,"cells":[{"id":102,"text":"The PubTabNet dataset contains 509k tables delivered as","bbox":{"l":320.81699,"t":331.53137,"r":545.11505,"b":340.43793,"coord_origin":"1"}},{"id":103,"text":"annotated PNG images. The annotations consist of the table","bbox":{"l":308.862,"t":343.48635999999993,"r":545.11517,"b":352.39291,"coord_origin":"1"}},{"id":104,"text":"structure represented in HTML format, the tokenized text","bbox":{"l":308.862,"t":355.44235,"r":545.11505,"b":364.34890999999993,"coord_origin":"1"}},{"id":105,"text":"and its bounding boxes per table cell. Fig. 1 shows the ap-","bbox":{"l":308.862,"t":367.39734,"r":545.11505,"b":376.30389,"coord_origin":"1"}},{"id":106,"text":"pearance style of PubTabNet. Depending on its complexity,","bbox":{"l":308.862,"t":379.35233,"r":545.11511,"b":388.25888,"coord_origin":"1"}},{"id":107,"text":"a table is characterized as “simple” when it does not contain","bbox":{"l":308.862,"t":391.30731,"r":545.11511,"b":400.21386999999993,"coord_origin":"1"}},{"id":108,"text":"row spans or column spans, otherwise it is “complex”. The","bbox":{"l":308.862,"t":403.26230000000004,"r":545.11505,"b":412.16885,"coord_origin":"1"}},{"id":109,"text":"dataset is divided into Train and Val splits (roughly 98% and","bbox":{"l":308.862,"t":415.21729,"r":545.11511,"b":424.12384,"coord_origin":"1"}},{"id":110,"text":"2%). The Train split consists of 54% simple and 46% com-","bbox":{"l":308.862,"t":427.17328,"r":545.11517,"b":436.0798300000001,"coord_origin":"1"}},{"id":111,"text":"plex tables and the Val split of 51% and 49% respectively.","bbox":{"l":308.862,"t":439.12827,"r":545.11517,"b":448.03482,"coord_origin":"1"}},{"id":112,"text":"The FinTabNet dataset contains 112k tables delivered as","bbox":{"l":308.862,"t":451.08325,"r":545.11511,"b":459.98981000000003,"coord_origin":"1"}},{"id":113,"text":"single-page PDF documents with mixed table structures and","bbox":{"l":308.862,"t":463.03824,"r":545.11505,"b":471.94479,"coord_origin":"1"}},{"id":114,"text":"text content. Similarly to the PubTabNet, the annotations","bbox":{"l":308.862,"t":474.99323,"r":545.11511,"b":483.89978,"coord_origin":"1"}},{"id":115,"text":"of FinTabNet include the table structure in HTML, the to-","bbox":{"l":308.862,"t":486.94922,"r":545.11511,"b":495.85577,"coord_origin":"1"}},{"id":116,"text":"kenized text and the bounding boxes on a table cell basis.","bbox":{"l":308.862,"t":498.90421,"r":545.11511,"b":507.81076,"coord_origin":"1"}},{"id":117,"text":"The dataset is divided into Train, Test and Val splits (81%,","bbox":{"l":308.862,"t":510.85919,"r":545.11517,"b":519.76575,"coord_origin":"1"}},{"id":118,"text":"9.5%, 9.5%), and each one is almost equally divided into","bbox":{"l":308.862,"t":522.8141800000001,"r":545.11517,"b":531.72073,"coord_origin":"1"}},{"id":119,"text":"simple and complex tables (Train: 48% simple, 52% com-","bbox":{"l":308.862,"t":534.76917,"r":545.11505,"b":543.67574,"coord_origin":"1"}},{"id":120,"text":"plex, Test: 48% simple, 52% complex, Test: 53% simple,","bbox":{"l":308.862,"t":546.72418,"r":545.11511,"b":555.6307400000001,"coord_origin":"1"}},{"id":121,"text":"47% complex). Finally the TableBank dataset consists of","bbox":{"l":308.862,"t":558.6801800000001,"r":545.11511,"b":567.58673,"coord_origin":"1"}},{"id":122,"text":"145k tables provided as JPEG images. The latter has anno-","bbox":{"l":308.862,"t":570.63518,"r":545.11505,"b":579.54173,"coord_origin":"1"}},{"id":123,"text":"tations for the table structure, but only few with bounding","bbox":{"l":308.862,"t":582.59018,"r":545.11499,"b":591.49673,"coord_origin":"1"}},{"id":124,"text":"boxes of the table cells. The entire dataset consists of sim-","bbox":{"l":308.862,"t":594.54518,"r":545.11517,"b":603.45174,"coord_origin":"1"}},{"id":125,"text":"ple tables and it is divided into 90% Train, 3% Test and 7%","bbox":{"l":308.862,"t":606.50018,"r":545.11511,"b":615.40674,"coord_origin":"1"}},{"id":126,"text":"Val splits.","bbox":{"l":308.862,"t":618.45518,"r":348.16446,"b":627.36174,"coord_origin":"1"}}]},"text":"The PubTabNet dataset contains 509k tables delivered as annotated PNG images. The annotations consist of the table structure represented in HTML format, the tokenized text and its bounding boxes per table cell. Fig. 1 shows the appearance style of PubTabNet. Depending on its complexity, a table is characterized as “simple” when it does not contain row spans or column spans, otherwise it is “complex”. The dataset is divided into Train and Val splits (roughly 98% and 2%). The Train split consists of 54% simple and 46% complex tables and the Val split of 51% and 49% respectively. The FinTabNet dataset contains 112k tables delivered as single-page PDF documents with mixed table structures and text content. Similarly to the PubTabNet, the annotations of FinTabNet include the table structure in HTML, the tokenized text and the bounding boxes on a table cell basis. The dataset is divided into Train, Test and Val splits (81%, 9.5%, 9.5%), and each one is almost equally divided into simple and complex tables (Train: 48% simple, 52% complex, Test: 48% simple, 52% complex, Test: 53% simple, 47% complex). Finally the TableBank dataset consists of 145k tables provided as JPEG images. The latter has annotations for the table structure, but only few with bounding boxes of the table cells. The entire dataset consists of simple tables and it is divided into 90% Train, 3% Test and 7% Val splits."},{"label":"Text","id":10,"page_no":2,"cluster":{"id":10,"label":"Text","bbox":{"l":307.7597299575806,"t":631.6698051452637,"r":545.2829441070556,"b":713.151764,"coord_origin":"1"},"confidence":0.9847508668899536,"cells":[{"id":127,"text":"Due to the heterogeneity across the dataset formats, it","bbox":{"l":320.81699,"t":632.51419,"r":545.11487,"b":641.42075,"coord_origin":"1"}},{"id":128,"text":"was necessary to combine all available data into one homog-","bbox":{"l":308.862,"t":644.46919,"r":545.11511,"b":653.37575,"coord_origin":"1"}},{"id":129,"text":"enized dataset before we could train our models for practi-","bbox":{"l":308.862,"t":656.42419,"r":545.11511,"b":665.33076,"coord_origin":"1"}},{"id":130,"text":"cal purposes. Given the size of PubTabNet, we adopted its","bbox":{"l":308.862,"t":668.38019,"r":545.11499,"b":677.28676,"coord_origin":"1"}},{"id":131,"text":"annotation format and we extracted and converted all tables","bbox":{"l":308.862,"t":680.33519,"r":545.11505,"b":689.24176,"coord_origin":"1"}},{"id":132,"text":"as PNG images with a resolution of 72 dpi. Additionally,","bbox":{"l":308.862,"t":692.290192,"r":545.11505,"b":701.196762,"coord_origin":"1"}},{"id":133,"text":"we have filtered out tables with extreme sizes due to small","bbox":{"l":308.862,"t":704.245193,"r":545.11511,"b":713.151764,"coord_origin":"1"}}]},"text":"Due to the heterogeneity across the dataset formats, it was necessary to combine all available data into one homogenized dataset before we could train our models for practical purposes. Given the size of PubTabNet, we adopted its annotation format and we extracted and converted all tables as PNG images with a resolution of 72 dpi. Additionally, we have filtered out tables with extreme sizes due to small"}],"headers":[{"label":"Page-footer","id":11,"page_no":2,"cluster":{"id":11,"label":"Page-footer","bbox":{"l":294.43619785308834,"t":733.3885437011719,"r":300.10229,"b":743.039761,"coord_origin":"1"},"confidence":0.893059253692627,"cells":[{"id":134,"text":"3","bbox":{"l":295.121,"t":734.133198,"r":300.10229,"b":743.039761,"coord_origin":"1"}}]},"text":"3"}]}},{"page_no":3,"page_hash":"6d4e2424458b083b36c5559a7fe1a42175b082247c516ca8fef9f0d46e6f0bbc","size":{"width":612.0,"height":792.0},"cells":[{"id":0,"text":"amount of such tables, and kept only those ones ranging","bbox":{"l":50.112,"t":75.20836999999995,"r":286.36511,"b":84.11492999999996,"coord_origin":"1"}},{"id":1,"text":"between 1*1 and 20*10 (rows/columns).","bbox":{"l":50.112,"t":87.16339000000005,"r":212.28319,"b":96.06994999999995,"coord_origin":"1"}},{"id":2,"text":"The availability of the bounding boxes for all table cells","bbox":{"l":62.067001,"t":100.96038999999996,"r":286.36502,"b":109.86694,"coord_origin":"1"}},{"id":3,"text":"is essential to train our models. In order to distinguish be-","bbox":{"l":50.112,"t":112.91540999999995,"r":286.36508,"b":121.82195999999999,"coord_origin":"1"}},{"id":4,"text":"tween empty and non-empty bounding boxes, we have in-","bbox":{"l":50.112,"t":124.87041999999997,"r":286.36508,"b":133.77697999999998,"coord_origin":"1"}},{"id":5,"text":"troduced a binary class in the annotation. Unfortunately, the","bbox":{"l":50.112,"t":136.82641999999998,"r":286.36511,"b":145.73297000000002,"coord_origin":"1"}},{"id":6,"text":"original datasets either omit the bounding boxes for whole","bbox":{"l":50.112,"t":148.78143,"r":286.36511,"b":157.68799,"coord_origin":"1"}},{"id":7,"text":"tables (e.g. TableBank) or they narrow their scope only to","bbox":{"l":50.112,"t":160.73645,"r":286.36508,"b":169.64301,"coord_origin":"1"}},{"id":8,"text":"non-empty cells. Therefore, it was imperative to introduce","bbox":{"l":50.112,"t":172.69146999999998,"r":286.36505,"b":181.59802000000002,"coord_origin":"1"}},{"id":9,"text":"a data pre-processing procedure that generates the missing","bbox":{"l":50.112,"t":184.64648,"r":286.36508,"b":193.55304,"coord_origin":"1"}},{"id":10,"text":"bounding boxes out of the annotation information. This pro-","bbox":{"l":50.112,"t":196.60248,"r":286.36508,"b":205.50903000000005,"coord_origin":"1"}},{"id":11,"text":"cedure first parses the provided table structure and calcu-","bbox":{"l":50.112,"t":208.5575,"r":286.36508,"b":217.46405000000004,"coord_origin":"1"}},{"id":12,"text":"lates the dimensions of the most fine-grained grid that cov-","bbox":{"l":50.112,"t":220.51251000000002,"r":286.36511,"b":229.41907000000003,"coord_origin":"1"}},{"id":13,"text":"ers the table structure. Notice that each table cell may oc-","bbox":{"l":50.112,"t":232.46753,"r":286.36508,"b":241.37408000000005,"coord_origin":"1"}},{"id":14,"text":"cupy multiple grid squares due to row or column spans. In","bbox":{"l":50.112,"t":244.42255,"r":286.36508,"b":253.32910000000004,"coord_origin":"1"}},{"id":15,"text":"case of PubTabNet we had to compute missing bounding","bbox":{"l":50.112,"t":256.37756,"r":286.36505,"b":265.28412000000003,"coord_origin":"1"}},{"id":16,"text":"boxes for 48% of the simple and 69% of the complex ta-","bbox":{"l":50.112,"t":268.33356000000003,"r":286.36505,"b":277.24010999999996,"coord_origin":"1"}},{"id":17,"text":"bles.","bbox":{"l":50.112,"t":280.28853999999995,"r":68.652397,"b":289.1951,"coord_origin":"1"}},{"id":18,"text":"Regarding FinTabNet, 68% of the simple and 98%","bbox":{"l":75.566444,"t":280.28853999999995,"r":286.36514,"b":289.1951,"coord_origin":"1"}},{"id":19,"text":"of the complex tables require the generation of bounding","bbox":{"l":50.112,"t":292.24353,"r":286.36511,"b":301.15009,"coord_origin":"1"}},{"id":20,"text":"boxes.","bbox":{"l":50.112,"t":304.19852000000003,"r":75.695961,"b":313.10507,"coord_origin":"1"}},{"id":21,"text":"As it is illustrated in Fig. 2, the table distributions from","bbox":{"l":62.067001,"t":317.99550999999997,"r":286.36499,"b":326.90207,"coord_origin":"1"}},{"id":22,"text":"all datasets are skewed towards simpler structures with","bbox":{"l":50.112,"t":329.95151,"r":286.36511,"b":338.8580600000001,"coord_origin":"1"}},{"id":23,"text":"fewer number of rows/columns. Additionally, there is very","bbox":{"l":50.112,"t":341.90649,"r":286.36502,"b":350.81305,"coord_origin":"1"}},{"id":24,"text":"limited variance in the table styles, which in case of Pub-","bbox":{"l":50.112,"t":353.8614799999999,"r":286.36505,"b":362.76804,"coord_origin":"1"}},{"id":25,"text":"TabNet and FinTabNet means one styling format for the","bbox":{"l":50.112,"t":365.81647,"r":286.36508,"b":374.72301999999996,"coord_origin":"1"}},{"id":26,"text":"majority of the tables.","bbox":{"l":50.112,"t":377.77145,"r":141.58859,"b":386.67801,"coord_origin":"1"}},{"id":27,"text":"Similar limitations appear also in","bbox":{"l":148.70189,"t":377.77145,"r":286.36508,"b":386.67801,"coord_origin":"1"}},{"id":28,"text":"the type of table content, which in some cases (e.g. FinTab-","bbox":{"l":50.112,"t":389.72644,"r":286.36508,"b":398.63300000000004,"coord_origin":"1"}},{"id":29,"text":"Net) is restricted to a certain domain. Ultimately, the lack","bbox":{"l":50.112,"t":401.68243,"r":286.36511,"b":410.58899,"coord_origin":"1"}},{"id":30,"text":"of diversity in the training dataset damages the ability of the","bbox":{"l":50.112,"t":413.63742,"r":286.36511,"b":422.54398,"coord_origin":"1"}},{"id":31,"text":"models to generalize well on unseen data.","bbox":{"l":50.112,"t":425.59241,"r":216.39774,"b":434.49896,"coord_origin":"1"}},{"id":32,"text":"Motivated by those observations we aimed at generating","bbox":{"l":62.067001,"t":439.3894,"r":286.36499,"b":448.2959599999999,"coord_origin":"1"}},{"id":33,"text":"a synthetic table dataset named","bbox":{"l":50.112,"t":451.34439,"r":172.14388,"b":460.25095,"coord_origin":"1"}},{"id":34,"text":"SynthTabNet","bbox":{"l":174.14801,"t":451.43405,"r":224.70818999999997,"b":460.02182,"coord_origin":"1"}},{"id":35,"text":". This approach","bbox":{"l":224.70801,"t":451.34439,"r":286.36655,"b":460.25095,"coord_origin":"1"}},{"id":36,"text":"offers control over: 1) the size of the dataset, 2) the table","bbox":{"l":50.112015,"t":463.30038,"r":286.36505,"b":472.20694,"coord_origin":"1"}},{"id":37,"text":"structure, 3) the table style and 4) the type of content. The","bbox":{"l":50.112015,"t":475.25537,"r":286.36511,"b":484.16193,"coord_origin":"1"}},{"id":38,"text":"complexity of the table structure is described by the size of","bbox":{"l":50.112015,"t":487.21036,"r":286.36511,"b":496.11691,"coord_origin":"1"}},{"id":39,"text":"the table header and the table body, as well as the percentage","bbox":{"l":50.112015,"t":499.16534,"r":286.36508,"b":508.0719,"coord_origin":"1"}},{"id":40,"text":"of the table cells covered by row spans and column spans.","bbox":{"l":50.112015,"t":511.12033,"r":286.36505,"b":520.02689,"coord_origin":"1"}},{"id":41,"text":"A set of carefully designed styling templates provides the","bbox":{"l":50.112015,"t":523.07632,"r":286.36508,"b":531.98288,"coord_origin":"1"}},{"id":42,"text":"basis to build a wide range of table appearances. Lastly, the","bbox":{"l":50.112015,"t":535.0313100000001,"r":286.36508,"b":543.93788,"coord_origin":"1"}},{"id":43,"text":"table content is generated out of a curated collection of text","bbox":{"l":50.112015,"t":546.98633,"r":286.36511,"b":555.89288,"coord_origin":"1"}},{"id":44,"text":"corpora. By controlling the size and scope of the synthetic","bbox":{"l":50.112015,"t":558.94133,"r":286.36508,"b":567.84789,"coord_origin":"1"}},{"id":45,"text":"datasets we are able to train and evaluate our models in a","bbox":{"l":50.112015,"t":570.89633,"r":286.36511,"b":579.8028899999999,"coord_origin":"1"}},{"id":46,"text":"variety of different conditions. For example, we can first","bbox":{"l":50.112015,"t":582.85133,"r":286.36511,"b":591.75789,"coord_origin":"1"}},{"id":47,"text":"generate a highly diverse dataset to train our models and","bbox":{"l":50.112015,"t":594.80733,"r":286.36505,"b":603.71388,"coord_origin":"1"}},{"id":48,"text":"then evaluate their performance on other synthetic datasets","bbox":{"l":50.112015,"t":606.76233,"r":286.36508,"b":615.6688800000001,"coord_origin":"1"}},{"id":49,"text":"which are focused on a specific domain.","bbox":{"l":50.112015,"t":618.71733,"r":209.7527,"b":627.62389,"coord_origin":"1"}},{"id":50,"text":"In this regard, we have prepared four synthetic datasets,","bbox":{"l":62.067017,"t":632.51433,"r":286.36499,"b":641.42088,"coord_origin":"1"}},{"id":51,"text":"each one containing 150k examples. The corpora to gener-","bbox":{"l":50.112015,"t":644.46933,"r":286.36508,"b":653.37589,"coord_origin":"1"}},{"id":52,"text":"ate the table text consists of the most frequent terms appear-","bbox":{"l":50.112015,"t":656.42532,"r":286.36511,"b":665.33189,"coord_origin":"1"}},{"id":53,"text":"ing in PubTabNet and FinTabNet together with randomly","bbox":{"l":50.112015,"t":668.38033,"r":286.36505,"b":677.28689,"coord_origin":"1"}},{"id":54,"text":"generated text. The first two synthetic datasets have been","bbox":{"l":50.112015,"t":680.33533,"r":286.36508,"b":689.24189,"coord_origin":"1"}},{"id":55,"text":"fine-tuned to mimic the appearance of the original datasets","bbox":{"l":50.112015,"t":692.290329,"r":286.36508,"b":701.196892,"coord_origin":"1"}},{"id":56,"text":"but encompass more complicated table structures. The third","bbox":{"l":50.112015,"t":704.245331,"r":286.36511,"b":713.151894,"coord_origin":"1"}},{"id":57,"text":"Tags","bbox":{"l":412.332,"t":73.61437999999998,"r":430.90231,"b":82.52094,"coord_origin":"1"}},{"id":58,"text":"Bbox","bbox":{"l":442.85742,"t":73.61437999999998,"r":464.4463799999999,"b":82.52094,"coord_origin":"1"}},{"id":59,"text":"Size","bbox":{"l":477.78632,"t":73.61437999999998,"r":494.94193,"b":82.52094,"coord_origin":"1"}},{"id":60,"text":"Format","bbox":{"l":508.28186,"t":73.61437999999998,"r":536.91437,"b":82.52094,"coord_origin":"1"}},{"id":61,"text":"PubTabNet","bbox":{"l":317.06,"t":85.9673499999999,"r":361.64264,"b":94.87390000000005,"coord_origin":"1"}},{"id":62,"text":"3","bbox":{"l":417.85599,"t":85.6684600000001,"r":425.37775,"b":94.88385000000017,"coord_origin":"1"}},{"id":63,"text":"3","bbox":{"l":449.89569,"t":85.6684600000001,"r":457.41745000000003,"b":94.88385000000017,"coord_origin":"1"}},{"id":64,"text":"509k","bbox":{"l":476.401,"t":85.9673499999999,"r":496.3262,"b":94.87390000000005,"coord_origin":"1"}},{"id":65,"text":"PNG","bbox":{"l":512.63495,"t":85.9673499999999,"r":532.56012,"b":94.87390000000005,"coord_origin":"1"}},{"id":66,"text":"FinTabNet","bbox":{"l":317.06,"t":97.92236000000003,"r":359.43094,"b":106.82892000000004,"coord_origin":"1"}},{"id":67,"text":"3","bbox":{"l":417.85599,"t":97.62347,"r":425.37775,"b":106.83887000000016,"coord_origin":"1"}},{"id":68,"text":"3","bbox":{"l":449.89569,"t":97.62347,"r":457.41745000000003,"b":106.83887000000016,"coord_origin":"1"}},{"id":69,"text":"112k","bbox":{"l":476.401,"t":97.92236000000003,"r":496.3262,"b":106.82892000000004,"coord_origin":"1"}},{"id":70,"text":"PDF","bbox":{"l":513.46185,"t":97.92236000000003,"r":531.73328,"b":106.82892000000004,"coord_origin":"1"}},{"id":71,"text":"TableBank","bbox":{"l":317.06,"t":109.87836000000004,"r":359.97888,"b":118.78490999999997,"coord_origin":"1"}},{"id":72,"text":"3","bbox":{"l":417.85599,"t":109.57947000000001,"r":425.37775,"b":118.79485999999997,"coord_origin":"1"}},{"id":73,"text":"7","bbox":{"l":450.81226,"t":109.57947000000001,"r":456.50091999999995,"b":118.79485999999997,"coord_origin":"1"}},{"id":74,"text":"145k","bbox":{"l":476.401,"t":109.87836000000004,"r":496.3262,"b":118.78490999999997,"coord_origin":"1"}},{"id":75,"text":"JPEG","bbox":{"l":511.25017999999994,"t":109.87836000000004,"r":533.94501,"b":118.78490999999997,"coord_origin":"1"}},{"id":76,"text":"Combined-Tabnet(*)","bbox":{"l":317.06,"t":121.83336999999995,"r":400.37723,"b":130.73992999999996,"coord_origin":"1"}},{"id":77,"text":"3","bbox":{"l":417.85599,"t":121.53448000000003,"r":425.37775,"b":130.74987999999996,"coord_origin":"1"}},{"id":78,"text":"3","bbox":{"l":449.89569,"t":121.53448000000003,"r":457.41745000000003,"b":130.74987999999996,"coord_origin":"1"}},{"id":79,"text":"400k","bbox":{"l":476.401,"t":121.83336999999995,"r":496.3262,"b":130.73992999999996,"coord_origin":"1"}},{"id":80,"text":"PNG","bbox":{"l":512.63495,"t":121.83336999999995,"r":532.56012,"b":130.73992999999996,"coord_origin":"1"}},{"id":81,"text":"Combined(**)","bbox":{"l":317.06,"t":133.78839000000005,"r":375.17184,"b":142.69494999999995,"coord_origin":"1"}},{"id":82,"text":"3","bbox":{"l":417.85599,"t":133.48950000000002,"r":425.37775,"b":142.70489999999995,"coord_origin":"1"}},{"id":83,"text":"3","bbox":{"l":449.89569,"t":133.48950000000002,"r":457.41745000000003,"b":142.70489999999995,"coord_origin":"1"}},{"id":84,"text":"500k","bbox":{"l":476.401,"t":133.78839000000005,"r":496.3262,"b":142.69494999999995,"coord_origin":"1"}},{"id":85,"text":"PNG","bbox":{"l":512.63495,"t":133.78839000000005,"r":532.56012,"b":142.69494999999995,"coord_origin":"1"}},{"id":86,"text":"SynthTabNet","bbox":{"l":317.06,"t":145.74341000000004,"r":369.39352,"b":154.64995999999996,"coord_origin":"1"}},{"id":87,"text":"3","bbox":{"l":417.85599,"t":145.44446000000005,"r":425.37775,"b":154.65985,"coord_origin":"1"}},{"id":88,"text":"3","bbox":{"l":449.89569,"t":145.44446000000005,"r":457.41745000000003,"b":154.65985,"coord_origin":"1"}},{"id":89,"text":"600k","bbox":{"l":476.401,"t":145.74334999999996,"r":496.3262,"b":154.6499,"coord_origin":"1"}},{"id":90,"text":"PNG","bbox":{"l":512.63495,"t":145.74334999999996,"r":532.56012,"b":154.6499,"coord_origin":"1"}},{"id":91,"text":"Table 1:","bbox":{"l":308.862,"t":167.66138,"r":344.6178,"b":176.56793000000005,"coord_origin":"1"}},{"id":92,"text":"Both","bbox":{"l":361.07602,"t":167.66138,"r":380.45328,"b":176.56793000000005,"coord_origin":"1"}},{"id":93,"text":"“Combined-Tabnet”","bbox":{"l":386.56799,"t":167.75104,"r":468.67974999999996,"b":176.33880999999997,"coord_origin":"1"}},{"id":94,"text":"and","bbox":{"l":474.79599,"t":167.66138,"r":489.18198,"b":176.56793000000005,"coord_origin":"1"}},{"id":95,"text":"”Combined-","bbox":{"l":495.29898000000003,"t":167.75104,"r":545.112,"b":176.33880999999997,"coord_origin":"1"}},{"id":96,"text":"Tabnet”","bbox":{"l":308.862,"t":179.70605,"r":341.16077,"b":188.29381999999998,"coord_origin":"1"}},{"id":97,"text":"are variations of the following: (*) The Combined-","bbox":{"l":343.457,"t":179.61639000000002,"r":545.11005,"b":188.52295000000004,"coord_origin":"1"}},{"id":98,"text":"Tabnet dataset is the processed combination of PubTabNet","bbox":{"l":308.862,"t":191.57141000000001,"r":545.11505,"b":200.47797000000003,"coord_origin":"1"}},{"id":99,"text":"and Fintabnet. (**) The combined dataset is the processed","bbox":{"l":308.862,"t":203.52643,"r":545.11499,"b":212.43298000000004,"coord_origin":"1"}},{"id":100,"text":"combination of PubTabNet, Fintabnet and TableBank.","bbox":{"l":308.862,"t":215.48242000000005,"r":523.93469,"b":224.38897999999995,"coord_origin":"1"}},{"id":101,"text":"one adopts a colorful appearance with high contrast and the","bbox":{"l":308.862,"t":249.62041999999997,"r":545.11517,"b":258.52698,"coord_origin":"1"}},{"id":102,"text":"last one contains tables with sparse content. Lastly, we have","bbox":{"l":308.862,"t":261.57543999999996,"r":545.11517,"b":270.48199,"coord_origin":"1"}},{"id":103,"text":"combined all synthetic datasets into one big unified syn-","bbox":{"l":308.862,"t":273.5304,"r":545.11505,"b":282.43698,"coord_origin":"1"}},{"id":104,"text":"thetic dataset of 600k examples.","bbox":{"l":308.862,"t":285.48541000000006,"r":436.82169,"b":294.39197,"coord_origin":"1"}},{"id":105,"text":"Tab. 1 summarizes the various attributes of the datasets.","bbox":{"l":320.81699,"t":297.77240000000006,"r":542.74396,"b":306.67896,"coord_origin":"1"}},{"id":106,"text":"4.","bbox":{"l":308.862,"t":321.18396,"r":316.28476,"b":331.93167000000005,"coord_origin":"1"}},{"id":107,"text":"The TableFormer model","bbox":{"l":326.18176,"t":321.18396,"r":444.93607000000003,"b":331.93167000000005,"coord_origin":"1"}},{"id":108,"text":"Given the image of a table, TableFormer is able to pre-","bbox":{"l":320.81699,"t":341.93939,"r":545.11499,"b":350.84594999999996,"coord_origin":"1"}},{"id":109,"text":"dict: 1) a sequence of tokens that represent the structure of","bbox":{"l":308.862,"t":353.89438,"r":545.11511,"b":362.80092999999994,"coord_origin":"1"}},{"id":110,"text":"a table, and 2) a bounding box coupled to a subset of those","bbox":{"l":308.862,"t":365.84937,"r":545.11517,"b":374.75592,"coord_origin":"1"}},{"id":111,"text":"tokens. The conversion of an image into a sequence of to-","bbox":{"l":308.862,"t":377.80435,"r":545.11505,"b":386.71091,"coord_origin":"1"}},{"id":112,"text":"kens is a well-known task [35, 16]. While attention is often","bbox":{"l":308.862,"t":389.75934000000007,"r":545.11517,"b":398.66588999999993,"coord_origin":"1"}},{"id":113,"text":"used as an implicit method to associate each token of the","bbox":{"l":308.862,"t":401.71432000000004,"r":545.11523,"b":410.62088,"coord_origin":"1"}},{"id":114,"text":"sequence with a position in the original image, an explicit","bbox":{"l":308.862,"t":413.67032,"r":545.11517,"b":422.57687,"coord_origin":"1"}},{"id":115,"text":"association between the individual table-cells and the image","bbox":{"l":308.862,"t":425.62531,"r":545.11505,"b":434.53186,"coord_origin":"1"}},{"id":116,"text":"bounding boxes is also required.","bbox":{"l":308.862,"t":437.58029,"r":437.9375,"b":446.48685000000006,"coord_origin":"1"}},{"id":117,"text":"4.1.","bbox":{"l":308.862,"t":457.69427,"r":323.14081,"b":467.54633,"coord_origin":"1"}},{"id":118,"text":"Model architecture.","bbox":{"l":332.66003,"t":457.69427,"r":420.16058,"b":467.54633,"coord_origin":"1"}},{"id":119,"text":"We now describe in detail the proposed method, which","bbox":{"l":320.81699,"t":476.76529,"r":545.11487,"b":485.67184,"coord_origin":"1"}},{"id":120,"text":"is composed of three main components, see Fig.","bbox":{"l":308.862,"t":488.72028,"r":509.02054,"b":497.62683,"coord_origin":"1"}},{"id":121,"text":"4.","bbox":{"l":515.58588,"t":488.72028,"r":523.05786,"b":497.62683,"coord_origin":"1"}},{"id":122,"text":"Our","bbox":{"l":529.62323,"t":488.72028,"r":545.11505,"b":497.62683,"coord_origin":"1"}},{"id":123,"text":"CNN Backbone Network","bbox":{"l":308.862,"t":500.76492,"r":406.34601,"b":509.35269,"coord_origin":"1"}},{"id":124,"text":"encodes the input as a feature vec-","bbox":{"l":408.87201,"t":500.67526,"r":545.1106,"b":509.58182,"coord_origin":"1"}},{"id":125,"text":"tor of predefined length.","bbox":{"l":308.862,"t":512.63126,"r":409.39459,"b":521.53781,"coord_origin":"1"}},{"id":126,"text":"The input feature vector of the","bbox":{"l":416.72705,"t":512.63126,"r":545.11505,"b":521.53781,"coord_origin":"1"}},{"id":127,"text":"encoded image is passed to the","bbox":{"l":308.862,"t":524.58624,"r":436.194,"b":533.4928,"coord_origin":"1"}},{"id":128,"text":"Structure Decoder","bbox":{"l":439.526,"t":524.6759,"r":513.86694,"b":533.26367,"coord_origin":"1"}},{"id":129,"text":"to pro-","bbox":{"l":517.43201,"t":524.58624,"r":545.10815,"b":533.4928,"coord_origin":"1"}},{"id":130,"text":"duce a sequence of HTML tags that represent the structure","bbox":{"l":308.862,"t":536.54124,"r":545.11511,"b":545.4478,"coord_origin":"1"}},{"id":131,"text":"of the table.","bbox":{"l":308.862,"t":548.49625,"r":358.5455,"b":557.4028000000001,"coord_origin":"1"}},{"id":132,"text":"With each prediction of an HTML standard","bbox":{"l":365.19055,"t":548.49625,"r":545.11517,"b":557.4028000000001,"coord_origin":"1"}},{"id":133,"text":"data cell (‘","bbox":{"l":308.862,"t":560.45125,"r":352.40851,"b":569.3578,"coord_origin":"1"}},{"id":134,"text":"<","bbox":{"l":352.409,"t":560.29184,"r":360.1579,"b":569.13863,"coord_origin":"1"}},{"id":135,"text":"td","bbox":{"l":360.15799,"t":560.45125,"r":367.90891,"b":569.3578,"coord_origin":"1"}},{"id":136,"text":">","bbox":{"l":367.909,"t":560.29184,"r":375.6579,"b":569.13863,"coord_origin":"1"}},{"id":137,"text":"’) the hidden state of that cell is passed to","bbox":{"l":375.65799,"t":560.45125,"r":545.11182,"b":569.3578,"coord_origin":"1"}},{"id":138,"text":"the Cell BBox Decoder. As for spanning cells, such as row","bbox":{"l":308.862,"t":572.40724,"r":545.11499,"b":581.3138,"coord_origin":"1"}},{"id":139,"text":"or column span, the tag is broken down to ‘","bbox":{"l":308.862,"t":584.3622399999999,"r":483.11768,"b":593.2688,"coord_origin":"1"}},{"id":140,"text":"<","bbox":{"l":483.11902,"t":584.20284,"r":490.86792,"b":593.04962,"coord_origin":"1"}},{"id":141,"text":"’, ‘rowspan=’","bbox":{"l":490.86800999999997,"t":584.3622399999999,"r":545.11438,"b":593.2688,"coord_origin":"1"}},{"id":142,"text":"or ‘colspan=’, with the number of spanning cells (attribute),","bbox":{"l":308.862,"t":596.31725,"r":545.11493,"b":605.2238,"coord_origin":"1"}},{"id":143,"text":"and ‘","bbox":{"l":308.862,"t":608.27225,"r":329.64395,"b":617.1788,"coord_origin":"1"}},{"id":144,"text":">","bbox":{"l":329.646,"t":608.11284,"r":337.3949,"b":616.9596300000001,"coord_origin":"1"}},{"id":145,"text":"’. The hidden state attached to ‘","bbox":{"l":337.39398,"t":608.27225,"r":468.5914,"b":617.1788,"coord_origin":"1"}},{"id":146,"text":"<","bbox":{"l":468.59496999999993,"t":608.11284,"r":476.34387000000004,"b":616.9596300000001,"coord_origin":"1"}},{"id":147,"text":"’ is passed to the","bbox":{"l":476.3439599999999,"t":608.27225,"r":545.11572,"b":617.1788,"coord_origin":"1"}},{"id":148,"text":"Cell BBox Decoder. A shared feed forward network (FFN)","bbox":{"l":308.86197,"t":620.22725,"r":545.11499,"b":629.1338000000001,"coord_origin":"1"}},{"id":149,"text":"receives the hidden states from the Structure Decoder, to","bbox":{"l":308.86197,"t":632.1822500000001,"r":545.11517,"b":641.08881,"coord_origin":"1"}},{"id":150,"text":"provide the final detection predictions of the bounding box","bbox":{"l":308.86197,"t":644.13824,"r":545.11511,"b":653.0448,"coord_origin":"1"}},{"id":151,"text":"coordinates and their classification.","bbox":{"l":308.86197,"t":656.09325,"r":449.42432,"b":664.99981,"coord_origin":"1"}},{"id":152,"text":"CNN Backbone Network.","bbox":{"l":320.81696,"t":668.2607,"r":431.90985,"b":677.21707,"coord_origin":"1"}},{"id":153,"text":"A ResNet-18 CNN is the","bbox":{"l":439.49896,"t":668.3802499999999,"r":545.11255,"b":677.2868100000001,"coord_origin":"1"}},{"id":154,"text":"backbone that receives the table image and encodes it as a","bbox":{"l":308.86197,"t":680.33525,"r":545.11499,"b":689.24181,"coord_origin":"1"}},{"id":155,"text":"vector of predefined length. The network has been modified","bbox":{"l":308.86197,"t":692.290253,"r":545.11511,"b":701.196815,"coord_origin":"1"}},{"id":156,"text":"by removing the linear and pooling layer, as we are not per-","bbox":{"l":308.86197,"t":704.245255,"r":545.11505,"b":713.1518169999999,"coord_origin":"1"}},{"id":157,"text":"4","bbox":{"l":295.12097,"t":734.133251,"r":300.10226,"b":743.039814,"coord_origin":"1"}}],"predictions":{"layout":{"clusters":[{"id":0,"label":"Text","bbox":{"l":49.14799032211304,"t":74.29706525802612,"r":286.36511,"b":96.06994999999995,"coord_origin":"1"},"confidence":0.9636521339416504,"cells":[{"id":0,"text":"amount of such tables, and kept only those ones ranging","bbox":{"l":50.112,"t":75.20836999999995,"r":286.36511,"b":84.11492999999996,"coord_origin":"1"}},{"id":1,"text":"between 1*1 and 20*10 (rows/columns).","bbox":{"l":50.112,"t":87.16339000000005,"r":212.28319,"b":96.06994999999995,"coord_origin":"1"}}]},{"id":1,"label":"Text","bbox":{"l":49.21861910820007,"t":100.05254344940181,"r":286.5638860702515,"b":313.10507,"coord_origin":"1"},"confidence":0.9862114191055298,"cells":[{"id":2,"text":"The availability of the bounding boxes for all table cells","bbox":{"l":62.067001,"t":100.96038999999996,"r":286.36502,"b":109.86694,"coord_origin":"1"}},{"id":3,"text":"is essential to train our models. In order to distinguish be-","bbox":{"l":50.112,"t":112.91540999999995,"r":286.36508,"b":121.82195999999999,"coord_origin":"1"}},{"id":4,"text":"tween empty and non-empty bounding boxes, we have in-","bbox":{"l":50.112,"t":124.87041999999997,"r":286.36508,"b":133.77697999999998,"coord_origin":"1"}},{"id":5,"text":"troduced a binary class in the annotation. Unfortunately, the","bbox":{"l":50.112,"t":136.82641999999998,"r":286.36511,"b":145.73297000000002,"coord_origin":"1"}},{"id":6,"text":"original datasets either omit the bounding boxes for whole","bbox":{"l":50.112,"t":148.78143,"r":286.36511,"b":157.68799,"coord_origin":"1"}},{"id":7,"text":"tables (e.g. TableBank) or they narrow their scope only to","bbox":{"l":50.112,"t":160.73645,"r":286.36508,"b":169.64301,"coord_origin":"1"}},{"id":8,"text":"non-empty cells. Therefore, it was imperative to introduce","bbox":{"l":50.112,"t":172.69146999999998,"r":286.36505,"b":181.59802000000002,"coord_origin":"1"}},{"id":9,"text":"a data pre-processing procedure that generates the missing","bbox":{"l":50.112,"t":184.64648,"r":286.36508,"b":193.55304,"coord_origin":"1"}},{"id":10,"text":"bounding boxes out of the annotation information. This pro-","bbox":{"l":50.112,"t":196.60248,"r":286.36508,"b":205.50903000000005,"coord_origin":"1"}},{"id":11,"text":"cedure first parses the provided table structure and calcu-","bbox":{"l":50.112,"t":208.5575,"r":286.36508,"b":217.46405000000004,"coord_origin":"1"}},{"id":12,"text":"lates the dimensions of the most fine-grained grid that cov-","bbox":{"l":50.112,"t":220.51251000000002,"r":286.36511,"b":229.41907000000003,"coord_origin":"1"}},{"id":13,"text":"ers the table structure. Notice that each table cell may oc-","bbox":{"l":50.112,"t":232.46753,"r":286.36508,"b":241.37408000000005,"coord_origin":"1"}},{"id":14,"text":"cupy multiple grid squares due to row or column spans. In","bbox":{"l":50.112,"t":244.42255,"r":286.36508,"b":253.32910000000004,"coord_origin":"1"}},{"id":15,"text":"case of PubTabNet we had to compute missing bounding","bbox":{"l":50.112,"t":256.37756,"r":286.36505,"b":265.28412000000003,"coord_origin":"1"}},{"id":16,"text":"boxes for 48% of the simple and 69% of the complex ta-","bbox":{"l":50.112,"t":268.33356000000003,"r":286.36505,"b":277.24010999999996,"coord_origin":"1"}},{"id":17,"text":"bles.","bbox":{"l":50.112,"t":280.28853999999995,"r":68.652397,"b":289.1951,"coord_origin":"1"}},{"id":18,"text":"Regarding FinTabNet, 68% of the simple and 98%","bbox":{"l":75.566444,"t":280.28853999999995,"r":286.36514,"b":289.1951,"coord_origin":"1"}},{"id":19,"text":"of the complex tables require the generation of bounding","bbox":{"l":50.112,"t":292.24353,"r":286.36511,"b":301.15009,"coord_origin":"1"}},{"id":20,"text":"boxes.","bbox":{"l":50.112,"t":304.19852000000003,"r":75.695961,"b":313.10507,"coord_origin":"1"}}]},{"id":2,"label":"Text","bbox":{"l":49.344085335731506,"t":317.04177474975586,"r":286.7534259796143,"b":435.1035724639893,"coord_origin":"1"},"confidence":0.9866138100624084,"cells":[{"id":21,"text":"As it is illustrated in Fig. 2, the table distributions from","bbox":{"l":62.067001,"t":317.99550999999997,"r":286.36499,"b":326.90207,"coord_origin":"1"}},{"id":22,"text":"all datasets are skewed towards simpler structures with","bbox":{"l":50.112,"t":329.95151,"r":286.36511,"b":338.8580600000001,"coord_origin":"1"}},{"id":23,"text":"fewer number of rows/columns. Additionally, there is very","bbox":{"l":50.112,"t":341.90649,"r":286.36502,"b":350.81305,"coord_origin":"1"}},{"id":24,"text":"limited variance in the table styles, which in case of Pub-","bbox":{"l":50.112,"t":353.8614799999999,"r":286.36505,"b":362.76804,"coord_origin":"1"}},{"id":25,"text":"TabNet and FinTabNet means one styling format for the","bbox":{"l":50.112,"t":365.81647,"r":286.36508,"b":374.72301999999996,"coord_origin":"1"}},{"id":26,"text":"majority of the tables.","bbox":{"l":50.112,"t":377.77145,"r":141.58859,"b":386.67801,"coord_origin":"1"}},{"id":27,"text":"Similar limitations appear also in","bbox":{"l":148.70189,"t":377.77145,"r":286.36508,"b":386.67801,"coord_origin":"1"}},{"id":28,"text":"the type of table content, which in some cases (e.g. FinTab-","bbox":{"l":50.112,"t":389.72644,"r":286.36508,"b":398.63300000000004,"coord_origin":"1"}},{"id":29,"text":"Net) is restricted to a certain domain. Ultimately, the lack","bbox":{"l":50.112,"t":401.68243,"r":286.36511,"b":410.58899,"coord_origin":"1"}},{"id":30,"text":"of diversity in the training dataset damages the ability of the","bbox":{"l":50.112,"t":413.63742,"r":286.36511,"b":422.54398,"coord_origin":"1"}},{"id":31,"text":"models to generalize well on unseen data.","bbox":{"l":50.112,"t":425.59241,"r":216.39774,"b":434.49896,"coord_origin":"1"}}]},{"id":3,"label":"Text","bbox":{"l":49.16742217540741,"t":438.18905181884764,"r":286.7348659515381,"b":627.6961944580079,"coord_origin":"1"},"confidence":0.9882907867431641,"cells":[{"id":32,"text":"Motivated by those observations we aimed at generating","bbox":{"l":62.067001,"t":439.3894,"r":286.36499,"b":448.2959599999999,"coord_origin":"1"}},{"id":33,"text":"a synthetic table dataset named","bbox":{"l":50.112,"t":451.34439,"r":172.14388,"b":460.25095,"coord_origin":"1"}},{"id":34,"text":"SynthTabNet","bbox":{"l":174.14801,"t":451.43405,"r":224.70818999999997,"b":460.02182,"coord_origin":"1"}},{"id":35,"text":". This approach","bbox":{"l":224.70801,"t":451.34439,"r":286.36655,"b":460.25095,"coord_origin":"1"}},{"id":36,"text":"offers control over: 1) the size of the dataset, 2) the table","bbox":{"l":50.112015,"t":463.30038,"r":286.36505,"b":472.20694,"coord_origin":"1"}},{"id":37,"text":"structure, 3) the table style and 4) the type of content. The","bbox":{"l":50.112015,"t":475.25537,"r":286.36511,"b":484.16193,"coord_origin":"1"}},{"id":38,"text":"complexity of the table structure is described by the size of","bbox":{"l":50.112015,"t":487.21036,"r":286.36511,"b":496.11691,"coord_origin":"1"}},{"id":39,"text":"the table header and the table body, as well as the percentage","bbox":{"l":50.112015,"t":499.16534,"r":286.36508,"b":508.0719,"coord_origin":"1"}},{"id":40,"text":"of the table cells covered by row spans and column spans.","bbox":{"l":50.112015,"t":511.12033,"r":286.36505,"b":520.02689,"coord_origin":"1"}},{"id":41,"text":"A set of carefully designed styling templates provides the","bbox":{"l":50.112015,"t":523.07632,"r":286.36508,"b":531.98288,"coord_origin":"1"}},{"id":42,"text":"basis to build a wide range of table appearances. Lastly, the","bbox":{"l":50.112015,"t":535.0313100000001,"r":286.36508,"b":543.93788,"coord_origin":"1"}},{"id":43,"text":"table content is generated out of a curated collection of text","bbox":{"l":50.112015,"t":546.98633,"r":286.36511,"b":555.89288,"coord_origin":"1"}},{"id":44,"text":"corpora. By controlling the size and scope of the synthetic","bbox":{"l":50.112015,"t":558.94133,"r":286.36508,"b":567.84789,"coord_origin":"1"}},{"id":45,"text":"datasets we are able to train and evaluate our models in a","bbox":{"l":50.112015,"t":570.89633,"r":286.36511,"b":579.8028899999999,"coord_origin":"1"}},{"id":46,"text":"variety of different conditions. For example, we can first","bbox":{"l":50.112015,"t":582.85133,"r":286.36511,"b":591.75789,"coord_origin":"1"}},{"id":47,"text":"generate a highly diverse dataset to train our models and","bbox":{"l":50.112015,"t":594.80733,"r":286.36505,"b":603.71388,"coord_origin":"1"}},{"id":48,"text":"then evaluate their performance on other synthetic datasets","bbox":{"l":50.112015,"t":606.76233,"r":286.36508,"b":615.6688800000001,"coord_origin":"1"}},{"id":49,"text":"which are focused on a specific domain.","bbox":{"l":50.112015,"t":618.71733,"r":209.7527,"b":627.62389,"coord_origin":"1"}}]},{"id":4,"label":"Text","bbox":{"l":49.457112550735474,"t":631.730834197998,"r":286.5352872848511,"b":713.151894,"coord_origin":"1"},"confidence":0.9875262975692749,"cells":[{"id":50,"text":"In this regard, we have prepared four synthetic datasets,","bbox":{"l":62.067017,"t":632.51433,"r":286.36499,"b":641.42088,"coord_origin":"1"}},{"id":51,"text":"each one containing 150k examples. The corpora to gener-","bbox":{"l":50.112015,"t":644.46933,"r":286.36508,"b":653.37589,"coord_origin":"1"}},{"id":52,"text":"ate the table text consists of the most frequent terms appear-","bbox":{"l":50.112015,"t":656.42532,"r":286.36511,"b":665.33189,"coord_origin":"1"}},{"id":53,"text":"ing in PubTabNet and FinTabNet together with randomly","bbox":{"l":50.112015,"t":668.38033,"r":286.36505,"b":677.28689,"coord_origin":"1"}},{"id":54,"text":"generated text. The first two synthetic datasets have been","bbox":{"l":50.112015,"t":680.33533,"r":286.36508,"b":689.24189,"coord_origin":"1"}},{"id":55,"text":"fine-tuned to mimic the appearance of the original datasets","bbox":{"l":50.112015,"t":692.290329,"r":286.36508,"b":701.196892,"coord_origin":"1"}},{"id":56,"text":"but encompass more complicated table structures. The third","bbox":{"l":50.112015,"t":704.245331,"r":286.36511,"b":713.151894,"coord_origin":"1"}}]},{"id":5,"label":"Table","bbox":{"l":309.9828769683838,"t":72.70983781814573,"r":542.3903228759765,"b":155.58427963256838,"coord_origin":"1"},"confidence":0.9519661068916321,"cells":[{"id":57,"text":"Tags","bbox":{"l":412.332,"t":73.61437999999998,"r":430.90231,"b":82.52094,"coord_origin":"1"}},{"id":58,"text":"Bbox","bbox":{"l":442.85742,"t":73.61437999999998,"r":464.4463799999999,"b":82.52094,"coord_origin":"1"}},{"id":59,"text":"Size","bbox":{"l":477.78632,"t":73.61437999999998,"r":494.94193,"b":82.52094,"coord_origin":"1"}},{"id":60,"text":"Format","bbox":{"l":508.28186,"t":73.61437999999998,"r":536.91437,"b":82.52094,"coord_origin":"1"}},{"id":61,"text":"PubTabNet","bbox":{"l":317.06,"t":85.9673499999999,"r":361.64264,"b":94.87390000000005,"coord_origin":"1"}},{"id":62,"text":"3","bbox":{"l":417.85599,"t":85.6684600000001,"r":425.37775,"b":94.88385000000017,"coord_origin":"1"}},{"id":63,"text":"3","bbox":{"l":449.89569,"t":85.6684600000001,"r":457.41745000000003,"b":94.88385000000017,"coord_origin":"1"}},{"id":64,"text":"509k","bbox":{"l":476.401,"t":85.9673499999999,"r":496.3262,"b":94.87390000000005,"coord_origin":"1"}},{"id":65,"text":"PNG","bbox":{"l":512.63495,"t":85.9673499999999,"r":532.56012,"b":94.87390000000005,"coord_origin":"1"}},{"id":66,"text":"FinTabNet","bbox":{"l":317.06,"t":97.92236000000003,"r":359.43094,"b":106.82892000000004,"coord_origin":"1"}},{"id":67,"text":"3","bbox":{"l":417.85599,"t":97.62347,"r":425.37775,"b":106.83887000000016,"coord_origin":"1"}},{"id":68,"text":"3","bbox":{"l":449.89569,"t":97.62347,"r":457.41745000000003,"b":106.83887000000016,"coord_origin":"1"}},{"id":69,"text":"112k","bbox":{"l":476.401,"t":97.92236000000003,"r":496.3262,"b":106.82892000000004,"coord_origin":"1"}},{"id":70,"text":"PDF","bbox":{"l":513.46185,"t":97.92236000000003,"r":531.73328,"b":106.82892000000004,"coord_origin":"1"}},{"id":71,"text":"TableBank","bbox":{"l":317.06,"t":109.87836000000004,"r":359.97888,"b":118.78490999999997,"coord_origin":"1"}},{"id":72,"text":"3","bbox":{"l":417.85599,"t":109.57947000000001,"r":425.37775,"b":118.79485999999997,"coord_origin":"1"}},{"id":73,"text":"7","bbox":{"l":450.81226,"t":109.57947000000001,"r":456.50091999999995,"b":118.79485999999997,"coord_origin":"1"}},{"id":74,"text":"145k","bbox":{"l":476.401,"t":109.87836000000004,"r":496.3262,"b":118.78490999999997,"coord_origin":"1"}},{"id":75,"text":"JPEG","bbox":{"l":511.25017999999994,"t":109.87836000000004,"r":533.94501,"b":118.78490999999997,"coord_origin":"1"}},{"id":76,"text":"Combined-Tabnet(*)","bbox":{"l":317.06,"t":121.83336999999995,"r":400.37723,"b":130.73992999999996,"coord_origin":"1"}},{"id":77,"text":"3","bbox":{"l":417.85599,"t":121.53448000000003,"r":425.37775,"b":130.74987999999996,"coord_origin":"1"}},{"id":78,"text":"3","bbox":{"l":449.89569,"t":121.53448000000003,"r":457.41745000000003,"b":130.74987999999996,"coord_origin":"1"}},{"id":79,"text":"400k","bbox":{"l":476.401,"t":121.83336999999995,"r":496.3262,"b":130.73992999999996,"coord_origin":"1"}},{"id":80,"text":"PNG","bbox":{"l":512.63495,"t":121.83336999999995,"r":532.56012,"b":130.73992999999996,"coord_origin":"1"}},{"id":81,"text":"Combined(**)","bbox":{"l":317.06,"t":133.78839000000005,"r":375.17184,"b":142.69494999999995,"coord_origin":"1"}},{"id":82,"text":"3","bbox":{"l":417.85599,"t":133.48950000000002,"r":425.37775,"b":142.70489999999995,"coord_origin":"1"}},{"id":83,"text":"3","bbox":{"l":449.89569,"t":133.48950000000002,"r":457.41745000000003,"b":142.70489999999995,"coord_origin":"1"}},{"id":84,"text":"500k","bbox":{"l":476.401,"t":133.78839000000005,"r":496.3262,"b":142.69494999999995,"coord_origin":"1"}},{"id":85,"text":"PNG","bbox":{"l":512.63495,"t":133.78839000000005,"r":532.56012,"b":142.69494999999995,"coord_origin":"1"}},{"id":86,"text":"SynthTabNet","bbox":{"l":317.06,"t":145.74341000000004,"r":369.39352,"b":154.64995999999996,"coord_origin":"1"}},{"id":87,"text":"3","bbox":{"l":417.85599,"t":145.44446000000005,"r":425.37775,"b":154.65985,"coord_origin":"1"}},{"id":88,"text":"3","bbox":{"l":449.89569,"t":145.44446000000005,"r":457.41745000000003,"b":154.65985,"coord_origin":"1"}},{"id":89,"text":"600k","bbox":{"l":476.401,"t":145.74334999999996,"r":496.3262,"b":154.6499,"coord_origin":"1"}},{"id":90,"text":"PNG","bbox":{"l":512.63495,"t":145.74334999999996,"r":532.56012,"b":154.6499,"coord_origin":"1"}}]},{"id":6,"label":"Caption","bbox":{"l":307.6622022628784,"t":166.97985019683836,"r":545.11505,"b":224.38897999999995,"coord_origin":"1"},"confidence":0.9559885263442993,"cells":[{"id":91,"text":"Table 1:","bbox":{"l":308.862,"t":167.66138,"r":344.6178,"b":176.56793000000005,"coord_origin":"1"}},{"id":92,"text":"Both","bbox":{"l":361.07602,"t":167.66138,"r":380.45328,"b":176.56793000000005,"coord_origin":"1"}},{"id":93,"text":"“Combined-Tabnet”","bbox":{"l":386.56799,"t":167.75104,"r":468.67974999999996,"b":176.33880999999997,"coord_origin":"1"}},{"id":94,"text":"and","bbox":{"l":474.79599,"t":167.66138,"r":489.18198,"b":176.56793000000005,"coord_origin":"1"}},{"id":95,"text":"”Combined-","bbox":{"l":495.29898000000003,"t":167.75104,"r":545.112,"b":176.33880999999997,"coord_origin":"1"}},{"id":96,"text":"Tabnet”","bbox":{"l":308.862,"t":179.70605,"r":341.16077,"b":188.29381999999998,"coord_origin":"1"}},{"id":97,"text":"are variations of the following: (*) The Combined-","bbox":{"l":343.457,"t":179.61639000000002,"r":545.11005,"b":188.52295000000004,"coord_origin":"1"}},{"id":98,"text":"Tabnet dataset is the processed combination of PubTabNet","bbox":{"l":308.862,"t":191.57141000000001,"r":545.11505,"b":200.47797000000003,"coord_origin":"1"}},{"id":99,"text":"and Fintabnet. (**) The combined dataset is the processed","bbox":{"l":308.862,"t":203.52643,"r":545.11499,"b":212.43298000000004,"coord_origin":"1"}},{"id":100,"text":"combination of PubTabNet, Fintabnet and TableBank.","bbox":{"l":308.862,"t":215.48242000000005,"r":523.93469,"b":224.38897999999995,"coord_origin":"1"}}]},{"id":7,"label":"Text","bbox":{"l":307.8670337677002,"t":249.03968067169194,"r":545.144327545166,"b":294.39197,"coord_origin":"1"},"confidence":0.9780857563018799,"cells":[{"id":101,"text":"one adopts a colorful appearance with high contrast and the","bbox":{"l":308.862,"t":249.62041999999997,"r":545.11517,"b":258.52698,"coord_origin":"1"}},{"id":102,"text":"last one contains tables with sparse content. Lastly, we have","bbox":{"l":308.862,"t":261.57543999999996,"r":545.11517,"b":270.48199,"coord_origin":"1"}},{"id":103,"text":"combined all synthetic datasets into one big unified syn-","bbox":{"l":308.862,"t":273.5304,"r":545.11505,"b":282.43698,"coord_origin":"1"}},{"id":104,"text":"thetic dataset of 600k examples.","bbox":{"l":308.862,"t":285.48541000000006,"r":436.82169,"b":294.39197,"coord_origin":"1"}}]},{"id":8,"label":"Text","bbox":{"l":320.1442880630493,"t":297.1657905578613,"r":542.74396,"b":306.67896,"coord_origin":"1"},"confidence":0.8936340808868408,"cells":[{"id":105,"text":"Tab. 1 summarizes the various attributes of the datasets.","bbox":{"l":320.81699,"t":297.77240000000006,"r":542.74396,"b":306.67896,"coord_origin":"1"}}]},{"id":9,"label":"Section-header","bbox":{"l":307.9104280471802,"t":320.20690155029297,"r":444.93607000000003,"b":331.93167000000005,"coord_origin":"1"},"confidence":0.9415073394775391,"cells":[{"id":106,"text":"4.","bbox":{"l":308.862,"t":321.18396,"r":316.28476,"b":331.93167000000005,"coord_origin":"1"}},{"id":107,"text":"The TableFormer model","bbox":{"l":326.18176,"t":321.18396,"r":444.93607000000003,"b":331.93167000000005,"coord_origin":"1"}}]},{"id":10,"label":"Text","bbox":{"l":307.7100322723389,"t":340.7741077423096,"r":545.5623950958252,"b":447.6058799743652,"coord_origin":"1"},"confidence":0.9868811964988708,"cells":[{"id":108,"text":"Given the image of a table, TableFormer is able to pre-","bbox":{"l":320.81699,"t":341.93939,"r":545.11499,"b":350.84594999999996,"coord_origin":"1"}},{"id":109,"text":"dict: 1) a sequence of tokens that represent the structure of","bbox":{"l":308.862,"t":353.89438,"r":545.11511,"b":362.80092999999994,"coord_origin":"1"}},{"id":110,"text":"a table, and 2) a bounding box coupled to a subset of those","bbox":{"l":308.862,"t":365.84937,"r":545.11517,"b":374.75592,"coord_origin":"1"}},{"id":111,"text":"tokens. The conversion of an image into a sequence of to-","bbox":{"l":308.862,"t":377.80435,"r":545.11505,"b":386.71091,"coord_origin":"1"}},{"id":112,"text":"kens is a well-known task [35, 16]. While attention is often","bbox":{"l":308.862,"t":389.75934000000007,"r":545.11517,"b":398.66588999999993,"coord_origin":"1"}},{"id":113,"text":"used as an implicit method to associate each token of the","bbox":{"l":308.862,"t":401.71432000000004,"r":545.11523,"b":410.62088,"coord_origin":"1"}},{"id":114,"text":"sequence with a position in the original image, an explicit","bbox":{"l":308.862,"t":413.67032,"r":545.11517,"b":422.57687,"coord_origin":"1"}},{"id":115,"text":"association between the individual table-cells and the image","bbox":{"l":308.862,"t":425.62531,"r":545.11505,"b":434.53186,"coord_origin":"1"}},{"id":116,"text":"bounding boxes is also required.","bbox":{"l":308.862,"t":437.58029,"r":437.9375,"b":446.48685000000006,"coord_origin":"1"}}]},{"id":11,"label":"Section-header","bbox":{"l":307.7091567993164,"t":457.1930305480957,"r":420.16058,"b":467.54633,"coord_origin":"1"},"confidence":0.930464506149292,"cells":[{"id":117,"text":"4.1.","bbox":{"l":308.862,"t":457.69427,"r":323.14081,"b":467.54633,"coord_origin":"1"}},{"id":118,"text":"Model architecture.","bbox":{"l":332.66003,"t":457.69427,"r":420.16058,"b":467.54633,"coord_origin":"1"}}]},{"id":12,"label":"Text","bbox":{"l":307.7804786682129,"t":475.39464340209963,"r":545.63599319458,"b":664.99981,"coord_origin":"1"},"confidence":0.9881805777549744,"cells":[{"id":119,"text":"We now describe in detail the proposed method, which","bbox":{"l":320.81699,"t":476.76529,"r":545.11487,"b":485.67184,"coord_origin":"1"}},{"id":120,"text":"is composed of three main components, see Fig.","bbox":{"l":308.862,"t":488.72028,"r":509.02054,"b":497.62683,"coord_origin":"1"}},{"id":121,"text":"4.","bbox":{"l":515.58588,"t":488.72028,"r":523.05786,"b":497.62683,"coord_origin":"1"}},{"id":122,"text":"Our","bbox":{"l":529.62323,"t":488.72028,"r":545.11505,"b":497.62683,"coord_origin":"1"}},{"id":123,"text":"CNN Backbone Network","bbox":{"l":308.862,"t":500.76492,"r":406.34601,"b":509.35269,"coord_origin":"1"}},{"id":124,"text":"encodes the input as a feature vec-","bbox":{"l":408.87201,"t":500.67526,"r":545.1106,"b":509.58182,"coord_origin":"1"}},{"id":125,"text":"tor of predefined length.","bbox":{"l":308.862,"t":512.63126,"r":409.39459,"b":521.53781,"coord_origin":"1"}},{"id":126,"text":"The input feature vector of the","bbox":{"l":416.72705,"t":512.63126,"r":545.11505,"b":521.53781,"coord_origin":"1"}},{"id":127,"text":"encoded image is passed to the","bbox":{"l":308.862,"t":524.58624,"r":436.194,"b":533.4928,"coord_origin":"1"}},{"id":128,"text":"Structure Decoder","bbox":{"l":439.526,"t":524.6759,"r":513.86694,"b":533.26367,"coord_origin":"1"}},{"id":129,"text":"to pro-","bbox":{"l":517.43201,"t":524.58624,"r":545.10815,"b":533.4928,"coord_origin":"1"}},{"id":130,"text":"duce a sequence of HTML tags that represent the structure","bbox":{"l":308.862,"t":536.54124,"r":545.11511,"b":545.4478,"coord_origin":"1"}},{"id":131,"text":"of the table.","bbox":{"l":308.862,"t":548.49625,"r":358.5455,"b":557.4028000000001,"coord_origin":"1"}},{"id":132,"text":"With each prediction of an HTML standard","bbox":{"l":365.19055,"t":548.49625,"r":545.11517,"b":557.4028000000001,"coord_origin":"1"}},{"id":133,"text":"data cell (‘","bbox":{"l":308.862,"t":560.45125,"r":352.40851,"b":569.3578,"coord_origin":"1"}},{"id":134,"text":"<","bbox":{"l":352.409,"t":560.29184,"r":360.1579,"b":569.13863,"coord_origin":"1"}},{"id":135,"text":"td","bbox":{"l":360.15799,"t":560.45125,"r":367.90891,"b":569.3578,"coord_origin":"1"}},{"id":136,"text":">","bbox":{"l":367.909,"t":560.29184,"r":375.6579,"b":569.13863,"coord_origin":"1"}},{"id":137,"text":"’) the hidden state of that cell is passed to","bbox":{"l":375.65799,"t":560.45125,"r":545.11182,"b":569.3578,"coord_origin":"1"}},{"id":138,"text":"the Cell BBox Decoder. As for spanning cells, such as row","bbox":{"l":308.862,"t":572.40724,"r":545.11499,"b":581.3138,"coord_origin":"1"}},{"id":139,"text":"or column span, the tag is broken down to ‘","bbox":{"l":308.862,"t":584.3622399999999,"r":483.11768,"b":593.2688,"coord_origin":"1"}},{"id":140,"text":"<","bbox":{"l":483.11902,"t":584.20284,"r":490.86792,"b":593.04962,"coord_origin":"1"}},{"id":141,"text":"’, ‘rowspan=’","bbox":{"l":490.86800999999997,"t":584.3622399999999,"r":545.11438,"b":593.2688,"coord_origin":"1"}},{"id":142,"text":"or ‘colspan=’, with the number of spanning cells (attribute),","bbox":{"l":308.862,"t":596.31725,"r":545.11493,"b":605.2238,"coord_origin":"1"}},{"id":143,"text":"and ‘","bbox":{"l":308.862,"t":608.27225,"r":329.64395,"b":617.1788,"coord_origin":"1"}},{"id":144,"text":">","bbox":{"l":329.646,"t":608.11284,"r":337.3949,"b":616.9596300000001,"coord_origin":"1"}},{"id":145,"text":"’. The hidden state attached to ‘","bbox":{"l":337.39398,"t":608.27225,"r":468.5914,"b":617.1788,"coord_origin":"1"}},{"id":146,"text":"<","bbox":{"l":468.59496999999993,"t":608.11284,"r":476.34387000000004,"b":616.9596300000001,"coord_origin":"1"}},{"id":147,"text":"’ is passed to the","bbox":{"l":476.3439599999999,"t":608.27225,"r":545.11572,"b":617.1788,"coord_origin":"1"}},{"id":148,"text":"Cell BBox Decoder. A shared feed forward network (FFN)","bbox":{"l":308.86197,"t":620.22725,"r":545.11499,"b":629.1338000000001,"coord_origin":"1"}},{"id":149,"text":"receives the hidden states from the Structure Decoder, to","bbox":{"l":308.86197,"t":632.1822500000001,"r":545.11517,"b":641.08881,"coord_origin":"1"}},{"id":150,"text":"provide the final detection predictions of the bounding box","bbox":{"l":308.86197,"t":644.13824,"r":545.11511,"b":653.0448,"coord_origin":"1"}},{"id":151,"text":"coordinates and their classification.","bbox":{"l":308.86197,"t":656.09325,"r":449.42432,"b":664.99981,"coord_origin":"1"}}]},{"id":13,"label":"Text","bbox":{"l":307.78812446594236,"t":667.3782211303711,"r":545.5024543762207,"b":713.2305267333985,"coord_origin":"1"},"confidence":0.9761532545089722,"cells":[{"id":152,"text":"CNN Backbone Network.","bbox":{"l":320.81696,"t":668.2607,"r":431.90985,"b":677.21707,"coord_origin":"1"}},{"id":153,"text":"A ResNet-18 CNN is the","bbox":{"l":439.49896,"t":668.3802499999999,"r":545.11255,"b":677.2868100000001,"coord_origin":"1"}},{"id":154,"text":"backbone that receives the table image and encodes it as a","bbox":{"l":308.86197,"t":680.33525,"r":545.11499,"b":689.24181,"coord_origin":"1"}},{"id":155,"text":"vector of predefined length. The network has been modified","bbox":{"l":308.86197,"t":692.290253,"r":545.11511,"b":701.196815,"coord_origin":"1"}},{"id":156,"text":"by removing the linear and pooling layer, as we are not per-","bbox":{"l":308.86197,"t":704.245255,"r":545.11505,"b":713.1518169999999,"coord_origin":"1"}}]},{"id":14,"label":"Page-footer","bbox":{"l":294.49780197143554,"t":733.5627182006837,"r":300.23938751220703,"b":743.039814,"coord_origin":"1"},"confidence":0.8719172477722168,"cells":[{"id":157,"text":"4","bbox":{"l":295.12097,"t":734.133251,"r":300.10226,"b":743.039814,"coord_origin":"1"}}]}]},"tablestructure":{"table_map":{"5":{"label":"Table","id":5,"page_no":3,"cluster":{"id":5,"label":"Table","bbox":{"l":309.9828769683838,"t":72.70983781814573,"r":542.3903228759765,"b":155.58427963256838,"coord_origin":"1"},"confidence":0.9519661068916321,"cells":[{"id":57,"text":"Tags","bbox":{"l":412.332,"t":73.61437999999998,"r":430.90231,"b":82.52094,"coord_origin":"1"}},{"id":58,"text":"Bbox","bbox":{"l":442.85742,"t":73.61437999999998,"r":464.4463799999999,"b":82.52094,"coord_origin":"1"}},{"id":59,"text":"Size","bbox":{"l":477.78632,"t":73.61437999999998,"r":494.94193,"b":82.52094,"coord_origin":"1"}},{"id":60,"text":"Format","bbox":{"l":508.28186,"t":73.61437999999998,"r":536.91437,"b":82.52094,"coord_origin":"1"}},{"id":61,"text":"PubTabNet","bbox":{"l":317.06,"t":85.9673499999999,"r":361.64264,"b":94.87390000000005,"coord_origin":"1"}},{"id":62,"text":"3","bbox":{"l":417.85599,"t":85.6684600000001,"r":425.37775,"b":94.88385000000017,"coord_origin":"1"}},{"id":63,"text":"3","bbox":{"l":449.89569,"t":85.6684600000001,"r":457.41745000000003,"b":94.88385000000017,"coord_origin":"1"}},{"id":64,"text":"509k","bbox":{"l":476.401,"t":85.9673499999999,"r":496.3262,"b":94.87390000000005,"coord_origin":"1"}},{"id":65,"text":"PNG","bbox":{"l":512.63495,"t":85.9673499999999,"r":532.56012,"b":94.87390000000005,"coord_origin":"1"}},{"id":66,"text":"FinTabNet","bbox":{"l":317.06,"t":97.92236000000003,"r":359.43094,"b":106.82892000000004,"coord_origin":"1"}},{"id":67,"text":"3","bbox":{"l":417.85599,"t":97.62347,"r":425.37775,"b":106.83887000000016,"coord_origin":"1"}},{"id":68,"text":"3","bbox":{"l":449.89569,"t":97.62347,"r":457.41745000000003,"b":106.83887000000016,"coord_origin":"1"}},{"id":69,"text":"112k","bbox":{"l":476.401,"t":97.92236000000003,"r":496.3262,"b":106.82892000000004,"coord_origin":"1"}},{"id":70,"text":"PDF","bbox":{"l":513.46185,"t":97.92236000000003,"r":531.73328,"b":106.82892000000004,"coord_origin":"1"}},{"id":71,"text":"TableBank","bbox":{"l":317.06,"t":109.87836000000004,"r":359.97888,"b":118.78490999999997,"coord_origin":"1"}},{"id":72,"text":"3","bbox":{"l":417.85599,"t":109.57947000000001,"r":425.37775,"b":118.79485999999997,"coord_origin":"1"}},{"id":73,"text":"7","bbox":{"l":450.81226,"t":109.57947000000001,"r":456.50091999999995,"b":118.79485999999997,"coord_origin":"1"}},{"id":74,"text":"145k","bbox":{"l":476.401,"t":109.87836000000004,"r":496.3262,"b":118.78490999999997,"coord_origin":"1"}},{"id":75,"text":"JPEG","bbox":{"l":511.25017999999994,"t":109.87836000000004,"r":533.94501,"b":118.78490999999997,"coord_origin":"1"}},{"id":76,"text":"Combined-Tabnet(*)","bbox":{"l":317.06,"t":121.83336999999995,"r":400.37723,"b":130.73992999999996,"coord_origin":"1"}},{"id":77,"text":"3","bbox":{"l":417.85599,"t":121.53448000000003,"r":425.37775,"b":130.74987999999996,"coord_origin":"1"}},{"id":78,"text":"3","bbox":{"l":449.89569,"t":121.53448000000003,"r":457.41745000000003,"b":130.74987999999996,"coord_origin":"1"}},{"id":79,"text":"400k","bbox":{"l":476.401,"t":121.83336999999995,"r":496.3262,"b":130.73992999999996,"coord_origin":"1"}},{"id":80,"text":"PNG","bbox":{"l":512.63495,"t":121.83336999999995,"r":532.56012,"b":130.73992999999996,"coord_origin":"1"}},{"id":81,"text":"Combined(**)","bbox":{"l":317.06,"t":133.78839000000005,"r":375.17184,"b":142.69494999999995,"coord_origin":"1"}},{"id":82,"text":"3","bbox":{"l":417.85599,"t":133.48950000000002,"r":425.37775,"b":142.70489999999995,"coord_origin":"1"}},{"id":83,"text":"3","bbox":{"l":449.89569,"t":133.48950000000002,"r":457.41745000000003,"b":142.70489999999995,"coord_origin":"1"}},{"id":84,"text":"500k","bbox":{"l":476.401,"t":133.78839000000005,"r":496.3262,"b":142.69494999999995,"coord_origin":"1"}},{"id":85,"text":"PNG","bbox":{"l":512.63495,"t":133.78839000000005,"r":532.56012,"b":142.69494999999995,"coord_origin":"1"}},{"id":86,"text":"SynthTabNet","bbox":{"l":317.06,"t":145.74341000000004,"r":369.39352,"b":154.64995999999996,"coord_origin":"1"}},{"id":87,"text":"3","bbox":{"l":417.85599,"t":145.44446000000005,"r":425.37775,"b":154.65985,"coord_origin":"1"}},{"id":88,"text":"3","bbox":{"l":449.89569,"t":145.44446000000005,"r":457.41745000000003,"b":154.65985,"coord_origin":"1"}},{"id":89,"text":"600k","bbox":{"l":476.401,"t":145.74334999999996,"r":496.3262,"b":154.6499,"coord_origin":"1"}},{"id":90,"text":"PNG","bbox":{"l":512.63495,"t":145.74334999999996,"r":532.56012,"b":154.6499,"coord_origin":"1"}}]},"text":null,"otsl_seq":["ecel","ched","ched","ched","ched","nl","rhed","fcel","fcel","fcel","fcel","nl","rhed","fcel","fcel","fcel","fcel","nl","rhed","fcel","fcel","fcel","fcel","nl","rhed","fcel","fcel","fcel","fcel","nl","rhed","fcel","fcel","fcel","fcel","nl","rhed","fcel","fcel","fcel","fcel","nl"],"num_rows":7,"num_cols":5,"table_cells":[{"bbox":{"l":412.332,"t":73.61437999999998,"r":430.90231,"b":82.52094,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"Tags","column_header":true,"row_header":false,"row_section":false},{"bbox":{"l":442.85742,"t":73.61437999999998,"r":464.4463799999999,"b":82.52094,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"Bbox","column_header":true,"row_header":false,"row_section":false},{"bbox":{"l":477.78632,"t":73.61437999999998,"r":494.94193,"b":82.52094,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"Size","column_header":true,"row_header":false,"row_section":false},{"bbox":{"l":508.28186,"t":73.61437999999998,"r":536.91437,"b":82.52094,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"Format","column_header":true,"row_header":false,"row_section":false},{"bbox":{"l":317.06,"t":85.9673499999999,"r":361.64264,"b":94.87390000000005,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"PubTabNet","column_header":false,"row_header":true,"row_section":false},{"bbox":{"l":417.85599,"t":85.6684600000001,"r":425.37775,"b":94.88385000000017,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":449.89569,"t":85.6684600000001,"r":457.41745000000003,"b":94.88385000000017,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":476.401,"t":85.9673499999999,"r":496.3262,"b":94.87390000000005,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"509k","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":512.63495,"t":85.9673499999999,"r":532.56012,"b":94.87390000000005,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"PNG","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":317.06,"t":97.92236000000003,"r":359.43094,"b":106.82892000000004,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"FinTabNet","column_header":false,"row_header":true,"row_section":false},{"bbox":{"l":417.85599,"t":97.62347,"r":425.37775,"b":106.83887000000016,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":449.89569,"t":97.62347,"r":457.41745000000003,"b":106.83887000000016,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":476.401,"t":97.92236000000003,"r":496.3262,"b":106.82892000000004,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"112k","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":513.46185,"t":97.92236000000003,"r":531.73328,"b":106.82892000000004,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"PDF","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":317.06,"t":109.87836000000004,"r":359.97888,"b":118.78490999999997,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"TableBank","column_header":false,"row_header":true,"row_section":false},{"bbox":{"l":417.85599,"t":109.57947000000001,"r":425.37775,"b":118.79485999999997,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":450.81226,"t":109.57947000000001,"r":456.50091999999995,"b":118.79485999999997,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"7","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":476.401,"t":109.87836000000004,"r":496.3262,"b":118.78490999999997,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"145k","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":511.25017999999994,"t":109.87836000000004,"r":533.94501,"b":118.78490999999997,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"JPEG","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":317.06,"t":121.83336999999995,"r":400.37723,"b":130.73992999999996,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"Combined-Tabnet(*)","column_header":false,"row_header":true,"row_section":false},{"bbox":{"l":417.85599,"t":121.53448000000003,"r":425.37775,"b":130.74987999999996,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":449.89569,"t":121.53448000000003,"r":457.41745000000003,"b":130.74987999999996,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":476.401,"t":121.83336999999995,"r":496.3262,"b":130.73992999999996,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"400k","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":512.63495,"t":121.83336999999995,"r":532.56012,"b":130.73992999999996,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"PNG","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":317.06,"t":133.78839000000005,"r":375.17184,"b":142.69494999999995,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":5,"end_row_offset_idx":6,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"Combined(**)","column_header":false,"row_header":true,"row_section":false},{"bbox":{"l":417.85599,"t":133.48950000000002,"r":425.37775,"b":142.70489999999995,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":5,"end_row_offset_idx":6,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":449.89569,"t":133.48950000000002,"r":457.41745000000003,"b":142.70489999999995,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":5,"end_row_offset_idx":6,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":476.401,"t":133.78839000000005,"r":496.3262,"b":142.69494999999995,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":5,"end_row_offset_idx":6,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"500k","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":512.63495,"t":133.78839000000005,"r":532.56012,"b":142.69494999999995,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":5,"end_row_offset_idx":6,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"PNG","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":317.06,"t":145.74341000000004,"r":369.39352,"b":154.64995999999996,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":6,"end_row_offset_idx":7,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"SynthTabNet","column_header":false,"row_header":true,"row_section":false},{"bbox":{"l":417.85599,"t":145.44446000000005,"r":425.37775,"b":154.65985,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":6,"end_row_offset_idx":7,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":449.89569,"t":145.44446000000005,"r":457.41745000000003,"b":154.65985,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":6,"end_row_offset_idx":7,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":476.401,"t":145.74334999999996,"r":496.3262,"b":154.6499,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":6,"end_row_offset_idx":7,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"600k","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":512.63495,"t":145.74334999999996,"r":532.56012,"b":154.6499,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":6,"end_row_offset_idx":7,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"PNG","column_header":false,"row_header":false,"row_section":false}]}}},"figures_classification":null,"equations_prediction":null},"assembled":{"elements":[{"label":"Text","id":0,"page_no":3,"cluster":{"id":0,"label":"Text","bbox":{"l":49.14799032211304,"t":74.29706525802612,"r":286.36511,"b":96.06994999999995,"coord_origin":"1"},"confidence":0.9636521339416504,"cells":[{"id":0,"text":"amount of such tables, and kept only those ones ranging","bbox":{"l":50.112,"t":75.20836999999995,"r":286.36511,"b":84.11492999999996,"coord_origin":"1"}},{"id":1,"text":"between 1*1 and 20*10 (rows/columns).","bbox":{"l":50.112,"t":87.16339000000005,"r":212.28319,"b":96.06994999999995,"coord_origin":"1"}}]},"text":"amount of such tables, and kept only those ones ranging between 1*1 and 20*10 (rows/columns)."},{"label":"Text","id":1,"page_no":3,"cluster":{"id":1,"label":"Text","bbox":{"l":49.21861910820007,"t":100.05254344940181,"r":286.5638860702515,"b":313.10507,"coord_origin":"1"},"confidence":0.9862114191055298,"cells":[{"id":2,"text":"The availability of the bounding boxes for all table cells","bbox":{"l":62.067001,"t":100.96038999999996,"r":286.36502,"b":109.86694,"coord_origin":"1"}},{"id":3,"text":"is essential to train our models. In order to distinguish be-","bbox":{"l":50.112,"t":112.91540999999995,"r":286.36508,"b":121.82195999999999,"coord_origin":"1"}},{"id":4,"text":"tween empty and non-empty bounding boxes, we have in-","bbox":{"l":50.112,"t":124.87041999999997,"r":286.36508,"b":133.77697999999998,"coord_origin":"1"}},{"id":5,"text":"troduced a binary class in the annotation. Unfortunately, the","bbox":{"l":50.112,"t":136.82641999999998,"r":286.36511,"b":145.73297000000002,"coord_origin":"1"}},{"id":6,"text":"original datasets either omit the bounding boxes for whole","bbox":{"l":50.112,"t":148.78143,"r":286.36511,"b":157.68799,"coord_origin":"1"}},{"id":7,"text":"tables (e.g. TableBank) or they narrow their scope only to","bbox":{"l":50.112,"t":160.73645,"r":286.36508,"b":169.64301,"coord_origin":"1"}},{"id":8,"text":"non-empty cells. Therefore, it was imperative to introduce","bbox":{"l":50.112,"t":172.69146999999998,"r":286.36505,"b":181.59802000000002,"coord_origin":"1"}},{"id":9,"text":"a data pre-processing procedure that generates the missing","bbox":{"l":50.112,"t":184.64648,"r":286.36508,"b":193.55304,"coord_origin":"1"}},{"id":10,"text":"bounding boxes out of the annotation information. This pro-","bbox":{"l":50.112,"t":196.60248,"r":286.36508,"b":205.50903000000005,"coord_origin":"1"}},{"id":11,"text":"cedure first parses the provided table structure and calcu-","bbox":{"l":50.112,"t":208.5575,"r":286.36508,"b":217.46405000000004,"coord_origin":"1"}},{"id":12,"text":"lates the dimensions of the most fine-grained grid that cov-","bbox":{"l":50.112,"t":220.51251000000002,"r":286.36511,"b":229.41907000000003,"coord_origin":"1"}},{"id":13,"text":"ers the table structure. Notice that each table cell may oc-","bbox":{"l":50.112,"t":232.46753,"r":286.36508,"b":241.37408000000005,"coord_origin":"1"}},{"id":14,"text":"cupy multiple grid squares due to row or column spans. In","bbox":{"l":50.112,"t":244.42255,"r":286.36508,"b":253.32910000000004,"coord_origin":"1"}},{"id":15,"text":"case of PubTabNet we had to compute missing bounding","bbox":{"l":50.112,"t":256.37756,"r":286.36505,"b":265.28412000000003,"coord_origin":"1"}},{"id":16,"text":"boxes for 48% of the simple and 69% of the complex ta-","bbox":{"l":50.112,"t":268.33356000000003,"r":286.36505,"b":277.24010999999996,"coord_origin":"1"}},{"id":17,"text":"bles.","bbox":{"l":50.112,"t":280.28853999999995,"r":68.652397,"b":289.1951,"coord_origin":"1"}},{"id":18,"text":"Regarding FinTabNet, 68% of the simple and 98%","bbox":{"l":75.566444,"t":280.28853999999995,"r":286.36514,"b":289.1951,"coord_origin":"1"}},{"id":19,"text":"of the complex tables require the generation of bounding","bbox":{"l":50.112,"t":292.24353,"r":286.36511,"b":301.15009,"coord_origin":"1"}},{"id":20,"text":"boxes.","bbox":{"l":50.112,"t":304.19852000000003,"r":75.695961,"b":313.10507,"coord_origin":"1"}}]},"text":"The availability of the bounding boxes for all table cells is essential to train our models. In order to distinguish between empty and non-empty bounding boxes, we have introduced a binary class in the annotation. Unfortunately, the original datasets either omit the bounding boxes for whole tables (e.g. TableBank) or they narrow their scope only to non-empty cells. Therefore, it was imperative to introduce a data pre-processing procedure that generates the missing bounding boxes out of the annotation information. This procedure first parses the provided table structure and calculates the dimensions of the most fine-grained grid that covers the table structure. Notice that each table cell may occupy multiple grid squares due to row or column spans. In case of PubTabNet we had to compute missing bounding boxes for 48% of the simple and 69% of the complex tables. Regarding FinTabNet, 68% of the simple and 98% of the complex tables require the generation of bounding boxes."},{"label":"Text","id":2,"page_no":3,"cluster":{"id":2,"label":"Text","bbox":{"l":49.344085335731506,"t":317.04177474975586,"r":286.7534259796143,"b":435.1035724639893,"coord_origin":"1"},"confidence":0.9866138100624084,"cells":[{"id":21,"text":"As it is illustrated in Fig. 2, the table distributions from","bbox":{"l":62.067001,"t":317.99550999999997,"r":286.36499,"b":326.90207,"coord_origin":"1"}},{"id":22,"text":"all datasets are skewed towards simpler structures with","bbox":{"l":50.112,"t":329.95151,"r":286.36511,"b":338.8580600000001,"coord_origin":"1"}},{"id":23,"text":"fewer number of rows/columns. Additionally, there is very","bbox":{"l":50.112,"t":341.90649,"r":286.36502,"b":350.81305,"coord_origin":"1"}},{"id":24,"text":"limited variance in the table styles, which in case of Pub-","bbox":{"l":50.112,"t":353.8614799999999,"r":286.36505,"b":362.76804,"coord_origin":"1"}},{"id":25,"text":"TabNet and FinTabNet means one styling format for the","bbox":{"l":50.112,"t":365.81647,"r":286.36508,"b":374.72301999999996,"coord_origin":"1"}},{"id":26,"text":"majority of the tables.","bbox":{"l":50.112,"t":377.77145,"r":141.58859,"b":386.67801,"coord_origin":"1"}},{"id":27,"text":"Similar limitations appear also in","bbox":{"l":148.70189,"t":377.77145,"r":286.36508,"b":386.67801,"coord_origin":"1"}},{"id":28,"text":"the type of table content, which in some cases (e.g. FinTab-","bbox":{"l":50.112,"t":389.72644,"r":286.36508,"b":398.63300000000004,"coord_origin":"1"}},{"id":29,"text":"Net) is restricted to a certain domain. Ultimately, the lack","bbox":{"l":50.112,"t":401.68243,"r":286.36511,"b":410.58899,"coord_origin":"1"}},{"id":30,"text":"of diversity in the training dataset damages the ability of the","bbox":{"l":50.112,"t":413.63742,"r":286.36511,"b":422.54398,"coord_origin":"1"}},{"id":31,"text":"models to generalize well on unseen data.","bbox":{"l":50.112,"t":425.59241,"r":216.39774,"b":434.49896,"coord_origin":"1"}}]},"text":"As it is illustrated in Fig. 2, the table distributions from all datasets are skewed towards simpler structures with fewer number of rows/columns. Additionally, there is very limited variance in the table styles, which in case of PubTabNet and FinTabNet means one styling format for the majority of the tables. Similar limitations appear also in the type of table content, which in some cases (e.g. FinTabNet) is restricted to a certain domain. Ultimately, the lack of diversity in the training dataset damages the ability of the models to generalize well on unseen data."},{"label":"Text","id":3,"page_no":3,"cluster":{"id":3,"label":"Text","bbox":{"l":49.16742217540741,"t":438.18905181884764,"r":286.7348659515381,"b":627.6961944580079,"coord_origin":"1"},"confidence":0.9882907867431641,"cells":[{"id":32,"text":"Motivated by those observations we aimed at generating","bbox":{"l":62.067001,"t":439.3894,"r":286.36499,"b":448.2959599999999,"coord_origin":"1"}},{"id":33,"text":"a synthetic table dataset named","bbox":{"l":50.112,"t":451.34439,"r":172.14388,"b":460.25095,"coord_origin":"1"}},{"id":34,"text":"SynthTabNet","bbox":{"l":174.14801,"t":451.43405,"r":224.70818999999997,"b":460.02182,"coord_origin":"1"}},{"id":35,"text":". This approach","bbox":{"l":224.70801,"t":451.34439,"r":286.36655,"b":460.25095,"coord_origin":"1"}},{"id":36,"text":"offers control over: 1) the size of the dataset, 2) the table","bbox":{"l":50.112015,"t":463.30038,"r":286.36505,"b":472.20694,"coord_origin":"1"}},{"id":37,"text":"structure, 3) the table style and 4) the type of content. The","bbox":{"l":50.112015,"t":475.25537,"r":286.36511,"b":484.16193,"coord_origin":"1"}},{"id":38,"text":"complexity of the table structure is described by the size of","bbox":{"l":50.112015,"t":487.21036,"r":286.36511,"b":496.11691,"coord_origin":"1"}},{"id":39,"text":"the table header and the table body, as well as the percentage","bbox":{"l":50.112015,"t":499.16534,"r":286.36508,"b":508.0719,"coord_origin":"1"}},{"id":40,"text":"of the table cells covered by row spans and column spans.","bbox":{"l":50.112015,"t":511.12033,"r":286.36505,"b":520.02689,"coord_origin":"1"}},{"id":41,"text":"A set of carefully designed styling templates provides the","bbox":{"l":50.112015,"t":523.07632,"r":286.36508,"b":531.98288,"coord_origin":"1"}},{"id":42,"text":"basis to build a wide range of table appearances. Lastly, the","bbox":{"l":50.112015,"t":535.0313100000001,"r":286.36508,"b":543.93788,"coord_origin":"1"}},{"id":43,"text":"table content is generated out of a curated collection of text","bbox":{"l":50.112015,"t":546.98633,"r":286.36511,"b":555.89288,"coord_origin":"1"}},{"id":44,"text":"corpora. By controlling the size and scope of the synthetic","bbox":{"l":50.112015,"t":558.94133,"r":286.36508,"b":567.84789,"coord_origin":"1"}},{"id":45,"text":"datasets we are able to train and evaluate our models in a","bbox":{"l":50.112015,"t":570.89633,"r":286.36511,"b":579.8028899999999,"coord_origin":"1"}},{"id":46,"text":"variety of different conditions. For example, we can first","bbox":{"l":50.112015,"t":582.85133,"r":286.36511,"b":591.75789,"coord_origin":"1"}},{"id":47,"text":"generate a highly diverse dataset to train our models and","bbox":{"l":50.112015,"t":594.80733,"r":286.36505,"b":603.71388,"coord_origin":"1"}},{"id":48,"text":"then evaluate their performance on other synthetic datasets","bbox":{"l":50.112015,"t":606.76233,"r":286.36508,"b":615.6688800000001,"coord_origin":"1"}},{"id":49,"text":"which are focused on a specific domain.","bbox":{"l":50.112015,"t":618.71733,"r":209.7527,"b":627.62389,"coord_origin":"1"}}]},"text":"Motivated by those observations we aimed at generating a synthetic table dataset named SynthTabNet . This approach offers control over: 1) the size of the dataset, 2) the table structure, 3) the table style and 4) the type of content. The complexity of the table structure is described by the size of the table header and the table body, as well as the percentage of the table cells covered by row spans and column spans. A set of carefully designed styling templates provides the basis to build a wide range of table appearances. Lastly, the table content is generated out of a curated collection of text corpora. By controlling the size and scope of the synthetic datasets we are able to train and evaluate our models in a variety of different conditions. For example, we can first generate a highly diverse dataset to train our models and then evaluate their performance on other synthetic datasets which are focused on a specific domain."},{"label":"Text","id":4,"page_no":3,"cluster":{"id":4,"label":"Text","bbox":{"l":49.457112550735474,"t":631.730834197998,"r":286.5352872848511,"b":713.151894,"coord_origin":"1"},"confidence":0.9875262975692749,"cells":[{"id":50,"text":"In this regard, we have prepared four synthetic datasets,","bbox":{"l":62.067017,"t":632.51433,"r":286.36499,"b":641.42088,"coord_origin":"1"}},{"id":51,"text":"each one containing 150k examples. The corpora to gener-","bbox":{"l":50.112015,"t":644.46933,"r":286.36508,"b":653.37589,"coord_origin":"1"}},{"id":52,"text":"ate the table text consists of the most frequent terms appear-","bbox":{"l":50.112015,"t":656.42532,"r":286.36511,"b":665.33189,"coord_origin":"1"}},{"id":53,"text":"ing in PubTabNet and FinTabNet together with randomly","bbox":{"l":50.112015,"t":668.38033,"r":286.36505,"b":677.28689,"coord_origin":"1"}},{"id":54,"text":"generated text. The first two synthetic datasets have been","bbox":{"l":50.112015,"t":680.33533,"r":286.36508,"b":689.24189,"coord_origin":"1"}},{"id":55,"text":"fine-tuned to mimic the appearance of the original datasets","bbox":{"l":50.112015,"t":692.290329,"r":286.36508,"b":701.196892,"coord_origin":"1"}},{"id":56,"text":"but encompass more complicated table structures. The third","bbox":{"l":50.112015,"t":704.245331,"r":286.36511,"b":713.151894,"coord_origin":"1"}}]},"text":"In this regard, we have prepared four synthetic datasets, each one containing 150k examples. The corpora to generate the table text consists of the most frequent terms appearing in PubTabNet and FinTabNet together with randomly generated text. The first two synthetic datasets have been fine-tuned to mimic the appearance of the original datasets but encompass more complicated table structures. The third"},{"label":"Table","id":5,"page_no":3,"cluster":{"id":5,"label":"Table","bbox":{"l":309.9828769683838,"t":72.70983781814573,"r":542.3903228759765,"b":155.58427963256838,"coord_origin":"1"},"confidence":0.9519661068916321,"cells":[{"id":57,"text":"Tags","bbox":{"l":412.332,"t":73.61437999999998,"r":430.90231,"b":82.52094,"coord_origin":"1"}},{"id":58,"text":"Bbox","bbox":{"l":442.85742,"t":73.61437999999998,"r":464.4463799999999,"b":82.52094,"coord_origin":"1"}},{"id":59,"text":"Size","bbox":{"l":477.78632,"t":73.61437999999998,"r":494.94193,"b":82.52094,"coord_origin":"1"}},{"id":60,"text":"Format","bbox":{"l":508.28186,"t":73.61437999999998,"r":536.91437,"b":82.52094,"coord_origin":"1"}},{"id":61,"text":"PubTabNet","bbox":{"l":317.06,"t":85.9673499999999,"r":361.64264,"b":94.87390000000005,"coord_origin":"1"}},{"id":62,"text":"3","bbox":{"l":417.85599,"t":85.6684600000001,"r":425.37775,"b":94.88385000000017,"coord_origin":"1"}},{"id":63,"text":"3","bbox":{"l":449.89569,"t":85.6684600000001,"r":457.41745000000003,"b":94.88385000000017,"coord_origin":"1"}},{"id":64,"text":"509k","bbox":{"l":476.401,"t":85.9673499999999,"r":496.3262,"b":94.87390000000005,"coord_origin":"1"}},{"id":65,"text":"PNG","bbox":{"l":512.63495,"t":85.9673499999999,"r":532.56012,"b":94.87390000000005,"coord_origin":"1"}},{"id":66,"text":"FinTabNet","bbox":{"l":317.06,"t":97.92236000000003,"r":359.43094,"b":106.82892000000004,"coord_origin":"1"}},{"id":67,"text":"3","bbox":{"l":417.85599,"t":97.62347,"r":425.37775,"b":106.83887000000016,"coord_origin":"1"}},{"id":68,"text":"3","bbox":{"l":449.89569,"t":97.62347,"r":457.41745000000003,"b":106.83887000000016,"coord_origin":"1"}},{"id":69,"text":"112k","bbox":{"l":476.401,"t":97.92236000000003,"r":496.3262,"b":106.82892000000004,"coord_origin":"1"}},{"id":70,"text":"PDF","bbox":{"l":513.46185,"t":97.92236000000003,"r":531.73328,"b":106.82892000000004,"coord_origin":"1"}},{"id":71,"text":"TableBank","bbox":{"l":317.06,"t":109.87836000000004,"r":359.97888,"b":118.78490999999997,"coord_origin":"1"}},{"id":72,"text":"3","bbox":{"l":417.85599,"t":109.57947000000001,"r":425.37775,"b":118.79485999999997,"coord_origin":"1"}},{"id":73,"text":"7","bbox":{"l":450.81226,"t":109.57947000000001,"r":456.50091999999995,"b":118.79485999999997,"coord_origin":"1"}},{"id":74,"text":"145k","bbox":{"l":476.401,"t":109.87836000000004,"r":496.3262,"b":118.78490999999997,"coord_origin":"1"}},{"id":75,"text":"JPEG","bbox":{"l":511.25017999999994,"t":109.87836000000004,"r":533.94501,"b":118.78490999999997,"coord_origin":"1"}},{"id":76,"text":"Combined-Tabnet(*)","bbox":{"l":317.06,"t":121.83336999999995,"r":400.37723,"b":130.73992999999996,"coord_origin":"1"}},{"id":77,"text":"3","bbox":{"l":417.85599,"t":121.53448000000003,"r":425.37775,"b":130.74987999999996,"coord_origin":"1"}},{"id":78,"text":"3","bbox":{"l":449.89569,"t":121.53448000000003,"r":457.41745000000003,"b":130.74987999999996,"coord_origin":"1"}},{"id":79,"text":"400k","bbox":{"l":476.401,"t":121.83336999999995,"r":496.3262,"b":130.73992999999996,"coord_origin":"1"}},{"id":80,"text":"PNG","bbox":{"l":512.63495,"t":121.83336999999995,"r":532.56012,"b":130.73992999999996,"coord_origin":"1"}},{"id":81,"text":"Combined(**)","bbox":{"l":317.06,"t":133.78839000000005,"r":375.17184,"b":142.69494999999995,"coord_origin":"1"}},{"id":82,"text":"3","bbox":{"l":417.85599,"t":133.48950000000002,"r":425.37775,"b":142.70489999999995,"coord_origin":"1"}},{"id":83,"text":"3","bbox":{"l":449.89569,"t":133.48950000000002,"r":457.41745000000003,"b":142.70489999999995,"coord_origin":"1"}},{"id":84,"text":"500k","bbox":{"l":476.401,"t":133.78839000000005,"r":496.3262,"b":142.69494999999995,"coord_origin":"1"}},{"id":85,"text":"PNG","bbox":{"l":512.63495,"t":133.78839000000005,"r":532.56012,"b":142.69494999999995,"coord_origin":"1"}},{"id":86,"text":"SynthTabNet","bbox":{"l":317.06,"t":145.74341000000004,"r":369.39352,"b":154.64995999999996,"coord_origin":"1"}},{"id":87,"text":"3","bbox":{"l":417.85599,"t":145.44446000000005,"r":425.37775,"b":154.65985,"coord_origin":"1"}},{"id":88,"text":"3","bbox":{"l":449.89569,"t":145.44446000000005,"r":457.41745000000003,"b":154.65985,"coord_origin":"1"}},{"id":89,"text":"600k","bbox":{"l":476.401,"t":145.74334999999996,"r":496.3262,"b":154.6499,"coord_origin":"1"}},{"id":90,"text":"PNG","bbox":{"l":512.63495,"t":145.74334999999996,"r":532.56012,"b":154.6499,"coord_origin":"1"}}]},"text":null,"otsl_seq":["ecel","ched","ched","ched","ched","nl","rhed","fcel","fcel","fcel","fcel","nl","rhed","fcel","fcel","fcel","fcel","nl","rhed","fcel","fcel","fcel","fcel","nl","rhed","fcel","fcel","fcel","fcel","nl","rhed","fcel","fcel","fcel","fcel","nl","rhed","fcel","fcel","fcel","fcel","nl"],"num_rows":7,"num_cols":5,"table_cells":[{"bbox":{"l":412.332,"t":73.61437999999998,"r":430.90231,"b":82.52094,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"Tags","column_header":true,"row_header":false,"row_section":false},{"bbox":{"l":442.85742,"t":73.61437999999998,"r":464.4463799999999,"b":82.52094,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"Bbox","column_header":true,"row_header":false,"row_section":false},{"bbox":{"l":477.78632,"t":73.61437999999998,"r":494.94193,"b":82.52094,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"Size","column_header":true,"row_header":false,"row_section":false},{"bbox":{"l":508.28186,"t":73.61437999999998,"r":536.91437,"b":82.52094,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"Format","column_header":true,"row_header":false,"row_section":false},{"bbox":{"l":317.06,"t":85.9673499999999,"r":361.64264,"b":94.87390000000005,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"PubTabNet","column_header":false,"row_header":true,"row_section":false},{"bbox":{"l":417.85599,"t":85.6684600000001,"r":425.37775,"b":94.88385000000017,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":449.89569,"t":85.6684600000001,"r":457.41745000000003,"b":94.88385000000017,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":476.401,"t":85.9673499999999,"r":496.3262,"b":94.87390000000005,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"509k","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":512.63495,"t":85.9673499999999,"r":532.56012,"b":94.87390000000005,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"PNG","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":317.06,"t":97.92236000000003,"r":359.43094,"b":106.82892000000004,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"FinTabNet","column_header":false,"row_header":true,"row_section":false},{"bbox":{"l":417.85599,"t":97.62347,"r":425.37775,"b":106.83887000000016,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":449.89569,"t":97.62347,"r":457.41745000000003,"b":106.83887000000016,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":476.401,"t":97.92236000000003,"r":496.3262,"b":106.82892000000004,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"112k","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":513.46185,"t":97.92236000000003,"r":531.73328,"b":106.82892000000004,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"PDF","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":317.06,"t":109.87836000000004,"r":359.97888,"b":118.78490999999997,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"TableBank","column_header":false,"row_header":true,"row_section":false},{"bbox":{"l":417.85599,"t":109.57947000000001,"r":425.37775,"b":118.79485999999997,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":450.81226,"t":109.57947000000001,"r":456.50091999999995,"b":118.79485999999997,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"7","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":476.401,"t":109.87836000000004,"r":496.3262,"b":118.78490999999997,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"145k","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":511.25017999999994,"t":109.87836000000004,"r":533.94501,"b":118.78490999999997,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"JPEG","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":317.06,"t":121.83336999999995,"r":400.37723,"b":130.73992999999996,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"Combined-Tabnet(*)","column_header":false,"row_header":true,"row_section":false},{"bbox":{"l":417.85599,"t":121.53448000000003,"r":425.37775,"b":130.74987999999996,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":449.89569,"t":121.53448000000003,"r":457.41745000000003,"b":130.74987999999996,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":476.401,"t":121.83336999999995,"r":496.3262,"b":130.73992999999996,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"400k","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":512.63495,"t":121.83336999999995,"r":532.56012,"b":130.73992999999996,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"PNG","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":317.06,"t":133.78839000000005,"r":375.17184,"b":142.69494999999995,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":5,"end_row_offset_idx":6,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"Combined(**)","column_header":false,"row_header":true,"row_section":false},{"bbox":{"l":417.85599,"t":133.48950000000002,"r":425.37775,"b":142.70489999999995,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":5,"end_row_offset_idx":6,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":449.89569,"t":133.48950000000002,"r":457.41745000000003,"b":142.70489999999995,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":5,"end_row_offset_idx":6,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":476.401,"t":133.78839000000005,"r":496.3262,"b":142.69494999999995,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":5,"end_row_offset_idx":6,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"500k","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":512.63495,"t":133.78839000000005,"r":532.56012,"b":142.69494999999995,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":5,"end_row_offset_idx":6,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"PNG","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":317.06,"t":145.74341000000004,"r":369.39352,"b":154.64995999999996,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":6,"end_row_offset_idx":7,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"SynthTabNet","column_header":false,"row_header":true,"row_section":false},{"bbox":{"l":417.85599,"t":145.44446000000005,"r":425.37775,"b":154.65985,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":6,"end_row_offset_idx":7,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":449.89569,"t":145.44446000000005,"r":457.41745000000003,"b":154.65985,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":6,"end_row_offset_idx":7,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":476.401,"t":145.74334999999996,"r":496.3262,"b":154.6499,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":6,"end_row_offset_idx":7,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"600k","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":512.63495,"t":145.74334999999996,"r":532.56012,"b":154.6499,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":6,"end_row_offset_idx":7,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"PNG","column_header":false,"row_header":false,"row_section":false}]},{"label":"Caption","id":6,"page_no":3,"cluster":{"id":6,"label":"Caption","bbox":{"l":307.6622022628784,"t":166.97985019683836,"r":545.11505,"b":224.38897999999995,"coord_origin":"1"},"confidence":0.9559885263442993,"cells":[{"id":91,"text":"Table 1:","bbox":{"l":308.862,"t":167.66138,"r":344.6178,"b":176.56793000000005,"coord_origin":"1"}},{"id":92,"text":"Both","bbox":{"l":361.07602,"t":167.66138,"r":380.45328,"b":176.56793000000005,"coord_origin":"1"}},{"id":93,"text":"“Combined-Tabnet”","bbox":{"l":386.56799,"t":167.75104,"r":468.67974999999996,"b":176.33880999999997,"coord_origin":"1"}},{"id":94,"text":"and","bbox":{"l":474.79599,"t":167.66138,"r":489.18198,"b":176.56793000000005,"coord_origin":"1"}},{"id":95,"text":"”Combined-","bbox":{"l":495.29898000000003,"t":167.75104,"r":545.112,"b":176.33880999999997,"coord_origin":"1"}},{"id":96,"text":"Tabnet”","bbox":{"l":308.862,"t":179.70605,"r":341.16077,"b":188.29381999999998,"coord_origin":"1"}},{"id":97,"text":"are variations of the following: (*) The Combined-","bbox":{"l":343.457,"t":179.61639000000002,"r":545.11005,"b":188.52295000000004,"coord_origin":"1"}},{"id":98,"text":"Tabnet dataset is the processed combination of PubTabNet","bbox":{"l":308.862,"t":191.57141000000001,"r":545.11505,"b":200.47797000000003,"coord_origin":"1"}},{"id":99,"text":"and Fintabnet. (**) The combined dataset is the processed","bbox":{"l":308.862,"t":203.52643,"r":545.11499,"b":212.43298000000004,"coord_origin":"1"}},{"id":100,"text":"combination of PubTabNet, Fintabnet and TableBank.","bbox":{"l":308.862,"t":215.48242000000005,"r":523.93469,"b":224.38897999999995,"coord_origin":"1"}}]},"text":"Table 1: Both “Combined-Tabnet” and ”CombinedTabnet” are variations of the following: (*) The CombinedTabnet dataset is the processed combination of PubTabNet and Fintabnet. (**) The combined dataset is the processed combination of PubTabNet, Fintabnet and TableBank."},{"label":"Text","id":7,"page_no":3,"cluster":{"id":7,"label":"Text","bbox":{"l":307.8670337677002,"t":249.03968067169194,"r":545.144327545166,"b":294.39197,"coord_origin":"1"},"confidence":0.9780857563018799,"cells":[{"id":101,"text":"one adopts a colorful appearance with high contrast and the","bbox":{"l":308.862,"t":249.62041999999997,"r":545.11517,"b":258.52698,"coord_origin":"1"}},{"id":102,"text":"last one contains tables with sparse content. Lastly, we have","bbox":{"l":308.862,"t":261.57543999999996,"r":545.11517,"b":270.48199,"coord_origin":"1"}},{"id":103,"text":"combined all synthetic datasets into one big unified syn-","bbox":{"l":308.862,"t":273.5304,"r":545.11505,"b":282.43698,"coord_origin":"1"}},{"id":104,"text":"thetic dataset of 600k examples.","bbox":{"l":308.862,"t":285.48541000000006,"r":436.82169,"b":294.39197,"coord_origin":"1"}}]},"text":"one adopts a colorful appearance with high contrast and the last one contains tables with sparse content. Lastly, we have combined all synthetic datasets into one big unified synthetic dataset of 600k examples."},{"label":"Text","id":8,"page_no":3,"cluster":{"id":8,"label":"Text","bbox":{"l":320.1442880630493,"t":297.1657905578613,"r":542.74396,"b":306.67896,"coord_origin":"1"},"confidence":0.8936340808868408,"cells":[{"id":105,"text":"Tab. 1 summarizes the various attributes of the datasets.","bbox":{"l":320.81699,"t":297.77240000000006,"r":542.74396,"b":306.67896,"coord_origin":"1"}}]},"text":"Tab. 1 summarizes the various attributes of the datasets."},{"label":"Section-header","id":9,"page_no":3,"cluster":{"id":9,"label":"Section-header","bbox":{"l":307.9104280471802,"t":320.20690155029297,"r":444.93607000000003,"b":331.93167000000005,"coord_origin":"1"},"confidence":0.9415073394775391,"cells":[{"id":106,"text":"4.","bbox":{"l":308.862,"t":321.18396,"r":316.28476,"b":331.93167000000005,"coord_origin":"1"}},{"id":107,"text":"The TableFormer model","bbox":{"l":326.18176,"t":321.18396,"r":444.93607000000003,"b":331.93167000000005,"coord_origin":"1"}}]},"text":"4. The TableFormer model"},{"label":"Text","id":10,"page_no":3,"cluster":{"id":10,"label":"Text","bbox":{"l":307.7100322723389,"t":340.7741077423096,"r":545.5623950958252,"b":447.6058799743652,"coord_origin":"1"},"confidence":0.9868811964988708,"cells":[{"id":108,"text":"Given the image of a table, TableFormer is able to pre-","bbox":{"l":320.81699,"t":341.93939,"r":545.11499,"b":350.84594999999996,"coord_origin":"1"}},{"id":109,"text":"dict: 1) a sequence of tokens that represent the structure of","bbox":{"l":308.862,"t":353.89438,"r":545.11511,"b":362.80092999999994,"coord_origin":"1"}},{"id":110,"text":"a table, and 2) a bounding box coupled to a subset of those","bbox":{"l":308.862,"t":365.84937,"r":545.11517,"b":374.75592,"coord_origin":"1"}},{"id":111,"text":"tokens. The conversion of an image into a sequence of to-","bbox":{"l":308.862,"t":377.80435,"r":545.11505,"b":386.71091,"coord_origin":"1"}},{"id":112,"text":"kens is a well-known task [35, 16]. While attention is often","bbox":{"l":308.862,"t":389.75934000000007,"r":545.11517,"b":398.66588999999993,"coord_origin":"1"}},{"id":113,"text":"used as an implicit method to associate each token of the","bbox":{"l":308.862,"t":401.71432000000004,"r":545.11523,"b":410.62088,"coord_origin":"1"}},{"id":114,"text":"sequence with a position in the original image, an explicit","bbox":{"l":308.862,"t":413.67032,"r":545.11517,"b":422.57687,"coord_origin":"1"}},{"id":115,"text":"association between the individual table-cells and the image","bbox":{"l":308.862,"t":425.62531,"r":545.11505,"b":434.53186,"coord_origin":"1"}},{"id":116,"text":"bounding boxes is also required.","bbox":{"l":308.862,"t":437.58029,"r":437.9375,"b":446.48685000000006,"coord_origin":"1"}}]},"text":"Given the image of a table, TableFormer is able to predict: 1) a sequence of tokens that represent the structure of a table, and 2) a bounding box coupled to a subset of those tokens. The conversion of an image into a sequence of tokens is a well-known task [35, 16]. While attention is often used as an implicit method to associate each token of the sequence with a position in the original image, an explicit association between the individual table-cells and the image bounding boxes is also required."},{"label":"Section-header","id":11,"page_no":3,"cluster":{"id":11,"label":"Section-header","bbox":{"l":307.7091567993164,"t":457.1930305480957,"r":420.16058,"b":467.54633,"coord_origin":"1"},"confidence":0.930464506149292,"cells":[{"id":117,"text":"4.1.","bbox":{"l":308.862,"t":457.69427,"r":323.14081,"b":467.54633,"coord_origin":"1"}},{"id":118,"text":"Model architecture.","bbox":{"l":332.66003,"t":457.69427,"r":420.16058,"b":467.54633,"coord_origin":"1"}}]},"text":"4.1. Model architecture."},{"label":"Text","id":12,"page_no":3,"cluster":{"id":12,"label":"Text","bbox":{"l":307.7804786682129,"t":475.39464340209963,"r":545.63599319458,"b":664.99981,"coord_origin":"1"},"confidence":0.9881805777549744,"cells":[{"id":119,"text":"We now describe in detail the proposed method, which","bbox":{"l":320.81699,"t":476.76529,"r":545.11487,"b":485.67184,"coord_origin":"1"}},{"id":120,"text":"is composed of three main components, see Fig.","bbox":{"l":308.862,"t":488.72028,"r":509.02054,"b":497.62683,"coord_origin":"1"}},{"id":121,"text":"4.","bbox":{"l":515.58588,"t":488.72028,"r":523.05786,"b":497.62683,"coord_origin":"1"}},{"id":122,"text":"Our","bbox":{"l":529.62323,"t":488.72028,"r":545.11505,"b":497.62683,"coord_origin":"1"}},{"id":123,"text":"CNN Backbone Network","bbox":{"l":308.862,"t":500.76492,"r":406.34601,"b":509.35269,"coord_origin":"1"}},{"id":124,"text":"encodes the input as a feature vec-","bbox":{"l":408.87201,"t":500.67526,"r":545.1106,"b":509.58182,"coord_origin":"1"}},{"id":125,"text":"tor of predefined length.","bbox":{"l":308.862,"t":512.63126,"r":409.39459,"b":521.53781,"coord_origin":"1"}},{"id":126,"text":"The input feature vector of the","bbox":{"l":416.72705,"t":512.63126,"r":545.11505,"b":521.53781,"coord_origin":"1"}},{"id":127,"text":"encoded image is passed to the","bbox":{"l":308.862,"t":524.58624,"r":436.194,"b":533.4928,"coord_origin":"1"}},{"id":128,"text":"Structure Decoder","bbox":{"l":439.526,"t":524.6759,"r":513.86694,"b":533.26367,"coord_origin":"1"}},{"id":129,"text":"to pro-","bbox":{"l":517.43201,"t":524.58624,"r":545.10815,"b":533.4928,"coord_origin":"1"}},{"id":130,"text":"duce a sequence of HTML tags that represent the structure","bbox":{"l":308.862,"t":536.54124,"r":545.11511,"b":545.4478,"coord_origin":"1"}},{"id":131,"text":"of the table.","bbox":{"l":308.862,"t":548.49625,"r":358.5455,"b":557.4028000000001,"coord_origin":"1"}},{"id":132,"text":"With each prediction of an HTML standard","bbox":{"l":365.19055,"t":548.49625,"r":545.11517,"b":557.4028000000001,"coord_origin":"1"}},{"id":133,"text":"data cell (‘","bbox":{"l":308.862,"t":560.45125,"r":352.40851,"b":569.3578,"coord_origin":"1"}},{"id":134,"text":"<","bbox":{"l":352.409,"t":560.29184,"r":360.1579,"b":569.13863,"coord_origin":"1"}},{"id":135,"text":"td","bbox":{"l":360.15799,"t":560.45125,"r":367.90891,"b":569.3578,"coord_origin":"1"}},{"id":136,"text":">","bbox":{"l":367.909,"t":560.29184,"r":375.6579,"b":569.13863,"coord_origin":"1"}},{"id":137,"text":"’) the hidden state of that cell is passed to","bbox":{"l":375.65799,"t":560.45125,"r":545.11182,"b":569.3578,"coord_origin":"1"}},{"id":138,"text":"the Cell BBox Decoder. As for spanning cells, such as row","bbox":{"l":308.862,"t":572.40724,"r":545.11499,"b":581.3138,"coord_origin":"1"}},{"id":139,"text":"or column span, the tag is broken down to ‘","bbox":{"l":308.862,"t":584.3622399999999,"r":483.11768,"b":593.2688,"coord_origin":"1"}},{"id":140,"text":"<","bbox":{"l":483.11902,"t":584.20284,"r":490.86792,"b":593.04962,"coord_origin":"1"}},{"id":141,"text":"’, ‘rowspan=’","bbox":{"l":490.86800999999997,"t":584.3622399999999,"r":545.11438,"b":593.2688,"coord_origin":"1"}},{"id":142,"text":"or ‘colspan=’, with the number of spanning cells (attribute),","bbox":{"l":308.862,"t":596.31725,"r":545.11493,"b":605.2238,"coord_origin":"1"}},{"id":143,"text":"and ‘","bbox":{"l":308.862,"t":608.27225,"r":329.64395,"b":617.1788,"coord_origin":"1"}},{"id":144,"text":">","bbox":{"l":329.646,"t":608.11284,"r":337.3949,"b":616.9596300000001,"coord_origin":"1"}},{"id":145,"text":"’. The hidden state attached to ‘","bbox":{"l":337.39398,"t":608.27225,"r":468.5914,"b":617.1788,"coord_origin":"1"}},{"id":146,"text":"<","bbox":{"l":468.59496999999993,"t":608.11284,"r":476.34387000000004,"b":616.9596300000001,"coord_origin":"1"}},{"id":147,"text":"’ is passed to the","bbox":{"l":476.3439599999999,"t":608.27225,"r":545.11572,"b":617.1788,"coord_origin":"1"}},{"id":148,"text":"Cell BBox Decoder. A shared feed forward network (FFN)","bbox":{"l":308.86197,"t":620.22725,"r":545.11499,"b":629.1338000000001,"coord_origin":"1"}},{"id":149,"text":"receives the hidden states from the Structure Decoder, to","bbox":{"l":308.86197,"t":632.1822500000001,"r":545.11517,"b":641.08881,"coord_origin":"1"}},{"id":150,"text":"provide the final detection predictions of the bounding box","bbox":{"l":308.86197,"t":644.13824,"r":545.11511,"b":653.0448,"coord_origin":"1"}},{"id":151,"text":"coordinates and their classification.","bbox":{"l":308.86197,"t":656.09325,"r":449.42432,"b":664.99981,"coord_origin":"1"}}]},"text":"We now describe in detail the proposed method, which is composed of three main components, see Fig. 4. Our CNN Backbone Network encodes the input as a feature vector of predefined length. The input feature vector of the encoded image is passed to the Structure Decoder to produce a sequence of HTML tags that represent the structure of the table. With each prediction of an HTML standard data cell (‘ < td > ’) the hidden state of that cell is passed to the Cell BBox Decoder. As for spanning cells, such as row or column span, the tag is broken down to ‘ < ’, ‘rowspan=’ or ‘colspan=’, with the number of spanning cells (attribute), and ‘ > ’. The hidden state attached to ‘ < ’ is passed to the Cell BBox Decoder. A shared feed forward network (FFN) receives the hidden states from the Structure Decoder, to provide the final detection predictions of the bounding box coordinates and their classification."},{"label":"Text","id":13,"page_no":3,"cluster":{"id":13,"label":"Text","bbox":{"l":307.78812446594236,"t":667.3782211303711,"r":545.5024543762207,"b":713.2305267333985,"coord_origin":"1"},"confidence":0.9761532545089722,"cells":[{"id":152,"text":"CNN Backbone Network.","bbox":{"l":320.81696,"t":668.2607,"r":431.90985,"b":677.21707,"coord_origin":"1"}},{"id":153,"text":"A ResNet-18 CNN is the","bbox":{"l":439.49896,"t":668.3802499999999,"r":545.11255,"b":677.2868100000001,"coord_origin":"1"}},{"id":154,"text":"backbone that receives the table image and encodes it as a","bbox":{"l":308.86197,"t":680.33525,"r":545.11499,"b":689.24181,"coord_origin":"1"}},{"id":155,"text":"vector of predefined length. The network has been modified","bbox":{"l":308.86197,"t":692.290253,"r":545.11511,"b":701.196815,"coord_origin":"1"}},{"id":156,"text":"by removing the linear and pooling layer, as we are not per-","bbox":{"l":308.86197,"t":704.245255,"r":545.11505,"b":713.1518169999999,"coord_origin":"1"}}]},"text":"CNN Backbone Network. A ResNet-18 CNN is the backbone that receives the table image and encodes it as a vector of predefined length. The network has been modified by removing the linear and pooling layer, as we are not per-"},{"label":"Page-footer","id":14,"page_no":3,"cluster":{"id":14,"label":"Page-footer","bbox":{"l":294.49780197143554,"t":733.5627182006837,"r":300.23938751220703,"b":743.039814,"coord_origin":"1"},"confidence":0.8719172477722168,"cells":[{"id":157,"text":"4","bbox":{"l":295.12097,"t":734.133251,"r":300.10226,"b":743.039814,"coord_origin":"1"}}]},"text":"4"}],"body":[{"label":"Text","id":0,"page_no":3,"cluster":{"id":0,"label":"Text","bbox":{"l":49.14799032211304,"t":74.29706525802612,"r":286.36511,"b":96.06994999999995,"coord_origin":"1"},"confidence":0.9636521339416504,"cells":[{"id":0,"text":"amount of such tables, and kept only those ones ranging","bbox":{"l":50.112,"t":75.20836999999995,"r":286.36511,"b":84.11492999999996,"coord_origin":"1"}},{"id":1,"text":"between 1*1 and 20*10 (rows/columns).","bbox":{"l":50.112,"t":87.16339000000005,"r":212.28319,"b":96.06994999999995,"coord_origin":"1"}}]},"text":"amount of such tables, and kept only those ones ranging between 1*1 and 20*10 (rows/columns)."},{"label":"Text","id":1,"page_no":3,"cluster":{"id":1,"label":"Text","bbox":{"l":49.21861910820007,"t":100.05254344940181,"r":286.5638860702515,"b":313.10507,"coord_origin":"1"},"confidence":0.9862114191055298,"cells":[{"id":2,"text":"The availability of the bounding boxes for all table cells","bbox":{"l":62.067001,"t":100.96038999999996,"r":286.36502,"b":109.86694,"coord_origin":"1"}},{"id":3,"text":"is essential to train our models. In order to distinguish be-","bbox":{"l":50.112,"t":112.91540999999995,"r":286.36508,"b":121.82195999999999,"coord_origin":"1"}},{"id":4,"text":"tween empty and non-empty bounding boxes, we have in-","bbox":{"l":50.112,"t":124.87041999999997,"r":286.36508,"b":133.77697999999998,"coord_origin":"1"}},{"id":5,"text":"troduced a binary class in the annotation. Unfortunately, the","bbox":{"l":50.112,"t":136.82641999999998,"r":286.36511,"b":145.73297000000002,"coord_origin":"1"}},{"id":6,"text":"original datasets either omit the bounding boxes for whole","bbox":{"l":50.112,"t":148.78143,"r":286.36511,"b":157.68799,"coord_origin":"1"}},{"id":7,"text":"tables (e.g. TableBank) or they narrow their scope only to","bbox":{"l":50.112,"t":160.73645,"r":286.36508,"b":169.64301,"coord_origin":"1"}},{"id":8,"text":"non-empty cells. Therefore, it was imperative to introduce","bbox":{"l":50.112,"t":172.69146999999998,"r":286.36505,"b":181.59802000000002,"coord_origin":"1"}},{"id":9,"text":"a data pre-processing procedure that generates the missing","bbox":{"l":50.112,"t":184.64648,"r":286.36508,"b":193.55304,"coord_origin":"1"}},{"id":10,"text":"bounding boxes out of the annotation information. This pro-","bbox":{"l":50.112,"t":196.60248,"r":286.36508,"b":205.50903000000005,"coord_origin":"1"}},{"id":11,"text":"cedure first parses the provided table structure and calcu-","bbox":{"l":50.112,"t":208.5575,"r":286.36508,"b":217.46405000000004,"coord_origin":"1"}},{"id":12,"text":"lates the dimensions of the most fine-grained grid that cov-","bbox":{"l":50.112,"t":220.51251000000002,"r":286.36511,"b":229.41907000000003,"coord_origin":"1"}},{"id":13,"text":"ers the table structure. Notice that each table cell may oc-","bbox":{"l":50.112,"t":232.46753,"r":286.36508,"b":241.37408000000005,"coord_origin":"1"}},{"id":14,"text":"cupy multiple grid squares due to row or column spans. In","bbox":{"l":50.112,"t":244.42255,"r":286.36508,"b":253.32910000000004,"coord_origin":"1"}},{"id":15,"text":"case of PubTabNet we had to compute missing bounding","bbox":{"l":50.112,"t":256.37756,"r":286.36505,"b":265.28412000000003,"coord_origin":"1"}},{"id":16,"text":"boxes for 48% of the simple and 69% of the complex ta-","bbox":{"l":50.112,"t":268.33356000000003,"r":286.36505,"b":277.24010999999996,"coord_origin":"1"}},{"id":17,"text":"bles.","bbox":{"l":50.112,"t":280.28853999999995,"r":68.652397,"b":289.1951,"coord_origin":"1"}},{"id":18,"text":"Regarding FinTabNet, 68% of the simple and 98%","bbox":{"l":75.566444,"t":280.28853999999995,"r":286.36514,"b":289.1951,"coord_origin":"1"}},{"id":19,"text":"of the complex tables require the generation of bounding","bbox":{"l":50.112,"t":292.24353,"r":286.36511,"b":301.15009,"coord_origin":"1"}},{"id":20,"text":"boxes.","bbox":{"l":50.112,"t":304.19852000000003,"r":75.695961,"b":313.10507,"coord_origin":"1"}}]},"text":"The availability of the bounding boxes for all table cells is essential to train our models. In order to distinguish between empty and non-empty bounding boxes, we have introduced a binary class in the annotation. Unfortunately, the original datasets either omit the bounding boxes for whole tables (e.g. TableBank) or they narrow their scope only to non-empty cells. Therefore, it was imperative to introduce a data pre-processing procedure that generates the missing bounding boxes out of the annotation information. This procedure first parses the provided table structure and calculates the dimensions of the most fine-grained grid that covers the table structure. Notice that each table cell may occupy multiple grid squares due to row or column spans. In case of PubTabNet we had to compute missing bounding boxes for 48% of the simple and 69% of the complex tables. Regarding FinTabNet, 68% of the simple and 98% of the complex tables require the generation of bounding boxes."},{"label":"Text","id":2,"page_no":3,"cluster":{"id":2,"label":"Text","bbox":{"l":49.344085335731506,"t":317.04177474975586,"r":286.7534259796143,"b":435.1035724639893,"coord_origin":"1"},"confidence":0.9866138100624084,"cells":[{"id":21,"text":"As it is illustrated in Fig. 2, the table distributions from","bbox":{"l":62.067001,"t":317.99550999999997,"r":286.36499,"b":326.90207,"coord_origin":"1"}},{"id":22,"text":"all datasets are skewed towards simpler structures with","bbox":{"l":50.112,"t":329.95151,"r":286.36511,"b":338.8580600000001,"coord_origin":"1"}},{"id":23,"text":"fewer number of rows/columns. Additionally, there is very","bbox":{"l":50.112,"t":341.90649,"r":286.36502,"b":350.81305,"coord_origin":"1"}},{"id":24,"text":"limited variance in the table styles, which in case of Pub-","bbox":{"l":50.112,"t":353.8614799999999,"r":286.36505,"b":362.76804,"coord_origin":"1"}},{"id":25,"text":"TabNet and FinTabNet means one styling format for the","bbox":{"l":50.112,"t":365.81647,"r":286.36508,"b":374.72301999999996,"coord_origin":"1"}},{"id":26,"text":"majority of the tables.","bbox":{"l":50.112,"t":377.77145,"r":141.58859,"b":386.67801,"coord_origin":"1"}},{"id":27,"text":"Similar limitations appear also in","bbox":{"l":148.70189,"t":377.77145,"r":286.36508,"b":386.67801,"coord_origin":"1"}},{"id":28,"text":"the type of table content, which in some cases (e.g. FinTab-","bbox":{"l":50.112,"t":389.72644,"r":286.36508,"b":398.63300000000004,"coord_origin":"1"}},{"id":29,"text":"Net) is restricted to a certain domain. Ultimately, the lack","bbox":{"l":50.112,"t":401.68243,"r":286.36511,"b":410.58899,"coord_origin":"1"}},{"id":30,"text":"of diversity in the training dataset damages the ability of the","bbox":{"l":50.112,"t":413.63742,"r":286.36511,"b":422.54398,"coord_origin":"1"}},{"id":31,"text":"models to generalize well on unseen data.","bbox":{"l":50.112,"t":425.59241,"r":216.39774,"b":434.49896,"coord_origin":"1"}}]},"text":"As it is illustrated in Fig. 2, the table distributions from all datasets are skewed towards simpler structures with fewer number of rows/columns. Additionally, there is very limited variance in the table styles, which in case of PubTabNet and FinTabNet means one styling format for the majority of the tables. Similar limitations appear also in the type of table content, which in some cases (e.g. FinTabNet) is restricted to a certain domain. Ultimately, the lack of diversity in the training dataset damages the ability of the models to generalize well on unseen data."},{"label":"Text","id":3,"page_no":3,"cluster":{"id":3,"label":"Text","bbox":{"l":49.16742217540741,"t":438.18905181884764,"r":286.7348659515381,"b":627.6961944580079,"coord_origin":"1"},"confidence":0.9882907867431641,"cells":[{"id":32,"text":"Motivated by those observations we aimed at generating","bbox":{"l":62.067001,"t":439.3894,"r":286.36499,"b":448.2959599999999,"coord_origin":"1"}},{"id":33,"text":"a synthetic table dataset named","bbox":{"l":50.112,"t":451.34439,"r":172.14388,"b":460.25095,"coord_origin":"1"}},{"id":34,"text":"SynthTabNet","bbox":{"l":174.14801,"t":451.43405,"r":224.70818999999997,"b":460.02182,"coord_origin":"1"}},{"id":35,"text":". This approach","bbox":{"l":224.70801,"t":451.34439,"r":286.36655,"b":460.25095,"coord_origin":"1"}},{"id":36,"text":"offers control over: 1) the size of the dataset, 2) the table","bbox":{"l":50.112015,"t":463.30038,"r":286.36505,"b":472.20694,"coord_origin":"1"}},{"id":37,"text":"structure, 3) the table style and 4) the type of content. The","bbox":{"l":50.112015,"t":475.25537,"r":286.36511,"b":484.16193,"coord_origin":"1"}},{"id":38,"text":"complexity of the table structure is described by the size of","bbox":{"l":50.112015,"t":487.21036,"r":286.36511,"b":496.11691,"coord_origin":"1"}},{"id":39,"text":"the table header and the table body, as well as the percentage","bbox":{"l":50.112015,"t":499.16534,"r":286.36508,"b":508.0719,"coord_origin":"1"}},{"id":40,"text":"of the table cells covered by row spans and column spans.","bbox":{"l":50.112015,"t":511.12033,"r":286.36505,"b":520.02689,"coord_origin":"1"}},{"id":41,"text":"A set of carefully designed styling templates provides the","bbox":{"l":50.112015,"t":523.07632,"r":286.36508,"b":531.98288,"coord_origin":"1"}},{"id":42,"text":"basis to build a wide range of table appearances. Lastly, the","bbox":{"l":50.112015,"t":535.0313100000001,"r":286.36508,"b":543.93788,"coord_origin":"1"}},{"id":43,"text":"table content is generated out of a curated collection of text","bbox":{"l":50.112015,"t":546.98633,"r":286.36511,"b":555.89288,"coord_origin":"1"}},{"id":44,"text":"corpora. By controlling the size and scope of the synthetic","bbox":{"l":50.112015,"t":558.94133,"r":286.36508,"b":567.84789,"coord_origin":"1"}},{"id":45,"text":"datasets we are able to train and evaluate our models in a","bbox":{"l":50.112015,"t":570.89633,"r":286.36511,"b":579.8028899999999,"coord_origin":"1"}},{"id":46,"text":"variety of different conditions. For example, we can first","bbox":{"l":50.112015,"t":582.85133,"r":286.36511,"b":591.75789,"coord_origin":"1"}},{"id":47,"text":"generate a highly diverse dataset to train our models and","bbox":{"l":50.112015,"t":594.80733,"r":286.36505,"b":603.71388,"coord_origin":"1"}},{"id":48,"text":"then evaluate their performance on other synthetic datasets","bbox":{"l":50.112015,"t":606.76233,"r":286.36508,"b":615.6688800000001,"coord_origin":"1"}},{"id":49,"text":"which are focused on a specific domain.","bbox":{"l":50.112015,"t":618.71733,"r":209.7527,"b":627.62389,"coord_origin":"1"}}]},"text":"Motivated by those observations we aimed at generating a synthetic table dataset named SynthTabNet . This approach offers control over: 1) the size of the dataset, 2) the table structure, 3) the table style and 4) the type of content. The complexity of the table structure is described by the size of the table header and the table body, as well as the percentage of the table cells covered by row spans and column spans. A set of carefully designed styling templates provides the basis to build a wide range of table appearances. Lastly, the table content is generated out of a curated collection of text corpora. By controlling the size and scope of the synthetic datasets we are able to train and evaluate our models in a variety of different conditions. For example, we can first generate a highly diverse dataset to train our models and then evaluate their performance on other synthetic datasets which are focused on a specific domain."},{"label":"Text","id":4,"page_no":3,"cluster":{"id":4,"label":"Text","bbox":{"l":49.457112550735474,"t":631.730834197998,"r":286.5352872848511,"b":713.151894,"coord_origin":"1"},"confidence":0.9875262975692749,"cells":[{"id":50,"text":"In this regard, we have prepared four synthetic datasets,","bbox":{"l":62.067017,"t":632.51433,"r":286.36499,"b":641.42088,"coord_origin":"1"}},{"id":51,"text":"each one containing 150k examples. The corpora to gener-","bbox":{"l":50.112015,"t":644.46933,"r":286.36508,"b":653.37589,"coord_origin":"1"}},{"id":52,"text":"ate the table text consists of the most frequent terms appear-","bbox":{"l":50.112015,"t":656.42532,"r":286.36511,"b":665.33189,"coord_origin":"1"}},{"id":53,"text":"ing in PubTabNet and FinTabNet together with randomly","bbox":{"l":50.112015,"t":668.38033,"r":286.36505,"b":677.28689,"coord_origin":"1"}},{"id":54,"text":"generated text. The first two synthetic datasets have been","bbox":{"l":50.112015,"t":680.33533,"r":286.36508,"b":689.24189,"coord_origin":"1"}},{"id":55,"text":"fine-tuned to mimic the appearance of the original datasets","bbox":{"l":50.112015,"t":692.290329,"r":286.36508,"b":701.196892,"coord_origin":"1"}},{"id":56,"text":"but encompass more complicated table structures. The third","bbox":{"l":50.112015,"t":704.245331,"r":286.36511,"b":713.151894,"coord_origin":"1"}}]},"text":"In this regard, we have prepared four synthetic datasets, each one containing 150k examples. The corpora to generate the table text consists of the most frequent terms appearing in PubTabNet and FinTabNet together with randomly generated text. The first two synthetic datasets have been fine-tuned to mimic the appearance of the original datasets but encompass more complicated table structures. The third"},{"label":"Table","id":5,"page_no":3,"cluster":{"id":5,"label":"Table","bbox":{"l":309.9828769683838,"t":72.70983781814573,"r":542.3903228759765,"b":155.58427963256838,"coord_origin":"1"},"confidence":0.9519661068916321,"cells":[{"id":57,"text":"Tags","bbox":{"l":412.332,"t":73.61437999999998,"r":430.90231,"b":82.52094,"coord_origin":"1"}},{"id":58,"text":"Bbox","bbox":{"l":442.85742,"t":73.61437999999998,"r":464.4463799999999,"b":82.52094,"coord_origin":"1"}},{"id":59,"text":"Size","bbox":{"l":477.78632,"t":73.61437999999998,"r":494.94193,"b":82.52094,"coord_origin":"1"}},{"id":60,"text":"Format","bbox":{"l":508.28186,"t":73.61437999999998,"r":536.91437,"b":82.52094,"coord_origin":"1"}},{"id":61,"text":"PubTabNet","bbox":{"l":317.06,"t":85.9673499999999,"r":361.64264,"b":94.87390000000005,"coord_origin":"1"}},{"id":62,"text":"3","bbox":{"l":417.85599,"t":85.6684600000001,"r":425.37775,"b":94.88385000000017,"coord_origin":"1"}},{"id":63,"text":"3","bbox":{"l":449.89569,"t":85.6684600000001,"r":457.41745000000003,"b":94.88385000000017,"coord_origin":"1"}},{"id":64,"text":"509k","bbox":{"l":476.401,"t":85.9673499999999,"r":496.3262,"b":94.87390000000005,"coord_origin":"1"}},{"id":65,"text":"PNG","bbox":{"l":512.63495,"t":85.9673499999999,"r":532.56012,"b":94.87390000000005,"coord_origin":"1"}},{"id":66,"text":"FinTabNet","bbox":{"l":317.06,"t":97.92236000000003,"r":359.43094,"b":106.82892000000004,"coord_origin":"1"}},{"id":67,"text":"3","bbox":{"l":417.85599,"t":97.62347,"r":425.37775,"b":106.83887000000016,"coord_origin":"1"}},{"id":68,"text":"3","bbox":{"l":449.89569,"t":97.62347,"r":457.41745000000003,"b":106.83887000000016,"coord_origin":"1"}},{"id":69,"text":"112k","bbox":{"l":476.401,"t":97.92236000000003,"r":496.3262,"b":106.82892000000004,"coord_origin":"1"}},{"id":70,"text":"PDF","bbox":{"l":513.46185,"t":97.92236000000003,"r":531.73328,"b":106.82892000000004,"coord_origin":"1"}},{"id":71,"text":"TableBank","bbox":{"l":317.06,"t":109.87836000000004,"r":359.97888,"b":118.78490999999997,"coord_origin":"1"}},{"id":72,"text":"3","bbox":{"l":417.85599,"t":109.57947000000001,"r":425.37775,"b":118.79485999999997,"coord_origin":"1"}},{"id":73,"text":"7","bbox":{"l":450.81226,"t":109.57947000000001,"r":456.50091999999995,"b":118.79485999999997,"coord_origin":"1"}},{"id":74,"text":"145k","bbox":{"l":476.401,"t":109.87836000000004,"r":496.3262,"b":118.78490999999997,"coord_origin":"1"}},{"id":75,"text":"JPEG","bbox":{"l":511.25017999999994,"t":109.87836000000004,"r":533.94501,"b":118.78490999999997,"coord_origin":"1"}},{"id":76,"text":"Combined-Tabnet(*)","bbox":{"l":317.06,"t":121.83336999999995,"r":400.37723,"b":130.73992999999996,"coord_origin":"1"}},{"id":77,"text":"3","bbox":{"l":417.85599,"t":121.53448000000003,"r":425.37775,"b":130.74987999999996,"coord_origin":"1"}},{"id":78,"text":"3","bbox":{"l":449.89569,"t":121.53448000000003,"r":457.41745000000003,"b":130.74987999999996,"coord_origin":"1"}},{"id":79,"text":"400k","bbox":{"l":476.401,"t":121.83336999999995,"r":496.3262,"b":130.73992999999996,"coord_origin":"1"}},{"id":80,"text":"PNG","bbox":{"l":512.63495,"t":121.83336999999995,"r":532.56012,"b":130.73992999999996,"coord_origin":"1"}},{"id":81,"text":"Combined(**)","bbox":{"l":317.06,"t":133.78839000000005,"r":375.17184,"b":142.69494999999995,"coord_origin":"1"}},{"id":82,"text":"3","bbox":{"l":417.85599,"t":133.48950000000002,"r":425.37775,"b":142.70489999999995,"coord_origin":"1"}},{"id":83,"text":"3","bbox":{"l":449.89569,"t":133.48950000000002,"r":457.41745000000003,"b":142.70489999999995,"coord_origin":"1"}},{"id":84,"text":"500k","bbox":{"l":476.401,"t":133.78839000000005,"r":496.3262,"b":142.69494999999995,"coord_origin":"1"}},{"id":85,"text":"PNG","bbox":{"l":512.63495,"t":133.78839000000005,"r":532.56012,"b":142.69494999999995,"coord_origin":"1"}},{"id":86,"text":"SynthTabNet","bbox":{"l":317.06,"t":145.74341000000004,"r":369.39352,"b":154.64995999999996,"coord_origin":"1"}},{"id":87,"text":"3","bbox":{"l":417.85599,"t":145.44446000000005,"r":425.37775,"b":154.65985,"coord_origin":"1"}},{"id":88,"text":"3","bbox":{"l":449.89569,"t":145.44446000000005,"r":457.41745000000003,"b":154.65985,"coord_origin":"1"}},{"id":89,"text":"600k","bbox":{"l":476.401,"t":145.74334999999996,"r":496.3262,"b":154.6499,"coord_origin":"1"}},{"id":90,"text":"PNG","bbox":{"l":512.63495,"t":145.74334999999996,"r":532.56012,"b":154.6499,"coord_origin":"1"}}]},"text":null,"otsl_seq":["ecel","ched","ched","ched","ched","nl","rhed","fcel","fcel","fcel","fcel","nl","rhed","fcel","fcel","fcel","fcel","nl","rhed","fcel","fcel","fcel","fcel","nl","rhed","fcel","fcel","fcel","fcel","nl","rhed","fcel","fcel","fcel","fcel","nl","rhed","fcel","fcel","fcel","fcel","nl"],"num_rows":7,"num_cols":5,"table_cells":[{"bbox":{"l":412.332,"t":73.61437999999998,"r":430.90231,"b":82.52094,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"Tags","column_header":true,"row_header":false,"row_section":false},{"bbox":{"l":442.85742,"t":73.61437999999998,"r":464.4463799999999,"b":82.52094,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"Bbox","column_header":true,"row_header":false,"row_section":false},{"bbox":{"l":477.78632,"t":73.61437999999998,"r":494.94193,"b":82.52094,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"Size","column_header":true,"row_header":false,"row_section":false},{"bbox":{"l":508.28186,"t":73.61437999999998,"r":536.91437,"b":82.52094,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":0,"end_row_offset_idx":1,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"Format","column_header":true,"row_header":false,"row_section":false},{"bbox":{"l":317.06,"t":85.9673499999999,"r":361.64264,"b":94.87390000000005,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"PubTabNet","column_header":false,"row_header":true,"row_section":false},{"bbox":{"l":417.85599,"t":85.6684600000001,"r":425.37775,"b":94.88385000000017,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":449.89569,"t":85.6684600000001,"r":457.41745000000003,"b":94.88385000000017,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":476.401,"t":85.9673499999999,"r":496.3262,"b":94.87390000000005,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"509k","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":512.63495,"t":85.9673499999999,"r":532.56012,"b":94.87390000000005,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":1,"end_row_offset_idx":2,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"PNG","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":317.06,"t":97.92236000000003,"r":359.43094,"b":106.82892000000004,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"FinTabNet","column_header":false,"row_header":true,"row_section":false},{"bbox":{"l":417.85599,"t":97.62347,"r":425.37775,"b":106.83887000000016,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":449.89569,"t":97.62347,"r":457.41745000000003,"b":106.83887000000016,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":476.401,"t":97.92236000000003,"r":496.3262,"b":106.82892000000004,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"112k","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":513.46185,"t":97.92236000000003,"r":531.73328,"b":106.82892000000004,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":2,"end_row_offset_idx":3,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"PDF","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":317.06,"t":109.87836000000004,"r":359.97888,"b":118.78490999999997,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"TableBank","column_header":false,"row_header":true,"row_section":false},{"bbox":{"l":417.85599,"t":109.57947000000001,"r":425.37775,"b":118.79485999999997,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":450.81226,"t":109.57947000000001,"r":456.50091999999995,"b":118.79485999999997,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"7","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":476.401,"t":109.87836000000004,"r":496.3262,"b":118.78490999999997,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"145k","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":511.25017999999994,"t":109.87836000000004,"r":533.94501,"b":118.78490999999997,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":3,"end_row_offset_idx":4,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"JPEG","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":317.06,"t":121.83336999999995,"r":400.37723,"b":130.73992999999996,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"Combined-Tabnet(*)","column_header":false,"row_header":true,"row_section":false},{"bbox":{"l":417.85599,"t":121.53448000000003,"r":425.37775,"b":130.74987999999996,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":449.89569,"t":121.53448000000003,"r":457.41745000000003,"b":130.74987999999996,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":476.401,"t":121.83336999999995,"r":496.3262,"b":130.73992999999996,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"400k","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":512.63495,"t":121.83336999999995,"r":532.56012,"b":130.73992999999996,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":4,"end_row_offset_idx":5,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"PNG","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":317.06,"t":133.78839000000005,"r":375.17184,"b":142.69494999999995,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":5,"end_row_offset_idx":6,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"Combined(**)","column_header":false,"row_header":true,"row_section":false},{"bbox":{"l":417.85599,"t":133.48950000000002,"r":425.37775,"b":142.70489999999995,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":5,"end_row_offset_idx":6,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":449.89569,"t":133.48950000000002,"r":457.41745000000003,"b":142.70489999999995,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":5,"end_row_offset_idx":6,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":476.401,"t":133.78839000000005,"r":496.3262,"b":142.69494999999995,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":5,"end_row_offset_idx":6,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"500k","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":512.63495,"t":133.78839000000005,"r":532.56012,"b":142.69494999999995,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":5,"end_row_offset_idx":6,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"PNG","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":317.06,"t":145.74341000000004,"r":369.39352,"b":154.64995999999996,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":6,"end_row_offset_idx":7,"start_col_offset_idx":0,"end_col_offset_idx":1,"text":"SynthTabNet","column_header":false,"row_header":true,"row_section":false},{"bbox":{"l":417.85599,"t":145.44446000000005,"r":425.37775,"b":154.65985,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":6,"end_row_offset_idx":7,"start_col_offset_idx":1,"end_col_offset_idx":2,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":449.89569,"t":145.44446000000005,"r":457.41745000000003,"b":154.65985,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":6,"end_row_offset_idx":7,"start_col_offset_idx":2,"end_col_offset_idx":3,"text":"3","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":476.401,"t":145.74334999999996,"r":496.3262,"b":154.6499,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":6,"end_row_offset_idx":7,"start_col_offset_idx":3,"end_col_offset_idx":4,"text":"600k","column_header":false,"row_header":false,"row_section":false},{"bbox":{"l":512.63495,"t":145.74334999999996,"r":532.56012,"b":154.6499,"coord_origin":"1"},"row_span":1,"col_span":1,"start_row_offset_idx":6,"end_row_offset_idx":7,"start_col_offset_idx":4,"end_col_offset_idx":5,"text":"PNG","column_header":false,"row_header":false,"row_section":false}]},{"label":"Caption","id":6,"page_no":3,"cluster":{"id":6,"label":"Caption","bbox":{"l":307.6622022628784,"t":166.97985019683836,"r":545.11505,"b":224.38897999999995,"coord_origin":"1"},"confidence":0.9559885263442993,"cells":[{"id":91,"text":"Table 1:","bbox":{"l":308.862,"t":167.66138,"r":344.6178,"b":176.56793000000005,"coord_origin":"1"}},{"id":92,"text":"Both","bbox":{"l":361.07602,"t":167.66138,"r":380.45328,"b":176.56793000000005,"coord_origin":"1"}},{"id":93,"text":"“Combined-Tabnet”","bbox":{"l":386.56799,"t":167.75104,"r":468.67974999999996,"b":176.33880999999997,"coord_origin":"1"}},{"id":94,"text":"and","bbox":{"l":474.79599,"t":167.66138,"r":489.18198,"b":176.56793000000005,"coord_origin":"1"}},{"id":95,"text":"”Combined-","bbox":{"l":495.29898000000003,"t":167.75104,"r":545.112,"b":176.33880999999997,"coord_origin":"1"}},{"id":96,"text":"Tabnet”","bbox":{"l":308.862,"t":179.70605,"r":341.16077,"b":188.29381999999998,"coord_origin":"1"}},{"id":97,"text":"are variations of the following: (*) The Combined-","bbox":{"l":343.457,"t":179.61639000000002,"r":545.11005,"b":188.52295000000004,"coord_origin":"1"}},{"id":98,"text":"Tabnet dataset is the processed combination of PubTabNet","bbox":{"l":308.862,"t":191.57141000000001,"r":545.11505,"b":200.47797000000003,"coord_origin":"1"}},{"id":99,"text":"and Fintabnet. (**) The combined dataset is the processed","bbox":{"l":308.862,"t":203.52643,"r":545.11499,"b":212.43298000000004,"coord_origin":"1"}},{"id":100,"text":"combination of PubTabNet, Fintabnet and TableBank.","bbox":{"l":308.862,"t":215.48242000000005,"r":523.93469,"b":224.38897999999995,"coord_origin":"1"}}]},"text":"Table 1: Both “Combined-Tabnet” and ”CombinedTabnet” are variations of the following: (*) The CombinedTabnet dataset is the processed combination of PubTabNet and Fintabnet. (**) The combined dataset is the processed combination of PubTabNet, Fintabnet and TableBank."},{"label":"Text","id":7,"page_no":3,"cluster":{"id":7,"label":"Text","bbox":{"l":307.8670337677002,"t":249.03968067169194,"r":545.144327545166,"b":294.39197,"coord_origin":"1"},"confidence":0.9780857563018799,"cells":[{"id":101,"text":"one adopts a colorful appearance with high contrast and the","bbox":{"l":308.862,"t":249.62041999999997,"r":545.11517,"b":258.52698,"coord_origin":"1"}},{"id":102,"text":"last one contains tables with sparse content. Lastly, we have","bbox":{"l":308.862,"t":261.57543999999996,"r":545.11517,"b":270.48199,"coord_origin":"1"}},{"id":103,"text":"combined all synthetic datasets into one big unified syn-","bbox":{"l":308.862,"t":273.5304,"r":545.11505,"b":282.43698,"coord_origin":"1"}},{"id":104,"text":"thetic dataset of 600k examples.","bbox":{"l":308.862,"t":285.48541000000006,"r":436.82169,"b":294.39197,"coord_origin":"1"}}]},"text":"one adopts a colorful appearance with high contrast and the last one contains tables with sparse content. Lastly, we have combined all synthetic datasets into one big unified synthetic dataset of 600k examples."},{"label":"Text","id":8,"page_no":3,"cluster":{"id":8,"label":"Text","bbox":{"l":320.1442880630493,"t":297.1657905578613,"r":542.74396,"b":306.67896,"coord_origin":"1"},"confidence":0.8936340808868408,"cells":[{"id":105,"text":"Tab. 1 summarizes the various attributes of the datasets.","bbox":{"l":320.81699,"t":297.77240000000006,"r":542.74396,"b":306.67896,"coord_origin":"1"}}]},"text":"Tab. 1 summarizes the various attributes of the datasets."},{"label":"Section-header","id":9,"page_no":3,"cluster":{"id":9,"label":"Section-header","bbox":{"l":307.9104280471802,"t":320.20690155029297,"r":444.93607000000003,"b":331.93167000000005,"coord_origin":"1"},"confidence":0.9415073394775391,"cells":[{"id":106,"text":"4.","bbox":{"l":308.862,"t":321.18396,"r":316.28476,"b":331.93167000000005,"coord_origin":"1"}},{"id":107,"text":"The TableFormer model","bbox":{"l":326.18176,"t":321.18396,"r":444.93607000000003,"b":331.93167000000005,"coord_origin":"1"}}]},"text":"4. The TableFormer model"},{"label":"Text","id":10,"page_no":3,"cluster":{"id":10,"label":"Text","bbox":{"l":307.7100322723389,"t":340.7741077423096,"r":545.5623950958252,"b":447.6058799743652,"coord_origin":"1"},"confidence":0.9868811964988708,"cells":[{"id":108,"text":"Given the image of a table, TableFormer is able to pre-","bbox":{"l":320.81699,"t":341.93939,"r":545.11499,"b":350.84594999999996,"coord_origin":"1"}},{"id":109,"text":"dict: 1) a sequence of tokens that represent the structure of","bbox":{"l":308.862,"t":353.89438,"r":545.11511,"b":362.80092999999994,"coord_origin":"1"}},{"id":110,"text":"a table, and 2) a bounding box coupled to a subset of those","bbox":{"l":308.862,"t":365.84937,"r":545.11517,"b":374.75592,"coord_origin":"1"}},{"id":111,"text":"tokens. The conversion of an image into a sequence of to-","bbox":{"l":308.862,"t":377.80435,"r":545.11505,"b":386.71091,"coord_origin":"1"}},{"id":112,"text":"kens is a well-known task [35, 16]. While attention is often","bbox":{"l":308.862,"t":389.75934000000007,"r":545.11517,"b":398.66588999999993,"coord_origin":"1"}},{"id":113,"text":"used as an implicit method to associate each token of the","bbox":{"l":308.862,"t":401.71432000000004,"r":545.11523,"b":410.62088,"coord_origin":"1"}},{"id":114,"text":"sequence with a position in the original image, an explicit","bbox":{"l":308.862,"t":413.67032,"r":545.11517,"b":422.57687,"coord_origin":"1"}},{"id":115,"text":"association between the individual table-cells and the image","bbox":{"l":308.862,"t":425.62531,"r":545.11505,"b":434.53186,"coord_origin":"1"}},{"id":116,"text":"bounding boxes is also required.","bbox":{"l":308.862,"t":437.58029,"r":437.9375,"b":446.48685000000006,"coord_origin":"1"}}]},"text":"Given the image of a table, TableFormer is able to predict: 1) a sequence of tokens that represent the structure of a table, and 2) a bounding box coupled to a subset of those tokens. The conversion of an image into a sequence of tokens is a well-known task [35, 16]. While attention is often used as an implicit method to associate each token of the sequence with a position in the original image, an explicit association between the individual table-cells and the image bounding boxes is also required."},{"label":"Section-header","id":11,"page_no":3,"cluster":{"id":11,"label":"Section-header","bbox":{"l":307.7091567993164,"t":457.1930305480957,"r":420.16058,"b":467.54633,"coord_origin":"1"},"confidence":0.930464506149292,"cells":[{"id":117,"text":"4.1.","bbox":{"l":308.862,"t":457.69427,"r":323.14081,"b":467.54633,"coord_origin":"1"}},{"id":118,"text":"Model architecture.","bbox":{"l":332.66003,"t":457.69427,"r":420.16058,"b":467.54633,"coord_origin":"1"}}]},"text":"4.1. Model architecture."},{"label":"Text","id":12,"page_no":3,"cluster":{"id":12,"label":"Text","bbox":{"l":307.7804786682129,"t":475.39464340209963,"r":545.63599319458,"b":664.99981,"coord_origin":"1"},"confidence":0.9881805777549744,"cells":[{"id":119,"text":"We now describe in detail the proposed method, which","bbox":{"l":320.81699,"t":476.76529,"r":545.11487,"b":485.67184,"coord_origin":"1"}},{"id":120,"text":"is composed of three main components, see Fig.","bbox":{"l":308.862,"t":488.72028,"r":509.02054,"b":497.62683,"coord_origin":"1"}},{"id":121,"text":"4.","bbox":{"l":515.58588,"t":488.72028,"r":523.05786,"b":497.62683,"coord_origin":"1"}},{"id":122,"text":"Our","bbox":{"l":529.62323,"t":488.72028,"r":545.11505,"b":497.62683,"coord_origin":"1"}},{"id":123,"text":"CNN Backbone Network","bbox":{"l":308.862,"t":500.76492,"r":406.34601,"b":509.35269,"coord_origin":"1"}},{"id":124,"text":"encodes the input as a feature vec-","bbox":{"l":408.87201,"t":500.67526,"r":545.1106,"b":509.58182,"coord_origin":"1"}},{"id":125,"text":"tor of predefined length.","bbox":{"l":308.862,"t":512.63126,"r":409.39459,"b":521.53781,"coord_origin":"1"}},{"id":126,"text":"The input feature vector of the","bbox":{"l":416.72705,"t":512.63126,"r":545.11505,"b":521.53781,"coord_origin":"1"}},{"id":127,"text":"encoded image is passed to the","bbox":{"l":308.862,"t":524.58624,"r":436.194,"b":533.4928,"coord_origin":"1"}},{"id":128,"text":"Structure Decoder","bbox":{"l":439.526,"t":524.6759,"r":513.86694,"b":533.26367,"coord_origin":"1"}},{"id":129,"text":"to pro-","bbox":{"l":517.43201,"t":524.58624,"r":545.10815,"b":533.4928,"coord_origin":"1"}},{"id":130,"text":"duce a sequence of HTML tags that represent the structure","bbox":{"l":308.862,"t":536.54124,"r":545.11511,"b":545.4478,"coord_origin":"1"}},{"id":131,"text":"of the table.","bbox":{"l":308.862,"t":548.49625,"r":358.5455,"b":557.4028000000001,"coord_origin":"1"}},{"id":132,"text":"With each prediction of an HTML standard","bbox":{"l":365.19055,"t":548.49625,"r":545.11517,"b":557.4028000000001,"coord_origin":"1"}},{"id":133,"text":"data cell (‘","bbox":{"l":308.862,"t":560.45125,"r":352.40851,"b":569.3578,"coord_origin":"1"}},{"id":134,"text":"<","bbox":{"l":352.409,"t":560.29184,"r":360.1579,"b":569.13863,"coord_origin":"1"}},{"id":135,"text":"td","bbox":{"l":360.15799,"t":560.45125,"r":367.90891,"b":569.3578,"coord_origin":"1"}},{"id":136,"text":">","bbox":{"l":367.909,"t":560.29184,"r":375.6579,"b":569.13863,"coord_origin":"1"}},{"id":137,"text":"’) the hidden state of that cell is passed to","bbox":{"l":375.65799,"t":560.45125,"r":545.11182,"b":569.3578,"coord_origin":"1"}},{"id":138,"text":"the Cell BBox Decoder. As for spanning cells, such as row","bbox":{"l":308.862,"t":572.40724,"r":545.11499,"b":581.3138,"coord_origin":"1"}},{"id":139,"text":"or column span, the tag is broken down to ‘","bbox":{"l":308.862,"t":584.3622399999999,"r":483.11768,"b":593.2688,"coord_origin":"1"}},{"id":140,"text":"<","bbox":{"l":483.11902,"t":584.20284,"r":490.86792,"b":593.04962,"coord_origin":"1"}},{"id":141,"text":"’, ‘rowspan=’","bbox":{"l":490.86800999999997,"t":584.3622399999999,"r":545.11438,"b":593.2688,"coord_origin":"1"}},{"id":142,"text":"or ‘colspan=’, with the number of spanning cells (attribute),","bbox":{"l":308.862,"t":596.31725,"r":545.11493,"b":605.2238,"coord_origin":"1"}},{"id":143,"text":"and ‘","bbox":{"l":308.862,"t":608.27225,"r":329.64395,"b":617.1788,"coord_origin":"1"}},{"id":144,"text":">","bbox":{"l":329.646,"t":608.11284,"r":337.3949,"b":616.9596300000001,"coord_origin":"1"}},{"id":145,"text":"’. The hidden state attached to ‘","bbox":{"l":337.39398,"t":608.27225,"r":468.5914,"b":617.1788,"coord_origin":"1"}},{"id":146,"text":"<","bbox":{"l":468.59496999999993,"t":608.11284,"r":476.34387000000004,"b":616.9596300000001,"coord_origin":"1"}},{"id":147,"text":"’ is passed to the","bbox":{"l":476.3439599999999,"t":608.27225,"r":545.11572,"b":617.1788,"coord_origin":"1"}},{"id":148,"text":"Cell BBox Decoder. A shared feed forward network (FFN)","bbox":{"l":308.86197,"t":620.22725,"r":545.11499,"b":629.1338000000001,"coord_origin":"1"}},{"id":149,"text":"receives the hidden states from the Structure Decoder, to","bbox":{"l":308.86197,"t":632.1822500000001,"r":545.11517,"b":641.08881,"coord_origin":"1"}},{"id":150,"text":"provide the final detection predictions of the bounding box","bbox":{"l":308.86197,"t":644.13824,"r":545.11511,"b":653.0448,"coord_origin":"1"}},{"id":151,"text":"coordinates and their classification.","bbox":{"l":308.86197,"t":656.09325,"r":449.42432,"b":664.99981,"coord_origin":"1"}}]},"text":"We now describe in detail the proposed method, which is composed of three main components, see Fig. 4. Our CNN Backbone Network encodes the input as a feature vector of predefined length. The input feature vector of the encoded image is passed to the Structure Decoder to produce a sequence of HTML tags that represent the structure of the table. With each prediction of an HTML standard data cell (‘ < td > ’) the hidden state of that cell is passed to the Cell BBox Decoder. As for spanning cells, such as row or column span, the tag is broken down to ‘ < ’, ‘rowspan=’ or ‘colspan=’, with the number of spanning cells (attribute), and ‘ > ’. The hidden state attached to ‘ < ’ is passed to the Cell BBox Decoder. A shared feed forward network (FFN) receives the hidden states from the Structure Decoder, to provide the final detection predictions of the bounding box coordinates and their classification."},{"label":"Text","id":13,"page_no":3,"cluster":{"id":13,"label":"Text","bbox":{"l":307.78812446594236,"t":667.3782211303711,"r":545.5024543762207,"b":713.2305267333985,"coord_origin":"1"},"confidence":0.9761532545089722,"cells":[{"id":152,"text":"CNN Backbone Network.","bbox":{"l":320.81696,"t":668.2607,"r":431.90985,"b":677.21707,"coord_origin":"1"}},{"id":153,"text":"A ResNet-18 CNN is the","bbox":{"l":439.49896,"t":668.3802499999999,"r":545.11255,"b":677.2868100000001,"coord_origin":"1"}},{"id":154,"text":"backbone that receives the table image and encodes it as a","bbox":{"l":308.86197,"t":680.33525,"r":545.11499,"b":689.24181,"coord_origin":"1"}},{"id":155,"text":"vector of predefined length. The network has been modified","bbox":{"l":308.86197,"t":692.290253,"r":545.11511,"b":701.196815,"coord_origin":"1"}},{"id":156,"text":"by removing the linear and pooling layer, as we are not per-","bbox":{"l":308.86197,"t":704.245255,"r":545.11505,"b":713.1518169999999,"coord_origin":"1"}}]},"text":"CNN Backbone Network. A ResNet-18 CNN is the backbone that receives the table image and encodes it as a vector of predefined length. The network has been modified by removing the linear and pooling layer, as we are not per-"}],"headers":[{"label":"Page-footer","id":14,"page_no":3,"cluster":{"id":14,"label":"Page-footer","bbox":{"l":294.49780197143554,"t":733.5627182006837,"r":300.23938751220703,"b":743.039814,"coord_origin":"1"},"confidence":0.8719172477722168,"cells":[{"id":157,"text":"4","bbox":{"l":295.12097,"t":734.133251,"r":300.10226,"b":743.039814,"coord_origin":"1"}}]},"text":"4"}]}},{"page_no":4,"page_hash":"50115d582a0897fe1dd520a6876ec3f9321690ed0f6cfdc99a8d09019be073e8","size":{"width":612.0,"height":792.0},"cells":[{"id":0,"text":"1.","bbox":{"l":81.688072,"t":122.43970000000002,"r":84.927567,"b":125.62891000000002,"coord_origin":"1"}},{"id":1,"text":"Item","bbox":{"l":86.54731,"t":122.43970000000002,"r":93.026291,"b":125.62891000000002,"coord_origin":"1"}},{"id":2,"text":"Amount","bbox":{"l":102.50498,"t":115.25214000000005,"r":115.3461,"b":118.44135000000006,"coord_origin":"1"}},{"id":3,"text":"Names","bbox":{"l":82.140205,"t":115.21489999999994,"r":93.291527,"b":118.40410999999995,"coord_origin":"1"}},{"id":4,"text":"1000","bbox":{"l":96.748268,"t":122.43970000000002,"r":104.3119,"b":125.62891000000002,"coord_origin":"1"}},{"id":5,"text":"500","bbox":{"l":96.748268,"t":127.74370999999985,"r":102.42083,"b":130.93291999999997,"coord_origin":"1"}},{"id":6,"text":"3500","bbox":{"l":96.748268,"t":133.45569,"r":104.3119,"b":136.6449,"coord_origin":"1"}},{"id":7,"text":"150","bbox":{"l":96.748268,"t":139.16772000000003,"r":102.42083,"b":142.35693000000003,"coord_origin":"1"}},{"id":8,"text":"unit","bbox":{"l":110.66107,"t":122.43970000000002,"r":116.14391,"b":125.62891000000002,"coord_origin":"1"}},{"id":9,"text":"unit","bbox":{"l":110.66107,"t":127.74370999999985,"r":116.14391,"b":130.93291999999997,"coord_origin":"1"}},{"id":10,"text":"unit","bbox":{"l":110.66107,"t":133.45569,"r":116.14391,"b":136.6449,"coord_origin":"1"}},{"id":11,"text":"unit","bbox":{"l":110.66107,"t":139.16772000000003,"r":116.14391,"b":142.35693000000003,"coord_origin":"1"}},{"id":12,"text":"2.","bbox":{"l":81.688072,"t":127.74370999999985,"r":84.927567,"b":130.93291999999997,"coord_origin":"1"}},{"id":13,"text":"Item","bbox":{"l":86.54731,"t":127.74370999999985,"r":93.026291,"b":130.93291999999997,"coord_origin":"1"}},{"id":14,"text":"3.","bbox":{"l":81.688072,"t":133.45569,"r":84.927567,"b":136.6449,"coord_origin":"1"}},{"id":15,"text":"Item","bbox":{"l":86.54731,"t":133.45569,"r":93.026291,"b":136.6449,"coord_origin":"1"}},{"id":16,"text":"4.","bbox":{"l":81.688072,"t":139.16772000000003,"r":84.927567,"b":142.35693000000003,"coord_origin":"1"}},{"id":17,"text":"Item","bbox":{"l":86.54731,"t":139.16772000000003,"r":93.026291,"b":142.35693000000003,"coord_origin":"1"}},{"id":18,"text":"Extracted","bbox":{"l":88.084389,"t":90.49738000000002,"r":113.93649,"b":96.23798,"coord_origin":"1"}},{"id":19,"text":"Table Images","bbox":{"l":82.81002,"t":97.63738999999998,"r":119.21240000000002,"b":103.37798999999995,"coord_origin":"1"}},{"id":20,"text":"Standardized","bbox":{"l":143.94247,"t":100.60235999999998,"r":180.01131,"b":106.34295999999995,"coord_origin":"1"}},{"id":21,"text":"Images","bbox":{"l":151.94064,"t":107.74237000000005,"r":172.0118,"b":113.48297000000014,"coord_origin":"1"}},{"id":22,"text":"BBox","bbox":{"l":251.76939000000002,"t":80.93096999999989,"r":266.39557,"b":86.67156999999997,"coord_origin":"1"}},{"id":23,"text":"Decoder","bbox":{"l":247.51601,"t":86.03101000000004,"r":270.65021,"b":91.77161000000001,"coord_origin":"1"}},{"id":24,"text":"BBoxes","bbox":{"l":331.03699,"t":78.55980999999997,"r":352.12589,"b":84.30042000000003,"coord_origin":"1"}},{"id":25,"text":"BBoxes can be","bbox":{"l":390.56421,"t":96.03223000000003,"r":431.7261,"b":101.77282999999989,"coord_origin":"1"}},{"id":26,"text":"traced back to the","bbox":{"l":386.82422,"t":102.15228000000013,"r":435.46966999999995,"b":107.89287999999999,"coord_origin":"1"}},{"id":27,"text":"original image to","bbox":{"l":388.69589,"t":108.27228000000002,"r":433.6032400000001,"b":114.01288000000011,"coord_origin":"1"}},{"id":28,"text":"extract content","bbox":{"l":391.07761,"t":114.39227000000005,"r":431.22542999999996,"b":120.13286999999991,"coord_origin":"1"}},{"id":29,"text":"Structure Tags sequence","bbox":{"l":431.22650000000004,"t":151.68511999999998,"r":498.82068,"b":157.42571999999996,"coord_origin":"1"}},{"id":30,"text":"provide full description of","bbox":{"l":431.1738,"t":157.80517999999995,"r":498.87753000000004,"b":163.54578000000004,"coord_origin":"1"}},{"id":31,"text":"the table structure","bbox":{"l":440.5289,"t":163.92516999999998,"r":489.51827999999995,"b":169.66576999999995,"coord_origin":"1"}},{"id":32,"text":"Structure Tags","bbox":{"l":328.37479,"t":178.25385000000006,"r":367.72333,"b":183.99445000000003,"coord_origin":"1"}},{"id":33,"text":"BBoxes in sync","bbox":{"l":331.84451,"t":123.90886999999998,"r":373.67963,"b":129.64948000000015,"coord_origin":"1"}},{"id":34,"text":"with tag sequence","bbox":{"l":331.84451,"t":129.00885000000017,"r":381.17786,"b":134.74945000000002,"coord_origin":"1"}},{"id":35,"text":"Encoder","bbox":{"l":196.62633,"t":88.11621000000002,"r":219.42332,"b":93.85681,"coord_origin":"1"}},{"id":36,"text":"Structure","bbox":{"l":246.66771,"t":129.4946900000001,"r":271.49899,"b":135.23528999999996,"coord_origin":"1"}},{"id":37,"text":"Decoder","bbox":{"l":247.51601,"t":134.59473000000003,"r":270.65021,"b":140.33533,"coord_origin":"1"}},{"id":38,"text":"[x1, y2, x2, y2]","bbox":{"l":330.63071,"t":89.01923,"r":365.55347,"b":94.75982999999997,"coord_origin":"1"}},{"id":39,"text":"[x1', y2', x2', y2']","bbox":{"l":330.63071,"t":97.17926,"r":370.22717,"b":102.91985999999997,"coord_origin":"1"}},{"id":40,"text":"[x1'', y2'', x2'', y2'']","bbox":{"l":330.63071,"t":105.33922999999993,"r":374.51157,"b":111.07983000000002,"coord_origin":"1"}},{"id":41,"text":"...","bbox":{"l":330.63071,"t":113.49926999999991,"r":335.73233,"b":119.23987,"coord_origin":"1"}},{"id":42,"text":"