mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-30 14:04:27 +00:00
1 line
33 KiB
JSON
1 line
33 KiB
JSON
{"_name": "", "type": "pdf-document", "description": {"title": null, "abstract": null, "authors": null, "affiliations": null, "subjects": null, "keywords": null, "publication_date": null, "languages": null, "license": null, "publishers": null, "url_refs": null, "references": null, "publication": null, "reference_count": null, "citation_count": null, "citation_date": null, "advanced": null, "analytics": null, "logs": [], "collection": null, "acquisition": null}, "file-info": {"filename": "2206.01062.pdf", "filename-prov": null, "document-hash": "ea5bd3ba45359d9f21632f29ac48cd8d7931b4e3dce1595ac524a1e3e8f17c68", "#-pages": 9, "collection-name": null, "description": null, "page-hashes": [{"hash": "8953a93154d76e567fd12cbedc80fdd96acd7b95f8796fdd99e6323e9b5e62e5", "model": "default", "page": 1}, {"hash": "95fd7493687c826ad61870d95fe51c293e5ff2d0ced3852dccca2724152476ab", "model": "default", "page": 2}, {"hash": "eb5b7ec90656ea3cfa128b31b9432372311744f14c489749e696d6a2eab71cc2", "model": "default", "page": 3}, {"hash": "c21e9c23ddb16c953b61dc355143d0df64f633c9d3e9933811a01475c6361444", "model": "default", "page": 4}, {"hash": "8bdd7d75da6d0379991f2d1ec5d4593ecd41a6423d24b77d6d18f339b22c8fc2", "model": "default", "page": 5}, {"hash": "a32fa49cde50042ed0a0620f5015e210f5ef4c09508fb7a2d801ebeaa36418ba", "model": "default", "page": 6}, {"hash": "874e4b99a0c8e3ade493554d3d3dab9020e212a30b13906b54802e625fec32f8", "model": "default", "page": 7}, {"hash": "fc85d29ecb3220967463748596069586cfb6b5a9ee4196aa4a4a5c7da14cd9ca", "model": "default", "page": 8}, {"hash": "63f84ea4aeecf4daa62599747b3722a22426f99924ca5fef9424a1a7f9ba7be2", "model": "default", "page": 9}]}, "main-text": [{"prov": [{"bbox": [107.3, 692.51, 505.062, 707.772], "page": 1, "span": [0, 46], "__ref_s3_data": null}], "text": "DocLayNet: A Large Human-Annotated Dataset for", "type": "subtitle-level-1", "payload": null, "name": "Section-header", "font": null}, {"prov": [{"bbox": [102.06, 647.352, 182.638, 657.754], "page": 1, "span": [0, 16], "__ref_s3_data": null}], "text": "Birgit Pfitzmann", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [268.624, 647.352, 344.599, 657.754], "page": 1, "span": [0, 14], "__ref_s3_data": null}], "text": "Christoph Auer", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [437.693, 647.352, 503.602, 657.754], "page": 1, "span": [0, 13], "__ref_s3_data": null}], "text": "Michele Dolfi", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [182.268, 588.967, 265.393, 599.369], "page": 1, "span": [0, 15], "__ref_s3_data": null}], "text": "Ahmed S. Nassar", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [53.798, 534.292, 111.944, 543.359], "page": 1, "span": [0, 8], "__ref_s3_data": null}], "text": "ABSTRACT", "type": "subtitle-level-1", "payload": null, "name": "Section-header", "font": null}, {"prov": [{"bbox": [53.484, 520.864, 295.556, 528.665], "page": 1, "span": [0, 64], "__ref_s3_data": null}], "text": "Accurate document layout analysis is a key requirement for high-", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [53.798, 230.99800000000005, 134.82, 240.06500000000005], "page": 1, "span": [0, 12], "__ref_s3_data": null}], "text": "CCS CONCEPTS", "type": "subtitle-level-1", "payload": null, "name": "Section-header", "font": null}, {"prov": [{"bbox": [53.798, 217.56899999999996, 56.945, 225.37], "page": 1, "span": [0, 1], "__ref_s3_data": null}], "text": "\u2022", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [53.798, 151.19900000000007, 294.177, 157.26700000000005], "page": 1, "span": [0, 125], "__ref_s3_data": null}, {"bbox": [53.798, 151.19900000000007, 294.177, 157.26700000000005], "page": 1, "span": [0, 125], "__ref_s3_data": null}], "text": "Permission to make digital or hard copies of part or all of this work for personal or https://doi.org/10.1145/3534678.3539043", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [361.528, 588.967, 414.848, 599.369], "page": 1, "span": [0, 11], "__ref_s3_data": null}], "text": "Peter Staar", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"name": "Picture", "type": "figure", "$ref": "#/figures/0"}, {"prov": [{"bbox": [317.955, 189.529, 379.821, 198.596], "page": 1, "span": [0, 8], "__ref_s3_data": null}], "text": "KEYWORDS", "type": "subtitle-level-1", "payload": null, "name": "Section-header", "font": null}, {"prov": [{"bbox": [317.955, 176.101, 559.186, 183.90200000000004], "page": 1, "span": [0, 63], "__ref_s3_data": null}], "text": "PDF document conversion, layout segmentation, object-detection,", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [317.66, 144.63599999999997, 404.654, 151.26], "page": 1, "span": [0, 20], "__ref_s3_data": null}], "text": "ACMReference Format:", "type": "subtitle-level-1", "payload": null, "name": "Section-header", "font": null}, {"prov": [{"bbox": [317.955, 134.563, 558.353, 141.49700000000007], "page": 1, "span": [0, 75], "__ref_s3_data": null}], "text": "Birgit Pfitzmann, Christoph Auer, Michele Dolfi, Ahmed S. Nassar, and Peter", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [53.798, 696.135, 59.405, 705.202], "page": 2, "span": [0, 1], "__ref_s3_data": null}], "text": "1", "type": "subtitle-level-1", "payload": null, "name": "Section-header", "font": null}, {"prov": [{"bbox": [53.798, 672.785, 303.017, 680.586], "page": 2, "span": [0, 67], "__ref_s3_data": null}], "text": "Despite the substantial improvements achieved with machine-learning", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [63.761, 552.237, 295.564, 560.038], "page": 2, "span": [0, 62], "__ref_s3_data": null}], "text": "Akeyproblem in the process of document conversion is to under-", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [63.761, 278.265, 295.564, 286.066], "page": 2, "span": [0, 66], "__ref_s3_data": null}], "text": "In this paper, we present the DocLayNet dataset. It provides page-", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [64.708, 199.187, 74.221, 206.98800000000006], "page": 2, "span": [0, 3], "__ref_s3_data": null}], "text": "- (1)", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [64.708, 166.30999999999995, 74.221, 174.111], "page": 2, "span": [0, 3], "__ref_s3_data": null}], "text": "- (2)", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [64.708, 144.39300000000003, 74.221, 152.19399999999996], "page": 2, "span": [0, 3], "__ref_s3_data": null}], "text": "- (3)", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [64.708, 111.51599999999996, 74.221, 119.31700000000001], "page": 2, "span": [0, 3], "__ref_s3_data": null}], "text": "- (4)", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [53.672, 86.33600000000001, 56.22, 91.10299999999995], "page": 2, "span": [0, 1], "__ref_s3_data": null}], "text": "1", "type": "footnote", "payload": null, "name": "Footnote", "font": null}, {"prov": [{"bbox": [342.095, 696.405, 558.432, 704.206], "page": 2, "span": [0, 56], "__ref_s3_data": null}], "text": "This enables experimentation with annotation uncertainty", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [328.865, 674.487, 338.378, 682.288], "page": 2, "span": [0, 3], "__ref_s3_data": null}], "text": "- (5)", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [327.918, 615.793, 559.19, 623.594], "page": 2, "span": [0, 67], "__ref_s3_data": null}], "text": "All aspects outlined above are detailed in Section 3. In Section 4,", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [327.918, 560.998, 558.198, 568.799], "page": 2, "span": [0, 61], "__ref_s3_data": null}], "text": "In Section 5, we will present baseline accuracy numbers for a", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [317.955, 460.786, 323.562, 469.853], "page": 2, "span": [0, 1], "__ref_s3_data": null}], "text": "2", "type": "subtitle-level-1", "payload": null, "name": "Section-header", "font": null}, {"prov": [{"bbox": [317.525, 437.436, 559.713, 445.237], "page": 2, "span": [0, 61], "__ref_s3_data": null}], "text": "While early approaches in document-layout analysis used rule-", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [327.918, 316.888, 558.198, 324.689], "page": 2, "span": [0, 59], "__ref_s3_data": null}], "text": "Lately, new types of ML models for document-layout analysis", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [317.955, 216.67499999999995, 323.562, 225.74199999999996], "page": 2, "span": [0, 1], "__ref_s3_data": null}], "text": "3", "type": "subtitle-level-1", "payload": null, "name": "Section-header", "font": null}, {"prov": [{"bbox": [317.955, 193.32500000000005, 558.202, 201.12599999999998], "page": 2, "span": [0, 63], "__ref_s3_data": null}], "text": "DocLayNet contains 80863 PDF pages. Among these, 7059 carry two", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [327.918, 105.654, 558.198, 113.45500000000004], "page": 2, "span": [0, 61], "__ref_s3_data": null}], "text": "In addition to open intellectual property constraints for the", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [53.798, 86.90599999999995, 56.346, 91.673], "page": 3, "span": [0, 1], "__ref_s3_data": null}], "text": "2", "type": "footnote", "payload": null, "name": "Footnote", "font": null}, {"name": "Picture", "type": "figure", "$ref": "#/figures/1"}, {"prov": [{"bbox": [53.798, 501.965, 294.046, 509.766], "page": 3, "span": [0, 66], "__ref_s3_data": null}], "text": "to a minimum, since they introduce difficulties in annotation (see", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [63.761, 414.294, 295.559, 422.095], "page": 3, "span": [0, 61], "__ref_s3_data": null}], "text": "The pages in DocLayNet can be grouped into six distinct cate-", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [63.761, 271.828, 295.559, 279.629], "page": 3, "span": [0, 61], "__ref_s3_data": null}], "text": "We did not control the document selection with regard to lan-", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [63.761, 173.19799999999998, 295.564, 180.99900000000002], "page": 3, "span": [0, 62], "__ref_s3_data": null}], "text": "To ensure that future benchmarks in the document-layout analy-", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [327.918, 696.405, 558.201, 704.206], "page": 3, "span": [0, 66], "__ref_s3_data": null}], "text": "Table 1 shows the overall frequency and distribution of the labels", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [327.918, 619.693, 558.438, 627.494], "page": 3, "span": [0, 63], "__ref_s3_data": null}], "text": "In order to accommodate the different types of models currently", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [327.918, 510.104, 559.185, 517.905], "page": 3, "span": [0, 65], "__ref_s3_data": null}], "text": "Despite being cost-intense and far less scalable than automation,", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [317.955, 175.14499999999998, 323.562, 184.212], "page": 3, "span": [0, 1], "__ref_s3_data": null}], "text": "4", "type": "subtitle-level-1", "payload": null, "name": "Section-header", "font": null}, {"prov": [{"bbox": [317.686, 151.79499999999996, 558.201, 159.596], "page": 3, "span": [0, 64], "__ref_s3_data": null}], "text": "The annotation campaign was carried out in four phases. In phase", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"name": "Table", "type": "table", "$ref": "#/tables/0"}, {"prov": [{"bbox": [53.798, 229.76599999999996, 295.649, 237.21900000000005], "page": 4, "span": [0, 58], "__ref_s3_data": null}], "text": "Figure 3: Corpus Conversion Service annotation user inter-", "type": "caption", "payload": null, "name": "Caption", "font": null}, {"name": "Picture", "type": "figure", "$ref": "#/figures/2"}, {"prov": [{"bbox": [53.467, 149.47699999999998, 294.047, 157.27800000000002], "page": 4, "span": [0, 63], "__ref_s3_data": null}], "text": "we distributed the annotation workload and performed continuous", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [63.761, 105.76499999999999, 226.725, 113.21799999999996], "page": 4, "span": [0, 40], "__ref_s3_data": null}], "text": "Phase 1: Data selection and preparation.", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [317.955, 471.689, 481.034, 479.49], "page": 4, "span": [0, 46], "__ref_s3_data": null}], "text": "include publication repositories such as arXiv", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [327.918, 405.936, 558.206, 413.737], "page": 4, "span": [0, 59], "__ref_s3_data": null}], "text": "Preparation work included uploading and parsing the sourced", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [327.918, 274.553, 482.418, 282.006], "page": 4, "span": [0, 39], "__ref_s3_data": null}], "text": "Phase 2: Label selection and guideline.", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [317.955, 86.20900000000006, 320.503, 90.976], "page": 4, "span": [0, 1], "__ref_s3_data": null}], "text": "3", "type": "footnote", "payload": null, "name": "Footnote", "font": null}, {"prov": [{"bbox": [53.798, 696.405, 294.045, 704.206], "page": 5, "span": [0, 66], "__ref_s3_data": null}], "text": "the textual content of an element, which goes beyond visual layout", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [63.761, 674.487, 294.043, 682.288], "page": 5, "span": [0, 65], "__ref_s3_data": null}], "text": "At first sight, the task of visual document-layout interpretation", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [63.761, 532.022, 294.219, 539.823], "page": 5, "span": [0, 65], "__ref_s3_data": null}], "text": "Obviously, this inconsistency in annotations is not desirable for", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [64.708, 435.256, 74.221, 443.057], "page": 5, "span": [0, 3], "__ref_s3_data": null}], "text": "- (1)", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [64.708, 391.42, 74.221, 399.221], "page": 5, "span": [0, 3], "__ref_s3_data": null}], "text": "- (2)", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [64.708, 347.584, 74.221, 355.385], "page": 5, "span": [0, 3], "__ref_s3_data": null}], "text": "- (3)", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [64.708, 325.667, 74.221, 333.468], "page": 5, "span": [0, 3], "__ref_s3_data": null}], "text": "- (4)", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [64.708, 303.749, 74.221, 311.55], "page": 5, "span": [0, 3], "__ref_s3_data": null}], "text": "- (5)", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [64.708, 292.79, 74.221, 300.591], "page": 5, "span": [0, 3], "__ref_s3_data": null}], "text": "- (6)", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [53.529, 250.81799999999998, 294.043, 258.619], "page": 5, "span": [0, 62], "__ref_s3_data": null}], "text": "The complete annotation guideline is over 100 pages long and a", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [63.761, 207.10699999999997, 136.774, 214.55999999999995], "page": 5, "span": [0, 18], "__ref_s3_data": null}], "text": "Phase 3: Training.", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"name": "Picture", "type": "figure", "$ref": "#/figures/3"}, {"prov": [{"bbox": [317.623, 258.27099999999996, 558.204, 266.072], "page": 5, "span": [0, 67], "__ref_s3_data": null}], "text": "were carried out over a timeframe of 12 weeks, after which 8 of the", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [327.918, 236.47699999999998, 456.801, 243.93000000000006], "page": 5, "span": [0, 31], "__ref_s3_data": null}], "text": "Phase 4: Production annotation.", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [53.502, 696.903, 294.044, 704.356], "page": 6, "span": [0, 56], "__ref_s3_data": null}], "text": "Table 2: Prediction performance (mAP@0.5-0.95) of object", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"name": "Table", "type": "table", "$ref": "#/tables/1"}, {"prov": [{"bbox": [53.798, 412.841, 294.047, 420.642], "page": 6, "span": [0, 67], "__ref_s3_data": null}], "text": "to avoid this at any cost in order to have clear, unbiased baseline", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [53.798, 193.865, 59.405, 202.93200000000002], "page": 6, "span": [0, 1], "__ref_s3_data": null}], "text": "5", "type": "subtitle-level-1", "payload": null, "name": "Section-header", "font": null}, {"prov": [{"bbox": [53.529, 170.515, 294.049, 178.31600000000003], "page": 6, "span": [0, 65], "__ref_s3_data": null}], "text": "The primary goal of DocLayNet is to obtain high-quality ML models", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"name": "Picture", "type": "figure", "$ref": "#/figures/4"}, {"prov": [{"bbox": [317.955, 399.757, 558.204, 407.558], "page": 6, "span": [0, 62], "__ref_s3_data": null}], "text": "paper and leave the detailed evaluation of more recent methods", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [327.918, 377.839, 558.198, 385.64], "page": 6, "span": [0, 63], "__ref_s3_data": null}], "text": "In this section, we will present several aspects related to the", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [317.955, 285.173, 466.853, 294.24], "page": 6, "span": [0, 30], "__ref_s3_data": null}], "text": "Baselines for Object Detection", "type": "subtitle-level-1", "payload": null, "name": "Section-header", "font": null}, {"prov": [{"bbox": [317.955, 271.744, 558.431, 279.5450000000001], "page": 6, "span": [0, 66], "__ref_s3_data": null}], "text": "In Table 2, we present baseline experiments (given in mAP) on Mask", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [53.502, 696.903, 294.044, 704.356], "page": 7, "span": [0, 51], "__ref_s3_data": null}], "text": "Table 3: Performance of a Mask R-CNN R50 network in", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"name": "Table", "type": "table", "$ref": "#/tables/2"}, {"prov": [{"bbox": [53.798, 462.425, 131.056, 471.492], "page": 7, "span": [0, 14], "__ref_s3_data": null}], "text": "Learning Curve", "type": "subtitle-level-1", "payload": null, "name": "Section-header", "font": null}, {"prov": [{"bbox": [53.798, 448.997, 294.042, 456.798], "page": 7, "span": [0, 67], "__ref_s3_data": null}], "text": "One of the fundamental questions related to any dataset is if it is", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [53.798, 239.485, 164.329, 248.55200000000002], "page": 7, "span": [0, 22], "__ref_s3_data": null}], "text": "Impact of Class Labels", "type": "subtitle-level-1", "payload": null, "name": "Section-header", "font": null}, {"prov": [{"bbox": [53.529, 226.05700000000002, 294.043, 233.85799999999995], "page": 7, "span": [0, 64], "__ref_s3_data": null}], "text": "The choice and number of labels can have a significant effect on", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [317.659, 696.903, 558.201, 704.356], "page": 7, "span": [0, 53], "__ref_s3_data": null}], "text": "Table 4: Performance of a Mask R-CNN R50 network with", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"name": "Table", "type": "table", "$ref": "#/tables/3"}, {"prov": [{"bbox": [317.955, 452.365, 558.202, 460.166], "page": 7, "span": [0, 66], "__ref_s3_data": null}], "text": "lists in PubLayNet (grouped list-items) versus DocLayNet (separate", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [317.955, 352.6, 549.861, 361.667], "page": 7, "span": [0, 46], "__ref_s3_data": null}], "text": "Impact of Document Split in Train and Test Set", "type": "subtitle-level-1", "payload": null, "name": "Section-header", "font": null}, {"prov": [{"bbox": [317.955, 339.172, 558.369, 346.973], "page": 7, "span": [0, 59], "__ref_s3_data": null}], "text": "Many documents in DocLayNet have a unique styling. In order", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [317.955, 173.65300000000002, 418.548, 182.72000000000003], "page": 7, "span": [0, 18], "__ref_s3_data": null}], "text": "Dataset Comparison", "type": "subtitle-level-1", "payload": null, "name": "Section-header", "font": null}, {"prov": [{"bbox": [317.686, 160.22500000000002, 558.206, 168.02600000000007], "page": 7, "span": [0, 65], "__ref_s3_data": null}], "text": "Throughout this paper, we claim that DocLayNet's wider variety of", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [53.502, 696.903, 294.322, 704.356], "page": 8, "span": [0, 56], "__ref_s3_data": null}], "text": "Table 5: Prediction Performance (mAP@0.5-0.95) of a Mask", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"name": "Table", "type": "table", "$ref": "#/tables/4"}, {"prov": [{"bbox": [53.798, 392.804, 106.239, 400.712], "page": 8, "span": [0, 14], "__ref_s3_data": null}], "text": "Section-header", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [63.761, 338.054, 294.276, 345.855], "page": 8, "span": [0, 57], "__ref_s3_data": null}], "text": "For comparison of DocBank with DocLayNet, we trained only", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [53.798, 176.93399999999997, 156.005, 186.00099999999998], "page": 8, "span": [0, 19], "__ref_s3_data": null}], "text": "Example Predictions", "type": "subtitle-level-1", "payload": null, "name": "Section-header", "font": null}, {"prov": [{"bbox": [53.529, 163.505, 295.557, 171.30600000000004], "page": 8, "span": [0, 69], "__ref_s3_data": null}], "text": "To conclude this section, we illustrate the quality of layout predic-", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [317.955, 696.135, 323.562, 705.202], "page": 8, "span": [0, 1], "__ref_s3_data": null}], "text": "6", "type": "subtitle-level-1", "payload": null, "name": "Section-header", "font": null}, {"prov": [{"bbox": [317.955, 682.707, 558.204, 690.508], "page": 8, "span": [0, 66], "__ref_s3_data": null}], "text": "In this paper, we presented the DocLayNet dataset. It provides the", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [327.918, 595.035, 558.198, 602.836], "page": 8, "span": [0, 59], "__ref_s3_data": null}], "text": "From the dataset, we have derived on the one hand reference", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [327.918, 496.405, 558.435, 504.206], "page": 8, "span": [0, 62], "__ref_s3_data": null}], "text": "To date, there is still a significant gap between human and ML", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [317.955, 446.903, 387.37, 455.97], "page": 8, "span": [0, 10], "__ref_s3_data": null}], "text": "REFERENCES", "type": "subtitle-level-1", "payload": null, "name": "Section-header", "font": null}, {"prov": [{"bbox": [321.198, 436.895, 329.406, 442.963], "page": 8, "span": [0, 3], "__ref_s3_data": null}], "text": "- [1]", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [321.198, 412.985, 329.406, 419.053], "page": 8, "span": [0, 3], "__ref_s3_data": null}], "text": "- [2]", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [321.198, 381.105, 329.406, 387.173], "page": 8, "span": [0, 3], "__ref_s3_data": null}], "text": "- [3]", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [321.198, 357.194, 329.406, 363.262], "page": 8, "span": [0, 3], "__ref_s3_data": null}], "text": "- [4]", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [321.198, 325.314, 329.406, 331.382], "page": 8, "span": [0, 3], "__ref_s3_data": null}], "text": "- [5]", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [321.198, 293.433, 329.406, 299.501], "page": 8, "span": [0, 3], "__ref_s3_data": null}], "text": "- [6]", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [321.198, 269.523, 329.406, 275.591], "page": 8, "span": [0, 3], "__ref_s3_data": null}], "text": "- [7]", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [321.198, 229.67200000000003, 329.406, 235.74], "page": 8, "span": [0, 3], "__ref_s3_data": null}], "text": "- [8]", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [321.198, 205.76199999999994, 329.406, 211.83000000000004], "page": 8, "span": [0, 3], "__ref_s3_data": null}], "text": "- [9]", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [317.955, 173.88200000000006, 329.406, 179.95000000000005], "page": 8, "span": [0, 4], "__ref_s3_data": null}], "text": "- [10]", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [317.955, 157.94100000000003, 329.406, 164.00900000000001], "page": 8, "span": [0, 4], "__ref_s3_data": null}], "text": "- [11]", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [317.955, 134.03099999999995, 329.406, 140.09899999999993], "page": 8, "span": [0, 4], "__ref_s3_data": null}], "text": "- [12]", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [317.955, 110.12099999999998, 329.406, 116.18899999999996], "page": 8, "span": [0, 4], "__ref_s3_data": null}], "text": "- [13]", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"name": "Picture", "type": "figure", "$ref": "#/figures/5"}, {"prov": [{"bbox": [53.798, 319.294, 558.203, 326.747], "page": 9, "span": [0, 120], "__ref_s3_data": null}], "text": "Figure 6: Example layout predictions on selected pages from the DocLayNet test-set. (A, D) exhibit favourable results on", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [69.234, 258.53099999999995, 295.224, 264.59899999999993], "page": 9, "span": [0, 76], "__ref_s3_data": null}], "text": "Diaconu, Mai Thanh Minh, Marc, albinxavi, fatih, oleg, and wanghao yang. ul-", "type": "paragraph", "payload": null, "name": "Text", "font": null}, {"prov": [{"bbox": [317.955, 258.53099999999995, 329.406, 264.59899999999993], "page": 9, "span": [0, 4], "__ref_s3_data": null}], "text": "- [20]", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [317.955, 242.591, 329.406, 248.659], "page": 9, "span": [0, 4], "__ref_s3_data": null}], "text": "- [21]", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [317.955, 218.68100000000004, 329.406, 224.74900000000002], "page": 9, "span": [0, 4], "__ref_s3_data": null}], "text": "- [22]", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [317.955, 186.79999999999995, 329.406, 192.86799999999994], "page": 9, "span": [0, 4], "__ref_s3_data": null}], "text": "- [23]", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [53.798, 234.62099999999998, 65.249, 240.68899999999996], "page": 9, "span": [0, 4], "__ref_s3_data": null}], "text": "- [14]", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [53.798, 210.711, 65.249, 216.779], "page": 9, "span": [0, 4], "__ref_s3_data": null}], "text": "- [15]", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [53.798, 194.76999999999998, 65.249, 200.83799999999997], "page": 9, "span": [0, 4], "__ref_s3_data": null}], "text": "- [16]", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [53.798, 170.86, 65.249, 176.928], "page": 9, "span": [0, 4], "__ref_s3_data": null}], "text": "- [17]", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [53.798, 154.91999999999996, 65.249, 160.98800000000006], "page": 9, "span": [0, 4], "__ref_s3_data": null}], "text": "- [18]", "type": "paragraph", "payload": null, "name": "List-item", "font": null}, {"prov": [{"bbox": [53.798, 115.06899999999996, 65.249, 121.13699999999994], "page": 9, "span": [0, 4], "__ref_s3_data": null}], "text": "- [19]", "type": "paragraph", "payload": null, "name": "List-item", "font": null}], "figures": [{"prov": [{"bbox": [323.408203125, 266.1492919921875, 553.2952270507812, 541.6512603759766], "page": 1, "span": [0, 9], "__ref_s3_data": null}], "text": "Figure 1:", "type": "figure", "payload": null, "bounding-box": null}, {"prov": [{"bbox": [88.33030700683594, 571.4317321777344, 263.7049560546875, 699.1134796142578], "page": 3, "span": [0, 57], "__ref_s3_data": null}], "text": "Figure 2: Distribution of DocLayNet pages across document", "type": "figure", "payload": null, "bounding-box": null}, {"prov": [{"bbox": [53.05912780761719, 251.135986328125, 295.8506164550781, 481.2087097167969], "page": 4, "span": [0, 123], "__ref_s3_data": null}], "text": "Table 1: DocLayNet dataset overview. Along with the frequency of each class label, we present the relative occurrence (as %", "type": "figure", "payload": null, "bounding-box": null}, {"prov": [{"bbox": [315.960205078125, 332.31915283203125, 559.396484375, 706.6611862182617], "page": 5, "span": [0, 59], "__ref_s3_data": null}], "text": "Figure 4: Examples of plausible annotation alternatives for", "type": "figure", "payload": null, "bounding-box": null}, {"prov": [{"bbox": [323.48431396484375, 531.9892272949219, 553.5411376953125, 702.1139678955078], "page": 6, "span": [0, 57], "__ref_s3_data": null}], "text": "Figure 5: Prediction performance (mAP@0.5-0.95) of a Mask", "type": "figure", "payload": null, "bounding-box": null}, {"prov": [{"bbox": [52.963985443115234, 349.8648681640625, 556.931640625, 707.2641143798828], "page": 9, "span": [0, 4], "__ref_s3_data": null}], "text": "Text", "type": "figure", "payload": null, "bounding-box": null}], "tables": [{"prov": [{"bbox": [98.93103790283203, 497.91851806640625, 512.579833984375, 654.5245208740234], "page": 4, "span": [0, 0], "__ref_s3_data": null}], "text": "Figure 3: Corpus Conversion Service annotation user inter-", "type": "table", "payload": null, "#-cols": 0, "#-rows": 0, "data": [], "model": null, "bounding-box": null}, {"prov": [{"bbox": [62.02753829956055, 440.3381042480469, 285.78955078125, 596.3199310302734], "page": 6, "span": [0, 0], "__ref_s3_data": null}], "text": "", "type": "table", "payload": null, "#-cols": 0, "#-rows": 0, "data": [], "model": null, "bounding-box": null}, {"prov": [{"bbox": [80.35525512695312, 496.5545349121094, 267.0082092285156, 641.0637054443359], "page": 7, "span": [0, 0], "__ref_s3_data": null}], "text": "", "type": "table", "payload": null, "#-cols": 0, "#-rows": 0, "data": [], "model": null, "bounding-box": null}, {"prov": [{"bbox": [352.97747802734375, 485.7341613769531, 522.9158935546875, 641.208740234375], "page": 7, "span": [0, 0], "__ref_s3_data": null}], "text": "", "type": "table", "payload": null, "#-cols": 0, "#-rows": 0, "data": [], "model": null, "bounding-box": null}, {"prov": [{"bbox": [72.6590347290039, 452.1459655761719, 274.83465576171875, 619.5191955566406], "page": 8, "span": [0, 0], "__ref_s3_data": null}], "text": "", "type": "table", "payload": null, "#-cols": 0, "#-rows": 0, "data": [], "model": null, "bounding-box": null}], "bitmaps": null, "equations": [], "footnotes": [], "page-dimensions": [{"height": 792.0, "page": 1, "width": 612.0}, {"height": 792.0, "page": 2, "width": 612.0}, {"height": 792.0, "page": 3, "width": 612.0}, {"height": 792.0, "page": 4, "width": 612.0}, {"height": 792.0, "page": 5, "width": 612.0}, {"height": 792.0, "page": 6, "width": 612.0}, {"height": 792.0, "page": 7, "width": 612.0}, {"height": 792.0, "page": 8, "width": 612.0}, {"height": 792.0, "page": 9, "width": 612.0}], "page-footers": [], "page-headers": [], "_s3_data": null, "identifiers": null} |