From 3a76433b83cd72405713a582a8da5cd7fa86bac7 Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Tue, 10 Jun 2025 09:52:31 +0200 Subject: [PATCH] Update test files Signed-off-by: Christoph Auer --- .../docling_v2/test_emf_docx.docx.itxt | 11 +- .../docling_v2/test_emf_docx.docx.json | 53 +- .../groundtruth/docling_v2/textbox.docx.itxt | 127 ++- .../groundtruth/docling_v2/textbox.docx.json | 800 +++++++++--------- .../groundtruth/docling_v2/textbox.docx.md | 8 +- .../docling_v2/word_sample.docx.itxt | 49 +- .../docling_v2/word_sample.docx.json | 117 +-- 7 files changed, 604 insertions(+), 561 deletions(-) diff --git a/tests/data/groundtruth/docling_v2/test_emf_docx.docx.itxt b/tests/data/groundtruth/docling_v2/test_emf_docx.docx.itxt index 220b5533..346093c2 100644 --- a/tests/data/groundtruth/docling_v2/test_emf_docx.docx.itxt +++ b/tests/data/groundtruth/docling_v2/test_emf_docx.docx.itxt @@ -2,7 +2,10 @@ item-0 at level 0: unspecified: group _root_ item-1 at level 1: paragraph: Test with three images in unusual formats item-2 at level 1: paragraph: Raster in emf: item-3 at level 1: picture - item-4 at level 1: paragraph: Vector in emf: - item-5 at level 1: picture - item-6 at level 1: paragraph: Raster in webp: - item-7 at level 1: picture \ No newline at end of file + item-4 at level 1: paragraph: + item-5 at level 1: paragraph: Vector in emf: + item-6 at level 1: picture + item-7 at level 1: paragraph: + item-8 at level 1: paragraph: Raster in webp: + item-9 at level 1: picture + item-10 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/test_emf_docx.docx.json b/tests/data/groundtruth/docling_v2/test_emf_docx.docx.json index 65d0d30d..34e41094 100644 --- a/tests/data/groundtruth/docling_v2/test_emf_docx.docx.json +++ b/tests/data/groundtruth/docling_v2/test_emf_docx.docx.json @@ -29,14 +29,23 @@ { "$ref": "#/texts/2" }, - { - "$ref": "#/pictures/1" - }, { "$ref": "#/texts/3" }, + { + "$ref": "#/pictures/1" + }, + { + "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" + }, { "$ref": "#/pictures/2" + }, + { + "$ref": "#/texts/6" } ], "content_layer": "body", @@ -90,6 +99,18 @@ "content_layer": "body", "label": "paragraph", "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], "orig": "Vector in emf:", "text": "Vector in emf:", "formatting": { @@ -100,7 +121,19 @@ } }, { - "self_ref": "#/texts/3", + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/5", "parent": { "$ref": "#/body" }, @@ -116,6 +149,18 @@ "underline": false, "strikethrough": false } + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" } ], "pictures": [ diff --git a/tests/data/groundtruth/docling_v2/textbox.docx.itxt b/tests/data/groundtruth/docling_v2/textbox.docx.itxt index e17e2be2..406de95f 100644 --- a/tests/data/groundtruth/docling_v2/textbox.docx.itxt +++ b/tests/data/groundtruth/docling_v2/textbox.docx.itxt @@ -5,92 +5,89 @@ item-0 at level 0: unspecified: group _root_ item-4 at level 1: section: group textbox item-5 at level 2: paragraph: Student falls ill item-6 at level 2: paragraph: - item-7 at level 2: paragraph: - item-8 at level 2: list: group list - item-9 at level 3: list_item: Suggested Reportable Symptoms: + item-7 at level 2: list: group list + item-8 at level 3: list_item: Suggested Reportable Symptoms: * ... sh * Blisters * Headache * Sore throat - item-10 at level 1: list_item: + item-9 at level 1: list_item: + item-10 at level 1: paragraph: item-11 at level 1: paragraph: - item-12 at level 1: paragraph: - item-13 at level 1: section: group textbox - item-14 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms + item-12 at level 1: section: group textbox + item-13 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms + item-14 at level 1: paragraph: item-15 at level 1: paragraph: item-16 at level 1: paragraph: item-17 at level 1: paragraph: - item-18 at level 1: paragraph: - item-19 at level 1: section: group textbox - item-20 at level 2: paragraph: Yes + item-18 at level 1: section: group textbox + item-19 at level 2: paragraph: Yes + item-20 at level 1: paragraph: item-21 at level 1: paragraph: - item-22 at level 1: paragraph: - item-23 at level 1: section: group textbox - item-24 at level 2: list: group list - item-25 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network. - item-26 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System. - item-27 at level 2: paragraph: - item-28 at level 2: paragraph: - item-29 at level 1: list: group list - item-30 at level 2: list_item: + item-22 at level 1: section: group textbox + item-23 at level 2: list: group list + item-24 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network. + item-25 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System. + item-26 at level 2: paragraph: + item-27 at level 1: list: group list + item-28 at level 2: list_item: + item-29 at level 1: paragraph: + item-30 at level 1: paragraph: item-31 at level 1: paragraph: item-32 at level 1: paragraph: item-33 at level 1: paragraph: - item-34 at level 1: paragraph: - item-35 at level 1: paragraph: - item-36 at level 1: section: group textbox - item-37 at level 2: paragraph: Health Bureau: - item-38 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control. - item-39 at level 2: list: group list - item-40 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection. - item-41 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act. - item-42 at level 2: paragraph: - item-43 at level 2: paragraph: - item-44 at level 1: list: group list - item-45 at level 2: list_item: - item-46 at level 1: paragraph: - item-47 at level 1: section: group textbox - item-48 at level 2: paragraph: Department of Education: + item-34 at level 1: section: group textbox + item-35 at level 2: paragraph: Health Bureau: + item-36 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control. + item-37 at level 2: list: group list + item-38 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection. + item-39 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act. + item-40 at level 2: paragraph: + item-41 at level 1: list: group list + item-42 at level 2: list_item: + item-43 at level 1: paragraph: + item-44 at level 1: section: group textbox + item-45 at level 2: paragraph: Department of Education: Collabo ... vention measures at all school levels. + item-46 at level 1: paragraph: + item-47 at level 1: paragraph: + item-48 at level 1: paragraph: item-49 at level 1: paragraph: item-50 at level 1: paragraph: item-51 at level 1: paragraph: item-52 at level 1: paragraph: - item-53 at level 1: paragraph: - item-54 at level 1: paragraph: - item-55 at level 1: paragraph: - item-56 at level 1: section: group textbox - item-57 at level 2: inline: group group - item-58 at level 3: paragraph: The Health Bureau will handle - item-59 at level 3: paragraph: reporting and specimen collection - item-60 at level 3: paragraph: . - item-61 at level 2: paragraph: - item-62 at level 2: paragraph: - item-63 at level 1: paragraph: - item-64 at level 1: paragraph: + item-53 at level 1: section: group textbox + item-54 at level 2: inline: group group + item-55 at level 3: paragraph: The Health Bureau will handle + item-56 at level 3: paragraph: reporting and specimen collection + item-57 at level 3: paragraph: . + item-58 at level 2: paragraph: + item-59 at level 1: paragraph: + item-60 at level 1: paragraph: + item-61 at level 1: paragraph: + item-62 at level 1: section: group textbox + item-63 at level 2: paragraph: Whether the epidemic has eased. + item-64 at level 2: paragraph: item-65 at level 1: paragraph: item-66 at level 1: section: group textbox - item-67 at level 2: paragraph: Whether the epidemic has eased. - item-68 at level 2: paragraph: - item-69 at level 2: paragraph: + item-67 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease. + item-68 at level 2: paragraph: No + item-69 at level 1: paragraph: item-70 at level 1: paragraph: item-71 at level 1: section: group textbox - item-72 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease. - item-73 at level 2: paragraph: No - item-74 at level 1: paragraph: - item-75 at level 1: paragraph: - item-76 at level 1: section: group textbox + item-72 at level 2: paragraph: Yes + item-73 at level 1: paragraph: + item-74 at level 1: section: group textbox + item-75 at level 2: paragraph: Yes + item-76 at level 1: paragraph: item-77 at level 1: paragraph: item-78 at level 1: section: group textbox - item-79 at level 1: paragraph: - item-80 at level 1: paragraph: - item-81 at level 1: section: group textbox - item-82 at level 2: paragraph: Case closed. - item-83 at level 2: paragraph: - item-84 at level 2: paragraph: - item-85 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary. + item-79 at level 2: paragraph: Case closed. + item-80 at level 2: paragraph: + item-81 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary. + item-82 at level 1: paragraph: + item-83 at level 1: section: group textbox + item-84 at level 2: paragraph: No + item-85 at level 1: paragraph: item-86 at level 1: paragraph: - item-87 at level 1: section: group textbox - item-88 at level 1: paragraph: - item-89 at level 1: paragraph: - item-90 at level 1: paragraph: \ No newline at end of file + item-87 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/textbox.docx.json b/tests/data/groundtruth/docling_v2/textbox.docx.json index 743fb578..840e937a 100644 --- a/tests/data/groundtruth/docling_v2/textbox.docx.json +++ b/tests/data/groundtruth/docling_v2/textbox.docx.json @@ -29,6 +29,9 @@ { "$ref": "#/groups/0" }, + { + "$ref": "#/texts/6" + }, { "$ref": "#/texts/7" }, @@ -36,10 +39,10 @@ "$ref": "#/texts/8" }, { - "$ref": "#/texts/9" + "$ref": "#/groups/2" }, { - "$ref": "#/groups/2" + "$ref": "#/texts/10" }, { "$ref": "#/texts/11" @@ -50,17 +53,14 @@ { "$ref": "#/texts/13" }, - { - "$ref": "#/texts/14" - }, { "$ref": "#/groups/3" }, { - "$ref": "#/texts/16" + "$ref": "#/texts/15" }, { - "$ref": "#/texts/17" + "$ref": "#/texts/16" }, { "$ref": "#/groups/4" @@ -68,6 +68,12 @@ { "$ref": "#/groups/6" }, + { + "$ref": "#/texts/21" + }, + { + "$ref": "#/texts/22" + }, { "$ref": "#/texts/23" }, @@ -77,12 +83,6 @@ { "$ref": "#/texts/25" }, - { - "$ref": "#/texts/26" - }, - { - "$ref": "#/texts/27" - }, { "$ref": "#/groups/7" }, @@ -90,11 +90,20 @@ "$ref": "#/groups/9" }, { - "$ref": "#/texts/35" + "$ref": "#/texts/32" }, { "$ref": "#/groups/10" }, + { + "$ref": "#/texts/34" + }, + { + "$ref": "#/texts/35" + }, + { + "$ref": "#/texts/36" + }, { "$ref": "#/texts/37" }, @@ -107,74 +116,65 @@ { "$ref": "#/texts/40" }, - { - "$ref": "#/texts/41" - }, - { - "$ref": "#/texts/42" - }, - { - "$ref": "#/texts/43" - }, { "$ref": "#/groups/11" }, { - "$ref": "#/texts/49" + "$ref": "#/texts/45" }, { - "$ref": "#/texts/50" + "$ref": "#/texts/46" }, { - "$ref": "#/texts/51" + "$ref": "#/texts/47" }, { "$ref": "#/groups/13" }, { - "$ref": "#/texts/55" + "$ref": "#/texts/50" }, { "$ref": "#/groups/14" }, + { + "$ref": "#/texts/53" + }, + { + "$ref": "#/texts/54" + }, + { + "$ref": "#/groups/15" + }, + { + "$ref": "#/texts/56" + }, + { + "$ref": "#/groups/16" + }, { "$ref": "#/texts/58" }, { "$ref": "#/texts/59" }, - { - "$ref": "#/groups/15" - }, - { - "$ref": "#/texts/60" - }, - { - "$ref": "#/groups/16" - }, - { - "$ref": "#/texts/61" - }, - { - "$ref": "#/texts/62" - }, { "$ref": "#/groups/17" }, { - "$ref": "#/texts/67" + "$ref": "#/texts/63" }, { "$ref": "#/groups/18" }, { - "$ref": "#/texts/68" + "$ref": "#/texts/65" }, { - "$ref": "#/texts/69" + "$ref": "#/texts/66" }, { - "$ref": "#/texts/70" + "$ref": "#/texts/67" } ], "content_layer": "body", @@ -194,9 +194,6 @@ { "$ref": "#/texts/4" }, - { - "$ref": "#/texts/5" - }, { "$ref": "#/groups/1" } @@ -212,7 +209,7 @@ }, "children": [ { - "$ref": "#/texts/6" + "$ref": "#/texts/5" } ], "content_layer": "body", @@ -226,7 +223,7 @@ }, "children": [ { - "$ref": "#/texts/10" + "$ref": "#/texts/9" } ], "content_layer": "body", @@ -240,7 +237,7 @@ }, "children": [ { - "$ref": "#/texts/15" + "$ref": "#/texts/14" } ], "content_layer": "body", @@ -257,10 +254,7 @@ "$ref": "#/groups/5" }, { - "$ref": "#/texts/20" - }, - { - "$ref": "#/texts/21" + "$ref": "#/texts/19" } ], "content_layer": "body", @@ -274,10 +268,10 @@ }, "children": [ { - "$ref": "#/texts/18" + "$ref": "#/texts/17" }, { - "$ref": "#/texts/19" + "$ref": "#/texts/18" } ], "content_layer": "body", @@ -291,7 +285,7 @@ }, "children": [ { - "$ref": "#/texts/22" + "$ref": "#/texts/20" } ], "content_layer": "body", @@ -305,19 +299,16 @@ }, "children": [ { - "$ref": "#/texts/28" + "$ref": "#/texts/26" }, { - "$ref": "#/texts/29" + "$ref": "#/texts/27" }, { "$ref": "#/groups/8" }, { - "$ref": "#/texts/32" - }, - { - "$ref": "#/texts/33" + "$ref": "#/texts/30" } ], "content_layer": "body", @@ -331,10 +322,10 @@ }, "children": [ { - "$ref": "#/texts/30" + "$ref": "#/texts/28" }, { - "$ref": "#/texts/31" + "$ref": "#/texts/29" } ], "content_layer": "body", @@ -348,7 +339,7 @@ }, "children": [ { - "$ref": "#/texts/34" + "$ref": "#/texts/31" } ], "content_layer": "body", @@ -362,7 +353,7 @@ }, "children": [ { - "$ref": "#/texts/36" + "$ref": "#/texts/33" } ], "content_layer": "body", @@ -379,10 +370,7 @@ "$ref": "#/groups/12" }, { - "$ref": "#/texts/47" - }, - { - "$ref": "#/texts/48" + "$ref": "#/texts/44" } ], "content_layer": "body", @@ -396,13 +384,13 @@ }, "children": [ { - "$ref": "#/texts/44" + "$ref": "#/texts/41" }, { - "$ref": "#/texts/45" + "$ref": "#/texts/42" }, { - "$ref": "#/texts/46" + "$ref": "#/texts/43" } ], "content_layer": "body", @@ -416,13 +404,10 @@ }, "children": [ { - "$ref": "#/texts/52" + "$ref": "#/texts/48" }, { - "$ref": "#/texts/53" - }, - { - "$ref": "#/texts/54" + "$ref": "#/texts/49" } ], "content_layer": "body", @@ -436,10 +421,10 @@ }, "children": [ { - "$ref": "#/texts/56" + "$ref": "#/texts/51" }, { - "$ref": "#/texts/57" + "$ref": "#/texts/52" } ], "content_layer": "body", @@ -451,7 +436,11 @@ "parent": { "$ref": "#/body" }, - "children": [], + "children": [ + { + "$ref": "#/texts/55" + } + ], "content_layer": "body", "name": "textbox", "label": "section" @@ -461,7 +450,11 @@ "parent": { "$ref": "#/body" }, - "children": [], + "children": [ + { + "$ref": "#/texts/57" + } + ], "content_layer": "body", "name": "textbox", "label": "section" @@ -473,16 +466,13 @@ }, "children": [ { - "$ref": "#/texts/63" + "$ref": "#/texts/60" }, { - "$ref": "#/texts/64" + "$ref": "#/texts/61" }, { - "$ref": "#/texts/65" - }, - { - "$ref": "#/texts/66" + "$ref": "#/texts/62" } ], "content_layer": "body", @@ -494,7 +484,11 @@ "parent": { "$ref": "#/body" }, - "children": [], + "children": [ + { + "$ref": "#/texts/64" + } + ], "content_layer": "body", "name": "textbox", "label": "section" @@ -581,18 +575,6 @@ }, { "self_ref": "#/texts/5", - "parent": { - "$ref": "#/groups/0" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/6", "parent": { "$ref": "#/groups/1" }, @@ -612,7 +594,7 @@ "marker": "-" }, { - "self_ref": "#/texts/7", + "self_ref": "#/texts/6", "parent": { "$ref": "#/body" }, @@ -625,6 +607,18 @@ "enumerated": false, "marker": "-" }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/8", "parent": { @@ -639,18 +633,6 @@ }, { "self_ref": "#/texts/9", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/10", "parent": { "$ref": "#/groups/2" }, @@ -667,6 +649,18 @@ "strikethrough": false } }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/11", "parent": { @@ -705,18 +699,6 @@ }, { "self_ref": "#/texts/14", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/15", "parent": { "$ref": "#/groups/3" }, @@ -733,6 +715,18 @@ "strikethrough": false } }, + { + "self_ref": "#/texts/15", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/16", "parent": { @@ -747,18 +741,6 @@ }, { "self_ref": "#/texts/17", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/18", "parent": { "$ref": "#/groups/5" }, @@ -778,7 +760,7 @@ "marker": "-" }, { - "self_ref": "#/texts/19", + "self_ref": "#/texts/18", "parent": { "$ref": "#/groups/5" }, @@ -797,32 +779,20 @@ "enumerated": false, "marker": "-" }, + { + "self_ref": "#/texts/19", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/20", - "parent": { - "$ref": "#/groups/4" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/21", - "parent": { - "$ref": "#/groups/4" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/22", "parent": { "$ref": "#/groups/6" }, @@ -835,6 +805,30 @@ "enumerated": false, "marker": "-" }, + { + "self_ref": "#/texts/21", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/22", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/23", "parent": { @@ -873,30 +867,6 @@ }, { "self_ref": "#/texts/26", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/27", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/28", "parent": { "$ref": "#/groups/7" }, @@ -914,7 +884,7 @@ } }, { - "self_ref": "#/texts/29", + "self_ref": "#/texts/27", "parent": { "$ref": "#/groups/7" }, @@ -932,7 +902,7 @@ } }, { - "self_ref": "#/texts/30", + "self_ref": "#/texts/28", "parent": { "$ref": "#/groups/8" }, @@ -952,7 +922,7 @@ "marker": "-" }, { - "self_ref": "#/texts/31", + "self_ref": "#/texts/29", "parent": { "$ref": "#/groups/8" }, @@ -972,7 +942,7 @@ "marker": "-" }, { - "self_ref": "#/texts/32", + "self_ref": "#/texts/30", "parent": { "$ref": "#/groups/7" }, @@ -984,19 +954,7 @@ "text": "" }, { - "self_ref": "#/texts/33", - "parent": { - "$ref": "#/groups/7" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/34", + "self_ref": "#/texts/31", "parent": { "$ref": "#/groups/9" }, @@ -1009,6 +967,48 @@ "enumerated": false, "marker": "-" }, + { + "self_ref": "#/texts/32", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/33", + "parent": { + "$ref": "#/groups/10" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.", + "text": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/34", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/35", "parent": { @@ -1024,20 +1024,14 @@ { "self_ref": "#/texts/36", "parent": { - "$ref": "#/groups/10" + "$ref": "#/body" }, "children": [], "content_layer": "body", "label": "paragraph", "prov": [], - "orig": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.", - "text": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false - } + "orig": "", + "text": "" }, { "self_ref": "#/texts/37", @@ -1089,42 +1083,6 @@ }, { "self_ref": "#/texts/41", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/42", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/43", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/44", "parent": { "$ref": "#/groups/12" }, @@ -1142,7 +1100,7 @@ } }, { - "self_ref": "#/texts/45", + "self_ref": "#/texts/42", "parent": { "$ref": "#/groups/12" }, @@ -1160,7 +1118,7 @@ } }, { - "self_ref": "#/texts/46", + "self_ref": "#/texts/43", "parent": { "$ref": "#/groups/12" }, @@ -1178,7 +1136,7 @@ } }, { - "self_ref": "#/texts/47", + "self_ref": "#/texts/44", "parent": { "$ref": "#/groups/11" }, @@ -1189,22 +1147,64 @@ "orig": "", "text": "" }, + { + "self_ref": "#/texts/45", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/46", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/47", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/48", "parent": { - "$ref": "#/groups/11" + "$ref": "#/groups/13" }, "children": [], "content_layer": "body", "label": "paragraph", "prov": [], - "orig": "", - "text": "" + "orig": "Whether the epidemic has eased.", + "text": "Whether the epidemic has eased.", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } }, { "self_ref": "#/texts/49", "parent": { - "$ref": "#/body" + "$ref": "#/groups/13" }, "children": [], "content_layer": "body", @@ -1227,72 +1227,6 @@ }, { "self_ref": "#/texts/51", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/52", - "parent": { - "$ref": "#/groups/13" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "Whether the epidemic has eased.", - "text": "Whether the epidemic has eased.", - "formatting": { - "bold": true, - "italic": false, - "underline": false, - "strikethrough": false - } - }, - { - "self_ref": "#/texts/53", - "parent": { - "$ref": "#/groups/13" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/54", - "parent": { - "$ref": "#/groups/13" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/55", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/56", "parent": { "$ref": "#/groups/14" }, @@ -1310,7 +1244,7 @@ } }, { - "self_ref": "#/texts/57", + "self_ref": "#/texts/52", "parent": { "$ref": "#/groups/14" }, @@ -1327,6 +1261,78 @@ "strikethrough": false } }, + { + "self_ref": "#/texts/53", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/54", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/55", + "parent": { + "$ref": "#/groups/15" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Yes", + "text": "Yes", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/56", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/57", + "parent": { + "$ref": "#/groups/16" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Yes", + "text": "Yes", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, { "self_ref": "#/texts/58", "parent": { @@ -1353,42 +1359,6 @@ }, { "self_ref": "#/texts/60", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/61", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/62", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/63", "parent": { "$ref": "#/groups/17" }, @@ -1406,7 +1376,7 @@ } }, { - "self_ref": "#/texts/64", + "self_ref": "#/texts/61", "parent": { "$ref": "#/groups/17" }, @@ -1418,19 +1388,7 @@ "text": "" }, { - "self_ref": "#/texts/65", - "parent": { - "$ref": "#/groups/17" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/66", + "self_ref": "#/texts/62", "parent": { "$ref": "#/groups/17" }, @@ -1447,6 +1405,60 @@ "strikethrough": false } }, + { + "self_ref": "#/texts/63", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/64", + "parent": { + "$ref": "#/groups/18" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "No", + "text": "No", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/65", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/66", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/67", "parent": { @@ -1458,42 +1470,6 @@ "prov": [], "orig": "", "text": "" - }, - { - "self_ref": "#/texts/68", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/69", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/70", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" } ], "pictures": [], diff --git a/tests/data/groundtruth/docling_v2/textbox.docx.md b/tests/data/groundtruth/docling_v2/textbox.docx.md index 9458bd0c..293c4d8c 100644 --- a/tests/data/groundtruth/docling_v2/textbox.docx.md +++ b/tests/data/groundtruth/docling_v2/textbox.docx.md @@ -40,6 +40,12 @@ The Health Bureau will handle **reporting and specimen collection** . No +Yes + +Yes + **Case closed.** -The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary. \ No newline at end of file +The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary. + +No \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_sample.docx.itxt b/tests/data/groundtruth/docling_v2/word_sample.docx.itxt index ce60ad26..b4d98b44 100644 --- a/tests/data/groundtruth/docling_v2/word_sample.docx.itxt +++ b/tests/data/groundtruth/docling_v2/word_sample.docx.itxt @@ -3,27 +3,28 @@ item-0 at level 0: unspecified: group _root_ item-2 at level 1: title: Swimming in the lake item-3 at level 2: paragraph: Duck item-4 at level 2: picture - item-5 at level 2: paragraph: Figure 1: This is a cute duckling - item-6 at level 2: section_header: Let’s swim! - item-7 at level 3: paragraph: To get started with swimming, fi ... down in a water and try not to drown: - item-8 at level 3: list: group list - item-9 at level 4: list_item: You can relax and look around - item-10 at level 4: list_item: Paddle about - item-11 at level 4: list_item: Enjoy summer warmth - item-12 at level 3: paragraph: Also, don’t forget: - item-13 at level 3: list: group list - item-14 at level 4: list_item: Wear sunglasses - item-15 at level 4: list_item: Don’t forget to drink water - item-16 at level 4: list_item: Use sun cream - item-17 at level 3: paragraph: Hmm, what else… - item-18 at level 3: section_header: Let’s eat - item-19 at level 4: paragraph: After we had a good day of swimm ... , it’s important to eat something nice - item-20 at level 4: paragraph: I like to eat leaves - item-21 at level 4: paragraph: Here are some interesting things a respectful duck could eat: - item-22 at level 4: table with [4x3] - item-23 at level 4: paragraph: - item-24 at level 4: paragraph: And let’s add another list in the end: - item-25 at level 4: list: group list - item-26 at level 5: list_item: Leaves - item-27 at level 5: list_item: Berries - item-28 at level 5: list_item: Grain \ No newline at end of file + item-5 at level 2: paragraph: + item-6 at level 2: paragraph: Figure 1: This is a cute duckling + item-7 at level 2: section_header: Let’s swim! + item-8 at level 3: paragraph: To get started with swimming, fi ... down in a water and try not to drown: + item-9 at level 3: list: group list + item-10 at level 4: list_item: You can relax and look around + item-11 at level 4: list_item: Paddle about + item-12 at level 4: list_item: Enjoy summer warmth + item-13 at level 3: paragraph: Also, don’t forget: + item-14 at level 3: list: group list + item-15 at level 4: list_item: Wear sunglasses + item-16 at level 4: list_item: Don’t forget to drink water + item-17 at level 4: list_item: Use sun cream + item-18 at level 3: paragraph: Hmm, what else… + item-19 at level 3: section_header: Let’s eat + item-20 at level 4: paragraph: After we had a good day of swimm ... , it’s important to eat something nice + item-21 at level 4: paragraph: I like to eat leaves + item-22 at level 4: paragraph: Here are some interesting things a respectful duck could eat: + item-23 at level 4: table with [4x3] + item-24 at level 4: paragraph: + item-25 at level 4: paragraph: And let’s add another list in the end: + item-26 at level 4: list: group list + item-27 at level 5: list_item: Leaves + item-28 at level 5: list_item: Berries + item-29 at level 5: list_item: Grain \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/word_sample.docx.json b/tests/data/groundtruth/docling_v2/word_sample.docx.json index 432a5087..04feafeb 100644 --- a/tests/data/groundtruth/docling_v2/word_sample.docx.json +++ b/tests/data/groundtruth/docling_v2/word_sample.docx.json @@ -32,17 +32,17 @@ { "self_ref": "#/groups/0", "parent": { - "$ref": "#/texts/4" + "$ref": "#/texts/5" }, "children": [ - { - "$ref": "#/texts/6" - }, { "$ref": "#/texts/7" }, { "$ref": "#/texts/8" + }, + { + "$ref": "#/texts/9" } ], "content_layer": "body", @@ -52,17 +52,17 @@ { "self_ref": "#/groups/1", "parent": { - "$ref": "#/texts/4" + "$ref": "#/texts/5" }, "children": [ - { - "$ref": "#/texts/10" - }, { "$ref": "#/texts/11" }, { "$ref": "#/texts/12" + }, + { + "$ref": "#/texts/13" } ], "content_layer": "body", @@ -72,17 +72,17 @@ { "self_ref": "#/groups/2", "parent": { - "$ref": "#/texts/14" + "$ref": "#/texts/15" }, "children": [ - { - "$ref": "#/texts/20" - }, { "$ref": "#/texts/21" }, { "$ref": "#/texts/22" + }, + { + "$ref": "#/texts/23" } ], "content_layer": "body", @@ -126,6 +126,9 @@ }, { "$ref": "#/texts/4" + }, + { + "$ref": "#/texts/5" } ], "content_layer": "body", @@ -161,6 +164,18 @@ "content_layer": "body", "label": "paragraph", "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/texts/1" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], "orig": "Figure 1: This is a cute duckling", "text": "Figure 1: This is a cute duckling", "formatting": { @@ -171,28 +186,28 @@ } }, { - "self_ref": "#/texts/4", + "self_ref": "#/texts/5", "parent": { "$ref": "#/texts/1" }, "children": [ { - "$ref": "#/texts/5" + "$ref": "#/texts/6" }, { "$ref": "#/groups/0" }, { - "$ref": "#/texts/9" + "$ref": "#/texts/10" }, { "$ref": "#/groups/1" }, { - "$ref": "#/texts/13" + "$ref": "#/texts/14" }, { - "$ref": "#/texts/14" + "$ref": "#/texts/15" } ], "content_layer": "body", @@ -203,9 +218,9 @@ "level": 1 }, { - "self_ref": "#/texts/5", + "self_ref": "#/texts/6", "parent": { - "$ref": "#/texts/4" + "$ref": "#/texts/5" }, "children": [], "content_layer": "body", @@ -221,7 +236,7 @@ } }, { - "self_ref": "#/texts/6", + "self_ref": "#/texts/7", "parent": { "$ref": "#/groups/0" }, @@ -241,7 +256,7 @@ "marker": "-" }, { - "self_ref": "#/texts/7", + "self_ref": "#/texts/8", "parent": { "$ref": "#/groups/0" }, @@ -261,7 +276,7 @@ "marker": "-" }, { - "self_ref": "#/texts/8", + "self_ref": "#/texts/9", "parent": { "$ref": "#/groups/0" }, @@ -281,9 +296,9 @@ "marker": "-" }, { - "self_ref": "#/texts/9", + "self_ref": "#/texts/10", "parent": { - "$ref": "#/texts/4" + "$ref": "#/texts/5" }, "children": [], "content_layer": "body", @@ -299,7 +314,7 @@ } }, { - "self_ref": "#/texts/10", + "self_ref": "#/texts/11", "parent": { "$ref": "#/groups/1" }, @@ -319,7 +334,7 @@ "marker": "-" }, { - "self_ref": "#/texts/11", + "self_ref": "#/texts/12", "parent": { "$ref": "#/groups/1" }, @@ -339,7 +354,7 @@ "marker": "-" }, { - "self_ref": "#/texts/12", + "self_ref": "#/texts/13", "parent": { "$ref": "#/groups/1" }, @@ -359,9 +374,9 @@ "marker": "-" }, { - "self_ref": "#/texts/13", + "self_ref": "#/texts/14", "parent": { - "$ref": "#/texts/4" + "$ref": "#/texts/5" }, "children": [], "content_layer": "body", @@ -377,29 +392,29 @@ } }, { - "self_ref": "#/texts/14", + "self_ref": "#/texts/15", "parent": { - "$ref": "#/texts/4" + "$ref": "#/texts/5" }, "children": [ - { - "$ref": "#/texts/15" - }, { "$ref": "#/texts/16" }, { "$ref": "#/texts/17" }, - { - "$ref": "#/tables/0" - }, { "$ref": "#/texts/18" }, + { + "$ref": "#/tables/0" + }, { "$ref": "#/texts/19" }, + { + "$ref": "#/texts/20" + }, { "$ref": "#/groups/2" } @@ -412,9 +427,9 @@ "level": 2 }, { - "self_ref": "#/texts/15", + "self_ref": "#/texts/16", "parent": { - "$ref": "#/texts/14" + "$ref": "#/texts/15" }, "children": [], "content_layer": "body", @@ -430,9 +445,9 @@ } }, { - "self_ref": "#/texts/16", + "self_ref": "#/texts/17", "parent": { - "$ref": "#/texts/14" + "$ref": "#/texts/15" }, "children": [], "content_layer": "body", @@ -448,9 +463,9 @@ } }, { - "self_ref": "#/texts/17", + "self_ref": "#/texts/18", "parent": { - "$ref": "#/texts/14" + "$ref": "#/texts/15" }, "children": [], "content_layer": "body", @@ -466,9 +481,9 @@ } }, { - "self_ref": "#/texts/18", + "self_ref": "#/texts/19", "parent": { - "$ref": "#/texts/14" + "$ref": "#/texts/15" }, "children": [], "content_layer": "body", @@ -478,9 +493,9 @@ "text": "" }, { - "self_ref": "#/texts/19", + "self_ref": "#/texts/20", "parent": { - "$ref": "#/texts/14" + "$ref": "#/texts/15" }, "children": [], "content_layer": "body", @@ -496,7 +511,7 @@ } }, { - "self_ref": "#/texts/20", + "self_ref": "#/texts/21", "parent": { "$ref": "#/groups/2" }, @@ -516,7 +531,7 @@ "marker": "-" }, { - "self_ref": "#/texts/21", + "self_ref": "#/texts/22", "parent": { "$ref": "#/groups/2" }, @@ -536,7 +551,7 @@ "marker": "-" }, { - "self_ref": "#/texts/22", + "self_ref": "#/texts/23", "parent": { "$ref": "#/groups/2" }, @@ -585,7 +600,7 @@ { "self_ref": "#/tables/0", "parent": { - "$ref": "#/texts/14" + "$ref": "#/texts/15" }, "children": [], "content_layer": "body",