From c146c8f309fdffcc91dc39e82692dfe2d7c2b43d Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Fri, 20 Jun 2025 13:31:28 +0200 Subject: [PATCH] Update all test cases Signed-off-by: Christoph Auer --- .../docling_v1/2305.03393v1-pg9.json | 4 +- .../docling_v1/2305.03393v1-pg9.pages.json | 58 +-- .../docling_v2/2305.03393v1-pg9.json | 4 +- .../docling_v2/2305.03393v1-pg9.pages.json | 58 +-- .../docling_v2/csv-comma-in-cell.csv.json | 5 +- .../groundtruth/docling_v2/csv-comma.csv.json | 5 +- .../csv-inconsistent-header.csv.json | 5 +- .../groundtruth/docling_v2/csv-pipe.csv.json | 5 +- .../docling_v2/csv-semicolon.csv.json | 5 +- .../groundtruth/docling_v2/csv-tab.csv.json | 5 +- .../docling_v2/csv-too-few-columns.csv.json | 5 +- .../docling_v2/csv-too-many-columns.csv.json | 5 +- .../docling_v2/equations.docx.json | 23 +- .../docling_v2/example_01.html.json | 2 +- .../docling_v2/example_02.html.json | 2 +- .../docling_v2/example_03.html.json | 5 +- .../docling_v2/example_04.html.json | 5 +- .../docling_v2/example_05.html.json | 5 +- .../docling_v2/example_06.html.json | 2 +- .../docling_v2/example_07.html.json | 2 +- .../docling_v2/example_08.html.json | 11 +- .../docling_v2/inline_and_formatting.md.yaml | 9 +- .../docling_v2/ipa20180000016.json | 5 +- .../docling_v2/ipa20200022300.json | 2 +- .../docling_v2/lorem_ipsum.docx.json | 17 +- .../groundtruth/docling_v2/pa20010031492.json | 5 +- .../docling_v2/pftaps057006474.json | 2 +- .../groundtruth/docling_v2/pg06442728.json | 2 +- .../docling_v2/powerpoint_bad_text.pptx.json | 2 +- .../docling_v2/powerpoint_sample.pptx.json | 5 +- .../powerpoint_with_image.pptx.json | 2 +- .../docling_v2/sample_sales_data.xlsm.json | 5 +- .../docling_v2/tablecell.docx.json | 17 +- .../groundtruth/docling_v2/test-01.xlsx.json | 20 +- .../docling_v2/test_emf_docx.docx.json | 14 +- .../groundtruth/docling_v2/textbox.docx.itxt | 119 ++--- .../groundtruth/docling_v2/textbox.docx.json | 446 ++++++++++-------- .../docling_v2/unit_test_01.html.json | 2 +- .../docling_v2/unit_test_formatting.docx.itxt | 27 +- .../docling_v2/unit_test_formatting.docx.json | 332 ++++++++----- .../docling_v2/unit_test_formatting.docx.md | 4 +- .../docling_v2/unit_test_headers.docx.json | 44 +- .../unit_test_headers_numbered.docx.json | 44 +- .../docling_v2/unit_test_lists.docx.json | 83 ++-- .../docling_v2/wiki_duck.html.json | 8 +- .../docling_v2/word_sample.docx.json | 62 ++- .../docling_v2/word_tables.docx.json | 32 +- .../groundtruth/docling_v2/webp-test.json | 2 +- .../groundtruth/docling_v2/ocr_test.json | 2 +- .../docling_v2/ocr_test_rotated_180.json | 2 +- .../docling_v2/ocr_test_rotated_270.json | 2 +- .../docling_v2/ocr_test_rotated_90.json | 2 +- 52 files changed, 923 insertions(+), 618 deletions(-) diff --git a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json index e938e2d7..dd51e390 100644 --- a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json +++ b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json @@ -213,10 +213,10 @@ "prov": [ { "bbox": [ - 139.6674041748047, + 139.66741943359375, 322.5054626464844, 475.00927734375, - 454.4546203613281 + 454.45458984375 ], "page": 1, "span": [ diff --git a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json index 3c219d95..3010fbb6 100644 --- a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json +++ b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json @@ -2705,7 +2705,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373533129692078, + "confidence": 0.9373534917831421, "cells": [ { "index": 0, @@ -2745,7 +2745,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858679533004761, + "confidence": 0.8858680725097656, "cells": [ { "index": 1, @@ -2785,7 +2785,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806435108184814, + "confidence": 0.9806433916091919, "cells": [ { "index": 2, @@ -2940,7 +2940,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.9850425124168396, + "confidence": 0.98504239320755, "cells": [ { "index": 7, @@ -3155,7 +3155,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591907262802124, + "confidence": 0.9591909050941467, "cells": [ { "index": 15, @@ -3339,8 +3339,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -7846,7 +7846,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589295387268066, + "confidence": 0.9589294195175171, "cells": [ { "index": 91, @@ -7911,7 +7911,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849976301193237, + "confidence": 0.9849975109100342, "cells": [ { "index": 93, @@ -8243,8 +8243,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -13641,7 +13641,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373533129692078, + "confidence": 0.9373534917831421, "cells": [ { "index": 0, @@ -13687,7 +13687,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858679533004761, + "confidence": 0.8858680725097656, "cells": [ { "index": 1, @@ -13733,7 +13733,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806435108184814, + "confidence": 0.9806433916091919, "cells": [ { "index": 2, @@ -13900,7 +13900,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.9850425124168396, + "confidence": 0.98504239320755, "cells": [ { "index": 7, @@ -14121,7 +14121,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591907262802124, + "confidence": 0.9591909050941467, "cells": [ { "index": 15, @@ -14311,8 +14311,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -19701,7 +19701,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589295387268066, + "confidence": 0.9589294195175171, "cells": [ { "index": 91, @@ -19772,7 +19772,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849976301193237, + "confidence": 0.9849975109100342, "cells": [ { "index": 93, @@ -20116,7 +20116,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806435108184814, + "confidence": 0.9806433916091919, "cells": [ { "index": 2, @@ -20283,7 +20283,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.9850425124168396, + "confidence": 0.98504239320755, "cells": [ { "index": 7, @@ -20504,7 +20504,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591907262802124, + "confidence": 0.9591909050941467, "cells": [ { "index": 15, @@ -20694,8 +20694,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -26084,7 +26084,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589295387268066, + "confidence": 0.9589294195175171, "cells": [ { "index": 91, @@ -26155,7 +26155,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849976301193237, + "confidence": 0.9849975109100342, "cells": [ { "index": 93, @@ -26499,7 +26499,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373533129692078, + "confidence": 0.9373534917831421, "cells": [ { "index": 0, @@ -26545,7 +26545,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858679533004761, + "confidence": 0.8858680725097656, "cells": [ { "index": 1, diff --git a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json index 6e7c5c1d..8ce7f748 100644 --- a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json +++ b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json @@ -336,8 +336,8 @@ { "page_no": 1, "bbox": { - "l": 139.6674041748047, - "t": 454.4546203613281, + "l": 139.66741943359375, + "t": 454.45458984375, "r": 475.00927734375, "b": 322.5054626464844, "coord_origin": "BOTTOMLEFT" diff --git a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json index 3c219d95..3010fbb6 100644 --- a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json +++ b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json @@ -2705,7 +2705,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373533129692078, + "confidence": 0.9373534917831421, "cells": [ { "index": 0, @@ -2745,7 +2745,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858679533004761, + "confidence": 0.8858680725097656, "cells": [ { "index": 1, @@ -2785,7 +2785,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806435108184814, + "confidence": 0.9806433916091919, "cells": [ { "index": 2, @@ -2940,7 +2940,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.9850425124168396, + "confidence": 0.98504239320755, "cells": [ { "index": 7, @@ -3155,7 +3155,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591907262802124, + "confidence": 0.9591909050941467, "cells": [ { "index": 15, @@ -3339,8 +3339,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -7846,7 +7846,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589295387268066, + "confidence": 0.9589294195175171, "cells": [ { "index": 91, @@ -7911,7 +7911,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849976301193237, + "confidence": 0.9849975109100342, "cells": [ { "index": 93, @@ -8243,8 +8243,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -13641,7 +13641,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373533129692078, + "confidence": 0.9373534917831421, "cells": [ { "index": 0, @@ -13687,7 +13687,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858679533004761, + "confidence": 0.8858680725097656, "cells": [ { "index": 1, @@ -13733,7 +13733,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806435108184814, + "confidence": 0.9806433916091919, "cells": [ { "index": 2, @@ -13900,7 +13900,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.9850425124168396, + "confidence": 0.98504239320755, "cells": [ { "index": 7, @@ -14121,7 +14121,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591907262802124, + "confidence": 0.9591909050941467, "cells": [ { "index": 15, @@ -14311,8 +14311,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -19701,7 +19701,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589295387268066, + "confidence": 0.9589294195175171, "cells": [ { "index": 91, @@ -19772,7 +19772,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849976301193237, + "confidence": 0.9849975109100342, "cells": [ { "index": 93, @@ -20116,7 +20116,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806435108184814, + "confidence": 0.9806433916091919, "cells": [ { "index": 2, @@ -20283,7 +20283,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.9850425124168396, + "confidence": 0.98504239320755, "cells": [ { "index": 7, @@ -20504,7 +20504,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591907262802124, + "confidence": 0.9591909050941467, "cells": [ { "index": 15, @@ -20694,8 +20694,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -26084,7 +26084,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589295387268066, + "confidence": 0.9589294195175171, "cells": [ { "index": 91, @@ -26155,7 +26155,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849976301193237, + "confidence": 0.9849975109100342, "cells": [ { "index": 93, @@ -26499,7 +26499,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373533129692078, + "confidence": 0.9373534917831421, "cells": [ { "index": 0, @@ -26545,7 +26545,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858679533004761, + "confidence": 0.8858680725097656, "cells": [ { "index": 1, diff --git a/tests/data/groundtruth/docling_v2/csv-comma-in-cell.csv.json b/tests/data/groundtruth/docling_v2/csv-comma-in-cell.csv.json index 4844cf76..82747ca2 100644 --- a/tests/data/groundtruth/docling_v2/csv-comma-in-cell.csv.json +++ b/tests/data/groundtruth/docling_v2/csv-comma-in-cell.csv.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "csv-comma-in-cell", "origin": { "mimetype": "text/csv", @@ -538,7 +538,8 @@ } ] ] - } + }, + "annotations": [] } ], "key_value_items": [], diff --git a/tests/data/groundtruth/docling_v2/csv-comma.csv.json b/tests/data/groundtruth/docling_v2/csv-comma.csv.json index 0a414868..db7f3e34 100644 --- a/tests/data/groundtruth/docling_v2/csv-comma.csv.json +++ b/tests/data/groundtruth/docling_v2/csv-comma.csv.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "csv-comma", "origin": { "mimetype": "text/csv", @@ -1788,7 +1788,8 @@ } ] ] - } + }, + "annotations": [] } ], "key_value_items": [], diff --git a/tests/data/groundtruth/docling_v2/csv-inconsistent-header.csv.json b/tests/data/groundtruth/docling_v2/csv-inconsistent-header.csv.json index def3d8d3..d4a3305d 100644 --- a/tests/data/groundtruth/docling_v2/csv-inconsistent-header.csv.json +++ b/tests/data/groundtruth/docling_v2/csv-inconsistent-header.csv.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "csv-inconsistent-header", "origin": { "mimetype": "text/csv", @@ -526,7 +526,8 @@ } ] ] - } + }, + "annotations": [] } ], "key_value_items": [], diff --git a/tests/data/groundtruth/docling_v2/csv-pipe.csv.json b/tests/data/groundtruth/docling_v2/csv-pipe.csv.json index 298538a1..73566a8a 100644 --- a/tests/data/groundtruth/docling_v2/csv-pipe.csv.json +++ b/tests/data/groundtruth/docling_v2/csv-pipe.csv.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "csv-pipe", "origin": { "mimetype": "text/csv", @@ -1788,7 +1788,8 @@ } ] ] - } + }, + "annotations": [] } ], "key_value_items": [], diff --git a/tests/data/groundtruth/docling_v2/csv-semicolon.csv.json b/tests/data/groundtruth/docling_v2/csv-semicolon.csv.json index d9f7d4b8..54c28a67 100644 --- a/tests/data/groundtruth/docling_v2/csv-semicolon.csv.json +++ b/tests/data/groundtruth/docling_v2/csv-semicolon.csv.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "csv-semicolon", "origin": { "mimetype": "text/csv", @@ -1788,7 +1788,8 @@ } ] ] - } + }, + "annotations": [] } ], "key_value_items": [], diff --git a/tests/data/groundtruth/docling_v2/csv-tab.csv.json b/tests/data/groundtruth/docling_v2/csv-tab.csv.json index 842a76d2f..0f179e4c 100644 --- a/tests/data/groundtruth/docling_v2/csv-tab.csv.json +++ b/tests/data/groundtruth/docling_v2/csv-tab.csv.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "csv-tab", "origin": { "mimetype": "text/csv", @@ -1788,7 +1788,8 @@ } ] ] - } + }, + "annotations": [] } ], "key_value_items": [], diff --git a/tests/data/groundtruth/docling_v2/csv-too-few-columns.csv.json b/tests/data/groundtruth/docling_v2/csv-too-few-columns.csv.json index 0f40effc..46eec31e 100644 --- a/tests/data/groundtruth/docling_v2/csv-too-few-columns.csv.json +++ b/tests/data/groundtruth/docling_v2/csv-too-few-columns.csv.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "csv-too-few-columns", "origin": { "mimetype": "text/csv", @@ -526,7 +526,8 @@ } ] ] - } + }, + "annotations": [] } ], "key_value_items": [], diff --git a/tests/data/groundtruth/docling_v2/csv-too-many-columns.csv.json b/tests/data/groundtruth/docling_v2/csv-too-many-columns.csv.json index 4683a285..be17c389 100644 --- a/tests/data/groundtruth/docling_v2/csv-too-many-columns.csv.json +++ b/tests/data/groundtruth/docling_v2/csv-too-many-columns.csv.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "csv-too-many-columns", "origin": { "mimetype": "text/csv", @@ -610,7 +610,8 @@ } ] ] - } + }, + "annotations": [] } ], "key_value_items": [], diff --git a/tests/data/groundtruth/docling_v2/equations.docx.json b/tests/data/groundtruth/docling_v2/equations.docx.json index 37fb63ab..73779bfe 100644 --- a/tests/data/groundtruth/docling_v2/equations.docx.json +++ b/tests/data/groundtruth/docling_v2/equations.docx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "equations", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", @@ -250,7 +250,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -280,7 +281,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -322,7 +324,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -436,7 +439,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -466,7 +470,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -520,7 +525,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -634,7 +640,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { diff --git a/tests/data/groundtruth/docling_v2/example_01.html.json b/tests/data/groundtruth/docling_v2/example_01.html.json index fce02e10..c9e9384e 100644 --- a/tests/data/groundtruth/docling_v2/example_01.html.json +++ b/tests/data/groundtruth/docling_v2/example_01.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "example_01", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/example_02.html.json b/tests/data/groundtruth/docling_v2/example_02.html.json index cef7b79d..bfbf9ec9 100644 --- a/tests/data/groundtruth/docling_v2/example_02.html.json +++ b/tests/data/groundtruth/docling_v2/example_02.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "example_02", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/example_03.html.json b/tests/data/groundtruth/docling_v2/example_03.html.json index fe2e9c34..6a9fea2b 100644 --- a/tests/data/groundtruth/docling_v2/example_03.html.json +++ b/tests/data/groundtruth/docling_v2/example_03.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "example_03", "origin": { "mimetype": "text/html", @@ -637,7 +637,8 @@ } ] ] - } + }, + "annotations": [] } ], "key_value_items": [], diff --git a/tests/data/groundtruth/docling_v2/example_04.html.json b/tests/data/groundtruth/docling_v2/example_04.html.json index 854ba4c6..40273c41 100644 --- a/tests/data/groundtruth/docling_v2/example_04.html.json +++ b/tests/data/groundtruth/docling_v2/example_04.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "example_04", "origin": { "mimetype": "text/html", @@ -325,7 +325,8 @@ } ] ] - } + }, + "annotations": [] } ], "key_value_items": [], diff --git a/tests/data/groundtruth/docling_v2/example_05.html.json b/tests/data/groundtruth/docling_v2/example_05.html.json index 3168a637..e37e43fe 100644 --- a/tests/data/groundtruth/docling_v2/example_05.html.json +++ b/tests/data/groundtruth/docling_v2/example_05.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "example_05", "origin": { "mimetype": "text/html", @@ -325,7 +325,8 @@ } ] ] - } + }, + "annotations": [] } ], "key_value_items": [], diff --git a/tests/data/groundtruth/docling_v2/example_06.html.json b/tests/data/groundtruth/docling_v2/example_06.html.json index f62a2cf3..aed598d5 100644 --- a/tests/data/groundtruth/docling_v2/example_06.html.json +++ b/tests/data/groundtruth/docling_v2/example_06.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "example_06", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/example_07.html.json b/tests/data/groundtruth/docling_v2/example_07.html.json index 1fb4f262..ac26ba13 100644 --- a/tests/data/groundtruth/docling_v2/example_07.html.json +++ b/tests/data/groundtruth/docling_v2/example_07.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "example_07", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/example_08.html.json b/tests/data/groundtruth/docling_v2/example_08.html.json index 085be7ef..15ac4282 100644 --- a/tests/data/groundtruth/docling_v2/example_08.html.json +++ b/tests/data/groundtruth/docling_v2/example_08.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "example_08", "origin": { "mimetype": "text/html", @@ -661,7 +661,8 @@ } ] ] - } + }, + "annotations": [] }, { "self_ref": "#/tables/1", @@ -1330,7 +1331,8 @@ } ] ] - } + }, + "annotations": [] }, { "self_ref": "#/tables/2", @@ -1999,7 +2001,8 @@ } ] ] - } + }, + "annotations": [] } ], "key_value_items": [], diff --git a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml index 0cdc5c54..1bdd9118 100644 --- a/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml +++ b/tests/data/groundtruth/docling_v2/inline_and_formatting.md.yaml @@ -174,6 +174,7 @@ texts: formatting: bold: false italic: true + script: baseline strikethrough: false underline: false label: text @@ -188,6 +189,7 @@ texts: formatting: bold: true italic: false + script: baseline strikethrough: false underline: false label: text @@ -202,6 +204,7 @@ texts: formatting: bold: true italic: true + script: baseline strikethrough: false underline: false label: text @@ -277,6 +280,7 @@ texts: formatting: bold: true italic: false + script: baseline strikethrough: false underline: false hyperlink: https://github.com/docling-project/docling @@ -452,6 +456,7 @@ texts: formatting: bold: false italic: true + script: baseline strikethrough: false underline: false label: text @@ -487,6 +492,7 @@ texts: formatting: bold: true italic: false + script: baseline strikethrough: false underline: false label: text @@ -522,6 +528,7 @@ texts: formatting: bold: true italic: false + script: baseline strikethrough: false underline: false label: text @@ -562,4 +569,4 @@ texts: prov: [] self_ref: '#/texts/37' text: amet. -version: 1.3.0 +version: 1.4.0 diff --git a/tests/data/groundtruth/docling_v2/ipa20180000016.json b/tests/data/groundtruth/docling_v2/ipa20180000016.json index 57381160..835f3ef1 100644 --- a/tests/data/groundtruth/docling_v2/ipa20180000016.json +++ b/tests/data/groundtruth/docling_v2/ipa20180000016.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "ipa20180000016.xml", "origin": { "mimetype": "application/xml", @@ -6005,7 +6005,8 @@ } ] ] - } + }, + "annotations": [] } ], "key_value_items": [], diff --git a/tests/data/groundtruth/docling_v2/ipa20200022300.json b/tests/data/groundtruth/docling_v2/ipa20200022300.json index 93dee608..1b86290b 100644 --- a/tests/data/groundtruth/docling_v2/ipa20200022300.json +++ b/tests/data/groundtruth/docling_v2/ipa20200022300.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "ipa20200022300.xml", "origin": { "mimetype": "application/xml", diff --git a/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.json b/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.json index 6034c21f..866513a1 100644 --- a/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.json +++ b/tests/data/groundtruth/docling_v2/lorem_ipsum.docx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "lorem_ipsum", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", @@ -66,7 +66,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -96,7 +97,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -126,7 +128,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -156,7 +159,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -186,7 +190,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } } ], diff --git a/tests/data/groundtruth/docling_v2/pa20010031492.json b/tests/data/groundtruth/docling_v2/pa20010031492.json index fc06619b..6186e475 100644 --- a/tests/data/groundtruth/docling_v2/pa20010031492.json +++ b/tests/data/groundtruth/docling_v2/pa20010031492.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "pa20010031492.xml", "origin": { "mimetype": "application/xml", @@ -2127,7 +2127,8 @@ } ] ] - } + }, + "annotations": [] } ], "key_value_items": [], diff --git a/tests/data/groundtruth/docling_v2/pftaps057006474.json b/tests/data/groundtruth/docling_v2/pftaps057006474.json index 4ebb899a..94f5bd96 100644 --- a/tests/data/groundtruth/docling_v2/pftaps057006474.json +++ b/tests/data/groundtruth/docling_v2/pftaps057006474.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "pftaps057006474.txt", "origin": { "mimetype": "text/plain", diff --git a/tests/data/groundtruth/docling_v2/pg06442728.json b/tests/data/groundtruth/docling_v2/pg06442728.json index 3c312ae0..c4fa3759 100644 --- a/tests/data/groundtruth/docling_v2/pg06442728.json +++ b/tests/data/groundtruth/docling_v2/pg06442728.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "pg06442728.xml", "origin": { "mimetype": "application/xml", diff --git a/tests/data/groundtruth/docling_v2/powerpoint_bad_text.pptx.json b/tests/data/groundtruth/docling_v2/powerpoint_bad_text.pptx.json index d94bfb70..c4d08e40 100644 --- a/tests/data/groundtruth/docling_v2/powerpoint_bad_text.pptx.json +++ b/tests/data/groundtruth/docling_v2/powerpoint_bad_text.pptx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "powerpoint_bad_text", "origin": { "mimetype": "application/vnd.ms-powerpoint", diff --git a/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json b/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json index c379c6f6..88a82ae3 100644 --- a/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json +++ b/tests/data/groundtruth/docling_v2/powerpoint_sample.pptx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "powerpoint_sample", "origin": { "mimetype": "application/vnd.ms-powerpoint", @@ -2199,7 +2199,8 @@ } ] ] - } + }, + "annotations": [] } ], "key_value_items": [], diff --git a/tests/data/groundtruth/docling_v2/powerpoint_with_image.pptx.json b/tests/data/groundtruth/docling_v2/powerpoint_with_image.pptx.json index 192ef753..ffc77b61 100644 --- a/tests/data/groundtruth/docling_v2/powerpoint_with_image.pptx.json +++ b/tests/data/groundtruth/docling_v2/powerpoint_with_image.pptx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "powerpoint_with_image", "origin": { "mimetype": "application/vnd.ms-powerpoint", diff --git a/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json b/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json index 04f8198e..b7982983 100644 --- a/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json +++ b/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "sample_sales_data", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", @@ -2136,7 +2136,8 @@ } ] ] - } + }, + "annotations": [] } ], "key_value_items": [], diff --git a/tests/data/groundtruth/docling_v2/tablecell.docx.json b/tests/data/groundtruth/docling_v2/tablecell.docx.json index e168f1b8..ac1473d4 100644 --- a/tests/data/groundtruth/docling_v2/tablecell.docx.json +++ b/tests/data/groundtruth/docling_v2/tablecell.docx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "tablecell", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", @@ -78,7 +78,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -98,7 +99,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -130,7 +132,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -172,7 +175,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } } ], @@ -419,7 +423,8 @@ } ] ] - } + }, + "annotations": [] } ], "key_value_items": [], diff --git a/tests/data/groundtruth/docling_v2/test-01.xlsx.json b/tests/data/groundtruth/docling_v2/test-01.xlsx.json index 2bdfe509..2a23dc4b 100644 --- a/tests/data/groundtruth/docling_v2/test-01.xlsx.json +++ b/tests/data/groundtruth/docling_v2/test-01.xlsx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "test-01", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", @@ -681,7 +681,8 @@ } ] ] - } + }, + "annotations": [] }, { "self_ref": "#/tables/1", @@ -1599,7 +1600,8 @@ } ] ] - } + }, + "annotations": [] }, { "self_ref": "#/tables/2", @@ -2005,7 +2007,8 @@ } ] ] - } + }, + "annotations": [] }, { "self_ref": "#/tables/3", @@ -2411,7 +2414,8 @@ } ] ] - } + }, + "annotations": [] }, { "self_ref": "#/tables/4", @@ -2893,7 +2897,8 @@ } ] ] - } + }, + "annotations": [] }, { "self_ref": "#/tables/5", @@ -3375,7 +3380,8 @@ } ] ] - } + }, + "annotations": [] } ], "key_value_items": [], diff --git a/tests/data/groundtruth/docling_v2/test_emf_docx.docx.json b/tests/data/groundtruth/docling_v2/test_emf_docx.docx.json index 65d0d30d..88a10027 100644 --- a/tests/data/groundtruth/docling_v2/test_emf_docx.docx.json +++ b/tests/data/groundtruth/docling_v2/test_emf_docx.docx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "test_emf_docx", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", @@ -60,7 +60,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -78,7 +79,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -96,7 +98,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -114,7 +117,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } } ], diff --git a/tests/data/groundtruth/docling_v2/textbox.docx.itxt b/tests/data/groundtruth/docling_v2/textbox.docx.itxt index 406de95f..1372608d 100644 --- a/tests/data/groundtruth/docling_v2/textbox.docx.itxt +++ b/tests/data/groundtruth/docling_v2/textbox.docx.itxt @@ -11,83 +11,84 @@ item-0 at level 0: unspecified: group _root_ * Blisters * Headache * Sore throat - item-9 at level 1: list_item: - item-10 at level 1: paragraph: + item-9 at level 1: list: group group + item-10 at level 2: list_item: item-11 at level 1: paragraph: - item-12 at level 1: section: group textbox - item-13 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms - item-14 at level 1: paragraph: + item-12 at level 1: paragraph: + item-13 at level 1: section: group textbox + item-14 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms item-15 at level 1: paragraph: item-16 at level 1: paragraph: item-17 at level 1: paragraph: - item-18 at level 1: section: group textbox - item-19 at level 2: paragraph: Yes - item-20 at level 1: paragraph: + item-18 at level 1: paragraph: + item-19 at level 1: section: group textbox + item-20 at level 2: paragraph: Yes item-21 at level 1: paragraph: - item-22 at level 1: section: group textbox - item-23 at level 2: list: group list - item-24 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network. - item-25 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System. - item-26 at level 2: paragraph: - item-27 at level 1: list: group list - item-28 at level 2: list_item: - item-29 at level 1: paragraph: + item-22 at level 1: paragraph: + item-23 at level 1: section: group textbox + item-24 at level 2: list: group list + item-25 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network. + item-26 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System. + item-27 at level 2: paragraph: + item-28 at level 1: list: group list + item-29 at level 2: list_item: item-30 at level 1: paragraph: item-31 at level 1: paragraph: item-32 at level 1: paragraph: item-33 at level 1: paragraph: - item-34 at level 1: section: group textbox - item-35 at level 2: paragraph: Health Bureau: - item-36 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control. - item-37 at level 2: list: group list - item-38 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection. - item-39 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act. - item-40 at level 2: paragraph: - item-41 at level 1: list: group list - item-42 at level 2: list_item: - item-43 at level 1: paragraph: - item-44 at level 1: section: group textbox - item-45 at level 2: paragraph: Department of Education: + item-34 at level 1: paragraph: + item-35 at level 1: section: group textbox + item-36 at level 2: paragraph: Health Bureau: + item-37 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control. + item-38 at level 2: list: group list + item-39 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection. + item-40 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act. + item-41 at level 2: paragraph: + item-42 at level 1: list: group list + item-43 at level 2: list_item: + item-44 at level 1: paragraph: + item-45 at level 1: section: group textbox + item-46 at level 2: paragraph: Department of Education: Collabo ... vention measures at all school levels. - item-46 at level 1: paragraph: item-47 at level 1: paragraph: item-48 at level 1: paragraph: item-49 at level 1: paragraph: item-50 at level 1: paragraph: item-51 at level 1: paragraph: item-52 at level 1: paragraph: - item-53 at level 1: section: group textbox - item-54 at level 2: inline: group group - item-55 at level 3: paragraph: The Health Bureau will handle - item-56 at level 3: paragraph: reporting and specimen collection - item-57 at level 3: paragraph: . - item-58 at level 2: paragraph: - item-59 at level 1: paragraph: + item-53 at level 1: paragraph: + item-54 at level 1: section: group textbox + item-55 at level 2: inline: group group + item-56 at level 3: paragraph: The Health Bureau will handle + item-57 at level 3: paragraph: reporting and specimen collection + item-58 at level 3: paragraph: . + item-59 at level 2: paragraph: item-60 at level 1: paragraph: item-61 at level 1: paragraph: - item-62 at level 1: section: group textbox - item-63 at level 2: paragraph: Whether the epidemic has eased. - item-64 at level 2: paragraph: - item-65 at level 1: paragraph: - item-66 at level 1: section: group textbox - item-67 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease. - item-68 at level 2: paragraph: No - item-69 at level 1: paragraph: + item-62 at level 1: paragraph: + item-63 at level 1: section: group textbox + item-64 at level 2: paragraph: Whether the epidemic has eased. + item-65 at level 2: paragraph: + item-66 at level 1: paragraph: + item-67 at level 1: section: group textbox + item-68 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease. + item-69 at level 2: paragraph: No item-70 at level 1: paragraph: - item-71 at level 1: section: group textbox - item-72 at level 2: paragraph: Yes - item-73 at level 1: paragraph: - item-74 at level 1: section: group textbox - item-75 at level 2: paragraph: Yes - item-76 at level 1: paragraph: + item-71 at level 1: paragraph: + item-72 at level 1: section: group textbox + item-73 at level 2: paragraph: Yes + item-74 at level 1: paragraph: + item-75 at level 1: section: group textbox + item-76 at level 2: paragraph: Yes item-77 at level 1: paragraph: - item-78 at level 1: section: group textbox - item-79 at level 2: paragraph: Case closed. - item-80 at level 2: paragraph: - item-81 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary. - item-82 at level 1: paragraph: - item-83 at level 1: section: group textbox - item-84 at level 2: paragraph: No - item-85 at level 1: paragraph: + item-78 at level 1: paragraph: + item-79 at level 1: section: group textbox + item-80 at level 2: paragraph: Case closed. + item-81 at level 2: paragraph: + item-82 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary. + item-83 at level 1: paragraph: + item-84 at level 1: section: group textbox + item-85 at level 2: paragraph: No item-86 at level 1: paragraph: - item-87 at level 1: paragraph: \ No newline at end of file + item-87 at level 1: paragraph: + item-88 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/textbox.docx.json b/tests/data/groundtruth/docling_v2/textbox.docx.json index 840e937a..9b1771f2 100644 --- a/tests/data/groundtruth/docling_v2/textbox.docx.json +++ b/tests/data/groundtruth/docling_v2/textbox.docx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "textbox", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", @@ -29,6 +29,9 @@ { "$ref": "#/groups/0" }, + { + "$ref": "#/groups/19" + }, { "$ref": "#/texts/6" }, @@ -36,10 +39,10 @@ "$ref": "#/texts/7" }, { - "$ref": "#/texts/8" + "$ref": "#/groups/2" }, { - "$ref": "#/groups/2" + "$ref": "#/texts/9" }, { "$ref": "#/texts/10" @@ -50,17 +53,14 @@ { "$ref": "#/texts/12" }, - { - "$ref": "#/texts/13" - }, { "$ref": "#/groups/3" }, { - "$ref": "#/texts/15" + "$ref": "#/texts/14" }, { - "$ref": "#/texts/16" + "$ref": "#/texts/15" }, { "$ref": "#/groups/4" @@ -68,6 +68,9 @@ { "$ref": "#/groups/6" }, + { + "$ref": "#/texts/20" + }, { "$ref": "#/texts/21" }, @@ -80,9 +83,6 @@ { "$ref": "#/texts/24" }, - { - "$ref": "#/texts/25" - }, { "$ref": "#/groups/7" }, @@ -90,11 +90,14 @@ "$ref": "#/groups/9" }, { - "$ref": "#/texts/32" + "$ref": "#/texts/31" }, { "$ref": "#/groups/10" }, + { + "$ref": "#/texts/33" + }, { "$ref": "#/texts/34" }, @@ -114,10 +117,10 @@ "$ref": "#/texts/39" }, { - "$ref": "#/texts/40" + "$ref": "#/groups/11" }, { - "$ref": "#/groups/11" + "$ref": "#/texts/44" }, { "$ref": "#/texts/45" @@ -125,56 +128,53 @@ { "$ref": "#/texts/46" }, - { - "$ref": "#/texts/47" - }, { "$ref": "#/groups/13" }, { - "$ref": "#/texts/50" + "$ref": "#/texts/49" }, { "$ref": "#/groups/14" }, { - "$ref": "#/texts/53" + "$ref": "#/texts/52" }, { - "$ref": "#/texts/54" + "$ref": "#/texts/53" }, { "$ref": "#/groups/15" }, { - "$ref": "#/texts/56" + "$ref": "#/texts/55" }, { "$ref": "#/groups/16" }, { - "$ref": "#/texts/58" + "$ref": "#/texts/57" }, { - "$ref": "#/texts/59" + "$ref": "#/texts/58" }, { "$ref": "#/groups/17" }, { - "$ref": "#/texts/63" + "$ref": "#/texts/62" }, { "$ref": "#/groups/18" }, + { + "$ref": "#/texts/64" + }, { "$ref": "#/texts/65" }, { "$ref": "#/texts/66" - }, - { - "$ref": "#/texts/67" } ], "content_layer": "body", @@ -223,7 +223,7 @@ }, "children": [ { - "$ref": "#/texts/9" + "$ref": "#/texts/8" } ], "content_layer": "body", @@ -237,7 +237,7 @@ }, "children": [ { - "$ref": "#/texts/14" + "$ref": "#/texts/13" } ], "content_layer": "body", @@ -254,7 +254,7 @@ "$ref": "#/groups/5" }, { - "$ref": "#/texts/19" + "$ref": "#/texts/18" } ], "content_layer": "body", @@ -268,10 +268,10 @@ }, "children": [ { - "$ref": "#/texts/17" + "$ref": "#/texts/16" }, { - "$ref": "#/texts/18" + "$ref": "#/texts/17" } ], "content_layer": "body", @@ -285,7 +285,7 @@ }, "children": [ { - "$ref": "#/texts/20" + "$ref": "#/texts/19" } ], "content_layer": "body", @@ -299,16 +299,16 @@ }, "children": [ { - "$ref": "#/texts/26" + "$ref": "#/texts/25" }, { - "$ref": "#/texts/27" + "$ref": "#/texts/26" }, { "$ref": "#/groups/8" }, { - "$ref": "#/texts/30" + "$ref": "#/texts/29" } ], "content_layer": "body", @@ -322,10 +322,10 @@ }, "children": [ { - "$ref": "#/texts/28" + "$ref": "#/texts/27" }, { - "$ref": "#/texts/29" + "$ref": "#/texts/28" } ], "content_layer": "body", @@ -339,7 +339,7 @@ }, "children": [ { - "$ref": "#/texts/31" + "$ref": "#/texts/30" } ], "content_layer": "body", @@ -353,7 +353,7 @@ }, "children": [ { - "$ref": "#/texts/33" + "$ref": "#/texts/32" } ], "content_layer": "body", @@ -370,7 +370,7 @@ "$ref": "#/groups/12" }, { - "$ref": "#/texts/44" + "$ref": "#/texts/43" } ], "content_layer": "body", @@ -383,14 +383,14 @@ "$ref": "#/groups/11" }, "children": [ + { + "$ref": "#/texts/40" + }, { "$ref": "#/texts/41" }, { "$ref": "#/texts/42" - }, - { - "$ref": "#/texts/43" } ], "content_layer": "body", @@ -404,10 +404,10 @@ }, "children": [ { - "$ref": "#/texts/48" + "$ref": "#/texts/47" }, { - "$ref": "#/texts/49" + "$ref": "#/texts/48" } ], "content_layer": "body", @@ -421,10 +421,10 @@ }, "children": [ { - "$ref": "#/texts/51" + "$ref": "#/texts/50" }, { - "$ref": "#/texts/52" + "$ref": "#/texts/51" } ], "content_layer": "body", @@ -438,7 +438,7 @@ }, "children": [ { - "$ref": "#/texts/55" + "$ref": "#/texts/54" } ], "content_layer": "body", @@ -452,7 +452,7 @@ }, "children": [ { - "$ref": "#/texts/57" + "$ref": "#/texts/56" } ], "content_layer": "body", @@ -465,14 +465,14 @@ "$ref": "#/body" }, "children": [ + { + "$ref": "#/texts/59" + }, { "$ref": "#/texts/60" }, { "$ref": "#/texts/61" - }, - { - "$ref": "#/texts/62" } ], "content_layer": "body", @@ -486,12 +486,26 @@ }, "children": [ { - "$ref": "#/texts/64" + "$ref": "#/texts/63" } ], "content_layer": "body", "name": "textbox", "label": "section" + }, + { + "self_ref": "#/groups/19", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/67" + } + ], + "content_layer": "body", + "name": "group", + "label": "list" } ], "texts": [ @@ -510,7 +524,8 @@ "bold": true, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -528,7 +543,8 @@ "bold": true, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -558,7 +574,8 @@ "bold": true, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -588,7 +605,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -600,12 +618,10 @@ }, "children": [], "content_layer": "body", - "label": "list_item", + "label": "paragraph", "prov": [], "orig": "", - "text": "", - "enumerated": false, - "marker": "-" + "text": "" }, { "self_ref": "#/texts/7", @@ -621,18 +637,6 @@ }, { "self_ref": "#/texts/8", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/9", "parent": { "$ref": "#/groups/2" }, @@ -646,9 +650,22 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/10", "parent": { @@ -687,18 +704,6 @@ }, { "self_ref": "#/texts/13", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/14", "parent": { "$ref": "#/groups/3" }, @@ -712,9 +717,22 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, + { + "self_ref": "#/texts/14", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/15", "parent": { @@ -729,18 +747,6 @@ }, { "self_ref": "#/texts/16", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/17", "parent": { "$ref": "#/groups/5" }, @@ -754,13 +760,14 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" }, { - "self_ref": "#/texts/18", + "self_ref": "#/texts/17", "parent": { "$ref": "#/groups/5" }, @@ -774,13 +781,14 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" }, { - "self_ref": "#/texts/19", + "self_ref": "#/texts/18", "parent": { "$ref": "#/groups/4" }, @@ -792,7 +800,7 @@ "text": "" }, { - "self_ref": "#/texts/20", + "self_ref": "#/texts/19", "parent": { "$ref": "#/groups/6" }, @@ -805,6 +813,18 @@ "enumerated": false, "marker": "-" }, + { + "self_ref": "#/texts/20", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/21", "parent": { @@ -855,18 +875,6 @@ }, { "self_ref": "#/texts/25", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/26", "parent": { "$ref": "#/groups/7" }, @@ -880,11 +888,12 @@ "bold": true, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { - "self_ref": "#/texts/27", + "self_ref": "#/texts/26", "parent": { "$ref": "#/groups/7" }, @@ -898,11 +907,12 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { - "self_ref": "#/texts/28", + "self_ref": "#/texts/27", "parent": { "$ref": "#/groups/8" }, @@ -916,13 +926,14 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" }, { - "self_ref": "#/texts/29", + "self_ref": "#/texts/28", "parent": { "$ref": "#/groups/8" }, @@ -936,13 +947,14 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" }, { - "self_ref": "#/texts/30", + "self_ref": "#/texts/29", "parent": { "$ref": "#/groups/7" }, @@ -954,7 +966,7 @@ "text": "" }, { - "self_ref": "#/texts/31", + "self_ref": "#/texts/30", "parent": { "$ref": "#/groups/9" }, @@ -968,7 +980,7 @@ "marker": "-" }, { - "self_ref": "#/texts/32", + "self_ref": "#/texts/31", "parent": { "$ref": "#/body" }, @@ -980,7 +992,7 @@ "text": "" }, { - "self_ref": "#/texts/33", + "self_ref": "#/texts/32", "parent": { "$ref": "#/groups/10" }, @@ -994,9 +1006,22 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, + { + "self_ref": "#/texts/33", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/34", "parent": { @@ -1071,18 +1096,6 @@ }, { "self_ref": "#/texts/40", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/41", "parent": { "$ref": "#/groups/12" }, @@ -1096,11 +1109,12 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { - "self_ref": "#/texts/42", + "self_ref": "#/texts/41", "parent": { "$ref": "#/groups/12" }, @@ -1114,11 +1128,12 @@ "bold": true, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { - "self_ref": "#/texts/43", + "self_ref": "#/texts/42", "parent": { "$ref": "#/groups/12" }, @@ -1132,13 +1147,26 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, + { + "self_ref": "#/texts/43", + "parent": { + "$ref": "#/groups/11" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/44", "parent": { - "$ref": "#/groups/11" + "$ref": "#/body" }, "children": [], "content_layer": "body", @@ -1173,18 +1201,6 @@ }, { "self_ref": "#/texts/47", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/48", "parent": { "$ref": "#/groups/13" }, @@ -1198,11 +1214,12 @@ "bold": true, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { - "self_ref": "#/texts/49", + "self_ref": "#/texts/48", "parent": { "$ref": "#/groups/13" }, @@ -1214,7 +1231,7 @@ "text": "" }, { - "self_ref": "#/texts/50", + "self_ref": "#/texts/49", "parent": { "$ref": "#/body" }, @@ -1226,7 +1243,7 @@ "text": "" }, { - "self_ref": "#/texts/51", + "self_ref": "#/texts/50", "parent": { "$ref": "#/groups/14" }, @@ -1240,11 +1257,12 @@ "bold": true, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { - "self_ref": "#/texts/52", + "self_ref": "#/texts/51", "parent": { "$ref": "#/groups/14" }, @@ -1258,9 +1276,22 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, + { + "self_ref": "#/texts/52", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/53", "parent": { @@ -1275,18 +1306,6 @@ }, { "self_ref": "#/texts/54", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/55", "parent": { "$ref": "#/groups/15" }, @@ -1300,11 +1319,12 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { - "self_ref": "#/texts/56", + "self_ref": "#/texts/55", "parent": { "$ref": "#/body" }, @@ -1316,7 +1336,7 @@ "text": "" }, { - "self_ref": "#/texts/57", + "self_ref": "#/texts/56", "parent": { "$ref": "#/groups/16" }, @@ -1330,9 +1350,22 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, + { + "self_ref": "#/texts/57", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/58", "parent": { @@ -1347,18 +1380,6 @@ }, { "self_ref": "#/texts/59", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/60", "parent": { "$ref": "#/groups/17" }, @@ -1372,11 +1393,12 @@ "bold": true, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { - "self_ref": "#/texts/61", + "self_ref": "#/texts/60", "parent": { "$ref": "#/groups/17" }, @@ -1388,7 +1410,7 @@ "text": "" }, { - "self_ref": "#/texts/62", + "self_ref": "#/texts/61", "parent": { "$ref": "#/groups/17" }, @@ -1402,11 +1424,12 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { - "self_ref": "#/texts/63", + "self_ref": "#/texts/62", "parent": { "$ref": "#/body" }, @@ -1418,7 +1441,7 @@ "text": "" }, { - "self_ref": "#/texts/64", + "self_ref": "#/texts/63", "parent": { "$ref": "#/groups/18" }, @@ -1432,9 +1455,22 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, + { + "self_ref": "#/texts/64", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/65", "parent": { @@ -1462,14 +1498,16 @@ { "self_ref": "#/texts/67", "parent": { - "$ref": "#/body" + "$ref": "#/groups/19" }, "children": [], "content_layer": "body", - "label": "paragraph", + "label": "list_item", "prov": [], "orig": "", - "text": "" + "text": "", + "enumerated": false, + "marker": "-" } ], "pictures": [], diff --git a/tests/data/groundtruth/docling_v2/unit_test_01.html.json b/tests/data/groundtruth/docling_v2/unit_test_01.html.json index 498db3c0..08669c21 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_01.html.json +++ b/tests/data/groundtruth/docling_v2/unit_test_01.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "unit_test_01", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt index 2860c30b..bc923c1d 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt +++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.itxt @@ -18,13 +18,20 @@ item-0 at level 0: unspecified: group _root_ item-17 at level 2: list_item: Bold bullet 2 item-18 at level 2: list_item: Underline bullet 3 item-19 at level 2: inline: group group - item-20 at level 3: list_item: Some - item-21 at level 3: list_item: italic - item-22 at level 3: list_item: bold - item-23 at level 3: list_item: underline - item-24 at level 2: list: group list - item-25 at level 3: inline: group group - item-26 at level 4: list_item: Nested - item-27 at level 4: list_item: italic - item-28 at level 4: list_item: bold - item-29 at level 1: paragraph: \ No newline at end of file + item-20 at level 3: list: group group + item-21 at level 4: list_item: Some + item-22 at level 3: list: group group + item-23 at level 4: list_item: italic + item-24 at level 3: list: group group + item-25 at level 4: list_item: bold + item-26 at level 3: list: group group + item-27 at level 4: list_item: underline + item-28 at level 2: list: group list + item-29 at level 3: inline: group group + item-30 at level 4: list: group group + item-31 at level 5: list_item: Nested + item-32 at level 4: list: group group + item-33 at level 5: list_item: italic + item-34 at level 4: list: group group + item-35 at level 5: list_item: bold + item-36 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json index 8b6ee9db..a1c3c13f 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json +++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "unit_test_formatting", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", @@ -42,7 +42,7 @@ "$ref": "#/groups/1" }, { - "$ref": "#/texts/23" + "$ref": "#/texts/16" } ], "content_layer": "body", @@ -115,16 +115,16 @@ }, "children": [ { - "$ref": "#/texts/16" + "$ref": "#/groups/11" }, { - "$ref": "#/texts/17" + "$ref": "#/groups/10" }, { - "$ref": "#/texts/18" + "$ref": "#/groups/9" }, { - "$ref": "#/texts/19" + "$ref": "#/groups/8" } ], "content_layer": "body", @@ -152,18 +152,116 @@ }, "children": [ { - "$ref": "#/texts/20" + "$ref": "#/groups/7" }, { - "$ref": "#/texts/21" + "$ref": "#/groups/6" }, + { + "$ref": "#/groups/5" + } + ], + "content_layer": "body", + "name": "group", + "label": "inline" + }, + { + "self_ref": "#/groups/5", + "parent": { + "$ref": "#/groups/4" + }, + "children": [ + { + "$ref": "#/texts/17" + } + ], + "content_layer": "body", + "name": "group", + "label": "list" + }, + { + "self_ref": "#/groups/6", + "parent": { + "$ref": "#/groups/4" + }, + "children": [ + { + "$ref": "#/texts/18" + } + ], + "content_layer": "body", + "name": "group", + "label": "list" + }, + { + "self_ref": "#/groups/7", + "parent": { + "$ref": "#/groups/4" + }, + "children": [ + { + "$ref": "#/texts/19" + } + ], + "content_layer": "body", + "name": "group", + "label": "list" + }, + { + "self_ref": "#/groups/8", + "parent": { + "$ref": "#/groups/2" + }, + "children": [ + { + "$ref": "#/texts/20" + } + ], + "content_layer": "body", + "name": "group", + "label": "list" + }, + { + "self_ref": "#/groups/9", + "parent": { + "$ref": "#/groups/2" + }, + "children": [ + { + "$ref": "#/texts/21" + } + ], + "content_layer": "body", + "name": "group", + "label": "list" + }, + { + "self_ref": "#/groups/10", + "parent": { + "$ref": "#/groups/2" + }, + "children": [ { "$ref": "#/texts/22" } ], "content_layer": "body", "name": "group", - "label": "inline" + "label": "list" + }, + { + "self_ref": "#/groups/11", + "parent": { + "$ref": "#/groups/2" + }, + "children": [ + { + "$ref": "#/texts/23" + } + ], + "content_layer": "body", + "name": "group", + "label": "list" } ], "texts": [ @@ -182,7 +280,8 @@ "bold": false, "italic": true, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -200,7 +299,8 @@ "bold": true, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -218,7 +318,8 @@ "bold": false, "italic": false, "underline": true, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -236,7 +337,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "hyperlink": "https:/github.com/DS4SD/docling" }, @@ -255,7 +357,8 @@ "bold": true, "italic": true, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "hyperlink": "https:/github.com/DS4SD/docling" }, @@ -274,7 +377,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -292,7 +396,8 @@ "bold": false, "italic": true, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -310,7 +415,8 @@ "bold": true, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -328,7 +434,8 @@ "bold": false, "italic": false, "underline": true, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -346,7 +453,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -364,7 +472,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "hyperlink": "https:/github.com/DS4SD/docling" }, @@ -383,7 +492,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -413,7 +523,8 @@ "bold": false, "italic": true, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -433,7 +544,8 @@ "bold": true, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -453,7 +565,8 @@ "bold": false, "italic": false, "underline": true, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -461,47 +574,19 @@ { "self_ref": "#/texts/16", "parent": { - "$ref": "#/groups/2" + "$ref": "#/body" }, "children": [], "content_layer": "body", - "label": "list_item", + "label": "paragraph", "prov": [], - "orig": "Some", - "text": "Some", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false - }, - "enumerated": false, - "marker": "-" + "orig": "", + "text": "" }, { "self_ref": "#/texts/17", "parent": { - "$ref": "#/groups/2" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [], - "orig": "italic", - "text": "italic", - "formatting": { - "bold": false, - "italic": true, - "underline": false, - "strikethrough": false - }, - "enumerated": false, - "marker": "-" - }, - { - "self_ref": "#/texts/18", - "parent": { - "$ref": "#/groups/2" + "$ref": "#/groups/5" }, "children": [], "content_layer": "body", @@ -513,7 +598,29 @@ "bold": true, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" + }, + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/18", + "parent": { + "$ref": "#/groups/6" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "italic", + "text": "italic", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -521,27 +628,7 @@ { "self_ref": "#/texts/19", "parent": { - "$ref": "#/groups/2" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [], - "orig": "underline", - "text": "underline", - "formatting": { - "bold": false, - "italic": false, - "underline": true, - "strikethrough": false - }, - "enumerated": false, - "marker": "-" - }, - { - "self_ref": "#/texts/20", - "parent": { - "$ref": "#/groups/4" + "$ref": "#/groups/7" }, "children": [], "content_layer": "body", @@ -553,7 +640,29 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" + }, + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/20", + "parent": { + "$ref": "#/groups/8" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "underline", + "text": "underline", + "formatting": { + "bold": false, + "italic": false, + "underline": true, + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -561,27 +670,7 @@ { "self_ref": "#/texts/21", "parent": { - "$ref": "#/groups/4" - }, - "children": [], - "content_layer": "body", - "label": "list_item", - "prov": [], - "orig": "italic", - "text": "italic", - "formatting": { - "bold": false, - "italic": true, - "underline": false, - "strikethrough": false - }, - "enumerated": false, - "marker": "-" - }, - { - "self_ref": "#/texts/22", - "parent": { - "$ref": "#/groups/4" + "$ref": "#/groups/9" }, "children": [], "content_layer": "body", @@ -593,7 +682,29 @@ "bold": true, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" + }, + "enumerated": false, + "marker": "-" + }, + { + "self_ref": "#/texts/22", + "parent": { + "$ref": "#/groups/10" + }, + "children": [], + "content_layer": "body", + "label": "list_item", + "prov": [], + "orig": "italic", + "text": "italic", + "formatting": { + "bold": false, + "italic": true, + "underline": false, + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -601,14 +712,23 @@ { "self_ref": "#/texts/23", "parent": { - "$ref": "#/body" + "$ref": "#/groups/11" }, "children": [], "content_layer": "body", - "label": "paragraph", + "label": "list_item", "prov": [], - "orig": "", - "text": "" + "orig": "Some", + "text": "Some", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + }, + "enumerated": false, + "marker": "-" } ], "pictures": [], diff --git a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md index 918e89e2..05ee80fc 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md +++ b/tests/data/groundtruth/docling_v2/unit_test_formatting.docx.md @@ -13,5 +13,5 @@ Normal *italic* **bold** underline and [hyperlink](https:/github.com/DS4SD/docli - *Italic bullet 1* - **Bold bullet 2** - Underline bullet 3 -- Some *italic* **bold** underline - - Nested *italic* **bold** \ No newline at end of file + - Some - *italic* - **bold** - underline + - Nested - *italic* - **bold** \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_headers.docx.json b/tests/data/groundtruth/docling_v2/unit_test_headers.docx.json index 6383aba0..32288fe5 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_headers.docx.json +++ b/tests/data/groundtruth/docling_v2/unit_test_headers.docx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "unit_test_headers", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", @@ -138,7 +138,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -168,7 +169,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -239,7 +241,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -269,7 +272,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -343,7 +347,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -373,7 +378,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -447,7 +453,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -477,7 +484,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -566,7 +574,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -596,7 +605,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -667,7 +677,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -697,7 +708,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -771,7 +783,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -801,7 +814,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { diff --git a/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json index 779d76d7..a0883e7e 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json +++ b/tests/data/groundtruth/docling_v2/unit_test_headers_numbered.docx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "unit_test_headers_numbered", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", @@ -214,7 +214,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -244,7 +245,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -315,7 +317,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -345,7 +348,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -419,7 +423,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -449,7 +454,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -523,7 +529,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -553,7 +560,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -620,7 +628,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -650,7 +659,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -721,7 +731,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -751,7 +762,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -825,7 +837,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -855,7 +868,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { diff --git a/tests/data/groundtruth/docling_v2/unit_test_lists.docx.json b/tests/data/groundtruth/docling_v2/unit_test_lists.docx.json index 66f2e636..2f0b928d 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_lists.docx.json +++ b/tests/data/groundtruth/docling_v2/unit_test_lists.docx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "unit_test_lists", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", @@ -370,7 +370,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -400,7 +401,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -450,7 +452,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -470,7 +473,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -490,7 +494,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -542,7 +547,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -562,7 +568,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -582,7 +589,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -634,7 +642,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -654,7 +663,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -674,7 +684,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -694,7 +705,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -714,7 +726,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -734,7 +747,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -786,7 +800,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -806,7 +821,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -826,7 +842,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -878,7 +895,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -898,7 +916,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -918,7 +937,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -938,7 +958,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -996,7 +1017,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -1016,7 +1038,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -1036,7 +1059,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -1056,7 +1080,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -1076,7 +1101,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -1096,7 +1122,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.json b/tests/data/groundtruth/docling_v2/wiki_duck.html.json index 05d36454..952c96bf 100644 --- a/tests/data/groundtruth/docling_v2/wiki_duck.html.json +++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "wiki_duck", "origin": { "mimetype": "text/html", @@ -8489,7 +8489,8 @@ } ] ] - } + }, + "annotations": [] }, { "self_ref": "#/tables/1", @@ -8648,7 +8649,8 @@ } ] ] - } + }, + "annotations": [] } ], "key_value_items": [], diff --git a/tests/data/groundtruth/docling_v2/word_sample.docx.json b/tests/data/groundtruth/docling_v2/word_sample.docx.json index 432a5087..1f94d916 100644 --- a/tests/data/groundtruth/docling_v2/word_sample.docx.json +++ b/tests/data/groundtruth/docling_v2/word_sample.docx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "word_sample", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", @@ -106,7 +106,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -149,7 +150,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -167,7 +169,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -217,7 +220,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -235,7 +239,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -255,7 +260,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -275,7 +281,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -295,7 +302,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -313,7 +321,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -333,7 +342,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -353,7 +363,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -373,7 +384,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -426,7 +438,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -444,7 +457,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -462,7 +476,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -492,7 +507,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -510,7 +526,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -530,7 +547,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -550,7 +568,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" }, "enumerated": false, "marker": "-" @@ -897,7 +916,8 @@ } ] ] - } + }, + "annotations": [] } ], "key_value_items": [], diff --git a/tests/data/groundtruth/docling_v2/word_tables.docx.json b/tests/data/groundtruth/docling_v2/word_tables.docx.json index 2b3fc43f..e215c27a 100644 --- a/tests/data/groundtruth/docling_v2/word_tables.docx.json +++ b/tests/data/groundtruth/docling_v2/word_tables.docx.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "word_tables", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", @@ -119,7 +119,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -149,7 +150,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -179,7 +181,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -209,7 +212,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -239,7 +243,8 @@ "bold": false, "italic": false, "underline": false, - "strikethrough": false + "strikethrough": false, + "script": "baseline" } }, { @@ -510,7 +515,8 @@ } ] ] - } + }, + "annotations": [] }, { "self_ref": "#/tables/1", @@ -729,7 +735,8 @@ } ] ] - } + }, + "annotations": [] }, { "self_ref": "#/tables/2", @@ -1020,7 +1027,8 @@ } ] ] - } + }, + "annotations": [] }, { "self_ref": "#/tables/3", @@ -1387,7 +1395,8 @@ } ] ] - } + }, + "annotations": [] }, { "self_ref": "#/tables/4", @@ -2398,7 +2407,8 @@ } ] ] - } + }, + "annotations": [] } ], "key_value_items": [], diff --git a/tests/data/webp/groundtruth/docling_v2/webp-test.json b/tests/data/webp/groundtruth/docling_v2/webp-test.json index bf14a5c1..a53da5c9 100644 --- a/tests/data/webp/groundtruth/docling_v2/webp-test.json +++ b/tests/data/webp/groundtruth/docling_v2/webp-test.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "webp-test", "origin": { "mimetype": "application/pdf", diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test.json b/tests/data_scanned/groundtruth/docling_v2/ocr_test.json index e08af9cf..4c796c0b 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test.json +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "ocr_test", "origin": { "mimetype": "application/pdf", diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.json b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.json index 835b1c74..5ecd3ec1 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.json +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_180.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "ocr_test_rotated_180", "origin": { "mimetype": "application/pdf", diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.json b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.json index 69a028d4..17633a7b 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.json +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_270.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "ocr_test_rotated_270", "origin": { "mimetype": "application/pdf", diff --git a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.json b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.json index 94dc806f..32e62f7a 100644 --- a/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.json +++ b/tests/data_scanned/groundtruth/docling_v2/ocr_test_rotated_90.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.3.0", + "version": "1.4.0", "name": "ocr_test_rotated_90", "origin": { "mimetype": "application/pdf",