diff --git a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json index e938e2d7..dd51e390 100644 --- a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json +++ b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json @@ -213,10 +213,10 @@ "prov": [ { "bbox": [ - 139.6674041748047, + 139.66741943359375, 322.5054626464844, 475.00927734375, - 454.4546203613281 + 454.45458984375 ], "page": 1, "span": [ diff --git a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json index 3bca0d55..5db555b0 100644 --- a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json +++ b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json @@ -2646,7 +2646,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373533129692078, + "confidence": 0.9373534917831421, "cells": [ { "index": 0, @@ -2686,7 +2686,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858679533004761, + "confidence": 0.8858680725097656, "cells": [ { "index": 1, @@ -2726,7 +2726,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806435108184814, + "confidence": 0.9806433916091919, "cells": [ { "index": 2, @@ -2881,7 +2881,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.9850425124168396, + "confidence": 0.98504239320755, "cells": [ { "index": 7, @@ -3096,7 +3096,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591907262802124, + "confidence": 0.9591909050941467, "cells": [ { "index": 15, @@ -3280,8 +3280,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -7787,7 +7787,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589295387268066, + "confidence": 0.9589294195175171, "cells": [ { "index": 91, @@ -7852,7 +7852,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849976301193237, + "confidence": 0.9849975109100342, "cells": [ { "index": 93, @@ -8184,8 +8184,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -13582,7 +13582,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373533129692078, + "confidence": 0.9373534917831421, "cells": [ { "index": 0, @@ -13628,7 +13628,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858679533004761, + "confidence": 0.8858680725097656, "cells": [ { "index": 1, @@ -13674,7 +13674,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806435108184814, + "confidence": 0.9806433916091919, "cells": [ { "index": 2, @@ -13841,7 +13841,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.9850425124168396, + "confidence": 0.98504239320755, "cells": [ { "index": 7, @@ -14062,7 +14062,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591907262802124, + "confidence": 0.9591909050941467, "cells": [ { "index": 15, @@ -14252,8 +14252,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -19642,7 +19642,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589295387268066, + "confidence": 0.9589294195175171, "cells": [ { "index": 91, @@ -19713,7 +19713,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849976301193237, + "confidence": 0.9849975109100342, "cells": [ { "index": 93, @@ -20057,7 +20057,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806435108184814, + "confidence": 0.9806433916091919, "cells": [ { "index": 2, @@ -20224,7 +20224,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.9850425124168396, + "confidence": 0.98504239320755, "cells": [ { "index": 7, @@ -20445,7 +20445,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591907262802124, + "confidence": 0.9591909050941467, "cells": [ { "index": 15, @@ -20635,8 +20635,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -26025,7 +26025,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589295387268066, + "confidence": 0.9589294195175171, "cells": [ { "index": 91, @@ -26096,7 +26096,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849976301193237, + "confidence": 0.9849975109100342, "cells": [ { "index": 93, @@ -26440,7 +26440,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373533129692078, + "confidence": 0.9373534917831421, "cells": [ { "index": 0, @@ -26486,7 +26486,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858679533004761, + "confidence": 0.8858680725097656, "cells": [ { "index": 1, diff --git a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json index c0570096..f281a447 100644 --- a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json +++ b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json @@ -336,8 +336,8 @@ { "page_no": 1, "bbox": { - "l": 139.6674041748047, - "t": 454.4546203613281, + "l": 139.66741943359375, + "t": 454.45458984375, "r": 475.00927734375, "b": 322.5054626464844, "coord_origin": "BOTTOMLEFT" diff --git a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json index 3bca0d55..5db555b0 100644 --- a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json +++ b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json @@ -2646,7 +2646,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373533129692078, + "confidence": 0.9373534917831421, "cells": [ { "index": 0, @@ -2686,7 +2686,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858679533004761, + "confidence": 0.8858680725097656, "cells": [ { "index": 1, @@ -2726,7 +2726,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806435108184814, + "confidence": 0.9806433916091919, "cells": [ { "index": 2, @@ -2881,7 +2881,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.9850425124168396, + "confidence": 0.98504239320755, "cells": [ { "index": 7, @@ -3096,7 +3096,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591907262802124, + "confidence": 0.9591909050941467, "cells": [ { "index": 15, @@ -3280,8 +3280,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -7787,7 +7787,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589295387268066, + "confidence": 0.9589294195175171, "cells": [ { "index": 91, @@ -7852,7 +7852,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849976301193237, + "confidence": 0.9849975109100342, "cells": [ { "index": 93, @@ -8184,8 +8184,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -13582,7 +13582,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373533129692078, + "confidence": 0.9373534917831421, "cells": [ { "index": 0, @@ -13628,7 +13628,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858679533004761, + "confidence": 0.8858680725097656, "cells": [ { "index": 1, @@ -13674,7 +13674,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806435108184814, + "confidence": 0.9806433916091919, "cells": [ { "index": 2, @@ -13841,7 +13841,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.9850425124168396, + "confidence": 0.98504239320755, "cells": [ { "index": 7, @@ -14062,7 +14062,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591907262802124, + "confidence": 0.9591909050941467, "cells": [ { "index": 15, @@ -14252,8 +14252,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -19642,7 +19642,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589295387268066, + "confidence": 0.9589294195175171, "cells": [ { "index": 91, @@ -19713,7 +19713,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849976301193237, + "confidence": 0.9849975109100342, "cells": [ { "index": 93, @@ -20057,7 +20057,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806435108184814, + "confidence": 0.9806433916091919, "cells": [ { "index": 2, @@ -20224,7 +20224,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.9850425124168396, + "confidence": 0.98504239320755, "cells": [ { "index": 7, @@ -20445,7 +20445,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591907262802124, + "confidence": 0.9591909050941467, "cells": [ { "index": 15, @@ -20635,8 +20635,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -26025,7 +26025,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589295387268066, + "confidence": 0.9589294195175171, "cells": [ { "index": 91, @@ -26096,7 +26096,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849976301193237, + "confidence": 0.9849975109100342, "cells": [ { "index": 93, @@ -26440,7 +26440,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373533129692078, + "confidence": 0.9373534917831421, "cells": [ { "index": 0, @@ -26486,7 +26486,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858679533004761, + "confidence": 0.8858680725097656, "cells": [ { "index": 1, diff --git a/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json b/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json index 9c01cb7e..04f8198e 100644 --- a/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json +++ b/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json @@ -4,7 +4,7 @@ "name": "sample_sales_data", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - "binary_hash": 4984052357623711224, + "binary_hash": 14806485565397602516, "filename": "sample_sales_data.xlsm" }, "furniture": { diff --git a/tests/data/groundtruth/docling_v2/textbox.docx.itxt b/tests/data/groundtruth/docling_v2/textbox.docx.itxt index e17e2be2..406de95f 100644 --- a/tests/data/groundtruth/docling_v2/textbox.docx.itxt +++ b/tests/data/groundtruth/docling_v2/textbox.docx.itxt @@ -5,92 +5,89 @@ item-0 at level 0: unspecified: group _root_ item-4 at level 1: section: group textbox item-5 at level 2: paragraph: Student falls ill item-6 at level 2: paragraph: - item-7 at level 2: paragraph: - item-8 at level 2: list: group list - item-9 at level 3: list_item: Suggested Reportable Symptoms: + item-7 at level 2: list: group list + item-8 at level 3: list_item: Suggested Reportable Symptoms: * ... sh * Blisters * Headache * Sore throat - item-10 at level 1: list_item: + item-9 at level 1: list_item: + item-10 at level 1: paragraph: item-11 at level 1: paragraph: - item-12 at level 1: paragraph: - item-13 at level 1: section: group textbox - item-14 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms + item-12 at level 1: section: group textbox + item-13 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms + item-14 at level 1: paragraph: item-15 at level 1: paragraph: item-16 at level 1: paragraph: item-17 at level 1: paragraph: - item-18 at level 1: paragraph: - item-19 at level 1: section: group textbox - item-20 at level 2: paragraph: Yes + item-18 at level 1: section: group textbox + item-19 at level 2: paragraph: Yes + item-20 at level 1: paragraph: item-21 at level 1: paragraph: - item-22 at level 1: paragraph: - item-23 at level 1: section: group textbox - item-24 at level 2: list: group list - item-25 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network. - item-26 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System. - item-27 at level 2: paragraph: - item-28 at level 2: paragraph: - item-29 at level 1: list: group list - item-30 at level 2: list_item: + item-22 at level 1: section: group textbox + item-23 at level 2: list: group list + item-24 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network. + item-25 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System. + item-26 at level 2: paragraph: + item-27 at level 1: list: group list + item-28 at level 2: list_item: + item-29 at level 1: paragraph: + item-30 at level 1: paragraph: item-31 at level 1: paragraph: item-32 at level 1: paragraph: item-33 at level 1: paragraph: - item-34 at level 1: paragraph: - item-35 at level 1: paragraph: - item-36 at level 1: section: group textbox - item-37 at level 2: paragraph: Health Bureau: - item-38 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control. - item-39 at level 2: list: group list - item-40 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection. - item-41 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act. - item-42 at level 2: paragraph: - item-43 at level 2: paragraph: - item-44 at level 1: list: group list - item-45 at level 2: list_item: - item-46 at level 1: paragraph: - item-47 at level 1: section: group textbox - item-48 at level 2: paragraph: Department of Education: + item-34 at level 1: section: group textbox + item-35 at level 2: paragraph: Health Bureau: + item-36 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control. + item-37 at level 2: list: group list + item-38 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection. + item-39 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act. + item-40 at level 2: paragraph: + item-41 at level 1: list: group list + item-42 at level 2: list_item: + item-43 at level 1: paragraph: + item-44 at level 1: section: group textbox + item-45 at level 2: paragraph: Department of Education: Collabo ... vention measures at all school levels. + item-46 at level 1: paragraph: + item-47 at level 1: paragraph: + item-48 at level 1: paragraph: item-49 at level 1: paragraph: item-50 at level 1: paragraph: item-51 at level 1: paragraph: item-52 at level 1: paragraph: - item-53 at level 1: paragraph: - item-54 at level 1: paragraph: - item-55 at level 1: paragraph: - item-56 at level 1: section: group textbox - item-57 at level 2: inline: group group - item-58 at level 3: paragraph: The Health Bureau will handle - item-59 at level 3: paragraph: reporting and specimen collection - item-60 at level 3: paragraph: . - item-61 at level 2: paragraph: - item-62 at level 2: paragraph: - item-63 at level 1: paragraph: - item-64 at level 1: paragraph: + item-53 at level 1: section: group textbox + item-54 at level 2: inline: group group + item-55 at level 3: paragraph: The Health Bureau will handle + item-56 at level 3: paragraph: reporting and specimen collection + item-57 at level 3: paragraph: . + item-58 at level 2: paragraph: + item-59 at level 1: paragraph: + item-60 at level 1: paragraph: + item-61 at level 1: paragraph: + item-62 at level 1: section: group textbox + item-63 at level 2: paragraph: Whether the epidemic has eased. + item-64 at level 2: paragraph: item-65 at level 1: paragraph: item-66 at level 1: section: group textbox - item-67 at level 2: paragraph: Whether the epidemic has eased. - item-68 at level 2: paragraph: - item-69 at level 2: paragraph: + item-67 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease. + item-68 at level 2: paragraph: No + item-69 at level 1: paragraph: item-70 at level 1: paragraph: item-71 at level 1: section: group textbox - item-72 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease. - item-73 at level 2: paragraph: No - item-74 at level 1: paragraph: - item-75 at level 1: paragraph: - item-76 at level 1: section: group textbox + item-72 at level 2: paragraph: Yes + item-73 at level 1: paragraph: + item-74 at level 1: section: group textbox + item-75 at level 2: paragraph: Yes + item-76 at level 1: paragraph: item-77 at level 1: paragraph: item-78 at level 1: section: group textbox - item-79 at level 1: paragraph: - item-80 at level 1: paragraph: - item-81 at level 1: section: group textbox - item-82 at level 2: paragraph: Case closed. - item-83 at level 2: paragraph: - item-84 at level 2: paragraph: - item-85 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary. + item-79 at level 2: paragraph: Case closed. + item-80 at level 2: paragraph: + item-81 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary. + item-82 at level 1: paragraph: + item-83 at level 1: section: group textbox + item-84 at level 2: paragraph: No + item-85 at level 1: paragraph: item-86 at level 1: paragraph: - item-87 at level 1: section: group textbox - item-88 at level 1: paragraph: - item-89 at level 1: paragraph: - item-90 at level 1: paragraph: \ No newline at end of file + item-87 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/textbox.docx.json b/tests/data/groundtruth/docling_v2/textbox.docx.json index 743fb578..840e937a 100644 --- a/tests/data/groundtruth/docling_v2/textbox.docx.json +++ b/tests/data/groundtruth/docling_v2/textbox.docx.json @@ -29,6 +29,9 @@ { "$ref": "#/groups/0" }, + { + "$ref": "#/texts/6" + }, { "$ref": "#/texts/7" }, @@ -36,10 +39,10 @@ "$ref": "#/texts/8" }, { - "$ref": "#/texts/9" + "$ref": "#/groups/2" }, { - "$ref": "#/groups/2" + "$ref": "#/texts/10" }, { "$ref": "#/texts/11" @@ -50,17 +53,14 @@ { "$ref": "#/texts/13" }, - { - "$ref": "#/texts/14" - }, { "$ref": "#/groups/3" }, { - "$ref": "#/texts/16" + "$ref": "#/texts/15" }, { - "$ref": "#/texts/17" + "$ref": "#/texts/16" }, { "$ref": "#/groups/4" @@ -68,6 +68,12 @@ { "$ref": "#/groups/6" }, + { + "$ref": "#/texts/21" + }, + { + "$ref": "#/texts/22" + }, { "$ref": "#/texts/23" }, @@ -77,12 +83,6 @@ { "$ref": "#/texts/25" }, - { - "$ref": "#/texts/26" - }, - { - "$ref": "#/texts/27" - }, { "$ref": "#/groups/7" }, @@ -90,11 +90,20 @@ "$ref": "#/groups/9" }, { - "$ref": "#/texts/35" + "$ref": "#/texts/32" }, { "$ref": "#/groups/10" }, + { + "$ref": "#/texts/34" + }, + { + "$ref": "#/texts/35" + }, + { + "$ref": "#/texts/36" + }, { "$ref": "#/texts/37" }, @@ -107,74 +116,65 @@ { "$ref": "#/texts/40" }, - { - "$ref": "#/texts/41" - }, - { - "$ref": "#/texts/42" - }, - { - "$ref": "#/texts/43" - }, { "$ref": "#/groups/11" }, { - "$ref": "#/texts/49" + "$ref": "#/texts/45" }, { - "$ref": "#/texts/50" + "$ref": "#/texts/46" }, { - "$ref": "#/texts/51" + "$ref": "#/texts/47" }, { "$ref": "#/groups/13" }, { - "$ref": "#/texts/55" + "$ref": "#/texts/50" }, { "$ref": "#/groups/14" }, + { + "$ref": "#/texts/53" + }, + { + "$ref": "#/texts/54" + }, + { + "$ref": "#/groups/15" + }, + { + "$ref": "#/texts/56" + }, + { + "$ref": "#/groups/16" + }, { "$ref": "#/texts/58" }, { "$ref": "#/texts/59" }, - { - "$ref": "#/groups/15" - }, - { - "$ref": "#/texts/60" - }, - { - "$ref": "#/groups/16" - }, - { - "$ref": "#/texts/61" - }, - { - "$ref": "#/texts/62" - }, { "$ref": "#/groups/17" }, { - "$ref": "#/texts/67" + "$ref": "#/texts/63" }, { "$ref": "#/groups/18" }, { - "$ref": "#/texts/68" + "$ref": "#/texts/65" }, { - "$ref": "#/texts/69" + "$ref": "#/texts/66" }, { - "$ref": "#/texts/70" + "$ref": "#/texts/67" } ], "content_layer": "body", @@ -194,9 +194,6 @@ { "$ref": "#/texts/4" }, - { - "$ref": "#/texts/5" - }, { "$ref": "#/groups/1" } @@ -212,7 +209,7 @@ }, "children": [ { - "$ref": "#/texts/6" + "$ref": "#/texts/5" } ], "content_layer": "body", @@ -226,7 +223,7 @@ }, "children": [ { - "$ref": "#/texts/10" + "$ref": "#/texts/9" } ], "content_layer": "body", @@ -240,7 +237,7 @@ }, "children": [ { - "$ref": "#/texts/15" + "$ref": "#/texts/14" } ], "content_layer": "body", @@ -257,10 +254,7 @@ "$ref": "#/groups/5" }, { - "$ref": "#/texts/20" - }, - { - "$ref": "#/texts/21" + "$ref": "#/texts/19" } ], "content_layer": "body", @@ -274,10 +268,10 @@ }, "children": [ { - "$ref": "#/texts/18" + "$ref": "#/texts/17" }, { - "$ref": "#/texts/19" + "$ref": "#/texts/18" } ], "content_layer": "body", @@ -291,7 +285,7 @@ }, "children": [ { - "$ref": "#/texts/22" + "$ref": "#/texts/20" } ], "content_layer": "body", @@ -305,19 +299,16 @@ }, "children": [ { - "$ref": "#/texts/28" + "$ref": "#/texts/26" }, { - "$ref": "#/texts/29" + "$ref": "#/texts/27" }, { "$ref": "#/groups/8" }, { - "$ref": "#/texts/32" - }, - { - "$ref": "#/texts/33" + "$ref": "#/texts/30" } ], "content_layer": "body", @@ -331,10 +322,10 @@ }, "children": [ { - "$ref": "#/texts/30" + "$ref": "#/texts/28" }, { - "$ref": "#/texts/31" + "$ref": "#/texts/29" } ], "content_layer": "body", @@ -348,7 +339,7 @@ }, "children": [ { - "$ref": "#/texts/34" + "$ref": "#/texts/31" } ], "content_layer": "body", @@ -362,7 +353,7 @@ }, "children": [ { - "$ref": "#/texts/36" + "$ref": "#/texts/33" } ], "content_layer": "body", @@ -379,10 +370,7 @@ "$ref": "#/groups/12" }, { - "$ref": "#/texts/47" - }, - { - "$ref": "#/texts/48" + "$ref": "#/texts/44" } ], "content_layer": "body", @@ -396,13 +384,13 @@ }, "children": [ { - "$ref": "#/texts/44" + "$ref": "#/texts/41" }, { - "$ref": "#/texts/45" + "$ref": "#/texts/42" }, { - "$ref": "#/texts/46" + "$ref": "#/texts/43" } ], "content_layer": "body", @@ -416,13 +404,10 @@ }, "children": [ { - "$ref": "#/texts/52" + "$ref": "#/texts/48" }, { - "$ref": "#/texts/53" - }, - { - "$ref": "#/texts/54" + "$ref": "#/texts/49" } ], "content_layer": "body", @@ -436,10 +421,10 @@ }, "children": [ { - "$ref": "#/texts/56" + "$ref": "#/texts/51" }, { - "$ref": "#/texts/57" + "$ref": "#/texts/52" } ], "content_layer": "body", @@ -451,7 +436,11 @@ "parent": { "$ref": "#/body" }, - "children": [], + "children": [ + { + "$ref": "#/texts/55" + } + ], "content_layer": "body", "name": "textbox", "label": "section" @@ -461,7 +450,11 @@ "parent": { "$ref": "#/body" }, - "children": [], + "children": [ + { + "$ref": "#/texts/57" + } + ], "content_layer": "body", "name": "textbox", "label": "section" @@ -473,16 +466,13 @@ }, "children": [ { - "$ref": "#/texts/63" + "$ref": "#/texts/60" }, { - "$ref": "#/texts/64" + "$ref": "#/texts/61" }, { - "$ref": "#/texts/65" - }, - { - "$ref": "#/texts/66" + "$ref": "#/texts/62" } ], "content_layer": "body", @@ -494,7 +484,11 @@ "parent": { "$ref": "#/body" }, - "children": [], + "children": [ + { + "$ref": "#/texts/64" + } + ], "content_layer": "body", "name": "textbox", "label": "section" @@ -581,18 +575,6 @@ }, { "self_ref": "#/texts/5", - "parent": { - "$ref": "#/groups/0" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/6", "parent": { "$ref": "#/groups/1" }, @@ -612,7 +594,7 @@ "marker": "-" }, { - "self_ref": "#/texts/7", + "self_ref": "#/texts/6", "parent": { "$ref": "#/body" }, @@ -625,6 +607,18 @@ "enumerated": false, "marker": "-" }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/8", "parent": { @@ -639,18 +633,6 @@ }, { "self_ref": "#/texts/9", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/10", "parent": { "$ref": "#/groups/2" }, @@ -667,6 +649,18 @@ "strikethrough": false } }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/11", "parent": { @@ -705,18 +699,6 @@ }, { "self_ref": "#/texts/14", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/15", "parent": { "$ref": "#/groups/3" }, @@ -733,6 +715,18 @@ "strikethrough": false } }, + { + "self_ref": "#/texts/15", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/16", "parent": { @@ -747,18 +741,6 @@ }, { "self_ref": "#/texts/17", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/18", "parent": { "$ref": "#/groups/5" }, @@ -778,7 +760,7 @@ "marker": "-" }, { - "self_ref": "#/texts/19", + "self_ref": "#/texts/18", "parent": { "$ref": "#/groups/5" }, @@ -797,32 +779,20 @@ "enumerated": false, "marker": "-" }, + { + "self_ref": "#/texts/19", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/20", - "parent": { - "$ref": "#/groups/4" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/21", - "parent": { - "$ref": "#/groups/4" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/22", "parent": { "$ref": "#/groups/6" }, @@ -835,6 +805,30 @@ "enumerated": false, "marker": "-" }, + { + "self_ref": "#/texts/21", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/22", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/23", "parent": { @@ -873,30 +867,6 @@ }, { "self_ref": "#/texts/26", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/27", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/28", "parent": { "$ref": "#/groups/7" }, @@ -914,7 +884,7 @@ } }, { - "self_ref": "#/texts/29", + "self_ref": "#/texts/27", "parent": { "$ref": "#/groups/7" }, @@ -932,7 +902,7 @@ } }, { - "self_ref": "#/texts/30", + "self_ref": "#/texts/28", "parent": { "$ref": "#/groups/8" }, @@ -952,7 +922,7 @@ "marker": "-" }, { - "self_ref": "#/texts/31", + "self_ref": "#/texts/29", "parent": { "$ref": "#/groups/8" }, @@ -972,7 +942,7 @@ "marker": "-" }, { - "self_ref": "#/texts/32", + "self_ref": "#/texts/30", "parent": { "$ref": "#/groups/7" }, @@ -984,19 +954,7 @@ "text": "" }, { - "self_ref": "#/texts/33", - "parent": { - "$ref": "#/groups/7" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/34", + "self_ref": "#/texts/31", "parent": { "$ref": "#/groups/9" }, @@ -1009,6 +967,48 @@ "enumerated": false, "marker": "-" }, + { + "self_ref": "#/texts/32", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/33", + "parent": { + "$ref": "#/groups/10" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.", + "text": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/34", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/35", "parent": { @@ -1024,20 +1024,14 @@ { "self_ref": "#/texts/36", "parent": { - "$ref": "#/groups/10" + "$ref": "#/body" }, "children": [], "content_layer": "body", "label": "paragraph", "prov": [], - "orig": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.", - "text": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false - } + "orig": "", + "text": "" }, { "self_ref": "#/texts/37", @@ -1089,42 +1083,6 @@ }, { "self_ref": "#/texts/41", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/42", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/43", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/44", "parent": { "$ref": "#/groups/12" }, @@ -1142,7 +1100,7 @@ } }, { - "self_ref": "#/texts/45", + "self_ref": "#/texts/42", "parent": { "$ref": "#/groups/12" }, @@ -1160,7 +1118,7 @@ } }, { - "self_ref": "#/texts/46", + "self_ref": "#/texts/43", "parent": { "$ref": "#/groups/12" }, @@ -1178,7 +1136,7 @@ } }, { - "self_ref": "#/texts/47", + "self_ref": "#/texts/44", "parent": { "$ref": "#/groups/11" }, @@ -1189,22 +1147,64 @@ "orig": "", "text": "" }, + { + "self_ref": "#/texts/45", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/46", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/47", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/48", "parent": { - "$ref": "#/groups/11" + "$ref": "#/groups/13" }, "children": [], "content_layer": "body", "label": "paragraph", "prov": [], - "orig": "", - "text": "" + "orig": "Whether the epidemic has eased.", + "text": "Whether the epidemic has eased.", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } }, { "self_ref": "#/texts/49", "parent": { - "$ref": "#/body" + "$ref": "#/groups/13" }, "children": [], "content_layer": "body", @@ -1227,72 +1227,6 @@ }, { "self_ref": "#/texts/51", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/52", - "parent": { - "$ref": "#/groups/13" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "Whether the epidemic has eased.", - "text": "Whether the epidemic has eased.", - "formatting": { - "bold": true, - "italic": false, - "underline": false, - "strikethrough": false - } - }, - { - "self_ref": "#/texts/53", - "parent": { - "$ref": "#/groups/13" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/54", - "parent": { - "$ref": "#/groups/13" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/55", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/56", "parent": { "$ref": "#/groups/14" }, @@ -1310,7 +1244,7 @@ } }, { - "self_ref": "#/texts/57", + "self_ref": "#/texts/52", "parent": { "$ref": "#/groups/14" }, @@ -1327,6 +1261,78 @@ "strikethrough": false } }, + { + "self_ref": "#/texts/53", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/54", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/55", + "parent": { + "$ref": "#/groups/15" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Yes", + "text": "Yes", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/56", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/57", + "parent": { + "$ref": "#/groups/16" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Yes", + "text": "Yes", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, { "self_ref": "#/texts/58", "parent": { @@ -1353,42 +1359,6 @@ }, { "self_ref": "#/texts/60", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/61", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/62", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/63", "parent": { "$ref": "#/groups/17" }, @@ -1406,7 +1376,7 @@ } }, { - "self_ref": "#/texts/64", + "self_ref": "#/texts/61", "parent": { "$ref": "#/groups/17" }, @@ -1418,19 +1388,7 @@ "text": "" }, { - "self_ref": "#/texts/65", - "parent": { - "$ref": "#/groups/17" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/66", + "self_ref": "#/texts/62", "parent": { "$ref": "#/groups/17" }, @@ -1447,6 +1405,60 @@ "strikethrough": false } }, + { + "self_ref": "#/texts/63", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/64", + "parent": { + "$ref": "#/groups/18" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "No", + "text": "No", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/65", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/66", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/67", "parent": { @@ -1458,42 +1470,6 @@ "prov": [], "orig": "", "text": "" - }, - { - "self_ref": "#/texts/68", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/69", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/70", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" } ], "pictures": [], diff --git a/tests/data/groundtruth/docling_v2/textbox.docx.md b/tests/data/groundtruth/docling_v2/textbox.docx.md index 9458bd0c..293c4d8c 100644 --- a/tests/data/groundtruth/docling_v2/textbox.docx.md +++ b/tests/data/groundtruth/docling_v2/textbox.docx.md @@ -40,6 +40,12 @@ The Health Bureau will handle **reporting and specimen collection** . No +Yes + +Yes + **Case closed.** -The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary. \ No newline at end of file +The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary. + +No \ No newline at end of file diff --git a/tests/data/webp/groundtruth/docling_v2/webp-test.doctags.txt b/tests/data/webp/groundtruth/docling_v2/webp-test.doctags.txt index 76fe886d..5682a134 100644 --- a/tests/data/webp/groundtruth/docling_v2/webp-test.doctags.txt +++ b/tests/data/webp/groundtruth/docling_v2/webp-test.doctags.txt @@ -1,2 +1,2 @@ -Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package +Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package \ No newline at end of file diff --git a/tests/data/webp/groundtruth/docling_v2/webp-test.json b/tests/data/webp/groundtruth/docling_v2/webp-test.json index 94c9bda7..bf14a5c1 100644 --- a/tests/data/webp/groundtruth/docling_v2/webp-test.json +++ b/tests/data/webp/groundtruth/docling_v2/webp-test.json @@ -42,10 +42,10 @@ { "page_no": 1, "bbox": { - "l": 238.19302423176944, + "l": 234.08627147881114, "t": 2570.0959833241664, - "r": 1696.0985546594009, - "b": 2315.204273887442, + "r": 1696.0985042090742, + "b": 2319.1220927976665, "coord_origin": "BOTTOMLEFT" }, "charspan": [ diff --git a/tests/data/webp/groundtruth/docling_v2/webp-test.pages.json b/tests/data/webp/groundtruth/docling_v2/webp-test.pages.json index 67ad465a..732403c0 100644 --- a/tests/data/webp/groundtruth/docling_v2/webp-test.pages.json +++ b/tests/data/webp/groundtruth/docling_v2/webp-test.pages.json @@ -40,14 +40,14 @@ "a": 255 }, "rect": { - "r_x0": 238.19302423176944, - "r_y0": 415.36904822716525, - "r_x1": 1696.0985546594009, - "r_y1": 415.36904822716525, - "r_x2": 1696.0985546594009, - "r_y2": 345.20535775097477, - "r_x3": 238.19302423176944, - "r_y3": 345.20535775097477, + "r_x0": 234.08627147881114, + "r_y0": 419.5788697734327, + "r_x1": 1696.0985042090742, + "r_y1": 419.5788697734327, + "r_x2": 1696.0985042090742, + "r_y2": 349.4151792972422, + "r_x3": 234.08627147881114, + "r_y3": 349.4151792972422, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -65,14 +65,14 @@ "a": 255 }, "rect": { - "r_x0": 245.43122061153045, - "r_y0": 513.795726112558, - "r_x1": 514.3223724413002, - "r_y1": 513.795726112558, - "r_x2": 514.3223724413002, - "r_y2": 436.0574704074058, - "r_x3": 245.43122061153045, - "r_y3": 436.0574704074058, + "r_x0": 242.29979922858777, + "r_y0": 509.8779072023336, + "r_x1": 513.3470125989277, + "r_y1": 509.8779072023336, + "r_x2": 513.3470125989277, + "r_y2": 439.9752910477536, + "r_x3": 242.29979922858777, + "r_y3": 439.9752910477536, "coord_origin": "TOPLEFT" }, "text": "package", @@ -90,13 +90,13 @@ "id": 0, "label": "text", "bbox": { - "l": 238.19302423176944, + "l": 234.08627147881114, "t": 258.9040166758338, - "r": 1696.0985546594009, - "b": 513.795726112558, + "r": 1696.0985042090742, + "b": 509.8779072023336, "coord_origin": "TOPLEFT" }, - "confidence": 0.9721010327339172, + "confidence": 0.9721011519432068, "cells": [ { "index": 0, @@ -132,14 +132,14 @@ "a": 255 }, "rect": { - "r_x0": 238.19302423176944, - "r_y0": 415.36904822716525, - "r_x1": 1696.0985546594009, - "r_y1": 415.36904822716525, - "r_x2": 1696.0985546594009, - "r_y2": 345.20535775097477, - "r_x3": 238.19302423176944, - "r_y3": 345.20535775097477, + "r_x0": 234.08627147881114, + "r_y0": 419.5788697734327, + "r_x1": 1696.0985042090742, + "r_y1": 419.5788697734327, + "r_x2": 1696.0985042090742, + "r_y2": 349.4151792972422, + "r_x3": 234.08627147881114, + "r_y3": 349.4151792972422, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -157,14 +157,14 @@ "a": 255 }, "rect": { - "r_x0": 245.43122061153045, - "r_y0": 513.795726112558, - "r_x1": 514.3223724413002, - "r_y1": 513.795726112558, - "r_x2": 514.3223724413002, - "r_y2": 436.0574704074058, - "r_x3": 245.43122061153045, - "r_y3": 436.0574704074058, + "r_x0": 242.29979922858777, + "r_y0": 509.8779072023336, + "r_x1": 513.3470125989277, + "r_y1": 509.8779072023336, + "r_x2": 513.3470125989277, + "r_y2": 439.9752910477536, + "r_x3": 242.29979922858777, + "r_y3": 439.9752910477536, "coord_origin": "TOPLEFT" }, "text": "package", @@ -195,13 +195,13 @@ "id": 0, "label": "text", "bbox": { - "l": 238.19302423176944, + "l": 234.08627147881114, "t": 258.9040166758338, - "r": 1696.0985546594009, - "b": 513.795726112558, + "r": 1696.0985042090742, + "b": 509.8779072023336, "coord_origin": "TOPLEFT" }, - "confidence": 0.9721010327339172, + "confidence": 0.9721011519432068, "cells": [ { "index": 0, @@ -237,14 +237,14 @@ "a": 255 }, "rect": { - "r_x0": 238.19302423176944, - "r_y0": 415.36904822716525, - "r_x1": 1696.0985546594009, - "r_y1": 415.36904822716525, - "r_x2": 1696.0985546594009, - "r_y2": 345.20535775097477, - "r_x3": 238.19302423176944, - "r_y3": 345.20535775097477, + "r_x0": 234.08627147881114, + "r_y0": 419.5788697734327, + "r_x1": 1696.0985042090742, + "r_y1": 419.5788697734327, + "r_x2": 1696.0985042090742, + "r_y2": 349.4151792972422, + "r_x3": 234.08627147881114, + "r_y3": 349.4151792972422, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -262,14 +262,14 @@ "a": 255 }, "rect": { - "r_x0": 245.43122061153045, - "r_y0": 513.795726112558, - "r_x1": 514.3223724413002, - "r_y1": 513.795726112558, - "r_x2": 514.3223724413002, - "r_y2": 436.0574704074058, - "r_x3": 245.43122061153045, - "r_y3": 436.0574704074058, + "r_x0": 242.29979922858777, + "r_y0": 509.8779072023336, + "r_x1": 513.3470125989277, + "r_y1": 509.8779072023336, + "r_x2": 513.3470125989277, + "r_y2": 439.9752910477536, + "r_x3": 242.29979922858777, + "r_y3": 439.9752910477536, "coord_origin": "TOPLEFT" }, "text": "package", @@ -293,13 +293,13 @@ "id": 0, "label": "text", "bbox": { - "l": 238.19302423176944, + "l": 234.08627147881114, "t": 258.9040166758338, - "r": 1696.0985546594009, - "b": 513.795726112558, + "r": 1696.0985042090742, + "b": 509.8779072023336, "coord_origin": "TOPLEFT" }, - "confidence": 0.9721010327339172, + "confidence": 0.9721011519432068, "cells": [ { "index": 0, @@ -335,14 +335,14 @@ "a": 255 }, "rect": { - "r_x0": 238.19302423176944, - "r_y0": 415.36904822716525, - "r_x1": 1696.0985546594009, - "r_y1": 415.36904822716525, - "r_x2": 1696.0985546594009, - "r_y2": 345.20535775097477, - "r_x3": 238.19302423176944, - "r_y3": 345.20535775097477, + "r_x0": 234.08627147881114, + "r_y0": 419.5788697734327, + "r_x1": 1696.0985042090742, + "r_y1": 419.5788697734327, + "r_x2": 1696.0985042090742, + "r_y2": 349.4151792972422, + "r_x3": 234.08627147881114, + "r_y3": 349.4151792972422, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -360,14 +360,14 @@ "a": 255 }, "rect": { - "r_x0": 245.43122061153045, - "r_y0": 513.795726112558, - "r_x1": 514.3223724413002, - "r_y1": 513.795726112558, - "r_x2": 514.3223724413002, - "r_y2": 436.0574704074058, - "r_x3": 245.43122061153045, - "r_y3": 436.0574704074058, + "r_x0": 242.29979922858777, + "r_y0": 509.8779072023336, + "r_x1": 513.3470125989277, + "r_y1": 509.8779072023336, + "r_x2": 513.3470125989277, + "r_y2": 439.9752910477536, + "r_x3": 242.29979922858777, + "r_y3": 439.9752910477536, "coord_origin": "TOPLEFT" }, "text": "package", diff --git a/tests/data/xlsx/sample_sales_data.xlsm b/tests/data/xlsx/sample_sales_data.xlsm index 8aae36be..0f3832a0 100644 Binary files a/tests/data/xlsx/sample_sales_data.xlsm and b/tests/data/xlsx/sample_sales_data.xlsm differ diff --git a/tests/test_backend_msexcel.py b/tests/test_backend_msexcel.py index 2e2fdeeb..6dd73425 100644 --- a/tests/test_backend_msexcel.py +++ b/tests/test_backend_msexcel.py @@ -79,30 +79,21 @@ def test_pages(documents) -> None: documents: The paths and converted documents. """ # number of pages from the backend method - # Logic to handle multiple files - file_stems = ["sample_sales_data","test-01"] - for stem in file_stems: - path = next(item for item in get_excel_paths() if item.stem == stem) - in_doc = InputDocument( - path_or_stream=path, - format=InputFormat.XLSX, - filename=path.stem, - backend=MsExcelDocumentBackend, - ) - backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=path) - # Update the expected page count based on actual content - expected_page_count = 3 # Adjust this value based on the actual number of worksheets this needs to be adjusted for each xlsm and xlsx files independently - assert backend.page_count() == expected_page_count + path = next(item for item in get_excel_paths() if item.stem == "test-01") + in_doc = InputDocument( + path_or_stream=path, + format=InputFormat.XLSX, + filename=path.stem, + backend=MsExcelDocumentBackend, + ) + backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=path) + assert backend.page_count() == 3 - # number of pages from the converted document - doc = next(item for path, item in documents if path.stem == stem) - assert len(doc.pages) == 3 + # number of pages from the converted document + doc = next(item for path, item in documents if path.stem == "test-01") + assert len(doc.pages) == 3 - # page sizes as number of cells - - # for xlsm file just adjust this wrt the xlsm files for test xlsm enable this: - #assert doc.pages.get(1).size.as_tuple() == (4.0, 21.0) - # for xlsx file: - assert doc.pages.get(1).size.as_tuple() == (3.0, 7.0) - assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0) - assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0) + # page sizes as number of cells + assert doc.pages.get(1).size.as_tuple() == (3.0, 7.0) + assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0) + assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0)