From 3c922b4105273043635a87ac54a85bf683b6e43c Mon Sep 17 00:00:00 2001 From: Christoph Auer Date: Tue, 10 Jun 2025 13:14:32 +0200 Subject: [PATCH] Fix tests, upgrade XSLM example to a valid file Signed-off-by: Christoph Auer --- .../docling_v1/2305.03393v1-pg9.json | 4 +- .../docling_v1/2305.03393v1-pg9.pages.json | 58 +- .../docling_v2/2305.03393v1-pg9.json | 4 +- .../docling_v2/2305.03393v1-pg9.pages.json | 58 +- .../docling_v2/sample_sales_data.xlsm.json | 2 +- .../groundtruth/docling_v2/textbox.docx.itxt | 127 ++- .../groundtruth/docling_v2/textbox.docx.json | 800 +++++++++--------- .../groundtruth/docling_v2/textbox.docx.md | 8 +- .../docling_v2/webp-test.doctags.txt | 2 +- .../groundtruth/docling_v2/webp-test.json | 6 +- .../docling_v2/webp-test.pages.json | 152 ++-- tests/data/xlsx/sample_sales_data.xlsm | Bin 6103 -> 9945 bytes tests/test_backend_msexcel.py | 41 +- 13 files changed, 616 insertions(+), 646 deletions(-) diff --git a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json index e938e2d7..dd51e390 100644 --- a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json +++ b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.json @@ -213,10 +213,10 @@ "prov": [ { "bbox": [ - 139.6674041748047, + 139.66741943359375, 322.5054626464844, 475.00927734375, - 454.4546203613281 + 454.45458984375 ], "page": 1, "span": [ diff --git a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json index 3bca0d55..5db555b0 100644 --- a/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json +++ b/tests/data/groundtruth/docling_v1/2305.03393v1-pg9.pages.json @@ -2646,7 +2646,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373533129692078, + "confidence": 0.9373534917831421, "cells": [ { "index": 0, @@ -2686,7 +2686,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858679533004761, + "confidence": 0.8858680725097656, "cells": [ { "index": 1, @@ -2726,7 +2726,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806435108184814, + "confidence": 0.9806433916091919, "cells": [ { "index": 2, @@ -2881,7 +2881,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.9850425124168396, + "confidence": 0.98504239320755, "cells": [ { "index": 7, @@ -3096,7 +3096,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591907262802124, + "confidence": 0.9591909050941467, "cells": [ { "index": 15, @@ -3280,8 +3280,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -7787,7 +7787,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589295387268066, + "confidence": 0.9589294195175171, "cells": [ { "index": 91, @@ -7852,7 +7852,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849976301193237, + "confidence": 0.9849975109100342, "cells": [ { "index": 93, @@ -8184,8 +8184,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -13582,7 +13582,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373533129692078, + "confidence": 0.9373534917831421, "cells": [ { "index": 0, @@ -13628,7 +13628,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858679533004761, + "confidence": 0.8858680725097656, "cells": [ { "index": 1, @@ -13674,7 +13674,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806435108184814, + "confidence": 0.9806433916091919, "cells": [ { "index": 2, @@ -13841,7 +13841,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.9850425124168396, + "confidence": 0.98504239320755, "cells": [ { "index": 7, @@ -14062,7 +14062,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591907262802124, + "confidence": 0.9591909050941467, "cells": [ { "index": 15, @@ -14252,8 +14252,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -19642,7 +19642,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589295387268066, + "confidence": 0.9589294195175171, "cells": [ { "index": 91, @@ -19713,7 +19713,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849976301193237, + "confidence": 0.9849975109100342, "cells": [ { "index": 93, @@ -20057,7 +20057,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806435108184814, + "confidence": 0.9806433916091919, "cells": [ { "index": 2, @@ -20224,7 +20224,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.9850425124168396, + "confidence": 0.98504239320755, "cells": [ { "index": 7, @@ -20445,7 +20445,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591907262802124, + "confidence": 0.9591909050941467, "cells": [ { "index": 15, @@ -20635,8 +20635,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -26025,7 +26025,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589295387268066, + "confidence": 0.9589294195175171, "cells": [ { "index": 91, @@ -26096,7 +26096,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849976301193237, + "confidence": 0.9849975109100342, "cells": [ { "index": 93, @@ -26440,7 +26440,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373533129692078, + "confidence": 0.9373534917831421, "cells": [ { "index": 0, @@ -26486,7 +26486,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858679533004761, + "confidence": 0.8858680725097656, "cells": [ { "index": 1, diff --git a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json index c0570096..f281a447 100644 --- a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json +++ b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.json @@ -336,8 +336,8 @@ { "page_no": 1, "bbox": { - "l": 139.6674041748047, - "t": 454.4546203613281, + "l": 139.66741943359375, + "t": 454.45458984375, "r": 475.00927734375, "b": 322.5054626464844, "coord_origin": "BOTTOMLEFT" diff --git a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json index 3bca0d55..5db555b0 100644 --- a/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json +++ b/tests/data/groundtruth/docling_v2/2305.03393v1-pg9.pages.json @@ -2646,7 +2646,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373533129692078, + "confidence": 0.9373534917831421, "cells": [ { "index": 0, @@ -2686,7 +2686,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858679533004761, + "confidence": 0.8858680725097656, "cells": [ { "index": 1, @@ -2726,7 +2726,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806435108184814, + "confidence": 0.9806433916091919, "cells": [ { "index": 2, @@ -2881,7 +2881,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.9850425124168396, + "confidence": 0.98504239320755, "cells": [ { "index": 7, @@ -3096,7 +3096,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591907262802124, + "confidence": 0.9591909050941467, "cells": [ { "index": 15, @@ -3280,8 +3280,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -7787,7 +7787,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589295387268066, + "confidence": 0.9589294195175171, "cells": [ { "index": 91, @@ -7852,7 +7852,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849976301193237, + "confidence": 0.9849975109100342, "cells": [ { "index": 93, @@ -8184,8 +8184,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -13582,7 +13582,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373533129692078, + "confidence": 0.9373534917831421, "cells": [ { "index": 0, @@ -13628,7 +13628,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858679533004761, + "confidence": 0.8858680725097656, "cells": [ { "index": 1, @@ -13674,7 +13674,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806435108184814, + "confidence": 0.9806433916091919, "cells": [ { "index": 2, @@ -13841,7 +13841,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.9850425124168396, + "confidence": 0.98504239320755, "cells": [ { "index": 7, @@ -14062,7 +14062,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591907262802124, + "confidence": 0.9591909050941467, "cells": [ { "index": 15, @@ -14252,8 +14252,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -19642,7 +19642,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589295387268066, + "confidence": 0.9589294195175171, "cells": [ { "index": 91, @@ -19713,7 +19713,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849976301193237, + "confidence": 0.9849975109100342, "cells": [ { "index": 93, @@ -20057,7 +20057,7 @@ "b": 152.90697999999998, "coord_origin": "TOPLEFT" }, - "confidence": 0.9806435108184814, + "confidence": 0.9806433916091919, "cells": [ { "index": 2, @@ -20224,7 +20224,7 @@ "b": 255.42400999999995, "coord_origin": "TOPLEFT" }, - "confidence": 0.9850425124168396, + "confidence": 0.98504239320755, "cells": [ { "index": 7, @@ -20445,7 +20445,7 @@ "b": 327.98218, "coord_origin": "TOPLEFT" }, - "confidence": 0.9591907262802124, + "confidence": 0.9591909050941467, "cells": [ { "index": 15, @@ -20635,8 +20635,8 @@ "id": 0, "label": "table", "bbox": { - "l": 139.6674041748047, - "t": 337.5453796386719, + "l": 139.66741943359375, + "t": 337.54541015625, "r": 475.00927734375, "b": 469.4945373535156, "coord_origin": "TOPLEFT" @@ -26025,7 +26025,7 @@ "b": 518.17419, "coord_origin": "TOPLEFT" }, - "confidence": 0.9589295387268066, + "confidence": 0.9589294195175171, "cells": [ { "index": 91, @@ -26096,7 +26096,7 @@ "b": 618.3, "coord_origin": "TOPLEFT" }, - "confidence": 0.9849976301193237, + "confidence": 0.9849975109100342, "cells": [ { "index": 93, @@ -26440,7 +26440,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.9373533129692078, + "confidence": 0.9373534917831421, "cells": [ { "index": 0, @@ -26486,7 +26486,7 @@ "b": 102.78223000000003, "coord_origin": "TOPLEFT" }, - "confidence": 0.8858679533004761, + "confidence": 0.8858680725097656, "cells": [ { "index": 1, diff --git a/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json b/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json index 9c01cb7e..04f8198e 100644 --- a/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json +++ b/tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json @@ -4,7 +4,7 @@ "name": "sample_sales_data", "origin": { "mimetype": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", - "binary_hash": 4984052357623711224, + "binary_hash": 14806485565397602516, "filename": "sample_sales_data.xlsm" }, "furniture": { diff --git a/tests/data/groundtruth/docling_v2/textbox.docx.itxt b/tests/data/groundtruth/docling_v2/textbox.docx.itxt index e17e2be2..406de95f 100644 --- a/tests/data/groundtruth/docling_v2/textbox.docx.itxt +++ b/tests/data/groundtruth/docling_v2/textbox.docx.itxt @@ -5,92 +5,89 @@ item-0 at level 0: unspecified: group _root_ item-4 at level 1: section: group textbox item-5 at level 2: paragraph: Student falls ill item-6 at level 2: paragraph: - item-7 at level 2: paragraph: - item-8 at level 2: list: group list - item-9 at level 3: list_item: Suggested Reportable Symptoms: + item-7 at level 2: list: group list + item-8 at level 3: list_item: Suggested Reportable Symptoms: * ... sh * Blisters * Headache * Sore throat - item-10 at level 1: list_item: + item-9 at level 1: list_item: + item-10 at level 1: paragraph: item-11 at level 1: paragraph: - item-12 at level 1: paragraph: - item-13 at level 1: section: group textbox - item-14 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms + item-12 at level 1: section: group textbox + item-13 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms + item-14 at level 1: paragraph: item-15 at level 1: paragraph: item-16 at level 1: paragraph: item-17 at level 1: paragraph: - item-18 at level 1: paragraph: - item-19 at level 1: section: group textbox - item-20 at level 2: paragraph: Yes + item-18 at level 1: section: group textbox + item-19 at level 2: paragraph: Yes + item-20 at level 1: paragraph: item-21 at level 1: paragraph: - item-22 at level 1: paragraph: - item-23 at level 1: section: group textbox - item-24 at level 2: list: group list - item-25 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network. - item-26 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System. - item-27 at level 2: paragraph: - item-28 at level 2: paragraph: - item-29 at level 1: list: group list - item-30 at level 2: list_item: + item-22 at level 1: section: group textbox + item-23 at level 2: list: group list + item-24 at level 3: list_item: A report must be submitted withi ... saster Prevention Information Network. + item-25 at level 3: list_item: A report must also be submitted ... d Infectious Disease Reporting System. + item-26 at level 2: paragraph: + item-27 at level 1: list: group list + item-28 at level 2: list_item: + item-29 at level 1: paragraph: + item-30 at level 1: paragraph: item-31 at level 1: paragraph: item-32 at level 1: paragraph: item-33 at level 1: paragraph: - item-34 at level 1: paragraph: - item-35 at level 1: paragraph: - item-36 at level 1: section: group textbox - item-37 at level 2: paragraph: Health Bureau: - item-38 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control. - item-39 at level 2: list: group list - item-40 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection. - item-41 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act. - item-42 at level 2: paragraph: - item-43 at level 2: paragraph: - item-44 at level 1: list: group list - item-45 at level 2: list_item: - item-46 at level 1: paragraph: - item-47 at level 1: section: group textbox - item-48 at level 2: paragraph: Department of Education: + item-34 at level 1: section: group textbox + item-35 at level 2: paragraph: Health Bureau: + item-36 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control. + item-37 at level 2: list: group list + item-38 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection. + item-39 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act. + item-40 at level 2: paragraph: + item-41 at level 1: list: group list + item-42 at level 2: list_item: + item-43 at level 1: paragraph: + item-44 at level 1: section: group textbox + item-45 at level 2: paragraph: Department of Education: Collabo ... vention measures at all school levels. + item-46 at level 1: paragraph: + item-47 at level 1: paragraph: + item-48 at level 1: paragraph: item-49 at level 1: paragraph: item-50 at level 1: paragraph: item-51 at level 1: paragraph: item-52 at level 1: paragraph: - item-53 at level 1: paragraph: - item-54 at level 1: paragraph: - item-55 at level 1: paragraph: - item-56 at level 1: section: group textbox - item-57 at level 2: inline: group group - item-58 at level 3: paragraph: The Health Bureau will handle - item-59 at level 3: paragraph: reporting and specimen collection - item-60 at level 3: paragraph: . - item-61 at level 2: paragraph: - item-62 at level 2: paragraph: - item-63 at level 1: paragraph: - item-64 at level 1: paragraph: + item-53 at level 1: section: group textbox + item-54 at level 2: inline: group group + item-55 at level 3: paragraph: The Health Bureau will handle + item-56 at level 3: paragraph: reporting and specimen collection + item-57 at level 3: paragraph: . + item-58 at level 2: paragraph: + item-59 at level 1: paragraph: + item-60 at level 1: paragraph: + item-61 at level 1: paragraph: + item-62 at level 1: section: group textbox + item-63 at level 2: paragraph: Whether the epidemic has eased. + item-64 at level 2: paragraph: item-65 at level 1: paragraph: item-66 at level 1: section: group textbox - item-67 at level 2: paragraph: Whether the epidemic has eased. - item-68 at level 2: paragraph: - item-69 at level 2: paragraph: + item-67 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease. + item-68 at level 2: paragraph: No + item-69 at level 1: paragraph: item-70 at level 1: paragraph: item-71 at level 1: section: group textbox - item-72 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease. - item-73 at level 2: paragraph: No - item-74 at level 1: paragraph: - item-75 at level 1: paragraph: - item-76 at level 1: section: group textbox + item-72 at level 2: paragraph: Yes + item-73 at level 1: paragraph: + item-74 at level 1: section: group textbox + item-75 at level 2: paragraph: Yes + item-76 at level 1: paragraph: item-77 at level 1: paragraph: item-78 at level 1: section: group textbox - item-79 at level 1: paragraph: - item-80 at level 1: paragraph: - item-81 at level 1: section: group textbox - item-82 at level 2: paragraph: Case closed. - item-83 at level 2: paragraph: - item-84 at level 2: paragraph: - item-85 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary. + item-79 at level 2: paragraph: Case closed. + item-80 at level 2: paragraph: + item-81 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary. + item-82 at level 1: paragraph: + item-83 at level 1: section: group textbox + item-84 at level 2: paragraph: No + item-85 at level 1: paragraph: item-86 at level 1: paragraph: - item-87 at level 1: section: group textbox - item-88 at level 1: paragraph: - item-89 at level 1: paragraph: - item-90 at level 1: paragraph: \ No newline at end of file + item-87 at level 1: paragraph: \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/textbox.docx.json b/tests/data/groundtruth/docling_v2/textbox.docx.json index 743fb578..840e937a 100644 --- a/tests/data/groundtruth/docling_v2/textbox.docx.json +++ b/tests/data/groundtruth/docling_v2/textbox.docx.json @@ -29,6 +29,9 @@ { "$ref": "#/groups/0" }, + { + "$ref": "#/texts/6" + }, { "$ref": "#/texts/7" }, @@ -36,10 +39,10 @@ "$ref": "#/texts/8" }, { - "$ref": "#/texts/9" + "$ref": "#/groups/2" }, { - "$ref": "#/groups/2" + "$ref": "#/texts/10" }, { "$ref": "#/texts/11" @@ -50,17 +53,14 @@ { "$ref": "#/texts/13" }, - { - "$ref": "#/texts/14" - }, { "$ref": "#/groups/3" }, { - "$ref": "#/texts/16" + "$ref": "#/texts/15" }, { - "$ref": "#/texts/17" + "$ref": "#/texts/16" }, { "$ref": "#/groups/4" @@ -68,6 +68,12 @@ { "$ref": "#/groups/6" }, + { + "$ref": "#/texts/21" + }, + { + "$ref": "#/texts/22" + }, { "$ref": "#/texts/23" }, @@ -77,12 +83,6 @@ { "$ref": "#/texts/25" }, - { - "$ref": "#/texts/26" - }, - { - "$ref": "#/texts/27" - }, { "$ref": "#/groups/7" }, @@ -90,11 +90,20 @@ "$ref": "#/groups/9" }, { - "$ref": "#/texts/35" + "$ref": "#/texts/32" }, { "$ref": "#/groups/10" }, + { + "$ref": "#/texts/34" + }, + { + "$ref": "#/texts/35" + }, + { + "$ref": "#/texts/36" + }, { "$ref": "#/texts/37" }, @@ -107,74 +116,65 @@ { "$ref": "#/texts/40" }, - { - "$ref": "#/texts/41" - }, - { - "$ref": "#/texts/42" - }, - { - "$ref": "#/texts/43" - }, { "$ref": "#/groups/11" }, { - "$ref": "#/texts/49" + "$ref": "#/texts/45" }, { - "$ref": "#/texts/50" + "$ref": "#/texts/46" }, { - "$ref": "#/texts/51" + "$ref": "#/texts/47" }, { "$ref": "#/groups/13" }, { - "$ref": "#/texts/55" + "$ref": "#/texts/50" }, { "$ref": "#/groups/14" }, + { + "$ref": "#/texts/53" + }, + { + "$ref": "#/texts/54" + }, + { + "$ref": "#/groups/15" + }, + { + "$ref": "#/texts/56" + }, + { + "$ref": "#/groups/16" + }, { "$ref": "#/texts/58" }, { "$ref": "#/texts/59" }, - { - "$ref": "#/groups/15" - }, - { - "$ref": "#/texts/60" - }, - { - "$ref": "#/groups/16" - }, - { - "$ref": "#/texts/61" - }, - { - "$ref": "#/texts/62" - }, { "$ref": "#/groups/17" }, { - "$ref": "#/texts/67" + "$ref": "#/texts/63" }, { "$ref": "#/groups/18" }, { - "$ref": "#/texts/68" + "$ref": "#/texts/65" }, { - "$ref": "#/texts/69" + "$ref": "#/texts/66" }, { - "$ref": "#/texts/70" + "$ref": "#/texts/67" } ], "content_layer": "body", @@ -194,9 +194,6 @@ { "$ref": "#/texts/4" }, - { - "$ref": "#/texts/5" - }, { "$ref": "#/groups/1" } @@ -212,7 +209,7 @@ }, "children": [ { - "$ref": "#/texts/6" + "$ref": "#/texts/5" } ], "content_layer": "body", @@ -226,7 +223,7 @@ }, "children": [ { - "$ref": "#/texts/10" + "$ref": "#/texts/9" } ], "content_layer": "body", @@ -240,7 +237,7 @@ }, "children": [ { - "$ref": "#/texts/15" + "$ref": "#/texts/14" } ], "content_layer": "body", @@ -257,10 +254,7 @@ "$ref": "#/groups/5" }, { - "$ref": "#/texts/20" - }, - { - "$ref": "#/texts/21" + "$ref": "#/texts/19" } ], "content_layer": "body", @@ -274,10 +268,10 @@ }, "children": [ { - "$ref": "#/texts/18" + "$ref": "#/texts/17" }, { - "$ref": "#/texts/19" + "$ref": "#/texts/18" } ], "content_layer": "body", @@ -291,7 +285,7 @@ }, "children": [ { - "$ref": "#/texts/22" + "$ref": "#/texts/20" } ], "content_layer": "body", @@ -305,19 +299,16 @@ }, "children": [ { - "$ref": "#/texts/28" + "$ref": "#/texts/26" }, { - "$ref": "#/texts/29" + "$ref": "#/texts/27" }, { "$ref": "#/groups/8" }, { - "$ref": "#/texts/32" - }, - { - "$ref": "#/texts/33" + "$ref": "#/texts/30" } ], "content_layer": "body", @@ -331,10 +322,10 @@ }, "children": [ { - "$ref": "#/texts/30" + "$ref": "#/texts/28" }, { - "$ref": "#/texts/31" + "$ref": "#/texts/29" } ], "content_layer": "body", @@ -348,7 +339,7 @@ }, "children": [ { - "$ref": "#/texts/34" + "$ref": "#/texts/31" } ], "content_layer": "body", @@ -362,7 +353,7 @@ }, "children": [ { - "$ref": "#/texts/36" + "$ref": "#/texts/33" } ], "content_layer": "body", @@ -379,10 +370,7 @@ "$ref": "#/groups/12" }, { - "$ref": "#/texts/47" - }, - { - "$ref": "#/texts/48" + "$ref": "#/texts/44" } ], "content_layer": "body", @@ -396,13 +384,13 @@ }, "children": [ { - "$ref": "#/texts/44" + "$ref": "#/texts/41" }, { - "$ref": "#/texts/45" + "$ref": "#/texts/42" }, { - "$ref": "#/texts/46" + "$ref": "#/texts/43" } ], "content_layer": "body", @@ -416,13 +404,10 @@ }, "children": [ { - "$ref": "#/texts/52" + "$ref": "#/texts/48" }, { - "$ref": "#/texts/53" - }, - { - "$ref": "#/texts/54" + "$ref": "#/texts/49" } ], "content_layer": "body", @@ -436,10 +421,10 @@ }, "children": [ { - "$ref": "#/texts/56" + "$ref": "#/texts/51" }, { - "$ref": "#/texts/57" + "$ref": "#/texts/52" } ], "content_layer": "body", @@ -451,7 +436,11 @@ "parent": { "$ref": "#/body" }, - "children": [], + "children": [ + { + "$ref": "#/texts/55" + } + ], "content_layer": "body", "name": "textbox", "label": "section" @@ -461,7 +450,11 @@ "parent": { "$ref": "#/body" }, - "children": [], + "children": [ + { + "$ref": "#/texts/57" + } + ], "content_layer": "body", "name": "textbox", "label": "section" @@ -473,16 +466,13 @@ }, "children": [ { - "$ref": "#/texts/63" + "$ref": "#/texts/60" }, { - "$ref": "#/texts/64" + "$ref": "#/texts/61" }, { - "$ref": "#/texts/65" - }, - { - "$ref": "#/texts/66" + "$ref": "#/texts/62" } ], "content_layer": "body", @@ -494,7 +484,11 @@ "parent": { "$ref": "#/body" }, - "children": [], + "children": [ + { + "$ref": "#/texts/64" + } + ], "content_layer": "body", "name": "textbox", "label": "section" @@ -581,18 +575,6 @@ }, { "self_ref": "#/texts/5", - "parent": { - "$ref": "#/groups/0" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/6", "parent": { "$ref": "#/groups/1" }, @@ -612,7 +594,7 @@ "marker": "-" }, { - "self_ref": "#/texts/7", + "self_ref": "#/texts/6", "parent": { "$ref": "#/body" }, @@ -625,6 +607,18 @@ "enumerated": false, "marker": "-" }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/8", "parent": { @@ -639,18 +633,6 @@ }, { "self_ref": "#/texts/9", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/10", "parent": { "$ref": "#/groups/2" }, @@ -667,6 +649,18 @@ "strikethrough": false } }, + { + "self_ref": "#/texts/10", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/11", "parent": { @@ -705,18 +699,6 @@ }, { "self_ref": "#/texts/14", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/15", "parent": { "$ref": "#/groups/3" }, @@ -733,6 +715,18 @@ "strikethrough": false } }, + { + "self_ref": "#/texts/15", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/16", "parent": { @@ -747,18 +741,6 @@ }, { "self_ref": "#/texts/17", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/18", "parent": { "$ref": "#/groups/5" }, @@ -778,7 +760,7 @@ "marker": "-" }, { - "self_ref": "#/texts/19", + "self_ref": "#/texts/18", "parent": { "$ref": "#/groups/5" }, @@ -797,32 +779,20 @@ "enumerated": false, "marker": "-" }, + { + "self_ref": "#/texts/19", + "parent": { + "$ref": "#/groups/4" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/20", - "parent": { - "$ref": "#/groups/4" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/21", - "parent": { - "$ref": "#/groups/4" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/22", "parent": { "$ref": "#/groups/6" }, @@ -835,6 +805,30 @@ "enumerated": false, "marker": "-" }, + { + "self_ref": "#/texts/21", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/22", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/23", "parent": { @@ -873,30 +867,6 @@ }, { "self_ref": "#/texts/26", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/27", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/28", "parent": { "$ref": "#/groups/7" }, @@ -914,7 +884,7 @@ } }, { - "self_ref": "#/texts/29", + "self_ref": "#/texts/27", "parent": { "$ref": "#/groups/7" }, @@ -932,7 +902,7 @@ } }, { - "self_ref": "#/texts/30", + "self_ref": "#/texts/28", "parent": { "$ref": "#/groups/8" }, @@ -952,7 +922,7 @@ "marker": "-" }, { - "self_ref": "#/texts/31", + "self_ref": "#/texts/29", "parent": { "$ref": "#/groups/8" }, @@ -972,7 +942,7 @@ "marker": "-" }, { - "self_ref": "#/texts/32", + "self_ref": "#/texts/30", "parent": { "$ref": "#/groups/7" }, @@ -984,19 +954,7 @@ "text": "" }, { - "self_ref": "#/texts/33", - "parent": { - "$ref": "#/groups/7" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/34", + "self_ref": "#/texts/31", "parent": { "$ref": "#/groups/9" }, @@ -1009,6 +967,48 @@ "enumerated": false, "marker": "-" }, + { + "self_ref": "#/texts/32", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/33", + "parent": { + "$ref": "#/groups/10" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.", + "text": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/34", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/35", "parent": { @@ -1024,20 +1024,14 @@ { "self_ref": "#/texts/36", "parent": { - "$ref": "#/groups/10" + "$ref": "#/body" }, "children": [], "content_layer": "body", "label": "paragraph", "prov": [], - "orig": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.", - "text": "Department of Education:\nCollaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.", - "formatting": { - "bold": false, - "italic": false, - "underline": false, - "strikethrough": false - } + "orig": "", + "text": "" }, { "self_ref": "#/texts/37", @@ -1089,42 +1083,6 @@ }, { "self_ref": "#/texts/41", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/42", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/43", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/44", "parent": { "$ref": "#/groups/12" }, @@ -1142,7 +1100,7 @@ } }, { - "self_ref": "#/texts/45", + "self_ref": "#/texts/42", "parent": { "$ref": "#/groups/12" }, @@ -1160,7 +1118,7 @@ } }, { - "self_ref": "#/texts/46", + "self_ref": "#/texts/43", "parent": { "$ref": "#/groups/12" }, @@ -1178,7 +1136,7 @@ } }, { - "self_ref": "#/texts/47", + "self_ref": "#/texts/44", "parent": { "$ref": "#/groups/11" }, @@ -1189,22 +1147,64 @@ "orig": "", "text": "" }, + { + "self_ref": "#/texts/45", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/46", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/47", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/48", "parent": { - "$ref": "#/groups/11" + "$ref": "#/groups/13" }, "children": [], "content_layer": "body", "label": "paragraph", "prov": [], - "orig": "", - "text": "" + "orig": "Whether the epidemic has eased.", + "text": "Whether the epidemic has eased.", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false + } }, { "self_ref": "#/texts/49", "parent": { - "$ref": "#/body" + "$ref": "#/groups/13" }, "children": [], "content_layer": "body", @@ -1227,72 +1227,6 @@ }, { "self_ref": "#/texts/51", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/52", - "parent": { - "$ref": "#/groups/13" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "Whether the epidemic has eased.", - "text": "Whether the epidemic has eased.", - "formatting": { - "bold": true, - "italic": false, - "underline": false, - "strikethrough": false - } - }, - { - "self_ref": "#/texts/53", - "parent": { - "$ref": "#/groups/13" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/54", - "parent": { - "$ref": "#/groups/13" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/55", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/56", "parent": { "$ref": "#/groups/14" }, @@ -1310,7 +1244,7 @@ } }, { - "self_ref": "#/texts/57", + "self_ref": "#/texts/52", "parent": { "$ref": "#/groups/14" }, @@ -1327,6 +1261,78 @@ "strikethrough": false } }, + { + "self_ref": "#/texts/53", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/54", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/55", + "parent": { + "$ref": "#/groups/15" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Yes", + "text": "Yes", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/56", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/57", + "parent": { + "$ref": "#/groups/16" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "Yes", + "text": "Yes", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, { "self_ref": "#/texts/58", "parent": { @@ -1353,42 +1359,6 @@ }, { "self_ref": "#/texts/60", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/61", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/62", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/63", "parent": { "$ref": "#/groups/17" }, @@ -1406,7 +1376,7 @@ } }, { - "self_ref": "#/texts/64", + "self_ref": "#/texts/61", "parent": { "$ref": "#/groups/17" }, @@ -1418,19 +1388,7 @@ "text": "" }, { - "self_ref": "#/texts/65", - "parent": { - "$ref": "#/groups/17" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/66", + "self_ref": "#/texts/62", "parent": { "$ref": "#/groups/17" }, @@ -1447,6 +1405,60 @@ "strikethrough": false } }, + { + "self_ref": "#/texts/63", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/64", + "parent": { + "$ref": "#/groups/18" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "No", + "text": "No", + "formatting": { + "bold": false, + "italic": false, + "underline": false, + "strikethrough": false + } + }, + { + "self_ref": "#/texts/65", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, + { + "self_ref": "#/texts/66", + "parent": { + "$ref": "#/body" + }, + "children": [], + "content_layer": "body", + "label": "paragraph", + "prov": [], + "orig": "", + "text": "" + }, { "self_ref": "#/texts/67", "parent": { @@ -1458,42 +1470,6 @@ "prov": [], "orig": "", "text": "" - }, - { - "self_ref": "#/texts/68", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/69", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" - }, - { - "self_ref": "#/texts/70", - "parent": { - "$ref": "#/body" - }, - "children": [], - "content_layer": "body", - "label": "paragraph", - "prov": [], - "orig": "", - "text": "" } ], "pictures": [], diff --git a/tests/data/groundtruth/docling_v2/textbox.docx.md b/tests/data/groundtruth/docling_v2/textbox.docx.md index 9458bd0c..293c4d8c 100644 --- a/tests/data/groundtruth/docling_v2/textbox.docx.md +++ b/tests/data/groundtruth/docling_v2/textbox.docx.md @@ -40,6 +40,12 @@ The Health Bureau will handle **reporting and specimen collection** . No +Yes + +Yes + **Case closed.** -The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary. \ No newline at end of file +The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary. + +No \ No newline at end of file diff --git a/tests/data/webp/groundtruth/docling_v2/webp-test.doctags.txt b/tests/data/webp/groundtruth/docling_v2/webp-test.doctags.txt index 76fe886d..5682a134 100644 --- a/tests/data/webp/groundtruth/docling_v2/webp-test.doctags.txt +++ b/tests/data/webp/groundtruth/docling_v2/webp-test.doctags.txt @@ -1,2 +1,2 @@ -Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package +Docling bundles PDF document conversion to JSON and Markdown in an easy self contained package \ No newline at end of file diff --git a/tests/data/webp/groundtruth/docling_v2/webp-test.json b/tests/data/webp/groundtruth/docling_v2/webp-test.json index 94c9bda7..bf14a5c1 100644 --- a/tests/data/webp/groundtruth/docling_v2/webp-test.json +++ b/tests/data/webp/groundtruth/docling_v2/webp-test.json @@ -42,10 +42,10 @@ { "page_no": 1, "bbox": { - "l": 238.19302423176944, + "l": 234.08627147881114, "t": 2570.0959833241664, - "r": 1696.0985546594009, - "b": 2315.204273887442, + "r": 1696.0985042090742, + "b": 2319.1220927976665, "coord_origin": "BOTTOMLEFT" }, "charspan": [ diff --git a/tests/data/webp/groundtruth/docling_v2/webp-test.pages.json b/tests/data/webp/groundtruth/docling_v2/webp-test.pages.json index 67ad465a..732403c0 100644 --- a/tests/data/webp/groundtruth/docling_v2/webp-test.pages.json +++ b/tests/data/webp/groundtruth/docling_v2/webp-test.pages.json @@ -40,14 +40,14 @@ "a": 255 }, "rect": { - "r_x0": 238.19302423176944, - "r_y0": 415.36904822716525, - "r_x1": 1696.0985546594009, - "r_y1": 415.36904822716525, - "r_x2": 1696.0985546594009, - "r_y2": 345.20535775097477, - "r_x3": 238.19302423176944, - "r_y3": 345.20535775097477, + "r_x0": 234.08627147881114, + "r_y0": 419.5788697734327, + "r_x1": 1696.0985042090742, + "r_y1": 419.5788697734327, + "r_x2": 1696.0985042090742, + "r_y2": 349.4151792972422, + "r_x3": 234.08627147881114, + "r_y3": 349.4151792972422, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -65,14 +65,14 @@ "a": 255 }, "rect": { - "r_x0": 245.43122061153045, - "r_y0": 513.795726112558, - "r_x1": 514.3223724413002, - "r_y1": 513.795726112558, - "r_x2": 514.3223724413002, - "r_y2": 436.0574704074058, - "r_x3": 245.43122061153045, - "r_y3": 436.0574704074058, + "r_x0": 242.29979922858777, + "r_y0": 509.8779072023336, + "r_x1": 513.3470125989277, + "r_y1": 509.8779072023336, + "r_x2": 513.3470125989277, + "r_y2": 439.9752910477536, + "r_x3": 242.29979922858777, + "r_y3": 439.9752910477536, "coord_origin": "TOPLEFT" }, "text": "package", @@ -90,13 +90,13 @@ "id": 0, "label": "text", "bbox": { - "l": 238.19302423176944, + "l": 234.08627147881114, "t": 258.9040166758338, - "r": 1696.0985546594009, - "b": 513.795726112558, + "r": 1696.0985042090742, + "b": 509.8779072023336, "coord_origin": "TOPLEFT" }, - "confidence": 0.9721010327339172, + "confidence": 0.9721011519432068, "cells": [ { "index": 0, @@ -132,14 +132,14 @@ "a": 255 }, "rect": { - "r_x0": 238.19302423176944, - "r_y0": 415.36904822716525, - "r_x1": 1696.0985546594009, - "r_y1": 415.36904822716525, - "r_x2": 1696.0985546594009, - "r_y2": 345.20535775097477, - "r_x3": 238.19302423176944, - "r_y3": 345.20535775097477, + "r_x0": 234.08627147881114, + "r_y0": 419.5788697734327, + "r_x1": 1696.0985042090742, + "r_y1": 419.5788697734327, + "r_x2": 1696.0985042090742, + "r_y2": 349.4151792972422, + "r_x3": 234.08627147881114, + "r_y3": 349.4151792972422, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -157,14 +157,14 @@ "a": 255 }, "rect": { - "r_x0": 245.43122061153045, - "r_y0": 513.795726112558, - "r_x1": 514.3223724413002, - "r_y1": 513.795726112558, - "r_x2": 514.3223724413002, - "r_y2": 436.0574704074058, - "r_x3": 245.43122061153045, - "r_y3": 436.0574704074058, + "r_x0": 242.29979922858777, + "r_y0": 509.8779072023336, + "r_x1": 513.3470125989277, + "r_y1": 509.8779072023336, + "r_x2": 513.3470125989277, + "r_y2": 439.9752910477536, + "r_x3": 242.29979922858777, + "r_y3": 439.9752910477536, "coord_origin": "TOPLEFT" }, "text": "package", @@ -195,13 +195,13 @@ "id": 0, "label": "text", "bbox": { - "l": 238.19302423176944, + "l": 234.08627147881114, "t": 258.9040166758338, - "r": 1696.0985546594009, - "b": 513.795726112558, + "r": 1696.0985042090742, + "b": 509.8779072023336, "coord_origin": "TOPLEFT" }, - "confidence": 0.9721010327339172, + "confidence": 0.9721011519432068, "cells": [ { "index": 0, @@ -237,14 +237,14 @@ "a": 255 }, "rect": { - "r_x0": 238.19302423176944, - "r_y0": 415.36904822716525, - "r_x1": 1696.0985546594009, - "r_y1": 415.36904822716525, - "r_x2": 1696.0985546594009, - "r_y2": 345.20535775097477, - "r_x3": 238.19302423176944, - "r_y3": 345.20535775097477, + "r_x0": 234.08627147881114, + "r_y0": 419.5788697734327, + "r_x1": 1696.0985042090742, + "r_y1": 419.5788697734327, + "r_x2": 1696.0985042090742, + "r_y2": 349.4151792972422, + "r_x3": 234.08627147881114, + "r_y3": 349.4151792972422, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -262,14 +262,14 @@ "a": 255 }, "rect": { - "r_x0": 245.43122061153045, - "r_y0": 513.795726112558, - "r_x1": 514.3223724413002, - "r_y1": 513.795726112558, - "r_x2": 514.3223724413002, - "r_y2": 436.0574704074058, - "r_x3": 245.43122061153045, - "r_y3": 436.0574704074058, + "r_x0": 242.29979922858777, + "r_y0": 509.8779072023336, + "r_x1": 513.3470125989277, + "r_y1": 509.8779072023336, + "r_x2": 513.3470125989277, + "r_y2": 439.9752910477536, + "r_x3": 242.29979922858777, + "r_y3": 439.9752910477536, "coord_origin": "TOPLEFT" }, "text": "package", @@ -293,13 +293,13 @@ "id": 0, "label": "text", "bbox": { - "l": 238.19302423176944, + "l": 234.08627147881114, "t": 258.9040166758338, - "r": 1696.0985546594009, - "b": 513.795726112558, + "r": 1696.0985042090742, + "b": 509.8779072023336, "coord_origin": "TOPLEFT" }, - "confidence": 0.9721010327339172, + "confidence": 0.9721011519432068, "cells": [ { "index": 0, @@ -335,14 +335,14 @@ "a": 255 }, "rect": { - "r_x0": 238.19302423176944, - "r_y0": 415.36904822716525, - "r_x1": 1696.0985546594009, - "r_y1": 415.36904822716525, - "r_x2": 1696.0985546594009, - "r_y2": 345.20535775097477, - "r_x3": 238.19302423176944, - "r_y3": 345.20535775097477, + "r_x0": 234.08627147881114, + "r_y0": 419.5788697734327, + "r_x1": 1696.0985042090742, + "r_y1": 419.5788697734327, + "r_x2": 1696.0985042090742, + "r_y2": 349.4151792972422, + "r_x3": 234.08627147881114, + "r_y3": 349.4151792972422, "coord_origin": "TOPLEFT" }, "text": "JSON and Markdown in an easy self contained", @@ -360,14 +360,14 @@ "a": 255 }, "rect": { - "r_x0": 245.43122061153045, - "r_y0": 513.795726112558, - "r_x1": 514.3223724413002, - "r_y1": 513.795726112558, - "r_x2": 514.3223724413002, - "r_y2": 436.0574704074058, - "r_x3": 245.43122061153045, - "r_y3": 436.0574704074058, + "r_x0": 242.29979922858777, + "r_y0": 509.8779072023336, + "r_x1": 513.3470125989277, + "r_y1": 509.8779072023336, + "r_x2": 513.3470125989277, + "r_y2": 439.9752910477536, + "r_x3": 242.29979922858777, + "r_y3": 439.9752910477536, "coord_origin": "TOPLEFT" }, "text": "package", diff --git a/tests/data/xlsx/sample_sales_data.xlsm b/tests/data/xlsx/sample_sales_data.xlsm index 8aae36be0a3d18795665121feb9587a08dea65c6..0f3832a0b178f816e214f8fcde79c572af372bdb 100644 GIT binary patch literal 9945 zcmeHt1y@|j)^-!zAvEsp?i$=71Pku&(!o7QaCZp~2_Bpv!QDx4*Wd*Ax09LsWrmsW z7ue=?}qa+6j1q8qV-~a#sDIgYBzrz~>0Jwny05AY>&$Pwt?Lek> zAbnML2UBNVCO2Cfl3b`~G}(Y>;OGB${TJ^*X@Y`cCktB8rSt;n4=hSMO_IPg7YI}M zL>gF!151prRgBTv6Z7mYzEH&+*)AB=RomX9>_n)Nwt3Z@Zlz_knr%B(ND(z3xx(XG zUcNu|Ofp21gttXbuz~E8?A$A92dx9x!?-j)(Lr5);T=H0^ACDsU}0wGP5#XL2HI7W zsr%WLZG~%Z0VjlfpV*W^XFJAFw`_>b!o*P-1}dKBMT0JmCMCwmcdamC#p}6Z;sh@# z8XC5;_Kz-ZVq7>cWZ$v3LAp3rB;e`O_fd6(V2{TjFIhbp-aDBceBxnzuFI7oMlXV` z0TGysJ~3Z~&W_AMZB^~+7LY&|Ta@z%yK#B8?IKh`Nkjh{nX*NATcnuc+YZ?wMT_ed zoT}!^clXvHib;sB%5gvJr33Yva2tV$rUYI&%o42`?aHZ|?R#i8adri^*-W?8;ZE!GL>wgaj!4&1!4a zSjf-7GxA`gLn8g_oTyNWn+acOiY3Ul(K0MCDvX zq(76Y`uNE#pw>p_QQ$AOQ4^r4;=P5G@@?_C8<<}dh}`WXy;$QY4Sx>gC$DoY3r@aq zd;?2I<&-4pSi0JU?(+5g>qV-RtUHZMOEg_cLvfB=|1!DcAurRh2N21yyKPMpGd{*b2c)cE%)lPAw9pw zQ&X|vwX8JCa^fR%*Eg~Gaw?wQhH>x3sF=~GOvds2ifvS?mood@SF48OWH8gciyOYP zr0=*tAe^{(2^_%wX(aDIzBKv(HV`%#000%dGj2A_uJ%sWM)vmBKf_wS>b&AQ3!1mS zLsWjQv+)bIMx1*A(RohD0P_EHHadUB|=6dsDnSS%L_`387ttV%1o_2 zao!(?=>y~q7j}W|IKl;bLqM9E8jL(PrSMP?&~Dz(NVU?WNp)_l5vqQ#INA!*9!VeA zF=lE1Y;s&J$UsuN#eo{2;F2GG7cKf+0+zHm+W43?pNQ3q(F^6WRQ^sz{e+0#2Rgii zIZuM5Op+cR>L}jgUa%g8D2q98-JAA<|rj9DY6uxkG8H44hIk(8vQDBz{ zU?qW1*2>cQhh+0vgYtt|Yr^#o`n#EtlEoYEvoE~yHP177-x>%KJ_d3uPH<8n=R-uK zr)Axgf`~EMWzMcZi+fR2_v0ixC?1(BR#2VU^Y~EflS`H}_zG*)gM;P{JzemVHBQ&z zL;|2N&zL*%NoKA3BpP|T*p>-51pTZpqesEtF7^p8GLQ6*BfZt4mUC}JiskL`qU#Qt zhy#4ndyYxil8YLMMQ*J}eH^NW{B5^O_fT{A1WJI!mAFSU@+vGA7vIdE9-cDZ}_8iIs!{&$|+J5+S z7!uCtpo}O$3L4}2{_bAM0U}D8Ge)COTpvRgkZ0?Ph7va z1<4_6U3~2)^^C7pdBu%tw)KL>0#UENGzhnDK3I^njo`x`2w^ZRzL%0P5E~(X=`q1Dy z=Rywa8o}&1=5wk$N56L&UNG?Ic8pbLcsUth$>ixlmT*B7Y^hCaVA!QDcJvYrG0gMq zh>y2{T_N1lA#Hxx1qVJ4R(wiX#y3Jqwua{Q-jaqM{m^ZULIVkGxNFqt6$8dPx2zU| zb%v#PfQicwuk;5DPa_>2>H6#Cpnj^M4#i;P=G-5nngc}4rFRJUzR%Tl3|3udo#@lU z9u$@<&|9m=@~5{1E}}=wBfAJuG|>@lCGpzUeMx zWxj76W}<9Nb6*XA=u5vHjze$HeTIXKNzJAQnBJ&sPDif}mX`OU$!^UAy{osU=BM|{ z*H-M_gfFOWzBiU}`FeW_2tHn8k4}ESb9Dc(prRYu8LFaBo175p(-vhnqt|QbIZ(xo zduqzir=i=E8^I}OBL_k>vK7u>fUbfTFYhECy0U$J>G%9ftil-9PxH;;LpQXpadzH9 z5X!>{{&Xzpa#%W@I8A)paM?J?Ix!mMybU$Q3A>eWpJ+PPtMYS>f|oF1^~Z@fFnj~L zI~T&eZzu@&?RK6oLX2>g&@GM#OEp?;I}*ykdg8l;qO{VQsTIuQ@6fzH5GW`A?nl0l zLCDfu`Vj;RNx1pZ`=@Y*ApQ19GGygQ8(cMlJF!@7O_zN6-2%-Z)C^w& z29j|hn$=j}p-)j3j%dt`P$49+h~j5xvIgP=Z>!~#H>)5$!$TcDEkzbYa-i4-s&ZzTvy$v`6dat2K*;S@Y(9%uV@?X5;NbvjWir9EVqxjN zunI46v!pMvWAH9;+@Y^YBR%P78N7L=UD_>Y-;PZR8;3m+nFg5o6;R7DKm_aOsYS*o>jeyqFM zO=ML4HsuXO%ON@cRO(YVff{P4VOH->lk*)kN!IvYAF9GHH{UQ{l3j$F?L_vZR^+R% zP>mLh@Z8-7xbV77IXh^)IbG+}%e{ANReGvcU|T=&9iU&^OiZf0WN)!sq2 zhiPJ=>&@-ad*Qz9Msem=uOIW()Xc3nXO~Aro-5kVPzR2M6hg)8`_SjMX?M(!)@2o8 z*1Y&{;0s4kG61%y-^*nQR?6VbgmlsydOp)pnRL_)koDd<8KgV*7n9A{BR}p)|T-ETQ?$p*$(K#0{C^iNy z#Xax{V=Lu^kBC0wM*6~WU;w5qRBa98hXGj!TRA zUTe5L6LkpbiIP4AjNA9*7SAc(D_CAM84b~>uDf=-dGOh`k8j9@W@1pf;U4O1xYeKV z%#)+-g~N`1Zn`Z$gn7Jf`pl3#s%5(kjg_|Hu5Z({0kVu2$d4c_h9~DP_#RJ5fb|L$ zO99fpuwiyAf%>?X<`9JmYi3_0pd`9j7Ug+{ShOxDxjqBpM3p#iGRu6bDZ5u3M$Isf zOwKS@yrJ$WQXP?Wsf)Ww^7KIUYD7dT(3ss<5x30pnYn45%jQ9fTA8Zc@YY9l&fHYx z?P4w!T(5>=Jcf}{cwM-FCFs%xKXlzelxlc0iV2r`=hG?1wDX`QX(}GFpy-;VaG{TQ z)C$I?c#=fK-+7OK#EA`@oU`>SE?CrwvtN@c(>;!Cn>h&Ww)H05J;?|))fE+!a`M#2 zN-MrW0@s=`kqk?NDlAd-lABIC8G=9OD8b`j6j4>fPd9R>KU9`>C9N|~8 z*Ba=KG2&7dUwVv)-`>6{4iG|5bl^9n3-IwP;jpzm9bg=hm`G%ehKPNHt0sppd@ zO0`*~iZ*9Up{6fGM+Kb@*d_3{(NS$OOD8(vJdUc4Z^P?gmrBe*k}hl@M1WfBo-wI* z3a6=Y>zU<#>BMG@_8|Ibk%;%K%wtC+qt;7?pN(MLk_LE8O z5zy%C+WsdBO_4rvRT-UB?JC%fn9A02e`N1ESQ>vN?yvbYkT9-n4-4C5Z(*wJoE|=8 zMk{Qc*1S=@6i1ipM)f~`i_;YG(hO+zdb z-y*Jl-vsw}(5U6Urt{V(XQz|%#qH||U5%NlH;t3$DR^-gCSki57JPj;y(#*&11|(l z+PxjlxO7gUQ{yhI9wBs2+7k?YDf~9-M>w^)AHnt5KT~flr4XJ2@C*PJ06_jDEIEVR zZA_hiF%W7B_VX-AEvPfTXdqY9&yGn@b>Rgc{GaKvW)oJ5F^4fyi=G=w%Wm7u_!67) zhlZoYC$R-zExwt(G^M7~%2v3M-Z)NPcZ zwQspiu52FBrQURi)K7`4Q+qoNnzU|#P39ogE+rsd3txuNW;l4Jheb>Chypsa(?OPE z6X6mLQS$-$UH+EiG*m$n@*A&VXxCd5L~>20a&B4tCM@CFNU9ofTMkZEt6@@Ub8mF1 z9}^ZO)~+xO;?tbWvvN(&Y}7RZq!2&mV{!unPpTKRPbKr=S@%OduN|^JW-{bg7f`YNX;T&ke$ zLkIDB&Rv;6?eaJVra`EH(0uMN4y+rLdM)&*-R(s?R>_<5TdUk=cPp4H9K@rIjd!>9&hy6LeB_CFF10!^f_YI zt4M^f%3;XFu+c)8w3b(Vm!X+B>*!myh5+;n6H!EJlbMEj{7wd|x#!wD57LvO>->dWF@XwB7(AhiDqhv64dfv3{v%U$VL8~e-`9_&^f=$@?T_5V01e|2Me0`QNSbGh24Y)T5Ms@S#!C?6)*RKOtoWB9QU1LcD~5KW$?1{z#T(zld}Hi&m!Sp zjvj+lpr;$WaC$H!@#oQl42{4pk@?r~Clm3Dn+-mCy++8eTrV?+!1zifP&hGW22*Yp zmSrTr_L%fXPK{{uZPdDw(qy=m_bLzj1@~h_@9AD82xnPgXBEVj@}1@F z%^hmW5q4d6OAb{KYLp%`lh2T#XRPpXjSJ?Y+j5fT))frU*?2NQ@}+U95s~i!pK}?s z`e$Un*KFw<*QSLjV;tzR-J||AX8L0MYm&7?;n<`>%dQNvK<1oDKxBnzyGH#1JuDAM!A+(82d1 zs*XeVN)QlpXy;qEak9}Iz;TP*hK;a@PNVccBrbwd^kP%I-@OwYyFDW+`v7?<)y}s= zk+jc1!=(|oAa;}zqS7@(zKvca)%C*PGWy;%jE_H^k3Uza9SgLRZ?-JEd-JT~POqcI zPX4|6G<4<(_7qt$y;mvzX&vy}H3%288=KG;)yBYs&7R&~0`V%K3V*#Ev|})A>>|^* z_V2vqo;7XoEAX+KgQFojn71^sH&$}8cW`Dlws$i9v(5ZJx)N+LfpM?oI$3~0%g~Qv z{hsNC#*;yHV$`0>Xc;uEK^u0+b(Yd>pC3FYT}lWMMD^mF4|{o&^Gl4?VDt@2OVxs< zfq4L6MUd((Qi~n;DkDOvv25ih^_Uzi2CdEM*R<0K(=b^IW$irhT*nlqw9~kB8IwTc zLB!&e$YwEj$Jgm$yuO^CrMQnKo?m1Lm>r;_;J0uN{c?yxcLIT4;=6BYJ^fa2WQ&D) z!)3&eqo@5p?7xR>VRKZUsGW1=>(2ZHP24r^UE{>TrGUX%EfhTZ^7wXgT~41f;p03e zY(i0hik)BM+=j02BGDl5CDCW-q@gD2w!KU!$K|cP4(UBBT_{&r7TSk{%zG0#EKruxr@NhJIzz6kAtAfKb^SFmijyQXJZ4KLcG4^e z_xHCytT+Ch3|&EKVGRZ25D^jpK>I5W4ILc*2SYIa{=72cM1CfxK}V1m#7K+yyo*Xm z{-s71+-H?W5OH&B-vUFRan-e(C3_UjN|m)0Y1cvxULxde@ z%XnT?)*gE2J3N~KclPkd%9w3_4uBF>)LAhT2m+Hyqf#-cj02T{(J3-ET*iP8yh+Xn z%@^+voPg4fOaw?&Xt){Vc7m|^?}zc&9^bn0QWN1)Ey0_Ui;OX{G>Z|~GltxoK6dt+ zd+gN7aIIQbVvu?}n#e|eSQso>Lw z>Tl6`F!T1Lz4}!6?|rM^q5wb@{4e4EPZ#T{o~O->-jA9oc4hC;&++%RP<@?`&-l%?+?+ZnebB$PxHs$8s32E&%d{K{xzF? zs_$t~Ndy4=BiKI`|GQWItN01YU&R0P(ModAV88(Y QNZ=0!7_*f>lW4&I0ZH2!6aWAK literal 6103 zcmZ`-1yqz#w;e!0x@$;D>6Y&9lp0E4Bm@Q+IwVB8J4IT$OFD*>PC-Ix1xZ1Wc%!cO ztGxfsT6gaHzBT*Yd-t5!_h_mh-6I450QUjmA$dlqQfd}~@TVI1z=aPhh^3|r#0kpz z#L0=n)4^UXMhpXy8&~#r*txw)s$1GZazeYh7RQI2GtNq;R{D8|P-K5x&BGsnQ(hDH69X}Q|uBc>QHbeCWtuTE{?FKtWKF0-{ zs=`x!>vN40n5nn$0W#LDi^EFE4F3V+w-oBRLlIk3&40Fg`Fx@2D}0-*hyVcI-`ll> zxPX7|cQipurI!nLpfh5lz5S6Tl1wTIj0w3r@Dr%{L!0aFa03PE7z9mk*^)s<~t|M{v{{fIExM!1pvrm0ssi$ zL8jcO(UCKSAwnrszGk-5+Ce3fnt^UMF8UEOtV|l8!yi-*Ya_wKj3Ctm*U6 z<4JLm6;~a0p%4%^k}6o4 z`Nlx*zY~q49i-JZTKJ8KEOX9JT{iUC1Geo4r0mCo{~iSF}Yi3n#KjJf-s_3h_G4KKR# zIRMwzk30n%N2Vg3DDJIkwZ7`fKDF;GkV31^x1F$BI9sfYFDi}J(U?B>Tm6!0lZ74) z(wKhcu{<3U-d{1{iPdqW5!zcB-d{H1`o)%!>ALb+!7j0lwEX%PZ+xPLP048*fPs_y z+`B2H6jSC)w;8vP3Bf@JEv{P^3aq=^61$gr2J@65OyBF5YBKOCBb1(qAga$J*A>6}vKy z?(hP;<*UfuLT#o*y>E~&viQ*{;>jryt;y@;9|YUB9#U1&$7EFW;IT@JTaY~hx-<|6 zHaeFCn;YTxPkO3P)#F_mmh&fyn-A`d#=DKp#be_omC7Hw493NPBjV8`B`^QM}k)59YK9R zD!CWu`8Xb0>br~%4(L`TGA{__bnRW+h`&18zkE+0T$N$SPDdQ45?Cii-+UE0e8|#= zwqpfiKpWAW1`9;szmg_*+|~J}oIJl-)$*`e6tuBhmFp_bHhv*$wp~Ap!7J0`qVl;h zsiT2WQ2Dgxos&A%2&Ra&XaZ%U$7@RdA7>`+8+3(SI;k?vKcHO}@#zdH#rvaLvk`{Wa=~DlNog?Z@WapnfFQxIACgD7hV{?Nnp4^~r?w zr6O>HV->RpnFFo1P>*dIfpyvN3b>~GG3EGU&~bH#omf4Q`uL4u#b6F5o;ooGzM^Fk zNs)5i37JmEHmg$KW5G^2-bVaX(zf-j0cUlcW_Hl@hUqe8XG9g z5dV{!$^PtiW>%#)t@~xFN4NR$wbPIS&Zfw=*{O+4<@2-U^-IS@5k=K3FBl$zs~jR- zG6f1T3D1V+*Mv9}0))||A=yL3AgdyZfV4}D_G8QzW}>kQaUp*>wihAlrc(^~gb~%4 z$PY+d)(;L6?L4C9FWCYef^~VUW=V!+!n?(IU1Sz?8BQt}88KciebOCm1$Ho+tMqz1 zX0tQhE}71V)su+HpX8^nQPqV)GrgO8ALVbTql5KWeAK8iUZvko7Kpc%k0stUzB|!SSfOtS$o zhh^U<`toAD+Og~zqa0&@ZhfU|I4@S0RET@9wFMT#2z&5J$c&j+w)B?5>)pwRr~;uI zQ&gE}^H8ZP*tW+jPVL?QLSktRr#MiluV^5Y*(a{!$uQ{u|#=Whgn_B zmRLBa^4-gHP}7L=2|_z_NshHQEx(y!jd=MCpKeCCD7yYU$9M;aH%d%>tlz#r)s!~) zGtzG2_sh0pI!+w(49%)iQ(#$Yzo_L7j(r*H9vUd~t4=nC%sC&wYt}iceNWNHNmIPK zt>~SV{yOq=l`ieAo^TRB9yUpKqTHf22$nl4TAJaIx8>liam39oAA4dYjLq@Q{Hf zTtN?8FHSH$)x}sR=HSf;w=!DCtmj@EEqtBpgNuT#tPfi01W8atK_)%y+PwD#Gvb9we z@B)SN9K`d?2=E=EWJQvKQ&?bGgm^pjzy|w#N5R9qN7{LmdOc-C)7rAHQ*1iHsV*{G zat`lgMP`EaS+|P9Bn7lUl1@t=GN0ufPGv=|g6CNn--Jm@X)Q?$bSg`J<_zrDu9wmA zY_ND0asQOC0esItqdB+p9`Nxbw*JT2h#?kQO}!an{Q3a$v-UU96*i%jhjuVuBD@_| zJC$Aea++@`fM;1S;jEW)tTNfitUON(z$8vG2W2pzbq%(4?(DFpwG=BXs&TKvQLse{ zt3oDE2;o+7^AT&A>6pMilqa6VphdJ?or$Q(yhcqHah}6tC=scwE^?yxFlRKk65|)O}X9%d}m( zCSOCdvFXu`%c2|V-YbcJ19%`3jrRf`xO)fyz)y|;Qvp7)um?kbPrriKA?}O%3rSpg zvzL2F&4lCxFepmzYO)tf#GBVXHv0u=}-jV|Y_7`nQ?d zz4Tuu+=5Y=(zDpPR`9|&zrkQ%5}M7 zY4iFiJ%$NlhgAFMd_q`fZauqA8x(shnD~Gsz*~-lTfo0mMSjJQh=fzL4-Rn1^3Gy1DDp28NW0upT#cYz1ydwe;G)~yy|n< zJuTo)uYw6S)z-@N$gSs!EWohf=JjEDtV(ePj(DHA#|=`oqOu~>aP#Z;<$FD?po2Bu zcN^xMLEJlUqc<+~yz8N>2d3R?dqbnH(e|?ODc#olrPQFo>ji*2uye(Y^WU?azL3K5 zhTn)%(E$Lg-)DikdcptppOSxB&jgamMR?6hd_bwRp!p^s^nqMPy;|~OXuEI?TFO;- zR7t}1<%NZE!d%l-ky!MbPnT3YRar&%j zi}ytN*9$BbpV`r~l7^Y8=1kb5q&oqwpEbPpV`#l#HJpmHUKfw;0E%V%$Mheej)0sq z#){p;{qw~SCOc;|wj(MR-KXneD&`lZ8zS~KZHjT%4+{vq#odCqD_2j)MnvA8LzbgE z{9hfV#7v2?ng9H0Q2nU^nO@wPUHH=QY7kulfvmsrQ{F()-B8Kl>tl z;4Xci9*U67>sLXl&ATrg;M{0d6Fdr<%Hb~dQ zi0}0^f?Zuj0sGi}u>*nG5zt!9qKM6q_HIFKLQ=$5($z^(g|~^loZ{4b>_fGAVRQj7 z-a#JBz$QF}YD^kb%%R#{k?)^Y_*Fdd4sf~n-3e(N9IR)v!ctwQbm~zQoppEXlZZF5wju5hQpNyb3t8-{C9+e>42soBrncmuEG^i($Xu!hBhUzX{;iERjK0H8Ei2 zz(qcx!^ekJqBn8AM2Hi1eNy|1-Tz=nJ;#R7!|`?F3BCL?lwqrpy#qi2s6K*YY@JKP zj->pPVF*gwRJ$E+!o#;x3%pjZzNY78iQdS#x-HxK3SQR=9^!Ddne3FvSuraU0XI~8 zY#JYRYkwM1P04%1Ll(i3;$%egX^6(bR?n|mL%n|6i$vgDu5xr8t#TF9$PA3O@6vkMbfxq4;N;?S>Z5I!xND`E-enr`RHC# zntG1t4F3o$R)|C~a)Da8Wspkw%-Olhm7Iw$1)}*JXuX<{UCT2Czl^xNrw7$3%htC( z!jrs-=S`^&C@%KoBC))U&M8S}3U>sC4s75}YCpRoYw#!xGuO1@w0vTbEdA2lwJ_|& z<=``Q(v{-FX6T!4J=84cz&H{#<`a>%h4YS3Qa7ykMsnUdDP>j=`l)UJ@iack*iNRr zzUB`*>z2YUT&dXw#5jJ~L*YzD0u9psZy%rD(C{K}47;5oysx=`JCMz$wCrX6AgrLc z%9FA7boBhihC9}`WP7TE;@IUi)Zqso#Mcj~a=zWx8ajMukNUV`O)6WJ*4zK%>xzab ztcPV^-B;*JAqCzeLqH@%`u{tgaR2%B34n9`fBTKg;-i6<7 z(fxtHgD?MI8g_RL+`Xdx$3P&~{a*(D5;1=+Y zUGUu`^atF4{|o&8q@=rc?#B5aI~njM?th7TO%-H#llNy4#Q;pgYs}$K-v;~-j2U`O diff --git a/tests/test_backend_msexcel.py b/tests/test_backend_msexcel.py index 2e2fdeeb..6dd73425 100644 --- a/tests/test_backend_msexcel.py +++ b/tests/test_backend_msexcel.py @@ -79,30 +79,21 @@ def test_pages(documents) -> None: documents: The paths and converted documents. """ # number of pages from the backend method - # Logic to handle multiple files - file_stems = ["sample_sales_data","test-01"] - for stem in file_stems: - path = next(item for item in get_excel_paths() if item.stem == stem) - in_doc = InputDocument( - path_or_stream=path, - format=InputFormat.XLSX, - filename=path.stem, - backend=MsExcelDocumentBackend, - ) - backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=path) - # Update the expected page count based on actual content - expected_page_count = 3 # Adjust this value based on the actual number of worksheets this needs to be adjusted for each xlsm and xlsx files independently - assert backend.page_count() == expected_page_count + path = next(item for item in get_excel_paths() if item.stem == "test-01") + in_doc = InputDocument( + path_or_stream=path, + format=InputFormat.XLSX, + filename=path.stem, + backend=MsExcelDocumentBackend, + ) + backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=path) + assert backend.page_count() == 3 - # number of pages from the converted document - doc = next(item for path, item in documents if path.stem == stem) - assert len(doc.pages) == 3 + # number of pages from the converted document + doc = next(item for path, item in documents if path.stem == "test-01") + assert len(doc.pages) == 3 - # page sizes as number of cells - - # for xlsm file just adjust this wrt the xlsm files for test xlsm enable this: - #assert doc.pages.get(1).size.as_tuple() == (4.0, 21.0) - # for xlsx file: - assert doc.pages.get(1).size.as_tuple() == (3.0, 7.0) - assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0) - assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0) + # page sizes as number of cells + assert doc.pages.get(1).size.as_tuple() == (3.0, 7.0) + assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0) + assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0)