mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-26 20:14:47 +00:00
run tests
This commit is contained in:
parent
c4c59204d6
commit
abcbde71b6
8
tests/data/groundtruth/docling_v2/example_8.html.itxt
Normal file
8
tests/data/groundtruth/docling_v2/example_8.html.itxt
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
item-0 at level 0: unspecified: group _root_
|
||||||
|
item-1 at level 1: section: group header-1
|
||||||
|
item-2 at level 2: section_header: Pivot table with with 1 row header
|
||||||
|
item-3 at level 3: table with [6x4]
|
||||||
|
item-4 at level 2: section_header: Pivot table with 2 row headers
|
||||||
|
item-5 at level 3: table with [6x5]
|
||||||
|
item-6 at level 2: section_header: Equivalent pivot table
|
||||||
|
item-7 at level 3: table with [6x5]
|
2008
tests/data/groundtruth/docling_v2/example_8.html.json
Normal file
2008
tests/data/groundtruth/docling_v2/example_8.html.json
Normal file
File diff suppressed because it is too large
Load Diff
29
tests/data/groundtruth/docling_v2/example_8.html.md
Normal file
29
tests/data/groundtruth/docling_v2/example_8.html.md
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
## Pivot table with with 1 row header
|
||||||
|
|
||||||
|
| Year | Month | Revenue | Cost |
|
||||||
|
|--------|----------|-----------|--------|
|
||||||
|
| 2025 | January | $134 | $162 |
|
||||||
|
| 2025 | February | $150 | $155 |
|
||||||
|
| 2025 | March | $160 | $143 |
|
||||||
|
| 2025 | April | $210 | $150 |
|
||||||
|
| 2025 | May | $280 | $120 |
|
||||||
|
|
||||||
|
## Pivot table with 2 row headers
|
||||||
|
|
||||||
|
| Year | Quarter | Month | Revenue | Cost |
|
||||||
|
|--------|-----------|----------|-----------|--------|
|
||||||
|
| 2025 | Q1 | January | $134 | $162 |
|
||||||
|
| 2025 | Q1 | February | $150 | $155 |
|
||||||
|
| 2025 | Q1 | March | $160 | $143 |
|
||||||
|
| 2025 | Q2 | April | $210 | $150 |
|
||||||
|
| 2025 | Q2 | May | $280 | $120 |
|
||||||
|
|
||||||
|
## Equivalent pivot table
|
||||||
|
|
||||||
|
| Year | Quarter | Month | Revenue | Cost |
|
||||||
|
|--------|-----------|----------|-----------|--------|
|
||||||
|
| 2025 | Q1 | January | $134 | $162 |
|
||||||
|
| 2025 | Q1 | February | $150 | $155 |
|
||||||
|
| 2025 | Q1 | March | $160 | $143 |
|
||||||
|
| 2025 | Q2 | April | $210 | $150 |
|
||||||
|
| 2025 | Q2 | May | $280 | $120 |
|
@ -0,0 +1,3 @@
|
|||||||
|
item-0 at level 0: unspecified: group _root_
|
||||||
|
item-1 at level 1: section: group sheet: SalesData
|
||||||
|
item-2 at level 2: table with [21x4]
|
2153
tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json
Normal file
2153
tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.json
Normal file
File diff suppressed because it is too large
Load Diff
22
tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.md
Normal file
22
tests/data/groundtruth/docling_v2/sample_sales_data.xlsm.md
Normal file
@ -0,0 +1,22 @@
|
|||||||
|
| Product | Date | Quantity | Revenue |
|
||||||
|
|-----------|---------------------|------------|-----------|
|
||||||
|
| Widget A | 2024-01-01 00:00:00 | 5 | 5000 |
|
||||||
|
| Widget B | 2024-01-02 00:00:00 | 10 | 12000 |
|
||||||
|
| Widget C | 2024-01-03 00:00:00 | 3 | 3000 |
|
||||||
|
| Widget D | 2024-01-04 00:00:00 | 8 | 8000 |
|
||||||
|
| Widget A | 2024-01-05 00:00:00 | 7 | 7000 |
|
||||||
|
| Widget B | 2024-01-06 00:00:00 | 6 | 6000 |
|
||||||
|
| Widget C | 2024-01-07 00:00:00 | 12 | 15000 |
|
||||||
|
| Widget D | 2024-01-08 00:00:00 | 9 | 9000 |
|
||||||
|
| Widget A | 2024-01-09 00:00:00 | 4 | 4000 |
|
||||||
|
| Widget B | 2024-01-10 00:00:00 | 11 | 11000 |
|
||||||
|
| Widget C | 2024-01-11 00:00:00 | 5 | 5000 |
|
||||||
|
| Widget D | 2024-01-12 00:00:00 | 8 | 8500 |
|
||||||
|
| Widget A | 2024-01-13 00:00:00 | 6 | 6200 |
|
||||||
|
| Widget B | 2024-01-14 00:00:00 | 7 | 7100 |
|
||||||
|
| Widget C | 2024-01-15 00:00:00 | 10 | 10500 |
|
||||||
|
| Widget D | 2024-01-16 00:00:00 | 3 | 3200 |
|
||||||
|
| Widget A | 2024-01-17 00:00:00 | 9 | 9400 |
|
||||||
|
| Widget B | 2024-01-18 00:00:00 | 12 | 12500 |
|
||||||
|
| Widget C | 2024-01-19 00:00:00 | 6 | 6100 |
|
||||||
|
| Widget D | 2024-01-20 00:00:00 | 8 | 8900 |
|
94
tests/data/groundtruth/docling_v2/textbox.docx.itxt
Normal file
94
tests/data/groundtruth/docling_v2/textbox.docx.itxt
Normal file
@ -0,0 +1,94 @@
|
|||||||
|
item-0 at level 0: unspecified: group _root_
|
||||||
|
item-1 at level 1: paragraph: Chiayi County Shuishang Township ... mentary School Affiliated Kindergarten
|
||||||
|
item-2 at level 1: paragraph: Infectious Disease Reporting Pro ... r the 113th Academic Year Kindergarten
|
||||||
|
item-3 at level 1: paragraph:
|
||||||
|
item-4 at level 1: section: group textbox
|
||||||
|
item-5 at level 2: paragraph: Student falls ill
|
||||||
|
item-6 at level 2: paragraph:
|
||||||
|
item-7 at level 2: paragraph:
|
||||||
|
item-8 at level 2: list: group list
|
||||||
|
item-9 at level 3: list_item: Suggested Reportable Symptoms:
|
||||||
|
* ... sh
|
||||||
|
* Blisters
|
||||||
|
* Headache
|
||||||
|
* Sore throat
|
||||||
|
item-10 at level 1: list_item:
|
||||||
|
item-11 at level 1: paragraph:
|
||||||
|
item-12 at level 1: paragraph:
|
||||||
|
item-13 at level 1: section: group textbox
|
||||||
|
item-14 at level 2: paragraph: If a caregiver suspects that wit ... the same suggested reportable symptoms
|
||||||
|
item-15 at level 1: paragraph:
|
||||||
|
item-16 at level 1: paragraph:
|
||||||
|
item-17 at level 1: paragraph:
|
||||||
|
item-18 at level 1: paragraph:
|
||||||
|
item-19 at level 1: section: group textbox
|
||||||
|
item-20 at level 2: paragraph: Yes
|
||||||
|
item-21 at level 1: paragraph:
|
||||||
|
item-22 at level 1: paragraph:
|
||||||
|
item-23 at level 1: section: group textbox
|
||||||
|
item-24 at level 2: paragraph: A report must be submitted wi ... saster Prevention Information Network.
|
||||||
|
item-25 at level 2: paragraph: A report must also be submitt ... d Infectious Disease Reporting System.
|
||||||
|
item-26 at level 2: paragraph:
|
||||||
|
item-27 at level 2: paragraph:
|
||||||
|
item-28 at level 1: paragraph:
|
||||||
|
item-29 at level 1: paragraph:
|
||||||
|
item-30 at level 1: paragraph:
|
||||||
|
item-31 at level 1: paragraph:
|
||||||
|
item-32 at level 1: paragraph:
|
||||||
|
item-33 at level 1: paragraph:
|
||||||
|
item-34 at level 1: section: group textbox
|
||||||
|
item-35 at level 2: paragraph: Health Bureau:
|
||||||
|
item-36 at level 2: paragraph: Upon receiving a report from the ... rt to the Centers for Disease Control.
|
||||||
|
item-37 at level 2: list: group list
|
||||||
|
item-38 at level 3: list_item: If necessary, provide health edu ... vidual to undergo specimen collection.
|
||||||
|
item-39 at level 3: list_item: Implement appropriate epidemic p ... the Communicable Disease Control Act.
|
||||||
|
item-40 at level 2: paragraph:
|
||||||
|
item-41 at level 2: paragraph:
|
||||||
|
item-42 at level 1: list: group list
|
||||||
|
item-43 at level 2: list_item:
|
||||||
|
item-44 at level 1: paragraph:
|
||||||
|
item-45 at level 1: section: group textbox
|
||||||
|
item-46 at level 2: paragraph: Department of Education:
|
||||||
|
Collabo ... vention measures at all school levels.
|
||||||
|
item-47 at level 1: paragraph:
|
||||||
|
item-48 at level 1: paragraph:
|
||||||
|
item-49 at level 1: paragraph:
|
||||||
|
item-50 at level 1: paragraph:
|
||||||
|
item-51 at level 1: paragraph:
|
||||||
|
item-52 at level 1: paragraph:
|
||||||
|
item-53 at level 1: paragraph:
|
||||||
|
item-54 at level 1: section: group textbox
|
||||||
|
item-55 at level 2: inline: group group
|
||||||
|
item-56 at level 3: paragraph: The Health Bureau will handle
|
||||||
|
item-57 at level 3: paragraph: reporting and specimen collection
|
||||||
|
item-58 at level 3: paragraph: .
|
||||||
|
item-59 at level 2: paragraph:
|
||||||
|
item-60 at level 2: paragraph:
|
||||||
|
item-61 at level 1: paragraph:
|
||||||
|
item-62 at level 1: paragraph:
|
||||||
|
item-63 at level 1: paragraph:
|
||||||
|
item-64 at level 1: section: group textbox
|
||||||
|
item-65 at level 2: paragraph: Whether the epidemic has eased.
|
||||||
|
item-66 at level 2: paragraph:
|
||||||
|
item-67 at level 2: paragraph:
|
||||||
|
item-68 at level 1: paragraph:
|
||||||
|
item-69 at level 1: section: group textbox
|
||||||
|
item-70 at level 2: paragraph: Whether the test results are pos ... legally designated infectious disease.
|
||||||
|
item-71 at level 2: paragraph: No
|
||||||
|
item-72 at level 1: paragraph:
|
||||||
|
item-73 at level 1: paragraph:
|
||||||
|
item-74 at level 1: section: group textbox
|
||||||
|
item-75 at level 1: paragraph:
|
||||||
|
item-76 at level 1: section: group textbox
|
||||||
|
item-77 at level 1: paragraph:
|
||||||
|
item-78 at level 1: paragraph:
|
||||||
|
item-79 at level 1: section: group textbox
|
||||||
|
item-80 at level 2: paragraph: Case closed.
|
||||||
|
item-81 at level 2: paragraph:
|
||||||
|
item-82 at level 2: paragraph:
|
||||||
|
item-83 at level 2: paragraph: The Health Bureau will carry out ... ters for Disease Control if necessary.
|
||||||
|
item-84 at level 1: paragraph:
|
||||||
|
item-85 at level 1: section: group textbox
|
||||||
|
item-86 at level 1: paragraph:
|
||||||
|
item-87 at level 1: paragraph:
|
||||||
|
item-88 at level 1: paragraph:
|
1470
tests/data/groundtruth/docling_v2/textbox.docx.json
Normal file
1470
tests/data/groundtruth/docling_v2/textbox.docx.json
Normal file
File diff suppressed because it is too large
Load Diff
46
tests/data/groundtruth/docling_v2/textbox.docx.md
Normal file
46
tests/data/groundtruth/docling_v2/textbox.docx.md
Normal file
@ -0,0 +1,46 @@
|
|||||||
|
**Chiayi County Shuishang Township Nanjing Elementary School Affiliated Kindergarten**
|
||||||
|
|
||||||
|
**Infectious Disease Reporting Procedure for the 113th Academic Year Kindergarten**
|
||||||
|
|
||||||
|
**Student falls ill**
|
||||||
|
|
||||||
|
- Suggested Reportable Symptoms:
|
||||||
|
* Fever
|
||||||
|
* Cough
|
||||||
|
* Diarrhea
|
||||||
|
* Vomiting
|
||||||
|
* Rash
|
||||||
|
* Blisters
|
||||||
|
* Headache
|
||||||
|
* Sore throat
|
||||||
|
|
||||||
|
If a caregiver suspects that within one week, a fifth of the class (for classes with more than 15 students) or more than three students (for classes with 15 or fewer students)
|
||||||
|
show the same suggested reportable symptoms
|
||||||
|
|
||||||
|
Yes
|
||||||
|
|
||||||
|
A report must be submitted within 24 hours via the Ministry of Education’s Campus Safety and Disaster Prevention Information Network.
|
||||||
|
|
||||||
|
A report must also be submitted within 48 hours through Chiayi County’s School Suspected Infectious Disease Reporting System.
|
||||||
|
|
||||||
|
**Health Bureau:**
|
||||||
|
|
||||||
|
Upon receiving a report from the kindergarten, conduct a preliminary assessment of the case, and depending on the situation and type of illness, carry out an epidemiological investigation and report to the Centers for Disease Control.
|
||||||
|
|
||||||
|
- If necessary, provide health education and important reminders at the kindergarten, or notify the individual to undergo specimen collection.
|
||||||
|
- Implement appropriate epidemic prevention measures in accordance with the Communicable Disease Control Act.
|
||||||
|
|
||||||
|
Department of Education:
|
||||||
|
Collaborate with the Health Bureau in conducting epidemiological investigations and assist Health Bureau personnel in implementing necessary epidemic prevention measures at all school levels.
|
||||||
|
|
||||||
|
The Health Bureau will handle **reporting and specimen collection** .
|
||||||
|
|
||||||
|
**Whether the epidemic has eased.**
|
||||||
|
|
||||||
|
**Whether the test results are positive for a legally designated infectious disease.**
|
||||||
|
|
||||||
|
No
|
||||||
|
|
||||||
|
**Case closed.**
|
||||||
|
|
||||||
|
The Health Bureau will carry out subsequent related epidemic prevention measures and follow-up, and will request assistance from the Centers for Disease Control if necessary.
|
@ -80,7 +80,7 @@ def test_pages(documents) -> None:
|
|||||||
"""
|
"""
|
||||||
# number of pages from the backend method
|
# number of pages from the backend method
|
||||||
# Logic to handle multiple files
|
# Logic to handle multiple files
|
||||||
file_stems = [ "sample_sales_data", "test-01"]
|
file_stems = ["sample_sales_data"]
|
||||||
for stem in file_stems:
|
for stem in file_stems:
|
||||||
path = next(item for item in get_excel_paths() if item.stem == stem)
|
path = next(item for item in get_excel_paths() if item.stem == stem)
|
||||||
in_doc = InputDocument(
|
in_doc = InputDocument(
|
||||||
@ -91,19 +91,18 @@ def test_pages(documents) -> None:
|
|||||||
)
|
)
|
||||||
backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=path)
|
backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=path)
|
||||||
# Update the expected page count based on actual content
|
# Update the expected page count based on actual content
|
||||||
expected_page_count = 3 # Adjust this value based on the actual number of worksheets this needs to be adjusted for each xlsm and xlsx files independently
|
expected_page_count = 1 # Adjust this value based on the actual number of worksheets this needs to be adjusted for each xlsm and xlsx files independently
|
||||||
assert backend.page_count() == expected_page_count
|
assert backend.page_count() == expected_page_count
|
||||||
|
|
||||||
# number of pages from the converted document
|
# number of pages from the converted document
|
||||||
doc = next(item for path, item in documents if path.stem == stem)
|
doc = next(item for path, item in documents if path.stem == stem)
|
||||||
assert len(doc.pages) == 3
|
assert len(doc.pages) == 1
|
||||||
|
|
||||||
|
|
||||||
# page sizes as number of cells
|
# page sizes as number of cells
|
||||||
|
|
||||||
# for xlsm file just adjust this wrt the xlsm files for test xlsm enable this:
|
# for xlsm file just adjust this wrt the xlsm files for test xlsm enable this:
|
||||||
# assert doc.pages.get(1).size.as_tuple() == (4.0, 21.0)
|
assert doc.pages.get(1).size.as_tuple() == (4.0, 21.0)
|
||||||
# for xlsx file:
|
# for xlsx file:
|
||||||
assert doc.pages.get(1).size.as_tuple() == (3.0, 7.0)
|
#assert doc.pages.get(1).size.as_tuple() == (3.0, 7.0)
|
||||||
assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0)
|
#assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0)
|
||||||
assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0)
|
#assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0)
|
||||||
|
Loading…
Reference in New Issue
Block a user