mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-31 14:34:40 +00:00
test: Implement csv parsing and format tests
Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>
This commit is contained in:
parent
d64f2bb0ab
commit
d91ea7b186
6
tests/data/csv/test-01.csv
Normal file
6
tests/data/csv/test-01.csv
Normal file
@ -0,0 +1,6 @@
|
||||
Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website
|
||||
1,DD37Cf93aecA6Dc,Sheryl,Baxter,Rasmussen Group,East Leonard,Chile,229.077.5154,397.884.0519x718,zunigavanessa@smith.info,2020-08-24,http://www.stephenson.com/
|
||||
2,1Ef7b82A4CAAD10,Preston,Lozano,Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/
|
||||
3,6F94879bDAfE5a6,Roy,Berry,Murillo-Perry,Isabelborough,Antigua and Barbuda,+1-539-402-0259,(496)978-3969x58947,beckycarr@hogan.com,2020-03-25,http://www.lawrence.com/
|
||||
4,5Cef8BFA16c5e3c,Linda,Olsen,"Dominguez, Mcmillan and Donovan",Bensonview,Dominican Republic,001-808-617-6467x12895,+1-813-324-8756,stanleyblackwell@benson.org,2020-06-02,http://www.good-lyons.com/
|
||||
5,053d585Ab6b3159,Joanna,Bender,"Martin, Lang and Andrade",West Priscilla,Slovakia (Slovak Republic),001-234-203-0635x76146,001-199-446-3860x3486,colinalvarado@miles.net,2021-04-17,https://goodwin-ingram.com/
|
|
3
tests/data/groundtruth/docling_v2/test-01.csv.itxt
Normal file
3
tests/data/groundtruth/docling_v2/test-01.csv.itxt
Normal file
@ -0,0 +1,3 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: section: group csv content
|
||||
item-2 at level 2: table with [6x12]
|
1811
tests/data/groundtruth/docling_v2/test-01.csv.json
Normal file
1811
tests/data/groundtruth/docling_v2/test-01.csv.json
Normal file
File diff suppressed because it is too large
Load Diff
7
tests/data/groundtruth/docling_v2/test-01.csv.md
Normal file
7
tests/data/groundtruth/docling_v2/test-01.csv.md
Normal file
@ -0,0 +1,7 @@
|
||||
| Index | Customer Id | First Name | Last Name | Company | City | Country | Phone 1 | Phone 2 | Email | Subscription Date | Website |
|
||||
|---------|-----------------|--------------|-------------|---------------------------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|
|
||||
| 1 | DD37Cf93aecA6Dc | Sheryl | Baxter | Rasmussen Group | East Leonard | Chile | 229.077.5154 | 397.884.0519x718 | zunigavanessa@smith.info | 2020-08-24 | http://www.stephenson.com/ |
|
||||
| 2 | 1Ef7b82A4CAAD10 | Preston | Lozano | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | vmata@colon.com | 2021-04-23 | http://www.hobbs.com/ |
|
||||
| 3 | 6F94879bDAfE5a6 | Roy | Berry | Murillo-Perry | Isabelborough | Antigua and Barbuda | +1-539-402-0259 | (496)978-3969x58947 | beckycarr@hogan.com | 2020-03-25 | http://www.lawrence.com/ |
|
||||
| 4 | 5Cef8BFA16c5e3c | Linda | Olsen | Dominguez, Mcmillan and Donovan | Bensonview | Dominican Republic | 001-808-617-6467x12895 | +1-813-324-8756 | stanleyblackwell@benson.org | 2020-06-02 | http://www.good-lyons.com/ |
|
||||
| 5 | 053d585Ab6b3159 | Joanna | Bender | Martin, Lang and Andrade | West Priscilla | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net | 2021-04-17 | https://goodwin-ingram.com/ |
|
72
tests/test_backend_csv.py
Normal file
72
tests/test_backend_csv.py
Normal file
@ -0,0 +1,72 @@
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.datamodel.document import ConversionResult, DoclingDocument
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
# When True, verify_export() overwrites the ground-truth files with the
# current output instead of comparing against them.
GENERATE = False
|
||||
|
||||
|
||||
def get_csv_paths():
    """Collect the CSV fixtures used by the conversion tests.

    Returns:
        A sorted list of ``Path`` objects, one for every ``*.csv`` file found
        under ``./tests/data/csv/`` (searched recursively, relative to the
        current working directory).
    """
    # Directory holding the CSV test inputs
    directory = Path("./tests/data/csv/")

    # Sort so the files are always visited in a deterministic order
    csv_files = sorted(directory.rglob("*.csv"))
    return csv_files
|
||||
|
||||
|
||||
def get_converter():
    """Build a ``DocumentConverter`` that only accepts CSV input."""
    return DocumentConverter(allowed_formats=[InputFormat.CSV])
|
||||
|
||||
|
||||
def verify_export(pred_text: str, gtfile: str) -> bool:
    """Check an exported document against its ground-truth file.

    If ``gtfile`` does not exist yet, or the module-level ``GENERATE`` flag is
    set, ``pred_text`` is written to ``gtfile`` and the check passes without
    any comparison. Otherwise the file is read back and compared with
    ``pred_text``.

    Files are always read and written as UTF-8 so the result does not depend
    on the locale of the machine running the tests.

    Args:
        pred_text: The freshly exported text.
        gtfile: Path of the ground-truth file to compare against (or create).

    Returns:
        ``True`` when the ground truth was (re)generated or matches.

    Raises:
        AssertionError: If the ground-truth content differs from ``pred_text``.
    """
    if not os.path.exists(gtfile) or GENERATE:
        # (Re)generate the ground truth from the current prediction
        with open(gtfile, "w", encoding="utf-8") as fw:
            fw.write(pred_text)
        return True

    with open(gtfile, "r", encoding="utf-8") as fr:
        true_text = fr.read()

    assert pred_text == true_text, "pred_itxt==true_itxt"
    # Still report the comparison result when asserts are disabled (-O)
    return pred_text == true_text
|
||||
|
||||
|
||||
def test_e2e_csv_conversions():
    """Convert every CSV fixture and compare each export with its ground truth.

    For each file the Markdown, indented-text and JSON exports are checked, in
    that order, against ``<fixture name>.md`` / ``.itxt`` / ``.json`` in the
    sibling ``groundtruth/docling_v2`` directory.
    """
    converter = get_converter()

    for source in get_csv_paths():
        print(f"converting {source}")

        # Ground-truth files share the fixture's name plus an export suffix
        gt_dir = source.parent.parent.joinpath("groundtruth", "docling_v2")
        gt_prefix = str(gt_dir / source.name)

        result: ConversionResult = converter.convert(source)
        document: DoclingDocument = result.document

        markdown = document.export_to_markdown()
        assert verify_export(markdown, gt_prefix + ".md"), "export to md"

        indented = document._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        assert verify_export(indented, gt_prefix + ".itxt"), "export to indented-text"

        as_json = json.dumps(document.export_to_dict(), indent=2)
        assert verify_export(as_json, gt_prefix + ".json"), "export to json"
|
@ -108,6 +108,15 @@ def test_guess_format(tmp_path):
|
||||
doc_path = Path("./tests/data/md/wiki.md")
|
||||
assert dci._guess_format(doc_path) == InputFormat.MD
|
||||
|
||||
# Valid CSV
|
||||
buf = BytesIO(Path("./tests/data/csv/test-01.csv").open("rb").read())
|
||||
stream = DocumentStream(name="test-01.csv", stream=buf)
|
||||
assert dci._guess_format(stream) == InputFormat.CSV
|
||||
stream = DocumentStream(name="test-01", stream=buf)
|
||||
assert dci._guess_format(stream) == InputFormat.CSV
|
||||
doc_path = Path("./tests/data/csv/test-01.csv")
|
||||
assert dci._guess_format(doc_path) == InputFormat.CSV
|
||||
|
||||
# Valid XML USPTO patent
|
||||
buf = BytesIO(Path("./tests/data/uspto/ipa20110039701.xml").open("rb").read())
|
||||
stream = DocumentStream(name="ipa20110039701.xml", stream=buf)
|
||||
|
Loading…
Reference in New Issue
Block a user