test: Implement csv parsing and format tests

Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>
This commit is contained in:
Tobias Strebitzer 2025-02-12 11:37:47 +08:00
parent d64f2bb0ab
commit d91ea7b186
6 changed files with 1908 additions and 0 deletions

View File

@ -0,0 +1,6 @@
Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website
1,DD37Cf93aecA6Dc,Sheryl,Baxter,Rasmussen Group,East Leonard,Chile,229.077.5154,397.884.0519x718,zunigavanessa@smith.info,2020-08-24,http://www.stephenson.com/
2,1Ef7b82A4CAAD10,Preston,Lozano,Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/
3,6F94879bDAfE5a6,Roy,Berry,Murillo-Perry,Isabelborough,Antigua and Barbuda,+1-539-402-0259,(496)978-3969x58947,beckycarr@hogan.com,2020-03-25,http://www.lawrence.com/
4,5Cef8BFA16c5e3c,Linda,Olsen,"Dominguez, Mcmillan and Donovan",Bensonview,Dominican Republic,001-808-617-6467x12895,+1-813-324-8756,stanleyblackwell@benson.org,2020-06-02,http://www.good-lyons.com/
5,053d585Ab6b3159,Joanna,Bender,"Martin, Lang and Andrade",West Priscilla,Slovakia (Slovak Republic),001-234-203-0635x76146,001-199-446-3860x3486,colinalvarado@miles.net,2021-04-17,https://goodwin-ingram.com/
1 Index Customer Id First Name Last Name Company City Country Phone 1 Phone 2 Email Subscription Date Website
2 1 DD37Cf93aecA6Dc Sheryl Baxter Rasmussen Group East Leonard Chile 229.077.5154 397.884.0519x718 zunigavanessa@smith.info 2020-08-24 http://www.stephenson.com/
3 2 1Ef7b82A4CAAD10 Preston Lozano Vega-Gentry East Jimmychester Djibouti 5153435776 686-620-1820x944 vmata@colon.com 2021-04-23 http://www.hobbs.com/
4 3 6F94879bDAfE5a6 Roy Berry Murillo-Perry Isabelborough Antigua and Barbuda +1-539-402-0259 (496)978-3969x58947 beckycarr@hogan.com 2020-03-25 http://www.lawrence.com/
5 4 5Cef8BFA16c5e3c Linda Olsen Dominguez, Mcmillan and Donovan Bensonview Dominican Republic 001-808-617-6467x12895 +1-813-324-8756 stanleyblackwell@benson.org 2020-06-02 http://www.good-lyons.com/
6 5 053d585Ab6b3159 Joanna Bender Martin, Lang and Andrade West Priscilla Slovakia (Slovak Republic) 001-234-203-0635x76146 001-199-446-3860x3486 colinalvarado@miles.net 2021-04-17 https://goodwin-ingram.com/

View File

@ -0,0 +1,3 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group csv content
item-2 at level 2: table with [6x12]

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,7 @@
| Index | Customer Id | First Name | Last Name | Company | City | Country | Phone 1 | Phone 2 | Email | Subscription Date | Website |
|---------|-----------------|--------------|-------------|---------------------------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|
| 1 | DD37Cf93aecA6Dc | Sheryl | Baxter | Rasmussen Group | East Leonard | Chile | 229.077.5154 | 397.884.0519x718 | zunigavanessa@smith.info | 2020-08-24 | http://www.stephenson.com/ |
| 2 | 1Ef7b82A4CAAD10 | Preston | Lozano | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | vmata@colon.com | 2021-04-23 | http://www.hobbs.com/ |
| 3 | 6F94879bDAfE5a6 | Roy | Berry | Murillo-Perry | Isabelborough | Antigua and Barbuda | +1-539-402-0259 | (496)978-3969x58947 | beckycarr@hogan.com | 2020-03-25 | http://www.lawrence.com/ |
| 4 | 5Cef8BFA16c5e3c | Linda | Olsen | Dominguez, Mcmillan and Donovan | Bensonview | Dominican Republic | 001-808-617-6467x12895 | +1-813-324-8756 | stanleyblackwell@benson.org | 2020-06-02 | http://www.good-lyons.com/ |
| 5 | 053d585Ab6b3159 | Joanna | Bender | Martin, Lang and Andrade | West Priscilla | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net | 2021-04-17 | https://goodwin-ingram.com/ |

72
tests/test_backend_csv.py Normal file
View File

@ -0,0 +1,72 @@
import json
import os
from pathlib import Path
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult, DoclingDocument
from docling.document_converter import DocumentConverter
# Regeneration switch: when True, verify_export() overwrites the ground-truth
# file with the freshly produced text instead of comparing against it.
GENERATE = False
def get_csv_paths():
    """Return every CSV fixture under ``./tests/data/csv/``, sorted by path.

    The directory is searched recursively. The path is relative, so it is
    resolved against the current working directory of the test run.

    Returns:
        A sorted list of ``Path`` objects, one per ``*.csv`` file found.
    """
    # Fixture directory holding the CSV inputs for the end-to-end test.
    directory = Path("./tests/data/csv/")

    # Sort so the conversion order is deterministic across platforms.
    csv_files = sorted(directory.rglob("*.csv"))
    return csv_files
def get_converter():
    """Build a ``DocumentConverter`` whose allowed formats are CSV only."""
    return DocumentConverter(allowed_formats=[InputFormat.CSV])
def verify_export(pred_text: str, gtfile: str) -> bool:
    """Check exported text against its ground-truth file, or (re)write it.

    If ``gtfile`` does not exist, or the module-level ``GENERATE`` flag is
    set, ``pred_text`` is written to ``gtfile`` and the check passes.
    Otherwise the file is read back and must equal ``pred_text`` exactly.

    Args:
        pred_text: Text produced by the export under test.
        gtfile: Path of the ground-truth file to compare with or create.

    Returns:
        True when the ground truth was written or matches ``pred_text``.

    Raises:
        AssertionError: If the stored ground truth differs from ``pred_text``.
    """
    if not os.path.exists(gtfile) or GENERATE:
        # Pin the encoding so generated files do not depend on the locale of
        # the machine that produced them.
        with open(gtfile, "w", encoding="utf-8") as fw:
            fw.write(pred_text)
        return True

    with open(gtfile, "r", encoding="utf-8") as fr:
        true_text = fr.read()

    assert pred_text == true_text, "pred_itxt==true_itxt"
    # Still report the comparison when asserts are stripped (python -O).
    return pred_text == true_text
def test_e2e_csv_conversions():
    """Convert every CSV fixture and check each export against ground truth.

    For each input file the converted document is exported to Markdown,
    indented text and JSON. Each export is compared with (or used to create)
    the file ``groundtruth/docling_v2/<name>.{md,itxt,json}`` located next to
    the fixture's parent directory.
    """
    converter = get_converter()

    for csv_path in get_csv_paths():
        print(f"converting {csv_path}")

        # Ground-truth files share the fixture's file name plus a suffix.
        gt_dir = csv_path.parent.parent / "groundtruth" / "docling_v2"
        gt_base = str(gt_dir / csv_path.name)

        result: ConversionResult = converter.convert(csv_path)
        document: DoclingDocument = result.document

        # Markdown export.
        markdown_text: str = document.export_to_markdown()
        assert verify_export(markdown_text, gt_base + ".md"), "export to md"

        # Indented-text export.
        indented_text: str = document._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        assert verify_export(
            indented_text, gt_base + ".itxt"
        ), "export to indented-text"

        # JSON export of the document dictionary.
        json_text: str = json.dumps(document.export_to_dict(), indent=2)
        assert verify_export(json_text, gt_base + ".json"), "export to json"

View File

@ -108,6 +108,15 @@ def test_guess_format(tmp_path):
doc_path = Path("./tests/data/md/wiki.md")
assert dci._guess_format(doc_path) == InputFormat.MD
# Valid CSV
buf = BytesIO(Path("./tests/data/csv/test-01.csv").open("rb").read())
stream = DocumentStream(name="test-01.csv", stream=buf)
assert dci._guess_format(stream) == InputFormat.CSV
stream = DocumentStream(name="test-01", stream=buf)
assert dci._guess_format(stream) == InputFormat.CSV
doc_path = Path("./tests/data/csv/test-01.csv")
assert dci._guess_format(doc_path) == InputFormat.CSV
# Valid XML USPTO patent
buf = BytesIO(Path("./tests/data/uspto/ipa20110039701.xml").open("rb").read())
stream = DocumentStream(name="ipa20110039701.xml", stream=buf)