feat: Add support for various CSV dialects and update documentation

Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>
This commit is contained in:
Tobias Strebitzer 2025-02-13 08:37:02 +08:00
parent 79eed3ef08
commit 1ca87f5d8c
24 changed files with 5555 additions and 92 deletions

View File

@ -4,13 +4,7 @@ from io import BytesIO, StringIO
from pathlib import Path
from typing import Set, Union
from docling_core.types.doc import (
DoclingDocument,
DocumentOrigin,
GroupLabel,
TableCell,
TableData,
)
from docling_core.types.doc import DoclingDocument, DocumentOrigin, TableCell, TableData
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
@ -23,27 +17,21 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream)
# Initialize parent for hierarchy
self.parent = None
self.valid = False
# Load content
try:
if isinstance(self.path_or_stream, BytesIO):
content = self.path_or_stream.getvalue().decode("utf-8")
self.csv_data = list(csv.reader(StringIO(content)))
self.content = self.path_or_stream.getvalue().decode("utf-8")
elif isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, 'r', newline='') as f:
self.csv_data = list(csv.reader(f))
with open(self.path_or_stream, "r", newline="") as f:
self.content = f.read()
self.valid = True
except Exception as e:
self.valid = False
raise RuntimeError(
f"CsvDocumentBackend could not load document with hash {self.document_hash}"
) from e
return
def is_valid(self) -> bool:
_log.info(f"valid: {self.valid}")
return self.valid
@classmethod
@ -60,6 +48,23 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
return {InputFormat.CSV}
def convert(self) -> DoclingDocument:
"""
Parses the CSV data into a structured document model.
"""
# Detect CSV dialect
dialect = csv.Sniffer().sniff(self.content)
_log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
if not dialect.delimiter in {",", ";", "\t", "|"}:
raise RuntimeError(
f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
)
# Parce CSV
result = csv.reader(StringIO(self.content), dialect=dialect)
self.csv_data = list(result)
_log.info(f"Detected {len(self.csv_data)} lines")
# Parse the CSV into a structured document model
origin = DocumentOrigin(
filename=self.file.name or "file.csv",
@ -70,13 +75,6 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
doc = DoclingDocument(name=self.file.stem or "file.csv", origin=origin)
if self.is_valid():
# Create a section for the CSV content
self.parent = doc.add_group(
parent=None,
label=GroupLabel.SECTION,
name="csv content",
)
# Convert CSV data to table
if self.csv_data:
num_rows = len(self.csv_data)
@ -104,7 +102,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
)
table_data.table_cells.append(cell)
doc.add_table(data=table_data, parent=self.parent)
doc.add_table(data=table_data)
else:
raise RuntimeError(
f"Cannot convert doc with {self.document_hash} because the backend failed to init."

View File

@ -1,6 +1,6 @@
import csv
import logging
import re
import csv
from enum import Enum
from io import BytesIO
from pathlib import Path, PurePath
@ -424,4 +424,4 @@ class _DocumentConversionInput(BaseModel):
except csv.Error:
return None
return None
return None

View File

@ -10,11 +10,11 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.asciidoc_backend import AsciiDocBackend
from docling.backend.csv_backend import CsvDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.json.docling_json_backend import DoclingJSONBackend
from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.csv_backend import CsvDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend
@ -66,6 +66,7 @@ class CsvFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = CsvDocumentBackend
class ExcelFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend

View File

@ -0,0 +1,80 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Conversion of CSV files\n",
"\n",
"This example shows how to convert CSV files to a structured Docling Document.\n",
"\n",
"* Multiple delimiters are supported: `,` `;` `|` `[tab]`\n",
"* Additional CSV dialect settings are detected automatically (e.g. quotes, line separator, escape character)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Example Code"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"from docling.document_converter import DocumentConverter\n",
"\n",
"# Convert CSV to Docling document\n",
"converter = DocumentConverter()\n",
"result = converter.convert(Path(\"../../tests/data/csv/csv-comma.csv\"))\n",
"output = result.document.export_to_markdown()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This code generates the following output:"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"| Index | Customer Id | First Name | Last Name | Company | City | Country | Phone 1 | Phone 2 | Email | Subscription Date | Website |\n",
"|---------|-----------------|--------------|-------------|---------------------------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|\n",
"| 1 | DD37Cf93aecA6Dc | Sheryl | Baxter | Rasmussen Group | East Leonard | Chile | 229.077.5154 | 397.884.0519x718 | zunigavanessa@smith.info | 2020-08-24 | http://www.stephenson.com/ |\n",
"| 2 | 1Ef7b82A4CAAD10 | Preston | Lozano, Dr | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | vmata@colon.com | 2021-04-23 | http://www.hobbs.com/ |\n",
"| 3 | 6F94879bDAfE5a6 | Roy | Berry | Murillo-Perry | Isabelborough | Antigua and Barbuda | +1-539-402-0259 | (496)978-3969x58947 | beckycarr@hogan.com | 2020-03-25 | http://www.lawrence.com/ |\n",
"| 4 | 5Cef8BFA16c5e3c | Linda | Olsen | Dominguez, Mcmillan and Donovan | Bensonview | Dominican Republic | 001-808-617-6467x12895 | +1-813-324-8756 | stanleyblackwell@benson.org | 2020-06-02 | http://www.good-lyons.com/ |\n",
"| 5 | 053d585Ab6b3159 | Joanna | Bender | Martin, Lang and Andrade | West Priscilla | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net | 2021-04-17 | https://goodwin-ingram.com/ |"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "docling-TtEIaPrw-py3.12",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,32 +0,0 @@
import json
import logging
from pathlib import Path
import yaml
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter
logging.basicConfig(level=logging.DEBUG)
def main():
# Convert CSV to Docling document:
source = "https://drive.google.com/uc?id=1zO8ekHWx9U7mrbx_0Hoxxu6od7uxJqWw&export=download"
converter = DocumentConverter()
result = converter.convert(source)
# Export Docling document:
out_path = Path("scratch")
print(f"Document converted." f"\nSaving output to: {str(out_path)}")
with (out_path / f"customers-100.md").open("w") as fp:
fp.write(result.document.export_to_markdown())
with (out_path / f"customers-100.json").open("w") as fp:
fp.write(json.dumps(result.document.export_to_dict()))
with (out_path / f"customers-100.yaml").open("w") as fp:
fp.write(yaml.safe_dump(result.document.export_to_dict()))
if __name__ == "__main__":
main()

View File

@ -80,6 +80,7 @@ nav:
- "RapidOCR with custom OCR models": examples/rapidocr_with_custom_models.py
- "Accelerator options": examples/run_with_accelerator.py
- "Simple translation": examples/translate.py
- examples/backend_csv.ipynb
- examples/backend_xml_rag.ipynb
- ✂️ Chunking:
- examples/hybrid_chunking.ipynb

View File

@ -1,6 +1,6 @@
Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website
1,DD37Cf93aecA6Dc,Sheryl,Baxter,Rasmussen Group,East Leonard,Chile,229.077.5154,397.884.0519x718,zunigavanessa@smith.info,2020-08-24,http://www.stephenson.com/
2,1Ef7b82A4CAAD10,Preston,Lozano,Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/
2,1Ef7b82A4CAAD10,Preston,"Lozano, Dr",Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/
3,6F94879bDAfE5a6,Roy,Berry,Murillo-Perry,Isabelborough,Antigua and Barbuda,+1-539-402-0259,(496)978-3969x58947,beckycarr@hogan.com,2020-03-25,http://www.lawrence.com/
4,5Cef8BFA16c5e3c,Linda,Olsen,"Dominguez, Mcmillan and Donovan",Bensonview,Dominican Republic,001-808-617-6467x12895,+1-813-324-8756,stanleyblackwell@benson.org,2020-06-02,http://www.good-lyons.com/
5,053d585Ab6b3159,Joanna,Bender,"Martin, Lang and Andrade",West Priscilla,Slovakia (Slovak Republic),001-234-203-0635x76146,001-199-446-3860x3486,colinalvarado@miles.net,2021-04-17,https://goodwin-ingram.com/
1 Index Customer Id First Name Last Name Company City Country Phone 1 Phone 2 Email Subscription Date Website
2 1 DD37Cf93aecA6Dc Sheryl Baxter Rasmussen Group East Leonard Chile 229.077.5154 397.884.0519x718 zunigavanessa@smith.info 2020-08-24 http://www.stephenson.com/
3 2 1Ef7b82A4CAAD10 Preston Lozano Lozano, Dr Vega-Gentry East Jimmychester Djibouti 5153435776 686-620-1820x944 vmata@colon.com 2021-04-23 http://www.hobbs.com/
4 3 6F94879bDAfE5a6 Roy Berry Murillo-Perry Isabelborough Antigua and Barbuda +1-539-402-0259 (496)978-3969x58947 beckycarr@hogan.com 2020-03-25 http://www.lawrence.com/
5 4 5Cef8BFA16c5e3c Linda Olsen Dominguez, Mcmillan and Donovan Bensonview Dominican Republic 001-808-617-6467x12895 +1-813-324-8756 stanleyblackwell@benson.org 2020-06-02 http://www.good-lyons.com/
6 5 053d585Ab6b3159 Joanna Bender Martin, Lang and Andrade West Priscilla Slovakia (Slovak Republic) 001-234-203-0635x76146 001-199-446-3860x3486 colinalvarado@miles.net 2021-04-17 https://goodwin-ingram.com/

View File

@ -0,0 +1,6 @@
Index|Customer Id|First Name|Last Name|Company|City|Country|Phone 1|Phone 2|Email|Subscription Date|Website
1|DD37Cf93aecA6Dc|Sheryl|Baxter|Rasmussen Group|East Leonard|Chile|229.077.5154|397.884.0519x718|zunigavanessa@smith.info|2020-08-24|http://www.stephenson.com/
2|1Ef7b82A4CAAD10|Preston|Lozano|Vega-Gentry|East Jimmychester|Djibouti|5153435776|686-620-1820x944|vmata@colon.com|2021-04-23|http://www.hobbs.com/
3|6F94879bDAfE5a6|Roy|Berry|Murillo-Perry|Isabelborough|Antigua and Barbuda|+1-539-402-0259|(496)978-3969x58947|beckycarr@hogan.com|2020-03-25|http://www.lawrence.com/
4|5Cef8BFA16c5e3c|Linda|Olsen|"Dominguez|Mcmillan and Donovan"|Bensonview|Dominican Republic|001-808-617-6467x12895|+1-813-324-8756|stanleyblackwell@benson.org|2020-06-02|http://www.good-lyons.com/
5|053d585Ab6b3159|Joanna|Bender|"Martin|Lang and Andrade"|West Priscilla|Slovakia (Slovak Republic)|001-234-203-0635x76146|001-199-446-3860x3486|colinalvarado@miles.net|2021-04-17|https://goodwin-ingram.com/
1 Index Customer Id First Name Last Name Company City Country Phone 1 Phone 2 Email Subscription Date Website
2 1 DD37Cf93aecA6Dc Sheryl Baxter Rasmussen Group East Leonard Chile 229.077.5154 397.884.0519x718 zunigavanessa@smith.info 2020-08-24 http://www.stephenson.com/
3 2 1Ef7b82A4CAAD10 Preston Lozano Vega-Gentry East Jimmychester Djibouti 5153435776 686-620-1820x944 vmata@colon.com 2021-04-23 http://www.hobbs.com/
4 3 6F94879bDAfE5a6 Roy Berry Murillo-Perry Isabelborough Antigua and Barbuda +1-539-402-0259 (496)978-3969x58947 beckycarr@hogan.com 2020-03-25 http://www.lawrence.com/
5 4 5Cef8BFA16c5e3c Linda Olsen Dominguez|Mcmillan and Donovan Bensonview Dominican Republic 001-808-617-6467x12895 +1-813-324-8756 stanleyblackwell@benson.org 2020-06-02 http://www.good-lyons.com/
6 5 053d585Ab6b3159 Joanna Bender Martin|Lang and Andrade West Priscilla Slovakia (Slovak Republic) 001-234-203-0635x76146 001-199-446-3860x3486 colinalvarado@miles.net 2021-04-17 https://goodwin-ingram.com/

View File

@ -0,0 +1,6 @@
Index;Customer Id;First Name;Last Name;Company;City;Country;Phone 1;Phone 2;Email;Subscription Date;Website
1;DD37Cf93aecA6Dc;Sheryl;Baxter;Rasmussen Group;East Leonard;Chile;229.077.5154;397.884.0519x718;zunigavanessa@smith.info;2020-08-24;http://www.stephenson.com/
2;1Ef7b82A4CAAD10;Preston;Lozano;Vega-Gentry;East Jimmychester;Djibouti;5153435776;686-620-1820x944;vmata@colon.com;2021-04-23;http://www.hobbs.com/
3;6F94879bDAfE5a6;Roy;Berry;Murillo-Perry;Isabelborough;Antigua and Barbuda;+1-539-402-0259;(496)978-3969x58947;beckycarr@hogan.com;2020-03-25;http://www.lawrence.com/
4;5Cef8BFA16c5e3c;Linda;Olsen;"Dominguez;Mcmillan and Donovan";Bensonview;Dominican Republic;001-808-617-6467x12895;+1-813-324-8756;stanleyblackwell@benson.org;2020-06-02;http://www.good-lyons.com/
5;053d585Ab6b3159;Joanna;Bender;"Martin;Lang and Andrade";West Priscilla;Slovakia (Slovak Republic);001-234-203-0635x76146;001-199-446-3860x3486;colinalvarado@miles.net;2021-04-17;https://goodwin-ingram.com/
1 Index Customer Id First Name Last Name Company City Country Phone 1 Phone 2 Email Subscription Date Website
2 1 DD37Cf93aecA6Dc Sheryl Baxter Rasmussen Group East Leonard Chile 229.077.5154 397.884.0519x718 zunigavanessa@smith.info 2020-08-24 http://www.stephenson.com/
3 2 1Ef7b82A4CAAD10 Preston Lozano Vega-Gentry East Jimmychester Djibouti 5153435776 686-620-1820x944 vmata@colon.com 2021-04-23 http://www.hobbs.com/
4 3 6F94879bDAfE5a6 Roy Berry Murillo-Perry Isabelborough Antigua and Barbuda +1-539-402-0259 (496)978-3969x58947 beckycarr@hogan.com 2020-03-25 http://www.lawrence.com/
5 4 5Cef8BFA16c5e3c Linda Olsen Dominguez;Mcmillan and Donovan Bensonview Dominican Republic 001-808-617-6467x12895 +1-813-324-8756 stanleyblackwell@benson.org 2020-06-02 http://www.good-lyons.com/
6 5 053d585Ab6b3159 Joanna Bender Martin;Lang and Andrade West Priscilla Slovakia (Slovak Republic) 001-234-203-0635x76146 001-199-446-3860x3486 colinalvarado@miles.net 2021-04-17 https://goodwin-ingram.com/

View File

@ -0,0 +1,6 @@
Index Customer Id First Name Last Name Company City Country Phone 1 Phone 2 Email Subscription Date Website
1 DD37Cf93aecA6Dc Sheryl Baxter Rasmussen Group East Leonard Chile 229.077.5154 397.884.0519x718 zunigavanessa@smith.info 2020-08-24 http://www.stephenson.com/
2 1Ef7b82A4CAAD10 Preston Lozano Vega-Gentry East Jimmychester Djibouti 5153435776 686-620-1820x944 vmata@colon.com 2021-04-23 http://www.hobbs.com/
3 6F94879bDAfE5a6 Roy Berry Murillo-Perry Isabelborough Antigua and Barbuda +1-539-402-0259 (496)978-3969x58947 beckycarr@hogan.com 2020-03-25 http://www.lawrence.com/
4 5Cef8BFA16c5e3c Linda Olsen "Dominguez Mcmillan and Donovan" Bensonview Dominican Republic 001-808-617-6467x12895 +1-813-324-8756 stanleyblackwell@benson.org 2020-06-02 http://www.good-lyons.com/
5 053d585Ab6b3159 Joanna Bender "Martin Lang and Andrade" West Priscilla Slovakia (Slovak Republic) 001-234-203-0635x76146 001-199-446-3860x3486 colinalvarado@miles.net 2021-04-17 https://goodwin-ingram.com/
1 Index Customer Id First Name Last Name Company City Country Phone 1 Phone 2 Email Subscription Date Website
2 1 DD37Cf93aecA6Dc Sheryl Baxter Rasmussen Group East Leonard Chile 229.077.5154 397.884.0519x718 zunigavanessa@smith.info 2020-08-24 http://www.stephenson.com/
3 2 1Ef7b82A4CAAD10 Preston Lozano Vega-Gentry East Jimmychester Djibouti 5153435776 686-620-1820x944 vmata@colon.com 2021-04-23 http://www.hobbs.com/
4 3 6F94879bDAfE5a6 Roy Berry Murillo-Perry Isabelborough Antigua and Barbuda +1-539-402-0259 (496)978-3969x58947 beckycarr@hogan.com 2020-03-25 http://www.lawrence.com/
5 4 5Cef8BFA16c5e3c Linda Olsen Dominguez Mcmillan and Donovan Bensonview Dominican Republic 001-808-617-6467x12895 +1-813-324-8756 stanleyblackwell@benson.org 2020-06-02 http://www.good-lyons.com/
6 5 053d585Ab6b3159 Joanna Bender Martin Lang and Andrade West Priscilla Slovakia (Slovak Republic) 001-234-203-0635x76146 001-199-446-3860x3486 colinalvarado@miles.net 2021-04-17 https://goodwin-ingram.com/

View File

@ -0,0 +1,2 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: table with [6x12]

View File

@ -1,11 +1,11 @@
{
"schema_name": "DoclingDocument",
"version": "1.1.0",
"name": "test-01",
"name": "csv-comma",
"origin": {
"mimetype": "text/csv",
"binary_hash": 7076108075619672109,
"filename": "test-01.csv"
"binary_hash": 297933764223584292,
"filename": "csv-comma.csv"
},
"furniture": {
"self_ref": "#/furniture",
@ -18,36 +18,21 @@
"self_ref": "#/body",
"children": [
{
"$ref": "#/groups/0"
"$ref": "#/tables/0"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/tables/0"
}
],
"content_layer": "body",
"name": "csv content",
"label": "section"
}
],
"groups": [],
"texts": [],
"pictures": [],
"tables": [
{
"self_ref": "#/tables/0",
"parent": {
"$ref": "#/groups/0"
"$ref": "#/body"
},
"children": [],
"content_layer": "body",
@ -389,7 +374,7 @@
"end_row_offset_idx": 3,
"start_col_offset_idx": 3,
"end_col_offset_idx": 4,
"text": "Lozano",
"text": "Lozano, Dr",
"column_header": false,
"row_header": false,
"row_section": false
@ -1262,7 +1247,7 @@
"end_row_offset_idx": 3,
"start_col_offset_idx": 3,
"end_col_offset_idx": 4,
"text": "Lozano",
"text": "Lozano, Dr",
"column_header": false,
"row_header": false,
"row_section": false

View File

@ -1,7 +1,7 @@
| Index | Customer Id | First Name | Last Name | Company | City | Country | Phone 1 | Phone 2 | Email | Subscription Date | Website |
|---------|-----------------|--------------|-------------|---------------------------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|
| 1 | DD37Cf93aecA6Dc | Sheryl | Baxter | Rasmussen Group | East Leonard | Chile | 229.077.5154 | 397.884.0519x718 | zunigavanessa@smith.info | 2020-08-24 | http://www.stephenson.com/ |
| 2 | 1Ef7b82A4CAAD10 | Preston | Lozano | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | vmata@colon.com | 2021-04-23 | http://www.hobbs.com/ |
| 2 | 1Ef7b82A4CAAD10 | Preston | Lozano, Dr | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | vmata@colon.com | 2021-04-23 | http://www.hobbs.com/ |
| 3 | 6F94879bDAfE5a6 | Roy | Berry | Murillo-Perry | Isabelborough | Antigua and Barbuda | +1-539-402-0259 | (496)978-3969x58947 | beckycarr@hogan.com | 2020-03-25 | http://www.lawrence.com/ |
| 4 | 5Cef8BFA16c5e3c | Linda | Olsen | Dominguez, Mcmillan and Donovan | Bensonview | Dominican Republic | 001-808-617-6467x12895 | +1-813-324-8756 | stanleyblackwell@benson.org | 2020-06-02 | http://www.good-lyons.com/ |
| 5 | 053d585Ab6b3159 | Joanna | Bender | Martin, Lang and Andrade | West Priscilla | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net | 2021-04-17 | https://goodwin-ingram.com/ |

View File

@ -0,0 +1,2 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: table with [6x12]

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,7 @@
| Index | Customer Id | First Name | Last Name | Company | City | Country | Phone 1 | Phone 2 | Email | Subscription Date | Website |
|---------|-----------------|--------------|-------------|--------------------------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|
| 1 | DD37Cf93aecA6Dc | Sheryl | Baxter | Rasmussen Group | East Leonard | Chile | 229.077.5154 | 397.884.0519x718 | zunigavanessa@smith.info | 2020-08-24 | http://www.stephenson.com/ |
| 2 | 1Ef7b82A4CAAD10 | Preston | Lozano | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | vmata@colon.com | 2021-04-23 | http://www.hobbs.com/ |
| 3 | 6F94879bDAfE5a6 | Roy | Berry | Murillo-Perry | Isabelborough | Antigua and Barbuda | +1-539-402-0259 | (496)978-3969x58947 | beckycarr@hogan.com | 2020-03-25 | http://www.lawrence.com/ |
| 4 | 5Cef8BFA16c5e3c | Linda | Olsen | Dominguez|Mcmillan and Donovan | Bensonview | Dominican Republic | 001-808-617-6467x12895 | +1-813-324-8756 | stanleyblackwell@benson.org | 2020-06-02 | http://www.good-lyons.com/ |
| 5 | 053d585Ab6b3159 | Joanna | Bender | Martin|Lang and Andrade | West Priscilla | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net | 2021-04-17 | https://goodwin-ingram.com/ |

View File

@ -0,0 +1,2 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: table with [6x12]

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,7 @@
| Index | Customer Id | First Name | Last Name | Company | City | Country | Phone 1 | Phone 2 | Email | Subscription Date | Website |
|---------|-----------------|--------------|-------------|--------------------------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|
| 1 | DD37Cf93aecA6Dc | Sheryl | Baxter | Rasmussen Group | East Leonard | Chile | 229.077.5154 | 397.884.0519x718 | zunigavanessa@smith.info | 2020-08-24 | http://www.stephenson.com/ |
| 2 | 1Ef7b82A4CAAD10 | Preston | Lozano | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | vmata@colon.com | 2021-04-23 | http://www.hobbs.com/ |
| 3 | 6F94879bDAfE5a6 | Roy | Berry | Murillo-Perry | Isabelborough | Antigua and Barbuda | +1-539-402-0259 | (496)978-3969x58947 | beckycarr@hogan.com | 2020-03-25 | http://www.lawrence.com/ |
| 4 | 5Cef8BFA16c5e3c | Linda | Olsen | Dominguez;Mcmillan and Donovan | Bensonview | Dominican Republic | 001-808-617-6467x12895 | +1-813-324-8756 | stanleyblackwell@benson.org | 2020-06-02 | http://www.good-lyons.com/ |
| 5 | 053d585Ab6b3159 | Joanna | Bender | Martin;Lang and Andrade | West Priscilla | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net | 2021-04-17 | https://goodwin-ingram.com/ |

View File

@ -0,0 +1,2 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: table with [6x12]

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,7 @@
| Index | Customer Id | First Name | Last Name | Company | City | Country | Phone 1 | Phone 2 | Email | Subscription Date | Website |
|---------|-----------------|--------------|-------------|-----------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|
| 1 | DD37Cf93aecA6Dc | Sheryl | Baxter | Rasmussen Group | East Leonard | Chile | 229.077.5154 | 397.884.0519x718 | zunigavanessa@smith.info | 2020-08-24 | http://www.stephenson.com/ |
| 2 | 1Ef7b82A4CAAD10 | Preston | Lozano | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | vmata@colon.com | 2021-04-23 | http://www.hobbs.com/ |
| 3 | 6F94879bDAfE5a6 | Roy | Berry | Murillo-Perry | Isabelborough | Antigua and Barbuda | +1-539-402-0259 | (496)978-3969x58947 | beckycarr@hogan.com | 2020-03-25 | http://www.lawrence.com/ |
| 4 | 5Cef8BFA16c5e3c | Linda | Olsen | Dominguez Mcmillan and Donovan | Bensonview | Dominican Republic | 001-808-617-6467x12895 | +1-813-324-8756 | stanleyblackwell@benson.org | 2020-06-02 | http://www.good-lyons.com/ |
| 5 | 053d585Ab6b3159 | Joanna | Bender | Martin Lang and Andrade | West Priscilla | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net | 2021-04-17 | https://goodwin-ingram.com/ |

View File

@ -1,3 +0,0 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group csv content
item-2 at level 2: table with [6x12]

View File

@ -50,9 +50,7 @@ def test_e2e_csv_conversions():
for csv_path in csv_paths:
print(f"converting {csv_path}")
gt_path = (
csv_path.parent.parent / "groundtruth" / "docling_v2" / csv_path.name
)
gt_path = csv_path.parent.parent / "groundtruth" / "docling_v2" / csv_path.name
conv_result: ConversionResult = converter.convert(csv_path)