feat: Add support for various CSV dialects and update documentation

Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>
This commit is contained in:
Tobias Strebitzer 2025-02-13 08:37:02 +08:00
parent 79eed3ef08
commit 1ca87f5d8c
24 changed files with 5555 additions and 92 deletions

View File

@ -4,13 +4,7 @@ from io import BytesIO, StringIO
from pathlib import Path from pathlib import Path
from typing import Set, Union from typing import Set, Union
from docling_core.types.doc import ( from docling_core.types.doc import DoclingDocument, DocumentOrigin, TableCell, TableData
DoclingDocument,
DocumentOrigin,
GroupLabel,
TableCell,
TableData,
)
from docling.backend.abstract_backend import DeclarativeDocumentBackend from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat from docling.datamodel.base_models import InputFormat
@ -23,27 +17,21 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]): def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
super().__init__(in_doc, path_or_stream) super().__init__(in_doc, path_or_stream)
# Initialize parent for hierarchy # Load content
self.parent = None
self.valid = False
try: try:
if isinstance(self.path_or_stream, BytesIO): if isinstance(self.path_or_stream, BytesIO):
content = self.path_or_stream.getvalue().decode("utf-8") self.content = self.path_or_stream.getvalue().decode("utf-8")
self.csv_data = list(csv.reader(StringIO(content)))
elif isinstance(self.path_or_stream, Path): elif isinstance(self.path_or_stream, Path):
with open(self.path_or_stream, 'r', newline='') as f: with open(self.path_or_stream, "r", newline="") as f:
self.csv_data = list(csv.reader(f)) self.content = f.read()
self.valid = True self.valid = True
except Exception as e: except Exception as e:
self.valid = False
raise RuntimeError( raise RuntimeError(
f"CsvDocumentBackend could not load document with hash {self.document_hash}" f"CsvDocumentBackend could not load document with hash {self.document_hash}"
) from e ) from e
return
def is_valid(self) -> bool: def is_valid(self) -> bool:
_log.info(f"valid: {self.valid}")
return self.valid return self.valid
@classmethod @classmethod
@ -60,6 +48,23 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
return {InputFormat.CSV} return {InputFormat.CSV}
def convert(self) -> DoclingDocument: def convert(self) -> DoclingDocument:
"""
Parses the CSV data into a structured document model.
"""
# Detect CSV dialect
dialect = csv.Sniffer().sniff(self.content)
_log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
if not dialect.delimiter in {",", ";", "\t", "|"}:
raise RuntimeError(
f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
)
# Parce CSV
result = csv.reader(StringIO(self.content), dialect=dialect)
self.csv_data = list(result)
_log.info(f"Detected {len(self.csv_data)} lines")
# Parse the CSV into a structured document model # Parse the CSV into a structured document model
origin = DocumentOrigin( origin = DocumentOrigin(
filename=self.file.name or "file.csv", filename=self.file.name or "file.csv",
@ -70,13 +75,6 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
doc = DoclingDocument(name=self.file.stem or "file.csv", origin=origin) doc = DoclingDocument(name=self.file.stem or "file.csv", origin=origin)
if self.is_valid(): if self.is_valid():
# Create a section for the CSV content
self.parent = doc.add_group(
parent=None,
label=GroupLabel.SECTION,
name="csv content",
)
# Convert CSV data to table # Convert CSV data to table
if self.csv_data: if self.csv_data:
num_rows = len(self.csv_data) num_rows = len(self.csv_data)
@ -104,7 +102,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
) )
table_data.table_cells.append(cell) table_data.table_cells.append(cell)
doc.add_table(data=table_data, parent=self.parent) doc.add_table(data=table_data)
else: else:
raise RuntimeError( raise RuntimeError(
f"Cannot convert doc with {self.document_hash} because the backend failed to init." f"Cannot convert doc with {self.document_hash} because the backend failed to init."

View File

@ -1,6 +1,6 @@
import csv
import logging import logging
import re import re
import csv
from enum import Enum from enum import Enum
from io import BytesIO from io import BytesIO
from pathlib import Path, PurePath from pathlib import Path, PurePath
@ -424,4 +424,4 @@ class _DocumentConversionInput(BaseModel):
except csv.Error: except csv.Error:
return None return None
return None return None

View File

@ -10,11 +10,11 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
from docling.backend.abstract_backend import AbstractDocumentBackend from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.asciidoc_backend import AsciiDocBackend from docling.backend.asciidoc_backend import AsciiDocBackend
from docling.backend.csv_backend import CsvDocumentBackend
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.json.docling_json_backend import DoclingJSONBackend from docling.backend.json.docling_json_backend import DoclingJSONBackend
from docling.backend.md_backend import MarkdownDocumentBackend from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.csv_backend import CsvDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend
@ -66,6 +66,7 @@ class CsvFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = CsvDocumentBackend backend: Type[AbstractDocumentBackend] = CsvDocumentBackend
class ExcelFormatOption(FormatOption): class ExcelFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend

View File

@ -0,0 +1,80 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Conversion of CSV files\n",
"\n",
"This example shows how to convert CSV files to a structured Docling Document.\n",
"\n",
"* Multiple delimiters are supported: `,` `;` `|` `[tab]`\n",
"* Additional CSV dialect settings are detected automatically (e.g. quotes, line separator, escape character)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Example Code"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"from pathlib import Path\n",
"\n",
"from docling.document_converter import DocumentConverter\n",
"\n",
"# Convert CSV to Docling document\n",
"converter = DocumentConverter()\n",
"result = converter.convert(Path(\"../../tests/data/csv/csv-comma.csv\"))\n",
"output = result.document.export_to_markdown()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This code generates the following output:"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"| Index | Customer Id | First Name | Last Name | Company | City | Country | Phone 1 | Phone 2 | Email | Subscription Date | Website |\n",
"|---------|-----------------|--------------|-------------|---------------------------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|\n",
"| 1 | DD37Cf93aecA6Dc | Sheryl | Baxter | Rasmussen Group | East Leonard | Chile | 229.077.5154 | 397.884.0519x718 | zunigavanessa@smith.info | 2020-08-24 | http://www.stephenson.com/ |\n",
"| 2 | 1Ef7b82A4CAAD10 | Preston | Lozano, Dr | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | vmata@colon.com | 2021-04-23 | http://www.hobbs.com/ |\n",
"| 3 | 6F94879bDAfE5a6 | Roy | Berry | Murillo-Perry | Isabelborough | Antigua and Barbuda | +1-539-402-0259 | (496)978-3969x58947 | beckycarr@hogan.com | 2020-03-25 | http://www.lawrence.com/ |\n",
"| 4 | 5Cef8BFA16c5e3c | Linda | Olsen | Dominguez, Mcmillan and Donovan | Bensonview | Dominican Republic | 001-808-617-6467x12895 | +1-813-324-8756 | stanleyblackwell@benson.org | 2020-06-02 | http://www.good-lyons.com/ |\n",
"| 5 | 053d585Ab6b3159 | Joanna | Bender | Martin, Lang and Andrade | West Priscilla | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net | 2021-04-17 | https://goodwin-ingram.com/ |"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "docling-TtEIaPrw-py3.12",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.8"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

View File

@ -1,32 +0,0 @@
import json
import logging
from pathlib import Path
import yaml
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter
logging.basicConfig(level=logging.DEBUG)
def main():
# Convert CSV to Docling document:
source = "https://drive.google.com/uc?id=1zO8ekHWx9U7mrbx_0Hoxxu6od7uxJqWw&export=download"
converter = DocumentConverter()
result = converter.convert(source)
# Export Docling document:
out_path = Path("scratch")
print(f"Document converted." f"\nSaving output to: {str(out_path)}")
with (out_path / f"customers-100.md").open("w") as fp:
fp.write(result.document.export_to_markdown())
with (out_path / f"customers-100.json").open("w") as fp:
fp.write(json.dumps(result.document.export_to_dict()))
with (out_path / f"customers-100.yaml").open("w") as fp:
fp.write(yaml.safe_dump(result.document.export_to_dict()))
if __name__ == "__main__":
main()

View File

@ -80,6 +80,7 @@ nav:
- "RapidOCR with custom OCR models": examples/rapidocr_with_custom_models.py - "RapidOCR with custom OCR models": examples/rapidocr_with_custom_models.py
- "Accelerator options": examples/run_with_accelerator.py - "Accelerator options": examples/run_with_accelerator.py
- "Simple translation": examples/translate.py - "Simple translation": examples/translate.py
- examples/backend_csv.ipynb
- examples/backend_xml_rag.ipynb - examples/backend_xml_rag.ipynb
- ✂️ Chunking: - ✂️ Chunking:
- examples/hybrid_chunking.ipynb - examples/hybrid_chunking.ipynb

View File

@ -1,6 +1,6 @@
Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website
1,DD37Cf93aecA6Dc,Sheryl,Baxter,Rasmussen Group,East Leonard,Chile,229.077.5154,397.884.0519x718,zunigavanessa@smith.info,2020-08-24,http://www.stephenson.com/ 1,DD37Cf93aecA6Dc,Sheryl,Baxter,Rasmussen Group,East Leonard,Chile,229.077.5154,397.884.0519x718,zunigavanessa@smith.info,2020-08-24,http://www.stephenson.com/
2,1Ef7b82A4CAAD10,Preston,Lozano,Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/ 2,1Ef7b82A4CAAD10,Preston,"Lozano, Dr",Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/
3,6F94879bDAfE5a6,Roy,Berry,Murillo-Perry,Isabelborough,Antigua and Barbuda,+1-539-402-0259,(496)978-3969x58947,beckycarr@hogan.com,2020-03-25,http://www.lawrence.com/ 3,6F94879bDAfE5a6,Roy,Berry,Murillo-Perry,Isabelborough,Antigua and Barbuda,+1-539-402-0259,(496)978-3969x58947,beckycarr@hogan.com,2020-03-25,http://www.lawrence.com/
4,5Cef8BFA16c5e3c,Linda,Olsen,"Dominguez, Mcmillan and Donovan",Bensonview,Dominican Republic,001-808-617-6467x12895,+1-813-324-8756,stanleyblackwell@benson.org,2020-06-02,http://www.good-lyons.com/ 4,5Cef8BFA16c5e3c,Linda,Olsen,"Dominguez, Mcmillan and Donovan",Bensonview,Dominican Republic,001-808-617-6467x12895,+1-813-324-8756,stanleyblackwell@benson.org,2020-06-02,http://www.good-lyons.com/
5,053d585Ab6b3159,Joanna,Bender,"Martin, Lang and Andrade",West Priscilla,Slovakia (Slovak Republic),001-234-203-0635x76146,001-199-446-3860x3486,colinalvarado@miles.net,2021-04-17,https://goodwin-ingram.com/ 5,053d585Ab6b3159,Joanna,Bender,"Martin, Lang and Andrade",West Priscilla,Slovakia (Slovak Republic),001-234-203-0635x76146,001-199-446-3860x3486,colinalvarado@miles.net,2021-04-17,https://goodwin-ingram.com/
1 Index Customer Id First Name Last Name Company City Country Phone 1 Phone 2 Email Subscription Date Website
2 1 DD37Cf93aecA6Dc Sheryl Baxter Rasmussen Group East Leonard Chile 229.077.5154 397.884.0519x718 zunigavanessa@smith.info 2020-08-24 http://www.stephenson.com/
3 2 1Ef7b82A4CAAD10 Preston Lozano Lozano, Dr Vega-Gentry East Jimmychester Djibouti 5153435776 686-620-1820x944 vmata@colon.com 2021-04-23 http://www.hobbs.com/
4 3 6F94879bDAfE5a6 Roy Berry Murillo-Perry Isabelborough Antigua and Barbuda +1-539-402-0259 (496)978-3969x58947 beckycarr@hogan.com 2020-03-25 http://www.lawrence.com/
5 4 5Cef8BFA16c5e3c Linda Olsen Dominguez, Mcmillan and Donovan Bensonview Dominican Republic 001-808-617-6467x12895 +1-813-324-8756 stanleyblackwell@benson.org 2020-06-02 http://www.good-lyons.com/
6 5 053d585Ab6b3159 Joanna Bender Martin, Lang and Andrade West Priscilla Slovakia (Slovak Republic) 001-234-203-0635x76146 001-199-446-3860x3486 colinalvarado@miles.net 2021-04-17 https://goodwin-ingram.com/

View File

@ -0,0 +1,6 @@
Index|Customer Id|First Name|Last Name|Company|City|Country|Phone 1|Phone 2|Email|Subscription Date|Website
1|DD37Cf93aecA6Dc|Sheryl|Baxter|Rasmussen Group|East Leonard|Chile|229.077.5154|397.884.0519x718|zunigavanessa@smith.info|2020-08-24|http://www.stephenson.com/
2|1Ef7b82A4CAAD10|Preston|Lozano|Vega-Gentry|East Jimmychester|Djibouti|5153435776|686-620-1820x944|vmata@colon.com|2021-04-23|http://www.hobbs.com/
3|6F94879bDAfE5a6|Roy|Berry|Murillo-Perry|Isabelborough|Antigua and Barbuda|+1-539-402-0259|(496)978-3969x58947|beckycarr@hogan.com|2020-03-25|http://www.lawrence.com/
4|5Cef8BFA16c5e3c|Linda|Olsen|"Dominguez|Mcmillan and Donovan"|Bensonview|Dominican Republic|001-808-617-6467x12895|+1-813-324-8756|stanleyblackwell@benson.org|2020-06-02|http://www.good-lyons.com/
5|053d585Ab6b3159|Joanna|Bender|"Martin|Lang and Andrade"|West Priscilla|Slovakia (Slovak Republic)|001-234-203-0635x76146|001-199-446-3860x3486|colinalvarado@miles.net|2021-04-17|https://goodwin-ingram.com/
1 Index Customer Id First Name Last Name Company City Country Phone 1 Phone 2 Email Subscription Date Website
2 1 DD37Cf93aecA6Dc Sheryl Baxter Rasmussen Group East Leonard Chile 229.077.5154 397.884.0519x718 zunigavanessa@smith.info 2020-08-24 http://www.stephenson.com/
3 2 1Ef7b82A4CAAD10 Preston Lozano Vega-Gentry East Jimmychester Djibouti 5153435776 686-620-1820x944 vmata@colon.com 2021-04-23 http://www.hobbs.com/
4 3 6F94879bDAfE5a6 Roy Berry Murillo-Perry Isabelborough Antigua and Barbuda +1-539-402-0259 (496)978-3969x58947 beckycarr@hogan.com 2020-03-25 http://www.lawrence.com/
5 4 5Cef8BFA16c5e3c Linda Olsen Dominguez|Mcmillan and Donovan Bensonview Dominican Republic 001-808-617-6467x12895 +1-813-324-8756 stanleyblackwell@benson.org 2020-06-02 http://www.good-lyons.com/
6 5 053d585Ab6b3159 Joanna Bender Martin|Lang and Andrade West Priscilla Slovakia (Slovak Republic) 001-234-203-0635x76146 001-199-446-3860x3486 colinalvarado@miles.net 2021-04-17 https://goodwin-ingram.com/

View File

@ -0,0 +1,6 @@
Index;Customer Id;First Name;Last Name;Company;City;Country;Phone 1;Phone 2;Email;Subscription Date;Website
1;DD37Cf93aecA6Dc;Sheryl;Baxter;Rasmussen Group;East Leonard;Chile;229.077.5154;397.884.0519x718;zunigavanessa@smith.info;2020-08-24;http://www.stephenson.com/
2;1Ef7b82A4CAAD10;Preston;Lozano;Vega-Gentry;East Jimmychester;Djibouti;5153435776;686-620-1820x944;vmata@colon.com;2021-04-23;http://www.hobbs.com/
3;6F94879bDAfE5a6;Roy;Berry;Murillo-Perry;Isabelborough;Antigua and Barbuda;+1-539-402-0259;(496)978-3969x58947;beckycarr@hogan.com;2020-03-25;http://www.lawrence.com/
4;5Cef8BFA16c5e3c;Linda;Olsen;"Dominguez;Mcmillan and Donovan";Bensonview;Dominican Republic;001-808-617-6467x12895;+1-813-324-8756;stanleyblackwell@benson.org;2020-06-02;http://www.good-lyons.com/
5;053d585Ab6b3159;Joanna;Bender;"Martin;Lang and Andrade";West Priscilla;Slovakia (Slovak Republic);001-234-203-0635x76146;001-199-446-3860x3486;colinalvarado@miles.net;2021-04-17;https://goodwin-ingram.com/
1 Index Customer Id First Name Last Name Company City Country Phone 1 Phone 2 Email Subscription Date Website
2 1 DD37Cf93aecA6Dc Sheryl Baxter Rasmussen Group East Leonard Chile 229.077.5154 397.884.0519x718 zunigavanessa@smith.info 2020-08-24 http://www.stephenson.com/
3 2 1Ef7b82A4CAAD10 Preston Lozano Vega-Gentry East Jimmychester Djibouti 5153435776 686-620-1820x944 vmata@colon.com 2021-04-23 http://www.hobbs.com/
4 3 6F94879bDAfE5a6 Roy Berry Murillo-Perry Isabelborough Antigua and Barbuda +1-539-402-0259 (496)978-3969x58947 beckycarr@hogan.com 2020-03-25 http://www.lawrence.com/
5 4 5Cef8BFA16c5e3c Linda Olsen Dominguez;Mcmillan and Donovan Bensonview Dominican Republic 001-808-617-6467x12895 +1-813-324-8756 stanleyblackwell@benson.org 2020-06-02 http://www.good-lyons.com/
6 5 053d585Ab6b3159 Joanna Bender Martin;Lang and Andrade West Priscilla Slovakia (Slovak Republic) 001-234-203-0635x76146 001-199-446-3860x3486 colinalvarado@miles.net 2021-04-17 https://goodwin-ingram.com/

View File

@ -0,0 +1,6 @@
Index Customer Id First Name Last Name Company City Country Phone 1 Phone 2 Email Subscription Date Website
1 DD37Cf93aecA6Dc Sheryl Baxter Rasmussen Group East Leonard Chile 229.077.5154 397.884.0519x718 zunigavanessa@smith.info 2020-08-24 http://www.stephenson.com/
2 1Ef7b82A4CAAD10 Preston Lozano Vega-Gentry East Jimmychester Djibouti 5153435776 686-620-1820x944 vmata@colon.com 2021-04-23 http://www.hobbs.com/
3 6F94879bDAfE5a6 Roy Berry Murillo-Perry Isabelborough Antigua and Barbuda +1-539-402-0259 (496)978-3969x58947 beckycarr@hogan.com 2020-03-25 http://www.lawrence.com/
4 5Cef8BFA16c5e3c Linda Olsen "Dominguez Mcmillan and Donovan" Bensonview Dominican Republic 001-808-617-6467x12895 +1-813-324-8756 stanleyblackwell@benson.org 2020-06-02 http://www.good-lyons.com/
5 053d585Ab6b3159 Joanna Bender "Martin Lang and Andrade" West Priscilla Slovakia (Slovak Republic) 001-234-203-0635x76146 001-199-446-3860x3486 colinalvarado@miles.net 2021-04-17 https://goodwin-ingram.com/
1 Index Customer Id First Name Last Name Company City Country Phone 1 Phone 2 Email Subscription Date Website
2 1 DD37Cf93aecA6Dc Sheryl Baxter Rasmussen Group East Leonard Chile 229.077.5154 397.884.0519x718 zunigavanessa@smith.info 2020-08-24 http://www.stephenson.com/
3 2 1Ef7b82A4CAAD10 Preston Lozano Vega-Gentry East Jimmychester Djibouti 5153435776 686-620-1820x944 vmata@colon.com 2021-04-23 http://www.hobbs.com/
4 3 6F94879bDAfE5a6 Roy Berry Murillo-Perry Isabelborough Antigua and Barbuda +1-539-402-0259 (496)978-3969x58947 beckycarr@hogan.com 2020-03-25 http://www.lawrence.com/
5 4 5Cef8BFA16c5e3c Linda Olsen Dominguez Mcmillan and Donovan Bensonview Dominican Republic 001-808-617-6467x12895 +1-813-324-8756 stanleyblackwell@benson.org 2020-06-02 http://www.good-lyons.com/
6 5 053d585Ab6b3159 Joanna Bender Martin Lang and Andrade West Priscilla Slovakia (Slovak Republic) 001-234-203-0635x76146 001-199-446-3860x3486 colinalvarado@miles.net 2021-04-17 https://goodwin-ingram.com/

View File

@ -0,0 +1,2 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: table with [6x12]

View File

@ -1,11 +1,11 @@
{ {
"schema_name": "DoclingDocument", "schema_name": "DoclingDocument",
"version": "1.1.0", "version": "1.1.0",
"name": "test-01", "name": "csv-comma",
"origin": { "origin": {
"mimetype": "text/csv", "mimetype": "text/csv",
"binary_hash": 7076108075619672109, "binary_hash": 297933764223584292,
"filename": "test-01.csv" "filename": "csv-comma.csv"
}, },
"furniture": { "furniture": {
"self_ref": "#/furniture", "self_ref": "#/furniture",
@ -18,36 +18,21 @@
"self_ref": "#/body", "self_ref": "#/body",
"children": [ "children": [
{ {
"$ref": "#/groups/0" "$ref": "#/tables/0"
} }
], ],
"content_layer": "body", "content_layer": "body",
"name": "_root_", "name": "_root_",
"label": "unspecified" "label": "unspecified"
}, },
"groups": [ "groups": [],
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/tables/0"
}
],
"content_layer": "body",
"name": "csv content",
"label": "section"
}
],
"texts": [], "texts": [],
"pictures": [], "pictures": [],
"tables": [ "tables": [
{ {
"self_ref": "#/tables/0", "self_ref": "#/tables/0",
"parent": { "parent": {
"$ref": "#/groups/0" "$ref": "#/body"
}, },
"children": [], "children": [],
"content_layer": "body", "content_layer": "body",
@ -389,7 +374,7 @@
"end_row_offset_idx": 3, "end_row_offset_idx": 3,
"start_col_offset_idx": 3, "start_col_offset_idx": 3,
"end_col_offset_idx": 4, "end_col_offset_idx": 4,
"text": "Lozano", "text": "Lozano, Dr",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false
@ -1262,7 +1247,7 @@
"end_row_offset_idx": 3, "end_row_offset_idx": 3,
"start_col_offset_idx": 3, "start_col_offset_idx": 3,
"end_col_offset_idx": 4, "end_col_offset_idx": 4,
"text": "Lozano", "text": "Lozano, Dr",
"column_header": false, "column_header": false,
"row_header": false, "row_header": false,
"row_section": false "row_section": false

View File

@ -1,7 +1,7 @@
| Index | Customer Id | First Name | Last Name | Company | City | Country | Phone 1 | Phone 2 | Email | Subscription Date | Website | | Index | Customer Id | First Name | Last Name | Company | City | Country | Phone 1 | Phone 2 | Email | Subscription Date | Website |
|---------|-----------------|--------------|-------------|---------------------------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------| |---------|-----------------|--------------|-------------|---------------------------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|
| 1 | DD37Cf93aecA6Dc | Sheryl | Baxter | Rasmussen Group | East Leonard | Chile | 229.077.5154 | 397.884.0519x718 | zunigavanessa@smith.info | 2020-08-24 | http://www.stephenson.com/ | | 1 | DD37Cf93aecA6Dc | Sheryl | Baxter | Rasmussen Group | East Leonard | Chile | 229.077.5154 | 397.884.0519x718 | zunigavanessa@smith.info | 2020-08-24 | http://www.stephenson.com/ |
| 2 | 1Ef7b82A4CAAD10 | Preston | Lozano | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | vmata@colon.com | 2021-04-23 | http://www.hobbs.com/ | | 2 | 1Ef7b82A4CAAD10 | Preston | Lozano, Dr | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | vmata@colon.com | 2021-04-23 | http://www.hobbs.com/ |
| 3 | 6F94879bDAfE5a6 | Roy | Berry | Murillo-Perry | Isabelborough | Antigua and Barbuda | +1-539-402-0259 | (496)978-3969x58947 | beckycarr@hogan.com | 2020-03-25 | http://www.lawrence.com/ | | 3 | 6F94879bDAfE5a6 | Roy | Berry | Murillo-Perry | Isabelborough | Antigua and Barbuda | +1-539-402-0259 | (496)978-3969x58947 | beckycarr@hogan.com | 2020-03-25 | http://www.lawrence.com/ |
| 4 | 5Cef8BFA16c5e3c | Linda | Olsen | Dominguez, Mcmillan and Donovan | Bensonview | Dominican Republic | 001-808-617-6467x12895 | +1-813-324-8756 | stanleyblackwell@benson.org | 2020-06-02 | http://www.good-lyons.com/ | | 4 | 5Cef8BFA16c5e3c | Linda | Olsen | Dominguez, Mcmillan and Donovan | Bensonview | Dominican Republic | 001-808-617-6467x12895 | +1-813-324-8756 | stanleyblackwell@benson.org | 2020-06-02 | http://www.good-lyons.com/ |
| 5 | 053d585Ab6b3159 | Joanna | Bender | Martin, Lang and Andrade | West Priscilla | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net | 2021-04-17 | https://goodwin-ingram.com/ | | 5 | 053d585Ab6b3159 | Joanna | Bender | Martin, Lang and Andrade | West Priscilla | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net | 2021-04-17 | https://goodwin-ingram.com/ |

View File

@ -0,0 +1,2 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: table with [6x12]

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,7 @@
| Index | Customer Id | First Name | Last Name | Company | City | Country | Phone 1 | Phone 2 | Email | Subscription Date | Website |
|---------|-----------------|--------------|-------------|--------------------------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|
| 1 | DD37Cf93aecA6Dc | Sheryl | Baxter | Rasmussen Group | East Leonard | Chile | 229.077.5154 | 397.884.0519x718 | zunigavanessa@smith.info | 2020-08-24 | http://www.stephenson.com/ |
| 2 | 1Ef7b82A4CAAD10 | Preston | Lozano | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | vmata@colon.com | 2021-04-23 | http://www.hobbs.com/ |
| 3 | 6F94879bDAfE5a6 | Roy | Berry | Murillo-Perry | Isabelborough | Antigua and Barbuda | +1-539-402-0259 | (496)978-3969x58947 | beckycarr@hogan.com | 2020-03-25 | http://www.lawrence.com/ |
| 4 | 5Cef8BFA16c5e3c | Linda | Olsen | Dominguez|Mcmillan and Donovan | Bensonview | Dominican Republic | 001-808-617-6467x12895 | +1-813-324-8756 | stanleyblackwell@benson.org | 2020-06-02 | http://www.good-lyons.com/ |
| 5 | 053d585Ab6b3159 | Joanna | Bender | Martin|Lang and Andrade | West Priscilla | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net | 2021-04-17 | https://goodwin-ingram.com/ |

View File

@ -0,0 +1,2 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: table with [6x12]

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,7 @@
| Index | Customer Id | First Name | Last Name | Company | City | Country | Phone 1 | Phone 2 | Email | Subscription Date | Website |
|---------|-----------------|--------------|-------------|--------------------------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|
| 1 | DD37Cf93aecA6Dc | Sheryl | Baxter | Rasmussen Group | East Leonard | Chile | 229.077.5154 | 397.884.0519x718 | zunigavanessa@smith.info | 2020-08-24 | http://www.stephenson.com/ |
| 2 | 1Ef7b82A4CAAD10 | Preston | Lozano | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | vmata@colon.com | 2021-04-23 | http://www.hobbs.com/ |
| 3 | 6F94879bDAfE5a6 | Roy | Berry | Murillo-Perry | Isabelborough | Antigua and Barbuda | +1-539-402-0259 | (496)978-3969x58947 | beckycarr@hogan.com | 2020-03-25 | http://www.lawrence.com/ |
| 4 | 5Cef8BFA16c5e3c | Linda | Olsen | Dominguez;Mcmillan and Donovan | Bensonview | Dominican Republic | 001-808-617-6467x12895 | +1-813-324-8756 | stanleyblackwell@benson.org | 2020-06-02 | http://www.good-lyons.com/ |
| 5 | 053d585Ab6b3159 | Joanna | Bender | Martin;Lang and Andrade | West Priscilla | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net | 2021-04-17 | https://goodwin-ingram.com/ |

View File

@ -0,0 +1,2 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: table with [6x12]

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,7 @@
| Index | Customer Id | First Name | Last Name | Company | City | Country | Phone 1 | Phone 2 | Email | Subscription Date | Website |
|---------|-----------------|--------------|-------------|-----------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|
| 1 | DD37Cf93aecA6Dc | Sheryl | Baxter | Rasmussen Group | East Leonard | Chile | 229.077.5154 | 397.884.0519x718 | zunigavanessa@smith.info | 2020-08-24 | http://www.stephenson.com/ |
| 2 | 1Ef7b82A4CAAD10 | Preston | Lozano | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | vmata@colon.com | 2021-04-23 | http://www.hobbs.com/ |
| 3 | 6F94879bDAfE5a6 | Roy | Berry | Murillo-Perry | Isabelborough | Antigua and Barbuda | +1-539-402-0259 | (496)978-3969x58947 | beckycarr@hogan.com | 2020-03-25 | http://www.lawrence.com/ |
| 4 | 5Cef8BFA16c5e3c | Linda | Olsen | Dominguez Mcmillan and Donovan | Bensonview | Dominican Republic | 001-808-617-6467x12895 | +1-813-324-8756 | stanleyblackwell@benson.org | 2020-06-02 | http://www.good-lyons.com/ |
| 5 | 053d585Ab6b3159 | Joanna | Bender | Martin Lang and Andrade | West Priscilla | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net | 2021-04-17 | https://goodwin-ingram.com/ |

View File

@ -1,3 +0,0 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group csv content
item-2 at level 2: table with [6x12]

View File

@ -50,9 +50,7 @@ def test_e2e_csv_conversions():
for csv_path in csv_paths: for csv_path in csv_paths:
print(f"converting {csv_path}") print(f"converting {csv_path}")
gt_path = ( gt_path = csv_path.parent.parent / "groundtruth" / "docling_v2" / csv_path.name
csv_path.parent.parent / "groundtruth" / "docling_v2" / csv_path.name
)
conv_result: ConversionResult = converter.convert(csv_path) conv_result: ConversionResult = converter.convert(csv_path)