mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-31 14:34:40 +00:00
feat: Add support for various CSV dialects and update documentation
Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>
This commit is contained in:
parent
79eed3ef08
commit
1ca87f5d8c
@ -4,13 +4,7 @@ from io import BytesIO, StringIO
|
||||
from pathlib import Path
|
||||
from typing import Set, Union
|
||||
|
||||
from docling_core.types.doc import (
|
||||
DoclingDocument,
|
||||
DocumentOrigin,
|
||||
GroupLabel,
|
||||
TableCell,
|
||||
TableData,
|
||||
)
|
||||
from docling_core.types.doc import DoclingDocument, DocumentOrigin, TableCell, TableData
|
||||
|
||||
from docling.backend.abstract_backend import DeclarativeDocumentBackend
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
@ -23,27 +17,21 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
|
||||
def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
|
||||
super().__init__(in_doc, path_or_stream)
|
||||
|
||||
# Initialize parent for hierarchy
|
||||
self.parent = None
|
||||
self.valid = False
|
||||
|
||||
# Load content
|
||||
try:
|
||||
if isinstance(self.path_or_stream, BytesIO):
|
||||
content = self.path_or_stream.getvalue().decode("utf-8")
|
||||
self.csv_data = list(csv.reader(StringIO(content)))
|
||||
self.content = self.path_or_stream.getvalue().decode("utf-8")
|
||||
elif isinstance(self.path_or_stream, Path):
|
||||
with open(self.path_or_stream, 'r', newline='') as f:
|
||||
self.csv_data = list(csv.reader(f))
|
||||
|
||||
with open(self.path_or_stream, "r", newline="") as f:
|
||||
self.content = f.read()
|
||||
self.valid = True
|
||||
except Exception as e:
|
||||
self.valid = False
|
||||
raise RuntimeError(
|
||||
f"CsvDocumentBackend could not load document with hash {self.document_hash}"
|
||||
) from e
|
||||
return
|
||||
|
||||
def is_valid(self) -> bool:
|
||||
_log.info(f"valid: {self.valid}")
|
||||
return self.valid
|
||||
|
||||
@classmethod
|
||||
@ -60,6 +48,23 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
|
||||
return {InputFormat.CSV}
|
||||
|
||||
def convert(self) -> DoclingDocument:
|
||||
"""
|
||||
Parses the CSV data into a structured document model.
|
||||
"""
|
||||
|
||||
# Detect CSV dialect
|
||||
dialect = csv.Sniffer().sniff(self.content)
|
||||
_log.info(f'Parsing CSV with delimiter: "{dialect.delimiter}"')
|
||||
if not dialect.delimiter in {",", ";", "\t", "|"}:
|
||||
raise RuntimeError(
|
||||
f"Cannot convert csv with unknown delimiter {dialect.delimiter}."
|
||||
)
|
||||
|
||||
# Parse CSV
|
||||
result = csv.reader(StringIO(self.content), dialect=dialect)
|
||||
self.csv_data = list(result)
|
||||
_log.info(f"Detected {len(self.csv_data)} lines")
|
||||
|
||||
# Parse the CSV into a structured document model
|
||||
origin = DocumentOrigin(
|
||||
filename=self.file.name or "file.csv",
|
||||
@ -70,13 +75,6 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
|
||||
doc = DoclingDocument(name=self.file.stem or "file.csv", origin=origin)
|
||||
|
||||
if self.is_valid():
|
||||
# Create a section for the CSV content
|
||||
self.parent = doc.add_group(
|
||||
parent=None,
|
||||
label=GroupLabel.SECTION,
|
||||
name="csv content",
|
||||
)
|
||||
|
||||
# Convert CSV data to table
|
||||
if self.csv_data:
|
||||
num_rows = len(self.csv_data)
|
||||
@ -104,7 +102,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
|
||||
)
|
||||
table_data.table_cells.append(cell)
|
||||
|
||||
doc.add_table(data=table_data, parent=self.parent)
|
||||
doc.add_table(data=table_data)
|
||||
else:
|
||||
raise RuntimeError(
|
||||
f"Cannot convert doc with {self.document_hash} because the backend failed to init."
|
||||
|
@ -1,6 +1,6 @@
|
||||
import csv
|
||||
import logging
|
||||
import re
|
||||
import csv
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
from pathlib import Path, PurePath
|
||||
@ -424,4 +424,4 @@ class _DocumentConversionInput(BaseModel):
|
||||
except csv.Error:
|
||||
return None
|
||||
|
||||
return None
|
||||
return None
|
||||
|
@ -10,11 +10,11 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.asciidoc_backend import AsciiDocBackend
|
||||
from docling.backend.csv_backend import CsvDocumentBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.backend.json.docling_json_backend import DoclingJSONBackend
|
||||
from docling.backend.md_backend import MarkdownDocumentBackend
|
||||
from docling.backend.csv_backend import CsvDocumentBackend
|
||||
from docling.backend.msexcel_backend import MsExcelDocumentBackend
|
||||
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
|
||||
from docling.backend.msword_backend import MsWordDocumentBackend
|
||||
@ -66,6 +66,7 @@ class CsvFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = CsvDocumentBackend
|
||||
|
||||
|
||||
class ExcelFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
|
||||
|
80
docs/examples/backend_csv.ipynb
Normal file
80
docs/examples/backend_csv.ipynb
Normal file
@ -0,0 +1,80 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Conversion of CSV files\n",
|
||||
"\n",
|
||||
"This example shows how to convert CSV files to a structured Docling Document.\n",
|
||||
"\n",
|
||||
"* Multiple delimiters are supported: `,` `;` `|` `[tab]`\n",
|
||||
"* Additional CSV dialect settings are detected automatically (e.g. quotes, line separator, escape character)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Example Code"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 59,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from pathlib import Path\n",
|
||||
"\n",
|
||||
"from docling.document_converter import DocumentConverter\n",
|
||||
"\n",
|
||||
"# Convert CSV to Docling document\n",
|
||||
"converter = DocumentConverter()\n",
|
||||
"result = converter.convert(Path(\"../../tests/data/csv/csv-comma.csv\"))\n",
|
||||
"output = result.document.export_to_markdown()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"This code generates the following output:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"| Index | Customer Id | First Name | Last Name | Company | City | Country | Phone 1 | Phone 2 | Email | Subscription Date | Website |\n",
|
||||
"|---------|-----------------|--------------|-------------|---------------------------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|\n",
|
||||
"| 1 | DD37Cf93aecA6Dc | Sheryl | Baxter | Rasmussen Group | East Leonard | Chile | 229.077.5154 | 397.884.0519x718 | zunigavanessa@smith.info | 2020-08-24 | http://www.stephenson.com/ |\n",
|
||||
"| 2 | 1Ef7b82A4CAAD10 | Preston | Lozano, Dr | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | vmata@colon.com | 2021-04-23 | http://www.hobbs.com/ |\n",
|
||||
"| 3 | 6F94879bDAfE5a6 | Roy | Berry | Murillo-Perry | Isabelborough | Antigua and Barbuda | +1-539-402-0259 | (496)978-3969x58947 | beckycarr@hogan.com | 2020-03-25 | http://www.lawrence.com/ |\n",
|
||||
"| 4 | 5Cef8BFA16c5e3c | Linda | Olsen | Dominguez, Mcmillan and Donovan | Bensonview | Dominican Republic | 001-808-617-6467x12895 | +1-813-324-8756 | stanleyblackwell@benson.org | 2020-06-02 | http://www.good-lyons.com/ |\n",
|
||||
"| 5 | 053d585Ab6b3159 | Joanna | Bender | Martin, Lang and Andrade | West Priscilla | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net | 2021-04-17 | https://goodwin-ingram.com/ |"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "docling-TtEIaPrw-py3.12",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.8"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -1,32 +0,0 @@
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
from docling.datamodel.base_models import InputFormat
|
||||
from docling.document_converter import DocumentConverter
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
def main():
|
||||
# Convert CSV to Docling document:
|
||||
source = "https://drive.google.com/uc?id=1zO8ekHWx9U7mrbx_0Hoxxu6od7uxJqWw&export=download"
|
||||
converter = DocumentConverter()
|
||||
result = converter.convert(source)
|
||||
|
||||
# Export Docling document:
|
||||
out_path = Path("scratch")
|
||||
print(f"Document converted." f"\nSaving output to: {str(out_path)}")
|
||||
with (out_path / f"customers-100.md").open("w") as fp:
|
||||
fp.write(result.document.export_to_markdown())
|
||||
|
||||
with (out_path / f"customers-100.json").open("w") as fp:
|
||||
fp.write(json.dumps(result.document.export_to_dict()))
|
||||
|
||||
with (out_path / f"customers-100.yaml").open("w") as fp:
|
||||
fp.write(yaml.safe_dump(result.document.export_to_dict()))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -80,6 +80,7 @@ nav:
|
||||
- "RapidOCR with custom OCR models": examples/rapidocr_with_custom_models.py
|
||||
- "Accelerator options": examples/run_with_accelerator.py
|
||||
- "Simple translation": examples/translate.py
|
||||
- examples/backend_csv.ipynb
|
||||
- examples/backend_xml_rag.ipynb
|
||||
- ✂️ Chunking:
|
||||
- examples/hybrid_chunking.ipynb
|
||||
|
@ -1,6 +1,6 @@
|
||||
Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website
|
||||
1,DD37Cf93aecA6Dc,Sheryl,Baxter,Rasmussen Group,East Leonard,Chile,229.077.5154,397.884.0519x718,zunigavanessa@smith.info,2020-08-24,http://www.stephenson.com/
|
||||
2,1Ef7b82A4CAAD10,Preston,Lozano,Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/
|
||||
2,1Ef7b82A4CAAD10,Preston,"Lozano, Dr",Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/
|
||||
3,6F94879bDAfE5a6,Roy,Berry,Murillo-Perry,Isabelborough,Antigua and Barbuda,+1-539-402-0259,(496)978-3969x58947,beckycarr@hogan.com,2020-03-25,http://www.lawrence.com/
|
||||
4,5Cef8BFA16c5e3c,Linda,Olsen,"Dominguez, Mcmillan and Donovan",Bensonview,Dominican Republic,001-808-617-6467x12895,+1-813-324-8756,stanleyblackwell@benson.org,2020-06-02,http://www.good-lyons.com/
|
||||
5,053d585Ab6b3159,Joanna,Bender,"Martin, Lang and Andrade",West Priscilla,Slovakia (Slovak Republic),001-234-203-0635x76146,001-199-446-3860x3486,colinalvarado@miles.net,2021-04-17,https://goodwin-ingram.com/
|
|
6
tests/data/csv/csv-pipe.csv
Normal file
6
tests/data/csv/csv-pipe.csv
Normal file
@ -0,0 +1,6 @@
|
||||
Index|Customer Id|First Name|Last Name|Company|City|Country|Phone 1|Phone 2|Email|Subscription Date|Website
|
||||
1|DD37Cf93aecA6Dc|Sheryl|Baxter|Rasmussen Group|East Leonard|Chile|229.077.5154|397.884.0519x718|zunigavanessa@smith.info|2020-08-24|http://www.stephenson.com/
|
||||
2|1Ef7b82A4CAAD10|Preston|Lozano|Vega-Gentry|East Jimmychester|Djibouti|5153435776|686-620-1820x944|vmata@colon.com|2021-04-23|http://www.hobbs.com/
|
||||
3|6F94879bDAfE5a6|Roy|Berry|Murillo-Perry|Isabelborough|Antigua and Barbuda|+1-539-402-0259|(496)978-3969x58947|beckycarr@hogan.com|2020-03-25|http://www.lawrence.com/
|
||||
4|5Cef8BFA16c5e3c|Linda|Olsen|"Dominguez|Mcmillan and Donovan"|Bensonview|Dominican Republic|001-808-617-6467x12895|+1-813-324-8756|stanleyblackwell@benson.org|2020-06-02|http://www.good-lyons.com/
|
||||
5|053d585Ab6b3159|Joanna|Bender|"Martin|Lang and Andrade"|West Priscilla|Slovakia (Slovak Republic)|001-234-203-0635x76146|001-199-446-3860x3486|colinalvarado@miles.net|2021-04-17|https://goodwin-ingram.com/
|
|
6
tests/data/csv/csv-semicolon.csv
Normal file
6
tests/data/csv/csv-semicolon.csv
Normal file
@ -0,0 +1,6 @@
|
||||
Index;Customer Id;First Name;Last Name;Company;City;Country;Phone 1;Phone 2;Email;Subscription Date;Website
|
||||
1;DD37Cf93aecA6Dc;Sheryl;Baxter;Rasmussen Group;East Leonard;Chile;229.077.5154;397.884.0519x718;zunigavanessa@smith.info;2020-08-24;http://www.stephenson.com/
|
||||
2;1Ef7b82A4CAAD10;Preston;Lozano;Vega-Gentry;East Jimmychester;Djibouti;5153435776;686-620-1820x944;vmata@colon.com;2021-04-23;http://www.hobbs.com/
|
||||
3;6F94879bDAfE5a6;Roy;Berry;Murillo-Perry;Isabelborough;Antigua and Barbuda;+1-539-402-0259;(496)978-3969x58947;beckycarr@hogan.com;2020-03-25;http://www.lawrence.com/
|
||||
4;5Cef8BFA16c5e3c;Linda;Olsen;"Dominguez;Mcmillan and Donovan";Bensonview;Dominican Republic;001-808-617-6467x12895;+1-813-324-8756;stanleyblackwell@benson.org;2020-06-02;http://www.good-lyons.com/
|
||||
5;053d585Ab6b3159;Joanna;Bender;"Martin;Lang and Andrade";West Priscilla;Slovakia (Slovak Republic);001-234-203-0635x76146;001-199-446-3860x3486;colinalvarado@miles.net;2021-04-17;https://goodwin-ingram.com/
|
|
6
tests/data/csv/csv-tab.csv
Normal file
6
tests/data/csv/csv-tab.csv
Normal file
@ -0,0 +1,6 @@
|
||||
Index Customer Id First Name Last Name Company City Country Phone 1 Phone 2 Email Subscription Date Website
|
||||
1 DD37Cf93aecA6Dc Sheryl Baxter Rasmussen Group East Leonard Chile 229.077.5154 397.884.0519x718 zunigavanessa@smith.info 2020-08-24 http://www.stephenson.com/
|
||||
2 1Ef7b82A4CAAD10 Preston Lozano Vega-Gentry East Jimmychester Djibouti 5153435776 686-620-1820x944 vmata@colon.com 2021-04-23 http://www.hobbs.com/
|
||||
3 6F94879bDAfE5a6 Roy Berry Murillo-Perry Isabelborough Antigua and Barbuda +1-539-402-0259 (496)978-3969x58947 beckycarr@hogan.com 2020-03-25 http://www.lawrence.com/
|
||||
4 5Cef8BFA16c5e3c Linda Olsen "Dominguez Mcmillan and Donovan" Bensonview Dominican Republic 001-808-617-6467x12895 +1-813-324-8756 stanleyblackwell@benson.org 2020-06-02 http://www.good-lyons.com/
|
||||
5 053d585Ab6b3159 Joanna Bender "Martin Lang and Andrade" West Priscilla Slovakia (Slovak Republic) 001-234-203-0635x76146 001-199-446-3860x3486 colinalvarado@miles.net 2021-04-17 https://goodwin-ingram.com/
|
|
2
tests/data/groundtruth/docling_v2/csv-comma.csv.itxt
Normal file
2
tests/data/groundtruth/docling_v2/csv-comma.csv.itxt
Normal file
@ -0,0 +1,2 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: table with [6x12]
|
@ -1,11 +1,11 @@
|
||||
{
|
||||
"schema_name": "DoclingDocument",
|
||||
"version": "1.1.0",
|
||||
"name": "test-01",
|
||||
"name": "csv-comma",
|
||||
"origin": {
|
||||
"mimetype": "text/csv",
|
||||
"binary_hash": 7076108075619672109,
|
||||
"filename": "test-01.csv"
|
||||
"binary_hash": 297933764223584292,
|
||||
"filename": "csv-comma.csv"
|
||||
},
|
||||
"furniture": {
|
||||
"self_ref": "#/furniture",
|
||||
@ -18,36 +18,21 @@
|
||||
"self_ref": "#/body",
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/groups/0"
|
||||
"$ref": "#/tables/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "_root_",
|
||||
"label": "unspecified"
|
||||
},
|
||||
"groups": [
|
||||
{
|
||||
"self_ref": "#/groups/0",
|
||||
"parent": {
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [
|
||||
{
|
||||
"$ref": "#/tables/0"
|
||||
}
|
||||
],
|
||||
"content_layer": "body",
|
||||
"name": "csv content",
|
||||
"label": "section"
|
||||
}
|
||||
],
|
||||
"groups": [],
|
||||
"texts": [],
|
||||
"pictures": [],
|
||||
"tables": [
|
||||
{
|
||||
"self_ref": "#/tables/0",
|
||||
"parent": {
|
||||
"$ref": "#/groups/0"
|
||||
"$ref": "#/body"
|
||||
},
|
||||
"children": [],
|
||||
"content_layer": "body",
|
||||
@ -389,7 +374,7 @@
|
||||
"end_row_offset_idx": 3,
|
||||
"start_col_offset_idx": 3,
|
||||
"end_col_offset_idx": 4,
|
||||
"text": "Lozano",
|
||||
"text": "Lozano, Dr",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
||||
@ -1262,7 +1247,7 @@
|
||||
"end_row_offset_idx": 3,
|
||||
"start_col_offset_idx": 3,
|
||||
"end_col_offset_idx": 4,
|
||||
"text": "Lozano",
|
||||
"text": "Lozano, Dr",
|
||||
"column_header": false,
|
||||
"row_header": false,
|
||||
"row_section": false
|
@ -1,7 +1,7 @@
|
||||
| Index | Customer Id | First Name | Last Name | Company | City | Country | Phone 1 | Phone 2 | Email | Subscription Date | Website |
|
||||
|---------|-----------------|--------------|-------------|---------------------------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|
|
||||
| 1 | DD37Cf93aecA6Dc | Sheryl | Baxter | Rasmussen Group | East Leonard | Chile | 229.077.5154 | 397.884.0519x718 | zunigavanessa@smith.info | 2020-08-24 | http://www.stephenson.com/ |
|
||||
| 2 | 1Ef7b82A4CAAD10 | Preston | Lozano | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | vmata@colon.com | 2021-04-23 | http://www.hobbs.com/ |
|
||||
| 2 | 1Ef7b82A4CAAD10 | Preston | Lozano, Dr | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | vmata@colon.com | 2021-04-23 | http://www.hobbs.com/ |
|
||||
| 3 | 6F94879bDAfE5a6 | Roy | Berry | Murillo-Perry | Isabelborough | Antigua and Barbuda | +1-539-402-0259 | (496)978-3969x58947 | beckycarr@hogan.com | 2020-03-25 | http://www.lawrence.com/ |
|
||||
| 4 | 5Cef8BFA16c5e3c | Linda | Olsen | Dominguez, Mcmillan and Donovan | Bensonview | Dominican Republic | 001-808-617-6467x12895 | +1-813-324-8756 | stanleyblackwell@benson.org | 2020-06-02 | http://www.good-lyons.com/ |
|
||||
| 5 | 053d585Ab6b3159 | Joanna | Bender | Martin, Lang and Andrade | West Priscilla | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net | 2021-04-17 | https://goodwin-ingram.com/ |
|
2
tests/data/groundtruth/docling_v2/csv-pipe.csv.itxt
Normal file
2
tests/data/groundtruth/docling_v2/csv-pipe.csv.itxt
Normal file
@ -0,0 +1,2 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: table with [6x12]
|
1796
tests/data/groundtruth/docling_v2/csv-pipe.csv.json
Normal file
1796
tests/data/groundtruth/docling_v2/csv-pipe.csv.json
Normal file
File diff suppressed because it is too large
Load Diff
7
tests/data/groundtruth/docling_v2/csv-pipe.csv.md
Normal file
7
tests/data/groundtruth/docling_v2/csv-pipe.csv.md
Normal file
@ -0,0 +1,7 @@
|
||||
| Index | Customer Id | First Name | Last Name | Company | City | Country | Phone 1 | Phone 2 | Email | Subscription Date | Website |
|
||||
|---------|-----------------|--------------|-------------|--------------------------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|
|
||||
| 1 | DD37Cf93aecA6Dc | Sheryl | Baxter | Rasmussen Group | East Leonard | Chile | 229.077.5154 | 397.884.0519x718 | zunigavanessa@smith.info | 2020-08-24 | http://www.stephenson.com/ |
|
||||
| 2 | 1Ef7b82A4CAAD10 | Preston | Lozano | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | vmata@colon.com | 2021-04-23 | http://www.hobbs.com/ |
|
||||
| 3 | 6F94879bDAfE5a6 | Roy | Berry | Murillo-Perry | Isabelborough | Antigua and Barbuda | +1-539-402-0259 | (496)978-3969x58947 | beckycarr@hogan.com | 2020-03-25 | http://www.lawrence.com/ |
|
||||
| 4 | 5Cef8BFA16c5e3c | Linda | Olsen | Dominguez|Mcmillan and Donovan | Bensonview | Dominican Republic | 001-808-617-6467x12895 | +1-813-324-8756 | stanleyblackwell@benson.org | 2020-06-02 | http://www.good-lyons.com/ |
|
||||
| 5 | 053d585Ab6b3159 | Joanna | Bender | Martin|Lang and Andrade | West Priscilla | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net | 2021-04-17 | https://goodwin-ingram.com/ |
|
2
tests/data/groundtruth/docling_v2/csv-semicolon.csv.itxt
Normal file
2
tests/data/groundtruth/docling_v2/csv-semicolon.csv.itxt
Normal file
@ -0,0 +1,2 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: table with [6x12]
|
1796
tests/data/groundtruth/docling_v2/csv-semicolon.csv.json
Normal file
1796
tests/data/groundtruth/docling_v2/csv-semicolon.csv.json
Normal file
File diff suppressed because it is too large
Load Diff
7
tests/data/groundtruth/docling_v2/csv-semicolon.csv.md
Normal file
7
tests/data/groundtruth/docling_v2/csv-semicolon.csv.md
Normal file
@ -0,0 +1,7 @@
|
||||
| Index | Customer Id | First Name | Last Name | Company | City | Country | Phone 1 | Phone 2 | Email | Subscription Date | Website |
|
||||
|---------|-----------------|--------------|-------------|--------------------------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|
|
||||
| 1 | DD37Cf93aecA6Dc | Sheryl | Baxter | Rasmussen Group | East Leonard | Chile | 229.077.5154 | 397.884.0519x718 | zunigavanessa@smith.info | 2020-08-24 | http://www.stephenson.com/ |
|
||||
| 2 | 1Ef7b82A4CAAD10 | Preston | Lozano | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | vmata@colon.com | 2021-04-23 | http://www.hobbs.com/ |
|
||||
| 3 | 6F94879bDAfE5a6 | Roy | Berry | Murillo-Perry | Isabelborough | Antigua and Barbuda | +1-539-402-0259 | (496)978-3969x58947 | beckycarr@hogan.com | 2020-03-25 | http://www.lawrence.com/ |
|
||||
| 4 | 5Cef8BFA16c5e3c | Linda | Olsen | Dominguez;Mcmillan and Donovan | Bensonview | Dominican Republic | 001-808-617-6467x12895 | +1-813-324-8756 | stanleyblackwell@benson.org | 2020-06-02 | http://www.good-lyons.com/ |
|
||||
| 5 | 053d585Ab6b3159 | Joanna | Bender | Martin;Lang and Andrade | West Priscilla | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net | 2021-04-17 | https://goodwin-ingram.com/ |
|
2
tests/data/groundtruth/docling_v2/csv-tab.csv.itxt
Normal file
2
tests/data/groundtruth/docling_v2/csv-tab.csv.itxt
Normal file
@ -0,0 +1,2 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: table with [6x12]
|
1796
tests/data/groundtruth/docling_v2/csv-tab.csv.json
Normal file
1796
tests/data/groundtruth/docling_v2/csv-tab.csv.json
Normal file
File diff suppressed because it is too large
Load Diff
7
tests/data/groundtruth/docling_v2/csv-tab.csv.md
Normal file
7
tests/data/groundtruth/docling_v2/csv-tab.csv.md
Normal file
@ -0,0 +1,7 @@
|
||||
| Index | Customer Id | First Name | Last Name | Company | City | Country | Phone 1 | Phone 2 | Email | Subscription Date | Website |
|
||||
|---------|-----------------|--------------|-------------|-----------------|-------------------|----------------------------|------------------------|-----------------------|-----------------------------|---------------------|-----------------------------|
|
||||
| 1 | DD37Cf93aecA6Dc | Sheryl | Baxter | Rasmussen Group | East Leonard | Chile | 229.077.5154 | 397.884.0519x718 | zunigavanessa@smith.info | 2020-08-24 | http://www.stephenson.com/ |
|
||||
| 2 | 1Ef7b82A4CAAD10 | Preston | Lozano | Vega-Gentry | East Jimmychester | Djibouti | 5153435776 | 686-620-1820x944 | vmata@colon.com | 2021-04-23 | http://www.hobbs.com/ |
|
||||
| 3 | 6F94879bDAfE5a6 | Roy | Berry | Murillo-Perry | Isabelborough | Antigua and Barbuda | +1-539-402-0259 | (496)978-3969x58947 | beckycarr@hogan.com | 2020-03-25 | http://www.lawrence.com/ |
|
||||
| 4 | 5Cef8BFA16c5e3c | Linda | Olsen | Dominguez Mcmillan and Donovan | Bensonview | Dominican Republic | 001-808-617-6467x12895 | +1-813-324-8756 | stanleyblackwell@benson.org | 2020-06-02 | http://www.good-lyons.com/ |
|
||||
| 5 | 053d585Ab6b3159 | Joanna | Bender | Martin Lang and Andrade | West Priscilla | Slovakia (Slovak Republic) | 001-234-203-0635x76146 | 001-199-446-3860x3486 | colinalvarado@miles.net | 2021-04-17 | https://goodwin-ingram.com/ |
|
@ -1,3 +0,0 @@
|
||||
item-0 at level 0: unspecified: group _root_
|
||||
item-1 at level 1: section: group csv content
|
||||
item-2 at level 2: table with [6x12]
|
@ -50,9 +50,7 @@ def test_e2e_csv_conversions():
|
||||
for csv_path in csv_paths:
|
||||
print(f"converting {csv_path}")
|
||||
|
||||
gt_path = (
|
||||
csv_path.parent.parent / "groundtruth" / "docling_v2" / csv_path.name
|
||||
)
|
||||
gt_path = csv_path.parent.parent / "groundtruth" / "docling_v2" / csv_path.name
|
||||
|
||||
conv_result: ConversionResult = converter.convert(csv_path)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user