feat: Add support for CSV input with new backend to transform CSV files to DoclingDocument (#945)

* feat: Implement csv backend and format detection

Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>

* test: Implement csv parsing and format tests

Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>

* docs: Add example and CSV format documentation

Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>

* feat: Add support for various CSV dialects and update documentation

Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>

* feat: Add validation for delimiters and tests for inconsistent csv files

Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>

---------

Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>
This commit is contained in:
Tobias Strebitzer
2025-02-14 15:55:09 +08:00
committed by GitHub
parent 7493d5b01f
commit 00d9405b0a
42 changed files with 9885 additions and 0 deletions

View File

@@ -39,6 +39,7 @@ class InputFormat(str, Enum):
PDF = "pdf"
ASCIIDOC = "asciidoc"
MD = "md"
CSV = "csv"
XLSX = "xlsx"
XML_USPTO = "xml_uspto"
JSON_DOCLING = "json_docling"
@@ -61,6 +62,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.XML_PUBMED: ["xml", "nxml"],
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
InputFormat.CSV: ["csv"],
InputFormat.XLSX: ["xlsx"],
InputFormat.XML_USPTO: ["xml", "txt"],
InputFormat.JSON_DOCLING: ["json"],
@@ -88,6 +90,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
InputFormat.PDF: ["application/pdf"],
InputFormat.ASCIIDOC: ["text/asciidoc"],
InputFormat.MD: ["text/markdown", "text/x-markdown"],
InputFormat.CSV: ["text/csv"],
InputFormat.XLSX: [
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
],