mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
feat: Add support for CSV input with new backend to transform CSV files to DoclingDocument (#945)
* feat: Implement csv backend and format detection
  Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>
* test: Implement csv parsing and format tests
  Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>
* docs: Add example and CSV format documentation
  Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>
* feat: Add support for various CSV dialects and update documentation
  Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>
* feat: Add validation for delimiters and tests for inconsistent csv files
  Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>
---------
Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>
This commit is contained in:
committed by
GitHub
parent
7493d5b01f
commit
00d9405b0a
@@ -39,6 +39,7 @@ class InputFormat(str, Enum):
|
||||
PDF = "pdf"
|
||||
ASCIIDOC = "asciidoc"
|
||||
MD = "md"
|
||||
CSV = "csv"
|
||||
XLSX = "xlsx"
|
||||
XML_USPTO = "xml_uspto"
|
||||
JSON_DOCLING = "json_docling"
|
||||
@@ -61,6 +62,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
|
||||
InputFormat.XML_PUBMED: ["xml", "nxml"],
|
||||
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
|
||||
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
|
||||
InputFormat.CSV: ["csv"],
|
||||
InputFormat.XLSX: ["xlsx"],
|
||||
InputFormat.XML_USPTO: ["xml", "txt"],
|
||||
InputFormat.JSON_DOCLING: ["json"],
|
||||
@@ -88,6 +90,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
|
||||
InputFormat.PDF: ["application/pdf"],
|
||||
InputFormat.ASCIIDOC: ["text/asciidoc"],
|
||||
InputFormat.MD: ["text/markdown", "text/x-markdown"],
|
||||
InputFormat.CSV: ["text/csv"],
|
||||
InputFormat.XLSX: [
|
||||
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
||||
],
|
||||
|
||||
Reference in New Issue
Block a user