mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
feat: Add support for CSV input with new backend to transform CSV files to DoclingDocument (#945)
* feat: Implement csv backend and format detection
  Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>
* test: Implement csv parsing and format tests
  Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>
* docs: Add example and CSV format documentation
  Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>
* feat: Add support for various CSV dialects and update documentation
  Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>
* feat: Add validation for delimiters and tests for inconsistent csv files
  Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>
---------
Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>
This commit is contained in:
committed by
GitHub
parent
7493d5b01f
commit
00d9405b0a
@@ -108,6 +108,15 @@ def test_guess_format(tmp_path):
|
||||
doc_path = Path("./tests/data/md/wiki.md")
|
||||
assert dci._guess_format(doc_path) == InputFormat.MD
|
||||
|
||||
# Valid CSV
|
||||
buf = BytesIO(Path("./tests/data/csv/csv-comma.csv").open("rb").read())
|
||||
stream = DocumentStream(name="csv-comma.csv", stream=buf)
|
||||
assert dci._guess_format(stream) == InputFormat.CSV
|
||||
stream = DocumentStream(name="test-comma", stream=buf)
|
||||
assert dci._guess_format(stream) == InputFormat.CSV
|
||||
doc_path = Path("./tests/data/csv/csv-comma.csv")
|
||||
assert dci._guess_format(doc_path) == InputFormat.CSV
|
||||
|
||||
# Valid XML USPTO patent
|
||||
buf = BytesIO(Path("./tests/data/uspto/ipa20110039701.xml").open("rb").read())
|
||||
stream = DocumentStream(name="ipa20110039701.xml", stream=buf)
|
||||
|
||||
Reference in New Issue
Block a user