From 79eed3ef086763b2b55e3c978509ba7ab2f4f21e Mon Sep 17 00:00:00 2001
From: Tobias Strebitzer
Date: Wed, 12 Feb 2025 12:11:32 +0800
Subject: [PATCH] docs: Add example and CSV format documentation

Signed-off-by: Tobias Strebitzer
---
 docling/backend/csv_backend.py |  3 +--
 docs/examples/run_csv.py       | 36 ++++++++++++++++++++++++++++++++++++
 docs/supported_formats.md      |  1 +
 3 files changed, 38 insertions(+), 2 deletions(-)
 create mode 100644 docs/examples/run_csv.py

diff --git a/docling/backend/csv_backend.py b/docling/backend/csv_backend.py
index 253a0f03..6e1077f7 100644
--- a/docling/backend/csv_backend.py
+++ b/docling/backend/csv_backend.py
@@ -29,8 +29,7 @@ class CsvDocumentBackend(DeclarativeDocumentBackend):
 
         try:
             if isinstance(self.path_or_stream, BytesIO):
-                # Decode bytes to string for CSV reading
-                content = self.path_or_stream.read().decode('utf-8')
+                content = self.path_or_stream.getvalue().decode("utf-8")
                 self.csv_data = list(csv.reader(StringIO(content)))
             elif isinstance(self.path_or_stream, Path):
                 with open(self.path_or_stream, 'r', newline='') as f:
diff --git a/docs/examples/run_csv.py b/docs/examples/run_csv.py
new file mode 100644
index 00000000..8f6830b6
--- /dev/null
+++ b/docs/examples/run_csv.py
@@ -0,0 +1,36 @@
+import json
+import logging
+from pathlib import Path
+
+import yaml
+
+from docling.datamodel.base_models import InputFormat
+from docling.document_converter import DocumentConverter
+
+logging.basicConfig(level=logging.DEBUG)
+
+
+def main():
+    """Convert a sample CSV file and save it as Markdown, JSON and YAML."""
+    # Convert CSV to Docling document:
+    source = "https://drive.google.com/uc?id=1zO8ekHWx9U7mrbx_0Hoxxu6od7uxJqWw&export=download"
+    converter = DocumentConverter()
+    result = converter.convert(source)
+
+    # Export Docling document:
+    out_path = Path("scratch")
+    # Path.open("w") does not create missing parent directories, so make it first.
+    out_path.mkdir(parents=True, exist_ok=True)
+    print(f"Document converted.\nSaving output to: {out_path}")
+    with (out_path / "customers-100.md").open("w", encoding="utf-8") as fp:
+        fp.write(result.document.export_to_markdown())
+
+    with (out_path / "customers-100.json").open("w", encoding="utf-8") as fp:
+        fp.write(json.dumps(result.document.export_to_dict()))
+
+    with (out_path / "customers-100.yaml").open("w", encoding="utf-8") as fp:
+        fp.write(yaml.safe_dump(result.document.export_to_dict()))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/supported_formats.md b/docs/supported_formats.md
index e217bb19..ada2fbba 100644
--- a/docs/supported_formats.md
+++ b/docs/supported_formats.md
@@ -13,6 +13,7 @@ Below you can find a listing of all supported input and output formats.
 | Markdown | |
 | AsciiDoc | |
 | HTML, XHTML | |
+| CSV | |
 | PNG, JPEG, TIFF, BMP | Image formats |
 
 Schema-specific support: