diff --git a/docling/backend/csv_backend.py b/docling/backend/csv_backend.py
new file mode 100644
index 00000000..253a0f03
--- /dev/null
+++ b/docling/backend/csv_backend.py
@@ -0,0 +1,143 @@
+import csv
+import logging
+from io import BytesIO, StringIO
+from pathlib import Path
+from typing import List, Set, Union
+
+from docling_core.types.doc import (
+    DoclingDocument,
+    DocumentOrigin,
+    GroupLabel,
+    TableCell,
+    TableData,
+)
+
+from docling.backend.abstract_backend import DeclarativeDocumentBackend
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.document import InputDocument
+
+_log = logging.getLogger(__name__)
+
+
+class CsvDocumentBackend(DeclarativeDocumentBackend):
+    """Backend that loads a CSV file and exposes it as a single table."""
+
+    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
+        """Read and parse the CSV content from a path or an in-memory stream.
+
+        Raises:
+            RuntimeError: If the input cannot be read, decoded or parsed.
+        """
+        super().__init__(in_doc, path_or_stream)
+
+        # Initialize parent for hierarchy
+        self.parent = None
+        self.valid = False
+        # Always defined, so convert() never hits a missing attribute.
+        self.csv_data: List[List[str]] = []
+
+        try:
+            # "utf-8-sig" reads plain UTF-8 and also drops a leading BOM, which
+            # would otherwise become part of the first header cell. Both input
+            # kinds use the same codec so they parse identically.
+            if isinstance(self.path_or_stream, BytesIO):
+                content = self.path_or_stream.read().decode("utf-8-sig")
+                self.csv_data = list(csv.reader(StringIO(content)))
+            elif isinstance(self.path_or_stream, Path):
+                with open(
+                    self.path_or_stream, "r", newline="", encoding="utf-8-sig"
+                ) as f:
+                    self.csv_data = list(csv.reader(f))
+            else:
+                raise TypeError(
+                    f"Unsupported input type: {type(self.path_or_stream).__name__}"
+                )
+
+            self.valid = True
+        except Exception as e:
+            raise RuntimeError(
+                f"CsvDocumentBackend could not load document with hash {self.document_hash}"
+            ) from e
+
+    def is_valid(self) -> bool:
+        """Return True if the CSV content was loaded successfully."""
+        # Debug level keeps this status probe out of the normal info logs.
+        _log.debug("valid: %s", self.valid)
+        return self.valid
+
+    @classmethod
+    def supports_pagination(cls) -> bool:
+        """Report that this backend does not paginate its input."""
+        return False
+
+    def unload(self):
+        """Close an in-memory stream, if any, and drop the input reference."""
+        if isinstance(self.path_or_stream, BytesIO):
+            self.path_or_stream.close()
+        self.path_or_stream = None
+
+    @classmethod
+    def supported_formats(cls) -> Set[InputFormat]:
+        """Return the input formats handled by this backend."""
+        return {InputFormat.CSV}
+
+    def convert(self) -> DoclingDocument:
+        """Build a DoclingDocument holding the CSV rows as one table.
+
+        The first row is marked as column header. An empty CSV yields a
+        document with only the enclosing section group.
+
+        Raises:
+            RuntimeError: If the backend did not initialize successfully.
+        """
+        if not self.is_valid():
+            raise RuntimeError(
+                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
+            )
+
+        origin = DocumentOrigin(
+            filename=self.file.name or "file.csv",
+            mimetype="text/csv",
+            binary_hash=self.document_hash,
+        )
+        doc = DoclingDocument(name=self.file.stem or "file.csv", origin=origin)
+
+        # Create a section for the CSV content
+        self.parent = doc.add_group(
+            parent=None,
+            label=GroupLabel.SECTION,
+            name="csv content",
+        )
+
+        if not self.csv_data:
+            return doc
+
+        num_cols = max(len(row) for row in self.csv_data)
+        if any(len(row) != num_cols for row in self.csv_data):
+            _log.warning(
+                "CSV rows have differing column counts; shorter rows get no cells for the missing columns."
+            )
+
+        # CSV has no merged cells, so every cell spans exactly one row and column.
+        table_data = TableData(
+            num_rows=len(self.csv_data),
+            num_cols=num_cols,
+            table_cells=[
+                TableCell(
+                    text=cell_value,
+                    row_span=1,
+                    col_span=1,
+                    start_row_offset_idx=row_idx,
+                    end_row_offset_idx=row_idx + 1,
+                    start_col_offset_idx=col_idx,
+                    end_col_offset_idx=col_idx + 1,
+                    col_header=row_idx == 0,  # First row as header
+                    row_header=False,
+                )
+                for row_idx, row in enumerate(self.csv_data)
+                for col_idx, cell_value in enumerate(row)
+            ],
+        )
+        doc.add_table(data=table_data, parent=self.parent)
+
+        return doc
diff --git a/docling/datamodel/base_models.py b/docling/datamodel/base_models.py
index d1e7ce3a..a2b9428b 100644
--- a/docling/datamodel/base_models.py
+++ b/docling/datamodel/base_models.py
@@ -39,6 +39,7 @@ class InputFormat(str, Enum):
     PDF = "pdf"
     ASCIIDOC = "asciidoc"
     MD = "md"
+    CSV = "csv"
     XLSX = "xlsx"
     XML_USPTO = "xml_uspto"
     JSON_DOCLING = "json_docling"
@@ -61,6 +62,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
     InputFormat.XML_PUBMED: ["xml", "nxml"],
     InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
     InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
+    InputFormat.CSV: ["csv"],
     InputFormat.XLSX: ["xlsx"],
     InputFormat.XML_USPTO: ["xml", "txt"],
     InputFormat.JSON_DOCLING: ["json"],
@@ -88,6 +90,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
     InputFormat.PDF: ["application/pdf"],
     InputFormat.ASCIIDOC: ["text/asciidoc"],
     InputFormat.MD: ["text/markdown", "text/x-markdown"],
+    InputFormat.CSV: ["text/csv"],
     InputFormat.XLSX: [
         "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
     ],
diff --git a/docling/datamodel/document.py b/docling/datamodel/document.py
index d887fed9..f4d02dbf 100644
--- a/docling/datamodel/document.py
+++ b/docling/datamodel/document.py
@@ -1,5 +1,6 @@
+import csv
 import logging
 import re
 from enum import Enum
 from io import BytesIO
 from pathlib import Path, PurePath
@@ -296,6 +297,7 @@ class _DocumentConversionInput(BaseModel):
                 mime = _DocumentConversionInput._mime_from_extension(ext)
 
         mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
+        mime = mime or _DocumentConversionInput._detect_csv(content)
         mime = mime or "text/plain"
         formats = MimeTypeToFormat.get(mime, [])
         if formats:
@@ -352,6 +354,8 @@ class _DocumentConversionInput(BaseModel):
             mime = FormatToMimeType[InputFormat.HTML][0]
         elif ext in FormatToExtensions[InputFormat.MD]:
             mime = FormatToMimeType[InputFormat.MD][0]
+        elif ext in FormatToExtensions[InputFormat.CSV]:
+            mime = FormatToMimeType[InputFormat.CSV][0]
         elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
             mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
         elif ext in FormatToExtensions[InputFormat.PDF]:
@@ -392,3 +396,34 @@ class _DocumentConversionInput(BaseModel):
             return "application/xml"
 
         return None
+
+    @staticmethod
+    def _detect_csv(
+        content: bytes,
+    ) -> Optional[Literal["text/csv"]]:
+        """Guess the mime type of a CSV file from its content.
+
+        Args:
+            content: A short piece of a document from its beginning.
+
+        Returns:
+            The mime type of a CSV file, or None if the content does
+            not look like CSV.
+        """
+        content_str = content.decode("ascii", errors="ignore").strip()
+
+        # Ensure there's at least one newline (CSV is usually multi-line)
+        if "\n" not in content_str:
+            return None
+
+        # Let the sniffer consider only common delimiters, so a character that
+        # merely repeats consistently (e.g. a space) cannot win over them.
+        common_delimiters = ",;\t|"
+        try:
+            dialect = csv.Sniffer().sniff(content_str, delimiters=common_delimiters)
+        except csv.Error:
+            return None
+
+        if dialect.delimiter in set(common_delimiters):
+            return "text/csv"
+        return None
diff --git a/docling/document_converter.py b/docling/document_converter.py
index d885dd20..de74d8b8 100644
--- a/docling/document_converter.py
+++ b/docling/document_converter.py
@@ -14,6 +14,7 @@ from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBacke
 from docling.backend.html_backend import HTMLDocumentBackend
 from docling.backend.json.docling_json_backend import DoclingJSONBackend
 from docling.backend.md_backend import MarkdownDocumentBackend
+from docling.backend.csv_backend import CsvDocumentBackend
 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
 from docling.backend.msword_backend import MsWordDocumentBackend
@@ -61,6 +62,11 @@ class FormatOption(BaseModel):
         return self
 
 
+class CsvFormatOption(FormatOption):
+    pipeline_cls: Type = SimplePipeline
+    backend: Type[AbstractDocumentBackend] = CsvDocumentBackend
+
+
 class ExcelFormatOption(FormatOption):
     pipeline_cls: Type = SimplePipeline
     backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
@@ -113,6 +119,9 @@ class PdfFormatOption(FormatOption):
 
 def _get_default_option(format: InputFormat) -> FormatOption:
     format_to_default_options = {
+        InputFormat.CSV: FormatOption(
+            pipeline_cls=SimplePipeline, backend=CsvDocumentBackend
+        ),
         InputFormat.XLSX: FormatOption(
             pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
         ),
diff --git a/docs/examples/run_with_formats.py b/docs/examples/run_with_formats.py
index 2c4d7161..0eff248b 100644
--- a/docs/examples/run_with_formats.py
+++ b/docs/examples/run_with_formats.py
@@ -43,6 +43,7 @@ def main():
                 InputFormat.HTML,
                 InputFormat.PPTX,
                 InputFormat.ASCIIDOC,
+                InputFormat.CSV,
                 InputFormat.MD,
             ],  # whitelist formats, non-matching files are ignored.
             format_options={