feat: Implement csv backend and format detection

Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>
This commit is contained in:
Tobias Strebitzer 2025-02-12 11:37:24 +08:00
parent c47ae700ec
commit d64f2bb0ab
5 changed files with 159 additions and 0 deletions

View File

@ -0,0 +1,114 @@
import csv
import logging
from io import BytesIO, StringIO
from pathlib import Path
from typing import Set, Union
from docling_core.types.doc import (
DoclingDocument,
DocumentOrigin,
GroupLabel,
TableCell,
TableData,
)
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
class CsvDocumentBackend(DeclarativeDocumentBackend):
    """Declarative backend that converts a CSV input into a one-table document.

    The input is read and parsed eagerly in ``__init__`` with :mod:`csv`; the
    parsed rows are kept in ``self.csv_data`` (a list of rows, each a list of
    strings). ``convert`` then emits a single section group containing one
    table whose first row is flagged as the column header.
    """

    def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]):
        """Read and parse the CSV input.

        Args:
            in_doc: Input document descriptor, forwarded to the base class.
            path_or_stream: Either the raw CSV bytes or a path to the CSV file.

        Raises:
            RuntimeError: If the input cannot be read, decoded or parsed, or
                if ``path_or_stream`` is neither a ``BytesIO`` nor a ``Path``.
                The underlying error is chained as ``__cause__``.
        """
        super().__init__(in_doc, path_or_stream)

        # Initialize parent for hierarchy
        self.parent = None
        self.valid = False
        # Defined up front so the attribute exists on every code path.
        self.csv_data = []

        try:
            # Both branches decode as UTF-8 so a file parses the same way
            # whether it is passed as a path or as bytes (the platform default
            # encoding is not used). "utf-8-sig" additionally drops a leading
            # BOM, which would otherwise end up inside the first header cell.
            if isinstance(self.path_or_stream, BytesIO):
                content = self.path_or_stream.read().decode("utf-8-sig")
                self.csv_data = list(csv.reader(StringIO(content)))
            elif isinstance(self.path_or_stream, Path):
                # newline="" lets the csv module handle embedded newlines.
                with self.path_or_stream.open(
                    "r", newline="", encoding="utf-8-sig"
                ) as f:
                    self.csv_data = list(csv.reader(f))
            else:
                # Fail here instead of reporting a "valid" backend that has
                # no data to convert.
                raise TypeError(
                    f"Unsupported input type: {type(self.path_or_stream).__name__}"
                )

            self.valid = True
        except Exception as e:
            self.valid = False
            raise RuntimeError(
                f"CsvDocumentBackend could not load document with hash {self.document_hash}"
            ) from e

    def is_valid(self) -> bool:
        """Return True if the CSV input was loaded and parsed successfully."""
        # Called on every convert(); keep it at debug level and format lazily.
        _log.debug("valid: %s", self.valid)
        return self.valid

    @classmethod
    def supports_pagination(cls) -> bool:
        """Return False: this backend does not paginate its output."""
        return False

    def unload(self):
        """Close an in-memory stream (if any) and drop the input reference."""
        if isinstance(self.path_or_stream, BytesIO):
            self.path_or_stream.close()
        self.path_or_stream = None

    @classmethod
    def supported_formats(cls) -> Set[InputFormat]:
        """Return the set of input formats handled by this backend (CSV only)."""
        return {InputFormat.CSV}

    def convert(self) -> DoclingDocument:
        """Build a ``DoclingDocument`` holding the CSV content as one table.

        Returns:
            A document with a "csv content" section group and, when the CSV has
            at least one row, a table under that group. Every value becomes a
            1x1 cell; cells of the first row are marked as column headers.

        Raises:
            RuntimeError: If the backend is not valid.
        """
        # Parse the CSV into a structured document model
        origin = DocumentOrigin(
            filename=self.file.name or "file.csv",
            mimetype="text/csv",
            binary_hash=self.document_hash,
        )
        doc = DoclingDocument(name=self.file.stem or "file.csv", origin=origin)

        if not self.is_valid():
            raise RuntimeError(
                f"Cannot convert doc with {self.document_hash} because the backend failed to init."
            )

        # Create a section for the CSV content
        self.parent = doc.add_group(
            parent=None,
            label=GroupLabel.SECTION,
            name="csv content",
        )

        # Nothing to tabulate for an empty file.
        if not self.csv_data:
            return doc

        num_rows = len(self.csv_data)
        # The table is as wide as the longest row.
        num_cols = max(len(row) for row in self.csv_data)

        # Shorter rows produce no cells for their missing columns; surface
        # that instead of leaving silent gaps.
        if any(len(row) != num_cols for row in self.csv_data):
            _log.warning(
                "Inconsistent column counts in CSV data (widest row has %d columns).",
                num_cols,
            )

        table_data = TableData(
            num_rows=num_rows,
            num_cols=num_cols,
            table_cells=[],
        )

        # Convert each value to a TableCell
        for row_idx, row in enumerate(self.csv_data):
            for col_idx, cell_value in enumerate(row):
                cell = TableCell(
                    text=str(cell_value),
                    row_span=1,  # CSV doesn't support merged cells
                    col_span=1,
                    start_row_offset_idx=row_idx,
                    end_row_offset_idx=row_idx + 1,
                    start_col_offset_idx=col_idx,
                    end_col_offset_idx=col_idx + 1,
                    col_header=row_idx == 0,  # First row as header
                    row_header=False,
                )
                table_data.table_cells.append(cell)

        doc.add_table(data=table_data, parent=self.parent)
        return doc

View File

@ -39,6 +39,7 @@ class InputFormat(str, Enum):
PDF = "pdf" PDF = "pdf"
ASCIIDOC = "asciidoc" ASCIIDOC = "asciidoc"
MD = "md" MD = "md"
CSV = "csv"
XLSX = "xlsx" XLSX = "xlsx"
XML_USPTO = "xml_uspto" XML_USPTO = "xml_uspto"
JSON_DOCLING = "json_docling" JSON_DOCLING = "json_docling"
@ -61,6 +62,7 @@ FormatToExtensions: Dict[InputFormat, List[str]] = {
InputFormat.XML_PUBMED: ["xml", "nxml"], InputFormat.XML_PUBMED: ["xml", "nxml"],
InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"], InputFormat.IMAGE: ["jpg", "jpeg", "png", "tif", "tiff", "bmp"],
InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"], InputFormat.ASCIIDOC: ["adoc", "asciidoc", "asc"],
InputFormat.CSV: ["csv"],
InputFormat.XLSX: ["xlsx"], InputFormat.XLSX: ["xlsx"],
InputFormat.XML_USPTO: ["xml", "txt"], InputFormat.XML_USPTO: ["xml", "txt"],
InputFormat.JSON_DOCLING: ["json"], InputFormat.JSON_DOCLING: ["json"],
@ -88,6 +90,7 @@ FormatToMimeType: Dict[InputFormat, List[str]] = {
InputFormat.PDF: ["application/pdf"], InputFormat.PDF: ["application/pdf"],
InputFormat.ASCIIDOC: ["text/asciidoc"], InputFormat.ASCIIDOC: ["text/asciidoc"],
InputFormat.MD: ["text/markdown", "text/x-markdown"], InputFormat.MD: ["text/markdown", "text/x-markdown"],
InputFormat.CSV: ["text/csv"],
InputFormat.XLSX: [ InputFormat.XLSX: [
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
], ],

View File

@ -1,5 +1,6 @@
import logging import logging
import re import re
import csv
from enum import Enum from enum import Enum
from io import BytesIO from io import BytesIO
from pathlib import Path, PurePath from pathlib import Path, PurePath
@ -296,6 +297,7 @@ class _DocumentConversionInput(BaseModel):
mime = _DocumentConversionInput._mime_from_extension(ext) mime = _DocumentConversionInput._mime_from_extension(ext)
mime = mime or _DocumentConversionInput._detect_html_xhtml(content) mime = mime or _DocumentConversionInput._detect_html_xhtml(content)
mime = mime or _DocumentConversionInput._detect_csv(content)
mime = mime or "text/plain" mime = mime or "text/plain"
formats = MimeTypeToFormat.get(mime, []) formats = MimeTypeToFormat.get(mime, [])
if formats: if formats:
@ -352,6 +354,8 @@ class _DocumentConversionInput(BaseModel):
mime = FormatToMimeType[InputFormat.HTML][0] mime = FormatToMimeType[InputFormat.HTML][0]
elif ext in FormatToExtensions[InputFormat.MD]: elif ext in FormatToExtensions[InputFormat.MD]:
mime = FormatToMimeType[InputFormat.MD][0] mime = FormatToMimeType[InputFormat.MD][0]
elif ext in FormatToExtensions[InputFormat.CSV]:
mime = FormatToMimeType[InputFormat.CSV][0]
elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]: elif ext in FormatToExtensions[InputFormat.JSON_DOCLING]:
mime = FormatToMimeType[InputFormat.JSON_DOCLING][0] mime = FormatToMimeType[InputFormat.JSON_DOCLING][0]
elif ext in FormatToExtensions[InputFormat.PDF]: elif ext in FormatToExtensions[InputFormat.PDF]:
@ -392,3 +396,32 @@ class _DocumentConversionInput(BaseModel):
return "application/xml" return "application/xml"
return None return None
@staticmethod
def _detect_csv(
    content: bytes,
) -> Optional[Literal["text/csv"]]:
    """Guess whether a content sample is CSV.

    Args:
        content: A short piece of a document from its beginning.

    Returns:
        ``"text/csv"`` if the sample spans more than one line and
        ``csv.Sniffer`` finds one of the accepted delimiters (comma,
        semicolon, tab or pipe); otherwise ``None``.
    """
    # Delimiters accepted as evidence of CSV content.
    candidate_delimiters = ",;\t|"

    # Non-ASCII bytes are dropped; only the delimiter structure matters here.
    content_str = content.decode("ascii", errors="ignore").strip()

    # Ensure there's at least one newline (CSV is usually multi-line)
    if "\n" not in content_str:
        return None

    # Restrict the sniffer to the accepted delimiters. Unrestricted, it may
    # settle on another character (e.g. the spaces in "a | b") and a valid
    # pipe- or semicolon-delimited sample would then be rejected.
    try:
        dialect = csv.Sniffer().sniff(content_str, delimiters=candidate_delimiters)
    except csv.Error:
        # The sniffer could not determine a delimiter.
        return None

    if dialect.delimiter in set(candidate_delimiters):
        return "text/csv"
    return None

View File

@ -14,6 +14,7 @@ from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBacke
from docling.backend.html_backend import HTMLDocumentBackend from docling.backend.html_backend import HTMLDocumentBackend
from docling.backend.json.docling_json_backend import DoclingJSONBackend from docling.backend.json.docling_json_backend import DoclingJSONBackend
from docling.backend.md_backend import MarkdownDocumentBackend from docling.backend.md_backend import MarkdownDocumentBackend
from docling.backend.csv_backend import CsvDocumentBackend
from docling.backend.msexcel_backend import MsExcelDocumentBackend from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend from docling.backend.mspowerpoint_backend import MsPowerpointDocumentBackend
from docling.backend.msword_backend import MsWordDocumentBackend from docling.backend.msword_backend import MsWordDocumentBackend
@ -61,6 +62,10 @@ class FormatOption(BaseModel):
return self return self
# Format option preset for CSV input: by default documents are run through
# SimplePipeline and parsed with CsvDocumentBackend.
class CsvFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = CsvDocumentBackend
class ExcelFormatOption(FormatOption): class ExcelFormatOption(FormatOption):
pipeline_cls: Type = SimplePipeline pipeline_cls: Type = SimplePipeline
backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
@ -113,6 +118,9 @@ class PdfFormatOption(FormatOption):
def _get_default_option(format: InputFormat) -> FormatOption: def _get_default_option(format: InputFormat) -> FormatOption:
format_to_default_options = { format_to_default_options = {
InputFormat.CSV: FormatOption(
pipeline_cls=SimplePipeline, backend=CsvDocumentBackend
),
InputFormat.XLSX: FormatOption( InputFormat.XLSX: FormatOption(
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
), ),

View File

@ -43,6 +43,7 @@ def main():
InputFormat.HTML, InputFormat.HTML,
InputFormat.PPTX, InputFormat.PPTX,
InputFormat.ASCIIDOC, InputFormat.ASCIIDOC,
InputFormat.CSV,
InputFormat.MD, InputFormat.MD,
], # whitelist formats, non-matching files are ignored. ], # whitelist formats, non-matching files are ignored.
format_options={ format_options={