mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-11 14:18:30 +00:00
feat: Add support for CSV input with new backend to transform CSV files to DoclingDocument (#945)
* feat: Implement csv backend and format detection Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com> * test: Implement csv parsing and format tests Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com> * docs: Add example and CSV format documentation Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com> * feat: Add support for various CSV dialects and update documentation Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com> * feat: Add validation for delimiters and tests for inconsistent csv files Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com> --------- Signed-off-by: Tobias Strebitzer <tobias.strebitzer@magloft.com>
This commit is contained in:
committed by
GitHub
parent
7493d5b01f
commit
00d9405b0a
@@ -10,6 +10,7 @@ from pydantic import BaseModel, ConfigDict, model_validator, validate_call
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.asciidoc_backend import AsciiDocBackend
|
||||
from docling.backend.csv_backend import CsvDocumentBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
from docling.backend.html_backend import HTMLDocumentBackend
|
||||
from docling.backend.json.docling_json_backend import DoclingJSONBackend
|
||||
@@ -61,6 +62,11 @@ class FormatOption(BaseModel):
|
||||
return self
|
||||
|
||||
|
||||
class CsvFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = CsvDocumentBackend
|
||||
|
||||
|
||||
class ExcelFormatOption(FormatOption):
|
||||
pipeline_cls: Type = SimplePipeline
|
||||
backend: Type[AbstractDocumentBackend] = MsExcelDocumentBackend
|
||||
@@ -113,6 +119,9 @@ class PdfFormatOption(FormatOption):
|
||||
|
||||
def _get_default_option(format: InputFormat) -> FormatOption:
|
||||
format_to_default_options = {
|
||||
InputFormat.CSV: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=CsvDocumentBackend
|
||||
),
|
||||
InputFormat.XLSX: FormatOption(
|
||||
pipeline_cls=SimplePipeline, backend=MsExcelDocumentBackend
|
||||
),
|
||||
|
||||
Reference in New Issue
Block a user