code for xlsm support

This commit is contained in:
ShiroYasha18 2025-05-04 01:07:27 +05:30
parent 7c705739f9
commit e6a070234f
5 changed files with 2403 additions and 1 deletions

View File

@ -34,11 +34,12 @@ class ConversionStatus(str, Enum):
class InputFormat(str, Enum): class InputFormat(str, Enum):
"""A document format supported by document backend parsers.""" """A document format supported by document backend parsers."""
PDF = "pdf"
DOCX = "docx" DOCX = "docx"
XLSM = "xlsm"
PPTX = "pptx" PPTX = "pptx"
HTML = "html" HTML = "html"
IMAGE = "image" IMAGE = "image"
PDF = "pdf"
ASCIIDOC = "asciidoc" ASCIIDOC = "asciidoc"
MD = "md" MD = "md"
CSV = "csv" CSV = "csv"

View File

@ -156,6 +156,7 @@ def _get_default_option(format: InputFormat) -> FormatOption:
InputFormat.JSON_DOCLING: FormatOption( InputFormat.JSON_DOCLING: FormatOption(
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
), ),
InputFormat.XLSM: InputFormat.XLSX,
} }
if (options := format_to_default_options.get(format)) is not None: if (options := format_to_default_options.get(format)) is not None:
return options return options

Binary file not shown.

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,47 @@
import os
from pathlib import Path
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat, FormatToExtensions
import json
# Input and output locations, relative to the current working directory.
INPUT_DIR = Path("tests/input")
OUTPUT_DIR = Path("tests/output")
# parents=True so the script also works when "tests/" does not exist yet.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Collect the file extensions to look for: the extensions registered for each
# input format, falling back to the enum value when none are registered.
supported_exts = set()
for fmt in InputFormat:
    exts = FormatToExtensions.get(fmt, [])
    if exts:
        supported_exts.update(exts)
    else:
        supported_exts.add(fmt.value)
# Always include "xlsm", regardless of what the format table reports.
supported_exts.add('xlsm')
print(f"Supported extensions: {sorted(supported_exts)}")

# Sort the matches: iterdir() yields entries in arbitrary order, and a stable
# order keeps the processing sequence and the log output reproducible.
input_files = sorted(
    f
    for f in INPUT_DIR.iterdir()
    if f.is_file() and f.suffix[1:].lower() in supported_exts
)
print(f"Found {len(input_files)} files to process: {[f.name for f in input_files]}")

converter = DocumentConverter()
def convert_paths(obj):
    """Return a copy of *obj* with every pathlib path replaced by its string.

    Dicts, lists and tuples are traversed recursively (dict keys are left
    untouched, only values are converted). Any other object is returned
    unchanged.

    Args:
        obj: An arbitrarily nested structure of dicts, lists, tuples and
            leaf values.

    Returns:
        A structure of the same shape in which each ``pathlib.PurePath``
        instance has been replaced by ``str(path)``.
    """
    # Local import: the module only imports ``Path``, and ``PurePath`` is the
    # common base class of every pathlib path flavour.
    from pathlib import PurePath

    if isinstance(obj, dict):
        return {k: convert_paths(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_paths(i) for i in obj]
    if isinstance(obj, tuple):
        # Tuples must be traversed too, otherwise a nested path would reach
        # the JSON encoder unconverted.
        return tuple(convert_paths(i) for i in obj)
    # isinstance() instead of inspecting ``__module__``: it matches only real
    # path instances (not pathlib classes) and cannot fail on objects whose
    # ``__module__`` is None.
    if isinstance(obj, PurePath):
        return str(obj)
    return obj
# Convert each discovered input file and store the result as JSON in
# OUTPUT_DIR. A failure on one file is reported and does not stop the
# remaining files from being processed.
# NOTE(review): the output name is derived from the stem only, so inputs such
# as "a.xlsx" and "a.xlsm" both write "a.json" and the later one wins.
for file in input_files:
    try:
        print(f"Processing {file}...")
        result = converter.convert(str(file))
        out_path = OUTPUT_DIR / (file.stem + ".json")
        result_dict = convert_paths(result.model_dump())
        # Serialize completely in memory before touching the disk, so that a
        # serialization error cannot leave a truncated JSON file behind.
        payload = json.dumps(result_dict, ensure_ascii=False, indent=2)
        out_path.write_text(payload, encoding="utf-8")
        print(f"Converted {file.name} -> {out_path.name}")
    except Exception as e:
        # Top-level boundary of a batch script: report and continue.
        print(f"Failed to convert {file.name}: {e}")