mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
code for xlsm support
This commit is contained in:
parent
7c705739f9
commit
e6a070234f
@ -34,11 +34,12 @@ class ConversionStatus(str, Enum):
|
|||||||
class InputFormat(str, Enum):
|
class InputFormat(str, Enum):
|
||||||
"""A document format supported by document backend parsers."""
|
"""A document format supported by document backend parsers."""
|
||||||
|
|
||||||
|
PDF = "pdf"
|
||||||
DOCX = "docx"
|
DOCX = "docx"
|
||||||
|
XLSM = "xlsm"
|
||||||
PPTX = "pptx"
|
PPTX = "pptx"
|
||||||
HTML = "html"
|
HTML = "html"
|
||||||
IMAGE = "image"
|
IMAGE = "image"
|
||||||
PDF = "pdf"
|
|
||||||
ASCIIDOC = "asciidoc"
|
ASCIIDOC = "asciidoc"
|
||||||
MD = "md"
|
MD = "md"
|
||||||
CSV = "csv"
|
CSV = "csv"
|
||||||
|
@ -156,6 +156,7 @@ def _get_default_option(format: InputFormat) -> FormatOption:
|
|||||||
InputFormat.JSON_DOCLING: FormatOption(
|
InputFormat.JSON_DOCLING: FormatOption(
|
||||||
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
|
pipeline_cls=SimplePipeline, backend=DoclingJSONBackend
|
||||||
),
|
),
|
||||||
|
InputFormat.XLSM: InputFormat.XLSX,
|
||||||
}
|
}
|
||||||
if (options := format_to_default_options.get(format)) is not None:
|
if (options := format_to_default_options.get(format)) is not None:
|
||||||
return options
|
return options
|
||||||
|
BIN
tests/input/sample_sales_macro (1).xlsm
Normal file
BIN
tests/input/sample_sales_macro (1).xlsm
Normal file
Binary file not shown.
2353
tests/output/sample_sales_macro (1).json
Normal file
2353
tests/output/sample_sales_macro (1).json
Normal file
File diff suppressed because it is too large
Load Diff
47
tests/test_backend_msexcel_xlsm.py
Normal file
47
tests/test_backend_msexcel_xlsm.py
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from docling.document_converter import DocumentConverter
|
||||||
|
from docling.datamodel.base_models import InputFormat, FormatToExtensions
|
||||||
|
import json
|
||||||
|
|
||||||
|
# Sample documents are read from INPUT_DIR; JSON dumps are written to OUTPUT_DIR.
INPUT_DIR = Path("tests/input")
OUTPUT_DIR = Path("tests/output")
OUTPUT_DIR.mkdir(exist_ok=True)

# Gather the file extensions to look for: the extensions registered in
# FormatToExtensions for each InputFormat member, falling back to the member's
# own value when nothing is registered, plus "xlsm" explicitly.
supported_exts = {"xlsm"}
for fmt in InputFormat:
    exts = FormatToExtensions.get(fmt, [])
    supported_exts.update(exts if exts else [fmt.value])

print(f"Supported extensions: {sorted(supported_exts)}")

# Keep only regular files whose lower-cased suffix (without the leading dot)
# is one of the collected extensions.
input_files = [
    entry
    for entry in INPUT_DIR.iterdir()
    if entry.is_file() and entry.suffix[1:].lower() in supported_exts
]
print(f"Found {len(input_files)} files to process: {[f.name for f in input_files]}")

# A single converter instance, created with default arguments, is reused for
# every file.
converter = DocumentConverter()
|
||||||
|
|
||||||
|
def convert_paths(obj):
    """Recursively replace filesystem path objects with their string form.

    ``json.dump`` cannot serialize ``pathlib`` objects, so nested data that
    may contain them is normalized with this helper before being written.

    Args:
        obj: Arbitrarily nested combination of dicts, lists, tuples and
            leaf values.

    Returns:
        A structure of the same shape in which every ``os.PathLike`` leaf
        (for example ``pathlib.Path``) is replaced by ``str(leaf)``. Dicts,
        lists and tuples are rebuilt (dict keys are kept as they are); any
        other value is returned unchanged.
    """
    if isinstance(obj, dict):
        return {key: convert_paths(value) for key, value in obj.items()}
    if isinstance(obj, list):
        return [convert_paths(item) for item in obj]
    if isinstance(obj, tuple):
        # Tuples are serialized by json like lists, so paths nested inside
        # them need converting as well.
        return tuple(convert_paths(item) for item in obj)
    # Check the type rather than the defining module's name: this does not
    # fail on objects whose ``__module__`` is None and does not stringify
    # unrelated objects that merely live in a ``pathlib`` module.
    if isinstance(obj, os.PathLike):
        return str(obj)
    return obj
|
||||||
|
|
||||||
|
# Convert every discovered input file and write the dumped result as indented
# UTF-8 JSON into OUTPUT_DIR, named after the input file's stem. Any exception
# raised for a file is reported on stdout and the loop continues with the
# next file.
for file in input_files:
    try:
        print(f"Processing {file}...")
        result = converter.convert(str(file))
        out_path = OUTPUT_DIR / f"{file.stem}.json"
        # Build the serializable payload before opening the output file, so
        # that a failure here does not leave an empty file behind.
        payload = convert_paths(result.model_dump())
        with out_path.open("w", encoding="utf-8") as sink:
            json.dump(payload, sink, ensure_ascii=False, indent=2)
        print(f"Converted {file.name} -> {out_path.name}")
    except Exception as exc:
        print(f"Failed to convert {file.name}: {exc}")
|
Loading…
Reference in New Issue
Block a user