mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
feat: add a backend parser for WebVTT files (#2288)
* feat: add a backend parser for WebVTT files

  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* docs: update README with VTT support

  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* docs: add description to supported formats

  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore: upgrade docling-core to unescape WebVTT in markdown

  Pin the new release of docling-core 2.48.2.
  Do not escape HTML reserved characters when exporting WebVTT documents to markdown.

  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* test: add missing copyright notice

  Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

---------

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
committed by
GitHub
parent
b5628f1227
commit
46efaaefee
@@ -206,6 +206,11 @@ def test_guess_format(tmp_path):
|
||||
doc_path.write_text("xyz", encoding="utf-8")
|
||||
assert dci._guess_format(doc_path) is None
|
||||
|
||||
# Valid WebVTT
|
||||
buf = BytesIO(Path("./tests/data/webvtt/webvtt_example_01.vtt").open("rb").read())
|
||||
stream = DocumentStream(name="webvtt_example_01.vtt", stream=buf)
|
||||
assert dci._guess_format(stream) == InputFormat.VTT
|
||||
|
||||
# Valid Docling JSON
|
||||
test_str = '{"name": ""}'
|
||||
stream = DocumentStream(name="test.json", stream=BytesIO(f"{test_str}".encode()))
|
||||
|
||||
Reference in New Issue
Block a user