mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
fix: guess HTML content starting with script tag (#1673)
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
3942923125
commit
984cb137f6
@@ -132,6 +132,13 @@ def test_guess_format(tmp_path):
|
||||
doc_path = Path("./tests/data/html/wiki_duck.html")
|
||||
assert dci._guess_format(doc_path) == InputFormat.HTML
|
||||
|
||||
html_str = ( # HTML starting with a script
|
||||
"<script>\nconsole.log('foo');\n</script>"
|
||||
'<!doctype html>\n<html lang="en-us class="no-js"></html>'
|
||||
)
|
||||
stream = DocumentStream(name="lorem_ipsum", stream=BytesIO(f"{html_str}".encode()))
|
||||
assert dci._guess_format(stream) == InputFormat.HTML
|
||||
|
||||
# Valid MD
|
||||
buf = BytesIO(Path("./tests/data/md/wiki.md").open("rb").read())
|
||||
stream = DocumentStream(name="wiki.md", stream=buf)
|
||||
|
||||
Reference in New Issue
Block a user