fix: guess HTML content starting with script tag (#1673)

Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
Cesar Berrospi Ramis
2025-06-02 08:43:24 +02:00
committed by GitHub
parent 3942923125
commit 984cb137f6
2 changed files with 12 additions and 1 deletion

View File

@@ -132,6 +132,13 @@ def test_guess_format(tmp_path):
doc_path = Path("./tests/data/html/wiki_duck.html")
assert dci._guess_format(doc_path) == InputFormat.HTML
html_str = ( # HTML starting with a script
"<script>\nconsole.log('foo');\n</script>"
'<!doctype html>\n<html lang="en-us class="no-js"></html>'
)
stream = DocumentStream(name="lorem_ipsum", stream=BytesIO(f"{html_str}".encode()))
assert dci._guess_format(stream) == InputFormat.HTML
# Valid MD
buf = BytesIO(Path("./tests/data/md/wiki.md").open("rb").read())
stream = DocumentStream(name="wiki.md", stream=buf)