fix(HTML): replace non-standard Unicode characters (#2006)

chore(HTML): replace non-standard Unicode characters for better downstream tasks Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
2025-12-08 12:48:28 +00:00 · 2025-07-29 11:05:35 +02:00
parent aae42b37a8
commit 86f70128aa
8 changed files with 125 additions and 52 deletions
--- a/tests/test_backend_html.py
+++ b/tests/test_backend_html.py
@@ -100,6 +100,22 @@ def test_ordered_lists():
        assert doc.export_to_markdown() == pair[1], f"Error in case {idx}"


+def test_unicode_characters():
+    raw_html = "<html><body><h1>Hello World!</h1></body></html>".encode()  # noqa: RUF001
+    in_doc = InputDocument(
+        path_or_stream=BytesIO(raw_html),
+        format=InputFormat.HTML,
+        backend=HTMLDocumentBackend,
+        filename="test",
+    )
+    backend = HTMLDocumentBackend(
+        in_doc=in_doc,
+        path_or_stream=BytesIO(raw_html),
+    )
+    doc: DoclingDocument = backend.convert()
+    assert doc.texts[0].text == "Hello World!"
+
+
 def get_html_paths():
    # Define the directory you want to search
    directory = Path("./tests/data/html/")