mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
fix(HTML): replace non-standard Unicode characters (#2006)
chore(HTML): replace non-standard Unicode characters for better downstream tasks Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
aae42b37a8
commit
86f70128aa
@@ -100,6 +100,22 @@ def test_ordered_lists():
|
||||
assert doc.export_to_markdown() == pair[1], f"Error in case {idx}"
|
||||
|
||||
|
||||
def test_unicode_characters():
    """Convert a one-heading HTML page and check the extracted heading text.

    The page is fed to ``HTMLDocumentBackend`` as an in-memory byte stream and
    the text of the first text item of the converted document is compared
    against the expected plain string.

    NOTE(review): the ``noqa: RUF001`` marker suggests the input literal is
    meant to contain ambiguous Unicode characters; confirm they are still
    present in this copy of the file.
    """
    html_bytes = "<html><body><h1>Hello World!</h1></body></html>".encode()  # noqa: RUF001

    # The input descriptor and the backend each receive their own fresh stream.
    input_doc = InputDocument(
        path_or_stream=BytesIO(html_bytes),
        format=InputFormat.HTML,
        backend=HTMLDocumentBackend,
        filename="test",
    )
    html_backend = HTMLDocumentBackend(
        in_doc=input_doc,
        path_or_stream=BytesIO(html_bytes),
    )

    converted: DoclingDocument = html_backend.convert()
    first_text = converted.texts[0].text
    assert first_text == "Hello World!"
|
||||
|
||||
|
||||
def get_html_paths():
|
||||
# Define the directory you want to search
|
||||
directory = Path("./tests/data/html/")
|
||||
|
||||
Reference in New Issue
Block a user