Files
docling/tests/test_backend_html.py
Cesar Berrospi Ramis c5f2e2fdd6 fix(HTML): parse footer tag as a group in furniture content layer (#2106)
* fix(HTML): parse footer tag as a section in furniture

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* fix(HTML): add test for body vs furniture in HTML parser.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

---------

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
2025-08-20 08:42:25 +02:00

214 lines
6.3 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from io import BytesIO
from pathlib import Path
from docling_core.types.doc.document import ContentLayer
from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import (
ConversionResult,
DoclingDocument,
InputDocument,
SectionHeaderItem,
)
from docling.document_converter import DocumentConverter
from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document, verify_export
# Module-level alias of GEN_TEST_DATA (imported from test_data_gen_flag); it is
# handed to verify_export / verify_document in test_e2e_html_conversions as
# their "generate" flag.
GENERATE = GEN_TEST_DATA
def test_heading_levels():
    """Verify section-header levels in the converted duck wiki page.

    The page's ``h1`` is used as the title, so an ``h2`` heading is expected
    at level 1 and an ``h3`` heading at level 2.
    """
    source = Path("tests/data/html/wiki_duck.html")
    html_backend = HTMLDocumentBackend(
        in_doc=InputDocument(
            path_or_stream=source,
            format=InputFormat.HTML,
            backend=HTMLDocumentBackend,
        ),
        path_or_stream=source,
    )
    document = html_backend.convert()

    # Heading text -> level it must carry in the converted document.
    expected_levels = {"Etymology": 1, "Feeding": 2}
    seen = set()
    for node, _ in document.iterate_items():
        if not isinstance(node, SectionHeaderItem):
            continue
        if node.text not in expected_levels:
            continue
        seen.add(node.text)
        assert node.level == expected_levels[node.text]

    # Both headings must actually have been encountered during iteration.
    assert seen == set(expected_levels)
def test_ordered_lists():
    """Check the Markdown numbering of ``<ol>`` lists for several ``start`` values.

    Each case pairs a raw HTML document with the Markdown export expected
    from it.
    """
    cases: list[tuple[bytes, str]] = [
        # No start attribute.
        (
            b"<html><body><ol><li>1st item</li><li>2nd item</li></ol></body></html>",
            "1. 1st item\n2. 2nd item",
        ),
        # Explicit start at 1.
        (
            b'<html><body><ol start="1"><li>1st item</li><li>2nd item</li></ol></body></html>',
            "1. 1st item\n2. 2nd item",
        ),
        # Start at 2 shifts the numbering.
        (
            b'<html><body><ol start="2"><li>1st item</li><li>2nd item</li></ol></body></html>',
            "2. 1st item\n3. 2nd item",
        ),
        # Start at 0 is kept as-is.
        (
            b'<html><body><ol start="0"><li>1st item</li><li>2nd item</li></ol></body></html>',
            "0. 1st item\n1. 2nd item",
        ),
        # A negative start is expected to be numbered from 1.
        (
            b'<html><body><ol start="-5"><li>1st item</li><li>2nd item</li></ol></body></html>',
            "1. 1st item\n2. 2nd item",
        ),
        # A non-numeric start is expected to be numbered from 1.
        (
            b'<html><body><ol start="foo"><li>1st item</li><li>2nd item</li></ol></body></html>',
            "1. 1st item\n2. 2nd item",
        ),
    ]

    for case_no, (html_bytes, expected_md) in enumerate(cases):
        # The input document and the backend each get their own stream.
        input_doc = InputDocument(
            path_or_stream=BytesIO(html_bytes),
            format=InputFormat.HTML,
            backend=HTMLDocumentBackend,
            filename="test",
        )
        html_backend = HTMLDocumentBackend(
            in_doc=input_doc,
            path_or_stream=BytesIO(html_bytes),
        )
        doc: DoclingDocument = html_backend.convert()
        assert doc
        assert doc.export_to_markdown() == expected_md, f"Error in case {case_no}"
def test_unicode_characters():
    """Check the text of a heading converted from an in-memory HTML snippet.

    NOTE(review): the literal below carries a RUF001 suppression, which
    suggests it contains a non-ASCII look-alike character that the backend is
    expected to normalize before the comparison at the end. Confirm the exact
    code point and consider writing it as an explicit escape sequence so it
    survives editors and copy-paste.
    """
    raw_html = "<html><body><h1>Hello World!</h1></body></html>".encode() # noqa: RUF001
    in_doc = InputDocument(
        path_or_stream=BytesIO(raw_html),
        format=InputFormat.HTML,
        backend=HTMLDocumentBackend,
        filename="test",
    )
    # The backend is given its own stream over the same bytes.
    backend = HTMLDocumentBackend(
        in_doc=in_doc,
        path_or_stream=BytesIO(raw_html),
    )
    doc: DoclingDocument = backend.convert()
    # The first text item of the converted document is compared.
    assert doc.texts[0].text == "Hello World!"
def test_extract_parent_hyperlinks():
    """Check the hyperlink attached to text extracted from a ``div``.

    With ``find_parent_annotation=True``, the first annotated text extracted
    from the first ``div`` must carry the ``href`` of the first ``a`` tag.
    """
    fixture = Path("./tests/data/html/hyperlink_04.html")
    input_doc = InputDocument(
        path_or_stream=fixture,
        format=InputFormat.HTML,
        backend=HTMLDocumentBackend,
        filename="test",
    )
    html_backend = HTMLDocumentBackend(in_doc=input_doc, path_or_stream=fixture)

    container = html_backend.soup.find("div")
    anchor = html_backend.soup.find("a")

    annotated = html_backend._extract_text_and_hyperlink_recursively(
        container, find_parent_annotation=True
    )
    first_link = annotated[0].hyperlink
    assert str(first_link) == anchor.get("href")
def get_html_paths():
    """Return all ``*.html`` files under ``./tests/data/html/``, sorted.

    The search is recursive, so files in subdirectories are included.
    """
    html_root = Path("./tests/data/html/")
    return sorted(html_root.rglob("*.html"))
def get_converter():
    """Build a ``DocumentConverter`` whose allowed formats are HTML only."""
    return DocumentConverter(allowed_formats=[InputFormat.HTML])
def test_e2e_html_conversions():
    """Convert every HTML fixture and verify its exports against ground truth.

    For each file, the Markdown export, the indented-text export and the
    document itself are checked against ``groundtruth/docling_v2/<name>``
    with the ``.md``, ``.itxt`` and ``.json`` suffixes respectively.
    """
    html_paths = get_html_paths()
    converter = get_converter()

    for source in html_paths:
        # Ground truth sits next to the fixture's parent directory.
        gt_prefix = str(
            source.parent.parent / "groundtruth" / "docling_v2" / source.name
        )

        result: ConversionResult = converter.convert(source)
        document: DoclingDocument = result.document

        markdown: str = document.export_to_markdown()
        md_ok = verify_export(markdown, gt_prefix + ".md", generate=GENERATE)
        assert md_ok, "export to md"

        indented: str = document._export_to_indented_text(
            max_text_len=70, explicit_tables=False
        )
        itxt_ok = verify_export(indented, gt_prefix + ".itxt", generate=GENERATE)
        assert itxt_ok, "export to indented-text"

        assert verify_document(document, gt_prefix + ".json", GENERATE)
def test_html_furniture():
    """Check how HTML content is split between the body and furniture layers.

    The default Markdown export is expected to contain only the heading and
    the paragraph after it. Exporting with both ``ContentLayer.BODY`` and
    ``ContentLayer.FURNITURE`` is expected to add the paragraph before the
    heading and the ``<footer>`` content.
    """
    # The fixture is well-formed HTML: the closing tag is ``</html>``, so the
    # test does not depend on how the parser recovers from a truncated end tag.
    raw_html = (
        b"<html><body><p>Initial content with some <strong>bold text</strong></p>"
        b"<h1>Main Heading</h1>"
        b"<p>Some Content</p>"
        b"<footer><p>Some Footer Content</p></footer></body></html>"
    )
    in_doc = InputDocument(
        path_or_stream=BytesIO(raw_html),
        format=InputFormat.HTML,
        backend=HTMLDocumentBackend,
        filename="test",
    )
    # The backend is given its own stream over the same bytes.
    backend = HTMLDocumentBackend(
        in_doc=in_doc,
        path_or_stream=BytesIO(raw_html),
    )
    doc: DoclingDocument = backend.convert()

    # Default export: no content layers are requested explicitly.
    md_body = doc.export_to_markdown()
    assert md_body == "# Main Heading\n\nSome Content"

    # Explicitly request both layers to see the leading paragraph and footer.
    md_all = doc.export_to_markdown(
        included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE}
    )
    assert md_all == (
        "Initial content with some bold text\n\n# Main Heading\n\nSome Content\n\n"
        "Some Footer Content"
    )