Files
docling/tests/test_backend_html.py
Cesar Berrospi Ramis 0ba8d5d9e3 fix(html): slow table parsing (#2582)
* fix(html): simplify parsing of simple table cells

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* tests(html): add test for rich table cells

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* fix(html): ensure table cells with formatted text are parsed as RichTableCell

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* refactor(html): simplify process_rich_table_cells since only rich cells are processed

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* fix(html): formatted cell runs should be parsed as text items respecting the order

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore: pin latest docling-core and update uv.lock

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore: upgrade dependencies on uv.lock

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

---------

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
2025-11-06 05:25:36 +01:00

526 lines
17 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from io import BytesIO
from pathlib import Path, PurePath
from unittest.mock import Mock, mock_open, patch
import pytest
from docling_core.types.doc import PictureItem
from docling_core.types.doc.document import ContentLayer
from pydantic import AnyUrl, ValidationError
from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.backend_options import HTMLBackendOptions
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import (
ConversionResult,
DoclingDocument,
InputDocument,
SectionHeaderItem,
)
from docling.document_converter import DocumentConverter, HTMLFormatOption
from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document, verify_export
GENERATE = GEN_TEST_DATA
def test_html_backend_options():
options = HTMLBackendOptions()
assert options.kind == "html"
assert not options.fetch_images
assert options.source_uri is None
url = "http://example.com"
source_location = AnyUrl(url=url)
options = HTMLBackendOptions(source_uri=source_location)
assert options.source_uri == source_location
source_location = PurePath("/local/path/to/file.html")
options = HTMLBackendOptions(source_uri=source_location)
assert options.source_uri == source_location
with pytest.raises(ValidationError, match="Input is not a valid path"):
HTMLBackendOptions(source_uri=12345)
def test_resolve_relative_path():
html_path = Path("./tests/data/html/example_01.html")
in_doc = InputDocument(
path_or_stream=html_path,
format=InputFormat.HTML,
backend=HTMLDocumentBackend,
filename="test",
)
html_doc = HTMLDocumentBackend(path_or_stream=html_path, in_doc=in_doc)
html_doc.base_path = "/local/path/to/file.html"
relative_path = "subdir/another.html"
expected_abs_loc = "/local/path/to/subdir/another.html"
assert html_doc._resolve_relative_path(relative_path) == expected_abs_loc
absolute_path = "/absolute/path/to/file.html"
assert html_doc._resolve_relative_path(absolute_path) == absolute_path
html_doc.base_path = "http://my_host.com"
protocol_relative_url = "//example.com/file.html"
expected_abs_loc = "https://example.com/file.html"
assert html_doc._resolve_relative_path(protocol_relative_url) == expected_abs_loc
html_doc.base_path = "http://example.com"
remote_relative_path = "subdir/file.html"
expected_abs_loc = "http://example.com/subdir/file.html"
assert html_doc._resolve_relative_path(remote_relative_path) == expected_abs_loc
html_doc.base_path = "http://example.com"
remote_relative_path = "https://my_host.com/my_page.html"
expected_abs_loc = "https://my_host.com/my_page.html"
assert html_doc._resolve_relative_path(remote_relative_path) == expected_abs_loc
html_doc.base_path = "http://example.com"
remote_relative_path = "/static/images/my_image.png"
expected_abs_loc = "http://example.com/static/images/my_image.png"
assert html_doc._resolve_relative_path(remote_relative_path) == expected_abs_loc
html_doc.base_path = None
relative_path = "subdir/file.html"
assert html_doc._resolve_relative_path(relative_path) == relative_path
def test_heading_levels():
in_path = Path("tests/data/html/wiki_duck.html")
in_doc = InputDocument(
path_or_stream=in_path,
format=InputFormat.HTML,
backend=HTMLDocumentBackend,
)
backend = HTMLDocumentBackend(
in_doc=in_doc,
path_or_stream=in_path,
)
doc = backend.convert()
found_lvl_1 = found_lvl_2 = False
for item, _ in doc.iterate_items():
if isinstance(item, SectionHeaderItem):
if item.text == "Etymology":
found_lvl_1 = True
# h2 becomes level 1 because of h1 as title
assert item.level == 1
elif item.text == "Feeding":
found_lvl_2 = True
# h3 becomes level 2 because of h1 as title
assert item.level == 2
assert found_lvl_1 and found_lvl_2
def test_ordered_lists():
test_set: list[tuple[bytes, str]] = []
test_set.append(
(
b"<html><body><ol><li>1st item</li><li>2nd item</li></ol></body></html>",
"1. 1st item\n2. 2nd item",
)
)
test_set.append(
(
b'<html><body><ol start="1"><li>1st item</li><li>2nd item</li></ol></body></html>',
"1. 1st item\n2. 2nd item",
)
)
test_set.append(
(
b'<html><body><ol start="2"><li>1st item</li><li>2nd item</li></ol></body></html>',
"2. 1st item\n3. 2nd item",
)
)
test_set.append(
(
b'<html><body><ol start="0"><li>1st item</li><li>2nd item</li></ol></body></html>',
"0. 1st item\n1. 2nd item",
)
)
test_set.append(
(
b'<html><body><ol start="-5"><li>1st item</li><li>2nd item</li></ol></body></html>',
"1. 1st item\n2. 2nd item",
)
)
test_set.append(
(
b'<html><body><ol start="foo"><li>1st item</li><li>2nd item</li></ol></body></html>',
"1. 1st item\n2. 2nd item",
)
)
for idx, pair in enumerate(test_set):
in_doc = InputDocument(
path_or_stream=BytesIO(pair[0]),
format=InputFormat.HTML,
backend=HTMLDocumentBackend,
filename="test",
)
backend = HTMLDocumentBackend(
in_doc=in_doc,
path_or_stream=BytesIO(pair[0]),
)
doc: DoclingDocument = backend.convert()
assert doc
assert doc.export_to_markdown() == pair[1], f"Error in case {idx}"
def test_unicode_characters():
raw_html = "<html><body><h1>Hello World!</h1></body></html>".encode() # noqa: RUF001
in_doc = InputDocument(
path_or_stream=BytesIO(raw_html),
format=InputFormat.HTML,
backend=HTMLDocumentBackend,
filename="test",
)
backend = HTMLDocumentBackend(
in_doc=in_doc,
path_or_stream=BytesIO(raw_html),
)
doc: DoclingDocument = backend.convert()
assert doc.texts[0].text == "Hello World!"
def test_extract_parent_hyperlinks():
html_path = Path("./tests/data/html/hyperlink_04.html")
in_doc = InputDocument(
path_or_stream=html_path,
format=InputFormat.HTML,
backend=HTMLDocumentBackend,
filename="test",
)
backend = HTMLDocumentBackend(
in_doc=in_doc,
path_or_stream=html_path,
)
div_tag = backend.soup.find("div")
a_tag = backend.soup.find("a")
annotated_text_list = backend._extract_text_and_hyperlink_recursively(
div_tag, find_parent_annotation=True
)
assert str(annotated_text_list[0].hyperlink) == a_tag.get("href")
@pytest.fixture(scope="module")
def html_paths() -> list[Path]:
# Define the directory you want to search
directory = Path("./tests/data/html/")
# List all HTML files in the directory and its subdirectories
html_files = sorted(directory.rglob("*.html"))
return html_files
def get_converter():
converter = DocumentConverter(allowed_formats=[InputFormat.HTML])
return converter
def test_e2e_html_conversions(html_paths):
converter = get_converter()
for html_path in html_paths:
gt_path = (
html_path.parent.parent / "groundtruth" / "docling_v2" / html_path.name
)
conv_result: ConversionResult = converter.convert(html_path)
doc: DoclingDocument = conv_result.document
pred_md: str = doc.export_to_markdown()
assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), (
"export to md"
)
pred_itxt: str = doc._export_to_indented_text(
max_text_len=70, explicit_tables=False
)
assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), (
"export to indented-text"
)
assert verify_document(doc, str(gt_path) + ".json", GENERATE)
@patch("docling.backend.html_backend.requests.get")
@patch("docling.backend.html_backend.open", new_callable=mock_open)
def test_e2e_html_conversion_with_images(mock_local, mock_remote):
source = "tests/data/html/example_01.html"
image_path = "tests/data/html/example_image_01.png"
with open(image_path, "rb") as f:
img_bytes = f.read()
# fetching image locally
mock_local.return_value.__enter__.return_value = BytesIO(img_bytes)
backend_options = HTMLBackendOptions(
enable_local_fetch=True, fetch_images=True, source_uri=source
)
converter = DocumentConverter(
allowed_formats=[InputFormat.HTML],
format_options={
InputFormat.HTML: HTMLFormatOption(backend_options=backend_options)
},
)
res_local = converter.convert(source)
mock_local.assert_called_once()
assert res_local.document
num_pic: int = 0
for element, _ in res_local.document.iterate_items():
if isinstance(element, PictureItem):
assert element.image
num_pic += 1
assert num_pic == 1, "No embedded picture was found in the converted file"
# fetching image remotely
mock_resp = Mock()
mock_resp.status_code = 200
mock_resp.content = img_bytes
mock_remote.return_value = mock_resp
source_location = "https://example.com/example_01.html"
backend_options = HTMLBackendOptions(
enable_remote_fetch=True, fetch_images=True, source_uri=source_location
)
converter = DocumentConverter(
allowed_formats=[InputFormat.HTML],
format_options={
InputFormat.HTML: HTMLFormatOption(backend_options=backend_options)
},
)
res_remote = converter.convert(source)
mock_remote.assert_called_once_with(
"https://example.com/example_image_01.png", stream=True
)
assert res_remote.document
num_pic = 0
for element, _ in res_remote.document.iterate_items():
if isinstance(element, PictureItem):
assert element.image
assert element.image.mimetype == "image/png"
num_pic += 1
assert num_pic == 1, "No embedded picture was found in the converted file"
# both methods should generate the same DoclingDocument
assert res_remote.document == res_local.document
# checking exported formats
gt_path = (
"tests/data/groundtruth/docling_v2/" + str(Path(source).stem) + "_images.html"
)
pred_md: str = res_local.document.export_to_markdown()
assert verify_export(pred_md, gt_path + ".md", generate=GENERATE)
assert verify_document(res_local.document, gt_path + ".json", GENERATE)
def test_html_furniture():
raw_html = (
b"<html><body><p>Initial content with some <strong>bold text</strong></p>"
b"<h1>Main Heading</h1>"
b"<p>Some Content</p>"
b"<footer><p>Some Footer Content</p></footer></body></html"
)
in_doc = InputDocument(
path_or_stream=BytesIO(raw_html),
format=InputFormat.HTML,
backend=HTMLDocumentBackend,
filename="test",
)
backend = HTMLDocumentBackend(
in_doc=in_doc,
path_or_stream=BytesIO(raw_html),
)
doc: DoclingDocument = backend.convert()
md_body = doc.export_to_markdown()
assert md_body == "# Main Heading\n\nSome Content"
md_all = doc.export_to_markdown(
included_content_layers={ContentLayer.BODY, ContentLayer.FURNITURE}
)
assert md_all == (
"Initial content with some **bold text**\n\n# Main Heading\n\nSome Content\n\n"
"Some Footer Content"
)
def test_fetch_remote_images(monkeypatch):
source = "./tests/data/html/example_01.html"
# no image fetching: the image_fetch flag is False
backend_options = HTMLBackendOptions(
fetch_images=False, source_uri="http://example.com"
)
converter = DocumentConverter(
allowed_formats=[InputFormat.HTML],
format_options={
InputFormat.HTML: HTMLFormatOption(backend_options=backend_options)
},
)
with patch("docling.backend.html_backend.requests.get") as mocked_get:
res = converter.convert(source)
mocked_get.assert_not_called()
assert res.document
# no image fetching: the source location is False and enable_local_fetch is False
backend_options = HTMLBackendOptions(fetch_images=True)
converter = DocumentConverter(
allowed_formats=[InputFormat.HTML],
format_options={
InputFormat.HTML: HTMLFormatOption(backend_options=backend_options)
},
)
with (
patch("docling.backend.html_backend.requests.get") as mocked_get,
pytest.warns(
match="Fetching local resources is only allowed when set explicitly"
),
):
res = converter.convert(source)
mocked_get.assert_not_called()
assert res.document
# no image fetching: the enable_remote_fetch is False
backend_options = HTMLBackendOptions(
fetch_images=True, source_uri="http://example.com"
)
converter = DocumentConverter(
allowed_formats=[InputFormat.HTML],
format_options={
InputFormat.HTML: HTMLFormatOption(backend_options=backend_options)
},
)
with (
patch("docling.backend.html_backend.requests.get") as mocked_get,
pytest.warns(
match="Fetching remote resources is only allowed when set explicitly"
),
):
res = converter.convert(source)
mocked_get.assert_not_called()
assert res.document
# image fetching: all conditions apply, source location is remote
backend_options = HTMLBackendOptions(
enable_remote_fetch=True, fetch_images=True, source_uri="http://example.com"
)
converter = DocumentConverter(
allowed_formats=[InputFormat.HTML],
format_options={
InputFormat.HTML: HTMLFormatOption(backend_options=backend_options)
},
)
with (
patch("docling.backend.html_backend.requests.get") as mocked_get,
pytest.warns(match="a bytes-like object is required"),
):
res = converter.convert(source)
mocked_get.assert_called_once()
assert res.document
# image fetching: all conditions apply, local fetching allowed
backend_options = HTMLBackendOptions(
enable_local_fetch=True, fetch_images=True, source_uri=source
)
converter = DocumentConverter(
allowed_formats=[InputFormat.HTML],
format_options={
InputFormat.HTML: HTMLFormatOption(backend_options=backend_options)
},
)
with (
patch("docling.backend.html_backend.open") as mocked_open,
pytest.warns(match="a bytes-like object is required"),
):
res = converter.convert(source)
mocked_open.assert_called_once_with(
"tests/data/html/example_image_01.png", "rb"
)
assert res.document
def test_is_rich_table_cell(html_paths):
"""Test the function is_rich_table_cell."""
name = "html_rich_table_cells.html"
path = next(item for item in html_paths if item.name == name)
in_doc = InputDocument(
path_or_stream=path,
format=InputFormat.HTML,
backend=HTMLDocumentBackend,
filename=name,
)
backend = HTMLDocumentBackend(
in_doc=in_doc,
path_or_stream=path,
)
gt_cells: dict[int, list[bool]] = {}
# table: Basic duck facts
gt_cells[0] = [
False,
False,
False,
False,
False,
False,
False,
False,
False,
False,
False,
True,
False,
True,
True,
]
# table: Duck family tree
gt_cells[1] = [False, False, True, False, True, False, True, False]
# table: Duck-related actions
gt_cells[2] = [False, True, True, True, False, True, True]
# table: nested table
gt_cells[3] = [False, False, False, False, False, False]
# table: Famous Ducks with Images
gt_cells[4] = [
False,
False,
False,
False,
False,
True,
False,
False,
True,
False,
False,
True,
False,
False,
False,
]
for idx_t, table in enumerate(backend.soup.find_all("table")):
gt_it = iter(gt_cells[idx_t])
num_cells = 0
containers = table.find_all(["thead", "tbody"], recursive=False)
for part in containers:
for idx_r, row in enumerate(part.find_all("tr", recursive=False)):
cells = row.find_all(["td", "th"], recursive=False)
if not cells:
continue
for idx_c, cell in enumerate(cells):
assert next(gt_it) == backend._is_rich_table_cell(cell), (
f"Wrong cell type in table {idx_t}, row {idx_r}, col {idx_c} "
f"with text: {cell.text}"
)
num_cells += 1
assert num_cells == len(gt_cells[idx_t]), (
f"Cell number does not match in table {idx_t}"
)