from io import BytesIO from pathlib import Path, PurePath from unittest.mock import Mock, mock_open, patch import pytest from docling_core.types.doc import PictureItem from docling_core.types.doc.document import ContentLayer from pydantic import AnyUrl, ValidationError from docling.backend.html_backend import HTMLDocumentBackend from docling.datamodel.backend_options import HTMLBackendOptions from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ( ConversionResult, DoclingDocument, InputDocument, SectionHeaderItem, ) from docling.document_converter import DocumentConverter, HTMLFormatOption from .test_data_gen_flag import GEN_TEST_DATA from .verify_utils import verify_document, verify_export GENERATE = GEN_TEST_DATA def test_html_backend_options(): options = HTMLBackendOptions() assert options.kind == "html" assert not options.fetch_images assert options.source_uri is None url = "http://example.com" source_location = AnyUrl(url=url) options = HTMLBackendOptions(source_uri=source_location) assert options.source_uri == source_location source_location = PurePath("/local/path/to/file.html") options = HTMLBackendOptions(source_uri=source_location) assert options.source_uri == source_location with pytest.raises(ValidationError, match="Input is not a valid path"): HTMLBackendOptions(source_uri=12345) def test_resolve_relative_path(): html_path = Path("./tests/data/html/example_01.html") in_doc = InputDocument( path_or_stream=html_path, format=InputFormat.HTML, backend=HTMLDocumentBackend, filename="test", ) html_doc = HTMLDocumentBackend(path_or_stream=html_path, in_doc=in_doc) html_doc.base_path = "/local/path/to/file.html" relative_path = "subdir/another.html" expected_abs_loc = "/local/path/to/subdir/another.html" assert html_doc._resolve_relative_path(relative_path) == expected_abs_loc absolute_path = "/absolute/path/to/file.html" assert html_doc._resolve_relative_path(absolute_path) == absolute_path html_doc.base_path = "http://my_host.com" protocol_relative_url = "//example.com/file.html" expected_abs_loc = "https://example.com/file.html" assert html_doc._resolve_relative_path(protocol_relative_url) == expected_abs_loc html_doc.base_path = "http://example.com" remote_relative_path = "subdir/file.html" expected_abs_loc = "http://example.com/subdir/file.html" assert html_doc._resolve_relative_path(remote_relative_path) == expected_abs_loc html_doc.base_path = "http://example.com" remote_relative_path = "https://my_host.com/my_page.html" expected_abs_loc = "https://my_host.com/my_page.html" assert html_doc._resolve_relative_path(remote_relative_path) == expected_abs_loc html_doc.base_path = "http://example.com" remote_relative_path = "/static/images/my_image.png" expected_abs_loc = "http://example.com/static/images/my_image.png" assert html_doc._resolve_relative_path(remote_relative_path) == expected_abs_loc html_doc.base_path = None relative_path = "subdir/file.html" assert html_doc._resolve_relative_path(relative_path) == relative_path def test_heading_levels(): in_path = Path("tests/data/html/wiki_duck.html") in_doc = InputDocument( path_or_stream=in_path, format=InputFormat.HTML, backend=HTMLDocumentBackend, ) backend = HTMLDocumentBackend( in_doc=in_doc, path_or_stream=in_path, ) doc = backend.convert() found_lvl_1 = found_lvl_2 = False for item, _ in doc.iterate_items(): if isinstance(item, SectionHeaderItem): if item.text == "Etymology": found_lvl_1 = True # h2 becomes level 1 because of h1 as title assert item.level == 1 elif item.text == "Feeding": found_lvl_2 = True # h3 becomes level 2 because of h1 as title assert item.level == 2 assert found_lvl_1 and found_lvl_2 def test_ordered_lists(): test_set: list[tuple[bytes, str]] = [] test_set.append( ( b"
  1. 1st item
  2. 2nd item
", "1. 1st item\n2. 2nd item", ) ) test_set.append( ( b'
  1. 1st item
  2. 2nd item
', "1. 1st item\n2. 2nd item", ) ) test_set.append( ( b'
  1. 1st item
  2. 2nd item
', "2. 1st item\n3. 2nd item", ) ) test_set.append( ( b'
  1. 1st item
  2. 2nd item
', "0. 1st item\n1. 2nd item", ) ) test_set.append( ( b'
  1. 1st item
  2. 2nd item
', "1. 1st item\n2. 2nd item", ) ) test_set.append( ( b'
  1. 1st item
  2. 2nd item
', "1. 1st item\n2. 2nd item", ) ) for idx, pair in enumerate(test_set): in_doc = InputDocument( path_or_stream=BytesIO(pair[0]), format=InputFormat.HTML, backend=HTMLDocumentBackend, filename="test", ) backend = HTMLDocumentBackend( in_doc=in_doc, path_or_stream=BytesIO(pair[0]), ) doc: DoclingDocument = backend.convert() assert doc assert doc.export_to_markdown() == pair[1], f"Error in case {idx}" def test_unicode_characters(): raw_html = "

Hello World!

".encode() # noqa: RUF001 in_doc = InputDocument( path_or_stream=BytesIO(raw_html), format=InputFormat.HTML, backend=HTMLDocumentBackend, filename="test", ) backend = HTMLDocumentBackend( in_doc=in_doc, path_or_stream=BytesIO(raw_html), ) doc: DoclingDocument = backend.convert() assert doc.texts[0].text == "Hello World!" def test_extract_parent_hyperlinks(): html_path = Path("./tests/data/html/hyperlink_04.html") in_doc = InputDocument( path_or_stream=html_path, format=InputFormat.HTML, backend=HTMLDocumentBackend, filename="test", ) backend = HTMLDocumentBackend( in_doc=in_doc, path_or_stream=html_path, ) div_tag = backend.soup.find("div") a_tag = backend.soup.find("a") annotated_text_list = backend._extract_text_and_hyperlink_recursively( div_tag, find_parent_annotation=True ) assert str(annotated_text_list[0].hyperlink) == a_tag.get("href") @pytest.fixture(scope="module") def html_paths() -> list[Path]: # Define the directory you want to search directory = Path("./tests/data/html/") # List all HTML files in the directory and its subdirectories html_files = sorted(directory.rglob("*.html")) return html_files def get_converter(): converter = DocumentConverter(allowed_formats=[InputFormat.HTML]) return converter def test_e2e_html_conversions(html_paths): converter = get_converter() for html_path in html_paths: gt_path = ( html_path.parent.parent / "groundtruth" / "docling_v2" / html_path.name ) conv_result: ConversionResult = converter.convert(html_path) doc: DoclingDocument = conv_result.document pred_md: str = doc.export_to_markdown() assert verify_export(pred_md, str(gt_path) + ".md", generate=GENERATE), ( "export to md" ) pred_itxt: str = doc._export_to_indented_text( max_text_len=70, explicit_tables=False ) assert verify_export(pred_itxt, str(gt_path) + ".itxt", generate=GENERATE), ( "export to indented-text" ) assert verify_document(doc, str(gt_path) + ".json", GENERATE) @patch("docling.backend.html_backend.requests.get") @patch("docling.backend.html_backend.open", new_callable=mock_open) def test_e2e_html_conversion_with_images(mock_local, mock_remote): source = "tests/data/html/example_01.html" image_path = "tests/data/html/example_image_01.png" with open(image_path, "rb") as f: img_bytes = f.read() # fetching image locally mock_local.return_value.__enter__.return_value = BytesIO(img_bytes) backend_options = HTMLBackendOptions( enable_local_fetch=True, fetch_images=True, source_uri=source ) converter = DocumentConverter( allowed_formats=[InputFormat.HTML], format_options={ InputFormat.HTML: HTMLFormatOption(backend_options=backend_options) }, ) res_local = converter.convert(source) mock_local.assert_called_once() assert res_local.document num_pic: int = 0 for element, _ in res_local.document.iterate_items(): if isinstance(element, PictureItem): assert element.image num_pic += 1 assert num_pic == 1, "No embedded picture was found in the converted file" # fetching image remotely mock_resp = Mock() mock_resp.status_code = 200 mock_resp.content = img_bytes mock_remote.return_value = mock_resp source_location = "https://example.com/example_01.html" backend_options = HTMLBackendOptions( enable_remote_fetch=True, fetch_images=True, source_uri=source_location ) converter = DocumentConverter( allowed_formats=[InputFormat.HTML], format_options={ InputFormat.HTML: HTMLFormatOption(backend_options=backend_options) }, ) res_remote = converter.convert(source) mock_remote.assert_called_once_with( "https://example.com/example_image_01.png", stream=True ) assert res_remote.document num_pic = 0 for element, _ in res_remote.document.iterate_items(): if isinstance(element, PictureItem): assert element.image assert element.image.mimetype == "image/png" num_pic += 1 assert num_pic == 1, "No embedded picture was found in the converted file" # both methods should generate the same DoclingDocument assert res_remote.document == res_local.document # checking exported formats gt_path = ( "tests/data/groundtruth/docling_v2/" + str(Path(source).stem) + "_images.html" ) pred_md: str = res_local.document.export_to_markdown() assert verify_export(pred_md, gt_path + ".md", generate=GENERATE) assert verify_document(res_local.document, gt_path + ".json", GENERATE) def test_html_furniture(): raw_html = ( b"

Initial content with some bold text

" b"

Main Heading

" b"

Some Content

" b"