fix(html): tackle paragraphs with block-level elements (#2720)

Restructure `p` elements that contain block-level elements anywhere inside, the way browsers do.
Fix wrong type annotations.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
Cesar Berrospi Ramis
2025-12-05 12:52:53 +01:00
committed by GitHub
parent aebe25cf00
commit d007ba0e6f
6 changed files with 1071 additions and 10 deletions

View File

@@ -3,6 +3,7 @@ from pathlib import Path, PurePath
from unittest.mock import Mock, mock_open, patch
import pytest
from bs4 import BeautifulSoup
from docling_core.types.doc import PictureItem
from docling_core.types.doc.document import ContentLayer
from pydantic import AnyUrl, ValidationError
@@ -523,3 +524,38 @@ def test_is_rich_table_cell(html_paths):
assert num_cells == len(gt_cells[idx_t]), (
f"Cell number does not match in table {idx_t}"
)
# Table markup reused by the "paragraph that only wraps a table" case below.
_TABLE_HTML = (
    "<table><tr><th>Name</th><th>Age</th></tr><tr><td>Alice</td><td>29</td></tr>"
    "<tr><td>Bob</td><td>34</td></tr></table>"
)

# Parametrization data: each entry is (input HTML, expected serialized HTML
# once the invalid paragraph structure has been repaired).
data_fix_par = [
    # Heading between two text runs inside a bare paragraph.
    (
        "<p>Text<h2>Heading</h2>More text</p>",
        "<p>Text</p><h2>Heading</h2><p>More text</p>",
    ),
    # Same situation, embedded in a complete html/body document.
    (
        "<html><body><p>Some text<h2>A heading</h2>More text</p></body></html>",
        "<html><body><p>Some text</p><h2>A heading</h2><p>More text</p></body></html>",
    ),
    # Inline content following the heading ends up in its own paragraph.
    (
        "<p>Some text<h2>A heading</h2><i>Italics</i></p>",
        "<p>Some text</p><h2>A heading</h2><p><i>Italics</i></p>",
    ),
    # A paragraph nested inside another paragraph.
    (
        "<p>Some text<p>Another paragraph</p>More text</p>",
        "<p>Some text</p><p>Another paragraph</p><p>More text</p>",
    ),
    # A paragraph whose only content is a table: the wrapper disappears.
    (
        f"<p>{_TABLE_HTML}</p>",
        _TABLE_HTML,
    ),
]
@pytest.mark.parametrize("html,expected", data_fix_par)
def test_fix_invalid_paragraph_structure(html, expected):
    """Verify the paragraph repair performed by _fix_invalid_paragraph_structure.

    Each parametrized case supplies an HTML snippet and the serialization
    expected after the repair. The helper's return value is not used: the
    parsed tree passed in is serialized afterwards and compared, so the test
    relies on the tree being modified in place.
    """
    document_tree = BeautifulSoup(html, "html.parser")
    HTMLDocumentBackend._fix_invalid_paragraph_structure(document_tree)
    serialized = str(document_tree)
    assert serialized == expected