fix(html): tackle paragraphs with block-level elements (#2720)

Handle p elements that contain block-level elements anywhere inside them, as browsers do.
Fix wrong type annotations.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
Cesar Berrospi Ramis
2025-12-05 12:52:53 +01:00
committed by GitHub
parent aebe25cf00
commit d007ba0e6f
6 changed files with 1071 additions and 10 deletions

View File

@@ -0,0 +1,15 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: title: 1
item-2 at level 2: text: 1st paragraph
item-3 at level 2: table with [6x3]
item-4 at level 3: unspecified: group rich_cell_group_1_0_0
item-5 at level 4: text: 2
item-6 at level 3: unspecified: group rich_cell_group_1_0_0
item-7 at level 4: text: 3
item-8 at level 3: unspecified: group rich_cell_group_1_1_0
item-9 at level 4: text: 4
item-10 at level 3: unspecified: group rich_cell_group_1_1_5
item-11 at level 4: text: 19
item-12 at level 4: text: 20
item-13 at level 1: title: 21
item-14 at level 2: text: 2nd paragraph

View File

@@ -0,0 +1,782 @@
{
"schema_name": "DoclingDocument",
"version": "1.8.0",
"name": "html_heading_in_p",
"origin": {
"mimetype": "text/html",
"binary_hash": 6321020421104590329,
"filename": "html_heading_in_p.html"
},
"furniture": {
"self_ref": "#/furniture",
"children": [],
"content_layer": "furniture",
"name": "_root_",
"label": "unspecified"
},
"body": {
"self_ref": "#/body",
"children": [
{
"$ref": "#/texts/0"
},
{
"$ref": "#/texts/1"
},
{
"$ref": "#/texts/8"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
"groups": [
{
"self_ref": "#/groups/0",
"parent": {
"$ref": "#/tables/0"
},
"children": [
{
"$ref": "#/texts/3"
}
],
"content_layer": "body",
"name": "rich_cell_group_1_0_0",
"label": "unspecified"
},
{
"self_ref": "#/groups/1",
"parent": {
"$ref": "#/tables/0"
},
"children": [
{
"$ref": "#/texts/4"
}
],
"content_layer": "body",
"name": "rich_cell_group_1_0_0",
"label": "unspecified"
},
{
"self_ref": "#/groups/2",
"parent": {
"$ref": "#/tables/0"
},
"children": [
{
"$ref": "#/texts/5"
}
],
"content_layer": "body",
"name": "rich_cell_group_1_1_0",
"label": "unspecified"
},
{
"self_ref": "#/groups/3",
"parent": {
"$ref": "#/tables/0"
},
"children": [
{
"$ref": "#/texts/6"
},
{
"$ref": "#/texts/7"
}
],
"content_layer": "body",
"name": "rich_cell_group_1_1_5",
"label": "unspecified"
}
],
"texts": [
{
"self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
"children": [],
"content_layer": "furniture",
"label": "title",
"prov": [],
"orig": "Headings inside paragraphs in HTML",
"text": "Headings inside paragraphs in HTML"
},
{
"self_ref": "#/texts/1",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/2"
},
{
"$ref": "#/tables/0"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "1",
"text": "1"
},
{
"self_ref": "#/texts/2",
"parent": {
"$ref": "#/texts/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "1st paragraph",
"text": "1st paragraph"
},
{
"self_ref": "#/texts/3",
"parent": {
"$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "2",
"text": "2",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/4",
"parent": {
"$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "3",
"text": "3",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/5",
"parent": {
"$ref": "#/groups/2"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "4",
"text": "4",
"formatting": {
"bold": true,
"italic": false,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
"self_ref": "#/texts/6",
"parent": {
"$ref": "#/groups/3"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "19",
"text": "19"
},
{
"self_ref": "#/texts/7",
"parent": {
"$ref": "#/groups/3"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "20",
"text": "20"
},
{
"self_ref": "#/texts/8",
"parent": {
"$ref": "#/body"
},
"children": [
{
"$ref": "#/texts/9"
}
],
"content_layer": "body",
"label": "title",
"prov": [],
"orig": "21",
"text": "21"
},
{
"self_ref": "#/texts/9",
"parent": {
"$ref": "#/texts/8"
},
"children": [],
"content_layer": "body",
"label": "text",
"prov": [],
"orig": "2nd paragraph",
"text": "2nd paragraph"
}
],
"pictures": [],
"tables": [
{
"self_ref": "#/tables/0",
"parent": {
"$ref": "#/texts/1"
},
"children": [
{
"$ref": "#/groups/0"
},
{
"$ref": "#/groups/1"
},
{
"$ref": "#/groups/2"
},
{
"$ref": "#/groups/3"
}
],
"content_layer": "body",
"label": "table",
"prov": [],
"captions": [],
"references": [],
"footnotes": [],
"data": {
"table_cells": [
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "2",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false,
"ref": {
"$ref": "#/groups/0"
}
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "3",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false,
"ref": {
"$ref": "#/groups/1"
}
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "4",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false,
"ref": {
"$ref": "#/groups/2"
}
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "5",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "6",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "7",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "8",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "9",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "10",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "11",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "12",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "13",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "14",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "15",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "16",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 5,
"end_row_offset_idx": 6,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "17",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 5,
"end_row_offset_idx": 6,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "18",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 5,
"end_row_offset_idx": 6,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "19 \n20",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false,
"ref": {
"$ref": "#/groups/3"
}
}
],
"num_rows": 6,
"num_cols": 3,
"grid": [
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "2",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "3",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 0,
"end_row_offset_idx": 1,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "4",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "5",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "6",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 1,
"end_row_offset_idx": 2,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "7",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "8",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "9",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 2,
"end_row_offset_idx": 3,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "10",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "11",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "12",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 3,
"end_row_offset_idx": 4,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "13",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "14",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "15",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 4,
"end_row_offset_idx": 5,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "16",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
],
[
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 5,
"end_row_offset_idx": 6,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
"text": "17",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 5,
"end_row_offset_idx": 6,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
"text": "18",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
},
{
"row_span": 1,
"col_span": 1,
"start_row_offset_idx": 5,
"end_row_offset_idx": 6,
"start_col_offset_idx": 2,
"end_col_offset_idx": 3,
"text": "19 \n20",
"column_header": false,
"row_header": false,
"row_section": false,
"fillable": false
}
]
]
},
"annotations": []
}
],
"key_value_items": [],
"form_items": [],
"pages": {}
}

View File

@@ -0,0 +1,15 @@
# 1
1st paragraph
| **2** | **3** | **4** |
|---------|---------|---------|
| 5 | 6 | 7 |
| 8 | 9 | 10 |
| 11 | 12 | 13 |
| 14 | 15 | 16 |
| 17 | 18 | 19 20 |
# 21
2nd paragraph

99
tests/data/html/html_heading_in_p.html vendored Normal file
View File

@@ -0,0 +1,99 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>Headings inside paragraphs in HTML</title>
</head>
<body>
<p>
<h1 id="1"><span style="color: rgb(0,0,0);">1</span></h1>
<div>
<p>1st paragraph<br /></p>
<div>
<table>
<colgroup>
<col />
<col />
<col />
</colgroup>
<tbody>
<tr>
<td>
<p><strong><span style="color: rgb(0,0,0);">2</span></strong></p>
</td>
<td>
<p><strong><span style="color: rgb(0,0,0);">3</span></strong></p>
</td>
<td>
<p><strong><span style="color: rgb(0,0,0);">4</span></strong></p>
</td>
</tr>
<tr>
<td>
<p><span style="color: rgb(0,0,0);">5</span></p>
</td>
<td>
<p><span style="color: rgb(0,0,0);">6</span>
</p>
</td>
<td>
<p><span style="color: rgb(0,0,0);">7</span></p>
</td>
</tr>
<tr>
<td>
<p><span style="color: rgb(0,0,0);">8</span></p>
</td>
<td>
<p><span style="color: rgb(0,0,0);">9</span></p>
</td>
<td>
<p><span style="color: rgb(0,0,0);">10</span></p>
</td>
</tr>
<tr>
<td>
<p><span style="color: rgb(0,0,0);">11</span></p>
</td>
<td>
<p><span style="color: rgb(0,0,0);">12</span></p>
</td>
<td>
<p><span style="color: rgb(0,0,0);">13</span></p>
</td>
</tr>
<tr>
<td>
<p><span style="color: rgb(255,0,255);">14</span></p>
</td>
<td>
<p><span style="color: rgb(0,0,0);">15</span></p>
</td>
<td>
<p><span style="color: rgb(0,0,0);">16</span></p>
</td>
</tr>
<tr>
<td>
<p><span style="color: rgb(255,0,255);">17</span></p>
</td>
<td>
<p><span style="color: rgb(0,0,0);">18</span></p>
</td>
<td>
<p><span style="color: rgb(0,0,0);">19</span></p>
<p><span style="color: rgb(0,0,0);">20</span></p>
</td>
</tr>
</tbody>
</table>
</div>
<h1 id="21"><span style="color: rgb(0,0,0);">21</span></h1>
</div>
<br /></p>
<p>2nd paragraph</p>
</body>
</html>

View File

@@ -3,6 +3,7 @@ from pathlib import Path, PurePath
from unittest.mock import Mock, mock_open, patch
import pytest
from bs4 import BeautifulSoup
from docling_core.types.doc import PictureItem
from docling_core.types.doc.document import ContentLayer
from pydantic import AnyUrl, ValidationError
@@ -523,3 +524,38 @@ def test_is_rich_table_cell(html_paths):
assert num_cells == len(gt_cells[idx_t]), (
f"Cell number does not match in table {idx_t}"
)
# Table markup shared by the input and the expected output of the table case.
_NAME_AGE_TABLE = (
    "<table><tr><th>Name</th><th>Age</th></tr>"
    "<tr><td>Alice</td><td>29</td></tr>"
    "<tr><td>Bob</td><td>34</td></tr></table>"
)

# Each entry is (input HTML, expected HTML after the paragraph fix).
data_fix_par = [
    # Heading between two text runs of a bare paragraph.
    (
        "<p>Text<h2>Heading</h2>More text</p>",
        "<p>Text</p><h2>Heading</h2><p>More text</p>",
    ),
    # Same situation inside a full <html><body> document.
    (
        "<html><body><p>Some text<h2>A heading</h2>More text</p></body></html>",
        "<html><body><p>Some text</p><h2>A heading</h2><p>More text</p></body></html>",
    ),
    # Inline element following the heading ends up in its own paragraph.
    (
        "<p>Some text<h2>A heading</h2><i>Italics</i></p>",
        "<p>Some text</p><h2>A heading</h2><p><i>Italics</i></p>",
    ),
    # Paragraph nested inside another paragraph.
    (
        "<p>Some text<p>Another paragraph</p>More text</p>",
        "<p>Some text</p><p>Another paragraph</p><p>More text</p>",
    ),
    # Paragraph whose only content is a table: the wrapper disappears.
    (
        "<p>" + _NAME_AGE_TABLE + "</p>",
        _NAME_AGE_TABLE,
    ),
]
@pytest.mark.parametrize("html,expected", data_fix_par)
def test_fix_invalid_paragraph_structure(html, expected):
    """Check HTMLDocumentBackend._fix_invalid_paragraph_structure on each case.

    Every case in ``data_fix_par`` pairs an HTML snippet with the markup the
    parsed tree must serialise to after the fix has been applied.
    """
    # Parsed with "html.parser", as in every case of this test.
    tree = BeautifulSoup(html, "html.parser")
    # The return value is not used: the test inspects the same tree afterwards.
    HTMLDocumentBackend._fix_invalid_paragraph_structure(tree)
    serialised = str(tree)
    assert serialised == expected