From d007ba0e6f257fcc4e96aad388512455011b7952 Mon Sep 17 00:00:00 2001
From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
Date: Fri, 5 Dec 2025 12:52:53 +0100
Subject: [PATCH] fix(html): tackle paragraphs with block-level elements
(#2720)
Fix p elements having block-level elements anywhere inside as browsers do.
Fix wrong type annotations.
Signed-off-by: Cesar Berrospi Ramis
+_PARA_BREAKERS = {
+ "address",
+ "article",
+ "aside",
+ "blockquote",
+ "div",
+ "dl",
+ "fieldset",
+ "figcaption",
+ "figure",
+ "footer",
+ "form",
+ "h1",
+ "h2",
+ "h3",
+ "h4",
+ "h5",
+ "h6",
+ "header",
+ "hr",
+ "main",
+ "nav",
+ "ol",
+ "ul",
+ "li",
+ "p", # <p> inside <p> also forces closing
+ "pre",
+ "section",
+ "table",
+ "thead",
+ "tbody",
+ "tfoot",
+ "tr",
+ "td",
+}
+
_CODE_TAG_SET: Final = {"code", "kbd", "samp"}
_FORMAT_TAG_MAP: Final = {
@@ -199,7 +236,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
options: HTMLBackendOptions = HTMLBackendOptions(),
):
super().__init__(in_doc, path_or_stream, options)
- self.soup: Optional[Tag] = None
+ self.soup: Optional[BeautifulSoup] = None
self.path_or_stream: Union[BytesIO, Path] = path_or_stream
self.base_path: Optional[str] = str(options.source_uri)
@@ -276,6 +313,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
# remove any hidden tag
for tag in self.soup(hidden=True):
tag.decompose()
+ # fix flow content that is not permitted inside <p>
+ HTMLDocumentBackend._fix_invalid_paragraph_structure(self.soup)
content = self.soup.body or self.soup
# normalize elements that contain block-level breakers.
+
+ This function emulates browser logic when other block-level elements
+ are found inside a <p> element.
+ When a <p> is open and a block-level breaker (e.g., h1-h6, div, table)
+ appears, automatically close the <p>, emit it, then emit the breaker,
+ and if needed open a new <p> for trailing text.
+
+ Args:
+ soup: The HTML document. The DOM may be rewritten.
+ """
+
+ def _start_para():
+ nonlocal current_p
+ if current_p is None:
+ current_p = soup.new_tag("p")
+ new_nodes.append(current_p)
+
+ def _flush_para_if_empty():
+ nonlocal current_p
+ if current_p is not None and not current_p.get_text(strip=True):
+ # remove empty paragraph placeholder
+ if current_p in new_nodes:
+ new_nodes.remove(current_p)
+ current_p = None
+
+ paragraphs = soup.select(f"p:has({','.join(tag for tag in _PARA_BREAKERS)})")
+
+ for p in paragraphs:
+ parent = p.parent
+ if parent is None:
+ continue
+
+ new_nodes = []
+ current_p = None
+
+ for node in list(p.contents):
+ if isinstance(node, NavigableString):
+ text = str(node)
+ node.extract()
+ if text.strip():
+ _start_para()
+ if current_p is not None:
+ current_p.append(NavigableString(text))
+ # skip whitespace-only text
+ continue
+
+ if isinstance(node, Tag):
+ node.extract()
+
+ if node.name in _PARA_BREAKERS:
+ _flush_para_if_empty()
+ new_nodes.append(node)
+ continue
+ else:
+ _start_para()
+ if current_p is not None:
+ current_p.append(node)
+ continue
+
+ _flush_para_if_empty()
+
+ siblings = list(parent.children)
+ try:
+ idx = siblings.index(p)
+ except ValueError:
+ # p might have been removed
+ continue
+
+ p.extract()
+ for n in reversed(new_nodes):
+ parent.insert(idx, n)
+
@staticmethod
def _is_remote_url(value: str) -> bool:
parsed = urlparse(value)
@@ -528,15 +642,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
added_refs: list[RefItem] = []
buffer: AnnotatedTextList = AnnotatedTextList()
- def flush_buffer():
+ def _flush_buffer() -> None:
if not buffer:
- return added_refs
+ return
annotated_text_list: AnnotatedTextList = buffer.simplify_text_elements()
parts = annotated_text_list.split_by_newline()
buffer.clear()
if not "".join([el.text for el in annotated_text_list]):
- return added_refs
+ return
for annotated_text_list in parts:
with self._use_inline_group(annotated_text_list, doc):
@@ -569,12 +683,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
if isinstance(node, Tag):
name = node.name.lower()
if name == "img":
- flush_buffer()
+ _flush_buffer()
im_ref3 = self._emit_image(node, doc)
if im_ref3:
added_refs.append(im_ref3)
elif name in _FORMAT_TAG_MAP:
- flush_buffer()
+ _flush_buffer()
with self._use_format([name]):
wk = self._walk(node, doc)
added_refs.extend(wk)
@@ -583,11 +697,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
wk2 = self._walk(node, doc)
added_refs.extend(wk2)
elif name in _BLOCK_TAGS:
- flush_buffer()
+ _flush_buffer()
blk = self._handle_block(node, doc)
added_refs.extend(blk)
elif node.find(_BLOCK_TAGS):
- flush_buffer()
+ _flush_buffer()
wk3 = self._walk(node, doc)
added_refs.extend(wk3)
else:
@@ -600,7 +714,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
node, PreformattedString
):
if str(node).strip("\n\r") == "":
- flush_buffer()
+ _flush_buffer()
else:
buffer.extend(
self._extract_text_and_hyperlink_recursively(
@@ -608,7 +722,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
)
)
- flush_buffer()
+ _flush_buffer()
return added_refs
@staticmethod
diff --git a/tests/data/groundtruth/docling_v2/html_heading_in_p.html.itxt b/tests/data/groundtruth/docling_v2/html_heading_in_p.html.itxt
new file mode 100644
index 00000000..6842fdbe
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/html_heading_in_p.html.itxt
@@ -0,0 +1,15 @@
+item-0 at level 0: unspecified: group _root_
+ item-1 at level 1: title: 1
+ item-2 at level 2: text: 1st paragraph
+ item-3 at level 2: table with [6x3]
+ item-4 at level 3: unspecified: group rich_cell_group_1_0_0
+ item-5 at level 4: text: 2
+ item-6 at level 3: unspecified: group rich_cell_group_1_0_0
+ item-7 at level 4: text: 3
+ item-8 at level 3: unspecified: group rich_cell_group_1_1_0
+ item-9 at level 4: text: 4
+ item-10 at level 3: unspecified: group rich_cell_group_1_1_5
+ item-11 at level 4: text: 19
+ item-12 at level 4: text: 20
+ item-13 at level 1: title: 21
+ item-14 at level 2: text: 2nd paragraph
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/html_heading_in_p.html.json b/tests/data/groundtruth/docling_v2/html_heading_in_p.html.json
new file mode 100644
index 00000000..014f7478
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/html_heading_in_p.html.json
@@ -0,0 +1,782 @@
+{
+ "schema_name": "DoclingDocument",
+ "version": "1.8.0",
+ "name": "html_heading_in_p",
+ "origin": {
+ "mimetype": "text/html",
+ "binary_hash": 6321020421104590329,
+ "filename": "html_heading_in_p.html"
+ },
+ "furniture": {
+ "self_ref": "#/furniture",
+ "children": [],
+ "content_layer": "furniture",
+ "name": "_root_",
+ "label": "unspecified"
+ },
+ "body": {
+ "self_ref": "#/body",
+ "children": [
+ {
+ "$ref": "#/texts/0"
+ },
+ {
+ "$ref": "#/texts/1"
+ },
+ {
+ "$ref": "#/texts/8"
+ }
+ ],
+ "content_layer": "body",
+ "name": "_root_",
+ "label": "unspecified"
+ },
+ "groups": [
+ {
+ "self_ref": "#/groups/0",
+ "parent": {
+ "$ref": "#/tables/0"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/3"
+ }
+ ],
+ "content_layer": "body",
+ "name": "rich_cell_group_1_0_0",
+ "label": "unspecified"
+ },
+ {
+ "self_ref": "#/groups/1",
+ "parent": {
+ "$ref": "#/tables/0"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/4"
+ }
+ ],
+ "content_layer": "body",
+ "name": "rich_cell_group_1_0_0",
+ "label": "unspecified"
+ },
+ {
+ "self_ref": "#/groups/2",
+ "parent": {
+ "$ref": "#/tables/0"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/5"
+ }
+ ],
+ "content_layer": "body",
+ "name": "rich_cell_group_1_1_0",
+ "label": "unspecified"
+ },
+ {
+ "self_ref": "#/groups/3",
+ "parent": {
+ "$ref": "#/tables/0"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/6"
+ },
+ {
+ "$ref": "#/texts/7"
+ }
+ ],
+ "content_layer": "body",
+ "name": "rich_cell_group_1_1_5",
+ "label": "unspecified"
+ }
+ ],
+ "texts": [
+ {
+ "self_ref": "#/texts/0",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [],
+ "content_layer": "furniture",
+ "label": "title",
+ "prov": [],
+ "orig": "Headings inside paragraphs in HTML",
+ "text": "Headings inside paragraphs in HTML"
+ },
+ {
+ "self_ref": "#/texts/1",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/2"
+ },
+ {
+ "$ref": "#/tables/0"
+ }
+ ],
+ "content_layer": "body",
+ "label": "title",
+ "prov": [],
+ "orig": "1",
+ "text": "1"
+ },
+ {
+ "self_ref": "#/texts/2",
+ "parent": {
+ "$ref": "#/texts/1"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "1st paragraph",
+ "text": "1st paragraph"
+ },
+ {
+ "self_ref": "#/texts/3",
+ "parent": {
+ "$ref": "#/groups/0"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "2",
+ "text": "2",
+ "formatting": {
+ "bold": true,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/4",
+ "parent": {
+ "$ref": "#/groups/1"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "3",
+ "text": "3",
+ "formatting": {
+ "bold": true,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/5",
+ "parent": {
+ "$ref": "#/groups/2"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "4",
+ "text": "4",
+ "formatting": {
+ "bold": true,
+ "italic": false,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/6",
+ "parent": {
+ "$ref": "#/groups/3"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "19",
+ "text": "19"
+ },
+ {
+ "self_ref": "#/texts/7",
+ "parent": {
+ "$ref": "#/groups/3"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "20",
+ "text": "20"
+ },
+ {
+ "self_ref": "#/texts/8",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/9"
+ }
+ ],
+ "content_layer": "body",
+ "label": "title",
+ "prov": [],
+ "orig": "21",
+ "text": "21"
+ },
+ {
+ "self_ref": "#/texts/9",
+ "parent": {
+ "$ref": "#/texts/8"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [],
+ "orig": "2nd paragraph",
+ "text": "2nd paragraph"
+ }
+ ],
+ "pictures": [],
+ "tables": [
+ {
+ "self_ref": "#/tables/0",
+ "parent": {
+ "$ref": "#/texts/1"
+ },
+ "children": [
+ {
+ "$ref": "#/groups/0"
+ },
+ {
+ "$ref": "#/groups/1"
+ },
+ {
+ "$ref": "#/groups/2"
+ },
+ {
+ "$ref": "#/groups/3"
+ }
+ ],
+ "content_layer": "body",
+ "label": "table",
+ "prov": [],
+ "captions": [],
+ "references": [],
+ "footnotes": [],
+ "data": {
+ "table_cells": [
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 0,
+ "end_row_offset_idx": 1,
+ "start_col_offset_idx": 0,
+ "end_col_offset_idx": 1,
+ "text": "2",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false,
+ "ref": {
+ "$ref": "#/groups/0"
+ }
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 0,
+ "end_row_offset_idx": 1,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 2,
+ "text": "3",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false,
+ "ref": {
+ "$ref": "#/groups/1"
+ }
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 0,
+ "end_row_offset_idx": 1,
+ "start_col_offset_idx": 2,
+ "end_col_offset_idx": 3,
+ "text": "4",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false,
+ "ref": {
+ "$ref": "#/groups/2"
+ }
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 1,
+ "end_row_offset_idx": 2,
+ "start_col_offset_idx": 0,
+ "end_col_offset_idx": 1,
+ "text": "5",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 1,
+ "end_row_offset_idx": 2,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 2,
+ "text": "6",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 1,
+ "end_row_offset_idx": 2,
+ "start_col_offset_idx": 2,
+ "end_col_offset_idx": 3,
+ "text": "7",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 2,
+ "end_row_offset_idx": 3,
+ "start_col_offset_idx": 0,
+ "end_col_offset_idx": 1,
+ "text": "8",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 2,
+ "end_row_offset_idx": 3,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 2,
+ "text": "9",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 2,
+ "end_row_offset_idx": 3,
+ "start_col_offset_idx": 2,
+ "end_col_offset_idx": 3,
+ "text": "10",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 3,
+ "end_row_offset_idx": 4,
+ "start_col_offset_idx": 0,
+ "end_col_offset_idx": 1,
+ "text": "11",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 3,
+ "end_row_offset_idx": 4,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 2,
+ "text": "12",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 3,
+ "end_row_offset_idx": 4,
+ "start_col_offset_idx": 2,
+ "end_col_offset_idx": 3,
+ "text": "13",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 4,
+ "end_row_offset_idx": 5,
+ "start_col_offset_idx": 0,
+ "end_col_offset_idx": 1,
+ "text": "14",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 4,
+ "end_row_offset_idx": 5,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 2,
+ "text": "15",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 4,
+ "end_row_offset_idx": 5,
+ "start_col_offset_idx": 2,
+ "end_col_offset_idx": 3,
+ "text": "16",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 5,
+ "end_row_offset_idx": 6,
+ "start_col_offset_idx": 0,
+ "end_col_offset_idx": 1,
+ "text": "17",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 5,
+ "end_row_offset_idx": 6,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 2,
+ "text": "18",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 5,
+ "end_row_offset_idx": 6,
+ "start_col_offset_idx": 2,
+ "end_col_offset_idx": 3,
+ "text": "19 \n20",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false,
+ "ref": {
+ "$ref": "#/groups/3"
+ }
+ }
+ ],
+ "num_rows": 6,
+ "num_cols": 3,
+ "grid": [
+ [
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 0,
+ "end_row_offset_idx": 1,
+ "start_col_offset_idx": 0,
+ "end_col_offset_idx": 1,
+ "text": "2",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 0,
+ "end_row_offset_idx": 1,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 2,
+ "text": "3",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 0,
+ "end_row_offset_idx": 1,
+ "start_col_offset_idx": 2,
+ "end_col_offset_idx": 3,
+ "text": "4",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ }
+ ],
+ [
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 1,
+ "end_row_offset_idx": 2,
+ "start_col_offset_idx": 0,
+ "end_col_offset_idx": 1,
+ "text": "5",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 1,
+ "end_row_offset_idx": 2,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 2,
+ "text": "6",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 1,
+ "end_row_offset_idx": 2,
+ "start_col_offset_idx": 2,
+ "end_col_offset_idx": 3,
+ "text": "7",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ }
+ ],
+ [
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 2,
+ "end_row_offset_idx": 3,
+ "start_col_offset_idx": 0,
+ "end_col_offset_idx": 1,
+ "text": "8",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 2,
+ "end_row_offset_idx": 3,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 2,
+ "text": "9",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 2,
+ "end_row_offset_idx": 3,
+ "start_col_offset_idx": 2,
+ "end_col_offset_idx": 3,
+ "text": "10",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ }
+ ],
+ [
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 3,
+ "end_row_offset_idx": 4,
+ "start_col_offset_idx": 0,
+ "end_col_offset_idx": 1,
+ "text": "11",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 3,
+ "end_row_offset_idx": 4,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 2,
+ "text": "12",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 3,
+ "end_row_offset_idx": 4,
+ "start_col_offset_idx": 2,
+ "end_col_offset_idx": 3,
+ "text": "13",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ }
+ ],
+ [
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 4,
+ "end_row_offset_idx": 5,
+ "start_col_offset_idx": 0,
+ "end_col_offset_idx": 1,
+ "text": "14",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 4,
+ "end_row_offset_idx": 5,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 2,
+ "text": "15",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 4,
+ "end_row_offset_idx": 5,
+ "start_col_offset_idx": 2,
+ "end_col_offset_idx": 3,
+ "text": "16",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ }
+ ],
+ [
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 5,
+ "end_row_offset_idx": 6,
+ "start_col_offset_idx": 0,
+ "end_col_offset_idx": 1,
+ "text": "17",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 5,
+ "end_row_offset_idx": 6,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 2,
+ "text": "18",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 5,
+ "end_row_offset_idx": 6,
+ "start_col_offset_idx": 2,
+ "end_col_offset_idx": 3,
+ "text": "19 \n20",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false,
+ "fillable": false
+ }
+ ]
+ ]
+ },
+ "annotations": []
+ }
+ ],
+ "key_value_items": [],
+ "form_items": [],
+ "pages": {}
+}
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/html_heading_in_p.html.md b/tests/data/groundtruth/docling_v2/html_heading_in_p.html.md
new file mode 100644
index 00000000..a427ac90
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/html_heading_in_p.html.md
@@ -0,0 +1,15 @@
+# 1
+
+1st paragraph
+
+| **2** | **3** | **4** |
+|---------|---------|---------|
+| 5 | 6 | 7 |
+| 8 | 9 | 10 |
+| 11 | 12 | 13 |
+| 14 | 15 | 16 |
+| 17 | 18 | 19 20 |
+
+# 21
+
+2nd paragraph
\ No newline at end of file
diff --git a/tests/data/html/html_heading_in_p.html b/tests/data/html/html_heading_in_p.html
new file mode 100644
index 00000000..0503604e
--- /dev/null
+++ b/tests/data/html/html_heading_in_p.html
@@ -0,0 +1,99 @@
+
+
+
+
+ 1st paragraph 2 3 4 5 6
+ 7 8 9 10 11 12 13 14 15 16 17 18 19 20
tags
@@ -301,6 +340,81 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
self._walk(content, doc)
return doc
+ @staticmethod
+ def _fix_invalid_paragraph_structure(soup: BeautifulSoup) -> None:
+ """Rewrite 1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 21
+
2nd paragraph
+ + + \ No newline at end of file diff --git a/tests/test_backend_html.py b/tests/test_backend_html.py index 672f13a1..ccf7d1e4 100644 --- a/tests/test_backend_html.py +++ b/tests/test_backend_html.py @@ -3,6 +3,7 @@ from pathlib import Path, PurePath from unittest.mock import Mock, mock_open, patch import pytest +from bs4 import BeautifulSoup from docling_core.types.doc import PictureItem from docling_core.types.doc.document import ContentLayer from pydantic import AnyUrl, ValidationError @@ -523,3 +524,38 @@ def test_is_rich_table_cell(html_paths): assert num_cells == len(gt_cells[idx_t]), ( f"Cell number does not match in table {idx_t}" ) + + +data_fix_par = [ + ( + "Text
Text
More text
", + ), + ( + "Some text
Some text
More text
", + ), + ( + "Some text
Some text
Italics
", + ), + ( + "Some text
Another paragraph
More text", + "Some text
Another paragraph
More text
", + ), + ( + "| Name | Age |
|---|---|
| Alice | 29 |
| Bob | 34 |
| Name | Age |
|---|---|
| Alice | 29 |
| Bob | 34 |