diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 8190e3cd..f2254351 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -67,6 +67,43 @@ _BLOCK_TAGS: Final = { "ul", } +# Block-level elements that should not appear inside <p>

+_PARA_BREAKERS = { + "address", + "article", + "aside", + "blockquote", + "div", + "dl", + "fieldset", + "figcaption", + "figure", + "footer", + "form", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "header", + "hr", + "main", + "nav", + "ol", + "ul", + "li", + "p", #

<p> inside <p>

also forces closing + "pre", + "section", + "table", + "thead", + "tbody", + "tfoot", + "tr", + "td", +} + _CODE_TAG_SET: Final = {"code", "kbd", "samp"} _FORMAT_TAG_MAP: Final = { @@ -199,7 +236,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): options: HTMLBackendOptions = HTMLBackendOptions(), ): super().__init__(in_doc, path_or_stream, options) - self.soup: Optional[Tag] = None + self.soup: Optional[BeautifulSoup] = None self.path_or_stream: Union[BytesIO, Path] = path_or_stream self.base_path: Optional[str] = str(options.source_uri) @@ -276,6 +313,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): # remove any hidden tag for tag in self.soup(hidden=True): tag.decompose() + # fix flow content that is not permitted inside

<p> + HTMLDocumentBackend._fix_invalid_paragraph_structure(self.soup) content = self.soup.body or self.soup # normalize <br>
tags @@ -301,6 +340,81 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self._walk(content, doc) return doc + @staticmethod + def _fix_invalid_paragraph_structure(soup: BeautifulSoup) -> None: + """Rewrite

<p> elements that contain block-level breakers. + + This function emulates browser logic when other block-level elements + are found inside a <p>

element. + When a <p>

is open and a block-level breaker (e.g., h1-h6, div, table) + appears, automatically close the <p>

, emit it, then emit the breaker, + and if needed open a new <p>

for trailing text. + + Args: + soup: The HTML document. The DOM may be rewritten. + """ + + def _start_para(): + nonlocal current_p + if current_p is None: + current_p = soup.new_tag("p") + new_nodes.append(current_p) + + def _flush_para_if_empty(): + nonlocal current_p + if current_p is not None and not current_p.get_text(strip=True): + # remove empty paragraph placeholder + if current_p in new_nodes: + new_nodes.remove(current_p) + current_p = None + + paragraphs = soup.select(f"p:has({','.join(tag for tag in _PARA_BREAKERS)})") + + for p in paragraphs: + parent = p.parent + if parent is None: + continue + + new_nodes = [] + current_p = None + + for node in list(p.contents): + if isinstance(node, NavigableString): + text = str(node) + node.extract() + if text.strip(): + _start_para() + if current_p is not None: + current_p.append(NavigableString(text)) + # skip whitespace-only text + continue + + if isinstance(node, Tag): + node.extract() + + if node.name in _PARA_BREAKERS: + _flush_para_if_empty() + new_nodes.append(node) + continue + else: + _start_para() + if current_p is not None: + current_p.append(node) + continue + + _flush_para_if_empty() + + siblings = list(parent.children) + try: + idx = siblings.index(p) + except ValueError: + # p might have been removed + continue + + p.extract() + for n in reversed(new_nodes): + parent.insert(idx, n) + @staticmethod def _is_remote_url(value: str) -> bool: parsed = urlparse(value) @@ -528,15 +642,15 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): added_refs: list[RefItem] = [] buffer: AnnotatedTextList = AnnotatedTextList() - def flush_buffer(): + def _flush_buffer() -> None: if not buffer: - return added_refs + return annotated_text_list: AnnotatedTextList = buffer.simplify_text_elements() parts = annotated_text_list.split_by_newline() buffer.clear() if not "".join([el.text for el in annotated_text_list]): - return added_refs + return for annotated_text_list in parts: with 
self._use_inline_group(annotated_text_list, doc): @@ -569,12 +683,12 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): if isinstance(node, Tag): name = node.name.lower() if name == "img": - flush_buffer() + _flush_buffer() im_ref3 = self._emit_image(node, doc) if im_ref3: added_refs.append(im_ref3) elif name in _FORMAT_TAG_MAP: - flush_buffer() + _flush_buffer() with self._use_format([name]): wk = self._walk(node, doc) added_refs.extend(wk) @@ -583,11 +697,11 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): wk2 = self._walk(node, doc) added_refs.extend(wk2) elif name in _BLOCK_TAGS: - flush_buffer() + _flush_buffer() blk = self._handle_block(node, doc) added_refs.extend(blk) elif node.find(_BLOCK_TAGS): - flush_buffer() + _flush_buffer() wk3 = self._walk(node, doc) added_refs.extend(wk3) else: @@ -600,7 +714,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): node, PreformattedString ): if str(node).strip("\n\r") == "": - flush_buffer() + _flush_buffer() else: buffer.extend( self._extract_text_and_hyperlink_recursively( @@ -608,7 +722,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): ) ) - flush_buffer() + _flush_buffer() return added_refs @staticmethod diff --git a/tests/data/groundtruth/docling_v2/html_heading_in_p.html.itxt b/tests/data/groundtruth/docling_v2/html_heading_in_p.html.itxt new file mode 100644 index 00000000..6842fdbe --- /dev/null +++ b/tests/data/groundtruth/docling_v2/html_heading_in_p.html.itxt @@ -0,0 +1,15 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: title: 1 + item-2 at level 2: text: 1st paragraph + item-3 at level 2: table with [6x3] + item-4 at level 3: unspecified: group rich_cell_group_1_0_0 + item-5 at level 4: text: 2 + item-6 at level 3: unspecified: group rich_cell_group_1_0_0 + item-7 at level 4: text: 3 + item-8 at level 3: unspecified: group rich_cell_group_1_1_0 + item-9 at level 4: text: 4 + item-10 at level 3: unspecified: group rich_cell_group_1_1_5 + item-11 at 
level 4: text: 19 + item-12 at level 4: text: 20 + item-13 at level 1: title: 21 + item-14 at level 2: text: 2nd paragraph \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/html_heading_in_p.html.json b/tests/data/groundtruth/docling_v2/html_heading_in_p.html.json new file mode 100644 index 00000000..014f7478 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/html_heading_in_p.html.json @@ -0,0 +1,782 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.8.0", + "name": "html_heading_in_p", + "origin": { + "mimetype": "text/html", + "binary_hash": 6321020421104590329, + "filename": "html_heading_in_p.html" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + }, + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/texts/8" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/tables/0" + }, + "children": [ + { + "$ref": "#/texts/3" + } + ], + "content_layer": "body", + "name": "rich_cell_group_1_0_0", + "label": "unspecified" + }, + { + "self_ref": "#/groups/1", + "parent": { + "$ref": "#/tables/0" + }, + "children": [ + { + "$ref": "#/texts/4" + } + ], + "content_layer": "body", + "name": "rich_cell_group_1_0_0", + "label": "unspecified" + }, + { + "self_ref": "#/groups/2", + "parent": { + "$ref": "#/tables/0" + }, + "children": [ + { + "$ref": "#/texts/5" + } + ], + "content_layer": "body", + "name": "rich_cell_group_1_1_0", + "label": "unspecified" + }, + { + "self_ref": "#/groups/3", + "parent": { + "$ref": "#/tables/0" + }, + "children": [ + { + "$ref": "#/texts/6" + }, + { + "$ref": "#/texts/7" + } + ], + "content_layer": "body", + "name": "rich_cell_group_1_1_5", + "label": "unspecified" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + 
"$ref": "#/body" + }, + "children": [], + "content_layer": "furniture", + "label": "title", + "prov": [], + "orig": "Headings inside paragraphs in HTML", + "text": "Headings inside paragraphs in HTML" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/tables/0" + } + ], + "content_layer": "body", + "label": "title", + "prov": [], + "orig": "1", + "text": "1" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/texts/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "1st paragraph", + "text": "1st paragraph" + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "2", + "text": "2", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/groups/1" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "3", + "text": "3", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/groups/2" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "4", + "text": "4", + "formatting": { + "bold": true, + "italic": false, + "underline": false, + "strikethrough": false, + "script": "baseline" + } + }, + { + "self_ref": "#/texts/6", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "19", + "text": "19" + }, + { + "self_ref": "#/texts/7", + "parent": { + "$ref": "#/groups/3" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "20", + "text": "20" + }, + { + "self_ref": 
"#/texts/8", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/9" + } + ], + "content_layer": "body", + "label": "title", + "prov": [], + "orig": "21", + "text": "21" + }, + { + "self_ref": "#/texts/9", + "parent": { + "$ref": "#/texts/8" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "2nd paragraph", + "text": "2nd paragraph" + } + ], + "pictures": [], + "tables": [ + { + "self_ref": "#/tables/0", + "parent": { + "$ref": "#/texts/1" + }, + "children": [ + { + "$ref": "#/groups/0" + }, + { + "$ref": "#/groups/1" + }, + { + "$ref": "#/groups/2" + }, + { + "$ref": "#/groups/3" + } + ], + "content_layer": "body", + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false, + "ref": { + "$ref": "#/groups/0" + } + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false, + "ref": { + "$ref": "#/groups/1" + } + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "4", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false, + "ref": { + "$ref": "#/groups/2" + } + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "5", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 
1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "7", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "8", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "9", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "10", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "11", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "12", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "13", + 
"column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "14", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "15", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "16", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "17", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "18", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "19 \n20", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false, + "ref": { + "$ref": "#/groups/3" + } + } + ], + "num_rows": 6, + "num_cols": 3, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "2", + "column_header": false, + "row_header": false, + 
"row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "3", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "4", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "5", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "6", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "7", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "8", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "9", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 2, + "end_row_offset_idx": 3, + 
"start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "10", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "11", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "12", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 3, + "end_row_offset_idx": 4, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "13", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "14", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "15", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 4, + "end_row_offset_idx": 5, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "16", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "17", + "column_header": false, + "row_header": false, + 
"row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "18", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 5, + "end_row_offset_idx": 6, + "start_col_offset_idx": 2, + "end_col_offset_idx": 3, + "text": "19 \n20", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ] + ] + }, + "annotations": [] + } + ], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/html_heading_in_p.html.md b/tests/data/groundtruth/docling_v2/html_heading_in_p.html.md new file mode 100644 index 00000000..a427ac90 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/html_heading_in_p.html.md @@ -0,0 +1,15 @@ +# 1 + +1st paragraph + +| **2** | **3** | **4** | +|---------|---------|---------| +| 5 | 6 | 7 | +| 8 | 9 | 10 | +| 11 | 12 | 13 | +| 14 | 15 | 16 | +| 17 | 18 | 19 20 | + +# 21 + +2nd paragraph \ No newline at end of file diff --git a/tests/data/html/html_heading_in_p.html b/tests/data/html/html_heading_in_p.html new file mode 100644 index 00000000..0503604e --- /dev/null +++ b/tests/data/html/html_heading_in_p.html @@ -0,0 +1,99 @@ + + + + + + Headings inside paragraphs in HTML + + + +

+

1

+
+

1st paragraph

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+

2

+
+

3

+
+

4

+
+

5

+
+

6 +

+
+

7

+
+

8

+
+

9

+
+

10

+
+

11

+
+

12

+
+

13

+
+

14

+
+

15

+
+

16

+
+

17

+
+

18

+
+

19

+

20

+
+
+

21

+
+

+

2nd paragraph

+ + + \ No newline at end of file diff --git a/tests/test_backend_html.py b/tests/test_backend_html.py index 672f13a1..ccf7d1e4 100644 --- a/tests/test_backend_html.py +++ b/tests/test_backend_html.py @@ -3,6 +3,7 @@ from pathlib import Path, PurePath from unittest.mock import Mock, mock_open, patch import pytest +from bs4 import BeautifulSoup from docling_core.types.doc import PictureItem from docling_core.types.doc.document import ContentLayer from pydantic import AnyUrl, ValidationError @@ -523,3 +524,38 @@ def test_is_rich_table_cell(html_paths): assert num_cells == len(gt_cells[idx_t]), ( f"Cell number does not match in table {idx_t}" ) + + +data_fix_par = [ + ( + "

Text

Heading

More text

", + "

Text

Heading

More text

", + ), + ( + "

Some text

A heading

More text

", + "

Some text

A heading

More text

", + ), + ( + "

Some text

A heading

Italics

", + "

Some text

A heading

Italics

", + ), + ( + "

Some text

Another paragraph

More text

", + "

Some text

Another paragraph

More text

", + ), + ( + "

" + "
NameAge
Alice29
Bob34

", + "" + "
NameAge
Alice29
Bob34
", + ), +] + + +@pytest.mark.parametrize("html,expected", data_fix_par) +def test_fix_invalid_paragraph_structure(html, expected): + """Test the function _fix_invalid_paragraph_structure.""" + + soup = BeautifulSoup(html, "html.parser") + HTMLDocumentBackend._fix_invalid_paragraph_structure(soup) + assert str(soup) == expected