From aebe25cf0027efc35c50de36751820c9a62e30ff Mon Sep 17 00:00:00 2001 From: Matvei Smirnov Date: Wed, 3 Dec 2025 20:52:23 +0300 Subject: [PATCH] fix(html): prevent hierarchy reset in rich table cells (#2716) * fix(html): restore parents after rich cell walking Signed-off-by: Matvei Smirnov * fix(html): add table cell context manager, update tests Signed-off-by: Matvei Smirnov * fix(html): table with heading test data Signed-off-by: Matvei Smirnov --------- Signed-off-by: Matvei Smirnov --- docling/backend/html_backend.py | 21 +- .../docling_v2/example_01.html.json | 4 +- .../docling_v2/example_02.html.json | 2 +- .../docling_v2/example_03.html.json | 2 +- .../docling_v2/example_04.html.json | 2 +- .../docling_v2/example_05.html.json | 2 +- .../docling_v2/example_06.html.json | 2 +- .../docling_v2/example_07.html.json | 2 +- .../docling_v2/example_08.html.json | 2 +- .../docling_v2/formatting.html.json | 2 +- .../docling_v2/html_code_snippets.html.json | 2 +- .../html_rich_table_cells.html.json | 2 +- .../docling_v2/hyperlink_01.html.json | 2 +- .../docling_v2/hyperlink_02.html.json | 2 +- .../docling_v2/hyperlink_03.html.json | 2 +- .../docling_v2/hyperlink_04.html.json | 2 +- .../docling_v2/hyperlink_05.html.json | 2 +- .../groundtruth/docling_v2/table_01.html.json | 2 +- .../groundtruth/docling_v2/table_02.html.json | 2 +- .../groundtruth/docling_v2/table_03.html.json | 2 +- .../groundtruth/docling_v2/table_04.html.json | 2 +- .../groundtruth/docling_v2/table_05.html.json | 2 +- .../groundtruth/docling_v2/table_06.html.json | 2 +- ...l.itxt => table_with_heading_01.html.itxt} | 0 ...l.json => table_with_heading_01.html.json} | 6 +- ....html.md => table_with_heading_01.html.md} | 0 .../table_with_heading_02.html.itxt | 9 + .../table_with_heading_02.html.json | 280 ++++++++++++++++++ .../docling_v2/table_with_heading_02.html.md | 11 + .../docling_v2/unit_test_01.html.json | 2 +- .../docling_v2/wiki_duck.html.json | 2 +- ...eading.html => table_with_heading_01.html} | 0 tests/data/html/table_with_heading_02.html | 28 ++ 33 files changed, 373 insertions(+), 32 deletions(-) rename tests/data/groundtruth/docling_v2/{table_with_heading.html.itxt => table_with_heading_01.html.itxt} (100%) rename tests/data/groundtruth/docling_v2/{table_with_heading.html.json => table_with_heading_01.html.json} (97%) rename tests/data/groundtruth/docling_v2/{table_with_heading.html.md => table_with_heading_01.html.md} (100%) create mode 100644 tests/data/groundtruth/docling_v2/table_with_heading_02.html.itxt create mode 100644 tests/data/groundtruth/docling_v2/table_with_heading_02.html.json create mode 100644 tests/data/groundtruth/docling_v2/table_with_heading_02.html.md rename tests/data/html/{table_with_heading.html => table_with_heading_01.html} (100%) create mode 100644 tests/data/html/table_with_heading_02.html diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 9ad84923..8190e3cd 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -459,10 +459,8 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): rich_table_cell = self._is_rich_table_cell(html_cell) if rich_table_cell: # Parse table cell sub-tree for Rich Cells content: - table_level = self.level - provs_in_cell = self._walk(html_cell, doc) - # After walking sub-tree in cell, restore previously set level - self.level = table_level + with self._use_table_cell_context(): + provs_in_cell = self._walk(html_cell, doc) group_name = f"rich_cell_group_{len(doc.tables)}_{col_idx}_{start_row_span + row_idx}" rich_table_cell, ref_for_rich_cell = ( @@ -829,6 +827,21 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): self.level -= 1 self.content_layer = current_layer + @contextmanager + def _use_table_cell_context(self): + """Preserve the hierarchy level and parents during table cell processing. + + While the context manager is active, the hierarchy level and parents can be modified. + When exiting, the original level and parents are restored. + """ + original_level = self.level + original_parents = self.parents.copy() + try: + yield + finally: + self.level = original_level + self.parents = original_parents + def _handle_heading(self, tag: Tag, doc: DoclingDocument) -> list[RefItem]: added_ref = [] tag_name = tag.name.lower() diff --git a/tests/data/groundtruth/docling_v2/example_01.html.json b/tests/data/groundtruth/docling_v2/example_01.html.json index 212bc0d2..91c05eca 100644 --- a/tests/data/groundtruth/docling_v2/example_01.html.json +++ b/tests/data/groundtruth/docling_v2/example_01.html.json @@ -1,10 +1,10 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "example_01", "origin": { "mimetype": "text/html", - "binary_hash": 13726679883013609282, + "binary_hash": 3245959421868226348, "filename": "example_01.html" }, "furniture": { diff --git a/tests/data/groundtruth/docling_v2/example_02.html.json b/tests/data/groundtruth/docling_v2/example_02.html.json index d37a774a..df3f43e0 100644 --- a/tests/data/groundtruth/docling_v2/example_02.html.json +++ b/tests/data/groundtruth/docling_v2/example_02.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "example_02", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/example_03.html.json b/tests/data/groundtruth/docling_v2/example_03.html.json index a5e98b07..39eca894 100644 --- a/tests/data/groundtruth/docling_v2/example_03.html.json +++ b/tests/data/groundtruth/docling_v2/example_03.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "example_03", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/example_04.html.json b/tests/data/groundtruth/docling_v2/example_04.html.json index bbd44157..bfc1006e 100644 --- a/tests/data/groundtruth/docling_v2/example_04.html.json +++ b/tests/data/groundtruth/docling_v2/example_04.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "example_04", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/example_05.html.json b/tests/data/groundtruth/docling_v2/example_05.html.json index dc83d3fc..bd619c04 100644 --- a/tests/data/groundtruth/docling_v2/example_05.html.json +++ b/tests/data/groundtruth/docling_v2/example_05.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "example_05", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/example_06.html.json b/tests/data/groundtruth/docling_v2/example_06.html.json index 599cfd3d..07845606 100644 --- a/tests/data/groundtruth/docling_v2/example_06.html.json +++ b/tests/data/groundtruth/docling_v2/example_06.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "example_06", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/example_07.html.json b/tests/data/groundtruth/docling_v2/example_07.html.json index bef073ed..eb24860f 100644 --- a/tests/data/groundtruth/docling_v2/example_07.html.json +++ b/tests/data/groundtruth/docling_v2/example_07.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "example_07", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/example_08.html.json b/tests/data/groundtruth/docling_v2/example_08.html.json index cad46653..de009c5f 100644 --- a/tests/data/groundtruth/docling_v2/example_08.html.json +++ b/tests/data/groundtruth/docling_v2/example_08.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "example_08", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/formatting.html.json b/tests/data/groundtruth/docling_v2/formatting.html.json index 5f3317c7..52f3e5b3 100644 --- a/tests/data/groundtruth/docling_v2/formatting.html.json +++ b/tests/data/groundtruth/docling_v2/formatting.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "formatting", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/html_code_snippets.html.json b/tests/data/groundtruth/docling_v2/html_code_snippets.html.json index bc20830c..17d71ffc 100644 --- a/tests/data/groundtruth/docling_v2/html_code_snippets.html.json +++ b/tests/data/groundtruth/docling_v2/html_code_snippets.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "html_code_snippets", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/html_rich_table_cells.html.json b/tests/data/groundtruth/docling_v2/html_rich_table_cells.html.json index 388e5c86..6a4d3fd2 100644 --- a/tests/data/groundtruth/docling_v2/html_rich_table_cells.html.json +++ b/tests/data/groundtruth/docling_v2/html_rich_table_cells.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "html_rich_table_cells", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/hyperlink_01.html.json b/tests/data/groundtruth/docling_v2/hyperlink_01.html.json index 78c55fbc..d9df06e4 100644 --- a/tests/data/groundtruth/docling_v2/hyperlink_01.html.json +++ b/tests/data/groundtruth/docling_v2/hyperlink_01.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "hyperlink_01", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/hyperlink_02.html.json b/tests/data/groundtruth/docling_v2/hyperlink_02.html.json index 89383955..05d9ecdf 100644 --- a/tests/data/groundtruth/docling_v2/hyperlink_02.html.json +++ b/tests/data/groundtruth/docling_v2/hyperlink_02.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "hyperlink_02", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/hyperlink_03.html.json b/tests/data/groundtruth/docling_v2/hyperlink_03.html.json index b5276fc0..edf12e9d 100644 --- a/tests/data/groundtruth/docling_v2/hyperlink_03.html.json +++ b/tests/data/groundtruth/docling_v2/hyperlink_03.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "hyperlink_03", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/hyperlink_04.html.json b/tests/data/groundtruth/docling_v2/hyperlink_04.html.json index 6658e326..00595aa5 100644 --- a/tests/data/groundtruth/docling_v2/hyperlink_04.html.json +++ b/tests/data/groundtruth/docling_v2/hyperlink_04.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "hyperlink_04", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/hyperlink_05.html.json b/tests/data/groundtruth/docling_v2/hyperlink_05.html.json index 34b9becd..c56e68e9 100644 --- a/tests/data/groundtruth/docling_v2/hyperlink_05.html.json +++ b/tests/data/groundtruth/docling_v2/hyperlink_05.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "hyperlink_05", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/table_01.html.json b/tests/data/groundtruth/docling_v2/table_01.html.json index 53506206..f03208af 100644 --- a/tests/data/groundtruth/docling_v2/table_01.html.json +++ b/tests/data/groundtruth/docling_v2/table_01.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "table_01", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/table_02.html.json b/tests/data/groundtruth/docling_v2/table_02.html.json index 3d243602..0a4ea2d6 100644 --- a/tests/data/groundtruth/docling_v2/table_02.html.json +++ b/tests/data/groundtruth/docling_v2/table_02.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "table_02", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/table_03.html.json b/tests/data/groundtruth/docling_v2/table_03.html.json index 50b500cf..5b31177e 100644 --- a/tests/data/groundtruth/docling_v2/table_03.html.json +++ b/tests/data/groundtruth/docling_v2/table_03.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "table_03", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/table_04.html.json b/tests/data/groundtruth/docling_v2/table_04.html.json index d9402988..466d53f4 100644 --- a/tests/data/groundtruth/docling_v2/table_04.html.json +++ b/tests/data/groundtruth/docling_v2/table_04.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "table_04", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/table_05.html.json b/tests/data/groundtruth/docling_v2/table_05.html.json index e190729d..effdd5a3 100644 --- a/tests/data/groundtruth/docling_v2/table_05.html.json +++ b/tests/data/groundtruth/docling_v2/table_05.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "table_05", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/table_06.html.json b/tests/data/groundtruth/docling_v2/table_06.html.json index 0f9723ef..5e6b20a8 100644 --- a/tests/data/groundtruth/docling_v2/table_06.html.json +++ b/tests/data/groundtruth/docling_v2/table_06.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "table_06", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/table_with_heading.html.itxt b/tests/data/groundtruth/docling_v2/table_with_heading_01.html.itxt similarity index 100% rename from tests/data/groundtruth/docling_v2/table_with_heading.html.itxt rename to tests/data/groundtruth/docling_v2/table_with_heading_01.html.itxt diff --git a/tests/data/groundtruth/docling_v2/table_with_heading.html.json b/tests/data/groundtruth/docling_v2/table_with_heading_01.html.json similarity index 97% rename from tests/data/groundtruth/docling_v2/table_with_heading.html.json rename to tests/data/groundtruth/docling_v2/table_with_heading_01.html.json index d5a1b94a..76766dd2 100644 --- a/tests/data/groundtruth/docling_v2/table_with_heading.html.json +++ b/tests/data/groundtruth/docling_v2/table_with_heading_01.html.json @@ -1,11 +1,11 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", - "name": "table_with_heading", + "version": "1.8.0", + "name": "table_with_heading_01", "origin": { "mimetype": "text/html", "binary_hash": 5578561753677933781, - "filename": "table_with_heading.html" + "filename": "table_with_heading_01.html" }, "furniture": { "self_ref": "#/furniture", diff --git a/tests/data/groundtruth/docling_v2/table_with_heading.html.md b/tests/data/groundtruth/docling_v2/table_with_heading_01.html.md similarity index 100% rename from tests/data/groundtruth/docling_v2/table_with_heading.html.md rename to tests/data/groundtruth/docling_v2/table_with_heading_01.html.md diff --git a/tests/data/groundtruth/docling_v2/table_with_heading_02.html.itxt b/tests/data/groundtruth/docling_v2/table_with_heading_02.html.itxt new file mode 100644 index 00000000..1a13cd9d --- /dev/null +++ b/tests/data/groundtruth/docling_v2/table_with_heading_02.html.itxt @@ -0,0 +1,9 @@ +item-0 at level 0: unspecified: group _root_ + item-1 at level 1: title: Main Title + item-2 at level 2: text: Before the table + item-3 at level 2: table with [2x2] + item-4 at level 3: unspecified: group rich_cell_group_1_0_0 + item-5 at level 4: section_header: A + item-6 at level 4: text: text + item-7 at level 2: section_header: Section After + item-8 at level 3: text: After the table \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/table_with_heading_02.html.json b/tests/data/groundtruth/docling_v2/table_with_heading_02.html.json new file mode 100644 index 00000000..81a4ae5d --- /dev/null +++ b/tests/data/groundtruth/docling_v2/table_with_heading_02.html.json @@ -0,0 +1,280 @@ +{ + "schema_name": "DoclingDocument", + "version": "1.8.0", + "name": "table_with_heading_02", + "origin": { + "mimetype": "text/html", + "binary_hash": 5824324295334010827, + "filename": "table_with_heading_02.html" + }, + "furniture": { + "self_ref": "#/furniture", + "children": [], + "content_layer": "furniture", + "name": "_root_", + "label": "unspecified" + }, + "body": { + "self_ref": "#/body", + "children": [ + { + "$ref": "#/texts/0" + } + ], + "content_layer": "body", + "name": "_root_", + "label": "unspecified" + }, + "groups": [ + { + "self_ref": "#/groups/0", + "parent": { + "$ref": "#/tables/0" + }, + "children": [ + { + "$ref": "#/texts/2" + }, + { + "$ref": "#/texts/3" + } + ], + "content_layer": "body", + "name": "rich_cell_group_1_0_0", + "label": "unspecified" + } + ], + "texts": [ + { + "self_ref": "#/texts/0", + "parent": { + "$ref": "#/body" + }, + "children": [ + { + "$ref": "#/texts/1" + }, + { + "$ref": "#/tables/0" + }, + { + "$ref": "#/texts/4" + } + ], + "content_layer": "body", + "label": "title", + "prov": [], + "orig": "Main Title", + "text": "Main Title" + }, + { + "self_ref": "#/texts/1", + "parent": { + "$ref": "#/texts/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "Before the table", + "text": "Before the table" + }, + { + "self_ref": "#/texts/2", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "section_header", + "prov": [], + "orig": "A", + "text": "A", + "level": 1 + }, + { + "self_ref": "#/texts/3", + "parent": { + "$ref": "#/groups/0" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "text", + "text": "text" + }, + { + "self_ref": "#/texts/4", + "parent": { + "$ref": "#/texts/0" + }, + "children": [ + { + "$ref": "#/texts/5" + } + ], + "content_layer": "body", + "label": "section_header", + "prov": [], + "orig": "Section After", + "text": "Section After", + "level": 1 + }, + { + "self_ref": "#/texts/5", + "parent": { + "$ref": "#/texts/4" + }, + "children": [], + "content_layer": "body", + "label": "text", + "prov": [], + "orig": "After the table", + "text": "After the table" + } + ], + "pictures": [], + "tables": [ + { + "self_ref": "#/tables/0", + "parent": { + "$ref": "#/texts/0" + }, + "children": [ + { + "$ref": "#/groups/0" + } + ], + "content_layer": "body", + "label": "table", + "prov": [], + "captions": [], + "references": [], + "footnotes": [], + "data": { + "table_cells": [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "A\ntext", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false, + "ref": { + "$ref": "#/groups/0" + } + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "B", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "1...", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2...", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + "num_rows": 2, + "num_cols": 2, + "grid": [ + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "A\ntext", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 0, + "end_row_offset_idx": 1, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "B", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ], + [ + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 0, + "end_col_offset_idx": 1, + "text": "1...", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + }, + { + "row_span": 1, + "col_span": 1, + "start_row_offset_idx": 1, + "end_row_offset_idx": 2, + "start_col_offset_idx": 1, + "end_col_offset_idx": 2, + "text": "2...", + "column_header": false, + "row_header": false, + "row_section": false, + "fillable": false + } + ] + ] + }, + "annotations": [] + } + ], + "key_value_items": [], + "form_items": [], + "pages": {} +} \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/table_with_heading_02.html.md b/tests/data/groundtruth/docling_v2/table_with_heading_02.html.md new file mode 100644 index 00000000..dca4d978 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/table_with_heading_02.html.md @@ -0,0 +1,11 @@ +# Main Title + +Before the table + +| ## A text | B | +|--------------|------| +| 1... | 2... | + +## Section After + +After the table \ No newline at end of file diff --git a/tests/data/groundtruth/docling_v2/unit_test_01.html.json b/tests/data/groundtruth/docling_v2/unit_test_01.html.json index 60308a12..75abfc80 100644 --- a/tests/data/groundtruth/docling_v2/unit_test_01.html.json +++ b/tests/data/groundtruth/docling_v2/unit_test_01.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "unit_test_01", "origin": { "mimetype": "text/html", diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.json b/tests/data/groundtruth/docling_v2/wiki_duck.html.json index 1899a9bb..4f2f83eb 100644 --- a/tests/data/groundtruth/docling_v2/wiki_duck.html.json +++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.json @@ -1,6 +1,6 @@ { "schema_name": "DoclingDocument", - "version": "1.7.0", + "version": "1.8.0", "name": "wiki_duck", "origin": { "mimetype": "text/html", diff --git a/tests/data/html/table_with_heading.html b/tests/data/html/table_with_heading_01.html similarity index 100% rename from tests/data/html/table_with_heading.html rename to tests/data/html/table_with_heading_01.html diff --git a/tests/data/html/table_with_heading_02.html b/tests/data/html/table_with_heading_02.html new file mode 100644 index 00000000..bb254931 --- /dev/null +++ b/tests/data/html/table_with_heading_02.html @@ -0,0 +1,28 @@ + + + + + +

Main Title

+

Before the table

+ + + + + + + + + +
+

A

+

text

+
B
1...2...
+

Section After

+ After the table + + \ No newline at end of file