From 53e68d3dc65ce9208021bec36a219b7cd20ee1cd Mon Sep 17 00:00:00 2001 From: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Date: Wed, 23 Jul 2025 17:18:38 +0200 Subject: [PATCH] fix(HTML): ensure correct concatenation of child strings in table cells and list items Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> --- docling/backend/html_backend.py | 38 ++++- .../docling_v2/pntd.0008301.nxml.json | 4 +- .../docling_v2/pone.0234687.nxml.json | 144 +++++++++--------- .../docling_v2/wiki_duck.html.json | 102 ++++++------- .../groundtruth/docling_v2/wiki_duck.html.md | 8 +- 5 files changed, 162 insertions(+), 134 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 1d3f5712..dffc095b 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -5,7 +5,7 @@ from io import BytesIO from pathlib import Path from typing import Final, Optional, Union, cast -from bs4 import BeautifulSoup, NavigableString, Tag +from bs4 import BeautifulSoup, NavigableString, PageElement, Tag from bs4.element import PreformattedString from docling_core.types.doc import ( DocItem, @@ -297,7 +297,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): ): parts.append(child) elif isinstance(child, Tag) and child.name not in ("ul", "ol"): - text_part = child.get_text() + text_part = HTMLDocumentBackend.get_text(child) if text_part: parts.append(text_part) li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip() @@ -417,6 +417,36 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): content_layer=self.content_layer, ) + @staticmethod + def get_text(item: PageElement) -> str: + """Concatenate all child strings of a PageElement. + + This method is equivalent to `PageElement.get_text()` but also considers + certain tags. When called on a <p> or <li> tags, it returns the text with a + trailing space, otherwise the text is concatenated without separators. + """ + + def _extract_text_recursively(item: PageElement) -> list[str]: + """Recursively extract text from all child nodes.""" + result: list[str] = [] + + if isinstance(item, NavigableString): + result = [item] + elif isinstance(item, Tag): + tag = cast(Tag, item) + parts: list[str] = [] + for child in tag: + parts.extend(_extract_text_recursively(child)) + result.append( + "".join(parts) + " " if tag.name in {"p", "li"} else "".join(parts) + ) + + return result + + parts: list[str] = _extract_text_recursively(item) + + return "".join(parts) + @staticmethod def _get_cell_spans(cell: Tag) -> tuple[int, int]: """Extract colspan and rowspan values from a table cell tag. @@ -510,9 +540,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend): formula.replace_with(NavigableString(math_formula)) # TODO: extract content correctly from table-cells with lists - text = html_cell.text - - # label = html_cell.name + text = HTMLDocumentBackend.get_text(html_cell).strip() col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell) if row_header: row_span -= 1 diff --git a/tests/data/groundtruth/docling_v2/pntd.0008301.nxml.json b/tests/data/groundtruth/docling_v2/pntd.0008301.nxml.json index 1fae7c12..7d1e82b4 100644 --- a/tests/data/groundtruth/docling_v2/pntd.0008301.nxml.json +++ b/tests/data/groundtruth/docling_v2/pntd.0008301.nxml.json @@ -5839,7 +5839,7 @@ "end_row_offset_idx": 4, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": " Number of total districts", + "text": "Number of total districts", "column_header": false, "row_header": false, "row_section": false @@ -6642,7 +6642,7 @@ "end_row_offset_idx": 4, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": " Number of total districts", + "text": "Number of total districts", "column_header": false, "row_header": false, "row_section": false diff --git 
a/tests/data/groundtruth/docling_v2/pone.0234687.nxml.json b/tests/data/groundtruth/docling_v2/pone.0234687.nxml.json index 7d4f7ef8..b504fd9c 100644 --- a/tests/data/groundtruth/docling_v2/pone.0234687.nxml.json +++ b/tests/data/groundtruth/docling_v2/pone.0234687.nxml.json @@ -4166,7 +4166,7 @@ "end_row_offset_idx": 6, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Ground corn", + "text": "Ground corn", "column_header": false, "row_header": false, "row_section": false @@ -4298,7 +4298,7 @@ "end_row_offset_idx": 7, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Soybean meal", + "text": "Soybean meal", "column_header": false, "row_header": false, "row_section": false @@ -4430,7 +4430,7 @@ "end_row_offset_idx": 8, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Corn silage", + "text": "Corn silage", "column_header": false, "row_header": false, "row_section": false @@ -4562,7 +4562,7 @@ "end_row_offset_idx": 9, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Ann temperate pasture", + "text": "Ann temperate pasture", "column_header": false, "row_header": false, "row_section": false @@ -4694,7 +4694,7 @@ "end_row_offset_idx": 10, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Ann tropical pasture", + "text": "Ann tropical pasture", "column_header": false, "row_header": false, "row_section": false @@ -4826,7 +4826,7 @@ "end_row_offset_idx": 11, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Perenn tropical pasture", + "text": "Perenn tropical pasture", "column_header": false, "row_header": false, "row_section": false @@ -4970,7 +4970,7 @@ "end_row_offset_idx": 13, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Organic matter", + "text": "Organic matter", "column_header": false, "row_header": false, "row_section": false @@ -5102,7 +5102,7 @@ "end_row_offset_idx": 14, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    
Crude protein", + "text": "Crude protein", "column_header": false, "row_header": false, "row_section": false @@ -5234,7 +5234,7 @@ "end_row_offset_idx": 15, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Neutral detergent fibre", + "text": "Neutral detergent fibre", "column_header": false, "row_header": false, "row_section": false @@ -5366,7 +5366,7 @@ "end_row_offset_idx": 16, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Acid detergent fibre", + "text": "Acid detergent fibre", "column_header": false, "row_header": false, "row_section": false @@ -5498,7 +5498,7 @@ "end_row_offset_idx": 17, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Ether extract", + "text": "Ether extract", "column_header": false, "row_header": false, "row_section": false @@ -5642,7 +5642,7 @@ "end_row_offset_idx": 19, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    OM digestibility, %", + "text": "OM digestibility, %", "column_header": false, "row_header": false, "row_section": false @@ -5774,7 +5774,7 @@ "end_row_offset_idx": 20, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    NEL, Mcal (kg DM)-1", + "text": "NEL, Mcal (kg DM)-1", "column_header": false, "row_header": false, "row_section": false @@ -5906,7 +5906,7 @@ "end_row_offset_idx": 21, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    MP, g (kg DM)-1", + "text": "MP, g (kg DM)-1", "column_header": false, "row_header": false, "row_section": false @@ -6713,7 +6713,7 @@ "end_row_offset_idx": 6, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Ground corn", + "text": "Ground corn", "column_header": false, "row_header": false, "row_section": false @@ -6847,7 +6847,7 @@ "end_row_offset_idx": 7, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Soybean meal", + "text": "Soybean meal", "column_header": false, "row_header": false, "row_section": false @@ -6981,7 +6981,7 @@ "end_row_offset_idx": 
8, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Corn silage", + "text": "Corn silage", "column_header": false, "row_header": false, "row_section": false @@ -7115,7 +7115,7 @@ "end_row_offset_idx": 9, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Ann temperate pasture", + "text": "Ann temperate pasture", "column_header": false, "row_header": false, "row_section": false @@ -7249,7 +7249,7 @@ "end_row_offset_idx": 10, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Ann tropical pasture", + "text": "Ann tropical pasture", "column_header": false, "row_header": false, "row_section": false @@ -7383,7 +7383,7 @@ "end_row_offset_idx": 11, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Perenn tropical pasture", + "text": "Perenn tropical pasture", "column_header": false, "row_header": false, "row_section": false @@ -7651,7 +7651,7 @@ "end_row_offset_idx": 13, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Organic matter", + "text": "Organic matter", "column_header": false, "row_header": false, "row_section": false @@ -7785,7 +7785,7 @@ "end_row_offset_idx": 14, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Crude protein", + "text": "Crude protein", "column_header": false, "row_header": false, "row_section": false @@ -7919,7 +7919,7 @@ "end_row_offset_idx": 15, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Neutral detergent fibre", + "text": "Neutral detergent fibre", "column_header": false, "row_header": false, "row_section": false @@ -8053,7 +8053,7 @@ "end_row_offset_idx": 16, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Acid detergent fibre", + "text": "Acid detergent fibre", "column_header": false, "row_header": false, "row_section": false @@ -8187,7 +8187,7 @@ "end_row_offset_idx": 17, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Ether extract", + "text": "Ether extract", 
"column_header": false, "row_header": false, "row_section": false @@ -8455,7 +8455,7 @@ "end_row_offset_idx": 19, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    OM digestibility, %", + "text": "OM digestibility, %", "column_header": false, "row_header": false, "row_section": false @@ -8589,7 +8589,7 @@ "end_row_offset_idx": 20, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    NEL, Mcal (kg DM)-1", + "text": "NEL, Mcal (kg DM)-1", "column_header": false, "row_header": false, "row_section": false @@ -8723,7 +8723,7 @@ "end_row_offset_idx": 21, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    MP, g (kg DM)-1", + "text": "MP, g (kg DM)-1", "column_header": false, "row_header": false, "row_section": false @@ -8998,7 +8998,7 @@ "end_row_offset_idx": 3, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Corn grain", + "text": "Corn grain", "column_header": false, "row_header": false, "row_section": false @@ -9058,7 +9058,7 @@ "end_row_offset_idx": 4, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Soybean", + "text": "Soybean", "column_header": false, "row_header": false, "row_section": false @@ -9178,7 +9178,7 @@ "end_row_offset_idx": 6, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Corn silageb", + "text": "Corn silageb", "column_header": false, "row_header": false, "row_section": false @@ -9238,7 +9238,7 @@ "end_row_offset_idx": 7, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Annual ryegrassc", + "text": "Annual ryegrassc", "column_header": false, "row_header": false, "row_section": false @@ -9298,7 +9298,7 @@ "end_row_offset_idx": 8, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Pearl milletd", + "text": "Pearl milletd", "column_header": false, "row_header": false, "row_section": false @@ -9358,7 +9358,7 @@ "end_row_offset_idx": 9, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Kikuyu grasse", + 
"text": "Kikuyu grasse", "column_header": false, "row_header": false, "row_section": false @@ -9547,7 +9547,7 @@ "end_row_offset_idx": 3, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Corn grain", + "text": "Corn grain", "column_header": false, "row_header": false, "row_section": false @@ -9609,7 +9609,7 @@ "end_row_offset_idx": 4, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Soybean", + "text": "Soybean", "column_header": false, "row_header": false, "row_section": false @@ -9733,7 +9733,7 @@ "end_row_offset_idx": 6, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Corn silageb", + "text": "Corn silageb", "column_header": false, "row_header": false, "row_section": false @@ -9795,7 +9795,7 @@ "end_row_offset_idx": 7, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Annual ryegrassc", + "text": "Annual ryegrassc", "column_header": false, "row_header": false, "row_section": false @@ -9857,7 +9857,7 @@ "end_row_offset_idx": 8, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Pearl milletd", + "text": "Pearl milletd", "column_header": false, "row_header": false, "row_section": false @@ -9919,7 +9919,7 @@ "end_row_offset_idx": 9, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Kikuyu grasse", + "text": "Kikuyu grasse", "column_header": false, "row_header": false, "row_section": false @@ -10182,7 +10182,7 @@ "end_row_offset_idx": 4, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    N organic fertilizer, kg ha-1a", + "text": "N organic fertilizer, kg ha-1a", "column_header": false, "row_header": false, "row_section": false @@ -10242,7 +10242,7 @@ "end_row_offset_idx": 5, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    N synthetic fertilizer", + "text": "N synthetic fertilizer", "column_header": false, "row_header": false, "row_section": false @@ -10302,7 +10302,7 @@ "end_row_offset_idx": 6, "start_col_offset_idx": 0, 
"end_col_offset_idx": 1, - "text": "    N from residual DM, kg ha-1b", + "text": "N from residual DM, kg ha-1b", "column_header": false, "row_header": false, "row_section": false @@ -10362,7 +10362,7 @@ "end_row_offset_idx": 7, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Emission fator, kg N2O-N (kg N)-1c", + "text": "Emission fator, kg N2O-N (kg N)-1c", "column_header": false, "row_header": false, "row_section": false @@ -10422,7 +10422,7 @@ "end_row_offset_idx": 8, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg N2O ha-1 from direct emissions", + "text": "kg N2O ha-1 from direct emissions", "column_header": false, "row_header": false, "row_section": false @@ -10542,7 +10542,7 @@ "end_row_offset_idx": 10, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg NH3-N+NOx-N (kg organic N)-1b", + "text": "kg NH3-N+NOx-N (kg organic N)-1b", "column_header": false, "row_header": false, "row_section": false @@ -10602,7 +10602,7 @@ "end_row_offset_idx": 11, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg NH3-N+NOx-N (kg synthetic N)-1b", + "text": "kg NH3-N+NOx-N (kg synthetic N)-1b", "column_header": false, "row_header": false, "row_section": false @@ -10662,7 +10662,7 @@ "end_row_offset_idx": 12, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg N2O-N (kg NH3-N+NOx-N)-1b", + "text": "kg N2O-N (kg NH3-N+NOx-N)-1b", "column_header": false, "row_header": false, "row_section": false @@ -10722,7 +10722,7 @@ "end_row_offset_idx": 13, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg N2O ha-1 from NH3+NOx volatilized", + "text": "kg N2O ha-1 from NH3+NOx volatilized", "column_header": false, "row_header": false, "row_section": false @@ -10842,7 +10842,7 @@ "end_row_offset_idx": 15, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg N losses by leaching (kg N)-1b", + "text": "kg N losses by leaching (kg N)-1b", "column_header": false, 
"row_header": false, "row_section": false @@ -10902,7 +10902,7 @@ "end_row_offset_idx": 16, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg N2O-N (kg N leaching)-1", + "text": "kg N2O-N (kg N leaching)-1", "column_header": false, "row_header": false, "row_section": false @@ -10962,7 +10962,7 @@ "end_row_offset_idx": 17, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg N2O ha-1 from N losses by leaching", + "text": "kg N2O ha-1 from N losses by leaching", "column_header": false, "row_header": false, "row_section": false @@ -11873,7 +11873,7 @@ "end_row_offset_idx": 4, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    N organic fertilizer, kg ha-1a", + "text": "N organic fertilizer, kg ha-1a", "column_header": false, "row_header": false, "row_section": false @@ -11935,7 +11935,7 @@ "end_row_offset_idx": 5, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    N synthetic fertilizer", + "text": "N synthetic fertilizer", "column_header": false, "row_header": false, "row_section": false @@ -11997,7 +11997,7 @@ "end_row_offset_idx": 6, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    N from residual DM, kg ha-1b", + "text": "N from residual DM, kg ha-1b", "column_header": false, "row_header": false, "row_section": false @@ -12059,7 +12059,7 @@ "end_row_offset_idx": 7, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Emission fator, kg N2O-N (kg N)-1c", + "text": "Emission fator, kg N2O-N (kg N)-1c", "column_header": false, "row_header": false, "row_section": false @@ -12121,7 +12121,7 @@ "end_row_offset_idx": 8, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg N2O ha-1 from direct emissions", + "text": "kg N2O ha-1 from direct emissions", "column_header": false, "row_header": false, "row_section": false @@ -12245,7 +12245,7 @@ "end_row_offset_idx": 10, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg NH3-N+NOx-N (kg 
organic N)-1b", + "text": "kg NH3-N+NOx-N (kg organic N)-1b", "column_header": false, "row_header": false, "row_section": false @@ -12307,7 +12307,7 @@ "end_row_offset_idx": 11, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg NH3-N+NOx-N (kg synthetic N)-1b", + "text": "kg NH3-N+NOx-N (kg synthetic N)-1b", "column_header": false, "row_header": false, "row_section": false @@ -12369,7 +12369,7 @@ "end_row_offset_idx": 12, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg N2O-N (kg NH3-N+NOx-N)-1b", + "text": "kg N2O-N (kg NH3-N+NOx-N)-1b", "column_header": false, "row_header": false, "row_section": false @@ -12431,7 +12431,7 @@ "end_row_offset_idx": 13, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg N2O ha-1 from NH3+NOx volatilized", + "text": "kg N2O ha-1 from NH3+NOx volatilized", "column_header": false, "row_header": false, "row_section": false @@ -12555,7 +12555,7 @@ "end_row_offset_idx": 15, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg N losses by leaching (kg N)-1b", + "text": "kg N losses by leaching (kg N)-1b", "column_header": false, "row_header": false, "row_section": false @@ -12617,7 +12617,7 @@ "end_row_offset_idx": 16, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg N2O-N (kg N leaching)-1", + "text": "kg N2O-N (kg N leaching)-1", "column_header": false, "row_header": false, "row_section": false @@ -12679,7 +12679,7 @@ "end_row_offset_idx": 17, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    kg N2O ha-1 from N losses by leaching", + "text": "kg N2O ha-1 from N losses by leaching", "column_header": false, "row_header": false, "row_section": false @@ -13780,7 +13780,7 @@ "end_row_offset_idx": 8, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Fuel for manure handling", + "text": "Fuel for manure handling", "column_header": false, "row_header": false, "row_section": false @@ -13828,7 +13828,7 @@ 
"end_row_offset_idx": 9, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Machinery for manure handling", + "text": "Machinery for manure handling", "column_header": false, "row_header": false, "row_section": false @@ -13924,7 +13924,7 @@ "end_row_offset_idx": 11, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Electricity for milking", + "text": "Electricity for milking", "column_header": false, "row_header": false, "row_section": false @@ -13972,7 +13972,7 @@ "end_row_offset_idx": 12, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Electricity for lightingd", + "text": "Electricity for lightingd", "column_header": false, "row_header": false, "row_section": false @@ -14375,7 +14375,7 @@ "end_row_offset_idx": 8, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Fuel for manure handling", + "text": "Fuel for manure handling", "column_header": false, "row_header": false, "row_section": false @@ -14425,7 +14425,7 @@ "end_row_offset_idx": 9, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Machinery for manure handling", + "text": "Machinery for manure handling", "column_header": false, "row_header": false, "row_section": false @@ -14525,7 +14525,7 @@ "end_row_offset_idx": 11, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Electricity for milking", + "text": "Electricity for milking", "column_header": false, "row_header": false, "row_section": false @@ -14575,7 +14575,7 @@ "end_row_offset_idx": 12, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "    Electricity for lightingd", + "text": "Electricity for lightingd", "column_header": false, "row_header": false, "row_section": false diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.json b/tests/data/groundtruth/docling_v2/wiki_duck.html.json index 29799b45..31978f31 100644 --- a/tests/data/groundtruth/docling_v2/wiki_duck.html.json +++ 
b/tests/data/groundtruth/docling_v2/wiki_duck.html.json @@ -8410,7 +8410,7 @@ "end_row_offset_idx": 1, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Duck\n", + "text": "Duck", "column_header": true, "row_header": false, "row_section": false @@ -8422,7 +8422,7 @@ "end_row_offset_idx": 2, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "\n", + "text": "", "column_header": false, "row_header": false, "row_section": false @@ -8434,7 +8434,7 @@ "end_row_offset_idx": 3, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Bufflehead\n(Bucephala albeola)\n", + "text": "Bufflehead\n(Bucephala albeola)", "column_header": false, "row_header": false, "row_section": false @@ -8446,7 +8446,7 @@ "end_row_offset_idx": 4, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Scientific classification \n", + "text": "Scientific classification", "column_header": true, "row_header": false, "row_section": false @@ -8458,7 +8458,7 @@ "end_row_offset_idx": 5, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Domain:\n", + "text": "Domain:", "column_header": false, "row_header": false, "row_section": false @@ -8470,7 +8470,7 @@ "end_row_offset_idx": 5, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Eukaryota\n", + "text": "Eukaryota", "column_header": false, "row_header": false, "row_section": false @@ -8482,7 +8482,7 @@ "end_row_offset_idx": 6, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Kingdom:\n", + "text": "Kingdom:", "column_header": false, "row_header": false, "row_section": false @@ -8494,7 +8494,7 @@ "end_row_offset_idx": 6, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Animalia\n", + "text": "Animalia", "column_header": false, "row_header": false, "row_section": false @@ -8506,7 +8506,7 @@ "end_row_offset_idx": 7, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Phylum:\n", + "text": "Phylum:", "column_header": false, "row_header": false, 
"row_section": false @@ -8518,7 +8518,7 @@ "end_row_offset_idx": 7, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Chordata\n", + "text": "Chordata", "column_header": false, "row_header": false, "row_section": false @@ -8530,7 +8530,7 @@ "end_row_offset_idx": 8, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Class:\n", + "text": "Class:", "column_header": false, "row_header": false, "row_section": false @@ -8542,7 +8542,7 @@ "end_row_offset_idx": 8, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Aves\n", + "text": "Aves", "column_header": false, "row_header": false, "row_section": false @@ -8554,7 +8554,7 @@ "end_row_offset_idx": 9, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Order:\n", + "text": "Order:", "column_header": false, "row_header": false, "row_section": false @@ -8566,7 +8566,7 @@ "end_row_offset_idx": 9, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Anseriformes\n", + "text": "Anseriformes", "column_header": false, "row_header": false, "row_section": false @@ -8578,7 +8578,7 @@ "end_row_offset_idx": 10, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Superfamily:\n", + "text": "Superfamily:", "column_header": false, "row_header": false, "row_section": false @@ -8590,7 +8590,7 @@ "end_row_offset_idx": 10, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Anatoidea\n", + "text": "Anatoidea", "column_header": false, "row_header": false, "row_section": false @@ -8602,7 +8602,7 @@ "end_row_offset_idx": 11, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Family:\n", + "text": "Family:", "column_header": false, "row_header": false, "row_section": false @@ -8614,7 +8614,7 @@ "end_row_offset_idx": 11, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Anatidae\n", + "text": "Anatidae", "column_header": false, "row_header": false, "row_section": false @@ -8626,7 +8626,7 @@ "end_row_offset_idx": 12, "start_col_offset_idx": 0, 
"end_col_offset_idx": 2, - "text": "Subfamilies\n", + "text": "Subfamilies", "column_header": true, "row_header": false, "row_section": false @@ -8638,7 +8638,7 @@ "end_row_offset_idx": 13, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "\nSee text\n\n", + "text": "See text", "column_header": false, "row_header": false, "row_section": false @@ -8655,7 +8655,7 @@ "end_row_offset_idx": 1, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Duck\n", + "text": "Duck", "column_header": true, "row_header": false, "row_section": false @@ -8667,7 +8667,7 @@ "end_row_offset_idx": 1, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Duck\n", + "text": "Duck", "column_header": true, "row_header": false, "row_section": false @@ -8681,7 +8681,7 @@ "end_row_offset_idx": 2, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "\n", + "text": "", "column_header": false, "row_header": false, "row_section": false @@ -8693,7 +8693,7 @@ "end_row_offset_idx": 2, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "\n", + "text": "", "column_header": false, "row_header": false, "row_section": false @@ -8707,7 +8707,7 @@ "end_row_offset_idx": 3, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Bufflehead\n(Bucephala albeola)\n", + "text": "Bufflehead\n(Bucephala albeola)", "column_header": false, "row_header": false, "row_section": false @@ -8719,7 +8719,7 @@ "end_row_offset_idx": 3, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Bufflehead\n(Bucephala albeola)\n", + "text": "Bufflehead\n(Bucephala albeola)", "column_header": false, "row_header": false, "row_section": false @@ -8733,7 +8733,7 @@ "end_row_offset_idx": 4, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Scientific classification \n", + "text": "Scientific classification", "column_header": true, "row_header": false, "row_section": false @@ -8745,7 +8745,7 @@ "end_row_offset_idx": 4, "start_col_offset_idx": 0, 
"end_col_offset_idx": 2, - "text": "Scientific classification \n", + "text": "Scientific classification", "column_header": true, "row_header": false, "row_section": false @@ -8759,7 +8759,7 @@ "end_row_offset_idx": 5, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Domain:\n", + "text": "Domain:", "column_header": false, "row_header": false, "row_section": false @@ -8771,7 +8771,7 @@ "end_row_offset_idx": 5, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Eukaryota\n", + "text": "Eukaryota", "column_header": false, "row_header": false, "row_section": false @@ -8785,7 +8785,7 @@ "end_row_offset_idx": 6, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Kingdom:\n", + "text": "Kingdom:", "column_header": false, "row_header": false, "row_section": false @@ -8797,7 +8797,7 @@ "end_row_offset_idx": 6, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Animalia\n", + "text": "Animalia", "column_header": false, "row_header": false, "row_section": false @@ -8811,7 +8811,7 @@ "end_row_offset_idx": 7, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Phylum:\n", + "text": "Phylum:", "column_header": false, "row_header": false, "row_section": false @@ -8823,7 +8823,7 @@ "end_row_offset_idx": 7, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Chordata\n", + "text": "Chordata", "column_header": false, "row_header": false, "row_section": false @@ -8837,7 +8837,7 @@ "end_row_offset_idx": 8, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Class:\n", + "text": "Class:", "column_header": false, "row_header": false, "row_section": false @@ -8849,7 +8849,7 @@ "end_row_offset_idx": 8, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Aves\n", + "text": "Aves", "column_header": false, "row_header": false, "row_section": false @@ -8863,7 +8863,7 @@ "end_row_offset_idx": 9, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Order:\n", + "text": "Order:", 
"column_header": false, "row_header": false, "row_section": false @@ -8875,7 +8875,7 @@ "end_row_offset_idx": 9, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Anseriformes\n", + "text": "Anseriformes", "column_header": false, "row_header": false, "row_section": false @@ -8889,7 +8889,7 @@ "end_row_offset_idx": 10, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Superfamily:\n", + "text": "Superfamily:", "column_header": false, "row_header": false, "row_section": false @@ -8901,7 +8901,7 @@ "end_row_offset_idx": 10, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Anatoidea\n", + "text": "Anatoidea", "column_header": false, "row_header": false, "row_section": false @@ -8915,7 +8915,7 @@ "end_row_offset_idx": 11, "start_col_offset_idx": 0, "end_col_offset_idx": 1, - "text": "Family:\n", + "text": "Family:", "column_header": false, "row_header": false, "row_section": false @@ -8927,7 +8927,7 @@ "end_row_offset_idx": 11, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "Anatidae\n", + "text": "Anatidae", "column_header": false, "row_header": false, "row_section": false @@ -8941,7 +8941,7 @@ "end_row_offset_idx": 12, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Subfamilies\n", + "text": "Subfamilies", "column_header": true, "row_header": false, "row_section": false @@ -8953,7 +8953,7 @@ "end_row_offset_idx": 12, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Subfamilies\n", + "text": "Subfamilies", "column_header": true, "row_header": false, "row_section": false @@ -8967,7 +8967,7 @@ "end_row_offset_idx": 13, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "\nSee text\n\n", + "text": "See text", "column_header": false, "row_header": false, "row_section": false @@ -8979,7 +8979,7 @@ "end_row_offset_idx": 13, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "\nSee text\n\n", + "text": "See text", "column_header": false, "row_header": false, 
"row_section": false @@ -9010,7 +9010,7 @@ "end_row_offset_idx": 1, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Authority control databases ", + "text": "Authority control databases", "column_header": true, "row_header": false, "row_section": false @@ -9034,7 +9034,7 @@ "end_row_offset_idx": 2, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "United StatesFranceBnF dataJapanLatviaIsrael", + "text": "United States France BnF data Japan Latvia Israel", "column_header": false, "row_header": false, "row_section": false @@ -9075,7 +9075,7 @@ "end_row_offset_idx": 1, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Authority control databases ", + "text": "Authority control databases", "column_header": true, "row_header": false, "row_section": false @@ -9087,7 +9087,7 @@ "end_row_offset_idx": 1, "start_col_offset_idx": 0, "end_col_offset_idx": 2, - "text": "Authority control databases ", + "text": "Authority control databases", "column_header": true, "row_header": false, "row_section": false @@ -9113,7 +9113,7 @@ "end_row_offset_idx": 2, "start_col_offset_idx": 1, "end_col_offset_idx": 2, - "text": "United StatesFranceBnF dataJapanLatviaIsrael", + "text": "United States France BnF data Japan Latvia Israel", "column_header": false, "row_header": false, "row_section": false diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.md b/tests/data/groundtruth/docling_v2/wiki_duck.html.md index fa78a10d..d121e122 100644 --- a/tests/data/groundtruth/docling_v2/wiki_duck.html.md +++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.md @@ -511,10 +511,10 @@ Duck at Wikipedia's sister projects -| Authority control databases | Authority control databases | -|--------------------------------|----------------------------------------------| -| National | United StatesFranceBnF dataJapanLatviaIsrael | -| Other | IdRef | +| Authority control databases | Authority control databases | 
+|-------------------------------|---------------------------------------------------| +| National | United States France BnF data Japan Latvia Israel | +| Other | IdRef | Retrieved from "https://en.wikipedia.org/w/index.php?title=Duck&oldid=1246843351"