diff --git a/CHANGELOG.md b/CHANGELOG.md
index dd313163..5994ae14 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,16 @@
+## [v2.42.2](https://github.com/docling-project/docling/releases/tag/v2.42.2) - 2025-07-24
+
+### Fix
+
+* **HTML:** Concatenation of child strings in table cells and list items ([#1981](https://github.com/docling-project/docling/issues/1981)) ([`5132f06`](https://github.com/docling-project/docling/commit/5132f061a8125332ba10a4a30e0dd4973637a11b))
+* **docx:** Adding plain latex equations to table cells ([#1986](https://github.com/docling-project/docling/issues/1986)) ([`0b83609`](https://github.com/docling-project/docling/commit/0b836095319ebf2133c4a3a77602718034915e55))
+* Preserve PARTIAL_SUCCESS status when document timeout hits ([#1975](https://github.com/docling-project/docling/issues/1975)) ([`98e2fcf`](https://github.com/docling-project/docling/commit/98e2fcff63660c158bafb9a1b5584c1439d7a533))
+* Multi-page image support (tiff) ([#1928](https://github.com/docling-project/docling/issues/1928)) ([`8d50a59`](https://github.com/docling-project/docling/commit/8d50a59d4887caac1c214add8037ed0b5250f68c))
+
+### Documentation
+
+* Add chat with dosu ([#1984](https://github.com/docling-project/docling/issues/1984)) ([`7b5f860`](https://github.com/docling-project/docling/commit/7b5f86098d07b734f2b6aa8c88ae7cafa265246a))
+
## [v2.42.1](https://github.com/docling-project/docling/releases/tag/v2.42.1) - 2025-07-22
### Fix
diff --git a/README.md b/README.md
index c53e7b79..ebc5aeb7 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,7 @@
[](https://opensource.org/licenses/MIT)
[](https://pepy.tech/projects/docling)
[](https://apify.com/vancura/docling)
+[![Chat with Dosu](https://dosu.dev/dosu-chat-badge.svg)](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
[](https://www.bestpractices.dev/projects/10101)
[](https://lfaidata.foundation/projects/)
diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index 1d3f5712..dffc095b 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -5,7 +5,7 @@ from io import BytesIO
from pathlib import Path
from typing import Final, Optional, Union, cast
-from bs4 import BeautifulSoup, NavigableString, Tag
+from bs4 import BeautifulSoup, NavigableString, PageElement, Tag
from bs4.element import PreformattedString
from docling_core.types.doc import (
DocItem,
@@ -297,7 +297,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
):
parts.append(child)
elif isinstance(child, Tag) and child.name not in ("ul", "ol"):
- text_part = child.get_text()
+ text_part = HTMLDocumentBackend.get_text(child)
if text_part:
parts.append(text_part)
li_text = re.sub(r"\s+|\n+", " ", "".join(parts)).strip()
@@ -417,6 +417,36 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
content_layer=self.content_layer,
)
+ @staticmethod
+ def get_text(item: PageElement) -> str:
+ """Concatenate all child strings of a PageElement.
+
+ This method is equivalent to `PageElement.get_text()` but also considers
+ certain tags. When called on a <p> or <li> tag, it returns the text with a
+ trailing space, otherwise the text is concatenated without separators.
+ """
+
+ def _extract_text_recursively(item: PageElement) -> list[str]:
+ """Recursively extract text from all child nodes."""
+ result: list[str] = []
+
+ if isinstance(item, NavigableString):
+ result = [item]
+ elif isinstance(item, Tag):
+ tag = cast(Tag, item)
+ parts: list[str] = []
+ for child in tag:
+ parts.extend(_extract_text_recursively(child))
+ result.append(
+ "".join(parts) + " " if tag.name in {"p", "li"} else "".join(parts)
+ )
+
+ return result
+
+ parts: list[str] = _extract_text_recursively(item)
+
+ return "".join(parts)
+
@staticmethod
def _get_cell_spans(cell: Tag) -> tuple[int, int]:
"""Extract colspan and rowspan values from a table cell tag.
@@ -510,9 +540,7 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
formula.replace_with(NavigableString(math_formula))
# TODO: extract content correctly from table-cells with lists
- text = html_cell.text
-
- # label = html_cell.name
+ text = HTMLDocumentBackend.get_text(html_cell).strip()
col_span, row_span = HTMLDocumentBackend._get_cell_spans(html_cell)
if row_header:
row_span -= 1
diff --git a/docling/backend/msword_backend.py b/docling/backend/msword_backend.py
index abbcc6f6..45c53a98 100644
--- a/docling/backend/msword_backend.py
+++ b/docling/backend/msword_backend.py
@@ -1104,8 +1104,17 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):
)
_log.debug(f" spanned before row {spanned_idx}")
+ # Detect equations in cell text
+ text, equations = self._handle_equations_in_text(
+ element=cell._element, text=cell.text
+ )
+ if len(equations) == 0:
+ text = cell.text
+ else:
+ text = text.replace("<eq>", "$").replace("</eq>", "$")
+
table_cell = TableCell(
- text=cell.text,
+ text=text,
row_span=spanned_idx - row_idx,
col_span=cell.grid_span,
start_row_offset_idx=row.grid_cols_before + row_idx,
diff --git a/docs/index.md b/docs/index.md
index 7ec40bfa..768612ad 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -13,6 +13,7 @@
[](https://opensource.org/licenses/MIT)
[](https://pepy.tech/projects/docling)
[](https://apify.com/vancura/docling)
+[![Chat with Dosu](https://dosu.dev/dosu-chat-badge.svg)](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
[](https://www.bestpractices.dev/projects/10101)
[](https://lfaidata.foundation/projects/)
@@ -46,6 +47,13 @@ Docling simplifies document processing, parsing diverse formats — including ad
Reference
See more API details
+## Live assistant
+
+Do you want to leverage the power of AI and get live support on Docling?
+Try out the [Chat with Dosu](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github) functionality provided by our friends at [Dosu](https://dosu.dev/).
+
+[![Chat with Dosu](https://dosu.dev/dosu-chat-badge.svg)](https://app.dosu.dev/097760a8-135e-4789-8234-90c8837d7f1c/ask?utm_source=github)
+
## LF AI & Data
Docling is hosted as a project in the [LF AI & Data Foundation](https://lfaidata.foundation/projects/).
diff --git a/pyproject.toml b/pyproject.toml
index dfaa2872..94dad105 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "docling"
-version = "2.42.1" # DO NOT EDIT, updated automatically
+version = "2.42.2" # DO NOT EDIT, updated automatically
description = "SDK and CLI for parsing PDF, DOCX, HTML, and more, to a unified document representation for powering downstream workflows such as gen AI applications."
license = "MIT"
keywords = [
diff --git a/tests/data/docx/table_with_equations.docx b/tests/data/docx/table_with_equations.docx
new file mode 100644
index 00000000..151c03b2
Binary files /dev/null and b/tests/data/docx/table_with_equations.docx differ
diff --git a/tests/data/groundtruth/docling_v2/pntd.0008301.nxml.json b/tests/data/groundtruth/docling_v2/pntd.0008301.nxml.json
index 1fae7c12..7d1e82b4 100644
--- a/tests/data/groundtruth/docling_v2/pntd.0008301.nxml.json
+++ b/tests/data/groundtruth/docling_v2/pntd.0008301.nxml.json
@@ -5839,7 +5839,7 @@
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Number of total districts",
+ "text": "Number of total districts",
"column_header": false,
"row_header": false,
"row_section": false
@@ -6642,7 +6642,7 @@
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Number of total districts",
+ "text": "Number of total districts",
"column_header": false,
"row_header": false,
"row_section": false
diff --git a/tests/data/groundtruth/docling_v2/pone.0234687.nxml.json b/tests/data/groundtruth/docling_v2/pone.0234687.nxml.json
index 7d4f7ef8..b504fd9c 100644
--- a/tests/data/groundtruth/docling_v2/pone.0234687.nxml.json
+++ b/tests/data/groundtruth/docling_v2/pone.0234687.nxml.json
@@ -4166,7 +4166,7 @@
"end_row_offset_idx": 6,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Ground corn",
+ "text": "Ground corn",
"column_header": false,
"row_header": false,
"row_section": false
@@ -4298,7 +4298,7 @@
"end_row_offset_idx": 7,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Soybean meal",
+ "text": "Soybean meal",
"column_header": false,
"row_header": false,
"row_section": false
@@ -4430,7 +4430,7 @@
"end_row_offset_idx": 8,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Corn silage",
+ "text": "Corn silage",
"column_header": false,
"row_header": false,
"row_section": false
@@ -4562,7 +4562,7 @@
"end_row_offset_idx": 9,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Ann temperate pasture",
+ "text": "Ann temperate pasture",
"column_header": false,
"row_header": false,
"row_section": false
@@ -4694,7 +4694,7 @@
"end_row_offset_idx": 10,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Ann tropical pasture",
+ "text": "Ann tropical pasture",
"column_header": false,
"row_header": false,
"row_section": false
@@ -4826,7 +4826,7 @@
"end_row_offset_idx": 11,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Perenn tropical pasture",
+ "text": "Perenn tropical pasture",
"column_header": false,
"row_header": false,
"row_section": false
@@ -4970,7 +4970,7 @@
"end_row_offset_idx": 13,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Organic matter",
+ "text": "Organic matter",
"column_header": false,
"row_header": false,
"row_section": false
@@ -5102,7 +5102,7 @@
"end_row_offset_idx": 14,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Crude protein",
+ "text": "Crude protein",
"column_header": false,
"row_header": false,
"row_section": false
@@ -5234,7 +5234,7 @@
"end_row_offset_idx": 15,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Neutral detergent fibre",
+ "text": "Neutral detergent fibre",
"column_header": false,
"row_header": false,
"row_section": false
@@ -5366,7 +5366,7 @@
"end_row_offset_idx": 16,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Acid detergent fibre",
+ "text": "Acid detergent fibre",
"column_header": false,
"row_header": false,
"row_section": false
@@ -5498,7 +5498,7 @@
"end_row_offset_idx": 17,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Ether extract",
+ "text": "Ether extract",
"column_header": false,
"row_header": false,
"row_section": false
@@ -5642,7 +5642,7 @@
"end_row_offset_idx": 19,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " OM digestibility, %",
+ "text": "OM digestibility, %",
"column_header": false,
"row_header": false,
"row_section": false
@@ -5774,7 +5774,7 @@
"end_row_offset_idx": 20,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " NEL, Mcal (kg DM)-1",
+ "text": "NEL, Mcal (kg DM)-1",
"column_header": false,
"row_header": false,
"row_section": false
@@ -5906,7 +5906,7 @@
"end_row_offset_idx": 21,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " MP, g (kg DM)-1",
+ "text": "MP, g (kg DM)-1",
"column_header": false,
"row_header": false,
"row_section": false
@@ -6713,7 +6713,7 @@
"end_row_offset_idx": 6,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Ground corn",
+ "text": "Ground corn",
"column_header": false,
"row_header": false,
"row_section": false
@@ -6847,7 +6847,7 @@
"end_row_offset_idx": 7,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Soybean meal",
+ "text": "Soybean meal",
"column_header": false,
"row_header": false,
"row_section": false
@@ -6981,7 +6981,7 @@
"end_row_offset_idx": 8,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Corn silage",
+ "text": "Corn silage",
"column_header": false,
"row_header": false,
"row_section": false
@@ -7115,7 +7115,7 @@
"end_row_offset_idx": 9,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Ann temperate pasture",
+ "text": "Ann temperate pasture",
"column_header": false,
"row_header": false,
"row_section": false
@@ -7249,7 +7249,7 @@
"end_row_offset_idx": 10,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Ann tropical pasture",
+ "text": "Ann tropical pasture",
"column_header": false,
"row_header": false,
"row_section": false
@@ -7383,7 +7383,7 @@
"end_row_offset_idx": 11,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Perenn tropical pasture",
+ "text": "Perenn tropical pasture",
"column_header": false,
"row_header": false,
"row_section": false
@@ -7651,7 +7651,7 @@
"end_row_offset_idx": 13,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Organic matter",
+ "text": "Organic matter",
"column_header": false,
"row_header": false,
"row_section": false
@@ -7785,7 +7785,7 @@
"end_row_offset_idx": 14,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Crude protein",
+ "text": "Crude protein",
"column_header": false,
"row_header": false,
"row_section": false
@@ -7919,7 +7919,7 @@
"end_row_offset_idx": 15,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Neutral detergent fibre",
+ "text": "Neutral detergent fibre",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8053,7 +8053,7 @@
"end_row_offset_idx": 16,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Acid detergent fibre",
+ "text": "Acid detergent fibre",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8187,7 +8187,7 @@
"end_row_offset_idx": 17,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Ether extract",
+ "text": "Ether extract",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8455,7 +8455,7 @@
"end_row_offset_idx": 19,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " OM digestibility, %",
+ "text": "OM digestibility, %",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8589,7 +8589,7 @@
"end_row_offset_idx": 20,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " NEL, Mcal (kg DM)-1",
+ "text": "NEL, Mcal (kg DM)-1",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8723,7 +8723,7 @@
"end_row_offset_idx": 21,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " MP, g (kg DM)-1",
+ "text": "MP, g (kg DM)-1",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8998,7 +8998,7 @@
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Corn grain",
+ "text": "Corn grain",
"column_header": false,
"row_header": false,
"row_section": false
@@ -9058,7 +9058,7 @@
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Soybean",
+ "text": "Soybean",
"column_header": false,
"row_header": false,
"row_section": false
@@ -9178,7 +9178,7 @@
"end_row_offset_idx": 6,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Corn silageb",
+ "text": "Corn silageb",
"column_header": false,
"row_header": false,
"row_section": false
@@ -9238,7 +9238,7 @@
"end_row_offset_idx": 7,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Annual ryegrassc",
+ "text": "Annual ryegrassc",
"column_header": false,
"row_header": false,
"row_section": false
@@ -9298,7 +9298,7 @@
"end_row_offset_idx": 8,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Pearl milletd",
+ "text": "Pearl milletd",
"column_header": false,
"row_header": false,
"row_section": false
@@ -9358,7 +9358,7 @@
"end_row_offset_idx": 9,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Kikuyu grasse",
+ "text": "Kikuyu grasse",
"column_header": false,
"row_header": false,
"row_section": false
@@ -9547,7 +9547,7 @@
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Corn grain",
+ "text": "Corn grain",
"column_header": false,
"row_header": false,
"row_section": false
@@ -9609,7 +9609,7 @@
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Soybean",
+ "text": "Soybean",
"column_header": false,
"row_header": false,
"row_section": false
@@ -9733,7 +9733,7 @@
"end_row_offset_idx": 6,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Corn silageb",
+ "text": "Corn silageb",
"column_header": false,
"row_header": false,
"row_section": false
@@ -9795,7 +9795,7 @@
"end_row_offset_idx": 7,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Annual ryegrassc",
+ "text": "Annual ryegrassc",
"column_header": false,
"row_header": false,
"row_section": false
@@ -9857,7 +9857,7 @@
"end_row_offset_idx": 8,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Pearl milletd",
+ "text": "Pearl milletd",
"column_header": false,
"row_header": false,
"row_section": false
@@ -9919,7 +9919,7 @@
"end_row_offset_idx": 9,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Kikuyu grasse",
+ "text": "Kikuyu grasse",
"column_header": false,
"row_header": false,
"row_section": false
@@ -10182,7 +10182,7 @@
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " N organic fertilizer, kg ha-1a",
+ "text": "N organic fertilizer, kg ha-1a",
"column_header": false,
"row_header": false,
"row_section": false
@@ -10242,7 +10242,7 @@
"end_row_offset_idx": 5,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " N synthetic fertilizer",
+ "text": "N synthetic fertilizer",
"column_header": false,
"row_header": false,
"row_section": false
@@ -10302,7 +10302,7 @@
"end_row_offset_idx": 6,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " N from residual DM, kg ha-1b",
+ "text": "N from residual DM, kg ha-1b",
"column_header": false,
"row_header": false,
"row_section": false
@@ -10362,7 +10362,7 @@
"end_row_offset_idx": 7,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Emission fator, kg N2O-N (kg N)-1c",
+ "text": "Emission fator, kg N2O-N (kg N)-1c",
"column_header": false,
"row_header": false,
"row_section": false
@@ -10422,7 +10422,7 @@
"end_row_offset_idx": 8,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " kg N2O ha-1 from direct emissions",
+ "text": "kg N2O ha-1 from direct emissions",
"column_header": false,
"row_header": false,
"row_section": false
@@ -10542,7 +10542,7 @@
"end_row_offset_idx": 10,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " kg NH3-N+NOx-N (kg organic N)-1b",
+ "text": "kg NH3-N+NOx-N (kg organic N)-1b",
"column_header": false,
"row_header": false,
"row_section": false
@@ -10602,7 +10602,7 @@
"end_row_offset_idx": 11,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " kg NH3-N+NOx-N (kg synthetic N)-1b",
+ "text": "kg NH3-N+NOx-N (kg synthetic N)-1b",
"column_header": false,
"row_header": false,
"row_section": false
@@ -10662,7 +10662,7 @@
"end_row_offset_idx": 12,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " kg N2O-N (kg NH3-N+NOx-N)-1b",
+ "text": "kg N2O-N (kg NH3-N+NOx-N)-1b",
"column_header": false,
"row_header": false,
"row_section": false
@@ -10722,7 +10722,7 @@
"end_row_offset_idx": 13,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " kg N2O ha-1 from NH3+NOx volatilized",
+ "text": "kg N2O ha-1 from NH3+NOx volatilized",
"column_header": false,
"row_header": false,
"row_section": false
@@ -10842,7 +10842,7 @@
"end_row_offset_idx": 15,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " kg N losses by leaching (kg N)-1b",
+ "text": "kg N losses by leaching (kg N)-1b",
"column_header": false,
"row_header": false,
"row_section": false
@@ -10902,7 +10902,7 @@
"end_row_offset_idx": 16,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " kg N2O-N (kg N leaching)-1",
+ "text": "kg N2O-N (kg N leaching)-1",
"column_header": false,
"row_header": false,
"row_section": false
@@ -10962,7 +10962,7 @@
"end_row_offset_idx": 17,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " kg N2O ha-1 from N losses by leaching",
+ "text": "kg N2O ha-1 from N losses by leaching",
"column_header": false,
"row_header": false,
"row_section": false
@@ -11873,7 +11873,7 @@
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " N organic fertilizer, kg ha-1a",
+ "text": "N organic fertilizer, kg ha-1a",
"column_header": false,
"row_header": false,
"row_section": false
@@ -11935,7 +11935,7 @@
"end_row_offset_idx": 5,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " N synthetic fertilizer",
+ "text": "N synthetic fertilizer",
"column_header": false,
"row_header": false,
"row_section": false
@@ -11997,7 +11997,7 @@
"end_row_offset_idx": 6,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " N from residual DM, kg ha-1b",
+ "text": "N from residual DM, kg ha-1b",
"column_header": false,
"row_header": false,
"row_section": false
@@ -12059,7 +12059,7 @@
"end_row_offset_idx": 7,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Emission fator, kg N2O-N (kg N)-1c",
+ "text": "Emission fator, kg N2O-N (kg N)-1c",
"column_header": false,
"row_header": false,
"row_section": false
@@ -12121,7 +12121,7 @@
"end_row_offset_idx": 8,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " kg N2O ha-1 from direct emissions",
+ "text": "kg N2O ha-1 from direct emissions",
"column_header": false,
"row_header": false,
"row_section": false
@@ -12245,7 +12245,7 @@
"end_row_offset_idx": 10,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " kg NH3-N+NOx-N (kg organic N)-1b",
+ "text": "kg NH3-N+NOx-N (kg organic N)-1b",
"column_header": false,
"row_header": false,
"row_section": false
@@ -12307,7 +12307,7 @@
"end_row_offset_idx": 11,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " kg NH3-N+NOx-N (kg synthetic N)-1b",
+ "text": "kg NH3-N+NOx-N (kg synthetic N)-1b",
"column_header": false,
"row_header": false,
"row_section": false
@@ -12369,7 +12369,7 @@
"end_row_offset_idx": 12,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " kg N2O-N (kg NH3-N+NOx-N)-1b",
+ "text": "kg N2O-N (kg NH3-N+NOx-N)-1b",
"column_header": false,
"row_header": false,
"row_section": false
@@ -12431,7 +12431,7 @@
"end_row_offset_idx": 13,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " kg N2O ha-1 from NH3+NOx volatilized",
+ "text": "kg N2O ha-1 from NH3+NOx volatilized",
"column_header": false,
"row_header": false,
"row_section": false
@@ -12555,7 +12555,7 @@
"end_row_offset_idx": 15,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " kg N losses by leaching (kg N)-1b",
+ "text": "kg N losses by leaching (kg N)-1b",
"column_header": false,
"row_header": false,
"row_section": false
@@ -12617,7 +12617,7 @@
"end_row_offset_idx": 16,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " kg N2O-N (kg N leaching)-1",
+ "text": "kg N2O-N (kg N leaching)-1",
"column_header": false,
"row_header": false,
"row_section": false
@@ -12679,7 +12679,7 @@
"end_row_offset_idx": 17,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " kg N2O ha-1 from N losses by leaching",
+ "text": "kg N2O ha-1 from N losses by leaching",
"column_header": false,
"row_header": false,
"row_section": false
@@ -13780,7 +13780,7 @@
"end_row_offset_idx": 8,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Fuel for manure handling",
+ "text": "Fuel for manure handling",
"column_header": false,
"row_header": false,
"row_section": false
@@ -13828,7 +13828,7 @@
"end_row_offset_idx": 9,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Machinery for manure handling",
+ "text": "Machinery for manure handling",
"column_header": false,
"row_header": false,
"row_section": false
@@ -13924,7 +13924,7 @@
"end_row_offset_idx": 11,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Electricity for milking",
+ "text": "Electricity for milking",
"column_header": false,
"row_header": false,
"row_section": false
@@ -13972,7 +13972,7 @@
"end_row_offset_idx": 12,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Electricity for lightingd",
+ "text": "Electricity for lightingd",
"column_header": false,
"row_header": false,
"row_section": false
@@ -14375,7 +14375,7 @@
"end_row_offset_idx": 8,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Fuel for manure handling",
+ "text": "Fuel for manure handling",
"column_header": false,
"row_header": false,
"row_section": false
@@ -14425,7 +14425,7 @@
"end_row_offset_idx": 9,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Machinery for manure handling",
+ "text": "Machinery for manure handling",
"column_header": false,
"row_header": false,
"row_section": false
@@ -14525,7 +14525,7 @@
"end_row_offset_idx": 11,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Electricity for milking",
+ "text": "Electricity for milking",
"column_header": false,
"row_header": false,
"row_section": false
@@ -14575,7 +14575,7 @@
"end_row_offset_idx": 12,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": " Electricity for lightingd",
+ "text": "Electricity for lightingd",
"column_header": false,
"row_header": false,
"row_section": false
diff --git a/tests/data/groundtruth/docling_v2/table_with_equations.docx.itxt b/tests/data/groundtruth/docling_v2/table_with_equations.docx.itxt
new file mode 100644
index 00000000..8b54db7c
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/table_with_equations.docx.itxt
@@ -0,0 +1,3 @@
+item-0 at level 0: unspecified: group _root_
+ item-1 at level 1: table with [2x2]
+ item-2 at level 1: paragraph:
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/table_with_equations.docx.json b/tests/data/groundtruth/docling_v2/table_with_equations.docx.json
new file mode 100644
index 00000000..fc8f9780
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/table_with_equations.docx.json
@@ -0,0 +1,174 @@
+{
+ "schema_name": "DoclingDocument",
+ "version": "1.5.0",
+ "name": "table_with_equations",
+ "origin": {
+ "mimetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ "binary_hash": 6528760837820727976,
+ "filename": "table_with_equations.docx"
+ },
+ "furniture": {
+ "self_ref": "#/furniture",
+ "children": [],
+ "content_layer": "furniture",
+ "name": "_root_",
+ "label": "unspecified"
+ },
+ "body": {
+ "self_ref": "#/body",
+ "children": [
+ {
+ "$ref": "#/tables/0"
+ },
+ {
+ "$ref": "#/texts/0"
+ }
+ ],
+ "content_layer": "body",
+ "name": "_root_",
+ "label": "unspecified"
+ },
+ "groups": [],
+ "texts": [
+ {
+ "self_ref": "#/texts/0",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "paragraph",
+ "prov": [],
+ "orig": "",
+ "text": ""
+ }
+ ],
+ "pictures": [],
+ "tables": [
+ {
+ "self_ref": "#/tables/0",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "table",
+ "prov": [],
+ "captions": [],
+ "references": [],
+ "footnotes": [],
+ "data": {
+ "table_cells": [
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 0,
+ "end_row_offset_idx": 1,
+ "start_col_offset_idx": 0,
+ "end_col_offset_idx": 1,
+ "text": "The next cell has an equation",
+ "column_header": true,
+ "row_header": false,
+ "row_section": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 0,
+ "end_row_offset_idx": 1,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 2,
+ "text": "$A= \\pi r^{2}$",
+ "column_header": true,
+ "row_header": false,
+ "row_section": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 1,
+ "end_row_offset_idx": 2,
+ "start_col_offset_idx": 0,
+ "end_col_offset_idx": 1,
+ "text": "The next cell has another equation",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 1,
+ "end_row_offset_idx": 2,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 2,
+ "text": "$x=\\frac{-b \\pm \\sqrt{b^{2}-4ac}}{2a}$",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ }
+ ],
+ "num_rows": 2,
+ "num_cols": 2,
+ "grid": [
+ [
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 0,
+ "end_row_offset_idx": 1,
+ "start_col_offset_idx": 0,
+ "end_col_offset_idx": 1,
+ "text": "The next cell has an equation",
+ "column_header": true,
+ "row_header": false,
+ "row_section": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 0,
+ "end_row_offset_idx": 1,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 2,
+ "text": "$A= \\pi r^{2}$",
+ "column_header": true,
+ "row_header": false,
+ "row_section": false
+ }
+ ],
+ [
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 1,
+ "end_row_offset_idx": 2,
+ "start_col_offset_idx": 0,
+ "end_col_offset_idx": 1,
+ "text": "The next cell has another equation",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ },
+ {
+ "row_span": 1,
+ "col_span": 1,
+ "start_row_offset_idx": 1,
+ "end_row_offset_idx": 2,
+ "start_col_offset_idx": 1,
+ "end_col_offset_idx": 2,
+ "text": "$x=\\frac{-b \\pm \\sqrt{b^{2}-4ac}}{2a}$",
+ "column_header": false,
+ "row_header": false,
+ "row_section": false
+ }
+ ]
+ ]
+ },
+ "annotations": []
+ }
+ ],
+ "key_value_items": [],
+ "form_items": [],
+ "pages": {}
+}
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/table_with_equations.docx.md b/tests/data/groundtruth/docling_v2/table_with_equations.docx.md
new file mode 100644
index 00000000..837e6550
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/table_with_equations.docx.md
@@ -0,0 +1,3 @@
+| The next cell has an equation | $A= \pi r^{2}$ |
+|------------------------------------|----------------------------------------|
+| The next cell has another equation | $x=\frac{-b \pm \sqrt{b^{2}-4ac}}{2a}$ |
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.json b/tests/data/groundtruth/docling_v2/wiki_duck.html.json
index 29799b45..31978f31 100644
--- a/tests/data/groundtruth/docling_v2/wiki_duck.html.json
+++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.json
@@ -8410,7 +8410,7 @@
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 2,
- "text": "Duck\n",
+ "text": "Duck",
"column_header": true,
"row_header": false,
"row_section": false
@@ -8422,7 +8422,7 @@
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 2,
- "text": "\n",
+ "text": "",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8434,7 +8434,7 @@
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 2,
- "text": "Bufflehead\n(Bucephala albeola)\n",
+ "text": "Bufflehead\n(Bucephala albeola)",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8446,7 +8446,7 @@
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 2,
- "text": "Scientific classification \n",
+ "text": "Scientific classification",
"column_header": true,
"row_header": false,
"row_section": false
@@ -8458,7 +8458,7 @@
"end_row_offset_idx": 5,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": "Domain:\n",
+ "text": "Domain:",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8470,7 +8470,7 @@
"end_row_offset_idx": 5,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
- "text": "Eukaryota\n",
+ "text": "Eukaryota",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8482,7 +8482,7 @@
"end_row_offset_idx": 6,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": "Kingdom:\n",
+ "text": "Kingdom:",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8494,7 +8494,7 @@
"end_row_offset_idx": 6,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
- "text": "Animalia\n",
+ "text": "Animalia",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8506,7 +8506,7 @@
"end_row_offset_idx": 7,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": "Phylum:\n",
+ "text": "Phylum:",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8518,7 +8518,7 @@
"end_row_offset_idx": 7,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
- "text": "Chordata\n",
+ "text": "Chordata",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8530,7 +8530,7 @@
"end_row_offset_idx": 8,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": "Class:\n",
+ "text": "Class:",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8542,7 +8542,7 @@
"end_row_offset_idx": 8,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
- "text": "Aves\n",
+ "text": "Aves",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8554,7 +8554,7 @@
"end_row_offset_idx": 9,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": "Order:\n",
+ "text": "Order:",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8566,7 +8566,7 @@
"end_row_offset_idx": 9,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
- "text": "Anseriformes\n",
+ "text": "Anseriformes",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8578,7 +8578,7 @@
"end_row_offset_idx": 10,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": "Superfamily:\n",
+ "text": "Superfamily:",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8590,7 +8590,7 @@
"end_row_offset_idx": 10,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
- "text": "Anatoidea\n",
+ "text": "Anatoidea",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8602,7 +8602,7 @@
"end_row_offset_idx": 11,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": "Family:\n",
+ "text": "Family:",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8614,7 +8614,7 @@
"end_row_offset_idx": 11,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
- "text": "Anatidae\n",
+ "text": "Anatidae",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8626,7 +8626,7 @@
"end_row_offset_idx": 12,
"start_col_offset_idx": 0,
"end_col_offset_idx": 2,
- "text": "Subfamilies\n",
+ "text": "Subfamilies",
"column_header": true,
"row_header": false,
"row_section": false
@@ -8638,7 +8638,7 @@
"end_row_offset_idx": 13,
"start_col_offset_idx": 0,
"end_col_offset_idx": 2,
- "text": "\nSee text\n\n",
+ "text": "See text",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8655,7 +8655,7 @@
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 2,
- "text": "Duck\n",
+ "text": "Duck",
"column_header": true,
"row_header": false,
"row_section": false
@@ -8667,7 +8667,7 @@
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 2,
- "text": "Duck\n",
+ "text": "Duck",
"column_header": true,
"row_header": false,
"row_section": false
@@ -8681,7 +8681,7 @@
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 2,
- "text": "\n",
+ "text": "",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8693,7 +8693,7 @@
"end_row_offset_idx": 2,
"start_col_offset_idx": 0,
"end_col_offset_idx": 2,
- "text": "\n",
+ "text": "",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8707,7 +8707,7 @@
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 2,
- "text": "Bufflehead\n(Bucephala albeola)\n",
+ "text": "Bufflehead\n(Bucephala albeola)",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8719,7 +8719,7 @@
"end_row_offset_idx": 3,
"start_col_offset_idx": 0,
"end_col_offset_idx": 2,
- "text": "Bufflehead\n(Bucephala albeola)\n",
+ "text": "Bufflehead\n(Bucephala albeola)",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8733,7 +8733,7 @@
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 2,
- "text": "Scientific classification \n",
+ "text": "Scientific classification",
"column_header": true,
"row_header": false,
"row_section": false
@@ -8745,7 +8745,7 @@
"end_row_offset_idx": 4,
"start_col_offset_idx": 0,
"end_col_offset_idx": 2,
- "text": "Scientific classification \n",
+ "text": "Scientific classification",
"column_header": true,
"row_header": false,
"row_section": false
@@ -8759,7 +8759,7 @@
"end_row_offset_idx": 5,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": "Domain:\n",
+ "text": "Domain:",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8771,7 +8771,7 @@
"end_row_offset_idx": 5,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
- "text": "Eukaryota\n",
+ "text": "Eukaryota",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8785,7 +8785,7 @@
"end_row_offset_idx": 6,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": "Kingdom:\n",
+ "text": "Kingdom:",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8797,7 +8797,7 @@
"end_row_offset_idx": 6,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
- "text": "Animalia\n",
+ "text": "Animalia",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8811,7 +8811,7 @@
"end_row_offset_idx": 7,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": "Phylum:\n",
+ "text": "Phylum:",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8823,7 +8823,7 @@
"end_row_offset_idx": 7,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
- "text": "Chordata\n",
+ "text": "Chordata",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8837,7 +8837,7 @@
"end_row_offset_idx": 8,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": "Class:\n",
+ "text": "Class:",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8849,7 +8849,7 @@
"end_row_offset_idx": 8,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
- "text": "Aves\n",
+ "text": "Aves",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8863,7 +8863,7 @@
"end_row_offset_idx": 9,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": "Order:\n",
+ "text": "Order:",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8875,7 +8875,7 @@
"end_row_offset_idx": 9,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
- "text": "Anseriformes\n",
+ "text": "Anseriformes",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8889,7 +8889,7 @@
"end_row_offset_idx": 10,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": "Superfamily:\n",
+ "text": "Superfamily:",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8901,7 +8901,7 @@
"end_row_offset_idx": 10,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
- "text": "Anatoidea\n",
+ "text": "Anatoidea",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8915,7 +8915,7 @@
"end_row_offset_idx": 11,
"start_col_offset_idx": 0,
"end_col_offset_idx": 1,
- "text": "Family:\n",
+ "text": "Family:",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8927,7 +8927,7 @@
"end_row_offset_idx": 11,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
- "text": "Anatidae\n",
+ "text": "Anatidae",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8941,7 +8941,7 @@
"end_row_offset_idx": 12,
"start_col_offset_idx": 0,
"end_col_offset_idx": 2,
- "text": "Subfamilies\n",
+ "text": "Subfamilies",
"column_header": true,
"row_header": false,
"row_section": false
@@ -8953,7 +8953,7 @@
"end_row_offset_idx": 12,
"start_col_offset_idx": 0,
"end_col_offset_idx": 2,
- "text": "Subfamilies\n",
+ "text": "Subfamilies",
"column_header": true,
"row_header": false,
"row_section": false
@@ -8967,7 +8967,7 @@
"end_row_offset_idx": 13,
"start_col_offset_idx": 0,
"end_col_offset_idx": 2,
- "text": "\nSee text\n\n",
+ "text": "See text",
"column_header": false,
"row_header": false,
"row_section": false
@@ -8979,7 +8979,7 @@
"end_row_offset_idx": 13,
"start_col_offset_idx": 0,
"end_col_offset_idx": 2,
- "text": "\nSee text\n\n",
+ "text": "See text",
"column_header": false,
"row_header": false,
"row_section": false
@@ -9010,7 +9010,7 @@
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 2,
- "text": "Authority control databases ",
+ "text": "Authority control databases",
"column_header": true,
"row_header": false,
"row_section": false
@@ -9034,7 +9034,7 @@
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
- "text": "United StatesFranceBnF dataJapanLatviaIsrael",
+ "text": "United States France BnF data Japan Latvia Israel",
"column_header": false,
"row_header": false,
"row_section": false
@@ -9075,7 +9075,7 @@
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 2,
- "text": "Authority control databases ",
+ "text": "Authority control databases",
"column_header": true,
"row_header": false,
"row_section": false
@@ -9087,7 +9087,7 @@
"end_row_offset_idx": 1,
"start_col_offset_idx": 0,
"end_col_offset_idx": 2,
- "text": "Authority control databases ",
+ "text": "Authority control databases",
"column_header": true,
"row_header": false,
"row_section": false
@@ -9113,7 +9113,7 @@
"end_row_offset_idx": 2,
"start_col_offset_idx": 1,
"end_col_offset_idx": 2,
- "text": "United StatesFranceBnF dataJapanLatviaIsrael",
+ "text": "United States France BnF data Japan Latvia Israel",
"column_header": false,
"row_header": false,
"row_section": false
diff --git a/tests/data/groundtruth/docling_v2/wiki_duck.html.md b/tests/data/groundtruth/docling_v2/wiki_duck.html.md
index fa78a10d..d121e122 100644
--- a/tests/data/groundtruth/docling_v2/wiki_duck.html.md
+++ b/tests/data/groundtruth/docling_v2/wiki_duck.html.md
@@ -511,10 +511,10 @@ Duck at Wikipedia's sister projects
-| Authority control databases | Authority control databases |
-|--------------------------------|----------------------------------------------|
-| National | United StatesFranceBnF dataJapanLatviaIsrael |
-| Other | IdRef |
+| Authority control databases | Authority control databases |
+|-------------------------------|---------------------------------------------------|
+| National | United States France BnF data Japan Latvia Israel |
+| Other | IdRef |
Retrieved from "https://en.wikipedia.org/w/index.php?title=Duck&oldid=1246843351"
diff --git a/uv.lock b/uv.lock
index 30d35235..4a3f55e7 100644
--- a/uv.lock
+++ b/uv.lock
@@ -806,7 +806,7 @@ wheels = [
[[package]]
name = "docling"
-version = "2.42.1"
+version = "2.42.2"
source = { editable = "." }
dependencies = [
{ name = "accelerate" },