fix(html): Parse rowspan and colspan when they include non-numerical values (#2048)

* use re to stop at first non-digit
  Signed-off-by: Maroun Touma <touma@us.ibm.com>
* Allow a digit in first place followed by non-numerical values
  Signed-off-by: Maroun Touma <touma@us.ibm.com>
* refactor to match type checker
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Maroun Touma <touma@us.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
2025-12-08 12:48:28 +00:00 · 2025-08-11 07:53:29 -04:00
parent bfda6d34d8
commit ed56f2de5d
1 changed file with 10 additions and 2 deletions
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -511,9 +511,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
            str(cell.get("colspan", "1")),
            str(cell.get("rowspan", "1")),
        )
+
+        def _extract_num(s: str) -> int:
+            if s and s[0].isnumeric():
+                match = re.search(r"\d+", s)
+                if match:
+                    return int(match.group())
+            return 1
+
        int_spans: tuple[int, int] = (
-            int(raw_spans[0]) if raw_spans[0].isnumeric() else 1,
-            int(raw_spans[1]) if raw_spans[0].isnumeric() else 1,
+            _extract_num(raw_spans[0]),
+            _extract_num(raw_spans[1]),
        )

        return int_spans