fix(html): Parse rowspan and colspan when they include non-numerical values (#2048)

* use re to stop at first non-digit

Signed-off-by: Maroun Touma <touma@us.ibm.com>

* Allow a leading digit followed by non-numerical values

Signed-off-by: Maroun Touma <touma@us.ibm.com>

* refactor to match type checker

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

---------

Signed-off-by: Maroun Touma <touma@us.ibm.com>
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
Maroun Touma
2025-08-11 07:53:29 -04:00
committed by GitHub
parent bfda6d34d8
commit ed56f2de5d

View File

@@ -511,9 +511,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
str(cell.get("colspan", "1")),
str(cell.get("rowspan", "1")),
)
def _extract_num(s: str) -> int:
    """Return the integer formed by the leading decimal digits of ``s``.

    Used to read span attribute values that may carry trailing
    non-digit characters (for example ``"2;"`` or ``"3px"``).

    Args:
        s: Raw attribute value as a string.

    Returns:
        The leading run of decimal digits converted to ``int``, or ``1``
        when ``s`` is empty or does not start with a decimal digit.
    """
    # re.match anchors at position 0, so digits are only taken from the very
    # start of the value. A separate ``s[0].isnumeric()`` guard combined with
    # an unanchored search would accept characters such as "½" or "²" and
    # then pick up unrelated digits that appear later in the string.
    leading_digits = re.match(r"\d+", s)
    return int(leading_digits.group()) if leading_digits else 1
int_spans: tuple[int, int] = (
int(raw_spans[0]) if raw_spans[0].isnumeric() else 1,
int(raw_spans[1]) if raw_spans[0].isnumeric() else 1,
_extract_num(raw_spans[0]),
_extract_num(raw_spans[1]),
)
return int_spans