mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-08 12:48:28 +00:00
fix(html): Parse rowspan and colspan when they include non-numerical values (#2048)
* use re to stop at first non-digit Signed-off-by: Maroun Touma <touma@us.ibm.com> * Allow digit in first place followed by non numerical values Signed-off-by: Maroun Touma <touma@us.ibm.com> * refactor to match type checker Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> --------- Signed-off-by: Maroun Touma <touma@us.ibm.com> Signed-off-by: Michele Dolfi <dol@zurich.ibm.com> Co-authored-by: Michele Dolfi <dol@zurich.ibm.com>
This commit is contained in:
@@ -511,9 +511,17 @@ class HTMLDocumentBackend(DeclarativeDocumentBackend):
|
||||
str(cell.get("colspan", "1")),
|
||||
str(cell.get("rowspan", "1")),
|
||||
)
|
||||
|
||||
def _extract_num(s: str) -> int:
|
||||
if s and s[0].isnumeric():
|
||||
match = re.search(r"\d+", s)
|
||||
if match:
|
||||
return int(match.group())
|
||||
return 1
|
||||
|
||||
int_spans: tuple[int, int] = (
|
||||
int(raw_spans[0]) if raw_spans[0].isnumeric() else 1,
|
||||
int(raw_spans[1]) if raw_spans[0].isnumeric() else 1,
|
||||
_extract_num(raw_spans[0]),
|
||||
_extract_num(raw_spans[1]),
|
||||
)
|
||||
|
||||
return int_spans
|
||||
|
||||
Reference in New Issue
Block a user