mirror of
https://github.com/DS4SD/docling.git
synced 2025-07-27 04:24:45 +00:00
Replaced hardcoded otsl tokens with the ones from docling-core tokens.py enum
Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
This commit is contained in:
parent
2a43c199d5
commit
437053572d
@ -154,48 +154,8 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
return BoundingBox(l=l, t=t, r=r, b=b)
|
return BoundingBox(l=l, t=t, r=r, b=b)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def parse_table_content_old(otsl_content: str) -> TableData:
|
|
||||||
rows = []
|
|
||||||
table_cells = []
|
|
||||||
|
|
||||||
for row_content in otsl_content.split("<nl>"):
|
|
||||||
row_content = row_content.strip()
|
|
||||||
if not row_content:
|
|
||||||
continue
|
|
||||||
|
|
||||||
current_row = []
|
|
||||||
cells = re.findall(r"<(fcel|ecel)>([^<]*)", row_content)
|
|
||||||
for cell_type, cell_content in cells:
|
|
||||||
if cell_type == "fcel":
|
|
||||||
current_row.append(cell_content.strip())
|
|
||||||
elif cell_type == "ecel":
|
|
||||||
current_row.append("")
|
|
||||||
|
|
||||||
if current_row:
|
|
||||||
rows.append(current_row)
|
|
||||||
|
|
||||||
for r_idx, row in enumerate(rows):
|
|
||||||
for c_idx, cell_text in enumerate(row):
|
|
||||||
table_cells.append(
|
|
||||||
TableCell(
|
|
||||||
text=cell_text.strip(),
|
|
||||||
row_span=1,
|
|
||||||
col_span=1,
|
|
||||||
start_row_offset_idx=r_idx,
|
|
||||||
end_row_offset_idx=r_idx + 1,
|
|
||||||
start_col_offset_idx=c_idx,
|
|
||||||
end_col_offset_idx=c_idx + 1,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
return TableData(
|
|
||||||
num_rows=len(rows),
|
|
||||||
num_cols=max(len(row) for row in rows) if rows else 0,
|
|
||||||
table_cells=table_cells,
|
|
||||||
)
|
|
||||||
|
|
||||||
def parse_texts(texts, tokens):
|
def parse_texts(texts, tokens):
|
||||||
split_word = "<nl>"
|
split_word = TableToken.OTSL_NL.value
|
||||||
split_row_tokens = [
|
split_row_tokens = [
|
||||||
list(y)
|
list(y)
|
||||||
for x, y in itertools.groupby(tokens, lambda z: z == split_word)
|
for x, y in itertools.groupby(tokens, lambda z: z == split_word)
|
||||||
@ -227,11 +187,17 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
|
|
||||||
for i, text in enumerate(texts):
|
for i, text in enumerate(texts):
|
||||||
cell_text = ""
|
cell_text = ""
|
||||||
if text in ["<fcel>", "<ecel>", "<ched>", "<rhed>", "<srow>"]:
|
if text in [
|
||||||
|
TableToken.OTSL_FCEL.value,
|
||||||
|
TableToken.OTSL_ECEL.value,
|
||||||
|
TableToken.OTSL_CHED.value,
|
||||||
|
TableToken.OTSL_RHED.value,
|
||||||
|
TableToken.OTSL_SROW.value,
|
||||||
|
]:
|
||||||
row_span = 1
|
row_span = 1
|
||||||
col_span = 1
|
col_span = 1
|
||||||
right_offset = 1
|
right_offset = 1
|
||||||
if text != "<ecel>":
|
if text != TableToken.OTSL_ECEL.value:
|
||||||
cell_text = texts[i + 1]
|
cell_text = texts[i + 1]
|
||||||
right_offset = 2
|
right_offset = 2
|
||||||
|
|
||||||
@ -242,15 +208,27 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
if r_idx + 1 < len(split_row_tokens):
|
if r_idx + 1 < len(split_row_tokens):
|
||||||
next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
|
next_bottom_cell = split_row_tokens[r_idx + 1][c_idx]
|
||||||
|
|
||||||
if next_right_cell in ["<lcel>", "<xcel>"]:
|
if next_right_cell in [
|
||||||
|
TableToken.OTSL_LCEL.value,
|
||||||
|
TableToken.OTSL_XCEL.value,
|
||||||
|
]:
|
||||||
# we have horisontal spanning cell or 2d spanning cell
|
# we have horisontal spanning cell or 2d spanning cell
|
||||||
col_span += count_right(
|
col_span += count_right(
|
||||||
split_row_tokens, c_idx + 1, r_idx, ["<lcel>", "<xcel>"]
|
split_row_tokens,
|
||||||
|
c_idx + 1,
|
||||||
|
r_idx,
|
||||||
|
[TableToken.OTSL_LCEL.value, TableToken.OTSL_XCEL.value],
|
||||||
)
|
)
|
||||||
if next_bottom_cell in ["<ucel>", "<xcel>"]:
|
if next_bottom_cell in [
|
||||||
|
TableToken.OTSL_UCEL.value,
|
||||||
|
TableToken.OTSL_XCEL.value,
|
||||||
|
]:
|
||||||
# we have a vertical spanning cell or 2d spanning cell
|
# we have a vertical spanning cell or 2d spanning cell
|
||||||
row_span += count_down(
|
row_span += count_down(
|
||||||
split_row_tokens, c_idx, r_idx + 1, ["<lcel>", "<xcel>"]
|
split_row_tokens,
|
||||||
|
c_idx,
|
||||||
|
r_idx + 1,
|
||||||
|
[TableToken.OTSL_UCEL.value, TableToken.OTSL_XCEL.value],
|
||||||
)
|
)
|
||||||
|
|
||||||
table_cells.append(
|
table_cells.append(
|
||||||
@ -265,17 +243,17 @@ class VlmPipeline(PaginatedPipeline):
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
if text in [
|
if text in [
|
||||||
"<fcel>",
|
TableToken.OTSL_FCEL.value,
|
||||||
"<ecel>",
|
TableToken.OTSL_ECEL.value,
|
||||||
"<ched>",
|
TableToken.OTSL_CHED.value,
|
||||||
"<rhed>",
|
TableToken.OTSL_RHED.value,
|
||||||
"<srow>",
|
TableToken.OTSL_SROW.value,
|
||||||
"<lcel>",
|
TableToken.OTSL_LCEL.value,
|
||||||
"<ucel>",
|
TableToken.OTSL_UCEL.value,
|
||||||
"<xcel>",
|
TableToken.OTSL_XCEL.value,
|
||||||
]:
|
]:
|
||||||
c_idx += 1
|
c_idx += 1
|
||||||
if text == "<nl>":
|
if text == TableToken.OTSL_NL.value:
|
||||||
r_idx += 1
|
r_idx += 1
|
||||||
c_idx = 0
|
c_idx = 0
|
||||||
return table_cells, split_row_tokens
|
return table_cells, split_row_tokens
|
||||||
|
Loading…
Reference in New Issue
Block a user