fix: add table raw content when no table structure model is used (#1815)

* add table raw cells when no table structure model was used

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Add RichTableCell instance for tables with missing structure.

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update test GT

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
Michele Dolfi
2025-10-02 13:46:42 +02:00
committed by GitHub
parent f0b630e24e
commit 4f295ed051
34 changed files with 6835 additions and 3389 deletions

View File

@@ -355,6 +355,13 @@ def convert( # noqa: C901
help="Replace any existing text with OCR generated text over the full content.",
),
] = False,
tables: Annotated[
bool,
typer.Option(
...,
help="If enabled, the table structure model will be used to extract table information.",
),
] = True,
ocr_engine: Annotated[
str,
typer.Option(
@@ -591,7 +598,7 @@ def convert( # noqa: C901
accelerator_options=accelerator_options,
do_ocr=ocr,
ocr_options=ocr_options,
do_table_structure=True,
do_table_structure=tables,
do_code_enrichment=enrich_code,
do_formula_enrichment=enrich_formula,
do_picture_description=enrich_picture_description,

View File

@@ -9,6 +9,7 @@ from docling_core.types.doc import (
NodeItem,
ProvenanceItem,
RefItem,
RichTableCell,
TableData,
)
from docling_core.types.doc.document import ContentLayer
@@ -103,6 +104,22 @@ class ReadingOrderModel:
else:
doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
def _create_rich_cell_group(
    self, element: BasePageElement, doc: DoclingDocument, table_item: NodeItem
) -> RefItem:
    """Build the group backing a rich table cell and return a reference to it.

    A new group labelled ``GroupLabel.UNSPECIFIED`` is added to ``doc`` with
    ``table_item`` as its parent. ``element`` is then passed, together with
    the new group, to ``self._add_child_elements`` so that its children end
    up under that group.

    Args:
        element: Page element whose children should populate the group.
        doc: Document that receives the new group.
        table_item: Node used as the parent of the new group.

    Returns:
        The reference obtained from the new group's ``get_ref()``.
    """
    # The name embeds the number of tables currently in ``doc`` followed by
    # a fixed "_0_0" suffix (presumably the row/column of the single cell --
    # TODO confirm).
    cell_group = doc.add_group(
        label=GroupLabel.UNSPECIFIED,
        name=f"rich_cell_group_{len(doc.tables)}_0_0",
        parent=table_item,
    )

    # Populate the group before handing out its reference.
    self._add_child_elements(element, cell_group, doc)
    return cell_group.get_ref()
def _readingorder_elements_to_docling_doc(
self,
conv_res: ConversionResult,
@@ -197,11 +214,16 @@ class ReadingOrderModel:
)
elif isinstance(element, Table):
tbl_data = TableData(
num_rows=element.num_rows,
num_cols=element.num_cols,
table_cells=element.table_cells,
)
# Check if table has no structure prediction
if element.num_rows == 0 and element.num_cols == 0:
# Create minimal 1x1 table with rich cell containing all children
tbl_data = TableData(num_rows=1, num_cols=1, table_cells=[])
else:
tbl_data = TableData(
num_rows=element.num_rows,
num_cols=element.num_cols,
table_cells=element.table_cells,
)
prov = ProvenanceItem(
page_no=element.page_no + 1,
@@ -231,6 +253,26 @@ class ReadingOrderModel:
tbl.footnotes.append(new_footnote_item.get_ref())
# Handle case where table has no structure prediction
if element.num_rows == 0 and element.num_cols == 0:
# Create rich cell containing all child elements
rich_cell_ref = self._create_rich_cell_group(element, out_doc, tbl)
# Create rich table cell spanning the entire 1x1 table
rich_cell = RichTableCell(
text="", # Empty text since content is in the group
row_span=1,
col_span=1,
start_row_offset_idx=0,
end_row_offset_idx=1,
start_col_offset_idx=0,
end_col_offset_idx=1,
column_header=False,
row_header=False,
ref=rich_cell_ref,
)
out_doc.add_table_cell(table_item=tbl, cell=rich_cell)
# TODO: Consider adding children of Table.
elif isinstance(element, FigureElement):