mirror of
https://github.com/DS4SD/docling.git
synced 2025-12-11 22:28:31 +00:00
fix: add table raw content when no table structure model is used (#1815)
* add table raw cells when no table structure model was used
  Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
* Add RichTableCell instance for tables with missing structure.
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
* Update test GT
  Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
---------
Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
@@ -355,6 +355,13 @@ def convert( # noqa: C901
|
||||
help="Replace any existing text with OCR generated text over the full content.",
|
||||
),
|
||||
] = False,
|
||||
tables: Annotated[
|
||||
bool,
|
||||
typer.Option(
|
||||
...,
|
||||
help="If enabled, the table structure model will be used to extract table information.",
|
||||
),
|
||||
] = True,
|
||||
ocr_engine: Annotated[
|
||||
str,
|
||||
typer.Option(
|
||||
@@ -591,7 +598,7 @@ def convert( # noqa: C901
|
||||
accelerator_options=accelerator_options,
|
||||
do_ocr=ocr,
|
||||
ocr_options=ocr_options,
|
||||
do_table_structure=True,
|
||||
do_table_structure=tables,
|
||||
do_code_enrichment=enrich_code,
|
||||
do_formula_enrichment=enrich_formula,
|
||||
do_picture_description=enrich_picture_description,
|
||||
|
||||
@@ -9,6 +9,7 @@ from docling_core.types.doc import (
|
||||
NodeItem,
|
||||
ProvenanceItem,
|
||||
RefItem,
|
||||
RichTableCell,
|
||||
TableData,
|
||||
)
|
||||
from docling_core.types.doc.document import ContentLayer
|
||||
@@ -103,6 +104,22 @@ class ReadingOrderModel:
|
||||
else:
|
||||
doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)
|
||||
|
||||
def _create_rich_cell_group(
    self, element: BasePageElement, doc: DoclingDocument, table_item: NodeItem
) -> RefItem:
    """Wrap the children of ``element`` in a group attached to a table.

    A new group labelled ``GroupLabel.UNSPECIFIED`` is added to ``doc`` with
    ``table_item`` as its parent, and the child elements of ``element`` are
    then added beneath that group through ``_add_child_elements``.

    Args:
        element: Page element whose children are placed in the group.
        doc: Document that receives the new group.
        table_item: Node that becomes the parent of the new group.

    Returns:
        Reference to the newly created group.
    """
    # The name embeds the current number of tables in the document followed
    # by a fixed "0_0" suffix (presumably the row/column index of the single
    # cell -- confirm against the caller).
    cell_group = doc.add_group(
        label=GroupLabel.UNSPECIFIED,
        name=f"rich_cell_group_{len(doc.tables)}_0_0",
        parent=table_item,
    )

    # Populate the group with every child of the element.
    self._add_child_elements(element, cell_group, doc)

    return cell_group.get_ref()
|
||||
|
||||
def _readingorder_elements_to_docling_doc(
|
||||
self,
|
||||
conv_res: ConversionResult,
|
||||
@@ -197,11 +214,16 @@ class ReadingOrderModel:
|
||||
)
|
||||
|
||||
elif isinstance(element, Table):
|
||||
tbl_data = TableData(
|
||||
num_rows=element.num_rows,
|
||||
num_cols=element.num_cols,
|
||||
table_cells=element.table_cells,
|
||||
)
|
||||
# Check if table has no structure prediction
|
||||
if element.num_rows == 0 and element.num_cols == 0:
|
||||
# Create minimal 1x1 table with rich cell containing all children
|
||||
tbl_data = TableData(num_rows=1, num_cols=1, table_cells=[])
|
||||
else:
|
||||
tbl_data = TableData(
|
||||
num_rows=element.num_rows,
|
||||
num_cols=element.num_cols,
|
||||
table_cells=element.table_cells,
|
||||
)
|
||||
|
||||
prov = ProvenanceItem(
|
||||
page_no=element.page_no + 1,
|
||||
@@ -231,6 +253,26 @@ class ReadingOrderModel:
|
||||
|
||||
tbl.footnotes.append(new_footnote_item.get_ref())
|
||||
|
||||
# Handle case where table has no structure prediction
|
||||
if element.num_rows == 0 and element.num_cols == 0:
|
||||
# Create rich cell containing all child elements
|
||||
rich_cell_ref = self._create_rich_cell_group(element, out_doc, tbl)
|
||||
|
||||
# Create rich table cell spanning the entire 1x1 table
|
||||
rich_cell = RichTableCell(
|
||||
text="", # Empty text since content is in the group
|
||||
row_span=1,
|
||||
col_span=1,
|
||||
start_row_offset_idx=0,
|
||||
end_row_offset_idx=1,
|
||||
start_col_offset_idx=0,
|
||||
end_col_offset_idx=1,
|
||||
column_header=False,
|
||||
row_header=False,
|
||||
ref=rich_cell_ref,
|
||||
)
|
||||
out_doc.add_table_cell(table_item=tbl, cell=rich_cell)
|
||||
|
||||
# TODO: Consider adding children of Table.
|
||||
|
||||
elif isinstance(element, FigureElement):
|
||||
|
||||
Reference in New Issue
Block a user