fix: add table raw content when no table structure model is used (#1815)

* add table raw cells when no table structure model was used

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>

* Add RichTableCell instance for tables with missing structure.

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

* Update test GT

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>

---------

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
Co-authored-by: Christoph Auer <cau@zurich.ibm.com>
2025-12-11 22:28:31 +00:00 · 2025-10-02 13:46:42 +02:00
parent f0b630e24e
commit 4f295ed051
34 changed files with 6835 additions and 3389 deletions
--- a/docling/cli/main.py
+++ b/docling/cli/main.py
@@ -355,6 +355,13 @@ def convert(  # noqa: C901
            help="Replace any existing text with OCR generated text over the full content.",
        ),
    ] = False,
+    tables: Annotated[
+        bool,
+        typer.Option(
+            ...,
+            help="If enabled, the table structure model will be used to extract table information.",
+        ),
+    ] = True,
    ocr_engine: Annotated[
        str,
        typer.Option(
@@ -591,7 +598,7 @@ def convert(  # noqa: C901
                accelerator_options=accelerator_options,
                do_ocr=ocr,
                ocr_options=ocr_options,
-                do_table_structure=True,
+                do_table_structure=tables,
                do_code_enrichment=enrich_code,
                do_formula_enrichment=enrich_formula,
                do_picture_description=enrich_picture_description,
--- a/docling/models/readingorder_model.py
+++ b/docling/models/readingorder_model.py
@@ -9,6 +9,7 @@ from docling_core.types.doc import (
    NodeItem,
    ProvenanceItem,
    RefItem,
+    RichTableCell,
    TableData,
 )
 from docling_core.types.doc.document import ContentLayer
@@ -103,6 +104,22 @@ class ReadingOrderModel:
            else:
                doc.add_text(parent=doc_item, label=c_label, text=c_text, prov=c_prov)

+    def _create_rich_cell_group(
+        self, element: BasePageElement, doc: DoclingDocument, table_item: NodeItem
+    ) -> RefItem:
+        """Group element's children under table_item for a rich cell; return its ref."""
+        group_name = f"rich_cell_group_{len(doc.tables)}_0_0"
+        group_element = doc.add_group(
+            label=GroupLabel.UNSPECIFIED,
+            name=group_name,
+            parent=table_item,
+        )
+
+        # Add all child elements to the group
+        self._add_child_elements(element, group_element, doc)
+
+        return group_element.get_ref()
+
    def _readingorder_elements_to_docling_doc(
        self,
        conv_res: ConversionResult,
@@ -197,11 +214,16 @@ class ReadingOrderModel:
                            )

            elif isinstance(element, Table):
-                tbl_data = TableData(
-                    num_rows=element.num_rows,
-                    num_cols=element.num_cols,
-                    table_cells=element.table_cells,
-                )
+                # Check if table has no structure prediction
+                if element.num_rows == 0 and element.num_cols == 0:
+                    # Create minimal 1x1 table with rich cell containing all children
+                    tbl_data = TableData(num_rows=1, num_cols=1, table_cells=[])
+                else:
+                    tbl_data = TableData(
+                        num_rows=element.num_rows,
+                        num_cols=element.num_cols,
+                        table_cells=element.table_cells,
+                    )

                prov = ProvenanceItem(
                    page_no=element.page_no + 1,
@@ -231,6 +253,26 @@ class ReadingOrderModel:

                        tbl.footnotes.append(new_footnote_item.get_ref())

+                # Handle case where table has no structure prediction
+                if element.num_rows == 0 and element.num_cols == 0:
+                    # Create rich cell containing all child elements
+                    rich_cell_ref = self._create_rich_cell_group(element, out_doc, tbl)
+
+                    # Create rich table cell spanning the entire 1x1 table
+                    rich_cell = RichTableCell(
+                        text="",  # Empty text since content is in the group
+                        row_span=1,
+                        col_span=1,
+                        start_row_offset_idx=0,
+                        end_row_offset_idx=1,
+                        start_col_offset_idx=0,
+                        end_col_offset_idx=1,
+                        column_header=False,
+                        row_header=False,
+                        ref=rich_cell_ref,
+                    )
+                    out_doc.add_table_cell(table_item=tbl, cell=rich_cell)
+
                # TODO: Consider adding children of Table.

            elif isinstance(element, FigureElement):