fix(xlsx): speed up by detecting the true last non-empty row/column (#2404)

* Update msexcel_backend.py Fix #2307, Follow the instruction of https://github.com/docling-project/docling/issues/2307#issuecomment-3327248503. Signed-off-by: Richard (Huangrui) Chu <65276824+HuangruiChu@users.noreply.github.com> * Update msexcel_backend.py Fix error Signed-off-by: Richard (Huangrui) Chu <65276824+HuangruiChu@users.noreply.github.com> * Fix linting issues Signed-off-by: Richard (Huangrui) Chu <65276824+HuangruiChu@users.noreply.github.com> * Add test files and data (Signed-off-by: Huangrui Chu <huangrui.chu.1999@gmail.com>) Signed-off-by: Richard (Huangrui) Chu <65276824+HuangruiChu@users.noreply.github.com> * resolve conflict with test_backend_msexecl; update the boundary Signed-off-by: Richard (Huangrui) Chu <65276824+HuangruiChu@users.noreply.github.com> * chore(xlsx): use a dataclass to represent a bounding rectangle in worksheets Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(xlsx): increase parsing speed by iterating on 'sheet._cells' Increase the parsing speed of the spreadsheet backend by iterating on 'sheets._cells' since this is proportional to the number of created cells. Rename test file to align it to other test files. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> --------- Signed-off-by: Richard (Huangrui) Chu <65276824+HuangruiChu@users.noreply.github.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
2025-12-11 22:28:31 +00:00 · 2025-10-21 02:08:20 -04:00
parent 657ce8b01c
commit b66624bfff
6 changed files with 4019 additions and 27 deletions
--- a/docling/backend/msexcel_backend.py
+++ b/docling/backend/msexcel_backend.py
@@ -1,7 +1,7 @@
 import logging
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Optional, Union, cast
+from typing import Annotated, Any, Optional, Union, cast

 from docling_core.types.doc import (
    BoundingBox,
@@ -23,7 +23,8 @@ from openpyxl.drawing.image import Image
 from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
 from openpyxl.worksheet.worksheet import Worksheet
 from PIL import Image as PILImage
-from pydantic import BaseModel, NonNegativeInt, PositiveInt
+from pydantic import BaseModel, Field, NonNegativeInt, PositiveInt
+from pydantic.dataclasses import dataclass
 from typing_extensions import override

 from docling.backend.abstract_backend import (
@@ -36,6 +37,32 @@ from docling.datamodel.document import InputDocument
 _log = logging.getLogger(__name__)


+@dataclass
+class DataRegion:
+    """Represents the bounding rectangle of non-empty cells in a worksheet."""
+
+    min_row: Annotated[
+        PositiveInt, Field(description="Smallest row index (1-based index).")
+    ]
+    max_row: Annotated[
+        PositiveInt, Field(description="Largest row index (1-based index).")
+    ]
+    min_col: Annotated[
+        PositiveInt, Field(description="Smallest column index (1-based index).")
+    ]
+    max_col: Annotated[
+        PositiveInt, Field(description="Largest column index (1-based index).")
+    ]
+
+    def width(self) -> PositiveInt:
+        """Number of columns in the data region."""
+        return self.max_col - self.min_col + 1
+
+    def height(self) -> PositiveInt:
+        """Number of rows in the data region."""
+        return self.max_row - self.min_row + 1
+
+
 class ExcelCell(BaseModel):
    """Represents an Excel cell.

@@ -294,6 +321,48 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken

        return doc

+    def _find_true_data_bounds(self, sheet: Worksheet) -> DataRegion:
+        """Find the true data boundaries (min/max rows and columns) in a worksheet.
+
+        This function scans all cells to find the smallest rectangular region that contains
+        all non-empty cells or merged cell ranges. It returns the minimal and maximal
+        row/column indices that bound the actual data region.
+
+        Args:
+            sheet: The worksheet to analyze.
+
+        Returns:
+            A data region representing the smallest rectangle that covers all data and merged cells.
+            If the sheet is empty, returns (1, 1, 1, 1) by default.
+        """
+        min_row, min_col = None, None
+        max_row, max_col = 0, 0
+
+        for cell in sheet._cells.values():
+            if cell.value is not None:
+                r, c = cell.row, cell.column
+                min_row = r if min_row is None else min(min_row, r)
+                min_col = c if min_col is None else min(min_col, c)
+                max_row = max(max_row, r)
+                max_col = max(max_col, c)
+
+        # Expand bounds to include merged cells
+        for merged in sheet.merged_cells.ranges:
+            min_row = (
+                merged.min_row if min_row is None else min(min_row, merged.min_row)
+            )
+            min_col = (
+                merged.min_col if min_col is None else min(min_col, merged.min_col)
+            )
+            max_row = max(max_row, merged.max_row)
+            max_col = max(max_col, merged.max_col)
+
+        # If no data found, default to (1, 1, 1, 1)
+        if min_row is None or min_col is None:
+            min_row = min_col = max_row = max_col = 1
+
+        return DataRegion(min_row, max_row, min_col, max_col)
+
    def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
        """Find all compact rectangular data tables in an Excel worksheet.

@@ -303,18 +372,31 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
        Returns:
            A list of ExcelTable objects representing the data tables.
        """
+        bounds: DataRegion = self._find_true_data_bounds(
+            sheet
+        )  # The true data boundaries
        tables: list[ExcelTable] = []  # List to store found tables
        visited: set[tuple[int, int]] = set()  # Track already visited cells

-        # Iterate over all cells in the sheet
-        for ri, row in enumerate(sheet.iter_rows(values_only=False)):
-            for rj, cell in enumerate(row):
-                # Skip empty or already visited cells
+        # Limit scan to actual data bounds
+        for ri, row in enumerate(
+            sheet.iter_rows(
+                min_row=bounds.min_row,
+                max_row=bounds.max_row,
+                min_col=bounds.min_col,
+                max_col=bounds.max_col,
+                values_only=False,
+            ),
+            start=bounds.min_row - 1,
+        ):
+            for rj, cell in enumerate(row, start=bounds.min_col - 1):
                if cell.value is None or (ri, rj) in visited:
                    continue

                # If the cell starts a new table, find its bounds
-                table_bounds, visited_cells = self._find_table_bounds(sheet, ri, rj)
+                table_bounds, visited_cells = self._find_table_bounds(
+                    sheet, ri, rj, bounds.max_row, bounds.max_col
+                )

                visited.update(visited_cells)  # Mark these cells as visited
                tables.append(table_bounds)
@@ -326,6 +408,8 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
        sheet: Worksheet,
        start_row: int,
        start_col: int,
+        max_row: int,
+        max_col: int,
    ) -> tuple[ExcelTable, set[tuple[int, int]]]:
        """Determine the bounds of a compact rectangular table.

@@ -333,14 +417,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
            sheet: The Excel worksheet to be parsed.
            start_row: The row number of the starting cell.
            start_col: The column number of the starting cell.
+            max_row: Maximum row boundary from true data bounds.
+            max_col: Maximum column boundary from true data bounds.

        Returns:
            A tuple with an Excel table and a set of cell coordinates.
        """
        _log.debug("find_table_bounds")

-        max_row = self._find_table_bottom(sheet, start_row, start_col)
-        max_col = self._find_table_right(sheet, start_row, start_col)
+        table_max_row = self._find_table_bottom(sheet, start_row, start_col, max_row)
+        table_max_col = self._find_table_right(sheet, start_row, start_col, max_col)

        # Collect the data within the bounds
        data = []
@@ -348,9 +434,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
        for ri, row in enumerate(
            sheet.iter_rows(
                min_row=start_row + 1,  # start_row is 0-based but iter_rows is 1-based
-                max_row=max_row + 1,
+                max_row=table_max_row + 1,
                min_col=start_col + 1,
-                max_col=max_col + 1,
+                max_col=table_max_col + 1,
                values_only=False,
            ),
            start_row,
@@ -390,15 +476,15 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
        return (
            ExcelTable(
                anchor=(start_col, start_row),
-                num_rows=max_row + 1 - start_row,
-                num_cols=max_col + 1 - start_col,
+                num_rows=table_max_row + 1 - start_row,
+                num_cols=table_max_col + 1 - start_col,
                data=data,
            ),
            visited_cells,
        )

    def _find_table_bottom(
-        self, sheet: Worksheet, start_row: int, start_col: int
+        self, sheet: Worksheet, start_row: int, start_col: int, max_row: int
    ) -> int:
        """Find the bottom boundary of a table.

@@ -406,16 +492,17 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
            sheet: The Excel worksheet to be parsed.
            start_row: The starting row of the table.
            start_col: The starting column of the table.
+            max_row: Maximum row boundary from true data bounds.

        Returns:
            The row index representing the bottom boundary of the table.
        """
-        max_row: int = start_row
+        table_max_row: int = start_row

        for ri, (cell,) in enumerate(
            sheet.iter_rows(
                min_row=start_row + 2,
-                max_row=sheet.max_row,
+                max_row=max_row,
                min_col=start_col + 1,
                max_col=start_col + 1,
                values_only=False,
@@ -431,16 +518,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
            if cell.value is None and not merged_range:
                break  # Stop if the cell is empty and not merged

-            # Expand max_row to include the merged range if applicable
+            # Expand table_max_row to include the merged range if applicable
            if merged_range:
-                max_row = max(max_row, merged_range.max_row - 1)
+                table_max_row = max(table_max_row, merged_range.max_row - 1)
            else:
-                max_row = ri
+                table_max_row = ri

-        return max_row
+        return table_max_row

    def _find_table_right(
-        self, sheet: Worksheet, start_row: int, start_col: int
+        self, sheet: Worksheet, start_row: int, start_col: int, max_col: int
    ) -> int:
        """Find the right boundary of a table.

@@ -448,18 +535,19 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
            sheet: The Excel worksheet to be parsed.
            start_row: The starting row of the table.
            start_col: The starting column of the table.
+            max_col: The actual max column of the table.

        Returns:
            The column index representing the right boundary of the table."
        """
-        max_col: int = start_col
+        table_max_col: int = start_col

        for rj, (cell,) in enumerate(
            sheet.iter_cols(
                min_row=start_row + 1,
                max_row=start_row + 1,
                min_col=start_col + 2,
-                max_col=sheet.max_column,
+                max_col=max_col,
                values_only=False,
            ),
            start_col + 1,
@@ -473,13 +561,13 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
            if cell.value is None and not merged_range:
                break  # Stop if the cell is empty and not merged

-            # Expand max_col to include the merged range if applicable
+            # Expand table_max_col to include the merged range if applicable
            if merged_range:
-                max_col = max(max_col, merged_range.max_col - 1)
+                table_max_col = max(table_max_col, merged_range.max_col - 1)
            else:
-                max_col = rj
+                table_max_col = rj

-        return max_col
+        return table_max_col

    def _find_images_in_sheet(
        self, doc: DoclingDocument, sheet: Worksheet