fix(xlsx): speed up by detecting the true last non-empty row/column (#2404)

* Update msexcel_backend.py Fix #2307, Follow the instruction of https://github.com/docling-project/docling/issues/2307#issuecomment-3327248503. Signed-off-by: Richard (Huangrui) Chu <65276824+HuangruiChu@users.noreply.github.com> * Update msexcel_backend.py Fix error Signed-off-by: Richard (Huangrui) Chu <65276824+HuangruiChu@users.noreply.github.com> * Fix linting issues Signed-off-by: Richard (Huangrui) Chu <65276824+HuangruiChu@users.noreply.github.com> * Add test files and data (Signed-off-by: Huangrui Chu <huangrui.chu.1999@gmail.com>) Signed-off-by: Richard (Huangrui) Chu <65276824+HuangruiChu@users.noreply.github.com> * resolve conflict with test_backend_msexecl; update the boundary Signed-off-by: Richard (Huangrui) Chu <65276824+HuangruiChu@users.noreply.github.com> * chore(xlsx): use a dataclass to represent a bounding rectangle in worksheets Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> * chore(xlsx): increase parsing speed by iterating on 'sheet._cells' Increase the parsing speed of the spreadsheet backend by iterating on 'sheets._cells' since this is proportional to the number of created cells. Rename test file to align it to other test files. Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> --------- Signed-off-by: Richard (Huangrui) Chu <65276824+HuangruiChu@users.noreply.github.com> Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com> Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
2025-12-08 12:48:28 +00:00 · 2025-10-21 02:08:20 -04:00
parent 657ce8b01c
commit b66624bfff
6 changed files with 4019 additions and 27 deletions
--- a/docling/backend/msexcel_backend.py
+++ b/docling/backend/msexcel_backend.py
@@ -1,7 +1,7 @@
 import logging
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Optional, Union, cast
+from typing import Annotated, Any, Optional, Union, cast

 from docling_core.types.doc import (
    BoundingBox,
@@ -23,7 +23,8 @@ from openpyxl.drawing.image import Image
 from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
 from openpyxl.worksheet.worksheet import Worksheet
 from PIL import Image as PILImage
-from pydantic import BaseModel, NonNegativeInt, PositiveInt
+from pydantic import BaseModel, Field, NonNegativeInt, PositiveInt
+from pydantic.dataclasses import dataclass
 from typing_extensions import override

 from docling.backend.abstract_backend import (
@@ -36,6 +37,32 @@ from docling.datamodel.document import InputDocument
 _log = logging.getLogger(__name__)


+@dataclass
+class DataRegion:
+    """Represents the bounding rectangle of non-empty cells in a worksheet."""
+
+    min_row: Annotated[
+        PositiveInt, Field(description="Smallest row index (1-based index).")
+    ]
+    max_row: Annotated[
+        PositiveInt, Field(description="Largest row index (1-based index).")
+    ]
+    min_col: Annotated[
+        PositiveInt, Field(description="Smallest column index (1-based index).")
+    ]
+    max_col: Annotated[
+        PositiveInt, Field(description="Largest column index (1-based index).")
+    ]
+
+    def width(self) -> PositiveInt:
+        """Number of columns in the data region."""
+        return self.max_col - self.min_col + 1
+
+    def height(self) -> PositiveInt:
+        """Number of rows in the data region."""
+        return self.max_row - self.min_row + 1
+
+
 class ExcelCell(BaseModel):
    """Represents an Excel cell.

@@ -294,6 +321,48 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken

        return doc

+    def _find_true_data_bounds(self, sheet: Worksheet) -> DataRegion:
+        """Find the true data boundaries (min/max rows and columns) in a worksheet.
+
+        This function scans all cells to find the smallest rectangular region that contains
+        all non-empty cells or merged cell ranges. It returns the minimal and maximal
+        row/column indices that bound the actual data region.
+
+        Args:
+            sheet: The worksheet to analyze.
+
+        Returns:
+            A data region representing the smallest rectangle that covers all data and merged cells.
+            If the sheet is empty, returns (1, 1, 1, 1) by default.
+        """
+        min_row, min_col = None, None
+        max_row, max_col = 0, 0
+
+        for cell in sheet._cells.values():
+            if cell.value is not None:
+                r, c = cell.row, cell.column
+                min_row = r if min_row is None else min(min_row, r)
+                min_col = c if min_col is None else min(min_col, c)
+                max_row = max(max_row, r)
+                max_col = max(max_col, c)
+
+        # Expand bounds to include merged cells
+        for merged in sheet.merged_cells.ranges:
+            min_row = (
+                merged.min_row if min_row is None else min(min_row, merged.min_row)
+            )
+            min_col = (
+                merged.min_col if min_col is None else min(min_col, merged.min_col)
+            )
+            max_row = max(max_row, merged.max_row)
+            max_col = max(max_col, merged.max_col)
+
+        # If no data found, default to (1, 1, 1, 1)
+        if min_row is None or min_col is None:
+            min_row = min_col = max_row = max_col = 1
+
+        return DataRegion(min_row, max_row, min_col, max_col)
+
    def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
        """Find all compact rectangular data tables in an Excel worksheet.

@@ -303,18 +372,31 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
        Returns:
            A list of ExcelTable objects representing the data tables.
        """
+        bounds: DataRegion = self._find_true_data_bounds(
+            sheet
+        )  # The true data boundaries
        tables: list[ExcelTable] = []  # List to store found tables
        visited: set[tuple[int, int]] = set()  # Track already visited cells

-        # Iterate over all cells in the sheet
-        for ri, row in enumerate(sheet.iter_rows(values_only=False)):
-            for rj, cell in enumerate(row):
-                # Skip empty or already visited cells
+        # Limit scan to actual data bounds
+        for ri, row in enumerate(
+            sheet.iter_rows(
+                min_row=bounds.min_row,
+                max_row=bounds.max_row,
+                min_col=bounds.min_col,
+                max_col=bounds.max_col,
+                values_only=False,
+            ),
+            start=bounds.min_row - 1,
+        ):
+            for rj, cell in enumerate(row, start=bounds.min_col - 1):
                if cell.value is None or (ri, rj) in visited:
                    continue

                # If the cell starts a new table, find its bounds
-                table_bounds, visited_cells = self._find_table_bounds(sheet, ri, rj)
+                table_bounds, visited_cells = self._find_table_bounds(
+                    sheet, ri, rj, bounds.max_row, bounds.max_col
+                )

                visited.update(visited_cells)  # Mark these cells as visited
                tables.append(table_bounds)
@@ -326,6 +408,8 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
        sheet: Worksheet,
        start_row: int,
        start_col: int,
+        max_row: int,
+        max_col: int,
    ) -> tuple[ExcelTable, set[tuple[int, int]]]:
        """Determine the bounds of a compact rectangular table.

@@ -333,14 +417,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
            sheet: The Excel worksheet to be parsed.
            start_row: The row number of the starting cell.
            start_col: The column number of the starting cell.
+            max_row: Maximum row boundary from true data bounds.
+            max_col: Maximum column boundary from true data bounds.

        Returns:
            A tuple with an Excel table and a set of cell coordinates.
        """
        _log.debug("find_table_bounds")

-        max_row = self._find_table_bottom(sheet, start_row, start_col)
-        max_col = self._find_table_right(sheet, start_row, start_col)
+        table_max_row = self._find_table_bottom(sheet, start_row, start_col, max_row)
+        table_max_col = self._find_table_right(sheet, start_row, start_col, max_col)

        # Collect the data within the bounds
        data = []
@@ -348,9 +434,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
        for ri, row in enumerate(
            sheet.iter_rows(
                min_row=start_row + 1,  # start_row is 0-based but iter_rows is 1-based
-                max_row=max_row + 1,
+                max_row=table_max_row + 1,
                min_col=start_col + 1,
-                max_col=max_col + 1,
+                max_col=table_max_col + 1,
                values_only=False,
            ),
            start_row,
@@ -390,15 +476,15 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
        return (
            ExcelTable(
                anchor=(start_col, start_row),
-                num_rows=max_row + 1 - start_row,
-                num_cols=max_col + 1 - start_col,
+                num_rows=table_max_row + 1 - start_row,
+                num_cols=table_max_col + 1 - start_col,
                data=data,
            ),
            visited_cells,
        )

    def _find_table_bottom(
-        self, sheet: Worksheet, start_row: int, start_col: int
+        self, sheet: Worksheet, start_row: int, start_col: int, max_row: int
    ) -> int:
        """Find the bottom boundary of a table.

@@ -406,16 +492,17 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
            sheet: The Excel worksheet to be parsed.
            start_row: The starting row of the table.
            start_col: The starting column of the table.
+            max_row: Maximum row boundary from true data bounds.

        Returns:
            The row index representing the bottom boundary of the table.
        """
-        max_row: int = start_row
+        table_max_row: int = start_row

        for ri, (cell,) in enumerate(
            sheet.iter_rows(
                min_row=start_row + 2,
-                max_row=sheet.max_row,
+                max_row=max_row,
                min_col=start_col + 1,
                max_col=start_col + 1,
                values_only=False,
@@ -431,16 +518,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
            if cell.value is None and not merged_range:
                break  # Stop if the cell is empty and not merged

-            # Expand max_row to include the merged range if applicable
+            # Expand table_max_row to include the merged range if applicable
            if merged_range:
-                max_row = max(max_row, merged_range.max_row - 1)
+                table_max_row = max(table_max_row, merged_range.max_row - 1)
            else:
-                max_row = ri
+                table_max_row = ri

-        return max_row
+        return table_max_row

    def _find_table_right(
-        self, sheet: Worksheet, start_row: int, start_col: int
+        self, sheet: Worksheet, start_row: int, start_col: int, max_col: int
    ) -> int:
        """Find the right boundary of a table.

@@ -448,18 +535,19 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
            sheet: The Excel worksheet to be parsed.
            start_row: The starting row of the table.
            start_col: The starting column of the table.
+            max_col: The actual max column of the table.

        Returns:
            The column index representing the right boundary of the table."
        """
-        max_col: int = start_col
+        table_max_col: int = start_col

        for rj, (cell,) in enumerate(
            sheet.iter_cols(
                min_row=start_row + 1,
                max_row=start_row + 1,
                min_col=start_col + 2,
-                max_col=sheet.max_column,
+                max_col=max_col,
                values_only=False,
            ),
            start_col + 1,
@@ -473,13 +561,13 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
            if cell.value is None and not merged_range:
                break  # Stop if the cell is empty and not merged

-            # Expand max_col to include the merged range if applicable
+            # Expand table_max_col to include the merged range if applicable
            if merged_range:
-                max_col = max(max_col, merged_range.max_col - 1)
+                table_max_col = max(table_max_col, merged_range.max_col - 1)
            else:
-                max_col = rj
+                table_max_col = rj

-        return max_col
+        return table_max_col

    def _find_images_in_sheet(
        self, doc: DoclingDocument, sheet: Worksheet
--- a/tests/data/groundtruth/docling_v2/xlsx_04_inflated.xlsx.itxt
+++ b/tests/data/groundtruth/docling_v2/xlsx_04_inflated.xlsx.itxt
@@ -0,0 +1,11 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: section: group sheet: Sheet1
+    item-2 at level 2: table with [7x3]
+  item-3 at level 1: section: group sheet: Sheet2
+    item-4 at level 2: table with [9x4]
+    item-5 at level 2: table with [5x3]
+    item-6 at level 2: table with [5x3]
+  item-7 at level 1: section: group sheet: Sheet3
+    item-8 at level 2: table with [7x3]
+    item-9 at level 2: table with [7x3]
+    item-10 at level 2: picture
--- a/tests/data/groundtruth/docling_v2/xlsx_04_inflated.xlsx.json
+++ b/tests/data/groundtruth/docling_v2/xlsx_04_inflated.xlsx.json
--- a/tests/data/groundtruth/docling_v2/xlsx_04_inflated.xlsx.md
+++ b/tests/data/groundtruth/docling_v2/xlsx_04_inflated.xlsx.md
@@ -0,0 +1,53 @@
+|   first  |   second  |   third |
+|----------|-----------|---------|
+|        1 |         5 |       9 |
+|        2 |         4 |       6 |
+|        3 |         3 |       3 |
+|        4 |         2 |       0 |
+|        5 |         1 |      -3 |
+|        6 |         0 |      -6 |
+
+|   col-1 |   col-2 |   col-3 |   col-4 |
+|---------|---------|---------|---------|
+|       1 |       2 |       3 |       4 |
+|       2 |       4 |       6 |       8 |
+|       3 |       6 |       9 |      12 |
+|       4 |       8 |      12 |      16 |
+|       5 |      10 |      15 |      20 |
+|       6 |      12 |      18 |      24 |
+|       7 |      14 |      21 |      28 |
+|       8 |      16 |      24 |      32 |
+
+|   col-1 |   col-2 |   col-3 |
+|---------|---------|---------|
+|       1 |       2 |       3 |
+|       2 |       4 |       6 |
+|       3 |       6 |       9 |
+|       4 |       8 |      12 |
+
+|   col-1 |   col-2 |   col-3 |
+|---------|---------|---------|
+|       1 |       2 |       3 |
+|       2 |       4 |       6 |
+|       3 |       6 |       9 |
+|       4 |       8 |      12 |
+
+| first    | header   | header   |
+|----------|----------|----------|
+| first    | second   | third    |
+| 1        | 2        | 3        |
+| 3        | 4        | 5        |
+| 3        | 6        | 7        |
+| 8        | 9        | 9        |
+| 10       | 9        | 9        |
+
+| first (f)   | header (f)   | header (f)   |
+|-------------|--------------|--------------|
+| first (f)   | second       | third        |
+| 1           | 2            | 3            |
+| 3           | 4            | 5            |
+| 3           | 6            | 7            |
+| 8           | 9            | 9            |
+| 10          | 9            | 9            |
+
+<!-- image -->
--- a/tests/data/xlsx/xlsx_04_inflated.xlsx
+++ b/tests/data/xlsx/xlsx_04_inflated.xlsx
--- a/tests/test_backend_msexcel.py
+++ b/tests/test_backend_msexcel.py
@@ -2,6 +2,7 @@ import logging
 from pathlib import Path

 import pytest
+from openpyxl import load_workbook

 from docling.backend.msexcel_backend import MsExcelDocumentBackend
 from docling.datamodel.base_models import InputFormat
@@ -113,3 +114,67 @@ def test_chartsheet(documents) -> None:
    assert doc.groups[1].name == "sheet: Duck Chart"
    assert doc.pages[2].size.height == 0
    assert doc.pages[2].size.width == 0
+
+
+def test_inflated_rows_handling(documents) -> None:
+    """Test that files with inflated max_row are handled correctly.
+
+    xlsx_04_inflated.xlsx has inflated max_row (1,048,496) but only 7 rows of actual data.
+    This test verifies that our backend correctly identifies true data bounds.
+    """
+    # First, verify the file has inflated max_row using openpyxl directly
+    path = next(item for item in get_excel_paths() if item.stem == "xlsx_04_inflated")
+
+    wb = load_workbook(path)
+    ws = wb.active
+    reported_max_row = ws.max_row
+
+    # Assert that openpyxl reports inflated max_row
+    assert reported_max_row > 100000, (
+        f"xlsx_04_inflated.xlsx should have inflated max_row (expected >100k, got {reported_max_row:,}). "
+        f"This test file is designed to verify proper handling of Excel files with inflated row counts."
+    )
+
+    _log.info(
+        f"xlsx_04_inflated.xlsx - Openpyxl reported max_row: {reported_max_row:,}"
+    )
+
+    # Now test that our backend handles it correctly
+    in_doc = InputDocument(
+        path_or_stream=path,
+        format=InputFormat.XLSX,
+        filename=path.stem,
+        backend=MsExcelDocumentBackend,
+    )
+    backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=path)
+
+    # Verify backend detects correct number of pages (should be 4, like test-01)
+    page_count = backend.page_count()
+    assert page_count == 4, (
+        f"Backend should detect 4 pages (same as test-01), got {page_count}"
+    )
+
+    # Verify converted document has correct pages
+    doc = next(item for path, item in documents if path.stem == "xlsx_04_inflated")
+    assert len(doc.pages) == 4, f"Document should have 4 pages, got {len(doc.pages)}"
+
+    # Verify page sizes match expected dimensions (same as test-01)
+    # These should reflect actual data, not inflated row counts
+    assert doc.pages.get(1).size.as_tuple() == (3.0, 7.0), (
+        f"Page 1 should be 3x7 cells, got {doc.pages.get(1).size.as_tuple()}"
+    )
+    assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0), (
+        f"Page 2 should be 9x18 cells, got {doc.pages.get(2).size.as_tuple()}"
+    )
+    assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0), (
+        f"Page 3 should be 13x36 cells, got {doc.pages.get(3).size.as_tuple()}"
+    )
+    assert doc.pages.get(4).size.as_tuple() == (0.0, 0.0), (
+        f"Page 4 should be 0x0 cells (empty), got {doc.pages.get(4).size.as_tuple()}"
+    )
+
+    _log.info(
+        f"✓ Successfully handled inflated max_row: "
+        f"reported {reported_max_row:,} rows, "
+        f"correctly processed as {page_count} pages with proper dimensions"
+    )