fix(xlsx): speed up by detecting the true last non-empty row/column (#2404)

* Update msexcel_backend.py

Fix #2307, Follow the instruction of https://github.com/docling-project/docling/issues/2307#issuecomment-3327248503.

Signed-off-by: Richard (Huangrui) Chu <65276824+HuangruiChu@users.noreply.github.com>

* Update msexcel_backend.py

Fix error

Signed-off-by: Richard (Huangrui) Chu <65276824+HuangruiChu@users.noreply.github.com>

* Fix linting issues

Signed-off-by: Richard (Huangrui) Chu <65276824+HuangruiChu@users.noreply.github.com>

* Add test files and data (Signed-off-by: Huangrui Chu <huangrui.chu.1999@gmail.com>)

Signed-off-by: Richard (Huangrui) Chu <65276824+HuangruiChu@users.noreply.github.com>

* resolve conflict with test_backend_msexecl; update the boundary

Signed-off-by: Richard (Huangrui) Chu <65276824+HuangruiChu@users.noreply.github.com>

* chore(xlsx): use a dataclass to represent a bounding rectangle in worksheets

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

* chore(xlsx): increase parsing speed by iterating on 'sheet._cells'

Increase the parsing speed of the spreadsheet backend by iterating on 'sheets._cells'
since this is proportional to the number of created cells.
Rename test file to align it to other test files.

Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>

---------

Signed-off-by: Richard (Huangrui) Chu <65276824+HuangruiChu@users.noreply.github.com>
Signed-off-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
Co-authored-by: Cesar Berrospi Ramis <ceb@zurich.ibm.com>
This commit is contained in:
Richard (Huangrui) Chu
2025-10-21 02:08:20 -04:00
committed by GitHub
parent 657ce8b01c
commit b66624bfff
6 changed files with 4019 additions and 27 deletions

View File

@@ -1,7 +1,7 @@
import logging
from io import BytesIO
from pathlib import Path
from typing import Any, Optional, Union, cast
from typing import Annotated, Any, Optional, Union, cast
from docling_core.types.doc import (
BoundingBox,
@@ -23,7 +23,8 @@ from openpyxl.drawing.image import Image
from openpyxl.drawing.spreadsheet_drawing import TwoCellAnchor
from openpyxl.worksheet.worksheet import Worksheet
from PIL import Image as PILImage
from pydantic import BaseModel, NonNegativeInt, PositiveInt
from pydantic import BaseModel, Field, NonNegativeInt, PositiveInt
from pydantic.dataclasses import dataclass
from typing_extensions import override
from docling.backend.abstract_backend import (
@@ -36,6 +37,32 @@ from docling.datamodel.document import InputDocument
_log = logging.getLogger(__name__)
@dataclass
class DataRegion:
"""Represents the bounding rectangle of non-empty cells in a worksheet."""
min_row: Annotated[
PositiveInt, Field(description="Smallest row index (1-based index).")
]
max_row: Annotated[
PositiveInt, Field(description="Largest row index (1-based index).")
]
min_col: Annotated[
PositiveInt, Field(description="Smallest column index (1-based index).")
]
max_col: Annotated[
PositiveInt, Field(description="Largest column index (1-based index).")
]
def width(self) -> PositiveInt:
"""Number of columns in the data region."""
return self.max_col - self.min_col + 1
def height(self) -> PositiveInt:
"""Number of rows in the data region."""
return self.max_row - self.min_row + 1
class ExcelCell(BaseModel):
"""Represents an Excel cell.
@@ -294,6 +321,48 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
return doc
def _find_true_data_bounds(self, sheet: Worksheet) -> DataRegion:
"""Find the true data boundaries (min/max rows and columns) in a worksheet.
This function scans all cells to find the smallest rectangular region that contains
all non-empty cells or merged cell ranges. It returns the minimal and maximal
row/column indices that bound the actual data region.
Args:
sheet: The worksheet to analyze.
Returns:
A data region representing the smallest rectangle that covers all data and merged cells.
If the sheet is empty, returns (1, 1, 1, 1) by default.
"""
min_row, min_col = None, None
max_row, max_col = 0, 0
for cell in sheet._cells.values():
if cell.value is not None:
r, c = cell.row, cell.column
min_row = r if min_row is None else min(min_row, r)
min_col = c if min_col is None else min(min_col, c)
max_row = max(max_row, r)
max_col = max(max_col, c)
# Expand bounds to include merged cells
for merged in sheet.merged_cells.ranges:
min_row = (
merged.min_row if min_row is None else min(min_row, merged.min_row)
)
min_col = (
merged.min_col if min_col is None else min(min_col, merged.min_col)
)
max_row = max(max_row, merged.max_row)
max_col = max(max_col, merged.max_col)
# If no data found, default to (1, 1, 1, 1)
if min_row is None or min_col is None:
min_row = min_col = max_row = max_col = 1
return DataRegion(min_row, max_row, min_col, max_col)
def _find_data_tables(self, sheet: Worksheet) -> list[ExcelTable]:
"""Find all compact rectangular data tables in an Excel worksheet.
@@ -303,18 +372,31 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
Returns:
A list of ExcelTable objects representing the data tables.
"""
bounds: DataRegion = self._find_true_data_bounds(
sheet
) # The true data boundaries
tables: list[ExcelTable] = [] # List to store found tables
visited: set[tuple[int, int]] = set() # Track already visited cells
# Iterate over all cells in the sheet
for ri, row in enumerate(sheet.iter_rows(values_only=False)):
for rj, cell in enumerate(row):
# Skip empty or already visited cells
# Limit scan to actual data bounds
for ri, row in enumerate(
sheet.iter_rows(
min_row=bounds.min_row,
max_row=bounds.max_row,
min_col=bounds.min_col,
max_col=bounds.max_col,
values_only=False,
),
start=bounds.min_row - 1,
):
for rj, cell in enumerate(row, start=bounds.min_col - 1):
if cell.value is None or (ri, rj) in visited:
continue
# If the cell starts a new table, find its bounds
table_bounds, visited_cells = self._find_table_bounds(sheet, ri, rj)
table_bounds, visited_cells = self._find_table_bounds(
sheet, ri, rj, bounds.max_row, bounds.max_col
)
visited.update(visited_cells) # Mark these cells as visited
tables.append(table_bounds)
@@ -326,6 +408,8 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
sheet: Worksheet,
start_row: int,
start_col: int,
max_row: int,
max_col: int,
) -> tuple[ExcelTable, set[tuple[int, int]]]:
"""Determine the bounds of a compact rectangular table.
@@ -333,14 +417,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
sheet: The Excel worksheet to be parsed.
start_row: The row number of the starting cell.
start_col: The column number of the starting cell.
max_row: Maximum row boundary from true data bounds.
max_col: Maximum column boundary from true data bounds.
Returns:
A tuple with an Excel table and a set of cell coordinates.
"""
_log.debug("find_table_bounds")
max_row = self._find_table_bottom(sheet, start_row, start_col)
max_col = self._find_table_right(sheet, start_row, start_col)
table_max_row = self._find_table_bottom(sheet, start_row, start_col, max_row)
table_max_col = self._find_table_right(sheet, start_row, start_col, max_col)
# Collect the data within the bounds
data = []
@@ -348,9 +434,9 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
for ri, row in enumerate(
sheet.iter_rows(
min_row=start_row + 1, # start_row is 0-based but iter_rows is 1-based
max_row=max_row + 1,
max_row=table_max_row + 1,
min_col=start_col + 1,
max_col=max_col + 1,
max_col=table_max_col + 1,
values_only=False,
),
start_row,
@@ -390,15 +476,15 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
return (
ExcelTable(
anchor=(start_col, start_row),
num_rows=max_row + 1 - start_row,
num_cols=max_col + 1 - start_col,
num_rows=table_max_row + 1 - start_row,
num_cols=table_max_col + 1 - start_col,
data=data,
),
visited_cells,
)
def _find_table_bottom(
self, sheet: Worksheet, start_row: int, start_col: int
self, sheet: Worksheet, start_row: int, start_col: int, max_row: int
) -> int:
"""Find the bottom boundary of a table.
@@ -406,16 +492,17 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
sheet: The Excel worksheet to be parsed.
start_row: The starting row of the table.
start_col: The starting column of the table.
max_row: Maximum row boundary from true data bounds.
Returns:
The row index representing the bottom boundary of the table.
"""
max_row: int = start_row
table_max_row: int = start_row
for ri, (cell,) in enumerate(
sheet.iter_rows(
min_row=start_row + 2,
max_row=sheet.max_row,
max_row=max_row,
min_col=start_col + 1,
max_col=start_col + 1,
values_only=False,
@@ -431,16 +518,16 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
if cell.value is None and not merged_range:
break # Stop if the cell is empty and not merged
# Expand max_row to include the merged range if applicable
# Expand table_max_row to include the merged range if applicable
if merged_range:
max_row = max(max_row, merged_range.max_row - 1)
table_max_row = max(table_max_row, merged_range.max_row - 1)
else:
max_row = ri
table_max_row = ri
return max_row
return table_max_row
def _find_table_right(
self, sheet: Worksheet, start_row: int, start_col: int
self, sheet: Worksheet, start_row: int, start_col: int, max_col: int
) -> int:
"""Find the right boundary of a table.
@@ -448,18 +535,19 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
sheet: The Excel worksheet to be parsed.
start_row: The starting row of the table.
start_col: The starting column of the table.
max_col: The actual max column of the table.
Returns:
The column index representing the right boundary of the table."
"""
max_col: int = start_col
table_max_col: int = start_col
for rj, (cell,) in enumerate(
sheet.iter_cols(
min_row=start_row + 1,
max_row=start_row + 1,
min_col=start_col + 2,
max_col=sheet.max_column,
max_col=max_col,
values_only=False,
),
start_col + 1,
@@ -473,13 +561,13 @@ class MsExcelDocumentBackend(DeclarativeDocumentBackend, PaginatedDocumentBacken
if cell.value is None and not merged_range:
break # Stop if the cell is empty and not merged
# Expand max_col to include the merged range if applicable
# Expand table_max_col to include the merged range if applicable
if merged_range:
max_col = max(max_col, merged_range.max_col - 1)
table_max_col = max(table_max_col, merged_range.max_col - 1)
else:
max_col = rj
table_max_col = rj
return max_col
return table_max_col
def _find_images_in_sheet(
self, doc: DoclingDocument, sheet: Worksheet

View File

@@ -0,0 +1,11 @@
item-0 at level 0: unspecified: group _root_
item-1 at level 1: section: group sheet: Sheet1
item-2 at level 2: table with [7x3]
item-3 at level 1: section: group sheet: Sheet2
item-4 at level 2: table with [9x4]
item-5 at level 2: table with [5x3]
item-6 at level 2: table with [5x3]
item-7 at level 1: section: group sheet: Sheet3
item-8 at level 2: table with [7x3]
item-9 at level 2: table with [7x3]
item-10 at level 2: picture

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,53 @@
| first | second | third |
|----------|-----------|---------|
| 1 | 5 | 9 |
| 2 | 4 | 6 |
| 3 | 3 | 3 |
| 4 | 2 | 0 |
| 5 | 1 | -3 |
| 6 | 0 | -6 |
| col-1 | col-2 | col-3 | col-4 |
|---------|---------|---------|---------|
| 1 | 2 | 3 | 4 |
| 2 | 4 | 6 | 8 |
| 3 | 6 | 9 | 12 |
| 4 | 8 | 12 | 16 |
| 5 | 10 | 15 | 20 |
| 6 | 12 | 18 | 24 |
| 7 | 14 | 21 | 28 |
| 8 | 16 | 24 | 32 |
| col-1 | col-2 | col-3 |
|---------|---------|---------|
| 1 | 2 | 3 |
| 2 | 4 | 6 |
| 3 | 6 | 9 |
| 4 | 8 | 12 |
| col-1 | col-2 | col-3 |
|---------|---------|---------|
| 1 | 2 | 3 |
| 2 | 4 | 6 |
| 3 | 6 | 9 |
| 4 | 8 | 12 |
| first | header | header |
|----------|----------|----------|
| first | second | third |
| 1 | 2 | 3 |
| 3 | 4 | 5 |
| 3 | 6 | 7 |
| 8 | 9 | 9 |
| 10 | 9 | 9 |
| first (f) | header (f) | header (f) |
|-------------|--------------|--------------|
| first (f) | second | third |
| 1 | 2 | 3 |
| 3 | 4 | 5 |
| 3 | 6 | 7 |
| 8 | 9 | 9 |
| 10 | 9 | 9 |
<!-- image -->

BIN
tests/data/xlsx/xlsx_04_inflated.xlsx vendored Normal file

Binary file not shown.

View File

@@ -2,6 +2,7 @@ import logging
from pathlib import Path
import pytest
from openpyxl import load_workbook
from docling.backend.msexcel_backend import MsExcelDocumentBackend
from docling.datamodel.base_models import InputFormat
@@ -113,3 +114,67 @@ def test_chartsheet(documents) -> None:
assert doc.groups[1].name == "sheet: Duck Chart"
assert doc.pages[2].size.height == 0
assert doc.pages[2].size.width == 0
def test_inflated_rows_handling(documents) -> None:
"""Test that files with inflated max_row are handled correctly.
xlsx_04_inflated.xlsx has inflated max_row (1,048,496) but only 7 rows of actual data.
This test verifies that our backend correctly identifies true data bounds.
"""
# First, verify the file has inflated max_row using openpyxl directly
path = next(item for item in get_excel_paths() if item.stem == "xlsx_04_inflated")
wb = load_workbook(path)
ws = wb.active
reported_max_row = ws.max_row
# Assert that openpyxl reports inflated max_row
assert reported_max_row > 100000, (
f"xlsx_04_inflated.xlsx should have inflated max_row (expected >100k, got {reported_max_row:,}). "
f"This test file is designed to verify proper handling of Excel files with inflated row counts."
)
_log.info(
f"xlsx_04_inflated.xlsx - Openpyxl reported max_row: {reported_max_row:,}"
)
# Now test that our backend handles it correctly
in_doc = InputDocument(
path_or_stream=path,
format=InputFormat.XLSX,
filename=path.stem,
backend=MsExcelDocumentBackend,
)
backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=path)
# Verify backend detects correct number of pages (should be 4, like test-01)
page_count = backend.page_count()
assert page_count == 4, (
f"Backend should detect 4 pages (same as test-01), got {page_count}"
)
# Verify converted document has correct pages
doc = next(item for path, item in documents if path.stem == "xlsx_04_inflated")
assert len(doc.pages) == 4, f"Document should have 4 pages, got {len(doc.pages)}"
# Verify page sizes match expected dimensions (same as test-01)
# These should reflect actual data, not inflated row counts
assert doc.pages.get(1).size.as_tuple() == (3.0, 7.0), (
f"Page 1 should be 3x7 cells, got {doc.pages.get(1).size.as_tuple()}"
)
assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0), (
f"Page 2 should be 9x18 cells, got {doc.pages.get(2).size.as_tuple()}"
)
assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0), (
f"Page 3 should be 13x36 cells, got {doc.pages.get(3).size.as_tuple()}"
)
assert doc.pages.get(4).size.as_tuple() == (0.0, 0.0), (
f"Page 4 should be 0x0 cells (empty), got {doc.pages.get(4).size.as_tuple()}"
)
_log.info(
f"✓ Successfully handled inflated max_row: "
f"reported {reported_max_row:,} rows, "
f"correctly processed as {page_count} pages with proper dimensions"
)