fix: xlsx cell parsing, now returning values instead of formulas (#2520)

* fix: xlsx doc parsing, now returning values instead of formulas

  Signed-off-by: glypt <8trash-can8@protonmail.ch>

* fix: add test for better coverage of xlsx backend

  Signed-off-by: glypt <8trash-can8@protonmail.ch>

* fix: add the total of ducks as a formula in the tests/data

  This also adds the test that the value 310 is contained in the table.
  Without the fix from the previous commit, it would return "B7+C7".

  Signed-off-by: glypt <8trash-can8@protonmail.ch>

---------

Signed-off-by: glypt <8trash-can8@protonmail.ch>
2025-12-08 20:58:11 +00:00 · 2025-10-29 11:35:51 +01:00
parent b6c892b505
commit d9c90eb45e
6 changed files with 291 additions and 15 deletions
--- a/tests/test_backend_msexcel.py
+++ b/tests/test_backend_msexcel.py
@@ -1,4 +1,5 @@
 import logging
+from io import BytesIO
 from pathlib import Path

 import pytest
@@ -116,6 +117,52 @@ def test_chartsheet(documents) -> None:
    assert doc.pages[2].size.width == 0


+def test_chartsheet_data_values(documents) -> None:
+    """Check that xlsx_03_chartsheet yields computed values, not formula text.
+
+    The workbook lists duck observations; the 2024 row carries a total that is
+    stored as a formula and evaluates to 310. Both "2024" and "310" must show
+    up as cell text somewhere in the converted tables. Without the
+    values-instead-of-formulas fix, the total would come back as "B7+C7".
+
+    Args:
+        documents: Pairs of (source path, converted document).
+    """
+    doc = next(item for path, item in documents if path.stem == "xlsx_03_chartsheet")
+
+    tables = list(doc.tables)
+    assert len(tables) > 0, "Should have at least one table"
+
+    # Flatten once so both lookups below scan the same (table, cell) pairs
+    # in document order.
+    cells = [
+        (table, cell)
+        for table in tables
+        for cell in table.data.table_cells
+    ]
+
+    # First cell whose text is the year 2024, together with its owning table.
+    # The (None, None) default keeps a miss an assertion failure with a clear
+    # message instead of a bare StopIteration.
+    table_with_2024, row_index_of_2024 = next(
+        (
+            (table, cell.start_row_offset_idx)
+            for table, cell in cells
+            if cell.text == "2024"
+        ),
+        (None, None),
+    )
+
+    assert table_with_2024 is not None, "Should find a table containing year 2024"
+    assert row_index_of_2024 is not None, "Should find row index for 2024"
+
+    # The total may land in a different table than the year because of how the
+    # parser splits tables, so search every table rather than just this one.
+    found_310 = any(cell.text == "310" for _, cell in cells)
+
+    assert found_310, "Should find the value 310 (total ducks for 2024) in the document"
+
+
 def test_inflated_rows_handling(documents) -> None:
    """Test that files with inflated max_row are handled correctly.

@@ -178,3 +225,46 @@ def test_inflated_rows_handling(documents) -> None:
        f"reported {reported_max_row:,} rows, "
        f"correctly processed as {page_count} pages with proper dimensions"
    )
+
+
+def test_bytesio_stream():
+    """Test that Excel files can be loaded from BytesIO streams.
+
+    Runs the backend end to end on an in-memory copy of xlsx_01, covering the
+    stream branch of workbook loading where data_only=True should also apply.
+    """
+    path = next(item for item in get_excel_paths() if item.stem == "xlsx_01")
+
+    # Load the file into a BytesIO stream. read_bytes() opens and closes the
+    # file itself; path.open("rb").read() left the handle for the GC to close.
+    buf = BytesIO(path.read_bytes())
+
+    # Create an InputDocument with the BytesIO stream
+    in_doc = InputDocument(
+        path_or_stream=buf,
+        format=InputFormat.XLSX,
+        filename=path.stem,
+        backend=MsExcelDocumentBackend,
+    )
+
+    # Initialize the backend with the BytesIO stream
+    backend = MsExcelDocumentBackend(in_doc=in_doc, path_or_stream=buf)
+
+    # Verify the backend is valid
+    assert backend.is_valid(), "Backend should be valid when loaded from BytesIO"
+
+    # Verify page count matches expected value
+    assert backend.page_count() == 4, "Should detect 4 pages from BytesIO stream"
+
+    # Convert the document
+    doc = backend.convert()
+
+    # Verify the document was converted successfully
+    assert doc is not None, "Document should be converted from BytesIO stream"
+    assert len(doc.pages) == 4, "Document should have 4 pages"
+
+    # Verify page sizes match expected dimensions
+    assert doc.pages.get(1).size.as_tuple() == (3.0, 7.0)
+    assert doc.pages.get(2).size.as_tuple() == (9.0, 18.0)
+    assert doc.pages.get(3).size.as_tuple() == (13.0, 36.0)
+    assert doc.pages.get(4).size.as_tuple() == (0.0, 0.0)