Skip to content

Commit e1a9b78

Browse files
authored
BUG: read_excel with openpyxl and missing dimension (#39486)
1 parent b8e0a5c commit e1a9b78

File tree

7 files changed

+63
-7
lines changed

7 files changed

+63
-7
lines changed

doc/source/whatsnew/v1.2.2.rst

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ Bug fixes
3131
~~~~~~~~~
3232

3333
- :func:`pandas.read_excel` error message when a specified ``sheetname`` does not exist is now uniform across engines (:issue:`39250`)
34+
- Fixed bug in :func:`pandas.read_excel` producing incorrect results when the engine ``openpyxl`` is used and the Excel file has missing or incorrect dimension information; the fix requires ``openpyxl`` >= 3.0.0, and prior versions may still fail (:issue:`38956`, :issue:`39001`)
3435
-
3536

3637
.. ---------------------------------------------------------------------------

pandas/compat/_optional.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
"matplotlib": "2.2.3",
1818
"numexpr": "2.6.8",
1919
"odfpy": "1.3.0",
20-
"openpyxl": "2.5.7",
20+
"openpyxl": "2.6.0",
2121
"pandas_gbq": "0.12.0",
2222
"pyarrow": "0.15.0",
2323
"pytest": "5.0.1",

pandas/io/excel/_openpyxl.py

+28-6
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
from __future__ import annotations
22

3+
from distutils.version import LooseVersion
34
from typing import TYPE_CHECKING, Dict, List, Optional
45

56
import numpy as np
67

78
from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions
8-
from pandas.compat._optional import import_optional_dependency
9+
from pandas.compat._optional import get_version, import_optional_dependency
910

1011
from pandas.io.excel._base import BaseExcelReader, ExcelWriter
1112
from pandas.io.excel._util import validate_freeze_panes
@@ -505,14 +506,14 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
505506

506507
from openpyxl.cell.cell import TYPE_BOOL, TYPE_ERROR, TYPE_NUMERIC
507508

508-
if cell.is_date:
509+
if cell.value is None:
510+
return "" # compat with xlrd
511+
elif cell.is_date:
509512
return cell.value
510513
elif cell.data_type == TYPE_ERROR:
511514
return np.nan
512515
elif cell.data_type == TYPE_BOOL:
513516
return bool(cell.value)
514-
elif cell.value is None:
515-
return "" # compat with xlrd
516517
elif cell.data_type == TYPE_NUMERIC:
517518
# GH5394
518519
if convert_float:
@@ -525,8 +526,29 @@ def _convert_cell(self, cell, convert_float: bool) -> Scalar:
525526
return cell.value
526527

527528
def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:
529+
# GH 39001
530+
# Reading of excel file depends on dimension data being correct but
531+
# writers sometimes omit or get it wrong
532+
import openpyxl
533+
534+
version = LooseVersion(get_version(openpyxl))
535+
536+
if version >= "3.0.0":
537+
sheet.reset_dimensions()
538+
528539
data: List[List[Scalar]] = []
529-
for row in sheet.rows:
530-
data.append([self._convert_cell(cell, convert_float) for cell in row])
540+
for row_number, row in enumerate(sheet.rows):
541+
converted_row = [self._convert_cell(cell, convert_float) for cell in row]
542+
data.append(converted_row)
543+
544+
if version >= "3.0.0" and len(data) > 0:
545+
# With dimension reset, openpyxl no longer pads rows
546+
max_width = max(len(data_row) for data_row in data)
547+
if min(len(data_row) for data_row in data) < max_width:
548+
empty_cell: List[Scalar] = [""]
549+
data = [
550+
data_row + (max_width - len(data_row)) * empty_cell
551+
for data_row in data
552+
]
531553

532554
return data
4.78 KB
Binary file not shown.
Binary file not shown.
4.78 KB
Binary file not shown.

pandas/tests/io/excel/test_openpyxl.py

+33
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
1+
from distutils.version import LooseVersion
2+
13
import numpy as np
24
import pytest
35

6+
from pandas.compat._optional import get_version
7+
48
import pandas as pd
59
from pandas import DataFrame
610
import pandas._testing as tm
@@ -116,3 +120,32 @@ def test_to_excel_with_openpyxl_engine(ext):
116120
).highlight_max()
117121

118122
styled.to_excel(filename, engine="openpyxl")
123+
124+
125+
@pytest.mark.parametrize(
126+
"header, expected_data",
127+
[
128+
(
129+
0,
130+
{
131+
"Title": [np.nan, "A", 1, 2, 3],
132+
"Unnamed: 1": [np.nan, "B", 4, 5, 6],
133+
"Unnamed: 2": [np.nan, "C", 7, 8, 9],
134+
},
135+
),
136+
(2, {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}),
137+
],
138+
)
139+
@pytest.mark.parametrize(
140+
"filename", ["dimension_missing", "dimension_small", "dimension_large"]
141+
)
142+
@pytest.mark.xfail(
143+
LooseVersion(get_version(openpyxl)) < "3.0.0",
144+
reason="openpyxl read-only sheet is incorrect when dimension data is wrong",
145+
)
146+
def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename):
147+
# GH 38956, 39001 - no/incorrect dimension information
148+
path = datapath("io", "data", "excel", f"{filename}{ext}")
149+
result = pd.read_excel(path, header=header)
150+
expected = DataFrame(expected_data)
151+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)