Skip to content

REG: read_excel with engine specified raises on non-path/non-buffer #39586

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Feb 7, 2021
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.2.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Fixed regressions
- Fixed regression in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` always raising ``AssertionError`` when comparing extension dtypes (:issue:`39410`)
- Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`)
- Fixed regression in :meth:`core.window.rolling.Rolling.count` where the ``min_periods`` argument would be set to ``0`` after the operation (:issue:`39554`)
- Fixed regression in :func:`read_excel` that incorrectly raised when the argument ``io`` was a non-path and non-buffer and the ``engine`` argument was specified (:issue:`39528`)
-

.. ---------------------------------------------------------------------------
Expand Down
31 changes: 21 additions & 10 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1069,26 +1069,37 @@ def __init__(

xlrd_version = LooseVersion(get_version(xlrd))

if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
ext = "xls"
else:
ext = inspect_excel_format(
content_or_path=path_or_buffer, storage_options=storage_options
)

ext = None
if engine is None:
# Only determine ext if it is needed
if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book):
ext = "xls"
else:
ext = inspect_excel_format(
content_or_path=path_or_buffer, storage_options=storage_options
)

# ext will always be valid, otherwise inspect_excel_format would raise
engine = config.get_option(f"io.excel.{ext}.reader", silent=True)
if engine == "auto":
engine = get_default_engine(ext, mode="reader")

if engine == "xlrd" and ext != "xls" and xlrd_version is not None:
if xlrd_version >= "2":
if engine == "xlrd" and xlrd_version is not None:
if ext is None:
# Need ext to determine ext in order to raise/warn
if isinstance(path_or_buffer, xlrd.Book):
ext = "xls"
else:
ext = inspect_excel_format(
path_or_buffer, storage_options=storage_options
)

if ext != "xls" and xlrd_version >= "2":
raise ValueError(
f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, "
f"only the xls format is supported. Install openpyxl instead."
)
else:
elif ext != "xls":
caller = inspect.stack()[1]
if (
caller.filename.endswith(
Expand Down
8 changes: 6 additions & 2 deletions pandas/io/excel/_openpyxl.py
Original file line number Diff line number Diff line change
Expand Up @@ -533,15 +533,19 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]:

version = LooseVersion(get_version(openpyxl))

if version >= "3.0.0":
# There is no good way of determining if a sheet is read-only
# https://foss.heptapod.net/openpyxl/openpyxl/-/issues/1605
is_readonly = hasattr(sheet, "reset_dimensions")

if version >= "3.0.0" and is_readonly:
sheet.reset_dimensions()

data: List[List[Scalar]] = []
for row_number, row in enumerate(sheet.rows):
converted_row = [self._convert_cell(cell, convert_float) for cell in row]
data.append(converted_row)

if version >= "3.0.0" and len(data) > 0:
if version >= "3.0.0" and is_readonly and len(data) > 0:
# With dimension reset, openpyxl no longer pads rows
max_width = max(len(data_row) for data_row in data)
if min(len(data_row) for data_row in data) < max_width:
Expand Down
32 changes: 26 additions & 6 deletions pandas/tests/io/excel/test_openpyxl.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,17 @@ def test_to_excel_with_openpyxl_engine(ext):
styled.to_excel(filename, engine="openpyxl")


@pytest.mark.parametrize("read_only", [True, False])
def test_read_workbook(datapath, ext, read_only):
# GH 39528
filename = datapath("io", "data", "excel", "test1" + ext)
wb = openpyxl.load_workbook(filename, read_only=read_only)
result = pd.read_excel(wb, engine="openpyxl")
wb.close()
expected = pd.read_excel(filename)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It cannot hurt to call wb.close() before tm.assert_frame_equal

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @twoertwein, added.

tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"header, expected_data",
[
Expand All @@ -139,13 +150,22 @@ def test_to_excel_with_openpyxl_engine(ext):
@pytest.mark.parametrize(
"filename", ["dimension_missing", "dimension_small", "dimension_large"]
)
@pytest.mark.xfail(
LooseVersion(get_version(openpyxl)) < "3.0.0",
reason="openpyxl read-only sheet is incorrect when dimension data is wrong",
)
def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename):
# When read_only is None, use read_excel instead of a workbook
@pytest.mark.parametrize("read_only", [True, False, None])
def test_read_with_bad_dimension(
datapath, ext, header, expected_data, filename, read_only, request
):
# GH 38956, 39001 - no/incorrect dimension information
version = LooseVersion(get_version(openpyxl))
if (read_only or read_only is None) and version < "3.0.0":
msg = "openpyxl read-only sheet is incorrect when dimension data is wrong"
request.node.add_marker(pytest.mark.xfail(reason=msg))
path = datapath("io", "data", "excel", f"{filename}{ext}")
result = pd.read_excel(path, header=header)
if read_only is None:
result = pd.read_excel(path, header=header)
else:
wb = openpyxl.load_workbook(path, read_only=read_only)
result = pd.read_excel(wb, engine="openpyxl", header=header)
wb.close()
expected = DataFrame(expected_data)
tm.assert_frame_equal(result, expected)
9 changes: 8 additions & 1 deletion pandas/tests/io/excel/test_readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from functools import partial
import os
from urllib.error import URLError
from zipfile import BadZipFile

import numpy as np
import pytest
Expand Down Expand Up @@ -685,7 +686,13 @@ def test_missing_file_raises(self, read_ext):

def test_corrupt_bytes_raises(self, read_ext, engine):
bad_stream = b"foo"
with pytest.raises(ValueError, match="File is not a recognized excel file"):
if engine is None or engine == "xlrd":
error = ValueError
msg = "File is not a recognized excel file"
else:
error = BadZipFile
msg = "File is not a zip file"
with pytest.raises(error, match=msg):
pd.read_excel(bad_stream)

@tm.network
Expand Down