From b8871839ccfc189eb887d40fa24bf874a14235b2 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 3 Feb 2021 18:30:13 -0500 Subject: [PATCH 1/9] REG: read_excel with engine specified raises on non-path/non-buffer --- doc/source/whatsnew/v1.2.2.rst | 2 +- pandas/io/excel/_base.py | 22 +++++++++++++++------- pandas/tests/io/excel/test_openpyxl.py | 8 ++++++++ pandas/tests/io/excel/test_readers.py | 9 ++++++++- 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.2.2.rst b/doc/source/whatsnew/v1.2.2.rst index 0ee1abaa2a0eb..dc27ceba138c9 100644 --- a/doc/source/whatsnew/v1.2.2.rst +++ b/doc/source/whatsnew/v1.2.2.rst @@ -21,7 +21,7 @@ Fixed regressions - Fixed regression in :meth:`~DataFrame.to_pickle` failing to create bz2/xz compressed pickle files with ``protocol=5`` (:issue:`39002`) - Fixed regression in :func:`pandas.testing.assert_series_equal` and :func:`pandas.testing.assert_frame_equal` always raising ``AssertionError`` when comparing extension dtypes (:issue:`39410`) - Fixed regression in :meth:`~DataFrame.to_csv` opening ``codecs.StreamWriter`` in binary mode instead of in text mode and ignoring user-provided ``mode`` (:issue:`39247`) -- +- Fixed regression in :func:`read_excel` that incorrectly raised when the argument ``io`` was a non-path and non-buffer and the ``engine`` argument was specified (:issue:`39528`) .. --------------------------------------------------------------------------- diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 213be7c05b370..1d77cc12caf8e 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1069,26 +1069,34 @@ def __init__( xlrd_version = LooseVersion(get_version(xlrd)) - if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): - ext = "xls" - else: + ext = None + if engine is None: + # Only determine ext if it is needed ext = inspect_excel_format( content_or_path=path_or_buffer, storage_options=storage_options ) - if engine is None: # ext will always be valid, otherwise inspect_excel_format would raise engine = config.get_option(f"io.excel.{ext}.reader", silent=True) if engine == "auto": engine = get_default_engine(ext, mode="reader") - if engine == "xlrd" and ext != "xls" and xlrd_version is not None: - if xlrd_version >= "2": + if engine == "xlrd" and xlrd_version is not None: + if ext is None: + # Need ext to determine ext in order to raise/warn + if isinstance(path_or_buffer, xlrd.Book): + ext = "xls" + else: + ext = inspect_excel_format( + path_or_buffer, storage_options=storage_options + ) + + if ext != "xls" and xlrd_version >= "2": raise ValueError( f"Your version of xlrd is {xlrd_version}. In xlrd >= 2.0, " f"only the xls format is supported. Install openpyxl instead." ) - else: + elif ext != "xls": caller = inspect.stack()[1] if ( caller.filename.endswith( diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 3155e22d3ff5d..7d24de2a17529 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -116,3 +116,11 @@ def test_to_excel_with_openpyxl_engine(ext): ).highlight_max() styled.to_excel(filename, engine="openpyxl") + + +def test_read_workbook(datapath, ext): + filename = datapath("io", "data", "excel", "test1" + ext) + wb = openpyxl.load_workbook(filename) + result = pd.read_excel(wb, engine="openpyxl") + expected = pd.read_excel(filename) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index b2e87de5580e6..a594718bd62d9 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -2,6 +2,7 @@ from functools import partial import os from urllib.error import URLError +from zipfile import BadZipFile import numpy as np import pytest @@ -685,7 +686,13 @@ def test_missing_file_raises(self, read_ext): def test_corrupt_bytes_raises(self, read_ext, engine): bad_stream = b"foo" - with pytest.raises(ValueError, match="File is not a recognized excel file"): + if engine is None or engine == "xlrd": + error = ValueError + msg = "File is not a recognized excel file" + else: + error = BadZipFile + msg = "File is not a zip file" + with pytest.raises(error, match=msg): pd.read_excel(bad_stream) @tm.network From 601ad87cf02d2ddc68a026fd01ed87ceb24505c3 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 3 Feb 2021 18:37:51 -0500 Subject: [PATCH 2/9] Restore special-casing for xlrd.Book even when engine is None --- pandas/io/excel/_base.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 1d77cc12caf8e..84b5cae09acce 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1072,9 +1072,12 @@ def __init__( ext = None if engine is None: # Only determine ext if it is needed - ext = inspect_excel_format( - content_or_path=path_or_buffer, storage_options=storage_options - ) + if xlrd_version is not None and isinstance(path_or_buffer, xlrd.Book): + ext = "xls" + else: + ext = inspect_excel_format( + content_or_path=path_or_buffer, storage_options=storage_options + ) # ext will always be valid, otherwise inspect_excel_format would raise engine = config.get_option(f"io.excel.{ext}.reader", silent=True) From d835dffdc779d90f6857432c3cd6f36ba4bf4b90 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Wed, 3 Feb 2021 18:44:20 -0500 Subject: [PATCH 3/9] GH # in test --- pandas/tests/io/excel/test_openpyxl.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 7d24de2a17529..bc2aca862546b 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -119,6 +119,7 @@ def test_to_excel_with_openpyxl_engine(ext): def test_read_workbook(datapath, ext): + # GH 39528 filename = datapath("io", "data", "excel", "test1" + ext) wb = openpyxl.load_workbook(filename) result = pd.read_excel(wb, engine="openpyxl") From e6684e9a0f0f3d953ac05de3f7d68ddc327902d4 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 4 Feb 2021 18:19:24 -0500 Subject: [PATCH 4/9] Added wb.close() to test --- pandas/tests/io/excel/test_openpyxl.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index bc2aca862546b..d426811c43efa 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -123,5 +123,6 @@ def test_read_workbook(datapath, ext): filename = datapath("io", "data", "excel", "test1" + ext) wb = openpyxl.load_workbook(filename) result = pd.read_excel(wb, engine="openpyxl") + wb.close() expected = pd.read_excel(filename) tm.assert_frame_equal(result, expected) From ba2bc75a36d594ffec889e819e120a8e2aa70ea2 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 5 Feb 2021 16:44:55 -0500 Subject: [PATCH 5/9] Added logic/tests for determining if a sheet is read-only --- pandas/io/excel/_openpyxl.py | 6 ++-- pandas/tests/io/excel/test_openpyxl.py | 39 ++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 4 deletions(-) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 64c64b5009b0c..6bc742f6d9577 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -533,7 +533,9 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: version = LooseVersion(get_version(openpyxl)) - if version >= "3.0.0": + is_readonly = hasattr(sheet, "reset_dimensions") + + if version >= "3.0.0" and is_readonly: sheet.reset_dimensions() data: List[List[Scalar]] = [] @@ -541,7 +543,7 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: converted_row = [self._convert_cell(cell, convert_float) for cell in row] data.append(converted_row) - if version >= "3.0.0" and len(data) > 0: + if version >= "3.0.0" and is_readonly and len(data) > 0: # With dimension reset, openpyxl no longer pads rows max_width = max(len(data_row) for data_row in data) if min(len(data_row) for data_row in data) < max_width: diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 830da757d23e8..3bc502e8b33c6 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -151,10 +151,45 @@ def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename) tm.assert_frame_equal(result, expected) -def test_read_workbook(datapath, ext): +@pytest.mark.parametrize( + "header, expected_data", + [ + ( + 0, + { + "Title": [np.nan, "A", 1, 2, 3], + "Unnamed: 1": [np.nan, "B", 4, 5, 6], + "Unnamed: 2": [np.nan, "C", 7, 8, 9], + }, + ), + (2, {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}), + ], +) +@pytest.mark.parametrize( + "filename", ["dimension_missing", "dimension_small", "dimension_large"] +) +@pytest.mark.parametrize("read_only", [True, False]) +@pytest.mark.xfail( + LooseVersion(get_version(openpyxl)) < "3.0.0", + reason="openpyxl read-only sheet is incorrect when dimension data is wrong", +) +def test_read_wb_with_bad_dimension( + datapath, ext, filename, header, expected_data, read_only +): + # GH 38956, 39001 - no/incorrect dimension information + path = datapath("io", "data", "excel", f"{filename}{ext}") + wb = openpyxl.load_workbook(path, read_only=read_only) + result = pd.read_excel(wb, engine="openpyxl", header=header) + wb.close() + expected = DataFrame(expected_data) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("read_only", [True, False]) +def test_read_workbook(datapath, ext, read_only): # GH 39528 filename = datapath("io", "data", "excel", "test1" + ext) - wb = openpyxl.load_workbook(filename) + wb = openpyxl.load_workbook(filename, read_only=read_only) result = pd.read_excel(wb, engine="openpyxl") wb.close() expected = pd.read_excel(filename) From 1381eccfa8e8ca19fde8dd0c483c33ba7a43fd32 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 5 Feb 2021 16:51:19 -0500 Subject: [PATCH 6/9] Added comment --- pandas/io/excel/_openpyxl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 6bc742f6d9577..274b18b2605cc 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -533,6 +533,8 @@ def get_sheet_data(self, sheet, convert_float: bool) -> List[List[Scalar]]: version = LooseVersion(get_version(openpyxl)) + # There is no good way of determining if a sheet is read-only + # https://foss.heptapod.net/openpyxl/openpyxl/-/issues/1605 is_readonly = hasattr(sheet, "reset_dimensions") if version >= "3.0.0" and is_readonly: From a3db3eb54b4439edbbd4ac49880649116d3e9288 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 5 Feb 2021 17:20:43 -0500 Subject: [PATCH 7/9] Combine and reorg tests --- pandas/tests/io/excel/test_openpyxl.py | 61 ++++++++------------------ 1 file changed, 18 insertions(+), 43 deletions(-) diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 3bc502e8b33c6..2f901bde0ba09 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -122,32 +122,14 @@ def test_to_excel_with_openpyxl_engine(ext): styled.to_excel(filename, engine="openpyxl") -@pytest.mark.parametrize( - "header, expected_data", - [ - ( - 0, - { - "Title": [np.nan, "A", 1, 2, 3], - "Unnamed: 1": [np.nan, "B", 4, 5, 6], - "Unnamed: 2": [np.nan, "C", 7, 8, 9], - }, - ), - (2, {"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]}), - ], -) -@pytest.mark.parametrize( - "filename", ["dimension_missing", "dimension_small", "dimension_large"] -) -@pytest.mark.xfail( - LooseVersion(get_version(openpyxl)) < "3.0.0", - reason="openpyxl read-only sheet is incorrect when dimension data is wrong", -) -def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename): - # GH 38956, 39001 - no/incorrect dimension information - path = datapath("io", "data", "excel", f"{filename}{ext}") - result = pd.read_excel(path, header=header) - expected = DataFrame(expected_data) +@pytest.mark.parametrize("read_only", [True, False]) +def test_read_workbook(datapath, ext, read_only): + # GH 39528 + filename = datapath("io", "data", "excel", "test1" + ext) + wb = openpyxl.load_workbook(filename, read_only=read_only) + result = pd.read_excel(wb, engine="openpyxl") + wb.close() + expected = pd.read_excel(filename) tm.assert_frame_equal(result, expected) @@ -168,29 +150,22 @@ def test_read_with_bad_dimension(datapath, ext, header, expected_data, filename) @pytest.mark.parametrize( "filename", ["dimension_missing", "dimension_small", "dimension_large"] ) -@pytest.mark.parametrize("read_only", [True, False]) +# When read_only is None, use read_excel instead of a workbook +@pytest.mark.parametrize("read_only", [True, False, None]) @pytest.mark.xfail( LooseVersion(get_version(openpyxl)) < "3.0.0", reason="openpyxl read-only sheet is incorrect when dimension data is wrong", ) -def test_read_wb_with_bad_dimension( - datapath, ext, filename, header, expected_data, read_only +def test_read_with_bad_dimension( + datapath, ext, header, expected_data, filename, read_only ): # GH 38956, 39001 - no/incorrect dimension information path = datapath("io", "data", "excel", f"{filename}{ext}") - wb = openpyxl.load_workbook(path, read_only=read_only) - result = pd.read_excel(wb, engine="openpyxl", header=header) - wb.close() + if read_only is None: + result = pd.read_excel(path, header=header) + else: + wb = openpyxl.load_workbook(path, read_only=read_only) + result = pd.read_excel(wb, engine="openpyxl", header=header) + wb.close() expected = DataFrame(expected_data) tm.assert_frame_equal(result, expected) - - -@pytest.mark.parametrize("read_only", [True, False]) -def test_read_workbook(datapath, ext, read_only): - # GH 39528 - filename = datapath("io", "data", "excel", "test1" + ext) - wb = openpyxl.load_workbook(filename, read_only=read_only) - result = pd.read_excel(wb, engine="openpyxl") - wb.close() - expected = pd.read_excel(filename) - tm.assert_frame_equal(result, expected) From 33acfaa19077248e04a0ffa0be5ab82aa3b5ee36 Mon Sep 17 00:00:00 2001 From: rhshadrach Date: Fri, 5 Feb 2021 19:59:10 -0500 Subject: [PATCH 8/9] Only mark as xfail for read-only cases --- pandas/tests/io/excel/test_openpyxl.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index 2f901bde0ba09..e6c50a174b088 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -152,14 +152,16 @@ def test_read_workbook(datapath, ext, read_only): ) # When read_only is None, use read_excel instead of a workbook @pytest.mark.parametrize("read_only", [True, False, None]) -@pytest.mark.xfail( - LooseVersion(get_version(openpyxl)) < "3.0.0", - reason="openpyxl read-only sheet is incorrect when dimension data is wrong", -) def test_read_with_bad_dimension( - datapath, ext, header, expected_data, filename, read_only + datapath, ext, header, expected_data, filename, read_only, request ): # GH 38956, 39001 - no/incorrect dimension information + version = LooseVersion(get_version(openpyxl)) + if (read_only or read_only is None) and version < "3.0.0": + msg = "openpyxl read-only sheet is incorrect when dimension data is wrong" + request.node.add_marker( + pytest.mark.xfail(LooseVersion(get_version(openpyxl)) < "3.0.0", reason=msg) + ) path = datapath("io", "data", "excel", f"{filename}{ext}") if read_only is None: result = pd.read_excel(path, header=header) From 224d2a18aa87d6551b9f4a9201a713d1f77eafcc Mon Sep 17 00:00:00 2001 From: rhshadrach Date: Fri, 5 Feb 2021 20:01:36 -0500 Subject: [PATCH 9/9] fixup --- pandas/tests/io/excel/test_openpyxl.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/io/excel/test_openpyxl.py b/pandas/tests/io/excel/test_openpyxl.py index e6c50a174b088..da12829b579fe 100644 --- a/pandas/tests/io/excel/test_openpyxl.py +++ b/pandas/tests/io/excel/test_openpyxl.py @@ -159,9 +159,7 @@ def test_read_with_bad_dimension( version = LooseVersion(get_version(openpyxl)) if (read_only or read_only is None) and version < "3.0.0": msg = "openpyxl read-only sheet is incorrect when dimension data is wrong" - request.node.add_marker( - pytest.mark.xfail(LooseVersion(get_version(openpyxl)) < "3.0.0", reason=msg) - ) + request.node.add_marker(pytest.mark.xfail(reason=msg)) path = datapath("io", "data", "excel", f"{filename}{ext}") if read_only is None: result = pd.read_excel(path, header=header)