From a2ce319c7c1f37d5626786eb9adeac2db1523edc Mon Sep 17 00:00:00 2001 From: Chris Withers Date: Wed, 16 Dec 2020 13:02:07 +0000 Subject: [PATCH 1/8] Re-work engine guessing in ExcelFile / read_excel. This uses more reliable content introspection that results in better engine selection. It also removes situations where xlrd gets handed a file it no longer supports. --- pandas/io/excel/_base.py | 149 +++++++++++------- pandas/tests/io/excel/test_readers.py | 4 +- pandas/tests/io/excel/test_readers_special.py | 20 +++ 3 files changed, 116 insertions(+), 57 deletions(-) create mode 100644 pandas/tests/io/excel/test_readers_special.py diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index bf1011176693f..05887b6aa8acb 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -3,9 +3,11 @@ import inspect from io import BufferedIOBase, BytesIO, RawIOBase import os +from pathlib import Path from textwrap import fill -from typing import Any, Dict, Mapping, Union, cast +from typing import Any, Dict, Mapping, Union, cast, BinaryIO import warnings +from zipfile import ZipFile from pandas._config import config @@ -888,32 +890,73 @@ def close(self): return content -def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool: +def _peek(stream: Union[BufferedIOBase, RawIOBase, BinaryIO], size: int = 20) -> bytes: """ - Check if the stream is an OpenDocument Spreadsheet (.ods) file - - It uses magic values inside the stream + Return the specified number of bytes from the start of the stream + and seek back to the start of the stream afterwards. Parameters ---------- stream : Union[BufferedIOBase, RawIOBase] - IO stream with data which might be an ODS file Returns ------- - is_ods : bool - Boolean indication that this is indeed an ODS file or not + content : bytes + The bytes founds. """ stream.seek(0) - is_ods = False - if stream.read(4) == b"PK\003\004": - stream.seek(30) - is_ods = ( - stream.read(54) == b"mimetype" - b"application/vnd.oasis.opendocument.spreadsheet" - ) + content = stream.read(size) stream.seek(0) - return is_ods + return content + + +_XLS_SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1" +_ZIP_SIGNATURE = b"PK\x03\x04" +_PEEK_SIZE = max(len(_XLS_SIGNATURE), len(_ZIP_SIGNATURE)) + + +def _engine_from_content(stream: Union[BufferedIOBase, RawIOBase, BinaryIO]) -> str: + """ + Use the content of a stream to try and figure out which engine to use. + + It uses magic values inside the stream. + + Parameters + ---------- + stream : Union[BufferedIOBase, RawIOBase] + IO stream with data which might contain spreadsheet data. + + Returns + ------- + engine : Optional[engine] + The string engine if it can be confidently inferred. + """ + engine = None + peek = _peek(stream, _PEEK_SIZE) + + if peek.startswith(_XLS_SIGNATURE): + engine = "xlrd" + + elif peek.startswith(_ZIP_SIGNATURE): + zf = ZipFile(stream) + + # Workaround for some third party files that use forward slashes and + # lower case names. We map the expected name in lowercase to the + # actual filename in the zip container. + component_names = { + name.replace("\\", "/").lower(): name for name in zf.namelist() + } + + stream.seek(0) + + if "xl/workbook.xml" in component_names: + engine = "openpyxl" + if "xl/workbook.bin" in component_names: + engine = "pyxlsb" + if "content.xml" in component_names: + engine = "odf" + + return engine class ExcelFile: @@ -970,21 +1013,39 @@ class ExcelFile: "pyxlsb": PyxlsbReader, } + _ext_to_engine: Mapping[str, str] = { + ".ods": "odf", + ".xls": "xlrd", + ".xlsx": "openpyxl", + } + def __init__( self, path_or_buffer, engine=None, storage_options: StorageOptions = None ): if engine is None: - # Determine ext and use odf for ods stream/file + + ext = peek = None + + if isinstance(path_or_buffer, bytes): + path_or_buffer = BytesIO(path_or_buffer) + if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): - ext = None - if _is_ods_stream(path_or_buffer): - engine = "odf" - else: + engine = _engine_from_content(path_or_buffer) + peek = _peek(path_or_buffer) + + elif isinstance(path_or_buffer, (str, os.PathLike)): ext = os.path.splitext(str(path_or_buffer))[-1] - if ext == ".ods": - engine = "odf" + handles = get_handle( + stringify_path(path_or_buffer), + "rb", + storage_options=storage_options, + is_text=False, + ) + with handles: + engine = _engine_from_content(handles.handle) + peek = _peek(handles.handle) - if ( + elif ( import_optional_dependency( "xlrd", raise_on_missing=False, on_version="ignore" ) @@ -995,38 +1056,16 @@ def __init__( if isinstance(path_or_buffer, Book): engine = "xlrd" - # GH 35029 - Prefer openpyxl except for xls files + # Couldn't tell for definite, so guess based on extension: if engine is None: - if ext is None or isinstance(path_or_buffer, bytes) or ext == ".xls": - engine = "xlrd" - elif ( - import_optional_dependency( - "openpyxl", raise_on_missing=False, on_version="ignore" - ) - is not None - ): - engine = "openpyxl" - else: - caller = inspect.stack()[1] - if ( - caller.filename.endswith("pandas/io/excel/_base.py") - and caller.function == "read_excel" - ): - stacklevel = 4 - else: - stacklevel = 2 - warnings.warn( - "The xlrd engine is no longer maintained and is not " - "supported when using pandas with python >= 3.9. However, " - "the engine xlrd will continue to be allowed for the " - "indefinite future. Beginning with pandas 1.2.0, the " - "openpyxl engine will be used if it is installed and the " - "engine argument is not specified. Either install openpyxl " - "or specify engine='xlrd' to silence this warning.", - FutureWarning, - stacklevel=stacklevel, - ) - engine = "xlrd" + engine = self._ext_to_engine.get(ext) + + if engine is None: + raise ValueError( + f"Could not find engine for {repr(path_or_buffer)}, content was " + f"{repr(peek)}" + ) + if engine not in self._engines: raise ValueError(f"Unknown engine: {engine}") diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 98a55ae39bd77..00c6a7f6c052e 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -508,10 +508,10 @@ def test_reader_spaces(self, read_ext): def test_read_excel_ods_nested_xml(self, read_ext, basename, expected): # see gh-35802 engine = pd.read_excel.keywords["engine"] - if engine != "odf": + if engine not in ("odf", None): pytest.skip(f"Skipped for engine: {engine}") - actual = pd.read_excel(basename + read_ext) + actual = pd.read_excel(basename + ".ods") tm.assert_frame_equal(actual, expected) def test_reading_all_sheets(self, read_ext): diff --git a/pandas/tests/io/excel/test_readers_special.py b/pandas/tests/io/excel/test_readers_special.py new file mode 100644 index 0000000000000..83ad3dcbc683f --- /dev/null +++ b/pandas/tests/io/excel/test_readers_special.py @@ -0,0 +1,20 @@ +# Tests that don't need or don't work with the autouse fixtures in test_readers.py +import pytest + +import pandas as pd + + +def test_unreadable_bytes(): + with pytest.raises( + ValueError, match=r"Could not find engine for .+, content was b'rubbish'" + ): + pd.read_excel(b"rubbish") + + +def test_unreadable_file(tmp_path): + bad = tmp_path / "bad" + bad.write_bytes(b"rubbish") + with pytest.raises( + ValueError, match=r"Could not find engine for .+, content was b'rubbish'" + ): + pd.read_excel(bad) From 42663683a8a270b4924bd313e6a2837a76d45475 Mon Sep 17 00:00:00 2001 From: Chris Withers Date: Wed, 16 Dec 2020 13:02:26 +0000 Subject: [PATCH 2/8] no reason to skip these in other locales --- pandas/tests/io/excel/test_readers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 00c6a7f6c052e..6011b7bd240bf 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -624,7 +624,6 @@ def test_read_from_http_url(self, read_ext): local_table = pd.read_excel("test1" + read_ext) tm.assert_frame_equal(url_table, local_table) - @td.skip_if_not_us_locale def test_read_from_s3_url(self, read_ext, s3_resource, s3so): # Bucket "pandas-test" created in tests/io/conftest.py with open("test1" + read_ext, "rb") as f: From 998a7783a47c9adfbc1401dad0441e09da3920b6 Mon Sep 17 00:00:00 2001 From: Chris Withers Date: Wed, 16 Dec 2020 13:21:12 +0000 Subject: [PATCH 3/8] cleanup imports --- pandas/io/excel/_base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 05887b6aa8acb..bf4b90bd83fbd 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -1,11 +1,9 @@ import abc import datetime -import inspect from io import BufferedIOBase, BytesIO, RawIOBase import os -from pathlib import Path from textwrap import fill -from typing import Any, Dict, Mapping, Union, cast, BinaryIO +from typing import Any, BinaryIO, Dict, Mapping, Union, cast import warnings from zipfile import ZipFile From 29d1dd08bcb56a57e6bfc48ec85c96ce29f9bac6 Mon Sep 17 00:00:00 2001 From: Chris Withers Date: Wed, 16 Dec 2020 13:45:16 +0000 Subject: [PATCH 4/8] add param missed from docstring --- pandas/io/excel/_base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index bf4b90bd83fbd..f51c33abcf8f6 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -896,6 +896,8 @@ def _peek(stream: Union[BufferedIOBase, RawIOBase, BinaryIO], size: int = 20) -> Parameters ---------- stream : Union[BufferedIOBase, RawIOBase] + size: int + The number of bytes to read. Returns ------- From 163285c1b20196194437a226e960f82fa5d97c9d Mon Sep 17 00:00:00 2001 From: Chris Withers Date: Wed, 16 Dec 2020 13:45:25 +0000 Subject: [PATCH 5/8] rename to _peek_stream --- pandas/io/excel/_base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index f51c33abcf8f6..d513fb6def186 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -888,7 +888,7 @@ def close(self): return content -def _peek(stream: Union[BufferedIOBase, RawIOBase, BinaryIO], size: int = 20) -> bytes: +def _peek_stream(stream: Union[BufferedIOBase, RawIOBase, BinaryIO], size: int = 20) -> bytes: """ Return the specified number of bytes from the start of the stream and seek back to the start of the stream afterwards. @@ -932,7 +932,7 @@ def _engine_from_content(stream: Union[BufferedIOBase, RawIOBase, BinaryIO]) -> The string engine if it can be confidently inferred. """ engine = None - peek = _peek(stream, _PEEK_SIZE) + peek = _peek_stream(stream, _PEEK_SIZE) if peek.startswith(_XLS_SIGNATURE): engine = "xlrd" @@ -1031,7 +1031,7 @@ def __init__( if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): engine = _engine_from_content(path_or_buffer) - peek = _peek(path_or_buffer) + peek = _peek_stream(path_or_buffer) elif isinstance(path_or_buffer, (str, os.PathLike)): ext = os.path.splitext(str(path_or_buffer))[-1] @@ -1043,7 +1043,7 @@ def __init__( ) with handles: engine = _engine_from_content(handles.handle) - peek = _peek(handles.handle) + peek = _peek_stream(handles.handle) elif ( import_optional_dependency( From de99f1af4c2ea494a358ec72ce698ad14a891f6f Mon Sep 17 00:00:00 2001 From: Chris Withers Date: Wed, 16 Dec 2020 13:47:26 +0000 Subject: [PATCH 6/8] line width --- pandas/io/excel/_base.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index d513fb6def186..7123f4cabac47 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -888,7 +888,10 @@ def close(self): return content -def _peek_stream(stream: Union[BufferedIOBase, RawIOBase, BinaryIO], size: int = 20) -> bytes: +def _peek_stream( + stream: Union[BufferedIOBase, RawIOBase, BinaryIO], + size: int = 20 +) -> bytes: """ Return the specified number of bytes from the start of the stream and seek back to the start of the stream afterwards. From 9aea6b95cbe1ad94ef12fabdf3452e6d1cf591c8 Mon Sep 17 00:00:00 2001 From: Chris Withers Date: Wed, 16 Dec 2020 13:55:00 +0000 Subject: [PATCH 7/8] :rageface: --- pandas/io/excel/_base.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 7123f4cabac47..0dbe1ce200e4b 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -889,8 +889,7 @@ def close(self): def _peek_stream( - stream: Union[BufferedIOBase, RawIOBase, BinaryIO], - size: int = 20 + stream: Union[BufferedIOBase, RawIOBase, BinaryIO], size: int = 20 ) -> bytes: """ Return the specified number of bytes from the start of the stream From 21b43e1fe566c018e3cd90b9fdf133b90e475b66 Mon Sep 17 00:00:00 2001 From: Chris Withers Date: Wed, 16 Dec 2020 19:15:55 +0000 Subject: [PATCH 8/8] test for non-existent file --- pandas/tests/io/excel/test_readers_special.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/tests/io/excel/test_readers_special.py b/pandas/tests/io/excel/test_readers_special.py index 83ad3dcbc683f..0d7c3e590ce9f 100644 --- a/pandas/tests/io/excel/test_readers_special.py +++ b/pandas/tests/io/excel/test_readers_special.py @@ -18,3 +18,9 @@ def test_unreadable_file(tmp_path): ValueError, match=r"Could not find engine for .+, content was b'rubbish'" ): pd.read_excel(bad) + + +def test_non_existent_path(tmp_path): + bad = tmp_path / "bad" + with pytest.raises(FileNotFoundError): + pd.read_excel(bad)