From a2ce319c7c1f37d5626786eb9adeac2db1523edc Mon Sep 17 00:00:00 2001
From: Chris Withers <chris@simplistix.co.uk>
Date: Wed, 16 Dec 2020 13:02:07 +0000
Subject: [PATCH 1/8] Re-work engine guessing in ExcelFile / read_excel.

This uses more reliable content introspection that results in better engine selection.
It also removes situations where xlrd gets handed a file it no longer supports.
---
 pandas/io/excel/_base.py                      | 149 +++++++++++-------
 pandas/tests/io/excel/test_readers.py         |   4 +-
 pandas/tests/io/excel/test_readers_special.py |  20 +++
 3 files changed, 116 insertions(+), 57 deletions(-)
 create mode 100644 pandas/tests/io/excel/test_readers_special.py

diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index bf1011176693f..05887b6aa8acb 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -3,9 +3,11 @@
 import inspect
 from io import BufferedIOBase, BytesIO, RawIOBase
 import os
+from pathlib import Path
 from textwrap import fill
-from typing import Any, Dict, Mapping, Union, cast
+from typing import Any, Dict, Mapping, Union, cast, BinaryIO
 import warnings
+from zipfile import ZipFile
 
 from pandas._config import config
 
@@ -888,32 +890,73 @@ def close(self):
         return content
 
 
-def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool:
+def _peek(stream: Union[BufferedIOBase, RawIOBase, BinaryIO], size: int = 20) -> bytes:
     """
-    Check if the stream is an OpenDocument Spreadsheet (.ods) file
-
-    It uses magic values inside the stream
+    Return the specified number of bytes from the start of the stream
+    and seek back to the start of the stream afterwards.
 
     Parameters
     ----------
     stream : Union[BufferedIOBase, RawIOBase]
-        IO stream with data which might be an ODS file
 
     Returns
     -------
-    is_ods : bool
-        Boolean indication that this is indeed an ODS file or not
+    content : bytes
+        The bytes founds.
     """
     stream.seek(0)
-    is_ods = False
-    if stream.read(4) == b"PK\003\004":
-        stream.seek(30)
-        is_ods = (
-            stream.read(54) == b"mimetype"
-            b"application/vnd.oasis.opendocument.spreadsheet"
-        )
+    content = stream.read(size)
     stream.seek(0)
-    return is_ods
+    return content
+
+
+_XLS_SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
+_ZIP_SIGNATURE = b"PK\x03\x04"
+_PEEK_SIZE = max(len(_XLS_SIGNATURE), len(_ZIP_SIGNATURE))
+
+
+def _engine_from_content(stream: Union[BufferedIOBase, RawIOBase, BinaryIO]) -> str:
+    """
+    Use the content of a stream to try and figure out which engine to use.
+
+    It uses magic values inside the stream.
+
+    Parameters
+    ----------
+    stream : Union[BufferedIOBase, RawIOBase]
+        IO stream with data which might contain spreadsheet data.
+
+    Returns
+    -------
+    engine : Optional[engine]
+        The string engine if it can be confidently inferred.
+    """
+    engine = None
+    peek = _peek(stream, _PEEK_SIZE)
+
+    if peek.startswith(_XLS_SIGNATURE):
+        engine = "xlrd"
+
+    elif peek.startswith(_ZIP_SIGNATURE):
+        zf = ZipFile(stream)
+
+        # Workaround for some third party files that use forward slashes and
+        # lower case names. We map the expected name in lowercase to the
+        # actual filename in the zip container.
+        component_names = {
+            name.replace("\\", "/").lower(): name for name in zf.namelist()
+        }
+
+        stream.seek(0)
+
+        if "xl/workbook.xml" in component_names:
+            engine = "openpyxl"
+        if "xl/workbook.bin" in component_names:
+            engine = "pyxlsb"
+        if "content.xml" in component_names:
+            engine = "odf"
+
+    return engine
 
 
 class ExcelFile:
@@ -970,21 +1013,39 @@ class ExcelFile:
         "pyxlsb": PyxlsbReader,
     }
 
+    _ext_to_engine: Mapping[str, str] = {
+        ".ods": "odf",
+        ".xls": "xlrd",
+        ".xlsx": "openpyxl",
+    }
+
     def __init__(
         self, path_or_buffer, engine=None, storage_options: StorageOptions = None
     ):
         if engine is None:
-            # Determine ext and use odf for ods stream/file
+
+            ext = peek = None
+
+            if isinstance(path_or_buffer, bytes):
+                path_or_buffer = BytesIO(path_or_buffer)
+
             if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)):
-                ext = None
-                if _is_ods_stream(path_or_buffer):
-                    engine = "odf"
-            else:
+                engine = _engine_from_content(path_or_buffer)
+                peek = _peek(path_or_buffer)
+
+            elif isinstance(path_or_buffer, (str, os.PathLike)):
                 ext = os.path.splitext(str(path_or_buffer))[-1]
-                if ext == ".ods":
-                    engine = "odf"
+                handles = get_handle(
+                    stringify_path(path_or_buffer),
+                    "rb",
+                    storage_options=storage_options,
+                    is_text=False,
+                )
+                with handles:
+                    engine = _engine_from_content(handles.handle)
+                    peek = _peek(handles.handle)
 
-            if (
+            elif (
                 import_optional_dependency(
                     "xlrd", raise_on_missing=False, on_version="ignore"
                 )
@@ -995,38 +1056,16 @@ def __init__(
                 if isinstance(path_or_buffer, Book):
                     engine = "xlrd"
 
-            # GH 35029 - Prefer openpyxl except for xls files
+            # Couldn't tell for definite, so guess based on extension:
             if engine is None:
-                if ext is None or isinstance(path_or_buffer, bytes) or ext == ".xls":
-                    engine = "xlrd"
-                elif (
-                    import_optional_dependency(
-                        "openpyxl", raise_on_missing=False, on_version="ignore"
-                    )
-                    is not None
-                ):
-                    engine = "openpyxl"
-                else:
-                    caller = inspect.stack()[1]
-                    if (
-                        caller.filename.endswith("pandas/io/excel/_base.py")
-                        and caller.function == "read_excel"
-                    ):
-                        stacklevel = 4
-                    else:
-                        stacklevel = 2
-                    warnings.warn(
-                        "The xlrd engine is no longer maintained and is not "
-                        "supported when using pandas with python >= 3.9. However, "
-                        "the engine xlrd will continue to be allowed for the "
-                        "indefinite future. Beginning with pandas 1.2.0, the "
-                        "openpyxl engine will be used if it is installed and the "
-                        "engine argument is not specified. Either install openpyxl "
-                        "or specify engine='xlrd' to silence this warning.",
-                        FutureWarning,
-                        stacklevel=stacklevel,
-                    )
-                    engine = "xlrd"
+                engine = self._ext_to_engine.get(ext)
+
+            if engine is None:
+                raise ValueError(
+                    f"Could not find engine for {repr(path_or_buffer)}, content was "
+                    f"{repr(peek)}"
+                )
+
         if engine not in self._engines:
             raise ValueError(f"Unknown engine: {engine}")
 
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 98a55ae39bd77..00c6a7f6c052e 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -508,10 +508,10 @@ def test_reader_spaces(self, read_ext):
     def test_read_excel_ods_nested_xml(self, read_ext, basename, expected):
         # see gh-35802
         engine = pd.read_excel.keywords["engine"]
-        if engine != "odf":
+        if engine not in ("odf", None):
             pytest.skip(f"Skipped for engine: {engine}")
 
-        actual = pd.read_excel(basename + read_ext)
+        actual = pd.read_excel(basename + ".ods")
         tm.assert_frame_equal(actual, expected)
 
     def test_reading_all_sheets(self, read_ext):
diff --git a/pandas/tests/io/excel/test_readers_special.py b/pandas/tests/io/excel/test_readers_special.py
new file mode 100644
index 0000000000000..83ad3dcbc683f
--- /dev/null
+++ b/pandas/tests/io/excel/test_readers_special.py
@@ -0,0 +1,20 @@
+# Tests that don't need or don't work with the autouse fixtures in test_readers.py
+import pytest
+
+import pandas as pd
+
+
+def test_unreadable_bytes():
+    with pytest.raises(
+        ValueError, match=r"Could not find engine for .+, content was b'rubbish'"
+    ):
+        pd.read_excel(b"rubbish")
+
+
+def test_unreadable_file(tmp_path):
+    bad = tmp_path / "bad"
+    bad.write_bytes(b"rubbish")
+    with pytest.raises(
+        ValueError, match=r"Could not find engine for .+, content was b'rubbish'"
+    ):
+        pd.read_excel(bad)

From 42663683a8a270b4924bd313e6a2837a76d45475 Mon Sep 17 00:00:00 2001
From: Chris Withers <chris@simplistix.co.uk>
Date: Wed, 16 Dec 2020 13:02:26 +0000
Subject: [PATCH 2/8] no reason to skip these in other locales

---
 pandas/tests/io/excel/test_readers.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 00c6a7f6c052e..6011b7bd240bf 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -624,7 +624,6 @@ def test_read_from_http_url(self, read_ext):
         local_table = pd.read_excel("test1" + read_ext)
         tm.assert_frame_equal(url_table, local_table)
 
-    @td.skip_if_not_us_locale
     def test_read_from_s3_url(self, read_ext, s3_resource, s3so):
         # Bucket "pandas-test" created in tests/io/conftest.py
         with open("test1" + read_ext, "rb") as f:

From 998a7783a47c9adfbc1401dad0441e09da3920b6 Mon Sep 17 00:00:00 2001
From: Chris Withers <chris@simplistix.co.uk>
Date: Wed, 16 Dec 2020 13:21:12 +0000
Subject: [PATCH 3/8] cleanup imports

---
 pandas/io/excel/_base.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index 05887b6aa8acb..bf4b90bd83fbd 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -1,11 +1,9 @@
 import abc
 import datetime
-import inspect
 from io import BufferedIOBase, BytesIO, RawIOBase
 import os
-from pathlib import Path
 from textwrap import fill
-from typing import Any, Dict, Mapping, Union, cast, BinaryIO
+from typing import Any, BinaryIO, Dict, Mapping, Union, cast
 import warnings
 from zipfile import ZipFile
 

From 29d1dd08bcb56a57e6bfc48ec85c96ce29f9bac6 Mon Sep 17 00:00:00 2001
From: Chris Withers <chris@simplistix.co.uk>
Date: Wed, 16 Dec 2020 13:45:16 +0000
Subject: [PATCH 4/8] add param missed from docstring

---
 pandas/io/excel/_base.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index bf4b90bd83fbd..f51c33abcf8f6 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -896,6 +896,8 @@ def _peek(stream: Union[BufferedIOBase, RawIOBase, BinaryIO], size: int = 20) ->
     Parameters
     ----------
     stream : Union[BufferedIOBase, RawIOBase]
+    size: int
+      The number of bytes to read.
 
     Returns
     -------

From 163285c1b20196194437a226e960f82fa5d97c9d Mon Sep 17 00:00:00 2001
From: Chris Withers <chris@simplistix.co.uk>
Date: Wed, 16 Dec 2020 13:45:25 +0000
Subject: [PATCH 5/8] rename to _peek_stream

---
 pandas/io/excel/_base.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index f51c33abcf8f6..d513fb6def186 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -888,7 +888,7 @@ def close(self):
         return content
 
 
-def _peek(stream: Union[BufferedIOBase, RawIOBase, BinaryIO], size: int = 20) -> bytes:
+def _peek_stream(stream: Union[BufferedIOBase, RawIOBase, BinaryIO], size: int = 20) -> bytes:
     """
     Return the specified number of bytes from the start of the stream
     and seek back to the start of the stream afterwards.
@@ -932,7 +932,7 @@ def _engine_from_content(stream: Union[BufferedIOBase, RawIOBase, BinaryIO]) ->
         The string engine if it can be confidently inferred.
     """
     engine = None
-    peek = _peek(stream, _PEEK_SIZE)
+    peek = _peek_stream(stream, _PEEK_SIZE)
 
     if peek.startswith(_XLS_SIGNATURE):
         engine = "xlrd"
@@ -1031,7 +1031,7 @@ def __init__(
 
             if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)):
                 engine = _engine_from_content(path_or_buffer)
-                peek = _peek(path_or_buffer)
+                peek = _peek_stream(path_or_buffer)
 
             elif isinstance(path_or_buffer, (str, os.PathLike)):
                 ext = os.path.splitext(str(path_or_buffer))[-1]
@@ -1043,7 +1043,7 @@ def __init__(
                 )
                 with handles:
                     engine = _engine_from_content(handles.handle)
-                    peek = _peek(handles.handle)
+                    peek = _peek_stream(handles.handle)
 
             elif (
                 import_optional_dependency(

From de99f1af4c2ea494a358ec72ce698ad14a891f6f Mon Sep 17 00:00:00 2001
From: Chris Withers <chris@simplistix.co.uk>
Date: Wed, 16 Dec 2020 13:47:26 +0000
Subject: [PATCH 6/8] line width

---
 pandas/io/excel/_base.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index d513fb6def186..7123f4cabac47 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -888,7 +888,10 @@ def close(self):
         return content
 
 
-def _peek_stream(stream: Union[BufferedIOBase, RawIOBase, BinaryIO], size: int = 20) -> bytes:
+def _peek_stream(
+        stream: Union[BufferedIOBase, RawIOBase, BinaryIO],
+        size: int = 20
+) -> bytes:
     """
     Return the specified number of bytes from the start of the stream
     and seek back to the start of the stream afterwards.

From 9aea6b95cbe1ad94ef12fabdf3452e6d1cf591c8 Mon Sep 17 00:00:00 2001
From: Chris Withers <chris@simplistix.co.uk>
Date: Wed, 16 Dec 2020 13:55:00 +0000
Subject: [PATCH 7/8] :rageface:

---
 pandas/io/excel/_base.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index 7123f4cabac47..0dbe1ce200e4b 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -889,8 +889,7 @@ def close(self):
 
 
 def _peek_stream(
-        stream: Union[BufferedIOBase, RawIOBase, BinaryIO],
-        size: int = 20
+    stream: Union[BufferedIOBase, RawIOBase, BinaryIO], size: int = 20
 ) -> bytes:
     """
     Return the specified number of bytes from the start of the stream

From 21b43e1fe566c018e3cd90b9fdf133b90e475b66 Mon Sep 17 00:00:00 2001
From: Chris Withers <chris@simplistix.co.uk>
Date: Wed, 16 Dec 2020 19:15:55 +0000
Subject: [PATCH 8/8] test for non-existent file

---
 pandas/tests/io/excel/test_readers_special.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/pandas/tests/io/excel/test_readers_special.py b/pandas/tests/io/excel/test_readers_special.py
index 83ad3dcbc683f..0d7c3e590ce9f 100644
--- a/pandas/tests/io/excel/test_readers_special.py
+++ b/pandas/tests/io/excel/test_readers_special.py
@@ -18,3 +18,9 @@ def test_unreadable_file(tmp_path):
         ValueError, match=r"Could not find engine for .+, content was b'rubbish'"
     ):
         pd.read_excel(bad)
+
+
+def test_non_existent_path(tmp_path):
+    bad = tmp_path / "bad"
+    with pytest.raises(FileNotFoundError):
+        pd.read_excel(bad)