
Better inference of spreadsheet formats. #38522

Closed · wants to merge 8 commits

153 changes: 97 additions & 56 deletions pandas/io/excel/_base.py
@@ -1,11 +1,11 @@
import abc
import datetime
import inspect
from io import BufferedIOBase, BytesIO, RawIOBase
import os
from textwrap import fill
from typing import Any, Dict, Mapping, Union, cast
from typing import Any, BinaryIO, Dict, Mapping, Union, cast
import warnings
from zipfile import ZipFile

from pandas._config import config

@@ -888,32 +888,77 @@ def close(self):
return content


def _is_ods_stream(stream: Union[BufferedIOBase, RawIOBase]) -> bool:
def _peek_stream(
stream: Union[BufferedIOBase, RawIOBase, BinaryIO], size: int = 20
) -> bytes:
"""
Check if the stream is an OpenDocument Spreadsheet (.ods) file

It uses magic values inside the stream
Return the specified number of bytes from the start of the stream
and seek back to the start of the stream afterwards.

Parameters
----------
stream : Union[BufferedIOBase, RawIOBase]
IO stream with data which might be an ODS file
size : int
The number of bytes to read.

Returns
-------
is_ods : bool
Boolean indication that this is indeed an ODS file or not
content : bytes
The bytes found.
"""
stream.seek(0)
Contributor commented:
What if stream doesn't support seek? I think this is possible with compressed files.

Contributor (author) replied:
I have a feeling we'd have seen reports of this either in the pandas tracker or xlrd's tracker, since, unless I'm mistaken, this type of seeking has been used in xlrd for many years now.
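On the seekability question above, a minimal defensive sketch (an assumption about one way this could be handled, not something this PR does): a stream that cannot seek could be copied into an in-memory buffer before peeking.

from io import BytesIO

def _ensure_seekable(stream):
    # Hypothetical helper: if the stream cannot seek (e.g. some compressed or
    # socket-backed streams), read it into an in-memory buffer that can.
    if hasattr(stream, "seekable") and stream.seekable():
        return stream
    return BytesIO(stream.read())
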
is_ods = False
if stream.read(4) == b"PK\003\004":
stream.seek(30)
is_ods = (
stream.read(54) == b"mimetype"
b"application/vnd.oasis.opendocument.spreadsheet"
)
content = stream.read(size)
stream.seek(0)
return is_ods
return content


_XLS_SIGNATURE = b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1"
_ZIP_SIGNATURE = b"PK\x03\x04"
_PEEK_SIZE = max(len(_XLS_SIGNATURE), len(_ZIP_SIGNATURE))


def _engine_from_content(stream: Union[BufferedIOBase, RawIOBase, BinaryIO]) -> str:
"""
Use the content of a stream to try and figure out which engine to use.

It uses magic values inside the stream.

Parameters
----------
stream : Union[BufferedIOBase, RawIOBase]
IO stream with data which might contain spreadsheet data.

Returns
-------
engine : Optional[str]
The string engine if it can be confidently inferred.
"""
engine = None
peek = _peek_stream(stream, _PEEK_SIZE)

if peek.startswith(_XLS_SIGNATURE):
engine = "xlrd"

elif peek.startswith(_ZIP_SIGNATURE):
zf = ZipFile(stream)

# Workaround for some third party files that use forward slashes and
# lower case names. We map the expected name in lowercase to the
# actual filename in the zip container.
component_names = {
name.replace("\\", "/").lower(): name for name in zf.namelist()
}

stream.seek(0)

if "xl/workbook.xml" in component_names:
engine = "openpyxl"
if "xl/workbook.bin" in component_names:
engine = "pyxlsb"
if "content.xml" in component_names:
engine = "odf"

return engine
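
For context, a rough sketch of how the signature checks above behave, run against hand-built in-memory streams (the buffers and their contents are illustrative assumptions, not fixtures from this PR):

from io import BytesIO
from zipfile import ZipFile

# Legacy .xls files start with the OLE2 compound-document signature.
xls_like = BytesIO(b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1" + b"\x00" * 16)
print(xls_like.read(8) == b"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1")  # True -> "xlrd"

# .xlsx, .xlsb and .ods are all zip containers sharing the b"PK\x03\x04" prefix,
# so they are told apart by the names of the components inside the archive.
buf = BytesIO()
with ZipFile(buf, "w") as zf:
    zf.writestr("mimetype", "application/vnd.oasis.opendocument.spreadsheet")
    zf.writestr("content.xml", "<office:document-content/>")
buf.seek(0)
print(buf.read(4) == b"PK\x03\x04")  # True -> some zip-based format
buf.seek(0)
with ZipFile(buf) as zf:
    names = {name.replace("\\", "/").lower() for name in zf.namelist()}
print("content.xml" in names)  # True -> "odf"
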


class ExcelFile:
@@ -970,21 +1015,39 @@ class ExcelFile:
"pyxlsb": PyxlsbReader,
}

_ext_to_engine: Mapping[str, str] = {
".ods": "odf",
".xls": "xlrd",
".xlsx": "openpyxl",
}

def __init__(
self, path_or_buffer, engine=None, storage_options: StorageOptions = None
):
if engine is None:
# Determine ext and use odf for ods stream/file

ext = peek = None

if isinstance(path_or_buffer, bytes):
path_or_buffer = BytesIO(path_or_buffer)

if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)):
ext = None
if _is_ods_stream(path_or_buffer):
engine = "odf"
else:
engine = _engine_from_content(path_or_buffer)
peek = _peek_stream(path_or_buffer)

elif isinstance(path_or_buffer, (str, os.PathLike)):
ext = os.path.splitext(str(path_or_buffer))[-1]
if ext == ".ods":
engine = "odf"
handles = get_handle(
stringify_path(path_or_buffer),
Member commented:
Is the stringify_path here needed? (I would assume that get_handle handles that.)

Member replied:
Yes, stringify_path is not needed for get_handle.

Contributor (author) replied:
I believe the user expansion that stringify_path does is needed here.

"rb",
storage_options=storage_options,
is_text=False,
)
with handles:
engine = _engine_from_content(handles.handle)
peek = _peek_stream(handles.handle)

if (
elif (
import_optional_dependency(
"xlrd", raise_on_missing=False, on_version="ignore"
)
@@ -995,38 +1058,16 @@ def __init__(
if isinstance(path_or_buffer, Book):
engine = "xlrd"

# GH 35029 - Prefer openpyxl except for xls files
# Couldn't tell for certain, so guess based on the extension:
if engine is None:
if ext is None or isinstance(path_or_buffer, bytes) or ext == ".xls":
engine = "xlrd"
elif (
import_optional_dependency(
"openpyxl", raise_on_missing=False, on_version="ignore"
)
is not None
):
engine = "openpyxl"
else:
caller = inspect.stack()[1]
if (
caller.filename.endswith("pandas/io/excel/_base.py")
and caller.function == "read_excel"
):
stacklevel = 4
else:
stacklevel = 2
warnings.warn(
"The xlrd engine is no longer maintained and is not "
"supported when using pandas with python >= 3.9. However, "
"the engine xlrd will continue to be allowed for the "
"indefinite future. Beginning with pandas 1.2.0, the "
"openpyxl engine will be used if it is installed and the "
"engine argument is not specified. Either install openpyxl "
"or specify engine='xlrd' to silence this warning.",
FutureWarning,
stacklevel=stacklevel,
)
engine = "xlrd"
engine = self._ext_to_engine.get(ext)

if engine is None:
raise ValueError(
f"Could not find engine for {repr(path_or_buffer)}, content was "
f"{repr(peek)}"
)

if engine not in self._engines:
raise ValueError(f"Unknown engine: {engine}")
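
On the user-expansion point from the stringify_path thread above, a small standard-library illustration (this is not pandas internals, just what "user expansion" means for a path containing "~"):

import os
from pathlib import Path

p = Path("~/test1.xlsx")
print(os.fspath(p))           # '~/test1.xlsx' -- the '~' is still literal
print(os.path.expanduser(p))  # e.g. '/home/<user>/test1.xlsx' once expanded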

5 changes: 2 additions & 3 deletions pandas/tests/io/excel/test_readers.py
@@ -508,10 +508,10 @@ def test_reader_spaces(self, read_ext):
def test_read_excel_ods_nested_xml(self, read_ext, basename, expected):
# see gh-35802
engine = pd.read_excel.keywords["engine"]
if engine != "odf":
if engine not in ("odf", None):
pytest.skip(f"Skipped for engine: {engine}")

actual = pd.read_excel(basename + read_ext)
actual = pd.read_excel(basename + ".ods")
Contributor (author) commented:
We can now run these tests on the None engine since read_excel will now correctly infer the odf engine rather than falling through to xlrd.

tm.assert_frame_equal(actual, expected)
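
A small usage sketch of the behaviour described in the author's comment above (the file name is hypothetical; any .ods file would do):

import pandas as pd

# With content-based inference, engine=None routes .ods input to the odf
# reader instead of falling through to xlrd; both calls behave the same.
inferred = pd.read_excel("example.ods")
explicit = pd.read_excel("example.ods", engine="odf")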

def test_reading_all_sheets(self, read_ext):
@@ -624,7 +624,6 @@ def test_read_from_http_url(self, read_ext):
local_table = pd.read_excel("test1" + read_ext)
tm.assert_frame_equal(url_table, local_table)

@td.skip_if_not_us_locale
Contributor (author) commented:
Does anyone know what the intention was in this skip? I'm in the UK and wanted to make sure I hadn't broken these tests. I couldn't think of any reason why this needed to be in, but have removed it in a separate commit, which I have no problem dropping out of this PR if people would prefer.

Member replied:
This was added in #21814, but no idea why (the test doesn't seem to have anything locale-specific, like e.g. date parsing). But we can see what CI says.
cc @TomAugspurger

def test_read_from_s3_url(self, read_ext, s3_resource, s3so):
# Bucket "pandas-test" created in tests/io/conftest.py
with open("test1" + read_ext, "rb") as f:
26 changes: 26 additions & 0 deletions pandas/tests/io/excel/test_readers_special.py
@@ -0,0 +1,26 @@
# Tests that don't need or don't work with the autouse fixtures in test_readers.py
import pytest

import pandas as pd


def test_unreadable_bytes():
with pytest.raises(
ValueError, match=r"Could not find engine for .+, content was b'rubbish'"
):
pd.read_excel(b"rubbish")


def test_unreadable_file(tmp_path):
bad = tmp_path / "bad"
bad.write_bytes(b"rubbish")
with pytest.raises(
ValueError, match=r"Could not find engine for .+, content was b'rubbish'"
):
pd.read_excel(bad)


def test_non_existent_path(tmp_path):
bad = tmp_path / "bad"
with pytest.raises(FileNotFoundError):
pd.read_excel(bad)