diff --git a/docs/source/whatsnew/v0.10.0.txt b/docs/source/whatsnew/v0.10.0.txt index fb464ceb..9fc7c33a 100644 --- a/docs/source/whatsnew/v0.10.0.txt +++ b/docs/source/whatsnew/v0.10.0.txt @@ -11,6 +11,7 @@ Bug Fixes - Fixed Yahoo readers which now require headers - Fixed other reader - Improved compatibility with pandas +- Decoding stores from Yahoo that were encrypted Contributors ~~~~~~~~~~~~ @@ -26,6 +27,7 @@ Thanks to all of the contributors for the 0.10.0 release (based on git log): - Lukas Halim - Simon Garisch - Dmitry Alekseev +- Raphael Frach These lists of names are automatically generated based on git log, and may not be complete. \ No newline at end of file diff --git a/pandas_datareader/tests/io/test_jsdmx.py b/pandas_datareader/tests/io/test_jsdmx.py index 066cc159..b95b0108 100644 --- a/pandas_datareader/tests/io/test_jsdmx.py +++ b/pandas_datareader/tests/io/test_jsdmx.py @@ -170,7 +170,7 @@ def test_quartervalue(dirpath): "2011-10-01", ], dtype="datetime64[ns]", - name=u"Period", + name="Period", freq=None, ) tm.assert_index_equal(result.index, expected) diff --git a/pandas_datareader/tests/yahoo/test_options.py b/pandas_datareader/tests/yahoo/test_options.py index d37839d8..17f0f74b 100644 --- a/pandas_datareader/tests/yahoo/test_options.py +++ b/pandas_datareader/tests/yahoo/test_options.py @@ -100,7 +100,7 @@ def assert_option_result(self, df): ] ) tm.assert_index_equal(df.columns, exp_columns) - assert df.index.names == [u"Strike", u"Expiry", u"Type", u"Symbol"] + assert df.index.names == ["Strike", "Expiry", "Type", "Symbol"] dtypes = [ np.dtype(x) diff --git a/pandas_datareader/yahoo/daily.py b/pandas_datareader/yahoo/daily.py index 0e8a8a7e..96308cfb 100644 --- a/pandas_datareader/yahoo/daily.py +++ b/pandas_datareader/yahoo/daily.py @@ -4,6 +4,11 @@ import re import time +import hashlib +from base64 import b64decode +from Crypto.Cipher import AES +from Crypto.Util.Padding import unpad + from pandas import DataFrame, isnull, notnull, 
def _evpkdf(
    password,
    salt,
    key_size=32,
    iv_size=16,
    iterations=1,
    hash_algorithm="md5",
):
    """Derive a key and IV with OpenSSL's EVP key derivation function.

    Args:
        password (Union[str, bytes, bytearray]): Password to generate key from.
            A ``str`` is encoded as UTF-8 before hashing.
        salt (Union[bytes, bytearray]): Salt to use.
        key_size (int, optional): Output key length in bytes. Defaults to 32.
        iv_size (int, optional): Output Initialization Vector (IV) length in
            bytes. Defaults to 16.
        iterations (int, optional): Number of hash rounds per block.
            Defaults to 1.
        hash_algorithm (str, optional): Name passed to ``hashlib.new``.
            Defaults to 'md5'.

    Returns:
        key, iv: Derived key and Initialization Vector (IV) bytes.

    Raises:
        ValueError: If ``iterations`` is less than 1.

    Taken from: https://gist.github.com/rafiibrahim8/0cd0f8c46896cafef6486cb1a50a16d3
    OpenSSL original code: https://github.com/openssl/openssl/blob/master/crypto/evp/evp_key.c#L78
    """
    # Raise instead of assert: asserts are stripped when Python runs with -O.
    if iterations < 1:
        raise ValueError("Iterations can not be less than 1.")

    if isinstance(password, str):
        password = password.encode("utf-8")

    final_length = key_size + iv_size
    key_iv = b""
    block = b""  # previous digest; empty for the first round

    while len(key_iv) < final_length:
        # Each round hashes: previous digest + password + salt.
        hasher = hashlib.new(hash_algorithm)
        hasher.update(block)
        hasher.update(password)
        hasher.update(salt)
        block = hasher.digest()
        # Extra iterations re-hash the digest itself.
        for _ in range(1, iterations):
            block = hashlib.new(hash_algorithm, block).digest()
        key_iv += block

    return key_iv[:key_size], key_iv[key_size:final_length]


def decrypt_cryptojs_aes(data):
    """Return the decoded ``stores`` mapping from a Yahoo page payload.

    ``data["context"]["dispatcher"]["stores"]`` is expected to be a base64
    string in the ``Salted__`` + 8-byte salt + AES-CBC ciphertext layout.
    The password is the value of the first top-level key of ``data`` that is
    neither ``"context"`` nor ``"plugins"``. The AES key and IV are derived
    from the password and salt with a single MD5 round of the EVP KDF.

    Args:
        data (dict): Parsed JSON payload.

    Returns:
        The JSON-decoded plaintext of the stores. If ``stores`` is already a
        dict (nothing to decrypt), it is returned unchanged.

    Raises:
        KeyError: If the ``stores`` path or the password entry is missing.
        ValueError: If the decoded blob does not start with ``Salted__``, or
            if decryption, unpadding or JSON decoding fails.
    """
    stores = data["context"]["dispatcher"]["stores"]
    if isinstance(stores, dict):
        # Already-decoded payload: pass it through instead of crashing.
        return stores

    # A default avoids leaking a bare StopIteration when no password exists.
    password_key = next(
        (key for key in data if key not in ("context", "plugins")), None
    )
    if password_key is None:
        raise KeyError("No password entry found next to the encrypted stores")
    password = data[password_key]

    payload = b64decode(stores)

    # Validate explicitly; an assert would be stripped when run with -O.
    if payload[:8] != b"Salted__":
        raise ValueError("Encrypted stores do not start with the 'Salted__' header")
    salt = payload[8:16]
    ciphertext = payload[16:]

    key, iv = _evpkdf(
        password,
        salt,
        key_size=32,
        iv_size=16,
        iterations=1,
        hash_algorithm="md5",
    )

    cipher = AES.new(key, AES.MODE_CBC, iv=iv)
    plaintext = unpad(cipher.decrypt(ciphertext), 16, style="pkcs7")
    return json.loads(plaintext)