Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Decoding stores that were recently encrypted by Yahoo! Finance #953

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/source/whatsnew/v0.10.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Bug Fixes
- Fixed Yahoo readers which now require headers
- Fixed other reader
- Improved compatibility with pandas
- Decoding stores from Yahoo that were encrypted

Contributors
~~~~~~~~~~~~
Expand All @@ -26,6 +27,7 @@ Thanks to all of the contributors for the 0.10.0 release (based on git log):
- Lukas Halim
- Simon Garisch
- Dmitry Alekseev
- Raphael Frach

These lists of names are automatically generated based on git log, and may not
be complete.
2 changes: 1 addition & 1 deletion pandas_datareader/tests/io/test_jsdmx.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def test_quartervalue(dirpath):
"2011-10-01",
],
dtype="datetime64[ns]",
name=u"Period",
name="Period",
freq=None,
)
tm.assert_index_equal(result.index, expected)
2 changes: 1 addition & 1 deletion pandas_datareader/tests/yahoo/test_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ def assert_option_result(self, df):
]
)
tm.assert_index_equal(df.columns, exp_columns)
assert df.index.names == [u"Strike", u"Expiry", u"Type", u"Symbol"]
assert df.index.names == ["Strike", "Expiry", "Type", "Symbol"]

dtypes = [
np.dtype(x)
Expand Down
81 changes: 80 additions & 1 deletion pandas_datareader/yahoo/daily.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,86 @@
import re
import time

import hashlib
from base64 import b64decode
from Crypto.Cipher import AES
from Crypto.Util.Padding import unpad

from pandas import DataFrame, isnull, notnull, to_datetime

from pandas_datareader._utils import RemoteDataError
from pandas_datareader.base import _DailyBaseReader
from pandas_datareader.yahoo.headers import DEFAULT_HEADERS


def _evp_bytes_to_key(
    password,
    salt,
    key_size=32,
    iv_size=16,
    iterations=1,
    hash_algorithm="md5",
):
    """Derive a key and IV following OpenSSL's EVP key derivation scheme.

    Each round hashes ``previous_block + password + salt`` and, when
    ``iterations`` is greater than one, re-hashes the digest
    ``iterations - 1`` more times. Digests are concatenated until
    ``key_size + iv_size`` bytes are available.

    Adapted from:
    https://gist.github.com/rafiibrahim8/0cd0f8c46896cafef6486cb1a50a16d3
    OpenSSL original code:
    https://github.com/openssl/openssl/blob/master/crypto/evp/evp_key.c#L78

    Args:
        password: Password to derive from; ``str`` is encoded as UTF-8.
        salt: Salt bytes mixed into every round.
        key_size: Length of the returned key in bytes.
        iv_size: Length of the returned IV in bytes.
        iterations: Number of hash applications per round (at least 1).
        hash_algorithm: Any algorithm name accepted by ``hashlib.new``.

    Returns:
        A ``(key, iv)`` tuple of ``bytes``.

    Raises:
        ValueError: If ``iterations`` is less than 1.
    """
    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    if iterations < 1:
        raise ValueError("Iterations can not be less than 1.")

    if isinstance(password, str):
        password = password.encode("utf-8")

    final_length = key_size + iv_size
    key_iv = b""
    block = b""  # first round has no previous digest to chain in

    while len(key_iv) < final_length:
        hasher = hashlib.new(hash_algorithm)
        hasher.update(block)
        hasher.update(password)
        hasher.update(salt)
        block = hasher.digest()
        for _ in range(1, iterations):
            block = hashlib.new(hash_algorithm, block).digest()
        key_iv += block

    return key_iv[:key_size], key_iv[key_size:final_length]


def decrypt_cryptojs_aes(data):
    """Decrypt the encrypted ``stores`` blob of a parsed Yahoo page payload.

    The blob at ``data["context"]["dispatcher"]["stores"]`` is base64 text
    whose decoded bytes start with ``b"Salted__"``, followed by an 8-byte
    salt and the AES-CBC ciphertext. The password is the value of the first
    top-level key of ``data`` other than ``"context"`` and ``"plugins"``.

    Args:
        data: Mapping parsed from the page's embedded JSON.

    Returns:
        The decrypted stores, as returned by ``json.loads``.

    Raises:
        KeyError: If the ``stores`` path or a password entry is missing.
        ValueError: If the decoded blob lacks the ``Salted__`` header, or the
            decrypted bytes are not validly padded JSON.
    """
    encrypted_stores = data["context"]["dispatcher"]["stores"]

    # A bare ``next()`` would leak StopIteration when no password entry is
    # present; report it as a missing key like the other lookups here.
    password_key = next(
        (key for key in data if key not in ("context", "plugins")), None
    )
    if password_key is None:
        raise KeyError("password entry not found in payload")
    password = data[password_key]

    raw = b64decode(encrypted_stores)

    # Explicit raise instead of ``assert`` so the check survives ``python -O``.
    if raw[:8] != b"Salted__":
        raise ValueError("encrypted stores do not start with 'Salted__' header")
    salt = raw[8:16]
    ciphertext = raw[16:]

    key, iv = _evp_bytes_to_key(
        password,
        salt,
        key_size=32,
        iv_size=16,
        iterations=1,
        hash_algorithm="md5",
    )

    cipher = AES.new(key, AES.MODE_CBC, iv=iv)
    plaintext = unpad(cipher.decrypt(ciphertext), 16, style="pkcs7")
    return json.loads(plaintext)


class YahooDailyReader(_DailyBaseReader):
"""
Returns DataFrame of with historical over date range,
Expand Down Expand Up @@ -150,7 +223,13 @@ def _read_one_data(self, url, params):
ptrn = r"root\.App\.main = (.*?);\n}\(this\)\);"
try:
j = json.loads(re.search(ptrn, resp.text, re.DOTALL).group(1))
data = j["context"]["dispatcher"]["stores"]["HistoricalPriceStore"]

new_j = decrypt_cryptojs_aes(
j
)

data = new_j["HistoricalPriceStore"]

except KeyError:
msg = "No data fetched for symbol {} using {}"
raise RemoteDataError(msg.format(symbol, self.__class__.__name__))
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
lxml
pandas>=0.23
requests>=2.19.0
pycryptodome>=3.16.0
packaging>=22.0