Skip to content

Commit 1f46e8e

Browse files
committed
added stock prediction tutorial
1 parent 1e33aed commit 1f46e8e

26 files changed

+12539
-0
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ This is a repository of all the tutorials of [The Python Code](https://www.thepy
3939
- [Building a Speech Emotion Recognizer using Scikit-learn](https://www.thepythoncode.com/article/building-a-speech-emotion-recognizer-using-sklearn). ([code](machine-learning/speech-emotion-recognition))
4040
- [How to Convert Speech to Text in Python](https://www.thepythoncode.com/article/using-speech-recognition-to-convert-speech-to-text-python). ([code](machine-learning/speech-recognition))
4141
- [Top 8 Python Libraries For Data Scientists and Machine Learning Engineers](https://www.thepythoncode.com/article/top-python-libraries-for-data-scientists).
42+
- [How to Predict Stock Prices in Python using TensorFlow 2 and Keras](https://www.thepythoncode.com/article/stock-price-prediction-in-python-using-tensorflow-2-and-keras). ([code](machine-learning/stock-prediction))
4243

4344

4445
- ### [General Python Topics](https://www.thepythoncode.com/topic/general-python-topics)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# [How to Predict Stock Prices in Python using TensorFlow 2 and Keras](https://www.thepythoncode.com/article/stock-price-prediction-in-python-using-tensorflow-2-and-keras)
2+
3+
To run this:
4+
- `pip3 install -r requirements.txt`
5+
- Please read the tutorial before using this, then edit `parameters.py` for your needs and run `train.py`. This will start training using the parameters you specified; you can run `tensorboard` on the `logs` folder to visualize the training process.
6+
- Once you have trained your model, use `test.py` to evaluate and test it.

machine-learning/stock-prediction/data/AAPL_2020-01-08.csv

+9,852
Large diffs are not rendered by default.

machine-learning/stock-prediction/data/TSLA_2020-01-08.csv

+2,400
Large diffs are not rendered by default.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import os
2+
import time
3+
from tensorflow.keras.layers import LSTM
4+
5+
6+
### data parameters

# number of past time steps in every input window (the sequence length)
N_STEPS = 100
# how many steps ahead the target lies; a value of 1 would mean the next day
LOOKUP_STEP = 90

# fraction of the samples held out for testing, 0.2 is 20%
TEST_SIZE = 0.2
# dataframe columns fed to the model as features
FEATURE_COLUMNS = ["adjclose", "volume", "open", "high", "low"]
# today's date (YYYY-MM-DD); it is embedded in the file names built below
date_now = time.strftime("%Y-%m-%d")

### model parameters

# number of stacked recurrent layers
N_LAYERS = 3
# recurrent layer class used for every layer (LSTM cell)
CELL = LSTM
# units (neurons) per recurrent layer
UNITS = 256
# dropout rate, 0.4 is 40%
DROPOUT = 0.4

### training parameters

# mean squared error loss
LOSS = "mse"
OPTIMIZER = "rmsprop"
BATCH_SIZE = 64
EPOCHS = 300

# Apple stock market
ticker = "AAPL"
# where the dataset CSV for this ticker and date is stored
ticker_data_filename = os.path.join("data", ticker + "_" + date_now + ".csv")
# model name to save; it encodes the date, ticker and main hyperparameters
model_name = (
    f"{date_now}_{ticker}-{LOSS}-{CELL.__name__}"
    f"-seq-{N_STEPS}-step-{LOOKUP_STEP}-layers-{N_LAYERS}-units-{UNITS}"
)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
scikit-learn
2+
tensorflow
3+
matplotlib
4+
numpy
5+
pandas
6+
yahoo_fin
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
from tensorflow.keras.models import Sequential
2+
from tensorflow.keras.layers import LSTM, Dense, Dropout
3+
from sklearn import preprocessing
4+
from sklearn.model_selection import train_test_split
5+
from yahoo_fin import stock_info as si
6+
from collections import deque
7+
8+
import numpy as np
9+
import pandas as pd
10+
import random
11+
12+
13+
def load_data(ticker, n_steps=50, scale=True, shuffle=True, lookup_step=1,
              test_size=0.2, feature_columns=['adjclose', 'volume', 'open', 'high', 'low']):
    """
    Loads stock data and prepares it for training: optional scaling, windowing into
    sequences, and splitting into train/test sets.
    Params:
        ticker (str/pd.DataFrame): the ticker you want to load through yahoo_fin (examples include
            AAPL, TSLA, etc.), or an already loaded dataframe. A passed dataframe is copied and
            never modified.
        n_steps (int): the historical sequence length (i.e window size) used to predict, default is 50
        scale (bool): whether to scale each feature column from 0 to 1, default is True
        shuffle (bool): whether to shuffle the data when splitting, default is True
        lookup_step (int): the future lookup step to predict, default is 1 (e.g next day)
        test_size (float): ratio for test data, default is 0.2 (20% testing data)
        feature_columns (list): the columns to feed into the model, default is
            adjclose, volume, open, high and low
    Returns:
        dict with the keys:
            "df": a copy of the dataframe as loaded (before scaling)
            "column_scaler": column name -> fitted MinMaxScaler (only present when `scale` is True)
            "last_sequence": the most recent feature rows, used to predict dates beyond the dataset
            "X_train", "X_test", "y_train", "y_test": the output of `train_test_split`
    Raises:
        TypeError: if `ticker` is neither a str nor a `pd.DataFrame`
        AssertionError: if one of `feature_columns` is not a column of the dataframe
    """
    # see if ticker is already a loaded stock from yahoo finance
    if isinstance(ticker, str):
        # load it from yahoo_fin library
        df = si.get_data(ticker)
    elif isinstance(ticker, pd.DataFrame):
        # already loaded; work on a copy so the caller's dataframe is not
        # scaled, extended with a 'future' column, or trimmed in place
        df = ticker.copy()
    else:
        raise TypeError("ticker can be either a str or a `pd.DataFrame` instances")

    # accept any iterable of column names; pandas needs a list for column selection
    feature_columns = list(feature_columns)

    # this will contain all the elements we want to return from this function
    result = {}
    # we will also return the original (unscaled) dataframe itself
    result['df'] = df.copy()

    # make sure that the passed feature_columns exist in the dataframe
    missing = [col for col in feature_columns if col not in df.columns]
    assert not missing, f"feature columns missing from the dataframe: {missing}"

    if scale:
        column_scaler = {}
        # scale the data (prices) from 0 to 1, one scaler per column
        for column in feature_columns:
            scaler = preprocessing.MinMaxScaler()
            df[column] = scaler.fit_transform(np.expand_dims(df[column].values, axis=1))
            column_scaler[column] = scaler

        # add the MinMaxScaler instances to the result returned
        result["column_scaler"] = column_scaler

    # add the target column (label) by shifting by `lookup_step`
    df['future'] = df['adjclose'].shift(-lookup_step)

    # the last `lookup_step` rows contain NaN in the future column,
    # get their features before dropping NaNs
    last_sequence = np.array(df[feature_columns].tail(lookup_step))

    # drop NaNs
    df.dropna(inplace=True)

    # slide a window of `n_steps` rows over the data; every full window becomes one sample
    sequences = deque(maxlen=n_steps)
    X, y = [], []
    for entry, target in zip(df[feature_columns].values, df['future'].values):
        sequences.append(entry)
        if len(sequences) == n_steps:
            X.append(np.array(sequences))
            y.append(target)

    # get the last sequence by appending the last `n_step` sequence with `lookup_step` sequence
    # for instance, if n_steps=50 and lookup_step=10, last_sequence should be of 59 (that is 50+10-1) length
    # this last_sequence will be used to predict in future dates that are not available in the dataset
    last_sequence = list(sequences) + list(last_sequence)
    # shift the last sequence by -1 (drops the oldest row)
    last_sequence = np.array(pd.DataFrame(last_sequence).shift(-1).dropna())
    # add to result
    result['last_sequence'] = last_sequence

    # convert to numpy arrays
    X = np.array(X)
    y = np.array(y)

    # reshape X to fit the neural network
    # NOTE(review): reshape reinterprets the flat buffer rather than swapping the axes;
    # create_model() and test.py's predict() appear to rely on this exact layout, so it is
    # kept as is -- confirm before switching to a transpose
    X = X.reshape((X.shape[0], X.shape[2], X.shape[1]))

    # split the dataset
    result["X_train"], result["X_test"], result["y_train"], result["y_test"] = train_test_split(
        X, y, test_size=test_size, shuffle=shuffle)
    # return the result
    return result
101+
102+
103+
def create_model(input_length, units=256, cell=LSTM, n_layers=2, dropout=0.3,
                 loss="mean_absolute_error", optimizer="rmsprop"):
    """
    Builds and compiles a stack of recurrent layers followed by a single linear output unit.
    Params:
        input_length (int): size of the last axis of each input sample
        units (int): number of units in every recurrent layer, default is 256
        cell: recurrent layer class to instantiate for every layer, default is LSTM
        n_layers (int): number of recurrent layers, default is 2
        dropout (float): dropout rate applied after every recurrent layer, default is 0.3
        loss (str): loss passed to `compile`, default is "mean_absolute_error"
        optimizer (str): optimizer passed to `compile`, default is "rmsprop"
    Returns:
        the compiled `Sequential` model
    """
    model = Sequential()
    for i in range(n_layers):
        # every layer but the last one passes its full sequence on to the next layer;
        # the last one returns a single vector, which also covers the n_layers == 1 case
        # (the only layer is both the first and the last)
        return_sequences = i < n_layers - 1
        if i == 0:
            # first layer also declares the input shape
            model.add(cell(units, return_sequences=return_sequences, input_shape=(None, input_length)))
        else:
            # hidden layers and the last layer
            model.add(cell(units, return_sequences=return_sequences))
        # add dropout after each layer
        model.add(Dropout(dropout))

    model.add(Dense(1, activation="linear"))
    model.compile(loss=loss, metrics=["mean_absolute_error"], optimizer=optimizer)

    return model
+67
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
from stock_prediction import create_model, load_data, np
2+
from parameters import *
3+
import matplotlib.pyplot as plt
4+
from sklearn.metrics import accuracy_score
5+
6+
def plot_graph(model, data):
    """
    Plots the last 200 actual and predicted test prices on the same figure.
    Params:
        model: object whose `predict` is called on the test inputs
        data (dict): must provide "X_test", "y_test" and a "column_scaler" entry for "adjclose"
    """
    actual = data["y_test"]
    predicted = model.predict(data["X_test"])
    # undo the scaling so both curves are shown as prices; the targets are
    # expanded to 2-D before being handed to the scaler
    actual = np.squeeze(data["column_scaler"]["adjclose"].inverse_transform(np.expand_dims(actual, axis=0)))
    predicted = np.squeeze(data["column_scaler"]["adjclose"].inverse_transform(predicted))
    # actual prices in blue, predicted prices in red
    for series, colour in ((actual, 'b'), (predicted, 'r')):
        plt.plot(series[-200:], c=colour)
    plt.xlabel("Days")
    plt.ylabel("Price")
    plt.legend(["Actual Price", "Predicted Price"])
    plt.show()
18+
19+
20+
def get_accuracy(model, data):
    """
    Scores how often the model gets the direction of the price move right.
    Both the predicted and the actual price `LOOKUP_STEP` positions later are compared
    against the actual current price and turned into 1 (higher) or 0 (not higher);
    the two resulting label lists are passed to `accuracy_score`.
    Params:
        model: object whose `predict` is called on the test inputs
        data (dict): must provide "X_test", "y_test" and a "column_scaler" entry for "adjclose"
    Returns:
        the value returned by `accuracy_score`
    """
    actual = data["y_test"]
    predicted = model.predict(data["X_test"])
    # bring both series back to price units
    actual = np.squeeze(data["column_scaler"]["adjclose"].inverse_transform(np.expand_dims(actual, axis=0)))
    predicted = np.squeeze(data["column_scaler"]["adjclose"].inverse_transform(predicted))
    # prices that still have a counterpart LOOKUP_STEP positions later
    current_prices = actual[:-LOOKUP_STEP]
    predicted_up = [
        int(float(future) > float(current))
        for current, future in zip(current_prices, predicted[LOOKUP_STEP:])
    ]
    actual_up = [
        int(float(future) > float(current))
        for current, future in zip(current_prices, actual[LOOKUP_STEP:])
    ]
    return accuracy_score(actual_up, predicted_up)
29+
30+
31+
def predict(model, data, classification=False):
    """
    Predicts the future price from the most recent sequence stored in `data`.
    Params:
        model: object whose `predict` is called on the prepared sequence
        data (dict): must provide "last_sequence" and a "column_scaler" entry for "adjclose"
        classification (bool): accepted but not used by this function
    Returns:
        the first value of the prediction after inverting the "adjclose" scaling
    """
    # keep only the first N_STEPS rows of the stored last sequence
    window = data["last_sequence"][:N_STEPS]
    # the scalers are needed to turn the prediction back into a price
    scalers = data["column_scaler"]
    # swap the two dimensions with a reshape, then add a leading axis of size 1
    swapped_shape = (window.shape[1], window.shape[0])
    batch = np.expand_dims(window.reshape(swapped_shape), axis=0)
    # prediction is still scaled from 0 to 1 at this point
    scaled_prediction = model.predict(batch)
    # invert the scaling to get the price
    return scalers["adjclose"].inverse_transform(scaled_prediction)[0][0]
45+
46+
47+
# load the data; shuffle=False so the test split is not shuffled
data = load_data(ticker, N_STEPS, lookup_step=LOOKUP_STEP, test_size=TEST_SIZE,
                 feature_columns=FEATURE_COLUMNS, shuffle=False)

# construct the model with the same parameters that were used for training
model = create_model(N_STEPS, loss=LOSS, units=UNITS, cell=CELL, n_layers=N_LAYERS,
                     dropout=DROPOUT, optimizer=OPTIMIZER)

# load the weights of the model saved by train.py
model_path = os.path.join("results", model_name) + ".h5"
model.load_weights(model_path)

# evaluate the model: returns the loss followed by the mean absolute error metric
mse, mae = model.evaluate(data["X_test"], data["y_test"])
# calculate the mean absolute error (inverse scaling)
# np.reshape is used instead of mae.reshape because, depending on the TensorFlow version,
# evaluate() may return plain Python floats, which have no .reshape method
# NOTE(review): inverse_transform also adds the scaler's minimum offset, which looks wrong
# for an error magnitude -- confirm whether dividing by the scaler's scale_ is intended
mean_absolute_error = data["column_scaler"]["adjclose"].inverse_transform(np.reshape(mae, (1, -1)))[0][0]
print("Mean Absolute Error:", mean_absolute_error)
# predict the future price
future_price = predict(model, data)
print(f"Future price after {LOOKUP_STEP} days is {future_price:.2f}$")
print("Accuracy Score:", get_accuracy(model, data))
plot_graph(model, data)
+45
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
from stock_prediction import create_model, load_data
2+
from tensorflow.keras.layers import LSTM
3+
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
4+
import os
5+
import pandas as pd
6+
from parameters import *
7+
8+
9+
# create these folders if they do not exist
for folder in ("results", "logs", "data"):
    if not os.path.isdir(folder):
        os.mkdir(folder)

# reuse the CSV file (dataset) from disk when it is already there (without downloading)
if os.path.isfile(ticker_data_filename):
    ticker = pd.read_csv(ticker_data_filename)

# load the data
data = load_data(ticker, N_STEPS, lookup_step=LOOKUP_STEP, test_size=TEST_SIZE,
                 feature_columns=FEATURE_COLUMNS)

# no CSV file yet for this ticker and date: save the dataset for later runs
if not os.path.isfile(ticker_data_filename):
    data["df"].to_csv(ticker_data_filename)

# construct the model
model = create_model(N_STEPS, loss=LOSS, units=UNITS, cell=CELL, n_layers=N_LAYERS,
                     dropout=DROPOUT, optimizer=OPTIMIZER)

# some tensorflow callbacks: keep the best weights and log the run for tensorboard
results_path = os.path.join("results", model_name)
checkpointer = ModelCheckpoint(results_path, save_weights_only=True, save_best_only=True, verbose=1)
tensorboard = TensorBoard(log_dir=os.path.join("logs", model_name))

# train, using the test split as validation data
history = model.fit(
    data["X_train"], data["y_train"],
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(data["X_test"], data["y_test"]),
    callbacks=[checkpointer, tensorboard],
    verbose=1,
)

# save the model as it is after the last epoch
model.save(results_path + ".h5")

0 commit comments

Comments
 (0)