├── .gitignore ├── README.md ├── doc └── bachelor_thesis.pdf ├── machine_learning ├── __init__.py ├── development │ ├── dataset_preprocessing.py │ ├── keras_ffnn.py │ ├── keras_lstm.py │ ├── knn_regression.py │ ├── knn_wrapper.py │ ├── linear_regression.py │ ├── new_evaluation.md │ ├── new_regression │ │ ├── lin_reg.py │ │ ├── new_dataset.py │ │ ├── new_knn_regression.py │ │ └── new_linear_regression.py │ ├── optimized_ffnn │ │ ├── ffnn.py │ │ ├── ffnn_hyperparam_tune.py │ │ ├── ffnn_hypparam_tune_main.py │ │ ├── ffnn_main.py │ │ └── ffnn_optimal_hyperparameters.txt │ ├── optimized_lstm │ │ ├── hyperparam_tune_main.py │ │ ├── hyperparameter_tunning.py │ │ ├── lstm.py │ │ ├── lstm_main.py │ │ └── optimal_hyperparameters.txt │ ├── original_evaluation.md │ ├── technical_indicators_dataset.py │ └── testing │ │ ├── analysis.py │ │ ├── companies.py │ │ ├── future_gap_test.py │ │ ├── lag_metric.py │ │ ├── results │ │ ├── amazon.md │ │ ├── analysis.md │ │ ├── apple.md │ │ ├── eval.md │ │ ├── facebook.md │ │ ├── future_gap.md │ │ ├── tesla.md │ │ └── window_and_ts.md │ │ ├── test.py │ │ └── window_plot_test.py ├── final │ ├── evaluation │ │ └── metrics.py │ ├── experiments │ │ ├── exp1.py │ │ ├── exp2.py │ │ ├── exp3.py │ │ ├── exp4.py │ │ └── exp5.py │ ├── models │ │ ├── ffnn.py │ │ ├── knn_reg.py │ │ ├── knn_wrapper.py │ │ ├── lin_reg.py │ │ └── lstm.py │ └── utils │ │ └── dataset.py └── readme.md ├── resources └── historical_data │ ├── AAPL.csv │ ├── AMZN.csv │ ├── FB.csv │ ├── GOOG.csv │ ├── IBM.csv │ ├── MSFT.csv │ ├── SPY.csv │ ├── TSLA.csv │ └── ^GSPC.csv ├── results ├── amazon.png ├── apple.png ├── experiments │ ├── exp1 │ │ ├── ffnn.png │ │ ├── knn.png │ │ ├── lin_reg.png │ │ └── lstm.png │ ├── exp2 │ │ ├── sudden_vs_normal.png │ │ ├── sudden_vs_normal_daily_lag.png │ │ ├── sudden_vs_normal_lag.png │ │ └── sudden_vs_normal_pal.png │ ├── exp3 │ │ ├── amazon.png │ │ ├── apple.png │ │ ├── facebook.png │ │ └── tesla.png │ ├── exp4 │ │ ├── gap1.png │ │ ├── gap2.png │ │ ├── gap3.png │ │ ├── gap4.png │ │ └── gap5.png │ └── exp5 │ │ ├── linreg_forecast.png │ │ ├── linreg_forecast_pal.png │ │ ├── linreg_pal.png │ │ ├── linreg_pal_daily.png │ │ ├── lstm_forecast.png │ │ ├── lstm_forecast_pal.png │ │ ├── lstm_pal.png │ │ └── lstm_pal_daily.png ├── facebook.png ├── ffnn_reg.png ├── future_gap_test.png ├── hyperparam_tune_ffnn1.png ├── hyperparam_tune_ffnn2.png ├── hyperparam_tune_lstm1.png ├── hyperparam_tune_lstm2.png ├── knn.png ├── lin_reg.png ├── lstm.png ├── new_lin_reg.png ├── optimized_ffnn.png ├── optimized_lstm.png ├── stable.png ├── stable_lag.png ├── sudden_vs_normal.png ├── sudden_vs_normal_daily_lag.png ├── sudden_vs_normal_lag.png ├── sudden_vs_normal_pal.png ├── sudden_vs_normal_pal_1.png ├── tesla.png ├── volatile.png ├── volatile_lag.png ├── window_test_1.png └── window_test_2.png ├── statistics_and_optimization ├── __init__.py ├── bollinger_bands.py ├── portfolio_optimization.py ├── portfolio_statistics.py └── readme.md └── utils ├── README.md ├── __init__.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | __pycache__ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ML4T 2 | *Machine Learning for Trading* 3 | 4 | ## Project Overview 5 | *GUC 2018 Bachelor Thesis Project* 6 | 7 | Stock market prediction is an interesting realm to test the capabilities of machine learning 8 | on. 
The nature of the stock market is volatile, complex, and highly sensitive to external 9 | information, which makes it difficult to predict. Different machine learning models 10 | are developed to forecast future stock prices. Using historical stock market data, technical 11 | indicators are computed and used along with a stock’s price as features associated 12 | with a target output, which is the future stock price. This provides a dataset that the 13 | machine learning models train on, and thus the models become capable of predicting 14 | future prices. The models used are: linear regressor, kNN regressor, Feedforward 15 | Neural Network (FFNN), and Long Short Term Memory (LSTM) Recurrent Neural Network 16 | (RNN). The prediction models are compared and evaluated using different metrics. 17 | Several case studies are performed to evaluate the performance of the machine learning 18 | models. From the case studies, a few insights were gained: 19 | 20 | 1. The LSTM RNN outperformed all the other models. 21 | 2. The LSTM RNN model is capable of accurately predicting the next-day price, unless a major external event impacts the stock price suddenly. 22 | 3. The LSTM RNN model naturally lags in picking up on external events that impact the stock price suddenly. 23 | 24 | ## Algorithms Evaluation 25 | *Development Phase* 26 | 27 | * [Original](https://github.com/ahmedhamdi96/ML4T/blob/master/machine_learning/development/original_evaluation.md) 28 | * [New](https://github.com/ahmedhamdi96/ML4T/blob/master/machine_learning/development/new_evaluation.md) 29 | 30 | ## Testing 31 | *Testing Phase* 32 | 33 | * Since the LSTM model is regarded as the flagship machine learning model in this project, 34 | it is the one used in this testing section. 35 | 36 | * The model is trained on the period starting from a company's first public trading day until the day 37 | before the required testing period. 38 | 39 | ### Companies During Times of Change 40 | *Predicting stock prices for a portfolio of 4 companies during different interesting time periods* 41 | 42 | * **[Facebook](https://github.com/ahmedhamdi96/ML4T/blob/master/machine_learning/development/testing/results/facebook.md)** 43 | 44 | *Facebook started trading publicly on 18/05/2012.* 45 | 46 | * Facebook–Cambridge Analytica data scandal, [January/2018 - March/2018] 47 | 48 | Amid the scandal and Mark Zuckerberg's public hearing, Facebook's stock price fell. 49 | 50 | * **[Apple](https://github.com/ahmedhamdi96/ML4T/blob/master/machine_learning/development/testing/results/apple.md)** 51 | 52 | *Apple started trading publicly on 12/12/1980.* 53 | 54 | * Apple's first free fall, [September/2012 - June/2013] 55 | 56 | Apple faced multiple hardships during this period; earnings were no longer growing, 57 | low-priced phones were capturing most of the smartphone market share over the iPhone, 58 | and the company entered the "post-Steve Jobs" era, where the company's next generation 59 | of leaders and products were in question. 60 | 61 | * **[Tesla](https://github.com/ahmedhamdi96/ML4T/blob/master/machine_learning/development/testing/results/tesla.md)** 62 | 63 | *Tesla started trading publicly on 29/06/2010.* 64 | 65 | * Disappointing Q3 Reports, [September/2013 - November/2013] 66 | 67 | Tesla reported disappointing third-quarter financial results. In addition, 68 | a third widely reported fire involving a Model S in just two months was 69 | putting Tesla under pressure.
70 | 71 | * **[Amazon](https://github.com/ahmedhamdi96/ML4T/blob/master/machine_learning/development/testing/results/amazon.md)** 72 | 73 | *Amazon started trading publicly on 15/05/1997.* 74 | 75 | * Exceeding Q3 expectations, [September/2017 - February/2018] 76 | 77 | Amazon's Q3 reports showed an increase in profits, an acceleration in revenue growth, an increase 78 | in AWS' operating income, and the success of Alexa-enabled devices. 79 | 80 | ### Window and Time Steps Test 81 | A test to determine the optimal window and time steps. See results [here](https://github.com/ahmedhamdi96/ML4T/blob/master/machine_learning/development/testing/results/window_and_ts.md). 82 | 83 | ### Evaluation Metrics 84 | New metrics to evaluate the performance of the model over different future gaps. See results [here](https://github.com/ahmedhamdi96/ML4T/blob/master/machine_learning/development/testing/results/eval.md). 85 | 86 | ### Future Gap Test 87 | A test comparing the linear regressor, FFNN, and LSTM RNN over different future gaps. See results [here](https://github.com/ahmedhamdi96/ML4T/blob/master/machine_learning/development/testing/results/future_gap.md). 88 | 89 | ## Analysis 90 | *Analysing the tests using a novel metric* 91 | 92 | To analyse a forecast and evaluate how quickly the model predicts a price close to the actual one, a lag metric is created. 93 | The **_Prediction-Actual Lag (PAL)_** metric works as follows: 94 | The future gap chosen when making the forecast indicates how far into the future a prediction is; for example, if the future gap is set to 1, the forecast is a next-trading-day forecast. The actual prices are then traversed and compared with the predictions: each actual price datapoint is compared against a number of prediction datapoints equal to the future gap. If the future gap is set to 5, for instance, each actual datapoint is compared to the corresponding prediction datapoint and the 4 after it. See **_PAL_** in action [here](https://github.com/ahmedhamdi96/ML4T/blob/master/machine_learning/development/testing/results/analysis.md). 95 | 96 | ## Software and Libraries 97 | *This project uses the following software and Python libraries:* 98 | 99 | * [NumPy](http://www.numpy.org/) 100 | * [pandas](http://pandas.pydata.org/) 101 | * [matplotlib](https://matplotlib.org/index.html) 102 | * [SciPy](https://www.scipy.org/) 103 | * [TensorFlow](https://www.tensorflow.org) 104 | * [Keras](https://keras.io/) 105 | * [scikit-learn](http://scikit-learn.org/stable/) 106 | * [TA-Lib](https://mrjbq7.github.io/ta-lib/doc_index.html) 107 | -------------------------------------------------------------------------------- /doc/bachelor_thesis.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/doc/bachelor_thesis.pdf -------------------------------------------------------------------------------- /machine_learning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/machine_learning/__init__.py -------------------------------------------------------------------------------- /machine_learning/development/dataset_preprocessing.py: -------------------------------------------------------------------------------- 1 | '''This file constructs a dataset to be used by the ML algorithms.
2 | The dataset consists of the past price and technical indicators as 3 | features, and the price as the output. The dataset is indexed by 4 | date; a row entry contains the price and technical indicators of 5 | a day prior to the date index, while the price column holds the actual 6 | price of the stock on the date marked by the index. 7 | ''' 8 | from utils.util import get_data 9 | import numpy as np 10 | import pandas as pd 11 | 12 | '''technical indicators computation functions 13 | 14 | *prices : adjusted closing stock prices 15 | *window : rolling statistics window 16 | ''' 17 | #BEGIN 18 | def compute_momentum_ratio(prices, window): 19 | #first window elements >> NA 20 | momentum_ratio = (prices/prices.shift(periods = window)) - 1 21 | return momentum_ratio 22 | 23 | def compute_sma_ratio(prices, window): 24 | #Simple Moving Average 25 | #first window-1 elements >> NA 26 | sma_ratio = (prices / prices.rolling(window = window).mean()) - 1 27 | return sma_ratio 28 | 29 | def compute_bollinger_bands_ratio(prices, window): 30 | #first window-1 elements >> NA 31 | bb_ratio = prices - prices.rolling(window = window).mean() 32 | bb_ratio = bb_ratio / (2 * prices.rolling(window = window).std()) 33 | return bb_ratio 34 | 35 | def compute_volatility_ratio(prices, window): 36 | #first window-1 elements >> NA 37 | volatility_ratio = ((prices/prices.shift(periods = 1)) - 1).rolling(window = window).std() 38 | return volatility_ratio 39 | 40 | def compute_vroc_ratio(volume, window): 41 | #Volume Rate of Change 42 | #first window-1 elements >> NA 43 | vroc_ratio = (volume/volume.shift(periods = window)) - 1 44 | return vroc_ratio 45 | 46 | def compute_daily_return_volatility(prices, window): 47 | #first window-1 elements >> NA 48 | daily_return = (prices/prices.shift(periods= 1)) - 1 49 | volatility = daily_return.rolling(window=window).std() 50 | return volatility 51 | #END 52 | 53 | '''dataset constructor function 54 | 55 | *start_date : start date for the entire dataset (training and testing) 56 | *end_date : end date for the entire dataset (training and testing) 57 | *stock : stock label to be used in the dataset 58 | ''' 59 | def get_dataset_dataframe(start_date='17/12/2014', end_date = '31/12/2017', stock='IBM'): 60 | #importing stock data 61 | stock_df = get_data([stock], start_date, end_date) 62 | date_range = pd.date_range(start_date, end_date) 63 | dataset_df = pd.DataFrame(index=date_range) 64 | 65 | #calculating technical indicators 66 | #make sure to include the last 2 weeks of 2014 to compensate for the calculation loss 67 | #1st week is lost in the preparation of the indicators 68 | #2nd week is lost to include the future gap 69 | future_gap = 5 #1 trading week 70 | dataset_df['price'] = stock_df[stock] 71 | dataset_df.dropna(subset=['price'], inplace=True) 72 | dataset_df['momentum'] = compute_momentum_ratio(stock_df[stock], future_gap) 73 | dataset_df['sma'] = compute_sma_ratio(stock_df[stock], future_gap) 74 | dataset_df['bolinger_band'] = compute_bollinger_bands_ratio(stock_df[stock], future_gap) 75 | #dataset_df['volatility'] = compute_daily_return_volatility(stock_df[stock], future_gap) 76 | dataset_df.dropna(subset=dataset_df.columns, inplace=True) 77 | dataset_df = dataset_df.shift(future_gap) 78 | shifted_columns_names = ['price(t-%d)' %(future_gap), 'moment(t-%d)' %(future_gap), 'sma(t-%d)' %(future_gap), 79 | 'b_band(t-%d)' %(future_gap)] 80 | dataset_df.columns = shifted_columns_names 81 | dataset_df.dropna(subset=shifted_columns_names, inplace=True) 82 | dataset_df['price'] =
stock_df[stock] 83 | 84 | return dataset_df -------------------------------------------------------------------------------- /machine_learning/development/keras_ffnn.py: -------------------------------------------------------------------------------- 1 | ''' this file uses a keras feed-forward-NN to predict stock prices 2 | one trading week in advance 3 | ''' 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from keras.models import Sequential 7 | from keras.layers import Dense 8 | from sklearn.preprocessing import MinMaxScaler 9 | from sklearn.metrics import mean_squared_error 10 | from machine_learning.development.dataset_preprocessing import get_dataset_dataframe 11 | 12 | '''a tester function 13 | ''' 14 | def main(): 15 | #getting the preprocessed dataset dataframe 16 | dataset_df = get_dataset_dataframe() 17 | #dataset preparation 18 | dataset = dataset_df.values 19 | #dataset scaling 20 | scaler = MinMaxScaler(feature_range=(0, 1)) 21 | dataset = scaler.fit_transform(dataset) 22 | #dataset splitting 23 | training_start_index = 0 24 | training_end_index = 503 25 | testing_start_index = 504 26 | testing_end_index = 755 27 | X_train = dataset[training_start_index:training_end_index+1, :-1] 28 | Y_train = dataset[training_start_index:training_end_index+1, -1] 29 | X_test = dataset[testing_start_index:testing_end_index+1, :-1] 30 | Y_test = dataset[testing_start_index:testing_end_index+1, -1] 31 | #Feed Forward NN model 32 | model = Sequential() 33 | model.add(Dense(20, input_dim=4, activation='relu', kernel_initializer='normal')) 34 | model.add(Dense(10, activation='relu', kernel_initializer='normal')) 35 | model.add(Dense(1, kernel_initializer='normal')) 36 | model.compile(loss='mse', optimizer='adam') 37 | #fitting the training data 38 | history = model.fit(X_train, Y_train, epochs=200, batch_size=int(X_train.shape[0]/8), 39 | validation_split=0.2, verbose=2, shuffle=False) 40 | #evaluating the testing data 41 | results = model.evaluate(X_test, Y_test) 42 | results_names = model.metrics_names 43 | print(results_names, ":", results) 44 | #predictions 45 | predictions_scaled = model.predict(X_test) 46 | test_dataset_scaled = np.concatenate((X_test, predictions_scaled), axis=1) 47 | test_dataset_unscaled = scaler.inverse_transform(test_dataset_scaled) 48 | predictions_unscaled = test_dataset_unscaled[:, -1] 49 | #actual values 50 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 51 | test_dataset_scaled = np.concatenate((X_test, Y_test), axis=1) 52 | test_dataset_unscaled = scaler.inverse_transform(test_dataset_scaled) 53 | Y_test_unscaled = test_dataset_unscaled[:, -1] 54 | #evaluation 55 | rmse = (mean_squared_error(predictions_unscaled, Y_test_unscaled) ** 0.5) 56 | print('Test RMSE: %.3f' %(rmse)) 57 | correlation = np.corrcoef(predictions_unscaled, Y_test_unscaled) 58 | print("Correlation: %.3f"%(correlation[0, 1])) 59 | #plotting 60 | _, (ax1, ax2) = plt.subplots(2,1) 61 | ax1.plot(history.history['loss'], label='Training') 62 | ax1.plot(history.history['val_loss'], label='Validation') 63 | ax1.set_xlabel('Epoch #') 64 | ax1.set_ylabel('Loss') 65 | ax1.legend(loc='best') 66 | ax1.grid(True) 67 | 68 | ax2.plot(range(len(predictions_unscaled)), predictions_unscaled, label='Prediction') 69 | ax2.plot(range(len(Y_test_unscaled)), Y_test_unscaled, label='Actual') 70 | ax2.set_xlabel('Trading Day') 71 | ax2.set_ylabel('Price') 72 | ax2.legend(loc='best') 73 | ax2.grid(True) 74 | 75 | plt.show() 76 | 77 | '''to ensure running the tester function only when this file is run, not imported 78 | ''' 79 | if __name__ == "__main__": 80 | main()
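 81 | 82 | '''a helper sketch (an addition for illustration, not in the original project): the 83 | unscaling steps in main() work by padding the scaled predictions back into a full-width 84 | matrix, so that scaler.inverse_transform sees the same shape it was fit on; the same 85 | idea as a small reusable function, with an illustrative name 86 | ''' 87 | def inverse_transform_target(scaler, scaled_target, scaled_features): 88 | #pad the (m, 1) scaled target back into an (m, features+1) matrix, 89 | #invert the scaling, then keep only the target (last) column 90 | padded = np.concatenate((scaled_features, scaled_target.reshape(-1, 1)), axis=1) 91 | return scaler.inverse_transform(padded)[:, -1] 92 | #e.g.: predictions_unscaled = inverse_transform_target(scaler, predictions_scaled, X_test)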
-------------------------------------------------------------------------------- /machine_learning/development/keras_lstm.py: -------------------------------------------------------------------------------- 1 | ''' this file uses a keras LSTM RNN to predict stock prices 2 | one trading week in advance 3 | ''' 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from keras.models import Sequential 7 | from keras.layers import LSTM, Dense 8 | from sklearn.preprocessing import MinMaxScaler 9 | from sklearn.metrics import mean_squared_error 10 | from machine_learning.development.dataset_preprocessing import get_dataset_dataframe 11 | 12 | '''a tester function 13 | ''' 14 | def main(): 15 | #getting the preprocessed dataset dataframe 16 | dataset_df = get_dataset_dataframe() 17 | #dataset preparation 18 | dataset = dataset_df.values 19 | #dataset scaling 20 | scaler = MinMaxScaler(feature_range=(0, 1)) 21 | dataset = scaler.fit_transform(dataset) 22 | training_start_index = 0 23 | training_end_index = 503 24 | testing_start_index = 504 25 | testing_end_index = 755 26 | #dataset splitting 27 | X_train = dataset[training_start_index:training_end_index+1, :-1] 28 | Y_train = dataset[training_start_index:training_end_index+1, -1] 29 | X_test = dataset[testing_start_index:testing_end_index+1, :-1] 30 | Y_test = dataset[testing_start_index:testing_end_index+1, -1] 31 | print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape) 32 | #reshaping the dataset for the LSTM RNN 33 | X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1])) 34 | X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1])) 35 | print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape) 36 | (samples, timesteps, features) = X_train.shape 37 | #LSTM RNN model 38 | model = Sequential() 39 | model.add(LSTM(100, input_shape=(timesteps, features))) 40 | model.add(Dense(1)) 41 | model.compile(loss='mse', optimizer='adam') 42 | #fitting the training data 43 | history = model.fit(X_train, Y_train, epochs=200, batch_size=int(samples/8), 44 | validation_split=0.2, verbose=2, shuffle=False) 45 | #evaluating the testing data 46 | results = model.evaluate(X_test, Y_test) 47 | results_names = model.metrics_names 48 | print("Test", results_names, ":", results) 49 | #predictions 50 | predictions_scaled = model.predict(X_test) 51 | X_test = X_test.reshape((X_test.shape[0], X_test.shape[2])) 52 | test_dataset_scaled = np.concatenate((X_test, predictions_scaled), axis=1) 53 | test_dataset_unscaled = scaler.inverse_transform(test_dataset_scaled) 54 | predictions_unscaled = test_dataset_unscaled[:, -1] 55 | #actual values 56 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 57 | test_dataset_scaled = np.concatenate((X_test, Y_test), axis=1) 58 | test_dataset_unscaled = scaler.inverse_transform(test_dataset_scaled) 59 | Y_test_unscaled = test_dataset_unscaled[:, -1] 60 | #evaluation 61 | rmse = (mean_squared_error(predictions_unscaled, Y_test_unscaled) ** 0.5) 62 | print('Test RMSE: %.3f' %(rmse)) 63 | correlation = np.corrcoef(predictions_unscaled, Y_test_unscaled) 64 | print("Correlation: %.3f"%(correlation[0, 1])) 65 | #plots 66 | _, (ax1, ax2) = plt.subplots(2,1) 67 | ax1.plot(history.history['loss'], label='Training') 68 | ax1.plot(history.history['val_loss'], label='Validation') 69 | ax1.set_xlabel('Epoch #') 70 | ax1.set_ylabel('Loss') 71 | ax1.legend(loc='best') 72 | ax1.grid(True) 73 | 74 | ax2.plot(range(len(predictions_unscaled)), predictions_unscaled, label='Prediction') 75 |
ax2.plot(range(len(Y_test_unscaled)), Y_test_unscaled, label='Actual') 76 | ax2.set_xlabel('Trading Day') 77 | ax2.set_ylabel('Price') 78 | ax2.legend(loc='best') 79 | ax2.grid(True) 80 | 81 | plt.show() 82 | 83 | '''to ensure running the tester function only when this file is run, not imported 84 | ''' 85 | if __name__ == "__main__": 86 | main() -------------------------------------------------------------------------------- /machine_learning/development/knn_regression.py: -------------------------------------------------------------------------------- 1 | ''' this file shows an implementation of kNN regression to 2 | predict stock prices one trading week in advance 3 | ''' 4 | from utils.util import get_data, plot_data 5 | from machine_learning.development.dataset_preprocessing import get_dataset_dataframe 6 | from machine_learning.development.linear_regression import calculate_rmse 7 | import machine_learning.development.knn_wrapper as knn 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | '''a tester function 12 | ''' 13 | def main(): 14 | #getting the preprocessed dataset dataframe 15 | dataset_df = get_dataset_dataframe() 16 | #dataset preparation 17 | dataset = dataset_df.values 18 | #dataset splitting 19 | training_start_index = 0 20 | training_end_index = 503 21 | testing_start_index = 504 22 | testing_end_index = 755 23 | X_train = dataset[training_start_index:training_end_index+1, :-1] 24 | Y_train = dataset[training_start_index:training_end_index+1, -1] 25 | X_test = dataset[testing_start_index:testing_end_index+1, :-1] 26 | Y_test = dataset[testing_start_index:testing_end_index+1, -1] 27 | #kNN model 28 | model = knn.knn(3) 29 | #fitting the training data 30 | model.train(X_train, Y_train) 31 | #predictions 32 | predictions = model.query(X_test) 33 | #evaluation 34 | rmse = calculate_rmse(predictions, Y_test) #calculate_rmse already takes the square root 35 | print('Test RMSE: %.3f' %(rmse)) 36 | correlation = np.corrcoef(predictions, Y_test) 37 | print("Correlation: %.3f"%(correlation[0, 1])) 38 | #plotting 39 | _, ax = plt.subplots() 40 | ax.plot(range(len(predictions)), predictions, label='Prediction') 41 | ax.plot(range(len(Y_test)), Y_test, label='Actual') 42 | ax.set_xlabel('Trading Day') 43 | ax.set_ylabel('Price') 44 | ax.legend(loc='best') 45 | ax.grid(True) 46 | 47 | plt.show() 48 | 49 | '''to ensure running the tester function only when this file is run, not imported 50 | ''' 51 | if __name__ == "__main__": 52 | main() -------------------------------------------------------------------------------- /machine_learning/development/knn_wrapper.py: -------------------------------------------------------------------------------- 1 | ''' this file contains an implementation of kNN regression 2 | ''' 3 | import numpy as np 4 | 5 | '''kNN wrapper class 6 | 7 | *k : k nearest neighbors to be considered 8 | *dataset : training dataset including the features and the output 9 | ''' 10 | class knn: 11 | __k = 0 12 | __dataset = None 13 | 14 | '''constructor function 15 | 16 | *k : k nearest neighbors to be considered 17 | ''' 18 | def __init__(self, k): 19 | self.__k = k 20 | 21 | '''training function 22 | 23 | *data_x : training dataset features 24 | *data_y : training dataset output 25 | ''' 26 | def train(self, data_x, data_y): 27 | data_y_reshaped = data_y.reshape((data_y.shape[0], 1)) 28 | self.__dataset = np.concatenate((data_x, data_y_reshaped), axis=1) 29 | 30 | '''querying/evaluating function 31 | 32 | *features : test dataset features 33 | ''' 34 | def query(self, features,
normalize=True, addDiff=True): 35 | dataset_price_normed = self.__dataset[:, 0] 36 | features_price_normed = features[:, 0] 37 | 38 | if normalize: 39 | dataset_price_normed = (self.__dataset[:, 0]/self.__dataset[0, 0]) - 1 40 | features_price_normed = (features[:, 0]/features[0, 0]) - 1 41 | 42 | cumm_difference = np.zeros(features.shape[0]) 43 | predicted_price = np.zeros(features.shape[0]) 44 | 45 | for i in range(0, features.shape[0]): 46 | 47 | price_normed_difference = np.absolute(dataset_price_normed - features_price_normed[i]) 48 | moment_difference = np.absolute(self.__dataset[:, 1] - features[i, 1]) 49 | sma_difference = np.absolute(self.__dataset[:, 2] - features[i, 2]) 50 | b_band_difference = np.absolute(self.__dataset[:, 3] - features[i, 3]) 51 | 52 | cumm_difference = price_normed_difference + moment_difference + sma_difference + b_band_difference 53 | difference_op = np.asarray([cumm_difference, self.__dataset[:, -1]]).T 54 | sorting_index = np.argsort(difference_op[:, 0]) 55 | difference_sorted = difference_op[sorting_index] 56 | 57 | k_mean = np.mean(difference_sorted[:self.__k, 1]) 58 | predicted_price[i] = k_mean 59 | 60 | if addDiff: 61 | predicted_price += (features[0, 0] - self.__dataset[0, 0]) 62 | return predicted_price -------------------------------------------------------------------------------- /machine_learning/development/linear_regression.py: -------------------------------------------------------------------------------- 1 | ''' this file shows an implementation of linear regression to 2 | predict stock prices one trading week in advance. SciPy's 3 | minimize function is used to optimize the fitted linear line 4 | coefficients 5 | ''' 6 | from utils.util import get_data 7 | import numpy as np 8 | import scipy.optimize as spo 9 | import matplotlib.pyplot as plt 10 | from machine_learning.development.dataset_preprocessing import get_dataset_dataframe 11 | 12 | '''computes and returns the root mean squared error 13 | 14 | *x : a dynamic variable: (value, array, ...) 15 | *y : a dynamic variable: (value, array, ...) 
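 e.g. (an illustrative, editor-added example): for x = np.array([1., 2., 3.]) and y = np.array([1., 2., 5.]), the squared errors are [0, 0, 4], MSE = 4/3, and RMSE = sqrt(4/3) ≈ 1.155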
16 | ''' 17 | def calculate_rmse(x, y): 18 | #squared error 19 | se = (x-y) ** 2 20 | #mean squared error 21 | mse = np.mean(se) 22 | #root mean squared error 23 | rmse = mse ** 0.5 24 | return rmse 25 | 26 | '''given the fitted line coefficients and the dataset, this 27 | function computes the rmse between the actual values and 28 | the predicted values of the six-feature linear regression (price, momentum, sma, bollinger band, volatility, vroc) 29 | 30 | *coefficients : fitted line coefficients array 31 | *data : dataset containing the features and the output 32 | ''' 33 | def new_error_fun(coefficients, data): 34 | price = coefficients[0]*data[:, 0] 35 | moment = coefficients[1]*data[:, 1] 36 | sma = coefficients[2]*data[:, 2] 37 | b_band = coefficients[3]*data[:, 3] 38 | std = coefficients[4]*data[:, 4] 39 | vroc = coefficients[5]*data[:, 5] 40 | constant = coefficients[6] 41 | predicted_values = price+moment+sma+b_band+std+vroc+constant 42 | actual_values = data[:, -1] 43 | rmse = calculate_rmse(predicted_values, actual_values) 44 | return rmse 45 | 46 | '''given the fitted line coefficients and the dataset, this 47 | function computes the rmse between the actual values and 48 | the predicted values of the four-feature linear regression (price, momentum, sma, bollinger band) 49 | 50 | *coefficients : fitted line coefficients array 51 | *data : dataset containing the features and the output 52 | ''' 53 | def error_fun(coefficients, data): 54 | price = coefficients[0]*data[:, 0] 55 | moment = coefficients[1]*data[:, 1] 56 | sma = coefficients[2]*data[:, 2] 57 | b_band = coefficients[3]*data[:, 3] 58 | constant = coefficients[4] 59 | predicted_values = price+moment+sma+b_band+constant 60 | actual_values = data[:, -1] 61 | rmse = calculate_rmse(predicted_values, actual_values) 62 | return rmse 63 | 64 | '''given the data to be passed to the error fcn, this function 65 | computes an initial guess of the coefficients and uses SciPy's 66 | minimize fcn and the error fcn to find the optimal coefficients 67 | 68 | *data : dataset containing the features and the output 69 | *err_fun : error function to be minimized by SciPy's minimizer 70 | ''' 71 | def minimize_new_err_fun(data, err_fun): 72 | price = np.mean(data[:, 0]) 73 | moment = np.mean(data[:, 1]) 74 | sma = np.mean(data[:, 2]) 75 | b_band = np.mean(data[:, 3]) 76 | std = np.mean(data[:, 4]) 77 | vroc = np.mean(data[:, 5]) 78 | constant = 0 79 | coefficients_guess = [price, moment, sma, b_band, std, vroc, constant] 80 | result = spo.minimize(err_fun, coefficients_guess, args=(data, ), method="SLSQP", options= {'disp' : True}) 81 | return result.x 82 | 83 | '''given the data to be passed to the error fcn, this function 84 | computes an initial guess of the coefficients and uses SciPy's 85 | minimize fcn and the error fcn to find the optimal coefficients 86 | 87 | *data : dataset containing the features and the output 88 | *err_fun : error function to be minimized by SciPy's minimizer 89 | ''' 90 | def minimize_err_fun(data, err_fun): 91 | price = np.mean(data[:, 0]) 92 | moment = np.mean(data[:, 1]) 93 | sma = np.mean(data[:, 2]) 94 | b_band = np.mean(data[:, 3]) 95 | constant = 0 96 | coefficients_guess = [price, moment, sma, b_band, constant] 97 | result = spo.minimize(err_fun, coefficients_guess, args=(data, ), method="SLSQP", options= {'disp' : True}) 98 | return result.x 99 | 100 | '''a normalization fcn 101 | 102 | *values : values to be normalized 103 | *mean : mean of the values 104 | *std : standard deviation of the values 105 | ''' 106 | def normalize(values, mean, std): 107 | return (values - mean) / std 108 | 109 | '''an inverse-normalization fcn 110 | 111 | *values :
normalized values 112 | *mean : mean of the normalized values 113 | *std : standard deviation of the normalized values 114 | ''' 115 | def inverse_normalize(normalized_values, mean, std): 116 | return (normalized_values * std) + mean 117 | 118 | '''a tester function 119 | ''' 120 | def main(): 121 | #getting the preprocessed dataset dataframe 122 | dataset_df = get_dataset_dataframe() 123 | #dataset preparation 124 | dataset = dataset_df.values 125 | #dataset normalization 126 | '''mean = np.mean(dataset, axis=0) 127 | std = np.std(dataset, axis=0) 128 | dataset_normalized = normalize(dataset, mean, std) 129 | ''' 130 | #dataset splitting 131 | training_start_index = 0 132 | training_end_index = 503 133 | testing_start_index = 504 134 | testing_end_index = 755 135 | training_set = dataset[training_start_index:training_end_index+1, :] 136 | X_test = dataset[testing_start_index:testing_end_index+1, :-1] 137 | Y_test = dataset[testing_start_index:testing_end_index+1, -1] 138 | #training 139 | fitted_line_coefficients = minimize_err_fun(training_set, error_fun) 140 | print("Line Coefficients:", fitted_line_coefficients) 141 | #testing 142 | price = fitted_line_coefficients[0]*X_test[:, 0] 143 | moment = fitted_line_coefficients[1]*X_test[:, 1] 144 | sma = fitted_line_coefficients[2]*X_test[:, 2] 145 | b_band = fitted_line_coefficients[3]*X_test[:, 3] 146 | constant = fitted_line_coefficients[4] 147 | predicted_values = price+moment+sma+b_band+constant 148 | #evaluation 149 | rmse = calculate_rmse(predicted_values, Y_test) 150 | print('RMSE: %.3f' %(rmse)) 151 | correlation = np.corrcoef(predicted_values, Y_test) 152 | print("Correlation: %.3f"%(correlation[0, 1])) 153 | #plots 154 | _, ax = plt.subplots() 155 | ax.plot(range(len(predicted_values)), predicted_values, label='Prediction') 156 | ax.plot(range(len(Y_test)), Y_test, label='Actual') 157 | ax.set_xlabel('Trading Day') 158 | ax.set_ylabel('Price') 159 | ax.legend(loc='best') 160 | ax.grid(True) 161 | plt.show() 162 | 163 | '''to ensure running the tester function only when this file is run, not imported 164 | ''' 165 | if __name__ == "__main__": 166 | main() -------------------------------------------------------------------------------- /machine_learning/development/new_evaluation.md: -------------------------------------------------------------------------------- 1 | ## Algorithms Evaluation 2 | 3 | | Variable | Value | Description | 4 | | :--------- | :---------- | :---------- | 5 | | stock | ^GSPC | S&P 500 | 6 | | start date | 1950-01-01 | stock historical data start date | 7 | | end date | 2017-12-31 | stock historical data end date | 8 | | window | 2 | window for computing rolling statistics | 9 | | future gap | 1, 5, 20 | how far (trading days) into the future is the prediction | 10 | | split | 0.8 | training-testing dataset split | 11 | 12 | ### Evaluation metrics 13 | *metrics are applied on the normalized dataset, where the values are in the range [0, 1]* 14 | 15 | 1. Loss 16 | *RMSE : accumulation of all errors, RMSE value represents dollar value 17 | *MAPE : accumulation of all error percentages, MAPE value represents percentage value 18 | 19 | 2. 
Accuracy 20 | *Correlation : linear relationship between predictions and actual values, range: [-1, 1] 21 | *r-squared : how close predictions are to actual prices, range: [0, 1] 22 | 23 | * Optimized LSTM 24 | ```sh 25 | python -m machine_learning.development.optimized_lstm.lstm_main 26 | ``` 27 | | Future Gap | RMSE | MAPE | Corr | R^2 | 28 | | :--------: | :--: | :--: | :--: | :--: | 29 | | 1 day | 0.007| 1.033| 0.999| 0.998| 30 | | 1 week | 0.012| 1.642| 0.998| 0.995| 31 | | 1 month | 0.026| 3.708| 0.992| 0.972| 32 | 33 | *shown below is a 1 trading day future gap* 34 | 35 | ![Optimized LSTM](https://github.com/ahmedhamdi96/ML4T/blob/master/results/optimized_lstm.png) 36 | 37 | * Optimized FFNN 38 | ```sh 39 | python -m machine_learning.development.optimized_ffnn.ffnn_main 40 | ``` 41 | | Future Gap | RMSE | MAPE | Corr | R^2 | 42 | | :--------: | :--: | :--: | :--: | :-: | 43 | | 1 day | 0.009| 1.401| 0.999| 0.997| 44 | | 1 week | 0.015| 2.108| 0.998| 0.992| 45 | | 1 month | 0.021| 3.014| 0.992| 0.984| 46 | 47 | *shown below is a 1 trading day future gap* 48 | 49 | ![Optimized FFNN](https://github.com/ahmedhamdi96/ML4T/blob/master/results/optimized_ffnn.png) 50 | 51 | ## Hyperparameter Tuning 52 | 53 | * LSTM 54 | ```sh 55 | python -m machine_learning.development.optimized_lstm.hyperparam_tune_main 56 | ``` 57 | *Time Elapsed: 25 hours* 58 | 59 | | Hyperparameter | Optimal Value | 60 | | :------------: | :-----------: | 61 | | Dropout | 0.2 | 62 | | Neurons | [256, 256, 32, 1] | 63 | | Decay | 0.1 | 64 | | Time Steps | 5 | 65 | | Batch Size | 2048 | 66 | | Epochs | 300 | 67 | 68 | ![LSTM Hyperparam Tune 1](https://github.com/ahmedhamdi96/ML4T/blob/master/results/hyperparam_tune_lstm1.png) 69 | ![LSTM Hyperparam Tune 2](https://github.com/ahmedhamdi96/ML4T/blob/master/results/hyperparam_tune_lstm2.png) 70 | 71 | * FFNN 72 | ```sh 73 | python -m machine_learning.development.optimized_ffnn.ffnn_hyperparam_tune_main 74 | ``` 75 | *Time Elapsed: 5.3 minutes* 76 | 77 | | Hyperparameter | Optimal Value | 78 | | :------------: | :-----------: | 79 | | Dropout | 0.8 | 80 | | Neurons | [256, 256, 64, 1] | 81 | | Decay | 0.1 | 82 | | Batch Size | 128 | 83 | | Epochs | 200 | 84 | 85 | ![FFNN Hyperparam Tune 1](https://github.com/ahmedhamdi96/ML4T/blob/master/results/hyperparam_tune_ffnn1.png) 86 | ![FFNN Hyperparam Tune 2](https://github.com/ahmedhamdi96/ML4T/blob/master/results/hyperparam_tune_ffnn2.png) -------------------------------------------------------------------------------- /machine_learning/development/new_regression/lin_reg.py: -------------------------------------------------------------------------------- 1 | from utils.util import get_stock_data 2 | import machine_learning.development.dataset_preprocessing as dpp 3 | import machine_learning.development.linear_regression as lin_reg 4 | from machine_learning.development.new_regression.new_dataset import compute_mape 5 | from sklearn.preprocessing import MinMaxScaler 6 | import numpy as np 7 | from sklearn.metrics import mean_squared_error, mean_absolute_error 8 | from sklearn.metrics import r2_score 9 | 10 | def bulid_new_TIs_dataset(stock_symbol, start_date, end_date, window, normalize=True): 11 | cols = ["Date", "Adj Close", "Volume"] 12 | df = get_stock_data(stock_symbol, start_date, end_date, cols) 13 | df.rename(columns={"Adj Close" : 'price'}, inplace=True) 14 | df['momentum'] = dpp.compute_momentum_ratio(df['price'], window) 15 | df['sma'] = dpp.compute_sma_ratio(df['price'], window) 16 | df['bolinger_band'] = 
dpp.compute_bollinger_bands_ratio(df['price'], window) 17 | df['volatility'] = dpp.compute_volatility_ratio(df['price'], window) 18 | df['vroc'] = dpp.compute_vroc_ratio(df['Volume'], window) 19 | df['actual_price'] = df['price'] 20 | df.drop(columns=["Volume"], inplace=True) 21 | df = df[window:] 22 | df.replace([np.inf, -np.inf], np.nan, inplace=True) 23 | df.fillna(method='ffill', inplace=True) 24 | df.fillna(method='bfill', inplace=True) 25 | scaler = None 26 | 27 | if normalize: 28 | scaler = MinMaxScaler() 29 | df['price'] = scaler.fit_transform(df['price'].values.reshape(-1,1)) 30 | df['momentum'] = scaler.fit_transform(df['momentum'].values.reshape(-1,1)) 31 | df['sma'] = scaler.fit_transform(df['sma'].values.reshape(-1,1)) 32 | df['bolinger_band'] = scaler.fit_transform(df['bolinger_band'].values.reshape(-1,1)) 33 | df['volatility'] = scaler.fit_transform(df['volatility'].values.reshape(-1,1)) 34 | df['vroc'] = scaler.fit_transform(df['vroc'].values.reshape(-1,1)) 35 | df['actual_price'] = scaler.fit_transform(df['actual_price'].values.reshape(-1,1)) 36 | 37 | print(df.head()) 38 | print(df.tail()) 39 | return df, scaler 40 | 41 | def dataset_reshape(dataset, future_gap, split): 42 | print("Dataset Shape:", dataset.shape) 43 | X = dataset[:, :-1] 44 | Y = dataset[:, -1] 45 | print("X Shape:", X.shape) 46 | print("Y Shape:", Y.shape) 47 | 48 | print("Applying Future Gap...") 49 | X = X[:-future_gap] 50 | Y = Y[future_gap:] 51 | print("X Shape:", X.shape) 52 | print("Y Shape:", Y.shape) 53 | 54 | if split != None: 55 | print("Applying training, testing split...") 56 | split_index = int(split*X.shape[0]) 57 | X_train = X[:split_index] 58 | X_test = X[split_index:] 59 | Y_train = Y[:split_index] 60 | Y_test = Y[split_index:] 61 | print("(X_train, Y_train, X_test, Y_test) Shapes:") 62 | print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape) 63 | return X_train, Y_train, X_test, Y_test 64 | 65 | return X, Y 66 | 67 | def evaluate(Y_test, predictions, Y_test_inv_scaled, predictions_inv_scaled): 68 | rmse = (mean_squared_error(Y_test, predictions) ** 0.5) 69 | print('\nNormalized RMSE: %.3f' %(rmse)) 70 | nrmse = ((mean_squared_error(Y_test, predictions) ** 0.5))/np.mean(Y_test) 71 | print('Normalized NRMSE: %.3f' %(nrmse)) 72 | mae = mean_absolute_error(Y_test, predictions) 73 | print('Normalized MAE: %.3f' %(mae)) 74 | mape = compute_mape(Y_test, predictions) 75 | print('Normalized MAPE: %.3f' %(mape)) 76 | correlation = np.corrcoef(Y_test.T, predictions.T) 77 | print("Normalized Correlation: %.3f"%(correlation[0, 1])) 78 | r2 = r2_score(Y_test, predictions) 79 | print("Normalized r^2: %.3f"%(r2)) 80 | normalized_metrics = [rmse, nrmse, mae, mape, correlation[0, 1], r2] 81 | 82 | #evaluating the model on the inverse-normalized dataset 83 | rmse = (mean_squared_error(Y_test_inv_scaled, predictions_inv_scaled) ** 0.5) 84 | print('\nInverse-Normalized Outsample RMSE: %.3f' %(rmse)) 85 | nrmse = ((mean_squared_error(Y_test_inv_scaled, predictions_inv_scaled) ** 0.5))/np.mean(Y_test) 86 | print('Normalized NRMSE: %.3f' %(nrmse)) 87 | mae = mean_absolute_error(Y_test_inv_scaled, predictions_inv_scaled) 88 | print('Normalized MAE: %.3f' %(mae)) 89 | mape = compute_mape(Y_test_inv_scaled, predictions_inv_scaled) 90 | print('Inverse-Normalized Outsample MAPE: %.3f' %(mape)) 91 | correlation = np.corrcoef(Y_test_inv_scaled.T, predictions_inv_scaled.T) 92 | print("Inverse-Normalized Outsample Correlation: %.3f"%(correlation[0, 1])) 93 | r2 = r2_score(Y_test_inv_scaled, 
predictions_inv_scaled) 94 | print("Inverse-Normalized Outsample r^2: %.3f"%(r2)) 95 | inv_normalized_metrics = [rmse, nrmse, mae, mape, correlation[0, 1], r2] 96 | 97 | return normalized_metrics, inv_normalized_metrics 98 | 99 | def final_test_linreg(stock_symbol, start_date, end_date, window, future_gap): 100 | #building the dataset 101 | print("> building the dataset...") 102 | df_train, _ = bulid_new_TIs_dataset(stock_symbol, None, start_date, window) 103 | df_test, scaler = bulid_new_TIs_dataset(stock_symbol, start_date, end_date, window) 104 | #reshaping the dataset for LinReg 105 | print("\n> reshaping the dataset for LinReg...") 106 | ds_train = df_train.values 107 | ds_test = df_test.values 108 | X_train, Y_train = dataset_reshape(ds_train, future_gap, None) 109 | X_test, Y_test = dataset_reshape(ds_test, future_gap, None) 110 | #fitting the training data 111 | print("\n> fitting the training data...") 112 | Y_train = Y_train.reshape((Y_train.shape[0], 1)) 113 | training_set = np.concatenate((X_train, Y_train), axis=1) 114 | fitted_line_coefficients = lin_reg.minimize_new_err_fun(training_set, lin_reg.new_error_fun) #six-feature error function 115 | print("Line Coefficients:", fitted_line_coefficients) 116 | #predictions 117 | price = fitted_line_coefficients[0]*X_test[:, 0] 118 | moment = fitted_line_coefficients[1]*X_test[:, 1] 119 | sma = fitted_line_coefficients[2]*X_test[:, 2] 120 | b_band = fitted_line_coefficients[3]*X_test[:, 3] 121 | std = fitted_line_coefficients[4]*X_test[:, 4] 122 | vroc = fitted_line_coefficients[5]*X_test[:, 5] 123 | constant = fitted_line_coefficients[6] #the constant is the last of the seven coefficients 124 | predictions = price+moment+sma+b_band+std+vroc+constant 125 | #inverse-scaling 126 | print("\n> inverse-scaling the scaled values...") 127 | predictions = predictions.reshape((predictions.shape[0], 1)) 128 | predictions_inv_scaled = scaler.inverse_transform(predictions) 129 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 130 | Y_test_inv_scaled = scaler.inverse_transform(Y_test) 131 | #evaluation 132 | normalized_metrics, inv_normalized_metrics = evaluate(Y_test, predictions, 133 | Y_test_inv_scaled, predictions_inv_scaled) 134 | #grouping the actual prices and predictions 135 | print("\n> grouping the actual prices and predictions...") 136 | feature_cols = df_test.columns.tolist() 137 | feature_cols.remove("actual_price") 138 | df_test.drop(columns=feature_cols, inplace=True) 139 | df_test.rename(columns={"actual_price" : 'Actual'}, inplace=True) 140 | df_test = df_test.iloc[future_gap:] 141 | df_test['Actual'] = Y_test_inv_scaled 142 | df_test['Prediction'] = predictions_inv_scaled 143 | 144 | return normalized_metrics, inv_normalized_metrics, df_test -------------------------------------------------------------------------------- /machine_learning/development/new_regression/new_dataset.py: -------------------------------------------------------------------------------- 1 | from utils.util import get_stock_data 2 | import machine_learning.development.dataset_preprocessing as dpp 3 | from sklearn.preprocessing import MinMaxScaler 4 | import numpy as np 5 | 6 | def compute_mape(y_true, y_pred): 7 | return np.mean(np.abs((y_true - y_pred) / y_true)) * 100 8 | 9 | def bulid_TIs_dataset(stock_symbol, start_date, end_date, window, normalize=True): 10 | cols = ["Date", "Adj Close"] 11 | df = get_stock_data(stock_symbol, start_date, end_date, cols) 12 | df.rename(columns={"Adj Close" : 'price'}, inplace=True) 13 | df['momentum'] = dpp.compute_momentum_ratio(df['price'], window) 14 | df['sma'] = dpp.compute_sma_ratio(df['price'],
window) 15 | df['bolinger_band'] = dpp.compute_bollinger_bands_ratio(df['price'], window) 16 | df['actual_price'] = df['price'] 17 | df = df[window:] 18 | scaler = None 19 | 20 | if normalize: 21 | scaler = MinMaxScaler() 22 | df['price'] = scaler.fit_transform(df['price'].values.reshape(-1,1)) 23 | df['momentum'] = scaler.fit_transform(df['momentum'].values.reshape(-1,1)) 24 | df['sma'] = scaler.fit_transform(df['sma'].values.reshape(-1,1)) 25 | df['bolinger_band'] = scaler.fit_transform(df['bolinger_band'].values.reshape(-1,1)) 26 | df['actual_price'] = scaler.fit_transform(df['actual_price'].values.reshape(-1,1)) 27 | 28 | print(df.head(10)) 29 | print(df.tail(10)) 30 | return df, scaler 31 | 32 | def dataset_reshape(dataset, future_gap, split): 33 | print("Dataset Shape:", dataset.shape) 34 | X = dataset[:, :-1] 35 | Y = dataset[:, -1] 36 | print("X Shape:", X.shape) 37 | print("Y Shape:", Y.shape) 38 | 39 | print("Applying Future Gap...") 40 | X = X[:-future_gap] 41 | Y = Y[future_gap:] 42 | print("X Shape:", X.shape) 43 | print("Y Shape:", Y.shape) 44 | 45 | print("Applying training, testing split...") 46 | split_index = int(split*X.shape[0]) 47 | X_train = X[:split_index] 48 | X_test = X[split_index:] 49 | Y_train = Y[:split_index] 50 | Y_test = Y[split_index:] 51 | print("(X_train, Y_train, X_test, Y_test) Shapes:") 52 | print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape) 53 | return X_train, Y_train, X_test, Y_test -------------------------------------------------------------------------------- /machine_learning/development/new_regression/new_knn_regression.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.new_regression import new_dataset as ds 2 | from machine_learning.development.linear_regression import calculate_rmse 3 | import machine_learning.development.knn_wrapper as knn 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from sklearn.metrics import r2_score 7 | 8 | #building the dataset 9 | print("> building the dataset...") 10 | stock_symbol = '^GSPC' 11 | start_date = '1950-01-01' 12 | end_date = '2017-12-31' 13 | window = 5 14 | dataframe, scaler = ds.bulid_TIs_dataset(stock_symbol, start_date, end_date, window) 15 | 16 | #reshaping the dataset 17 | print("\n> reshaping the dataset...") 18 | dataset = dataframe.values 19 | future_gap = 5 #1 trading week 20 | split = 0.8 #80% of the dataset 21 | X_train, Y_train, X_test, Y_test = ds.dataset_reshape(dataset, future_gap, split) 22 | 23 | #kNN model 24 | model = knn.knn(5) 25 | 26 | #fitting the training data 27 | model.train(X_train, Y_train) 28 | 29 | #predictions 30 | predictions = model.query(X_test, normalize=False, addDiff=False) 31 | 32 | #evaluating the model on the normalized dataset 33 | rmse = calculate_rmse(predictions, Y_test) 34 | print('Normalized Test RMSE: %.3f' %(rmse)) 35 | mape = ds.compute_mape(Y_test, predictions) 36 | print('Normalized Outsample MAPE: %.3f' %(mape)) 37 | correlation = np.corrcoef(predictions, Y_test) 38 | print("Normalized Correlation: %.3f"%(correlation[0, 1])) 39 | r2 = r2_score(predictions, Y_test) 40 | print("Normalized Outsample r^2: %.3f"%(r2)) 41 | 42 | #evaluating the model on the inverse-normalized dataset 43 | predictions = predictions.reshape((predictions.shape[0], 1)) 44 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 45 | 46 | predictions_inv_scaled = scaler.inverse_transform(predictions) 47 | Y_test_inv_scaled = scaler.inverse_transform(Y_test) 48 | 49 | rmse = 
calculate_rmse(predictions_inv_scaled, Y_test_inv_scaled) 50 | print('\nInverse-Normalized Outsample RMSE: %.3f' %(rmse)) 51 | mape = ds.compute_mape(Y_test_inv_scaled, predictions_inv_scaled) 52 | print('Inverse-Normalized Outsample MAPE: %.3f' %(mape)) 53 | correlation = np.corrcoef(predictions_inv_scaled.T, Y_test_inv_scaled.T) 54 | print("Inverse-Normalized Outsample Correlation: %.3f"%(correlation[0, 1])) 55 | r2 = r2_score(predictions_inv_scaled, Y_test_inv_scaled) 56 | print("Inverse-Normalized Outsample r^2: %.3f"%(r2)) 57 | 58 | #plotting 59 | _, ax = plt.subplots() 60 | ax.plot(range(len(predictions_inv_scaled)), predictions_inv_scaled, label='Prediction') 61 | ax.plot(range(len(Y_test_inv_scaled)), Y_test_inv_scaled, label='Actual') 62 | ax.set_xlabel('Trading Day') 63 | ax.set_ylabel('Price') 64 | ax.legend(loc='best') 65 | ax.grid(True) 66 | 67 | plt.show() -------------------------------------------------------------------------------- /machine_learning/development/new_regression/new_linear_regression.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.new_regression import new_dataset as ds 2 | import machine_learning.development.linear_regression as lin_reg 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from sklearn.metrics import r2_score 6 | 7 | #building the dataset 8 | print("> building the dataset...") 9 | stock_symbol = '^GSPC' 10 | start_date = '1950-01-01' 11 | end_date = '2017-12-31' 12 | window = 5 13 | dataframe, scaler = ds.bulid_TIs_dataset(stock_symbol, start_date, end_date, window) 14 | 15 | #reshaping the dataset 16 | print("\n> reshaping the dataset...") 17 | dataset = dataframe.values 18 | future_gap = 5 #1 trading week 19 | split = 0.8 #80% of the dataset 20 | X_train, Y_train, X_test, Y_test = ds.dataset_reshape(dataset, future_gap, split) 21 | 22 | #training 23 | Y_train = Y_train.reshape((Y_train.shape[0], 1)) 24 | training_set = np.concatenate((X_train, Y_train), axis=1) 25 | fitted_line_coefficients = lin_reg.minimize_err_fun(training_set, lin_reg.error_fun) 26 | print("Line Coefficients:", fitted_line_coefficients) 27 | 28 | #testing 29 | price = fitted_line_coefficients[0]*X_test[:, 0] 30 | moment = fitted_line_coefficients[1]*X_test[:, 1] 31 | sma = fitted_line_coefficients[2]*X_test[:, 2] 32 | b_band = fitted_line_coefficients[3]*X_test[:, 3] 33 | constant = fitted_line_coefficients[4] 34 | predictions = price+moment+sma+b_band+constant 35 | 36 | #evaluating the model on the normalized dataset 37 | rmse = lin_reg.calculate_rmse(predictions, Y_test) 38 | print('\nNormalized Outsample RMSE: %.3f' %(rmse)) 39 | mape = ds.compute_mape(Y_test, predictions) 40 | print('Normalized Outsample MAPE: %.3f' %(mape)) 41 | correlation = np.corrcoef(predictions, Y_test) 42 | print("Normalized Outsample Correlation: %.3f"%(correlation[0, 1])) 43 | r2 = r2_score(predictions, Y_test) 44 | print("Normalized Outsample r^2: %.3f"%(r2)) 45 | 46 | #evaluating the model on the inverse-normalized dataset 47 | predictions = predictions.reshape((predictions.shape[0], 1)) 48 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 49 | 50 | predictions_inv_scaled = scaler.inverse_transform(predictions) 51 | Y_test_inv_scaled = scaler.inverse_transform(Y_test) 52 | 53 | rmse = lin_reg.calculate_rmse(predictions_inv_scaled, Y_test_inv_scaled) 54 | print('\nInverse-Normalized Outsample RMSE: %.3f' %(rmse)) 55 | mape = ds.compute_mape(Y_test_inv_scaled, predictions_inv_scaled) 56 | 
print('Inverse-Normalized Outsample MAPE: %.3f' %(mape)) 57 | correlation = np.corrcoef(predictions_inv_scaled.T, Y_test_inv_scaled.T) 58 | print("Inverse-Normalized Outsample Correlation: %.3f"%(correlation[0, 1])) 59 | r2 = r2_score(predictions_inv_scaled, Y_test_inv_scaled) 60 | print("Inverse-Normalized Outsample r^2: %.3f"%(r2)) 61 | 62 | #plotting 63 | _, ax = plt.subplots() 64 | ax.plot(range(len(predictions_inv_scaled)), predictions_inv_scaled, label='Prediction') 65 | ax.plot(range(len(Y_test_inv_scaled)), Y_test_inv_scaled, label='Actual') 66 | ax.set_xlabel('Trading Day') 67 | ax.set_ylabel('Price') 68 | ax.legend(loc='best') 69 | ax.grid(True) 70 | 71 | plt.show() -------------------------------------------------------------------------------- /machine_learning/development/optimized_ffnn/ffnn.py: -------------------------------------------------------------------------------- 1 | from utils.util import get_stock_data 2 | import machine_learning.development.dataset_preprocessing as dpp 3 | import numpy as np 4 | from keras.models import Sequential 5 | from keras.layers.core import Dense, Dropout 6 | from keras.optimizers import Adam 7 | from sklearn.preprocessing import MinMaxScaler 8 | from machine_learning.development.new_regression.new_dataset import compute_mape 9 | from sklearn.metrics import mean_squared_error, mean_absolute_error 10 | from sklearn.metrics import r2_score 11 | 12 | def bulid_TIs_dataset(stock_symbol, start_date, end_date, window, normalize=True): 13 | cols = ["Date", "Adj Close"] 14 | df = get_stock_data(stock_symbol, start_date, end_date, cols) 15 | df.rename(columns={"Adj Close" : 'price'}, inplace=True) 16 | df['momentum'] = dpp.compute_momentum_ratio(df['price'], window) 17 | df['sma'] = dpp.compute_sma_ratio(df['price'], window) 18 | df['bolinger_band'] = dpp.compute_bollinger_bands_ratio(df['price'], window) 19 | df['actual_price'] = df['price'] 20 | df = df[window:] 21 | scaler = None 22 | 23 | if normalize: 24 | scaler = MinMaxScaler() 25 | df['price'] = scaler.fit_transform(df['price'].values.reshape(-1,1)) 26 | df['momentum'] = scaler.fit_transform(df['momentum'].values.reshape(-1,1)) 27 | df['sma'] = scaler.fit_transform(df['sma'].values.reshape(-1,1)) 28 | df['bolinger_band'] = scaler.fit_transform(df['bolinger_band'].values.reshape(-1,1)) 29 | df['actual_price'] = scaler.fit_transform(df['actual_price'].values.reshape(-1,1)) 30 | 31 | print(df.head()) 32 | print(df.tail()) 33 | return df, scaler 34 | 35 | def bulid_new_TIs_dataset(stock_symbol, start_date, end_date, window, normalize=True): 36 | cols = ["Date", "Adj Close", "Volume"] 37 | df = get_stock_data(stock_symbol, start_date, end_date, cols) 38 | df.rename(columns={"Adj Close" : 'price'}, inplace=True) 39 | df['momentum'] = dpp.compute_momentum_ratio(df['price'], window) 40 | df['sma'] = dpp.compute_sma_ratio(df['price'], window) 41 | df['bolinger_band'] = dpp.compute_bollinger_bands_ratio(df['price'], window) 42 | df['volatility'] = dpp.compute_volatility_ratio(df['price'], window) 43 | df['vroc'] = dpp.compute_vroc_ratio(df['Volume'], window) 44 | df['actual_price'] = df['price'] 45 | df.drop(columns=["Volume"], inplace=True) 46 | df = df[window:] 47 | df.replace([np.inf, -np.inf], np.nan, inplace=True) 48 | df.fillna(method='ffill', inplace=True) 49 | df.fillna(method='bfill', inplace=True) 50 | scaler = None 51 | 52 | if normalize: 53 | scaler = MinMaxScaler() 54 | df['price'] = scaler.fit_transform(df['price'].values.reshape(-1,1)) 55 | df['momentum'] = 
scaler.fit_transform(df['momentum'].values.reshape(-1,1)) 56 | df['sma'] = scaler.fit_transform(df['sma'].values.reshape(-1,1)) 57 | df['bolinger_band'] = scaler.fit_transform(df['bolinger_band'].values.reshape(-1,1)) 58 | df['volatility'] = scaler.fit_transform(df['volatility'].values.reshape(-1,1)) 59 | df['vroc'] = scaler.fit_transform(df['vroc'].values.reshape(-1,1)) 60 | df['actual_price'] = scaler.fit_transform(df['actual_price'].values.reshape(-1,1)) 61 | 62 | print(df.head()) 63 | print(df.tail()) 64 | return df, scaler 65 | 66 | def ffnn_dataset_reshape(dataset, future_gap, split): 67 | print("Dataset Shape:", dataset.shape) 68 | X = dataset[:, :-1] 69 | Y = dataset[:, -1] 70 | print("X Shape:", X.shape) 71 | print("Y Shape:", Y.shape) 72 | 73 | print("Applying Future Gap...") 74 | X = X[:-future_gap] 75 | Y = Y[future_gap:] 76 | 77 | if split != None: 78 | print("Applying training, testing split...") 79 | split_index = int(split*X.shape[0]) 80 | X_train = X[:split_index] 81 | X_test = X[split_index:] 82 | Y_train = Y[:split_index] 83 | Y_test = Y[split_index:] 84 | print("(X_train, Y_train, X_test, Y_test) Shapes:") 85 | print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape) 86 | return X_train, Y_train, X_test, Y_test 87 | 88 | return X, Y 89 | 90 | def build_model(features, neurons, drop_out, decay=0.0): 91 | model = Sequential() 92 | 93 | model.add(Dense(neurons[0], input_dim=features, activation='relu',)) 94 | model.add(Dropout(drop_out)) 95 | 96 | model.add(Dense(neurons[1], activation='relu')) 97 | model.add(Dropout(drop_out)) 98 | 99 | model.add(Dense(neurons[2], activation='relu')) 100 | model.add(Dense(neurons[3], activation='linear')) 101 | 102 | adam = Adam(decay=decay) 103 | model.compile(loss='mse',optimizer=adam) 104 | model.summary() 105 | return model 106 | 107 | def model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, verbose, callbacks): 108 | 109 | history = model.fit( 110 | X_train, 111 | Y_train, 112 | batch_size = batch_size, 113 | epochs = epochs, 114 | validation_split = validation_split, 115 | verbose = verbose, 116 | callbacks = callbacks 117 | ) 118 | 119 | return history 120 | 121 | def evaluate_model(model, X_train, Y_train, X_test, Y_test, verbose): 122 | train_mse = model.evaluate(X_train, Y_train, verbose=verbose) 123 | print('Insample Testing: %.5f MSE (%.3f RMSE)' % (train_mse, (train_mse ** 0.5))) 124 | 125 | test_mse = model.evaluate(X_test, Y_test, verbose=verbose) 126 | print('Outsample Testing: %.5f MSE (%.3f RMSE)' % (test_mse, (test_mse ** 0.5))) 127 | 128 | return train_mse, test_mse 129 | 130 | def evaluate(Y_test, predictions, Y_test_inv_scaled, predictions_inv_scaled): 131 | rmse = (mean_squared_error(Y_test, predictions) ** 0.5) 132 | print('\nNormalized RMSE: %.3f' %(rmse)) 133 | nrmse = ((mean_squared_error(Y_test, predictions) ** 0.5))/np.mean(Y_test) 134 | print('Normalized NRMSE: %.3f' %(nrmse)) 135 | mae = mean_absolute_error(Y_test, predictions) 136 | print('Normalized MAE: %.3f' %(mae)) 137 | mape = compute_mape(Y_test, predictions) 138 | print('Normalized MAPE: %.3f' %(mape)) 139 | correlation = np.corrcoef(Y_test.T, predictions.T) 140 | print("Normalized Correlation: %.3f"%(correlation[0, 1])) 141 | r2 = r2_score(Y_test, predictions) 142 | print("Normalized r^2: %.3f"%(r2)) 143 | normalized_metrics = [rmse, nrmse, mae, mape, correlation[0, 1], r2] 144 | 145 | #evaluating the model on the inverse-normalized dataset 146 | rmse = (mean_squared_error(Y_test_inv_scaled, predictions_inv_scaled) ** 0.5) 
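 #NB: the NRMSE below divides this price-scale RMSE by np.mean(Y_test), the mean of the scaled target, rather than by np.mean(Y_test_inv_scaled), so its printout is not directly comparable to the normalized NRMSE computed above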
147 | print('\nInverse-Normalized Outsample RMSE: %.3f' %(rmse)) 148 | nrmse = ((mean_squared_error(Y_test_inv_scaled, predictions_inv_scaled) ** 0.5))/np.mean(Y_test_inv_scaled) 149 | print('Inverse-Normalized Outsample NRMSE: %.3f' %(nrmse)) 150 | mae = mean_absolute_error(Y_test_inv_scaled, predictions_inv_scaled) 151 | print('Inverse-Normalized Outsample MAE: %.3f' %(mae)) 152 | mape = compute_mape(Y_test_inv_scaled, predictions_inv_scaled) 153 | print('Inverse-Normalized Outsample MAPE: %.3f' %(mape)) 154 | correlation = np.corrcoef(Y_test_inv_scaled.T, predictions_inv_scaled.T) 155 | print("Inverse-Normalized Outsample Correlation: %.3f"%(correlation[0, 1])) 156 | r2 = r2_score(Y_test_inv_scaled, predictions_inv_scaled) 157 | print("Inverse-Normalized Outsample r^2: %.3f"%(r2)) 158 | inv_normalized_metrics = [rmse, nrmse, mae, mape, correlation[0, 1], r2] 159 | 160 | return normalized_metrics, inv_normalized_metrics 161 | 162 | def final_test_ffnn(stock_symbol, start_date, end_date, window, future_gap, neurons, 163 | drop_out, batch_size, epochs, validation_split, verbose, callbacks): 164 | #building the dataset 165 | print("> building the dataset...") 166 | df_train, _ = bulid_new_TIs_dataset(stock_symbol, None, start_date, window) 167 | df_test, scaler = bulid_new_TIs_dataset(stock_symbol, start_date, end_date, window) 168 | #reshaping the dataset for FFNN 169 | print("\n> reshaping the dataset for FFNN...") 170 | ds_train = df_train.values 171 | ds_test = df_test.values 172 | X_train, Y_train = ffnn_dataset_reshape(ds_train, future_gap, None) 173 | X_test, Y_test = ffnn_dataset_reshape(ds_test, future_gap, None) 174 | #building the FFNN model 175 | print("\n> building the FFNN model...") 176 | features = X_train.shape[1] 177 | model = build_model(features, neurons, drop_out) 178 | #fitting the training data 179 | print("\n> fitting the training data...") 180 | model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, verbose, callbacks) 181 | #predictions 182 | print("\n> testing the model for predictions...") 183 | predictions = model.predict(X_test) 184 | #inverse-scaling 185 | print("\n> inverse-scaling the scaled values...") 186 | predictions = predictions.reshape((predictions.shape[0], 1)) 187 | predictions_inv_scaled = scaler.inverse_transform(predictions) 188 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 189 | Y_test_inv_scaled = scaler.inverse_transform(Y_test) 190 | #evaluation 191 | normalized_metrics, inv_normalized_metrics = evaluate(Y_test, predictions, 192 | Y_test_inv_scaled, predictions_inv_scaled) 193 | #grouping the actual prices and predictions 194 | print("\n> grouping the actual prices and predictions...") 195 | feature_cols = df_test.columns.tolist() 196 | feature_cols.remove("actual_price") 197 | df_test.drop(columns=feature_cols, inplace=True) 198 | df_test.rename(columns={"actual_price" : 'Actual'}, inplace=True) 199 | df_test = df_test.iloc[future_gap:] 200 | df_test['Actual'] = Y_test_inv_scaled 201 | df_test['Prediction'] = predictions_inv_scaled 202 | 203 | return normalized_metrics, inv_normalized_metrics, df_test -------------------------------------------------------------------------------- /machine_learning/development/optimized_ffnn/ffnn_hyperparam_tune.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.optimized_ffnn import ffnn 2 | from keras.callbacks import EarlyStopping 3 | 4 | def evaluate_ffnn(stock, start_date, end_date, window, future_gap, split, dropout, 5 | neurons, batch_size, epochs, 
validation_split, verbose, decay=0.0): 6 | 7 | dataframe, _ = ffnn.bulid_TIs_dataset(stock, start_date, end_date, window) 8 | dataset = dataframe.values 9 | X_train, Y_train, X_test, Y_test = ffnn.ffnn_dataset_reshape(dataset, future_gap, split) 10 | features = X_train.shape[1] 11 | model = ffnn.build_model(features, neurons, dropout, decay) 12 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 13 | patience=50, verbose=verbose, mode='auto') 14 | callbacks = [early_stopping_callback] 15 | ffnn.model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, verbose, callbacks) 16 | train_mse, test_mse = ffnn.evaluate_model(model, X_train, Y_train, X_test, Y_test, verbose) 17 | return train_mse, test_mse 18 | 19 | def optimal_dropout(stock, start_date, end_date, window, future_gap, split, neurons, 20 | batch_size, epochs, validation_split, verbose, dropout_list): 21 | dropout_result = {} 22 | for dropout in dropout_list: 23 | print("\n> testing dropout: (%.1f)..." %(dropout)) 24 | _, testScore = evaluate_ffnn(stock, start_date, end_date, window, future_gap, split, dropout, 25 | neurons, batch_size, epochs, validation_split, verbose) 26 | dropout_result[dropout] = testScore 27 | return dropout_result 28 | 29 | def optimal_epochs(stock, start_date, end_date, window, future_gap, split, dropout, 30 | neurons, batch_size, validation_split, verbose, epochs_list): 31 | epochs_result = {} 32 | for epochs in epochs_list: 33 | print("\n> testing epochs: (%d)..." %(epochs)) 34 | _, testScore = evaluate_ffnn(stock, start_date, end_date, window, future_gap, split, dropout, 35 | neurons, batch_size, epochs, validation_split, verbose) 36 | epochs_result[epochs] = testScore 37 | return epochs_result 38 | 39 | def optimal_neurons(stock, start_date, end_date, window, future_gap, split, dropout, 40 | batch_size, epochs, validation_split, verbose, neurons_list1, neurons_list2): 41 | neurons_result = {} 42 | for ffnn_neuron in neurons_list1: 43 | neurons = [ffnn_neuron, ffnn_neuron] 44 | for dense_neuron in neurons_list2: 45 | neurons.append(dense_neuron) 46 | neurons.append(1) 47 | print("\n> testing neurons: (%s)..." %(str(neurons))) 48 | _, testScore = evaluate_ffnn(stock, start_date, end_date, window, future_gap, split, dropout, 49 | neurons, batch_size, epochs, validation_split, verbose) 50 | neurons_result[str(neurons)] = testScore 51 | neurons = neurons[:2] 52 | return neurons_result 53 | 54 | def optimal_decay(stock, start_date, end_date, window, future_gap, split, dropout, 55 | neurons, batch_size, epochs, validation_split, verbose, decay_list): 56 | decay_result = {} 57 | for decay in decay_list: 58 | print("\n> testing decay: (%.1f)..." %(decay)) 59 | _, testScore = evaluate_ffnn(stock, start_date, end_date, window, future_gap, split, dropout, 60 | neurons, batch_size, epochs, validation_split, verbose, decay) 61 | decay_result[decay] = testScore 62 | return decay_result 63 | 64 | def optimal_batch_size(stock, start_date, end_date, window, future_gap, split, dropout, neurons, 65 | epochs, validation_split, verbose, decay, batch_size_list): 66 | batch_size_result = {} 67 | for batch_size in batch_size_list: 68 | print("\n> testing batch size: (%d)..." 
%(batch_size)) 69 | _, testScore = evaluate_ffnn(stock, start_date, end_date, window, future_gap, split, dropout, 70 | neurons, batch_size, epochs, validation_split, verbose, decay) 71 | batch_size_result[batch_size] = testScore 72 | return batch_size_result -------------------------------------------------------------------------------- /machine_learning/development/optimized_ffnn/ffnn_hypparam_tune_main.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.optimized_ffnn import ffnn_hyperparam_tune as hpt 2 | import matplotlib.pyplot as plt 3 | import time 4 | 5 | #start time 6 | start_time = time.time() 7 | 8 | #initial hyperparameters 9 | stock = '^GSPC' 10 | start_date = '1950-01-01' 11 | end_date = '2017-12-31' 12 | window = 5 13 | future_gap = 5 14 | split = 0.8 15 | dropout = None 16 | neurons = [64, 64, 32, 1] 17 | batch_size = 4026 18 | epochs = 1 19 | validation_split = 0.1 20 | verbose = 1 21 | 22 | #optimal hyperparameters txt file 23 | print("\n> finding the optimal hyperparameters...") 24 | file = open("machine_learning/development/optimized_ffnn/ffnn_optimal_hyperparameters.txt", "wb") #"wb" overwrites the results file on each run 25 | fig1, (ax1, ax2, ax3) = plt.subplots(3, 1) 26 | fig2, (ax4, ax5) = plt.subplots(2, 1) 27 | 28 | #finding the optimal dropout 29 | print("\n> finding the optimal dropout...") 30 | dropout_list = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8] 31 | dropout_result = hpt.optimal_dropout(stock, start_date, end_date, window, future_gap, split, neurons, 32 | batch_size, epochs, validation_split, verbose, dropout_list) 33 | 34 | min_loss = min(dropout_result.values()) 35 | optimal_dropout = -1.0 36 | for dout, loss in dropout_result.items(): 37 | if loss == min_loss: 38 | optimal_dropout = dout 39 | 40 | file.write(bytes("dropout: %.1f, " %(optimal_dropout), 'UTF-8')) 41 | print("\nDropout:", optimal_dropout) 42 | dropout = optimal_dropout 43 | 44 | items = dropout_result.items() 45 | x, y = zip(*items) 46 | ax1.plot(x, y) 47 | ax1.set_xlabel('Dropout') 48 | ax1.set_ylabel('MSE') 49 | ax1.grid(True) 50 | 51 | #finding the optimal neurons 52 | print("\n> finding the optimal neurons...") 53 | neuronlist1 = [64, 128, 256] 54 | neuronlist2 = [16, 32, 64] 55 | neurons_result = hpt.optimal_neurons(stock, start_date, end_date, window, future_gap, split, dropout, 56 | batch_size, epochs, validation_split, verbose, neuronlist1, neuronlist2) 57 | 58 | min_loss = min(neurons_result.values()) 59 | optimal_neurons = "" 60 | for n, loss in neurons_result.items(): 61 | if loss == min_loss: 62 | optimal_neurons = n 63 | 64 | file.write(bytes("neurons: %s, " %(str(optimal_neurons)), 'UTF-8')) 65 | print("\nNeurons:", optimal_neurons) 66 | neurons = optimal_neurons 67 | neurons = neurons[1:-1] 68 | neurons = neurons.split(", ") 69 | neurons = [int(neuron_str) for neuron_str in neurons] 70 | 71 | items = neurons_result.items() 72 | x, y = zip(*items) 73 | ax2.bar(range(len(items)), y, align='center') 74 | plt.sca(ax2) 75 | plt.xticks(range(len(items)), x, rotation=25) 76 | ax2.set_xlabel('Neurons') 77 | ax2.set_ylabel('MSE') 78 | ax2.grid(True) 79 | 80 | #finding the optimal decay 81 | print("\n> finding the optimal decay...") 82 | decay_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] 83 | decay_result = hpt.optimal_decay(stock, start_date, end_date, window, future_gap, split, dropout, 84 | neurons, batch_size, epochs, validation_split, verbose, decay_list) 85 | 86 | min_loss = min(decay_result.values()) 87 | optimal_decay = -1.0 88 | 
for d, loss in decay_result.items(): 89 | if loss == min_loss: 90 | optimal_decay = d 91 | 92 | file.write(bytes("decay: %.1f, " %(optimal_decay), 'UTF-8')) 93 | print("\nDecay:", optimal_decay) 94 | decay = optimal_decay 95 | 96 | items = decay_result.items() 97 | x, y = zip(*items) 98 | ax3.plot(x, y) 99 | ax3.set_xlabel('Decay') 100 | ax3.set_ylabel('MSE') 101 | ax3.grid(True) 102 | 103 | #finding the optimal batch size 104 | print("\n> finding the optimal batch size...") 105 | batch_size_list = [128, 256, 512, 1024, 2048, 4096] 106 | batch_size_result = hpt.optimal_batch_size(stock, start_date, end_date, window, future_gap, split, dropout, 107 | neurons, epochs, validation_split, verbose, decay, batch_size_list) 108 | 109 | min_loss = min(batch_size_result.values()) 110 | optimal_batch_size = -1 111 | for bs, loss in batch_size_result.items(): 112 | if loss == min_loss: 113 | optimal_batch_size = bs 114 | 115 | file.write(bytes("batch_size: %d, " %(optimal_batch_size), 'UTF-8')) 116 | print("\nBatch Size:", optimal_batch_size) 117 | batch_size = optimal_batch_size 118 | 119 | items = batch_size_result.items() 120 | x, y = zip(*items) 121 | ax4.plot(x, y) 122 | ax4.set_xlabel('Batch Size') 123 | ax4.set_ylabel('MSE') 124 | ax4.grid(True) 125 | 126 | #finding the optimal epochs 127 | print("\n> finding the optimal epochs...") 128 | epochs_list = [50, 60, 70, 80, 90, 100, 200, 300] 129 | epochs_result = hpt.optimal_epochs(stock, start_date, end_date, window, future_gap, split, dropout, 130 | neurons, batch_size, validation_split, verbose, epochs_list) 131 | 132 | min_loss = min(epochs_result.values()) 133 | optimal_epochs = -1 134 | for ep, loss in epochs_result.items(): 135 | if loss == min_loss: 136 | optimal_epochs = ep 137 | 138 | file.write(bytes("epochs: %d, " %(optimal_epochs), 'UTF-8')) 139 | print("\nEpochs:", optimal_epochs) 140 | epochs = optimal_epochs 141 | 142 | items = epochs_result.items() 143 | x, y = zip(*items) 144 | ax5.plot(x, y) 145 | ax5.set_xlabel('Epochs') 146 | ax5.set_ylabel('MSE') 147 | ax5.grid(True) 148 | 149 | #end time 150 | end_time = time.time() 151 | time = end_time - start_time 152 | file.write(bytes("time elapsed: %.3fs." 
%(time), 'UTF-8')) 153 | 154 | #closing the file and showing the plot 155 | print("\nOptimal Hyperparameters") 156 | print("Dropout:", optimal_dropout) 157 | print("Neurons:", optimal_neurons) 158 | print("Decay:", optimal_decay) 159 | print("Batch Size:", optimal_batch_size) 160 | print("Epochs:", optimal_epochs) 161 | print("Time Elapsed (s):", time) 162 | 163 | file.close() 164 | fig1.tight_layout() 165 | fig2.tight_layout() 166 | plt.show() -------------------------------------------------------------------------------- /machine_learning/development/optimized_ffnn/ffnn_main.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.optimized_ffnn import ffnn 2 | from machine_learning.development.new_regression.new_dataset import compute_mape 3 | from keras.callbacks import EarlyStopping 4 | from sklearn.metrics import mean_squared_error 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | from sklearn.metrics import r2_score 8 | 9 | def main(internal_eval=False): 10 | #building the dataset 11 | print("> building the dataset...") 12 | stock_symbol = '^GSPC' 13 | start_date = '1950-01-01' 14 | end_date = '2017-12-31' 15 | window = 2 16 | dataframe, scaler = ffnn.bulid_new_TIs_dataset(stock_symbol, start_date, end_date, window) 17 | 18 | #reshaping the dataset for FFNN 19 | print("\n> reshaping the dataset for FFNN...") 20 | dataset = dataframe.values 21 | future_gap = 1 #1 trading day 22 | split = 0.8 #80% of the dataset 23 | X_train, Y_train, X_test, Y_test = ffnn.ffnn_dataset_reshape(dataset, future_gap, split) 24 | 25 | #building the FFNN model 26 | print("\n> building the FFNN model...") 27 | features = X_train.shape[1] 28 | neurons = [256, 256, 16, 1] 29 | drop_out = 0.3 30 | verbose = 1 31 | model = ffnn.build_model(features, neurons, drop_out) 32 | 33 | #fitting the training data 34 | print("\n> fitting the training data...") 35 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 36 | patience=50, verbose=verbose, mode='auto') 37 | batch_size = 4096 38 | epochs = 200 39 | validation_split = 0.1 40 | _ = ffnn.model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, 41 | verbose, [early_stopping_callback]) 42 | 43 | #internal evaluation 44 | if internal_eval: 45 | print("\n> internal evaluation...") 46 | _, _ = ffnn.evaluate_model(model, X_train, Y_train, X_test, Y_test, verbose) 47 | 48 | #predictions 49 | predictions = model.predict(X_test) 50 | predictions = predictions.reshape((predictions.shape[0], 1)) 51 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 52 | 53 | #evaluating the model on the normalized dataset 54 | rmse = (mean_squared_error(predictions, Y_test) ** 0.5) 55 | print('\nNormalized Test RMSE: %.3f' %(rmse)) 56 | mape = compute_mape(Y_test, predictions) 57 | print('Normalized Outsample MAPE: %.3f' %(mape)) 58 | correlation = np.corrcoef(predictions.T, Y_test.T) 59 | print("Normalized Correlation: %.3f"%(correlation[0, 1])) 60 | r2 = r2_score(Y_test, predictions) 61 | print("Normalized Outsample r^2: %.3f"%(r2)) 62 | 63 | #evaluating the model on the inverse-normalized dataset 64 | predictions_inv_scaled = scaler.inverse_transform(predictions) 65 | Y_test_inv_scaled = scaler.inverse_transform(Y_test) 66 | 67 | rmse = (mean_squared_error(predictions_inv_scaled, Y_test_inv_scaled) ** 0.5) 68 | print('\nInverse-Normalized Outsample RMSE: %.3f' %(rmse)) 69 | mape = compute_mape(Y_test_inv_scaled, predictions_inv_scaled) 70 | print('Inverse-Normalized Outsample 
MAPE: %.3f' %(mape)) 71 | correlation = np.corrcoef(predictions_inv_scaled.T, Y_test_inv_scaled.T) 72 | print("Inverse-Normalized Outsample Correlation: %.3f"%(correlation[0, 1])) 73 | r2 = r2_score(Y_test_inv_scaled, predictions_inv_scaled) 74 | print("Inverse-Normalized Outsample r^2: %.3f"%(r2)) 75 | 76 | #plotting the results 77 | print("\n> plotting the results...") 78 | _, ax2 = plt.subplots() 79 | '''ax1.plot(history.history['loss'], label='Training') 80 | ax1.plot(history.history['val_loss'], label='Validation') 81 | ax1.set_xlabel('Epoch #') 82 | ax1.set_ylabel('Loss') 83 | ax1.legend(loc='best') 84 | ax1.grid(True) 85 | ''' 86 | ax2.plot(range(len(predictions_inv_scaled)), predictions_inv_scaled, label='Prediction') 87 | ax2.plot(range(len(Y_test_inv_scaled)), Y_test_inv_scaled, label='Actual') 88 | ax2.set_xlabel('Trading Day') 89 | ax2.set_ylabel('Price') 90 | ax2.legend(loc='best') 91 | ax2.grid(True) 92 | 93 | plt.show() 94 | 95 | main() 96 | 97 | 98 | #to be stored temporarily 99 | '''#evaluating the model on the *dataset* 100 | print("\n> evaluating the model on the *dataset*...") 101 | predictions = model.predict(X_test) 102 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 103 | 104 | predictions_inv_scaled = scaler.inverse_transform(predictions) 105 | Y_test_inv_scaled = scaler.inverse_transform(Y_test) 106 | 107 | rmse = (mean_squared_error(predictions_inv_scaled, Y_test_inv_scaled) ** 0.5) 108 | print('Outsample RMSE: %.3f' %(rmse)) 109 | #correlation = np.corrcoef(predictions_inv_scaled, Y_test_inv_scaled) 110 | #print("Outsample Correlation: %.3f"%(correlation[0, 1]))# 111 | # ''' -------------------------------------------------------------------------------- /machine_learning/development/optimized_ffnn/ffnn_optimal_hyperparameters.txt: -------------------------------------------------------------------------------- 1 | dropout: 0.8, neurons: [256, 256, 64, 1], decay: 0.1, batch_size: 128, epochs: 200, time elapsed: 317.959s. 
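The tuning script records its results as the single `key: value, ...` line shown in `ffnn_optimal_hyperparameters.txt`, while `ffnn_main.py` re-types the chosen values by hand. A minimal sketch of reading the recorded optima back into a dict follows; `load_optimal_hyperparameters` is a hypothetical helper, not part of this repository, and it assumes exactly the format written by `ffnn_hypparam_tune_main.py` above.

```python
#hypothetical helper, not part of this repository: parses the one-line
#"key: value, ..." summary written by the hyperparameter tuning script
import ast
import re

def load_optimal_hyperparameters(path):
    with open(path) as f:
        text = f.read()
    params = {}
    #the neurons value is a Python-style list, so capture it separately
    match = re.search(r"neurons: (\[[^\]]*\])", text)
    if match:
        params["neurons"] = ast.literal_eval(match.group(1))
    #the remaining values are plain scalars
    for key, cast in [("dropout", float), ("decay", float),
                      ("batch_size", int), ("epochs", int)]:
        match = re.search(r"%s: ([0-9.]+)" % key, text)
        if match:
            params[key] = cast(match.group(1))
    return params

#against the file above, this prints:
#{'neurons': [256, 256, 64, 1], 'dropout': 0.8, 'decay': 0.1, 'batch_size': 128, 'epochs': 200}
print(load_optimal_hyperparameters(
    "machine_learning/development/optimized_ffnn/ffnn_optimal_hyperparameters.txt"))
```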
-------------------------------------------------------------------------------- /machine_learning/development/optimized_lstm/hyperparam_tune_main.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.optimized_lstm import hyperparameter_tunning as hpt 2 | import matplotlib.pyplot as plt 3 | import time 4 | 5 | #start time 6 | start_time = time.time() 7 | 8 | #initial hyperparameters 9 | stock = '^GSPC' 10 | start_date = '1950-01-01' 11 | end_date = '2017-12-31' 12 | future_gap = 1 13 | time_steps = 20 14 | split = 0.9 15 | dropout = None 16 | neurons = [128, 128, 32, 1] 17 | batch_size = 512 18 | epochs = 50 19 | validation_split = 0.1 20 | verbose = 1 21 | 22 | #optimal hyperparameters txt file 23 | print("\n> finding the optimal hyperparameters...") 24 | file = open("machine_learning/development/optimized_lstm/optimal_hyperparameters.txt", "wb") #"wb" overwrites the results file on each run 25 | fig1, (ax1, ax2, ax3) = plt.subplots(3, 1) 26 | fig2, (ax4, ax5, ax6) = plt.subplots(3, 1) 27 | 28 | #finding the optimal dropout 29 | print("\n> finding the optimal dropout...") 30 | dropout_list = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8] 31 | dropout_result = hpt.optimal_dropout(stock, start_date, end_date, future_gap, time_steps, split, neurons, 32 | batch_size, epochs, validation_split, verbose, dropout_list) 33 | 34 | min_loss = min(dropout_result.values()) 35 | optimal_dropout = -1.0 36 | for dout, loss in dropout_result.items(): 37 | if loss == min_loss: 38 | optimal_dropout = dout 39 | 40 | file.write(bytes("dropout: %.1f, " %(optimal_dropout), 'UTF-8')) 41 | print("\nDropout:", optimal_dropout) 42 | dropout = optimal_dropout 43 | 44 | items = dropout_result.items() 45 | x, y = zip(*items) 46 | ax1.plot(x, y) 47 | ax1.set_xlabel('Dropout') 48 | ax1.set_ylabel('MSE') 49 | ax1.grid(True) 50 | 51 | #finding the optimal neurons 52 | print("\n> finding the optimal neurons...") 53 | neuronlist1 = [64, 128, 256] 54 | neuronlist2 = [16, 32, 64] 55 | neurons_result = hpt.optimal_neurons(stock, start_date, end_date, future_gap, time_steps, split, dropout, 56 | batch_size, epochs, validation_split, verbose, neuronlist1, neuronlist2) 57 | 58 | min_loss = min(neurons_result.values()) 59 | optimal_neurons = "" 60 | for n, loss in neurons_result.items(): 61 | if loss == min_loss: 62 | optimal_neurons = n 63 | 64 | file.write(bytes("neurons: %s, " %(str(optimal_neurons)), 'UTF-8')) 65 | print("\nNeurons:", optimal_neurons) 66 | neurons = optimal_neurons 67 | neurons = neurons[1:-1] 68 | neurons = neurons.split(", ") 69 | neurons = [int(neuron_str) for neuron_str in neurons] 70 | 71 | items = neurons_result.items() 72 | x, y = zip(*items) 73 | ax2.bar(range(len(items)), y, align='center') 74 | plt.sca(ax2) 75 | plt.xticks(range(len(items)), x, rotation=25) 76 | ax2.set_xlabel('Neurons') 77 | ax2.set_ylabel('MSE') 78 | ax2.grid(True) 79 | 80 | #finding the optimal decay 81 | print("\n> finding the optimal decay...") 82 | decay_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] 83 | decay_result = hpt.optimal_decay(stock, start_date, end_date, future_gap, time_steps, split, dropout, 84 | neurons, batch_size, epochs, validation_split, verbose, decay_list) 85 | 86 | min_loss = min(decay_result.values()) 87 | optimal_decay = -1.0 88 | for d, loss in decay_result.items(): 89 | if loss == min_loss: 90 | optimal_decay = d 91 | 92 | file.write(bytes("decay: %.1f, " %(optimal_decay), 'UTF-8')) 93 | print("\nDecay:", optimal_decay) 94 | decay = optimal_decay 95 | 96 | items = 
decay_result.items() 97 | x, y = zip(*items) 98 | ax3.plot(x, y) 99 | ax3.set_xlabel('Decay') 100 | ax3.set_ylabel('MSE') 101 | ax3.grid(True) 102 | 103 | #finding the optimal time steps 104 | print("\n> finding the optimal time steps...") 105 | time_steps_list = [5, 10, 15, 20, 40, 80, 100] 106 | time_steps_result = hpt.optimal_time_steps(stock, start_date, end_date, future_gap, split, dropout, neurons, 107 | batch_size, epochs, validation_split, verbose, decay, time_steps_list) 108 | 109 | min_loss = min(time_steps_result.values()) 110 | optimal_time_steps = -1 111 | for ts, loss in time_steps_result.items(): 112 | if loss == min_loss: 113 | optimal_time_steps = ts 114 | 115 | file.write(bytes("time_steps: %d, " %(optimal_time_steps), 'UTF-8')) 116 | print("\nTime Steps:", optimal_time_steps) 117 | time_steps = optimal_time_steps 118 | 119 | items = time_steps_result.items() 120 | x, y = zip(*items) 121 | ax4.plot(x, y) 122 | ax4.set_xlabel('Time Steps') 123 | ax4.set_ylabel('MSE') 124 | ax4.grid(True) 125 | 126 | #finding the optimal batch size 127 | print("\n> finding the optimal batch size...") 128 | batch_size_list = [128, 256, 512, 1024, 2048, 4096] 129 | batch_size_result = hpt.optimal_batch_size(stock, start_date, end_date, future_gap, time_steps, split, dropout, 130 | neurons, epochs, validation_split, verbose, decay, batch_size_list) 131 | 132 | min_loss = min(batch_size_result.values()) 133 | optimal_batch_size = -1 134 | for bs, loss in batch_size_result.items(): 135 | if loss == min_loss: 136 | optimal_batch_size = bs 137 | 138 | file.write(bytes("batch_size: %d, " %(optimal_batch_size), 'UTF-8')) 139 | print("\nBatch Size:", optimal_batch_size) 140 | batch_size = optimal_batch_size 141 | 142 | items = batch_size_result.items() 143 | x, y = zip(*items) 144 | ax5.plot(x, y) 145 | ax5.set_xlabel('Batch Size') 146 | ax5.set_ylabel('MSE') 147 | ax5.grid(True) 148 | 149 | #finding the optimal epochs 150 | print("\n> finding the optimal epochs...") 151 | epochs_list = [50, 60, 70, 80, 90, 100, 200, 300] 152 | epochs_result = hpt.optimal_epochs(stock, start_date, end_date, future_gap, time_steps, split, dropout, 153 | neurons, batch_size, validation_split, verbose, epochs_list) 154 | 155 | min_loss = min(epochs_result.values()) 156 | optimal_epochs = -1 157 | for ep, loss in epochs_result.items(): 158 | if loss == min_loss: 159 | optimal_epochs = ep 160 | 161 | file.write(bytes("epochs: %d, " %(optimal_epochs), 'UTF-8')) 162 | print("\nEpochs:", optimal_epochs) 163 | epochs = optimal_epochs 164 | 165 | items = epochs_result.items() 166 | x, y = zip(*items) 167 | ax6.plot(x, y) 168 | ax6.set_xlabel('Epochs') 169 | ax6.set_ylabel('MSE') 170 | ax6.grid(True) 171 | 172 | #end time 173 | end_time = time.time() 174 | time = end_time - start_time 175 | file.write(bytes("time elapsed: %.3fs." 
%(time), 'UTF-8')) 176 | 177 | #closing the file and showing the plot 178 | print("\nOptimal Hyperparameters") 179 | print("Dropout:", optimal_dropout) 180 | print("Neurons:", optimal_neurons) 181 | print("Decay:", optimal_decay) 182 | print("Time Steps:", optimal_time_steps) 183 | print("Batch Size:", optimal_batch_size) 184 | print("Epochs:", optimal_epochs) 185 | print("Time Elapsed (s):", time) 186 | 187 | file.close() 188 | fig1.tight_layout() 189 | fig2.tight_layout() 190 | plt.show() 191 | -------------------------------------------------------------------------------- /machine_learning/development/optimized_lstm/hyperparameter_tunning.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.optimized_lstm import lstm 2 | from keras.callbacks import EarlyStopping 3 | 4 | def evaluate_lstm(stock, start_date, end_date, future_gap, time_steps, split, dropout, 5 | neurons, batch_size, epochs, validation_split, verbose, decay=0.0): 6 | 7 | dataframe, _ = lstm.bulid_dataset(stock, start_date, end_date) 8 | dataset = dataframe.values 9 | X_train, Y_train, X_test, Y_test = lstm.lstm_dataset_reshape(dataset, time_steps, future_gap, split) 10 | features = X_train.shape[2] 11 | model = lstm.build_model(time_steps, features, neurons, dropout, decay) 12 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 13 | patience=50, verbose=verbose, mode='auto') 14 | callbacks = [early_stopping_callback] 15 | lstm.model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, verbose, callbacks) 16 | train_mse, test_mse = lstm.evaluate_model(model, X_train, Y_train, X_test, Y_test, verbose) 17 | return train_mse, test_mse 18 | 19 | def optimal_dropout(stock, start_date, end_date, future_gap, time_steps, split, neurons, 20 | batch_size, epochs, validation_split, verbose, dropout_list): 21 | dropout_result = {} 22 | for dropout in dropout_list: 23 | print("\n> testing dropout: (%.1f)..." %(dropout)) 24 | _, testScore = evaluate_lstm(stock, start_date, end_date, future_gap, time_steps, split, dropout, 25 | neurons, batch_size, epochs, validation_split, verbose) 26 | dropout_result[dropout] = testScore 27 | return dropout_result 28 | 29 | def optimal_epochs(stock, start_date, end_date, future_gap, time_steps, split, dropout, 30 | neurons, batch_size, validation_split, verbose, epochs_list): 31 | epochs_result = {} 32 | for epochs in epochs_list: 33 | print("\n> testing epochs: (%d)..." %(epochs)) 34 | _, testScore = evaluate_lstm(stock, start_date, end_date, future_gap, time_steps, split, dropout, 35 | neurons, batch_size, epochs, validation_split, verbose) 36 | epochs_result[epochs] = testScore 37 | return epochs_result 38 | 39 | def optimal_neurons(stock, start_date, end_date, future_gap, time_steps, split, dropout, 40 | batch_size, epochs, validation_split, verbose, neurons_list1, neurons_list2): 41 | neurons_result = {} 42 | for lstm_neuron in neurons_list1: 43 | neurons = [lstm_neuron, lstm_neuron] 44 | for dense_neuron in neurons_list2: 45 | neurons.append(dense_neuron) 46 | neurons.append(1) 47 | print("\n> testing neurons: (%s)..." 
%(str(neurons))) 48 | _, testScore = evaluate_lstm(stock, start_date, end_date, future_gap, time_steps, split, dropout, 49 | neurons, batch_size, epochs, validation_split, verbose) 50 | neurons_result[str(neurons)] = testScore 51 | neurons = neurons[:2] 52 | return neurons_result 53 | 54 | def optimal_decay(stock, start_date, end_date, future_gap, time_steps, split, dropout, 55 | neurons, batch_size, epochs, validation_split, verbose, decay_list): 56 | decay_result = {} 57 | for decay in decay_list: 58 | print("\n> testing decay: (%.1f)..." %(decay)) 59 | _, testScore = evaluate_lstm(stock, start_date, end_date, future_gap, time_steps, split, dropout, 60 | neurons, batch_size, epochs, validation_split, verbose, decay) 61 | decay_result[decay] = testScore 62 | return decay_result 63 | 64 | def optimal_time_steps(stock, start_date, end_date, future_gap, split, dropout, neurons, batch_size, 65 | epochs, validation_split, verbose, decay, time_steps_list): 66 | timesteps_result = {} 67 | for time_steps in time_steps_list: 68 | print("\n> testing time steps: (%d)..." %(time_steps)) 69 | _, testScore = evaluate_lstm(stock, start_date, end_date, future_gap, time_steps, split, dropout, 70 | neurons, batch_size, epochs, validation_split, verbose, decay) 71 | timesteps_result[time_steps] = testScore 72 | return timesteps_result 73 | 74 | def optimal_batch_size(stock, start_date, end_date, future_gap, time_steps, split, dropout, neurons, 75 | epochs, validation_split, verbose, decay, batch_size_list): 76 | batch_size_result = {} 77 | for batch_size in batch_size_list: 78 | print("\n> testing batch size: (%d)..." %(batch_size)) 79 | _, testScore = evaluate_lstm(stock, start_date, end_date, future_gap, time_steps, split, dropout, 80 | neurons, batch_size, epochs, validation_split, verbose, decay) 81 | batch_size_result[batch_size] = testScore 82 | return batch_size_result -------------------------------------------------------------------------------- /machine_learning/development/optimized_lstm/lstm.py: -------------------------------------------------------------------------------- 1 | from utils.util import get_stock_data, plot_data 2 | from machine_learning.development.testing.lag_metric import compute_lag_metric 3 | import machine_learning.development.dataset_preprocessing as dpp 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from keras.models import Sequential 7 | from keras.layers.core import Dense, Dropout 8 | from keras.layers.recurrent import LSTM 9 | from keras.optimizers import Adam 10 | from sklearn.preprocessing import MinMaxScaler 11 | from machine_learning.development.new_regression.new_dataset import compute_mape 12 | from sklearn.metrics import mean_squared_error, mean_absolute_error 13 | from sklearn.metrics import r2_score 14 | 15 | def bulid_dataset(stock_symbol, start_date, end_date, normalize=True): 16 | cols = ["Date", "Open", "Low", "High", "Adj Close"] 17 | df = get_stock_data(stock_symbol, start_date, end_date, cols) 18 | df.replace([np.inf, -np.inf], np.nan, inplace=True) 19 | df.fillna(method='ffill', inplace=True) 20 | df.fillna(method='bfill', inplace=True) 21 | scaler = None 22 | 23 | if normalize: 24 | scaler = MinMaxScaler() 25 | df['Open'] = scaler.fit_transform(df['Open'].values.reshape(-1,1)) 26 | df['Low'] = scaler.fit_transform(df['Low'].values.reshape(-1,1)) 27 | df['High'] = scaler.fit_transform(df['High'].values.reshape(-1,1)) 28 | df['Adj Close'] = scaler.fit_transform(df['Adj Close'].values.reshape(-1,1)) 29 | 30 | print(df.head()) 31 | 
print(df.tail()) 32 | return df, scaler 33 | 34 | def bulid_TIs_dataset(stock_symbol, start_date, end_date, window, normalize=True): 35 | cols = ["Date", "Adj Close", "Volume"] 36 | df = get_stock_data(stock_symbol, start_date, end_date, cols) 37 | df.rename(columns={"Adj Close" : 'price'}, inplace=True) 38 | df['momentum'] = dpp.compute_momentum_ratio(df['price'], window) 39 | df['sma'] = dpp.compute_sma_ratio(df['price'], window) 40 | df['bolinger_band'] = dpp.compute_bollinger_bands_ratio(df['price'], window) 41 | df['volatility'] = dpp.compute_volatility_ratio(df['price'], window) 42 | df['vroc'] = dpp.compute_vroc_ratio(df['Volume'], window) 43 | df['actual_price'] = df['price'] 44 | df.drop(columns=["Volume"], inplace=True) 45 | df = df[window:] 46 | df.replace([np.inf, -np.inf], np.nan, inplace=True) 47 | df.fillna(method='ffill', inplace=True) 48 | df.fillna(method='bfill', inplace=True) 49 | scaler = None 50 | 51 | if normalize: 52 | scaler = MinMaxScaler() 53 | df['price'] = scaler.fit_transform(df['price'].values.reshape(-1,1)) 54 | df['momentum'] = scaler.fit_transform(df['momentum'].values.reshape(-1,1)) 55 | df['sma'] = scaler.fit_transform(df['sma'].values.reshape(-1,1)) 56 | df['bolinger_band'] = scaler.fit_transform(df['bolinger_band'].values.reshape(-1,1)) 57 | df['volatility'] = scaler.fit_transform(df['volatility'].values.reshape(-1,1)) 58 | df['vroc'] = scaler.fit_transform(df['vroc'].values.reshape(-1,1)) 59 | df['actual_price'] = scaler.fit_transform(df['actual_price'].values.reshape(-1,1)) 60 | 61 | print(df.head()) 62 | print(df.tail()) 63 | return df, scaler 64 | 65 | def lstm_dataset_reshape(dataset, time_steps, future_gap, split): 66 | print("Dataset Shape:", dataset.shape) 67 | X = dataset[:, :-1] 68 | Y = dataset[:, -1] 69 | print("X Shape:", X.shape) 70 | print("Y Shape:", Y.shape) 71 | 72 | X_sampled = [] 73 | for i in range(X.shape[0] - time_steps + 1): 74 | X_sampled.append(X[i : i+time_steps]) 75 | X_sampled = np.array(X_sampled) 76 | print("Sampled X Shape:", X_sampled.shape) 77 | 78 | future_gap_index = future_gap - 1 79 | X_sampled = X_sampled[:-future_gap] 80 | Y_sampled = Y[time_steps+future_gap_index: ] 81 | print("Applying Future Gap...") 82 | print("Sampled X Shape:", X_sampled.shape) 83 | print("Sampled Y Shape:", Y_sampled.shape) 84 | 85 | if split != None: 86 | split_index = int(split*X_sampled.shape[0]) 87 | X_train = X_sampled[:split_index] 88 | X_test = X_sampled[split_index:] 89 | Y_train = Y_sampled[:split_index] 90 | Y_test = Y_sampled[split_index:] 91 | print("(X_train, Y_train, X_test, Y_test) Shapes:") 92 | print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape) 93 | return X_train, Y_train, X_test, Y_test 94 | 95 | return X_sampled, Y_sampled 96 | 97 | def build_model(time_steps, features, neurons, drop_out, decay=0.0): 98 | model = Sequential() 99 | 100 | model.add(LSTM(neurons[0], input_shape=(time_steps, features), return_sequences=True)) 101 | model.add(Dropout(drop_out)) 102 | 103 | model.add(LSTM(neurons[1], input_shape=(time_steps, features), return_sequences=False)) 104 | model.add(Dropout(drop_out)) 105 | 106 | model.add(Dense(neurons[2], activation='relu')) 107 | model.add(Dense(neurons[3], activation='linear')) 108 | 109 | adam = Adam(decay=decay) 110 | model.compile(loss='mse',optimizer=adam) 111 | model.summary() 112 | return model 113 | 114 | def model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, verbose, callbacks): 115 | 116 | history = model.fit( 117 | X_train, 118 | Y_train, 119 | 
batch_size = batch_size, 120 | epochs = epochs, 121 | validation_split = validation_split, 122 | verbose = verbose, 123 | callbacks = callbacks 124 | ) 125 | 126 | return history 127 | 128 | def evaluate_model(model, X_train, Y_train, X_test, Y_test, verbose): 129 | train_mse = model.evaluate(X_train, Y_train, verbose=verbose) 130 | print('Insample Testing: %.5f MSE (%.3f RMSE)' % (train_mse, (train_mse ** 0.5))) 131 | 132 | test_mse = model.evaluate(X_test, Y_test, verbose=verbose) 133 | print('Outsample Testing: %.5f MSE (%.3f RMSE)' % (test_mse, (test_mse ** 0.5))) 134 | 135 | return train_mse, test_mse 136 | 137 | def evaluate(Y_test, predictions, Y_test_inv_scaled, predictions_inv_scaled): 138 | rmse = (mean_squared_error(Y_test, predictions) ** 0.5) 139 | print('\nNormalized RMSE: %.3f' %(rmse)) 140 | nrmse = ((mean_squared_error(Y_test, predictions) ** 0.5))/np.mean(Y_test) 141 | print('Normalized NRMSE: %.3f' %(nrmse)) 142 | mae = mean_absolute_error(Y_test, predictions) 143 | print('Normalized MAE: %.3f' %(mae)) 144 | mape = compute_mape(Y_test, predictions) 145 | print('Normalized MAPE: %.3f' %(mape)) 146 | correlation = np.corrcoef(Y_test.T, predictions.T) 147 | print("Normalized Correlation: %.3f"%(correlation[0, 1])) 148 | r2 = r2_score(Y_test, predictions) 149 | print("Normalized r^2: %.3f"%(r2)) 150 | normalized_metrics = [rmse, nrmse, mae, mape, correlation[0, 1], r2] 151 | 152 | #evaluating the model on the inverse-normalized dataset 153 | rmse = (mean_squared_error(Y_test_inv_scaled, predictions_inv_scaled) ** 0.5) 154 | print('\nInverse-Normalized Outsample RMSE: %.3f' %(rmse)) 155 | nrmse = ((mean_squared_error(Y_test_inv_scaled, predictions_inv_scaled) ** 0.5))/np.mean(Y_test_inv_scaled) 156 | print('Inverse-Normalized Outsample NRMSE: %.3f' %(nrmse)) 157 | mae = mean_absolute_error(Y_test_inv_scaled, predictions_inv_scaled) 158 | print('Inverse-Normalized Outsample MAE: %.3f' %(mae)) 159 | mape = compute_mape(Y_test_inv_scaled, predictions_inv_scaled) 160 | print('Inverse-Normalized Outsample MAPE: %.3f' %(mape)) 161 | correlation = np.corrcoef(Y_test_inv_scaled.T, predictions_inv_scaled.T) 162 | print("Inverse-Normalized Outsample Correlation: %.3f"%(correlation[0, 1])) 163 | r2 = r2_score(Y_test_inv_scaled, predictions_inv_scaled) 164 | print("Inverse-Normalized Outsample r^2: %.3f"%(r2)) 165 | inv_normalized_metrics = [rmse, nrmse, mae, mape, correlation[0, 1], r2] 166 | 167 | return normalized_metrics, inv_normalized_metrics 168 | 169 | def test_lstm(stock_symbol, start_date, end_date, window, future_gap, time_steps, 170 | neurons, drop_out, batch_size, epochs, validation_split, verbose, callbacks, show_plot_flg): 171 | #building the dataset 172 | print("> building the dataset...") 173 | df_train, _ = bulid_TIs_dataset(stock_symbol, None, start_date, window) 174 | df_test, scaler = bulid_TIs_dataset(stock_symbol, start_date, end_date, window) 175 | #reshaping the dataset for LSTM 176 | print("\n> reshaping the dataset for LSTM...") 177 | ds_train = df_train.values 178 | ds_test = df_test.values 179 | X_train, Y_train = lstm_dataset_reshape(ds_train, time_steps, future_gap, None) 180 | X_test, Y_test = lstm_dataset_reshape(ds_test, time_steps, future_gap, None) 181 | #building the LSTM model 182 | print("\n> building the LSTM model...") 183 | features = X_train.shape[2] 184 | model = build_model(time_steps, features, neurons, drop_out) 185 | #fitting the training data 186 | print("\n> fitting the training data...") 187 | model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, verbose, 
callbacks) 188 | #predictions 189 | print("\n> testing the model for predictions...") 190 | predictions = model.predict(X_test) 191 | #inverse-scaling 192 | print("\n> inverse-scaling the scaled values...") 193 | predictions = predictions.reshape((predictions.shape[0], 1)) 194 | predictions_inv_scaled = scaler.inverse_transform(predictions) 195 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 196 | Y_test_inv_scaled = scaler.inverse_transform(Y_test) 197 | #grouping the actual prices and predictions 198 | print("\n> grouping the actual prices and predictions...") 199 | feature_cols = df_test.columns.tolist() 200 | feature_cols.remove("actual_price") 201 | df_test.drop(columns=feature_cols, inplace=True) 202 | df_test.rename(columns={"actual_price" : 'Actual'}, inplace=True) 203 | df_test = df_test.iloc[time_steps+future_gap-1:] 204 | df_test['Actual'] = Y_test_inv_scaled 205 | df_test['Prediction'] = predictions_inv_scaled 206 | #ploting the forecast vs the actual 207 | print("\n> plotting the results...") 208 | lookup = 5 209 | lag_list = compute_lag_metric(df_test['Actual'], df_test['Prediction'], lookup, stock_symbol) 210 | 211 | df_test = df_test[:len(df_test)-lookup+1] 212 | plot_data(df_test, stock_symbol+" Price Forecast", "Date", "Price", show_plot=False) 213 | 214 | ax = df_test.plot(title=stock_symbol+" Price Forecast and PAL Overlay") 215 | ax.set_xlabel("Date") 216 | ax.set_ylabel("Price") 217 | ax.legend(loc="best") 218 | ax.grid(True) 219 | #sudden vs normal plot annotation 220 | ax.annotate('Normal Movement', xy=('2013-02-15', 40), xytext=('2013-03-05', 50), fontsize=10, 221 | arrowprops=dict(facecolor='black', shrink=0.1, headwidth=8)) 222 | ax.annotate('Sudden Change', xy=('2013-05-10', 55), xytext=('2013-03-05', 70), fontsize=10, 223 | arrowprops=dict(facecolor='black', shrink=0.1, headwidth=8)) 224 | ax1 = ax.twinx() 225 | ax1.scatter(df_test.index, lag_list, c='g') 226 | ax1.set_ylabel("PAL") 227 | 228 | if show_plot_flg: 229 | plt.show() 230 | 231 | def final_test_lstm(stock_symbol, start_date, end_date, window, future_gap, time_steps, 232 | neurons, drop_out, batch_size, epochs, validation_split, verbose, callbacks): 233 | #building the dataset 234 | print("> building the dataset...") 235 | df_train, _ = bulid_TIs_dataset(stock_symbol, None, start_date, window) 236 | df_test, scaler = bulid_TIs_dataset(stock_symbol, start_date, end_date, window) 237 | #reshaping the dataset for LSTM 238 | print("\n> reshaping the dataset for LSTM...") 239 | ds_train = df_train.values 240 | ds_test = df_test.values 241 | X_train, Y_train = lstm_dataset_reshape(ds_train, time_steps, future_gap, None) 242 | X_test, Y_test = lstm_dataset_reshape(ds_test, time_steps, future_gap, None) 243 | #building the LSTM model 244 | print("\n> building the LSTM model...") 245 | features = X_train.shape[2] 246 | model = build_model(time_steps, features, neurons, drop_out) 247 | #fitting the training data 248 | print("\n> fitting the training data...") 249 | model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, verbose, callbacks) 250 | #predictions 251 | print("\n> testing the model for predictions...") 252 | predictions = model.predict(X_test) 253 | #inverse-scaling 254 | print("\n> inverse-scaling the scaled values...") 255 | predictions = predictions.reshape((predictions.shape[0], 1)) 256 | predictions_inv_scaled = scaler.inverse_transform(predictions) 257 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 258 | Y_test_inv_scaled = scaler.inverse_transform(Y_test) 259 | #evaluation 
260 | normalized_metrics, inv_normalized_metrics = evaluate(Y_test, predictions, 261 | Y_test_inv_scaled, predictions_inv_scaled) 262 | #grouping the actual prices and predictions 263 | print("\n> grouping the actual prices and predictions...") 264 | feature_cols = df_test.columns.tolist() 265 | feature_cols.remove("actual_price") 266 | df_test.drop(columns=feature_cols, inplace=True) 267 | df_test.rename(columns={"actual_price" : 'Actual'}, inplace=True) 268 | df_test = df_test.iloc[time_steps+future_gap-1:] 269 | df_test['Actual'] = Y_test_inv_scaled 270 | df_test['Prediction'] = predictions_inv_scaled 271 | 272 | return normalized_metrics, inv_normalized_metrics, df_test -------------------------------------------------------------------------------- /machine_learning/development/optimized_lstm/lstm_main.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.optimized_lstm import lstm 2 | from machine_learning.development.new_regression.new_dataset import compute_mape 3 | from keras.callbacks import EarlyStopping 4 | from sklearn.metrics import mean_squared_error 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | from sklearn.metrics import r2_score 8 | 9 | def main(internal_eval=False): 10 | #building the dataset 11 | print("> building the dataset...") 12 | stock_symbol = '^GSPC' 13 | start_date = '1950-01-01' 14 | end_date = '2017-12-31' 15 | window = 2 16 | dataframe, scaler = lstm.bulid_TIs_dataset(stock_symbol, start_date, end_date, window) 17 | 18 | #reshaping the dataset for LSTM 19 | print("\n> reshaping the dataset for LSTM...") 20 | dataset = dataframe.values 21 | time_steps = 1 #1 trading day 22 | future_gap = 1 #1 trading day 23 | split = 0.8 #80% of the dataset 24 | X_train, Y_train, X_test, Y_test = lstm.lstm_dataset_reshape(dataset, time_steps, future_gap, split) 25 | 26 | #building the LSTM model 27 | print("\n> building the LSTM model...") 28 | features = X_train.shape[2] 29 | neurons = [256, 256, 32, 1] 30 | drop_out = 0.2 31 | verbose = 1 32 | model = lstm.build_model(time_steps, features, neurons, drop_out) 33 | 34 | #fitting the training data 35 | print("\n> fitting the training data...") 36 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 37 | patience=50, verbose=verbose, mode='auto') 38 | batch_size = 2048 39 | epochs = 300 40 | validation_split = 0.1 41 | history = lstm.model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, 42 | verbose, [early_stopping_callback]) 43 | 44 | #internal evaluation 45 | if internal_eval: 46 | print("\n> internal evaluation...") 47 | _, _ = lstm.evaluate_model(model, X_train, Y_train, X_test, Y_test, verbose) 48 | 49 | #predictions 50 | predictions = model.predict(X_test) 51 | predictions = predictions.reshape((predictions.shape[0], 1)) 52 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 53 | 54 | #evaluating the model on the normalized dataset 55 | rmse = (mean_squared_error(predictions, Y_test) ** 0.5) 56 | print('\nNormalized Test RMSE: %.3f' %(rmse)) 57 | mape = compute_mape(Y_test, predictions) 58 | print('Normalized Outsample MAPE: %.3f' %(mape)) 59 | correlation = np.corrcoef(predictions.T, Y_test.T) 60 | print("Normalized Correlation: %.3f"%(correlation[0, 1])) 61 | r2 = r2_score(Y_test, predictions) 62 | print("Normalized Outsample r^2: %.3f"%(r2)) 63 | 64 | #evaluating the model on the inverse-normalized dataset 65 | predictions_inv_scaled = scaler.inverse_transform(predictions) 66 | Y_test_inv_scaled = 
scaler.inverse_transform(Y_test) 67 | 68 | rmse = (mean_squared_error(predictions_inv_scaled, Y_test_inv_scaled) ** 0.5) 69 | print('\nInverse-Normalized Outsample RMSE: %.3f' %(rmse)) 70 | mape = compute_mape(Y_test_inv_scaled, predictions_inv_scaled) 71 | print('Inverse-Normalized Outsample MAPE: %.3f' %(mape)) 72 | correlation = np.corrcoef(predictions_inv_scaled.T, Y_test_inv_scaled.T) 73 | print("Inverse-Normalized Outsample Correlation: %.3f"%(correlation[0, 1])) 74 | r2 = r2_score(Y_test_inv_scaled, predictions_inv_scaled) 75 | print("Inverse-Normalized Outsample r^2: %.3f"%(r2)) 76 | 77 | #plotting the results 78 | print("\n> plotting the results...") 79 | _, ax2 = plt.subplots() 80 | '''ax1.plot(history.history['loss'], label='Training') 81 | ax1.plot(history.history['val_loss'], label='Validation') 82 | ax1.set_xlabel('Epoch #') 83 | ax1.set_ylabel('Loss') 84 | ax1.legend(loc='best') 85 | ax1.grid(True) 86 | ''' 87 | ax2.plot(range(len(predictions_inv_scaled)), predictions_inv_scaled, label='Prediction') 88 | ax2.plot(range(len(Y_test_inv_scaled)), Y_test_inv_scaled, label='Actual') 89 | ax2.set_xlabel('Trading Day') 90 | ax2.set_ylabel('Price') 91 | ax2.legend(loc='best') 92 | ax2.grid(True) 93 | 94 | plt.show() 95 | 96 | main() 97 | 98 | 99 | #to be stored temporarily 100 | '''#evaluating the model on the *inverse-normalized dataset* 101 | if print_flag: 102 | print("\n> evaluating the model on the *dataset*...") 103 | predictions = model.predict(X_test) 104 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 105 | 106 | predictions_inv_scaled = scaler.inverse_transform(predictions) 107 | Y_test_inv_scaled = scaler.inverse_transform(Y_test) 108 | 109 | rmse = (mean_squared_error(predictions_inv_scaled, Y_test_inv_scaled) ** 0.5) 110 | if print_flag: 111 | print('Outsample RMSE: %.3f' %(rmse)) 112 | #correlation = np.corrcoef(predictions_inv_scaled, Y_test_inv_scaled) 113 | #print("Outsample Correlation: %.3f"%(correlation[0, 1])) 114 | ''' -------------------------------------------------------------------------------- /machine_learning/development/optimized_lstm/optimal_hyperparameters.txt: -------------------------------------------------------------------------------- 1 | dropout: 0.2, neurons: [256, 256, 32, 1], decay: 0.1, time_steps: 5, batch_size: 2048, epochs: 300, time elapsed: 90345.647s. 
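The index arithmetic in `lstm.lstm_dataset_reshape` is the easiest part of the LSTM pipeline to misread, so a minimal, self-contained sketch of the same windowing on toy data may help. This is an illustration only: the toy array is not repository data, and only the `time_steps`/`future_gap` alignment logic mirrors the function defined in `optimized_lstm/lstm.py` above.

```python
#illustration only: the sliding-window reshape used by lstm_dataset_reshape,
#on a toy dataset of 10 "days" with one feature column and one price column
import numpy as np

dataset = np.arange(20, dtype=float).reshape(10, 2)
X, Y = dataset[:, :-1], dataset[:, -1]

time_steps, future_gap = 3, 1
#sample i holds the features of days i..i+time_steps-1
X_sampled = np.array([X[i:i + time_steps] for i in range(X.shape[0] - time_steps + 1)])
#drop the trailing windows that have no label, then label each remaining
#window with the price future_gap days after its last day
X_sampled = X_sampled[:-future_gap]
Y_sampled = Y[time_steps + future_gap - 1:]

print(X_sampled.shape, Y_sampled.shape)  #(7, 3, 1) (7,)
```

With `time_steps=3` and `future_gap=1`, the window covering days 0-2 is labelled with the day-3 price, which is exactly the alignment the LSTM training and testing code above relies on.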
-------------------------------------------------------------------------------- /machine_learning/development/original_evaluation.md: -------------------------------------------------------------------------------- 1 | ## Algorithms Evaluation 2 | *Algorithm: (RMSE, Correlation)* 3 | 4 | * Linear Regression: (3.328, 0.948) 5 | ```sh 6 | python -m machine_learning.development.linear_regression 7 | ``` 8 | ![Linear Regression](https://github.com/ahmedhamdi96/ML4T/blob/master/results/lin_reg.png) 9 | 10 | * kNN Regression: (2.142, 0.905) 11 | ```sh 12 | python -m machine_learning.development.knn_regression 13 | ``` 14 | ![kNN Regression](https://github.com/ahmedhamdi96/ML4T/blob/master/results/knn.png) 15 | 16 | * Keras Regression: (3.360, 0.947) 17 | ```sh 18 | python -m machine_learning.development.keras_ffnn 19 | ``` 20 | ![Keras Regression](https://github.com/ahmedhamdi96/ML4T/blob/master/results/ffnn_reg.png) 21 | 22 | * Keras RNN LSTM: (3.405, 0.949) 23 | ```sh 24 | python -m machine_learning.development.keras_lstm 25 | ``` 26 | ![Keras RNN LSTM](https://github.com/ahmedhamdi96/ML4T/blob/master/results/lstm.png) -------------------------------------------------------------------------------- /machine_learning/development/technical_indicators_dataset.py: -------------------------------------------------------------------------------- 1 | '''This file constructs a dataset to be used by the ML algorithms. 2 | The dataset consists of the past price and technical indicators as 3 | features, and the price as the output. The dataset is indexed by 4 | date, a row entry contains the price and techincal indicators of 5 | some day prior to the date index, and the price is the actual 6 | price of the stock at the date marked by the index. 7 | ''' 8 | from utils.util import get_stock_data 9 | import numpy as np 10 | import pandas as pd 11 | import talib as ta 12 | 13 | '''technical indicators computation functions 14 | 15 | *prices : adjusted closing stock prices 16 | *window : rolling statistics window 17 | ''' 18 | #BEGIN 19 | def compute_momentum_ratio(prices, window): 20 | #first window elements >> NA 21 | momentum_ratio = (prices/prices.shift(periods = window)) - 1 22 | return momentum_ratio 23 | 24 | def compute_sma_ratio(prices, window): 25 | #Simple Moving Average 26 | #first window-1 elements >> NA 27 | sma_ratio = (prices / prices.rolling(window = window).mean()) - 1 28 | return sma_ratio 29 | 30 | def compute_bollinger_bands_ratio(prices, window): 31 | #first window-1 elements >> NA 32 | bb_ratio = prices - prices.rolling(window = window).mean() 33 | bb_ratio = bb_ratio / (2 * prices.rolling(window = window).std()) 34 | return bb_ratio 35 | 36 | def compute_daily_return_volatility(prices, window): 37 | #first window-1 elements >> NA 38 | daily_return = (prices/prices.shift(periods= 1)) - 1 39 | volatility = daily_return.rolling(window=window).std() 40 | return volatility 41 | #END 42 | 43 | '''dataset constructor function 44 | 45 | *start_date : start date for the entire dataset (training and testing) 46 | *end_date : end date for the entire dataset (training and testing) 47 | *stock : stock label to be used in the dataset 48 | ''' 49 | def get_dataset_dataframe(start_date='17/12/2014', end_date = '31/12/2017', stock='IBM'): 50 | #importing stock data 51 | columns = ["Date", "Adj Close", "High", "Low", "Volume"] 52 | stock_df = get_stock_data(stock, start_date, end_date, columns=columns) 53 | date_range = pd.date_range(start_date, end_date) 54 | dataset_df = 
pd.DataFrame(index=date_range) 55 | #calculating technical indicators 56 | #make sure to include the last 2 weeks of 2014 to compensate calculations loss 57 | #1st week is lost in the preparation of the indicators 58 | #2nd week is lost to include the future gap 59 | future_gap = 5 #1 trading week 60 | dataset_df['price'] = stock_df["Adj Close"] 61 | dataset_df.dropna(subset=['price'], inplace=True) 62 | dataset_df['momentum'] = compute_momentum_ratio(stock_df["Adj Close"], future_gap) 63 | dataset_df['sma'] = compute_sma_ratio(stock_df["Adj Close"], future_gap) 64 | dataset_df['bolinger_band'] = compute_bollinger_bands_ratio(stock_df["Adj Close"], future_gap) 65 | dataset_df['sar'] = ta.SAR(stock_df["High"], stock_df["Low"]) 66 | dataset_df['rsi'] = ta.RSI(stock_df["Adj Close"], timeperiod=future_gap) 67 | dataset_df['obv'] = ta.OBV(stock_df["Adj Close"], stock_df["Volume"]) 68 | dataset_df['adosc'] = ta.ADOSC(stock_df["High"], stock_df["Low"], stock_df["Adj Close"], stock_df["Volume"], 69 | fastperiod=2, slowperiod=3) 70 | dataset_df['macd'], _, _ = ta.MACD(stock_df["Adj Close"], fastperiod=2, slowperiod=3, signalperiod=3) 71 | dataset_df['slowk '], dataset_df['slowd'] = ta.STOCH(stock_df["High"], stock_df["Low"], stock_df["Adj Close"], 72 | fastk_period=3, slowk_period=2, slowd_period=3) 73 | dataset_df['cci'] = ta.CCI(stock_df["High"], stock_df["Low"], stock_df["Adj Close"], timeperiod=future_gap) 74 | dataset_df['volatility'] = compute_daily_return_volatility(stock_df["Adj Close"], future_gap) 75 | dataset_df.dropna(subset=dataset_df.columns, inplace=True) 76 | dataset_df = dataset_df.shift(future_gap) 77 | shifted_columns_names = ['price(t-%d)' %(future_gap), 'moment(t-%d)' %(future_gap), 'sma(t-%d)' %(future_gap), 78 | 'b_band(t-%d)' %(future_gap), 'sar(t-%d)' %(future_gap), 'rsi(t-%d)' %(future_gap), 79 | 'obv(t-%d)' %(future_gap), 'adosc(t-%d)' %(future_gap), 'macd(t-%d)' %(future_gap), 80 | 'slowk(t-%d)' %(future_gap), 'slowd(t-%d)' %(future_gap), 'cci(t-%d)' %(future_gap), 81 | 'volatility(t-%d)' %(future_gap)] 82 | dataset_df.columns = shifted_columns_names 83 | dataset_df.dropna(subset=shifted_columns_names, inplace=True) 84 | dataset_df['price'] = stock_df["Adj Close"] 85 | 86 | return dataset_df -------------------------------------------------------------------------------- /machine_learning/development/testing/analysis.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.optimized_lstm import lstm 2 | from keras.callbacks import EarlyStopping 3 | 4 | #sudden vs normal 5 | stocks_list = ['TSLA'] 6 | show_plot = len(stocks_list) 7 | dates_dic = { 8 | 'TSLA': ['2013-01-01', '2013-06-01'], 9 | } 10 | 11 | window = 2 12 | future_gap = 1 13 | time_steps = 1 14 | neurons = [256, 256, 32, 1] 15 | drop_out = 0.2 16 | batch_size = 2048 17 | epochs = 300 18 | validation_split = 0.1 19 | verbose = 1 20 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 21 | patience=50, verbose=verbose, mode='auto') 22 | callbacks = [early_stopping_callback] 23 | 24 | for stock in stocks_list: 25 | show_plot -= 1 26 | show_plot_flg = True if show_plot == 0 else False 27 | start_date = dates_dic[stock][0] 28 | end_date = dates_dic[stock][1] 29 | lstm.test_lstm(stock, start_date, end_date, window, future_gap, time_steps, 30 | neurons, drop_out, batch_size, epochs, validation_split, verbose, callbacks, show_plot_flg) 31 | 32 | #sudden vs normal forecast annotations 33 | ''' 34 | ax.annotate('Normal Movement', 
xy=('2013-02-15', 40), xytext=('2013-03-05', 50), fontsize=10, 35 | arrowprops=dict(facecolor='black', shrink=0.1, headwidth=8)) 36 | ax.annotate('Sudden Change', xy=('2013-05-10', 55), xytext=('2013-03-05', 70), fontsize=10, 37 | arrowprops=dict(facecolor='black', shrink=0.1, headwidth=8)) 38 | ''' -------------------------------------------------------------------------------- /machine_learning/development/testing/companies.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.optimized_lstm import lstm 2 | from keras.callbacks import EarlyStopping 3 | 4 | stocks_list = ['FB', 'AAPL', 'TSLA', 'AMZN'] 5 | show_plot = len(stocks_list) 6 | dates_dic = { 7 | 'FB' : ['2017-12-01', '2018-05-01'], 8 | 'AAPL': ['2012-08-01', '2013-08-01'], 9 | 'TSLA': ['2013-08-01', '2014-01-01'], 10 | 'AMZN': ['2017-08-01', '2018-04-01'], 11 | } 12 | 13 | window = 2 14 | future_gap = 1 15 | time_steps = 1 16 | neurons = [256, 256, 32, 1] 17 | drop_out = 0.2 18 | batch_size = 2048 19 | epochs = 300 20 | validation_split = 0.1 21 | verbose = 1 22 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 23 | patience=50, verbose=verbose, mode='auto') 24 | callbacks = [early_stopping_callback] 25 | 26 | for stock in stocks_list: 27 | show_plot -= 1 28 | show_plot_flg = True if show_plot == 0 else False 29 | start_date = dates_dic[stock][0] 30 | end_date = dates_dic[stock][1] 31 | lstm.test_lstm(stock, start_date, end_date, window, future_gap, time_steps, 32 | neurons, drop_out, batch_size, epochs, validation_split, verbose, callbacks, show_plot_flg) -------------------------------------------------------------------------------- /machine_learning/development/testing/future_gap_test.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.optimized_lstm import lstm 2 | from keras.callbacks import EarlyStopping 3 | import matplotlib.pyplot as plt 4 | 5 | stock = 'AAPL' 6 | dates_dic = { 7 | 'AAPL' : ['2017-01-01', '2018-01-01'] 8 | } 9 | 10 | window = 2 11 | future_gap = 1 12 | time_steps = 1 13 | neurons = [256, 256, 32, 1] 14 | drop_out = 0.2 15 | batch_size = 2048 16 | epochs = 300 17 | validation_split = 0.1 18 | verbose = 1 19 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 20 | patience=50, verbose=verbose, mode='auto') 21 | callbacks = [early_stopping_callback] 22 | 23 | #future_gap test 24 | future_gap_list = [1, 5, 20] 25 | future_gap_dic = { 26 | 1 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], 27 | 5 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], 28 | 20 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 29 | } 30 | future_gap_plots = { 31 | 1 : None, 32 | 5 : None, 33 | 20 : None 34 | } 35 | 36 | for future_gap in future_gap_list: 37 | start_date = dates_dic[stock][0] 38 | end_date = dates_dic[stock][1] 39 | normalized_metrics, inv_normalized_metrics, df = lstm.final_test_lstm(stock, start_date, 40 | end_date, window, future_gap, time_steps, neurons, drop_out, batch_size, epochs, validation_split, 41 | verbose, callbacks) 42 | future_gap_dic[future_gap][0] = normalized_metrics 43 | future_gap_dic[future_gap][1] = inv_normalized_metrics 44 | future_gap_plots[future_gap] = df 45 | 46 | print(future_gap_dic) 47 | 48 | fig, (ax1, ax2, ax3) = plt.subplots(3, 1) 49 | 50 | df = future_gap_plots[1] 51 | ax1.plot(df.index, df["Actual"], label='Actual') 52 | ax1.plot(df.index, 
df["Prediction"], label='Prediction')
53 | ax1.set_title('Future Gap = 1')
54 | ax1.set_xlabel('Date')
55 | ax1.set_ylabel('Price')
56 | ax1.legend(loc="best")
57 | ax1.grid(True)
58 | 
59 | df = future_gap_plots[5]
60 | ax2.plot(df.index, df["Actual"], label='Actual')
61 | ax2.plot(df.index, df["Prediction"], label='Prediction')
62 | ax2.set_title('Future Gap = 5')
63 | ax2.set_xlabel('Date')
64 | ax2.set_ylabel('Price')
65 | ax2.legend(loc="best")
66 | ax2.grid(True)
67 | 
68 | df = future_gap_plots[20]
69 | ax3.plot(df.index, df["Actual"], label='Actual')
70 | ax3.plot(df.index, df["Prediction"], label='Prediction')
71 | ax3.set_title('Future Gap = 20')
72 | ax3.set_xlabel('Date')
73 | ax3.set_ylabel('Price')
74 | ax3.legend(loc="best")
75 | ax3.grid(True)
76 | 
77 | fig.tight_layout()
78 | plt.show()
--------------------------------------------------------------------------------
/machine_learning/development/testing/lag_metric.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | 
4 | def compute_lag_metric(actual, prediction, lookup, symbol):
5 |     diff_list = [None] * lookup
6 |     lag_list = [None] * (len(actual)-lookup+1)
7 | 
8 |     for i in range(len(actual)-lookup+1):
9 |         for j in range(lookup):
10 |             diff_list[j] = abs(actual[i] - prediction[i+j])
11 |         lag_list[i] = diff_list.index(min(diff_list))
12 | 
13 |     max_diff_count = [0] * lookup
14 | 
15 |     for i in range(len(lag_list)):
16 |         max_diff_count[lag_list[i]] += 1
17 | 
18 |     _, ax = plt.subplots()
19 |     ax.bar(range(len(max_diff_count)), max_diff_count, align='center')
20 |     plt.sca(ax)
21 |     plt.title(symbol+" Lag Test")
22 |     ax.set_xlabel('Day Lag')
23 |     ax.set_ylabel('Frequency')
24 |     ax.grid(True)
25 | 
26 |     _, ax1 = plt.subplots()
27 |     ax1.scatter(range(len(lag_list)), lag_list)
28 |     plt.title(symbol+" Daily Lag Test")
29 |     ax1.set_xlabel('Trading Day')
30 |     ax1.set_ylabel('Lag')
31 |     ax1.grid(True)
32 | 
33 |     return lag_list
--------------------------------------------------------------------------------
/machine_learning/development/testing/results/amazon.md:
--------------------------------------------------------------------------------
1 | ## Amazon
2 | 
3 | ### Exceeding Q3 expectations, [September/2017 - February/2018]
4 | 
5 | ![Amazon](https://github.com/ahmedhamdi96/ML4T/blob/master/results/amazon.png)
--------------------------------------------------------------------------------
/machine_learning/development/testing/results/analysis.md:
--------------------------------------------------------------------------------
1 | ## Analysis with PAL
2 | 
3 | ### Sudden Changes vs Normal Movements
4 | 
5 | This forecast predicts the Tesla stock price between 01/01/2013 and 01/06/2013. PAL is also used to analyze the behaviour of the model during the two different periods a stock usually goes through: a normal movement, where the stock price fluctuates with no dramatic change, and a sudden change, where the stock moves violently upwards, downwards, or up and down with high volatility.
6 | 
7 | ![SvN](https://github.com/ahmedhamdi96/ML4T/blob/master/results/sudden_vs_normal.png)
8 | 
9 | > Up until 07/05/2013, the stock price exhibits normal movement; no violent trajectories appear.
10 | > This is when the model performs best. The forecast does not lag the actual price, and follows the same trend
11 | > and movement of the actual price. Starting from 07/05/2013, the stock moves up with a steep trajectory, and
12 | > during that sudden change is when the model performs poorly. Upon researching news about Tesla in May/2013,
13 | > it was discovered that the company reported its first quarterly profit and that its flagship at the time, the
14 | > Model S, received the best review of any car in Consumer Reports magazine's history, see the report
15 | > [here](http://money.cnn.com/2013/05/10/investing/tesla-stock). This positive news caused an unexpected and
16 | > sudden surge in Tesla's stock price. A hypothesis that can be proposed from this is that the model is capable
17 | > of predicting the price and the fluctuations in price caused by the stock market movement, but when external
18 | > events impact the stock price suddenly, the model naturally does not pick up on these events.
19 | 
20 | ![Lag](https://github.com/ahmedhamdi96/ML4T/blob/master/results/sudden_vs_normal_lag.png)
21 | 
22 | > This plot shows how often the prediction was closest to the actual price at each lag; the day lag indicates
23 | > the number of days it took for the forecast to best match the actual price.
24 | 
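To make the metric concrete, below is a small worked example of the lag computation implemented in `lag_metric.py` (the values are illustrative, not taken from the Tesla run):

```python
# PAL: for each day t, find the offset j in [0, lookup) that minimizes
# |actual[t] - prediction[t+j]|, i.e. how many days the forecast trails
actual     = [10.0, 11.0, 12.0]
prediction = [9.0, 10.1, 11.0, 11.9, 12.2]
lookup = 3
for t in range(len(actual)):
    diffs = [abs(actual[t] - prediction[t+j]) for j in range(lookup)]
    print(t, diffs.index(min(diffs)))  # prints a lag of 1 on every day
# the forecast trails the actual series by one day throughout
```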
25 | ![Daily Lag](https://github.com/ahmedhamdi96/ML4T/blob/master/results/sudden_vs_normal_daily_lag.png)
26 | 
27 | > This plot follows the same timeline as the forecast on the x-axis, plotted against the lag on the y-axis. It
28 | > supports the hypothesis mentioned earlier: the model finds the closest prediction to the actual price early
29 | > on during the normal movement phase, and lags at the end of the timeline during the sudden change phase.
30 | 
31 | ![SvN w/ PAL](https://github.com/ahmedhamdi96/ML4T/blob/master/results/sudden_vs_normal_pal.png)
32 | 
33 | ![SvN w/ PAL 1](https://github.com/ahmedhamdi96/ML4T/blob/master/results/sudden_vs_normal_pal_1.png)
--------------------------------------------------------------------------------
/machine_learning/development/testing/results/apple.md:
--------------------------------------------------------------------------------
1 | ## Apple
2 | 
3 | ### Apple's first free fall, [September/2012 - June/2013]
4 | 
5 | ![Apple](https://github.com/ahmedhamdi96/ML4T/blob/master/results/apple.png)
--------------------------------------------------------------------------------
/machine_learning/development/testing/results/eval.md:
--------------------------------------------------------------------------------
1 | ## Evaluation Metrics
2 | 
3 | ### Apple 2017 Stock Price Forecast
4 | 
5 | | Future Gap | RMSE | NRMSE | MAE | MAPE | Corr | R^2 |
6 | | :--------: | :--: | :--: | :--: | :--: | :--: | :--: |
7 | | 1 day | 0.0281 | 0.0492 | 0.0196 | 4.72 | 0.993 | 0.986 |
8 | | 1 week | 0.0672 | 0.116 | 0.0524 | 11.3 | 0.967 | 0.915 |
9 | | 1 month | 0.1539 | 0.252 | 0.129 | 23.7 | 0.827 | 0.396 |
10 | 
11 | ![Future Gap](https://github.com/ahmedhamdi96/ML4T/blob/master/results/future_gap_test.png)
--------------------------------------------------------------------------------
/machine_learning/development/testing/results/facebook.md:
--------------------------------------------------------------------------------
1 | ## Facebook
2 | 
3 | ### Facebook–Cambridge Analytica data scandal, [January/2018 - March/2018]
4 | 
5 | ![Facebook](https://github.com/ahmedhamdi96/ML4T/blob/master/results/facebook.png)
--------------------------------------------------------------------------------
/machine_learning/development/testing/results/future_gap.md:
-------------------------------------------------------------------------------- 1 | ## Future Gap Test 2 | 3 | ### Microsoft 2017 Stock Price Forecast 4 | 5 | #### LSTM 6 | | Future Gap | RMSE | NRMSE | MAE | Corr | R^2 | 7 | | :--------: | :--: | :--: | :--: | :--: | :--: | 8 | | 1 Day | 0.0273 | 0.0676 | 0.0184 | 0.995 | 0.991 | 9 | | 2 Days | 0.0369 | 0.0909 | 0.0254 | 0.992 | 0.983 | 10 | | 3 Days | 0.0437 | 0.1070 | 0.0314 | 0.989 | 0.976 | 11 | | 4 Days | 0.0496 | 0.1210 | 0.0363 | 0.985 | 0.969 | 12 | | 5 Days | 0.0568 | 0.1380 | 0.0421 | 0.981 | 0.959 | 13 | 14 | #### Linear Regressor 15 | | Future Gap | RMSE | NRMSE | MAE | Corr | R^2 | 16 | | :--------: | :--: | :--: | :--: | :--: | :--: | 17 | | 1 Day | 0.0275 | 0.0679 | 0.0185 | 0.993 | 0.990 | 18 | | 2 Days | 0.0372 | 0.0917 | 0.0260 | 0.992 | 0.983 | 19 | | 3 Days | 0.0441 | 0.1080 | 0.0317 | 0.989 | 0.976 | 20 | | 4 Days | 0.0504 | 0.1230 | 0.0366 | 0.985 | 0.968 | 21 | | 5 Days | 0.0572 | 0.1390 | 0.0422 | 0.981 | 0.958 | 22 | 23 | #### FFNN 24 | | Future Gap | RMSE | NRMSE | MAE | Corr | R^2 | 25 | | :--------: | :--: | :--: | :--: | :--: | :--: | 26 | | 1 Day | 0.0376 | 0.0931 | 0.0278 | 0.994 | 0.982 | 27 | | 2 Days | 0.0474 | 0.1170 | 0.0335 | 0.991 | 0.972 | 28 | | 3 Days | 0.0691 | 0.1700 | 0.0501 | 0.984 | 0.939 | 29 | | 4 Days | 0.0535 | 0.1310 | 0.0389 | 0.982 | 0.964 | 30 | | 5 Days | 0.0709 | 0.1729 | 0.0512 | 0.972 | 0.936 | 31 | 32 | *Shown below are the forecasts of the LSTM RNN model* 33 | 34 | ![Future Gap](https://github.com/ahmedhamdi96/ML4T/blob/master/results/experiments/exp4/gap1.png) 35 | 36 | ![Future Gap](https://github.com/ahmedhamdi96/ML4T/blob/master/results/experiments/exp4/gap2.png) 37 | 38 | ![Future Gap](https://github.com/ahmedhamdi96/ML4T/blob/master/results/experiments/exp4/gap3.png) 39 | 40 | ![Future Gap](https://github.com/ahmedhamdi96/ML4T/blob/master/results/experiments/exp4/gap4.png) 41 | 42 | ![Future Gap](https://github.com/ahmedhamdi96/ML4T/blob/master/results/experiments/exp4/gap5.png) -------------------------------------------------------------------------------- /machine_learning/development/testing/results/tesla.md: -------------------------------------------------------------------------------- 1 | ## Tesla 2 | 3 | ### Analysts downgrades, [September/2013 - November/2013] 4 | 5 | ![Tesla](https://github.com/ahmedhamdi96/ML4T/blob/master/results/tesla.png) -------------------------------------------------------------------------------- /machine_learning/development/testing/results/window_and_ts.md: -------------------------------------------------------------------------------- 1 | ## Window and Time Steps Test 2 | 3 | ### Time Steps Test 4 | 5 | | Time Steps | RMSE | MAPE | Corr | R^2 | 6 | | :--------: | :--: | :--: | :--: | :--: | 7 | | 1 | 0.0317 | 5.26 | 0.993 | 0.982 | 8 | | 2 | 0.0338 | 5.45 | 0.990 | 0.979 | 9 | | 3 | 0.0452 | 7.89 | 0.988 | 0.961 | 10 | | 4 | 0.0462 | 7.77 | 0.985 | 0.959 | 11 | | 5 | 0.0538 | 8.91 | 0.982 | 0.942 | 12 | 13 | Winner: 1 14 | 15 | ### Window Test 16 | 17 | | Window | RMSE | MAPE | Corr | R^2 | 18 | | :----: | :--: | :--: | :--: | :--: | 19 | | 2 | 0.0299 | 4.84 | 0.994 | 0.984 | 20 | | 3 | 0.0294 | 5.31 | 0.993 | 0.985 | 21 | | 4 | 0.0336 | 7.85 | 0.992 | 0.981 | 22 | | 5 | 0.0287 | inf | 0.993 | 0.986 | 23 | 24 | Winner: The metrics are not decisive enough, so a plot test could help. 
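A note on the `inf` MAPE for window 5: it is most likely an artifact of evaluating on Min-Max-scaled prices, since the minimum price in the test span scales to exactly 0 and MAPE divides by the true value. A minimal illustration, using `compute_mape` as defined in `machine_learning/final/evaluation/metrics.py`:

```python
import numpy as np

def compute_mape(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

y_true = np.array([0.0, 0.5, 1.0])  # MinMax-scaled prices: the minimum maps to 0
y_pred = np.array([0.1, 0.4, 0.9])
print(compute_mape(y_true, y_pred))  # division by zero -> inf
```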
25 | 26 | ![Window 2,3](https://github.com/ahmedhamdi96/ML4T/blob/master/results/window_test_1.png) 27 | ![Window 4,5](https://github.com/ahmedhamdi96/ML4T/blob/master/results/window_test_2.png) 28 | 29 | -------------------------------------------------------------------------------- /machine_learning/development/testing/test.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.optimized_lstm import lstm 2 | from keras.callbacks import EarlyStopping 3 | 4 | stock = 'AAPL' 5 | dates_dic = { 6 | 'AAPL' : ['2017-01-01', '2018-01-01'] 7 | } 8 | 9 | window = 2 10 | future_gap = 1 11 | time_steps = 1 12 | neurons = [256, 256, 32, 1] 13 | drop_out = 0.2 14 | batch_size = 2048 15 | epochs = 300 16 | validation_split = 0.1 17 | verbose = 1 18 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 19 | patience=50, verbose=verbose, mode='auto') 20 | callbacks = [early_stopping_callback] 21 | 22 | #window test 23 | window_list = [2,3,4,5] 24 | window_dic = { 25 | 2 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], 26 | 3 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], 27 | 4 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], 28 | 5 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 29 | } 30 | 31 | #time_steps test 32 | time_steps_list = [1,2,3,4,5] 33 | time_steps_dic = { 34 | 1 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], 35 | 2 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], 36 | 3 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], 37 | 4 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], 38 | 5 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 39 | } 40 | 41 | for window in window_list: 42 | start_date = dates_dic[stock][0] 43 | end_date = dates_dic[stock][1] 44 | normalized_metrics, inv_normalized_metrics, df = lstm.final_test_lstm(stock, start_date, 45 | end_date, window, future_gap, time_steps, neurons, drop_out, batch_size, epochs, validation_split, 46 | verbose, callbacks) 47 | window_dic[window][0] = normalized_metrics 48 | window_dic[window][1] = inv_normalized_metrics 49 | 50 | print(window_dic) -------------------------------------------------------------------------------- /machine_learning/development/testing/window_plot_test.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.optimized_lstm import lstm 2 | from keras.callbacks import EarlyStopping 3 | import matplotlib.pyplot as plt 4 | 5 | stock = 'AAPL' 6 | dates_dic = { 7 | 'AAPL' : ['2017-01-01', '2018-01-01'] 8 | } 9 | 10 | window = 2 11 | future_gap = 1 12 | time_steps = 1 13 | neurons = [256, 256, 32, 1] 14 | drop_out = 0.2 15 | batch_size = 2048 16 | epochs = 300 17 | validation_split = 0.1 18 | verbose = 1 19 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 20 | patience=50, verbose=verbose, mode='auto') 21 | callbacks = [early_stopping_callback] 22 | 23 | #window test 24 | window_list = [2,3,4,5] 25 | window_dic = { 26 | 2 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0], None], 27 | 3 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0], None], 28 | 4 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0], None], 29 | 5 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0], None] 30 | } 31 | 32 | window = 5 33 | 34 | for 
window in window_list:
35 |     start_date = dates_dic[stock][0]
36 |     end_date = dates_dic[stock][1]
37 |     normalized_metrics, inv_normalized_metrics, df = lstm.final_test_lstm(stock, start_date, 
38 |     end_date, window, future_gap, time_steps, neurons, drop_out, batch_size, epochs, validation_split, 
39 |     verbose, callbacks)
40 |     window_dic[window][0] = normalized_metrics
41 |     window_dic[window][1] = inv_normalized_metrics
42 |     window_dic[window][2] = df
43 | 
44 | fig1, (ax1, ax2) = plt.subplots(2, 1)
45 | fig2, (ax3, ax4) = plt.subplots(2, 1)
46 | 
47 | df = window_dic[2][2]
48 | ax1.plot(df.index, df["Actual"], label='Actual')
49 | ax1.plot(df.index, df["Prediction"], label='Prediction')
50 | ax1.set_title('Window = 2')
51 | ax1.set_xlabel('Date')
52 | ax1.set_ylabel('Price')
53 | ax1.legend(loc="best")
54 | ax1.grid(True)
55 | 
56 | df = window_dic[3][2]
57 | ax2.plot(df.index, df["Actual"], label='Actual')
58 | ax2.plot(df.index, df["Prediction"], label='Prediction')
59 | ax2.set_title('Window = 3')
60 | ax2.set_xlabel('Date')
61 | ax2.set_ylabel('Price')
62 | ax2.legend(loc="best")
63 | ax2.grid(True)
64 | 
65 | df = window_dic[4][2]
66 | ax3.plot(df.index, df["Actual"], label='Actual')
67 | ax3.plot(df.index, df["Prediction"], label='Prediction')
68 | ax3.set_title('Window = 4')
69 | ax3.set_xlabel('Date')
70 | ax3.set_ylabel('Price')
71 | ax3.legend(loc="best")
72 | ax3.grid(True)
73 | 
74 | df = window_dic[5][2]
75 | ax4.plot(df.index, df["Actual"], label='Actual')
76 | ax4.plot(df.index, df["Prediction"], label='Prediction')
77 | ax4.set_title('Window = 5')
78 | ax4.set_xlabel('Date')
79 | ax4.set_ylabel('Price')
80 | ax4.legend(loc="best")
81 | ax4.grid(True)
82 | 
83 | fig1.tight_layout()
84 | fig2.tight_layout()
85 | plt.show()
--------------------------------------------------------------------------------
/machine_learning/final/evaluation/metrics.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn.metrics import mean_squared_error, mean_absolute_error
4 | from sklearn.metrics import r2_score
5 | 
6 | def compute_mape(y_true, y_pred):
7 |     return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
8 | 
9 | def evaluate(Y_test, predictions, Y_test_inv_scaled, predictions_inv_scaled):
10 |     rmse = (mean_squared_error(Y_test, predictions) ** 0.5)
11 |     print('\nNormalized RMSE: %.3f' %(rmse))
12 |     nrmse = ((mean_squared_error(Y_test, predictions) ** 0.5))/np.mean(Y_test)
13 |     print('Normalized NRMSE: %.3f' %(nrmse))
14 |     mae = mean_absolute_error(Y_test, predictions)
15 |     print('Normalized MAE: %.3f' %(mae))
16 |     mape = compute_mape(Y_test, predictions)
17 |     print('Normalized MAPE: %.3f' %(mape))
18 |     correlation = np.corrcoef(Y_test.T, predictions.T)
19 |     print("Normalized Correlation: %.3f"%(correlation[0, 1]))
20 |     r2 = r2_score(Y_test, predictions)
21 |     print("Normalized r^2: %.3f"%(r2))
22 |     normalized_metrics = [rmse, nrmse, mae, mape, correlation[0, 1], r2]
23 | 
24 |     #evaluating the model on the inverse-normalized dataset
25 |     rmse = (mean_squared_error(Y_test_inv_scaled, predictions_inv_scaled) ** 0.5)
26 |     print('\nInverse-Normalized Outsample RMSE: %.3f' %(rmse))
27 |     nrmse = ((mean_squared_error(Y_test_inv_scaled, predictions_inv_scaled) ** 0.5))/np.mean(Y_test_inv_scaled)
28 |     print('Inverse-Normalized Outsample NRMSE: %.3f' %(nrmse))
29 |     mae = mean_absolute_error(Y_test_inv_scaled, predictions_inv_scaled)
30 |     print('Inverse-Normalized Outsample MAE: %.3f' %(mae))
31 |     mape = compute_mape(Y_test_inv_scaled, 
predictions_inv_scaled) 32 | print('Inverse-Normalized Outsample MAPE: %.3f' %(mape)) 33 | correlation = np.corrcoef(Y_test_inv_scaled.T, predictions_inv_scaled.T) 34 | print("Inverse-Normalized Outsample Correlation: %.3f"%(correlation[0, 1])) 35 | r2 = r2_score(Y_test_inv_scaled, predictions_inv_scaled) 36 | print("Inverse-Normalized Outsample r^2: %.3f"%(r2)) 37 | inv_normalized_metrics = [rmse, nrmse, mae, mape, correlation[0, 1], r2] 38 | 39 | return normalized_metrics, inv_normalized_metrics 40 | 41 | def compute_lag_metric(actual, prediction, lookup, symbol): 42 | diff_list = [None] * lookup 43 | lag_list = [None] * (len(actual)-lookup+1) 44 | 45 | for i in range(len(actual)-lookup+1): 46 | for j in range(lookup): 47 | diff_list[j] = abs(actual[i] - prediction[i+j]) 48 | lag_list[i] = diff_list.index(min(diff_list)) 49 | 50 | max_diff_count = [0] * lookup 51 | 52 | for i in range(len(lag_list)): 53 | max_diff_count[lag_list[i]] += 1 54 | 55 | _, ax = plt.subplots() 56 | ax.bar(range(len(max_diff_count)), max_diff_count, align='center') 57 | plt.sca(ax) 58 | plt.title(symbol+" Lag Test") 59 | ax.set_xlabel('Day Lag') 60 | ax.set_ylabel('Frequency') 61 | ax.grid(True) 62 | 63 | _, ax1 = plt.subplots() 64 | index = actual[:len(actual)-lookup+1].index 65 | ax1.scatter(index, lag_list) 66 | plt.title(symbol+" Daily Lag Test") 67 | ax1.set_xlabel('Date') 68 | ax1.set_ylabel('Lag') 69 | ax1.grid(True) 70 | 71 | return lag_list -------------------------------------------------------------------------------- /machine_learning/final/experiments/exp1.py: -------------------------------------------------------------------------------- 1 | from utils.util import plot_data 2 | from machine_learning.final.models import lstm 3 | from machine_learning.final.models import ffnn 4 | from machine_learning.final.models import lin_reg 5 | from machine_learning.final.models import knn_reg 6 | from keras.callbacks import EarlyStopping 7 | import matplotlib.pyplot as plt 8 | 9 | #models comparison 10 | stock = 'AAPL' 11 | dates_dic = { 12 | 'AAPL' : ['2017-01-01', '2018-01-01'] 13 | } 14 | metrics_dic = { 15 | 'LSTM' : [], 16 | 'FFNN' : [], 17 | 'LinReg' : [], 18 | 'kNNReg' : [] 19 | } 20 | 21 | window = 2 22 | future_gap = 1 23 | time_steps = 1 24 | neurons = [256, 256, 32, 1] 25 | drop_out = 0.2 26 | batch_size = 2048 27 | epochs = 300 28 | validation_split = 0.1 29 | verbose = 1 30 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 31 | patience=50, verbose=verbose, mode='auto') 32 | callbacks = [early_stopping_callback] 33 | start_date = dates_dic[stock][0] 34 | end_date = dates_dic[stock][1] 35 | 36 | #LSTM 37 | normalized_metrics, inv_normalized_metrics, df = lstm.final_test_lstm(stock, start_date, 38 | end_date, window, future_gap, time_steps, neurons, drop_out, batch_size, epochs, validation_split, 39 | verbose, callbacks) 40 | metrics_dic['LSTM'] = normalized_metrics 41 | plot_data(df, stock+" 2017 Price Forecast (LSTM)", "Date", "Price", show_plot=False) 42 | 43 | #FFNN 44 | neurons = [256, 256, 64, 1] 45 | batch_size = 128 46 | epochs = 200 47 | 48 | normalized_metrics, inv_normalized_metrics, df = ffnn.final_test_ffnn(stock, start_date, 49 | end_date, window, future_gap, neurons, drop_out, batch_size, epochs, validation_split, 50 | verbose, callbacks) 51 | metrics_dic['FFNN'] = normalized_metrics 52 | plot_data(df, stock+" 2017 Price Forecast (FFNN)", "Date", "Price", show_plot=False) 53 | 54 | #LinReg 55 | normalized_metrics, inv_normalized_metrics, df = 
lin_reg.final_test_linreg(stock, start_date, 56 | end_date, window, future_gap) 57 | metrics_dic['LinReg'] = normalized_metrics 58 | plot_data(df, stock+" 2017 Price Forecast (LinReg)", "Date", "Price", show_plot=False) 59 | 60 | #kNNReg 61 | k = 100 62 | 63 | normalized_metrics, inv_normalized_metrics, df = knn_reg.final_test_knnreg(stock, start_date, 64 | end_date, window, future_gap, k) 65 | metrics_dic['kNNReg'] = normalized_metrics 66 | plot_data(df, stock+" 2017 Price Forecast (kNNReg)", "Date", "Price", show_plot=False) 67 | 68 | print(metrics_dic) 69 | plt.show() -------------------------------------------------------------------------------- /machine_learning/final/experiments/exp2.py: -------------------------------------------------------------------------------- 1 | from machine_learning.final.models import lstm 2 | from machine_learning.final.evaluation.metrics import compute_lag_metric 3 | from keras.callbacks import EarlyStopping 4 | import matplotlib.pyplot as plt 5 | 6 | #sudden vs normal 7 | stock = 'TSLA' 8 | dates_dic = { 9 | 'TSLA': ['2013-01-01', '2013-06-01'], 10 | } 11 | 12 | window = 2 13 | future_gap = 1 14 | time_steps = 1 15 | neurons = [256, 256, 32, 1] 16 | drop_out = 0.2 17 | batch_size = 2048 18 | epochs = 300 19 | validation_split = 0.1 20 | verbose = 1 21 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 22 | patience=50, verbose=verbose, mode='auto') 23 | callbacks = [early_stopping_callback] 24 | start_date = dates_dic[stock][0] 25 | end_date = dates_dic[stock][1] 26 | #LSTM 27 | normalized_metrics, inv_normalized_metrics, df = lstm.final_test_lstm(stock, start_date, 28 | end_date, window, future_gap, time_steps, neurons, drop_out, batch_size, epochs, 29 | validation_split, verbose, callbacks) 30 | #PAL 31 | lookup = 5 32 | lag_list = compute_lag_metric(df['Actual'], df['Prediction'], lookup, stock) 33 | #Price Forecast Plot 34 | df = df[:len(df)-lookup+1] 35 | ax = df.plot(title=stock+" Price Forecast") 36 | ax.set_xlabel("Date") 37 | ax.set_ylabel("Price") 38 | ax.legend(loc="best") 39 | ax.grid(True) 40 | ax.annotate('Normal Movement', xy=('2013-02-15', 40), xytext=('2013-03-05', 50), fontsize=10, 41 | arrowprops=dict(facecolor='black', shrink=0.1, headwidth=8)) 42 | ax.annotate('Sudden Change', xy=('2013-05-10', 55), xytext=('2013-03-05', 70), fontsize=10, 43 | arrowprops=dict(facecolor='black', shrink=0.1, headwidth=8)) 44 | #Price Forecast and PAL Overlay Plot 45 | ax = df.plot(title=stock+" Price Forecast and PAL Overlay") 46 | ax.set_xlabel("Date") 47 | ax.set_ylabel("Price") 48 | ax.legend(loc="best") 49 | ax.grid(True) 50 | ax.annotate('Normal Movement', xy=('2013-02-15', 40), xytext=('2013-03-05', 50), fontsize=10, 51 | arrowprops=dict(facecolor='black', shrink=0.1, headwidth=8)) 52 | ax.annotate('Sudden Change', xy=('2013-05-10', 55), xytext=('2013-03-05', 70), fontsize=10, 53 | arrowprops=dict(facecolor='black', shrink=0.1, headwidth=8)) 54 | ax1 = ax.twinx() 55 | ax1.scatter(df.index, lag_list, c='g') 56 | ax1.set_ylabel("PAL") 57 | 58 | plt.show() -------------------------------------------------------------------------------- /machine_learning/final/experiments/exp3.py: -------------------------------------------------------------------------------- 1 | from utils.util import plot_data 2 | from machine_learning.final.models import lstm 3 | from keras.callbacks import EarlyStopping 4 | import matplotlib.pyplot as plt 5 | 6 | #companies 7 | stocks_list = ['FB', 'AAPL', 'TSLA', 'AMZN'] 8 | dates_dic = { 9 | 'FB' : 
['2017-12-01', '2018-05-01'], 10 | 'AAPL': ['2012-08-01', '2013-08-01'], 11 | 'TSLA': ['2013-08-01', '2014-01-01'], 12 | 'AMZN': ['2017-08-01', '2018-04-01'], 13 | } 14 | metrics_dic = { 15 | 'FB' : [], 16 | 'AAPL' : [], 17 | 'TSLA' : [], 18 | 'AMZN' : [] 19 | } 20 | 21 | window = 2 22 | future_gap = 1 23 | time_steps = 1 24 | neurons = [256, 256, 32, 1] 25 | drop_out = 0.2 26 | batch_size = 2048 27 | epochs = 300 28 | validation_split = 0.1 29 | verbose = 1 30 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 31 | patience=50, verbose=verbose, mode='auto') 32 | callbacks = [early_stopping_callback] 33 | 34 | for stock in stocks_list: 35 | start_date = dates_dic[stock][0] 36 | end_date = dates_dic[stock][1] 37 | normalized_metrics,_, df = lstm.final_test_lstm(stock, start_date, end_date, window, future_gap, time_steps, 38 | neurons, drop_out, batch_size, epochs, validation_split, verbose, callbacks) 39 | metrics_dic[stock] = normalized_metrics 40 | plot_data(df, stock+" Price Forecast", "Date", "Price", show_plot=False) 41 | 42 | print(metrics_dic) 43 | plt.show() -------------------------------------------------------------------------------- /machine_learning/final/experiments/exp4.py: -------------------------------------------------------------------------------- 1 | from utils.util import plot_data 2 | from machine_learning.final.models import lstm 3 | from machine_learning.final.models import ffnn 4 | from machine_learning.final.models import lin_reg 5 | from keras.callbacks import EarlyStopping 6 | import matplotlib.pyplot as plt 7 | 8 | #future gap 9 | stock = 'MSFT' 10 | dates_dic = { 11 | 'MSFT' : ['2017-01-01', '2018-01-01'] 12 | } 13 | future_gap_list = [1, 2, 3, 4, 5] 14 | 15 | #LSTM 16 | window = 2 17 | future_gap = 1 18 | time_steps = 1 19 | neurons = [256, 256, 32, 1] 20 | drop_out = 0.2 21 | batch_size = 2048 22 | epochs = 300 23 | validation_split = 0.1 24 | verbose = 1 25 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 26 | patience=50, verbose=verbose, mode='auto') 27 | callbacks = [early_stopping_callback] 28 | 29 | lstm_future_gap_metrics = { 30 | 1 : [], 31 | 2 : [], 32 | 3 : [], 33 | 4 : [], 34 | 5 : [] 35 | } 36 | 37 | 38 | for future_gap in future_gap_list: 39 | start_date = dates_dic[stock][0] 40 | end_date = dates_dic[stock][1] 41 | normalized_metrics, _, df = lstm.final_test_lstm(stock, start_date, 42 | end_date, window, future_gap, time_steps, neurons, drop_out, batch_size, epochs, validation_split, 43 | verbose, callbacks) 44 | lstm_future_gap_metrics[future_gap] = normalized_metrics 45 | plot_data(df, 'Future Gap = '+str(future_gap), "Date", "Price", show_plot=False) 46 | 47 | 48 | #FFNN 49 | neurons = [256, 256, 64, 1] 50 | batch_size = 128 51 | epochs = 200 52 | 53 | ffnn_future_gap_metrics = { 54 | 1 : [], 55 | 2 : [], 56 | 3 : [], 57 | 4 : [], 58 | 5 : [] 59 | } 60 | 61 | for future_gap in future_gap_list: 62 | start_date = dates_dic[stock][0] 63 | end_date = dates_dic[stock][1] 64 | normalized_metrics, _, df = ffnn.final_test_ffnn(stock, start_date, 65 | end_date, window, future_gap, neurons, drop_out, batch_size, epochs, validation_split, 66 | verbose, callbacks) 67 | ffnn_future_gap_metrics[future_gap] = normalized_metrics 68 | plot_data(df, 'Future Gap = '+str(future_gap), "Date", "Price", show_plot=False) 69 | 70 | linreg_future_gap_metrics = { 71 | 1 : [], 72 | 2 : [], 73 | 3 : [], 74 | 4 : [], 75 | 5 : [] 76 | } 77 | 78 | for future_gap in future_gap_list: 79 | start_date = dates_dic[stock][0] 80 
| end_date = dates_dic[stock][1] 81 | normalized_metrics, _, df = lin_reg.final_test_linreg(stock, start_date, 82 | end_date, window, future_gap) 83 | linreg_future_gap_metrics[future_gap] = normalized_metrics 84 | plot_data(df, 'Future Gap = '+str(future_gap), "Date", "Price", show_plot=False) 85 | 86 | print(lstm_future_gap_metrics) 87 | print(ffnn_future_gap_metrics) 88 | print(linreg_future_gap_metrics) 89 | plt.show() -------------------------------------------------------------------------------- /machine_learning/final/experiments/exp5.py: -------------------------------------------------------------------------------- 1 | from utils.util import plot_data 2 | from machine_learning.final.evaluation.metrics import compute_lag_metric 3 | from machine_learning.final.models import lstm 4 | from machine_learning.final.models import ffnn 5 | from machine_learning.final.models import lin_reg 6 | from machine_learning.final.models import knn_reg 7 | from keras.callbacks import EarlyStopping 8 | import matplotlib.pyplot as plt 9 | 10 | #LSTM and LinReg PAL 11 | stock = 'AAPL' 12 | dates_dic = { 13 | 'AAPL' : ['2017-01-01', '2018-01-01'] 14 | } 15 | metrics_dic = { 16 | 'LSTM' : [], 17 | 'LinReg' : [] 18 | } 19 | 20 | window = 2 21 | future_gap = 1 22 | time_steps = 1 23 | neurons = [256, 256, 32, 1] 24 | drop_out = 0.2 25 | batch_size = 2048 26 | epochs = 300 27 | validation_split = 0.1 28 | verbose = 1 29 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 30 | patience=50, verbose=verbose, mode='auto') 31 | callbacks = [early_stopping_callback] 32 | start_date = dates_dic[stock][0] 33 | end_date = dates_dic[stock][1] 34 | 35 | #LSTM 36 | normalized_metrics, inv_normalized_metrics, df = lstm.final_test_lstm(stock, start_date, 37 | end_date, window, future_gap, time_steps, neurons, drop_out, batch_size, epochs, validation_split, 38 | verbose, callbacks) 39 | metrics_dic['LSTM'] = normalized_metrics 40 | #PAL 41 | lookup = 5 42 | lag_list = compute_lag_metric(df['Actual'], df['Prediction'], lookup, stock) 43 | df = df[:len(df)-lookup+1] 44 | #Price Forecast Plot 45 | plot_data(df, stock+" 2017 Price Forecast (LSTM)", "Date", "Price", show_plot=False) 46 | #Price Forecast and PAL Overlay Plot 47 | ax = df.plot(title=stock+" 2017 Price Forecast and PAL Overlay") 48 | ax.set_xlabel("Date") 49 | ax.set_ylabel("Price") 50 | ax.legend(loc="best") 51 | ax.grid(True) 52 | ax1 = ax.twinx() 53 | ax1.scatter(df.index, lag_list, c='g') 54 | ax1.set_ylabel("PAL") 55 | 56 | #LinReg 57 | normalized_metrics, inv_normalized_metrics, df = lin_reg.final_test_linreg(stock, start_date, 58 | end_date, window, future_gap) 59 | metrics_dic['LinReg'] = normalized_metrics 60 | #PAL 61 | lookup = 5 62 | lag_list = compute_lag_metric(df['Actual'], df['Prediction'], lookup, stock) 63 | df = df[:len(df)-lookup+1] 64 | #Price Forecast Plot 65 | plot_data(df, stock+" 2017 Price Forecast (LinReg)", "Date", "Price", show_plot=False) 66 | #Price Forecast and PAL Overlay Plot 67 | ax = df.plot(title=stock+" 2017 Price Forecast and PAL Overlay") 68 | ax.set_xlabel("Date") 69 | ax.set_ylabel("Price") 70 | ax.legend(loc="best") 71 | ax.grid(True) 72 | ax1 = ax.twinx() 73 | ax1.scatter(df.index, lag_list, c='g') 74 | ax1.set_ylabel("PAL") 75 | 76 | print(metrics_dic) 77 | plt.show() -------------------------------------------------------------------------------- /machine_learning/final/models/ffnn.py: -------------------------------------------------------------------------------- 1 | from 
machine_learning.final.utils.dataset import bulid_TIs_dataset, dataset_split 2 | from machine_learning.final.evaluation.metrics import evaluate 3 | from keras.models import Sequential 4 | from keras.layers.core import Dense, Dropout 5 | from keras.optimizers import Adam 6 | 7 | def build_model(features, neurons, drop_out, decay=0.0): 8 | model = Sequential() 9 | 10 | model.add(Dense(neurons[0], input_dim=features, activation='relu',)) 11 | model.add(Dropout(drop_out)) 12 | 13 | model.add(Dense(neurons[1], activation='relu')) 14 | model.add(Dropout(drop_out)) 15 | 16 | model.add(Dense(neurons[2], activation='relu')) 17 | model.add(Dense(neurons[3], activation='linear')) 18 | 19 | adam = Adam(decay=decay) 20 | model.compile(loss='mse',optimizer=adam) 21 | model.summary() 22 | return model 23 | 24 | def model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, verbose, callbacks): 25 | 26 | history = model.fit( 27 | X_train, 28 | Y_train, 29 | batch_size = batch_size, 30 | epochs = epochs, 31 | validation_split = validation_split, 32 | verbose = verbose, 33 | callbacks = callbacks 34 | ) 35 | 36 | return history 37 | 38 | def final_test_ffnn(stock_symbol, start_date, end_date, window, future_gap, neurons, 39 | drop_out, batch_size, epochs, validation_split, verbose, callbacks): 40 | #building the dataset 41 | print("> building the dataset...") 42 | df_train, _ = bulid_TIs_dataset(stock_symbol, None, start_date, window) 43 | df_test, scaler = bulid_TIs_dataset(stock_symbol, start_date, end_date, window) 44 | #reshaping the dataset for FFNN 45 | print("\n> reshaping the dataset for FFNN...") 46 | ds_train = df_train.values 47 | ds_test = df_test.values 48 | X_train, Y_train = dataset_split(ds_train, future_gap, None) 49 | X_test, Y_test = dataset_split(ds_test, future_gap, None) 50 | #building the FFNN model 51 | print("\n> building the FFNN model...") 52 | features = X_train.shape[1] 53 | model = build_model(features, neurons, drop_out) 54 | #fitting the training data 55 | print("\n> fitting the training data...") 56 | model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, verbose, callbacks) 57 | #predictions 58 | print("\n> testing the model for predictions...") 59 | predictions = model.predict(X_test) 60 | #inverse-scaling 61 | print("\n> inverse-scaling the scaled values...") 62 | predictions = predictions.reshape((predictions.shape[0], 1)) 63 | predictions_inv_scaled = scaler.inverse_transform(predictions) 64 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 65 | Y_test_inv_scaled = scaler.inverse_transform(Y_test) 66 | #evaluation 67 | normalized_metrics, inv_normalized_metrics = evaluate(Y_test, predictions, 68 | Y_test_inv_scaled, predictions_inv_scaled) 69 | #grouping the actual prices and predictions 70 | print("\n> grouping the actual prices and predictions...") 71 | feature_cols = df_test.columns.tolist() 72 | feature_cols.remove("actual_price") 73 | df_test.drop(columns=feature_cols, inplace=True) 74 | df_test.rename(columns={"actual_price" : 'Actual'}, inplace=True) 75 | df_test = df_test.iloc[future_gap:] 76 | df_test['Actual'] = Y_test_inv_scaled 77 | df_test['Prediction'] = predictions_inv_scaled 78 | 79 | return normalized_metrics, inv_normalized_metrics, df_test -------------------------------------------------------------------------------- /machine_learning/final/models/knn_reg.py: -------------------------------------------------------------------------------- 1 | from machine_learning.final.utils.dataset import bulid_TIs_dataset, 
dataset_split
2 | import machine_learning.final.models.knn_wrapper as knn
3 | from machine_learning.final.evaluation.metrics import evaluate
4 | 
5 | def final_test_knnreg(stock_symbol, start_date, end_date, window, future_gap, k):
6 |     #building the dataset
7 |     print("> building the dataset...")
8 |     df_train, _ = bulid_TIs_dataset(stock_symbol, None, start_date, window)
9 |     df_test, scaler = bulid_TIs_dataset(stock_symbol, start_date, end_date, window)
10 |     #reshaping the dataset for kNN
11 |     print("\n> reshaping the dataset for kNN...")
12 |     ds_train = df_train.values
13 |     ds_test = df_test.values
14 |     X_train, Y_train = dataset_split(ds_train, future_gap, None)
15 |     X_test, Y_test = dataset_split(ds_test, future_gap, None)
16 |     #kNN model
17 |     model = knn.knn(k)
18 |     #fitting the training data
19 |     model.train(X_train, Y_train)
20 |     #predictions
21 |     predictions = model.query(X_test, normalize=False, addDiff=False)
22 |     #inverse-scaling
23 |     print("\n> inverse-scaling the scaled values...")
24 |     predictions = predictions.reshape((predictions.shape[0], 1))
25 |     predictions_inv_scaled = scaler.inverse_transform(predictions)
26 |     Y_test = Y_test.reshape((Y_test.shape[0], 1))
27 |     Y_test_inv_scaled = scaler.inverse_transform(Y_test)
28 |     #evaluation
29 |     normalized_metrics, inv_normalized_metrics = evaluate(Y_test, predictions, 
30 |     Y_test_inv_scaled, predictions_inv_scaled)
31 |     #grouping the actual prices and predictions
32 |     print("\n> grouping the actual prices and predictions...")
33 |     feature_cols = df_test.columns.tolist()
34 |     feature_cols.remove("actual_price")
35 |     df_test.drop(columns=feature_cols, inplace=True)
36 |     df_test.rename(columns={"actual_price" : 'Actual'}, inplace=True)
37 |     df_test = df_test.iloc[future_gap:]
38 |     df_test['Actual'] = Y_test_inv_scaled
39 |     df_test['Prediction'] = predictions_inv_scaled
40 | 
41 |     return normalized_metrics, inv_normalized_metrics, df_test
--------------------------------------------------------------------------------
/machine_learning/final/models/knn_wrapper.py:
--------------------------------------------------------------------------------
1 | ''' this file contains an implementation of kNN regression
2 | '''
3 | import numpy as np
4 | 
5 | '''kNN wrapper class
6 | 
7 | *k : k nearest neighbors to be considered
8 | *dataset : training dataset including the features and the output
9 | '''
10 | class knn:
11 |     __k = 0
12 |     __dataset = None
13 | 
14 |     '''constructor function
15 | 
16 |     *k : k nearest neighbors to be considered
17 |     '''
18 |     def __init__(self, k):
19 |         self.__k = k
20 | 
21 |     '''training function
22 | 
23 |     *data_x : training dataset features
24 |     *data_y : training dataset output
25 |     '''
26 |     def train(self, data_x, data_y):
27 |         data_y_reshaped = data_y.reshape((data_y.shape[0], 1))
28 |         self.__dataset = np.concatenate((data_x, data_y_reshaped), axis=1)
29 | 
30 |     '''querying/evaluating function
31 | 
32 |     *features : test dataset features
33 |     '''
34 |     def query(self, features, normalize=True, addDiff=True):
35 |         dataset_price_normed = self.__dataset[:, 0]
36 |         features_price_normed = features[:, 0]
37 | 
38 |         if normalize:
39 |             dataset_price_normed = (self.__dataset[:, 0]/self.__dataset[0, 0]) - 1
40 |             features_price_normed = (features[:, 0]/features[0, 0]) - 1
41 | 
42 |         cumm_difference = np.zeros(features.shape[0])
43 |         predicted_price = np.zeros(features.shape[0])
44 | 
45 |         for i in range(0, features.shape[0]):
46 | 
47 |             price_normed_diff = np.absolute(dataset_price_normed - features_price_normed[i])
48 |             moment_diff = np.absolute(self.__dataset[:, 1] - features[i, 1])
49 |             sma_diff = np.absolute(self.__dataset[:, 2] - features[i, 2])
50 |             b_band_diff = np.absolute(self.__dataset[:, 3] - features[i, 3])
51 |             std_diff = np.absolute(self.__dataset[:, 4] - features[i, 4])
52 |             vroc_diff = np.absolute(self.__dataset[:, 5] - features[i, 5])
53 | 
54 |             cumm_difference = price_normed_diff + moment_diff + sma_diff + b_band_diff + std_diff + vroc_diff
55 |             difference_op = np.asarray([cumm_difference, self.__dataset[:, -1]]).T
56 |             sorting_index = np.argsort(difference_op[:, 0])
57 |             difference_sorted = difference_op[sorting_index]
58 | 
59 |             k_mean = np.mean(difference_sorted[:self.__k, 1])
60 |             predicted_price[i] = k_mean
61 | 
62 |         if addDiff:
63 |             predicted_price += (features[0, 0] - self.__dataset[0, 0])
64 |         return predicted_price
--------------------------------------------------------------------------------
/machine_learning/final/models/lin_reg.py:
--------------------------------------------------------------------------------
1 | from machine_learning.final.utils.dataset import bulid_TIs_dataset
2 | from machine_learning.final.evaluation.metrics import evaluate
3 | import numpy as np
4 | import scipy.optimize as spo
5 | 
6 | '''computes and returns the root mean squared error
7 | 
8 | *x : a dynamic variable: (value, array, ...)
9 | *y : a dynamic variable: (value, array, ...)
10 | '''
11 | def calculate_rmse(x, y):
12 |     #squared error
13 |     se = (x-y) ** 2
14 |     #mean squared error
15 |     mse = np.mean(se)
16 |     #root mean squared error
17 |     rmse = mse ** 0.5
18 |     return rmse
19 | 
20 | '''given the fitted line coefficients and the dataset, this
21 | function computes the rmse between the actual values and
22 | the predicted values of the linear regression
23 | 
24 | *coefficients : fitted line coefficients array
25 | *data : dataset containing the features and the output
26 | '''
27 | def error_fun(coefficients, data):
28 |     price = coefficients[0]*data[:, 0]
29 |     moment = coefficients[1]*data[:, 1]
30 |     sma = coefficients[2]*data[:, 2]
31 |     b_band = coefficients[3]*data[:, 3]
32 |     std = coefficients[4]*data[:, 4]
33 |     vroc = coefficients[5]*data[:, 5]
34 |     constant = coefficients[6]
35 |     predicted_values = price+moment+sma+b_band+std+vroc+constant
36 |     actual_values = data[:, -1]
37 |     rmse = calculate_rmse(predicted_values, actual_values)
38 |     return rmse
39 | 
40 | '''given the data to be passed to the error fcn, this function
41 | computes an initial guess of the coefficients and uses SciPy's
42 | minimize fcn and the error fcn to find the optimal coefficients
43 | 
44 | *data : dataset containing the features and the output
45 | *err_fun : error function to be minimized by SciPy's minimizer
46 | '''
47 | def minimize_new_err_fun(data, err_fun):
48 |     price = np.mean(data[:, 0])
49 |     moment = np.mean(data[:, 1])
50 |     sma = np.mean(data[:, 2])
51 |     b_band = np.mean(data[:, 3])
52 |     std = np.mean(data[:, 4])
53 |     vroc = np.mean(data[:, 5])
54 |     constant = 0
55 |     coefficients_guess = [price, moment, sma, b_band, std, vroc, constant]
56 |     result = spo.minimize(err_fun, coefficients_guess, args=(data, ), method="SLSQP", options= {'disp' : True})
57 |     return result.x
58 | 
59 | def dataset_reshape(dataset, future_gap, split):
60 |     print("Dataset Shape:", dataset.shape)
61 |     X = dataset[:, :-1]
62 |     Y = dataset[:, -1]
63 |     print("X Shape:", X.shape)
64 |     print("Y Shape:", Y.shape)
65 | 
66 |     print("Applying Future Gap...")
67 |     X = X[:-future_gap]
68 |     Y = Y[future_gap:]
69 |     print("X Shape:", X.shape)
70 |     print("Y Shape:", Y.shape)
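    # alignment example: with future_gap = 5 (one trading week), feature row
    # X[t] is paired with target Y[t+5], so the indicators observed on day t
    # are used to predict the (scaled) adjusted closing price 5 trading days later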
71 | 72 | if split != None: 73 | print("Applying training, testing split...") 74 | split_index = int(split*X.shape[0]) 75 | X_train = X[:split_index] 76 | X_test = X[split_index:] 77 | Y_train = Y[:split_index] 78 | Y_test = Y[split_index:] 79 | print("(X_train, Y_train, X_test, Y_test) Shapes:") 80 | print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape) 81 | return X_train, Y_train, X_test, Y_test 82 | 83 | return X, Y 84 | 85 | def final_test_linreg(stock_symbol, start_date, end_date, window, future_gap): 86 | #building the dataset 87 | print("> building the dataset...") 88 | df_train, _ = bulid_TIs_dataset(stock_symbol, None, start_date, window) 89 | df_test, scaler = bulid_TIs_dataset(stock_symbol, start_date, end_date, window) 90 | #reshaping the dataset for LinReg 91 | print("\n> reshaping the dataset for LinReg...") 92 | ds_train = df_train.values 93 | ds_test = df_test.values 94 | X_train, Y_train = dataset_reshape(ds_train, future_gap, None) 95 | X_test, Y_test = dataset_reshape(ds_test, future_gap, None) 96 | #fitting the training data 97 | print("\n> fitting the training data...") 98 | Y_train = Y_train.reshape((Y_train.shape[0], 1)) 99 | training_set = np.concatenate((X_train, Y_train), axis=1) 100 | fitted_line_coefficients = minimize_new_err_fun(training_set, error_fun) 101 | print("Line Coefficients:", fitted_line_coefficients) 102 | #predictions 103 | price = fitted_line_coefficients[0]*X_test[:, 0] 104 | moment = fitted_line_coefficients[1]*X_test[:, 1] 105 | sma = fitted_line_coefficients[2]*X_test[:, 2] 106 | b_band = fitted_line_coefficients[3]*X_test[:, 3] 107 | std = fitted_line_coefficients[4]*X_test[:, 4] 108 | vroc = fitted_line_coefficients[5]*X_test[:, 5] 109 | constant = fitted_line_coefficients[6] 110 | predictions = price+moment+sma+b_band+std+vroc+constant 111 | #inverse-scaling 112 | print("\n> inverse-scaling the scaled values...") 113 | predictions = predictions.reshape((predictions.shape[0], 1)) 114 | predictions_inv_scaled = scaler.inverse_transform(predictions) 115 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 116 | Y_test_inv_scaled = scaler.inverse_transform(Y_test) 117 | #evaluation 118 | normalized_metrics, inv_normalized_metrics = evaluate(Y_test, predictions, 119 | Y_test_inv_scaled, predictions_inv_scaled) 120 | #grouping the actual prices and predictions 121 | print("\n> grouping the actual prices and predictions...") 122 | feature_cols = df_test.columns.tolist() 123 | feature_cols.remove("actual_price") 124 | df_test.drop(columns=feature_cols, inplace=True) 125 | df_test.rename(columns={"actual_price" : 'Actual'}, inplace=True) 126 | df_test = df_test.iloc[future_gap:] 127 | df_test['Actual'] = Y_test_inv_scaled 128 | df_test['Prediction'] = predictions_inv_scaled 129 | 130 | return normalized_metrics, inv_normalized_metrics, df_test -------------------------------------------------------------------------------- /machine_learning/final/models/lstm.py: -------------------------------------------------------------------------------- 1 | from machine_learning.final.utils.dataset import bulid_TIs_dataset 2 | from machine_learning.final.evaluation.metrics import evaluate 3 | import numpy as np 4 | from keras.models import Sequential 5 | from keras.layers.core import Dense, Dropout 6 | from keras.layers.recurrent import LSTM 7 | from keras.optimizers import Adam 8 | 9 | def lstm_dataset_reshape(dataset, time_steps, future_gap, split): 10 | print("Dataset Shape:", dataset.shape) 11 | X = dataset[:, :-1] 12 | Y = dataset[:, -1] 13 | 
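    # the last column of the dataset is the target (actual_price); the other
    # columns are the feature vector. The sampling below stacks time_steps
    # consecutive feature rows per sample, so with time_steps=1 and
    # future_gap=1 the features of day t predict the price of day t+1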
print("X Shape:", X.shape) 14 | print("Y Shape:", Y.shape) 15 | 16 | X_sampled = [] 17 | for i in range(X.shape[0] - time_steps + 1): 18 | X_sampled.append(X[i : i+time_steps]) 19 | X_sampled = np.array(X_sampled) 20 | print("Sampled X Shape:", X_sampled.shape) 21 | 22 | future_gap_index = future_gap - 1 23 | X_sampled = X_sampled[:-future_gap] 24 | Y_sampled = Y[time_steps+future_gap_index: ] 25 | print("Applying Future Gap...") 26 | print("Sampled X Shape:", X_sampled.shape) 27 | print("Sampled Y Shape:", Y_sampled.shape) 28 | 29 | if split != None: 30 | split_index = int(split*X_sampled.shape[0]) 31 | X_train = X_sampled[:split_index] 32 | X_test = X_sampled[split_index:] 33 | Y_train = Y_sampled[:split_index] 34 | Y_test = Y_sampled[split_index:] 35 | print("(X_train, Y_train, X_test, Y_test) Shapes:") 36 | print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape) 37 | return X_train, Y_train, X_test, Y_test 38 | 39 | return X_sampled, Y_sampled 40 | 41 | def build_model(time_steps, features, neurons, drop_out, decay=0.0): 42 | model = Sequential() 43 | 44 | model.add(LSTM(neurons[0], input_shape=(time_steps, features), return_sequences=True)) 45 | model.add(Dropout(drop_out)) 46 | 47 | model.add(LSTM(neurons[1], input_shape=(time_steps, features), return_sequences=False)) 48 | model.add(Dropout(drop_out)) 49 | 50 | model.add(Dense(neurons[2], activation='relu')) 51 | model.add(Dense(neurons[3], activation='linear')) 52 | 53 | adam = Adam(decay=decay) 54 | model.compile(loss='mse',optimizer=adam) 55 | model.summary() 56 | return model 57 | 58 | def model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, verbose, callbacks): 59 | 60 | history = model.fit( 61 | X_train, 62 | Y_train, 63 | batch_size = batch_size, 64 | epochs = epochs, 65 | validation_split = validation_split, 66 | verbose = verbose, 67 | callbacks = callbacks 68 | ) 69 | 70 | return history 71 | 72 | def final_test_lstm(stock_symbol, start_date, end_date, window, future_gap, time_steps, 73 | neurons, drop_out, batch_size, epochs, validation_split, verbose, callbacks): 74 | #building the dataset 75 | print("> building the dataset...") 76 | df_train, _ = bulid_TIs_dataset(stock_symbol, None, start_date, window) 77 | df_test, scaler = bulid_TIs_dataset(stock_symbol, start_date, end_date, window) 78 | #reshaping the dataset for LSTM 79 | print("\n> reshaping the dataset for LSTM...") 80 | ds_train = df_train.values 81 | ds_test = df_test.values 82 | X_train, Y_train = lstm_dataset_reshape(ds_train, time_steps, future_gap, None) 83 | X_test, Y_test = lstm_dataset_reshape(ds_test, time_steps, future_gap, None) 84 | #building the LSTM model 85 | print("\n> building the LSTM model...") 86 | features = X_train.shape[2] 87 | model = build_model(time_steps, features, neurons, drop_out) 88 | #fitting the training data 89 | print("\n> fitting the training data...") 90 | model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, verbose, callbacks) 91 | #predictions 92 | print("\n> testing the model for predictions...") 93 | predictions = model.predict(X_test) 94 | #inverse-scaling 95 | print("\n> inverse-scaling the scaled values...") 96 | predictions = predictions.reshape((predictions.shape[0], 1)) 97 | predictions_inv_scaled = scaler.inverse_transform(predictions) 98 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 99 | Y_test_inv_scaled = scaler.inverse_transform(Y_test) 100 | #evaluation 101 | normalized_metrics, inv_normalized_metrics = evaluate(Y_test, predictions, 102 | Y_test_inv_scaled, 
predictions_inv_scaled) 103 | #grouping the actual prices and predictions 104 | print("\n> grouping the actual prices and predictions...") 105 | feature_cols = df_test.columns.tolist() 106 | feature_cols.remove("actual_price") 107 | df_test.drop(columns=feature_cols, inplace=True) 108 | df_test.rename(columns={"actual_price" : 'Actual'}, inplace=True) 109 | df_test = df_test.iloc[time_steps+future_gap-1:] 110 | df_test['Actual'] = Y_test_inv_scaled 111 | df_test['Prediction'] = predictions_inv_scaled 112 | 113 | return normalized_metrics, inv_normalized_metrics, df_test -------------------------------------------------------------------------------- /machine_learning/final/utils/dataset.py: -------------------------------------------------------------------------------- 1 | from utils.util import get_stock_data 2 | import numpy as np 3 | from sklearn.preprocessing import MinMaxScaler 4 | 5 | '''technical indicators computation functions 6 | 7 | *prices : adjusted closing stock prices 8 | *window : rolling statistics window 9 | ''' 10 | #BEGIN 11 | def compute_momentum_ratio(prices, window): 12 | #first window elements >> NA 13 | momentum_ratio = (prices/prices.shift(periods = 1)) - 1 14 | return momentum_ratio 15 | 16 | def compute_sma_ratio(prices, window): 17 | #Simple Moving Average 18 | #first window-1 elements >> NA 19 | sma_ratio = (prices / prices.rolling(window = window).mean()) - 1 20 | return sma_ratio 21 | 22 | def compute_bollinger_bands_ratio(prices, window): 23 | #first window-1 elements >> NA 24 | bb_ratio = prices - prices.rolling(window = window).mean() 25 | bb_ratio = bb_ratio / (2 * prices.rolling(window = window).std()) 26 | return bb_ratio 27 | 28 | def compute_volatility_ratio(prices, window): 29 | #first window-1 elements >> NA 30 | volatility_ratio = ((prices/prices.shift(periods = 1)) - 1).rolling(window = window).std() 31 | return volatility_ratio 32 | 33 | def compute_vroc_ratio(volume, window): 34 | #Volume Rate of Change 35 | #first window-1 elements >> NA 36 | vroc_ratio = (volume/volume.shift(periods = window)) - 1 37 | return vroc_ratio 38 | #END 39 | 40 | def bulid_TIs_dataset(stock_symbol, start_date, end_date, window, normalize=True): 41 | cols = ["Date", "Adj Close", "Volume"] 42 | df = get_stock_data(stock_symbol, start_date, end_date, cols) 43 | df.rename(columns={"Adj Close" : 'price'}, inplace=True) 44 | df['momentum'] = compute_momentum_ratio(df['price'], window) 45 | df['sma'] = compute_sma_ratio(df['price'], window) 46 | df['bolinger_band'] = compute_bollinger_bands_ratio(df['price'], window) 47 | df['volatility'] = compute_volatility_ratio(df['price'], window) 48 | df['vroc'] = compute_vroc_ratio(df['Volume'], window) 49 | df['actual_price'] = df['price'] 50 | df.drop(columns=["Volume"], inplace=True) 51 | df = df[window:] 52 | df.replace([np.inf, -np.inf], np.nan, inplace=True) 53 | df.fillna(method='ffill', inplace=True) 54 | df.fillna(method='bfill', inplace=True) 55 | scaler = None 56 | 57 | if normalize: 58 | scaler = MinMaxScaler() 59 | df['price'] = scaler.fit_transform(df['price'].values.reshape(-1,1)) 60 | df['momentum'] = scaler.fit_transform(df['momentum'].values.reshape(-1,1)) 61 | df['sma'] = scaler.fit_transform(df['sma'].values.reshape(-1,1)) 62 | df['bolinger_band'] = scaler.fit_transform(df['bolinger_band'].values.reshape(-1,1)) 63 | df['volatility'] = scaler.fit_transform(df['volatility'].values.reshape(-1,1)) 64 | df['vroc'] = scaler.fit_transform(df['vroc'].values.reshape(-1,1)) 65 | df['actual_price'] = 
scaler.fit_transform(df['actual_price'].values.reshape(-1,1))
66 | 
67 |     print(df.head())
68 |     print(df.tail())
69 |     return df, scaler
70 | 
71 | def dataset_split(dataset, future_gap, split):
72 |     print("Dataset Shape:", dataset.shape)
73 |     X = dataset[:, :-1]
74 |     Y = dataset[:, -1]
75 |     print("X Shape:", X.shape)
76 |     print("Y Shape:", Y.shape)
77 | 
78 |     print("Applying Future Gap...")
79 |     X = X[:-future_gap]
80 |     Y = Y[future_gap:]
81 |     print("X Shape:", X.shape)
82 |     print("Y Shape:", Y.shape)
83 | 
84 |     if split != None:
85 |         print("Applying training, testing split...")
86 |         split_index = int(split*X.shape[0])
87 |         X_train = X[:split_index]
88 |         X_test = X[split_index:]
89 |         Y_train = Y[:split_index]
90 |         Y_test = Y[split_index:]
91 |         print("(X_train, Y_train, X_test, Y_test) Shapes:")
92 |         print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
93 |         return X_train, Y_train, X_test, Y_test
94 | 
95 |     return X, Y
--------------------------------------------------------------------------------
/machine_learning/readme.md:
--------------------------------------------------------------------------------
1 | This directory contains projects applying different machine learning algorithms on historical data of different companies to predict stock prices. The development directory is where the models were developed. It includes initial try-outs, improvements, and multiple
2 | changes. The final directory includes the finalized versions of the models, the evaluation metrics, and the final experiments.
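For orientation, a minimal sketch of driving one finalized model end to end, mirroring `machine_learning/final/experiments/exp1.py` (it assumes the repository root is on `PYTHONPATH` and the historical CSVs are in `resources/historical_data`):

```python
from machine_learning.final.models import lstm
from keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=50, mode='auto')
normalized_metrics, inv_normalized_metrics, df = lstm.final_test_lstm(
    'AAPL', '2017-01-01', '2018-01-01',    # symbol, test start, test end
    window=2, future_gap=1, time_steps=1,  # dataset/reshaping parameters
    neurons=[256, 256, 32, 1], drop_out=0.2,
    batch_size=2048, epochs=300, validation_split=0.1,
    verbose=1, callbacks=[early_stop])
print(normalized_metrics)  # [RMSE, NRMSE, MAE, MAPE, Corr, R^2]
```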
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/optimized_ffnn.png -------------------------------------------------------------------------------- /results/optimized_lstm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/optimized_lstm.png -------------------------------------------------------------------------------- /results/stable.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/stable.png -------------------------------------------------------------------------------- /results/stable_lag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/stable_lag.png -------------------------------------------------------------------------------- /results/sudden_vs_normal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/sudden_vs_normal.png -------------------------------------------------------------------------------- /results/sudden_vs_normal_daily_lag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/sudden_vs_normal_daily_lag.png -------------------------------------------------------------------------------- /results/sudden_vs_normal_lag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/sudden_vs_normal_lag.png -------------------------------------------------------------------------------- /results/sudden_vs_normal_pal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/sudden_vs_normal_pal.png -------------------------------------------------------------------------------- /results/sudden_vs_normal_pal_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/sudden_vs_normal_pal_1.png -------------------------------------------------------------------------------- /results/tesla.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/tesla.png -------------------------------------------------------------------------------- /results/volatile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/volatile.png -------------------------------------------------------------------------------- /results/volatile_lag.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/volatile_lag.png
--------------------------------------------------------------------------------
/results/window_test_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/window_test_1.png
--------------------------------------------------------------------------------
/results/window_test_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/window_test_2.png
--------------------------------------------------------------------------------
/statistics_and_optimization/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/statistics_and_optimization/__init__.py
--------------------------------------------------------------------------------
/statistics_and_optimization/bollinger_bands.py:
--------------------------------------------------------------------------------
1 | ''' this file calculates and visualizes a company's stock
2 | price Bollinger Bands, which can serve as a trading strategy
3 | '''
4 | from utils.util import get_data, plot_data
5 | 
6 | '''given the rolling mean and std, calculate the upper and
7 | lower Bollinger Bands (two standard deviations from the mean)
8 | 
9 | *mean : the rolling mean of a stock price
10 | *std : the rolling standard deviation of a stock price
11 | '''
12 | def get_bollinger_bands(mean, std):
13 |     upper_band = mean + (2*std)
14 |     lower_band = mean - (2*std)
15 |     return upper_band, lower_band
16 | 
17 | '''a tester function
18 | '''
19 | def main():
20 |     start_date = "01/01/2017"
21 |     end_date = "31/12/2017"
22 |     symbols = ["FB"]
23 |     stock_symbol = "FB"
24 |     df = get_data(symbols, start_date, end_date, include_SPY=False)
25 |     print(df.head())
26 |     print(df.tail())
27 | 
28 |     window = 20
29 |     rolling_mean = df[stock_symbol].rolling(window=window).mean()
30 |     rolling_std = df[stock_symbol].rolling(window=window).std()
31 |     df["Rolling Mean"] = rolling_mean
32 |     df["Upper Bollinger Band"], df["Lower Bollinger Band"] = get_bollinger_bands(rolling_mean, rolling_std)
33 |     plot_data(df, stock_symbol+" Bollinger Bands", "Date", "Price")
34 | 
35 | '''to ensure running the tester function only when this file is run, not imported
36 | '''
37 | if __name__ == "__main__":
38 |     main()
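39 | 
40 | '''a minimal strategy sketch (an assumption, not part of the original file):
41 | a classic Bollinger Band signal sells when the price crosses back below the
42 | upper band and buys when it crosses back above the lower band
43 | 
44 | *price : the stock price series
45 | *upper_band, lower_band : the band series returned by get_bollinger_bands
46 | '''
47 | def get_bollinger_signals(price, upper_band, lower_band):
48 |     sell = (price.shift(1) > upper_band.shift(1)) & (price <= upper_band)
49 |     buy = (price.shift(1) < lower_band.shift(1)) & (price >= lower_band)
50 |     return buy, sell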
--------------------------------------------------------------------------------
/statistics_and_optimization/portfolio_optimization.py:
--------------------------------------------------------------------------------
1 | ''' this file finds the optimal portfolio allocation to maximize a
2 | chosen portfolio statistic
3 | '''
4 | from utils.util import get_data, plot_data
5 | from statistics_and_optimization.portfolio_statistics import compute_daily_portfolio_value, compute_portfolio_statistics
6 | import pandas as pd
7 | import numpy as np
8 | import scipy.optimize as spo
9 | 
10 | '''this function returns a portfolio statistic to be maximized;
11 | the value is multiplied by negative one because it will be
12 | passed to a minimizer in the compute_optimal_allocations function
13 | 
14 | *allocations : given allocations to a portfolio
15 | *df_portfolio : the portfolio dataframe
16 | '''
17 | def portfolio_statistic(allocations, df_portfolio):
18 |     #Daily Portfolio Value
19 |     daily_portfolio_value = compute_daily_portfolio_value(df_portfolio, 1, allocations)
20 | 
21 |     #Portfolio Statistics
22 |     cummulative_portfolio_return, _, _, _ = compute_portfolio_statistics(daily_portfolio_value)
23 | 
24 |     return -1*cummulative_portfolio_return
25 | 
26 | '''this function uses SciPy's minimizer with the portfolio_statistic
27 | function to minimize the negated portfolio statistic, thus maximizing it;
28 | it returns the optimal allocations for maximizing the statistic
29 | 
30 | *dataframe : the portfolio dataframe
31 | '''
32 | def compute_optimal_allocations(dataframe):
33 |     guess = 1.0/dataframe.shape[1]
34 |     allocations_guess = [guess] * dataframe.shape[1]
35 |     bounds = [[0, 1]] * dataframe.shape[1]
36 |     constraints = {
37 |         'type':'eq',
38 |         'fun': lambda allocations_guess : 1.0 - np.sum(allocations_guess) #equality constraint: allocations must sum to 1
39 |     }
40 |     minimum = spo.minimize(portfolio_statistic, allocations_guess, args=(dataframe,),
41 |                            method="SLSQP", bounds=bounds, constraints=constraints,
42 |                            options={'disp':True})
43 |     return minimum.x
44 | 
45 | '''a tester function
46 | '''
47 | def main():
48 |     symbols = ["AAPL", "FB", "GOOG", "SPY"]
49 |     start_date = "01/01/2017"
50 |     end_date = "31/12/2017"
51 | 
52 |     #Portfolio and SPY Dataframes
53 |     df_portfolio = get_data(symbols, start_date, end_date)
54 |     df_SPY = df_portfolio.loc[:, "SPY"]
55 |     df_SPY = df_SPY/df_SPY.iloc[0]
56 | 
57 |     #Optimized Allocations
58 |     optimized_allocations = compute_optimal_allocations(df_portfolio)
59 |     optimized_portfolio = compute_daily_portfolio_value(df_portfolio, 100000, optimized_allocations)
60 |     optimized_portfolio = optimized_portfolio/optimized_portfolio.iloc[0]
61 | 
62 |     #Default Allocations
63 |     default_allocations = [0.25, 0.25, 0.25, 0.25]
64 |     default_portfolio = compute_daily_portfolio_value(df_portfolio, 100000, default_allocations)
65 |     default_portfolio = default_portfolio/default_portfolio.iloc[0]
66 | 
67 |     df_comparison = pd.concat([optimized_portfolio, default_portfolio, df_SPY],
68 |                               keys=["Optimized Portfolio", "Default Portfolio", "S&P500"], axis=1)
69 | 
70 |     plot_data(df_comparison, "Portfolio Optimization", "Date", "Price")
71 | 
72 | '''to ensure running the tester function only when this file is run, not imported
73 | '''
74 | if __name__ == "__main__":
75 |     main()
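76 | 
77 | '''a minimal sketch (an assumption, not part of the original file): the same
78 | minimizer can maximize the Sharpe ratio instead of the cumulative return by
79 | passing this function to spo.minimize in place of portfolio_statistic
80 | 
81 | *allocations : given allocations to a portfolio
82 | *df_portfolio : the portfolio dataframe
83 | '''
84 | def sharpe_statistic(allocations, df_portfolio):
85 |     daily_portfolio_value = compute_daily_portfolio_value(df_portfolio, 1, allocations)
86 |     _, _, _, daily_sampled_sharpe_ratio = compute_portfolio_statistics(daily_portfolio_value)
87 |     return -1*daily_sampled_sharpe_ratio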
--------------------------------------------------------------------------------
/statistics_and_optimization/portfolio_statistics.py:
--------------------------------------------------------------------------------
1 | ''' this file constructs a portfolio and computes some
2 | portfolio statistics
3 | '''
4 | import pandas as pd
5 | from utils.util import get_data, plot_data
6 | 
7 | '''this helper function computes the daily value
8 | of a portfolio
9 | 
10 | *df : dataframe containing the stocks to be included
11 | *capital : starting portfolio capital
12 | *allocations : allocations to the chosen stocks
13 | '''
14 | def compute_daily_portfolio_value(df, capital, allocations):
15 |     #normalization
16 |     normalized = df/df.iloc[0, :]
17 |     #allocation
18 |     allocated = normalized*allocations
19 |     #capital/position value
20 |     pos_val = allocated*capital
21 |     #portfolio value
22 |     port_val = pos_val.sum(axis=1)
23 |     return port_val
24 | 
25 | '''five helper functions, each computing and returning a portfolio statistic
26 | '''
27 | #BEGIN
28 | def compute_daily_portfolio_return(daily_portfolio_value):
29 |     return daily_portfolio_value[1:] / daily_portfolio_value[:-1].values - 1
30 | 
31 | def compute_cummulative_portfolio_return(daily_portfolio_value):
32 |     return daily_portfolio_value.iloc[-1] / daily_portfolio_value.iloc[0] - 1
33 | 
34 | def compute_mean_daily_portfolio_return(daily_portfolio_return):
35 |     return daily_portfolio_return.mean()
36 | 
37 | def compute_std_daily_portfolio_return(daily_portfolio_return):
38 |     return daily_portfolio_return.std()
39 | 
40 | def compute_daily_sampled_sharpe_ratio(mean_daily_portfolio_return, std_daily_portfolio_return):
41 |     return (252**0.5) * mean_daily_portfolio_return/std_daily_portfolio_return #sqrt(252) annualizes the daily-sampled ratio (252 trading days per year)
42 | #END
43 | 
44 | '''this helper function wraps all the helper functions that compute
45 | the daily statistics of a portfolio and returns all the statistics
46 | 
47 | *daily_portfolio_value : dataframe containing the daily values of a portfolio
48 | '''
49 | def compute_portfolio_statistics(daily_portfolio_value):
50 |     daily_portfolio_return = compute_daily_portfolio_return(daily_portfolio_value)
51 |     cummulative_portfolio_return = compute_cummulative_portfolio_return(daily_portfolio_value)
52 |     mean_daily_portfolio_return = compute_mean_daily_portfolio_return(daily_portfolio_return)
53 |     std_daily_portfolio_return = compute_std_daily_portfolio_return(daily_portfolio_return)
54 |     daily_sampled_sharpe_ratio = compute_daily_sampled_sharpe_ratio(mean_daily_portfolio_return, std_daily_portfolio_return)
55 | 
56 |     return cummulative_portfolio_return, mean_daily_portfolio_return, std_daily_portfolio_return, daily_sampled_sharpe_ratio
57 | 
58 | '''a tester function
59 | '''
60 | def main():
61 |     capital = 100000
62 |     symbols = ["AAPL", "FB", "GOOG", "SPY"]
63 |     allocations = [0.25, 0.25, 0.25, 0.25]
64 |     start_date = "01/01/2017"
65 |     end_date = "31/12/2017"
66 | 
67 |     #Portfolio Dataframe
68 |     df_portfolio = get_data(symbols, start_date, end_date)
69 |     df_SPY = df_portfolio.loc[:, "SPY"]
70 | 
71 |     #Daily Portfolio Value
72 |     daily_portfolio_value = compute_daily_portfolio_value(df_portfolio, capital, allocations)
73 |     print(daily_portfolio_value.head())
74 | 
75 |     #Daily Portfolio Return
76 |     daily_portfolio_return = compute_daily_portfolio_return(daily_portfolio_value)
77 | 
78 |     #Cumulative Portfolio Return
79 |     cummulative_portfolio_return = compute_cummulative_portfolio_return(daily_portfolio_value)
80 |     print("Cumulative Portfolio Return:", cummulative_portfolio_return)
81 | 
82 |     #Daily Portfolio Return Mean
83 |     mean_daily_portfolio_return = compute_mean_daily_portfolio_return(daily_portfolio_return)
84 |     print("Daily Portfolio Return Mean:", mean_daily_portfolio_return)
85 | 
86 |     #Daily Portfolio Return Standard Deviation
87 |     std_daily_portfolio_return = compute_std_daily_portfolio_return(daily_portfolio_return)
88 |     print("Daily Portfolio Return Standard Deviation:", std_daily_portfolio_return)
89 | 
90 |     #Daily Sampled Sharpe Ratio
91 |     daily_sampled_sharpe_ratio = compute_daily_sampled_sharpe_ratio(mean_daily_portfolio_return, std_daily_portfolio_return)
92 |     print("Daily Sampled Sharpe Ratio:", daily_sampled_sharpe_ratio)
93 | 
94 |     #Comparing the portfolio and the S&P500
95 |     daily_portfolio_value_normalized = daily_portfolio_value/daily_portfolio_value.iloc[0]
96 |     df_SPY_normalized = df_SPY/df_SPY.iloc[0]
97 |     df_comparison = pd.concat([daily_portfolio_value_normalized, df_SPY_normalized], keys=["Portfolio", "SPY"], axis=1)
98 |     plot_data(df_comparison, "Portfolio 2017 Normalized Price", "Date", "Price")
99 | 
100 | '''to ensure running the tester function only when this file is run, not imported
101 | '''
102 | if __name__ == "__main__":
103 |     main()
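104 | 
105 | '''a minimal sketch (an assumption, not part of the original file): the Sharpe
106 | ratio above implicitly assumes a zero risk-free rate; with a nonzero daily
107 | risk-free rate, subtract it from the daily returns before annualizing
108 | 
109 | *daily_portfolio_return : the daily returns of a portfolio
110 | *daily_risk_free : the daily risk-free rate of return
111 | '''
112 | def compute_sharpe_ratio_with_risk_free(daily_portfolio_return, daily_risk_free=0.0):
113 |     excess_return = daily_portfolio_return - daily_risk_free
114 |     return (252**0.5) * excess_return.mean()/excess_return.std()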
--------------------------------------------------------------------------------
/statistics_and_optimization/readme.md:
--------------------------------------------------------------------------------
1 | This directory contains files that compute statistics and visualize data.
2 | It also includes files for portfolio management, statistics, and optimization.
--------------------------------------------------------------------------------
/utils/README.md:
--------------------------------------------------------------------------------
1 | This directory contains utility files with the basic, essential functions used throughout the project.
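2 | 
3 | A minimal usage sketch (an assumption, not part of the original file); the symbols below correspond to CSVs in resources/historical_data:
4 | 
5 | ```python
6 | from utils.util import get_data, plot_data
7 | 
8 | #adjusted closing prices for the chosen symbols over 2017 (SPY is included by default)
9 | df = get_data(["GOOG", "AAPL"], "01/01/2017", "31/12/2017")
10 | plot_data(df, "Selected Stock Prices", "Date", "Price")
11 | ```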
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/utils/__init__.py
--------------------------------------------------------------------------------
/utils/util.py:
--------------------------------------------------------------------------------
1 | ''' this file contains utility functions, used in most files
2 | of this project, to read and plot historical
3 | stock data
4 | '''
5 | import os
6 | import pandas as pd
7 | import matplotlib.pyplot as plt
8 | 
9 | '''this helper function builds the path to the file containing
10 | a stock's historical data
11 | 
12 | *symbol : stock symbol
13 | *depth : directory depth from the root
14 | '''
15 | def symbol_to_path(symbol, depth=1):
16 |     base = os.path.dirname(__file__)
17 | 
18 |     while depth > 0:
19 |         base = os.path.dirname(base)
20 |         depth -= 1
21 | 
22 |     path = os.path.join(base, "resources", "historical_data", "{}.csv".format(symbol))
23 |     return path
24 | 
25 | '''this function creates a dataframe of chosen stocks with
26 | dates as the index and the adjusted closing price of each
27 | stock as the columns
28 | 
29 | *symbols : list of stock symbols
30 | *start_date : start date of the dataframe's date index
31 | *end_date : end date of the dataframe's date index
32 | *include_SPY : boolean to indicate whether to include the
33 | S&P500 index stock
34 | '''
35 | def get_data(symbols, start_date, end_date, include_SPY=True):
36 |     if include_SPY and "SPY" not in symbols:
37 |         symbols.insert(0, "SPY")
38 | 
39 |     dates_index = pd.date_range(start=start_date, end=end_date)
40 |     df = pd.DataFrame(index=dates_index)
41 | 
42 |     for symbol in symbols:
43 |         df_temp = pd.read_csv(symbol_to_path(symbol), index_col="Date",
44 |                               parse_dates=True, usecols=["Date", "Adj Close"],
45 |                               na_values="nan")
46 |         df_temp = df_temp.rename(columns={"Adj Close" : symbol})
47 |         df = df.join(df_temp, how="inner")
48 | 
49 |     return df
50 | 
51 | '''this function creates a dataframe for a selected stock with
52 | dates as the index and the chosen columns
53 | 
54 | *symbol : stock symbol
55 | *start_date : start date of the dataframe's date index
56 | *end_date : end date of the dataframe's date index
57 | *columns : columns to include in the dataframe
58 | '''
59 | def get_stock_data(symbol, start_date=None, end_date=None, columns=["Date", "Adj Close"]):
60 | 
61 |     df = pd.read_csv(symbol_to_path(symbol), index_col="Date",
62 |                      parse_dates=True, usecols=columns,
63 |                      na_values="nan")
64 |     return df[start_date:end_date]
65 | 
66 | '''this function plots a given dataframe
67 | 
68 | *dataframe : dataframe to be plotted
69 | *plot_title : the plot title
70 | *xlabel : the horizontal axis label
71 | *ylabel : the vertical axis label
72 | *leg_loc : legend location
73 | *show_plot : boolean to indicate whether to display the plot
74 | '''
75 | def plot_data(dataframe, plot_title, xlabel, ylabel, leg_loc="best", show_plot=True):
76 |     ax = dataframe.plot(title=plot_title)
77 |     ax.set_xlabel(xlabel)
78 |     ax.set_ylabel(ylabel)
79 |     ax.legend(loc=leg_loc)
80 |     ax.grid(True)
81 |     if show_plot:
82 |         plt.show()
83 | 
84 | '''a tester function
85 | '''
86 | def main():
87 |     start_date = "01/01/2017"
88 |     end_date = "31/12/2017"
89 |     symbols = ["GOOG", "AAPL", "FB"]
90 |     df = get_data(symbols, start_date, end_date)
91 |     print(df)
92 | 
93 |     column_slicing = ['SPY', 'GOOG']
94 |     dataframe_sliced = df.loc[:, column_slicing]
95 |     plot_data(dataframe_sliced, "Selected Stock Prices", "Date", "Price")
96 | 
97 | '''to ensure running the tester function only when this file is run, not imported
98 | '''
99 | if __name__ == "__main__":
100 |     main()
--------------------------------------------------------------------------------