├── .gitignore ├── README.md ├── doc └── bachelor_thesis.pdf ├── machine_learning ├── __init__.py ├── development │ ├── dataset_preprocessing.py │ ├── keras_ffnn.py │ ├── keras_lstm.py │ ├── knn_regression.py │ ├── knn_wrapper.py │ ├── linear_regression.py │ ├── new_evaluation.md │ ├── new_regression │ │ ├── lin_reg.py │ │ ├── new_dataset.py │ │ ├── new_knn_regression.py │ │ └── new_linear_regression.py │ ├── optimized_ffnn │ │ ├── ffnn.py │ │ ├── ffnn_hyperparam_tune.py │ │ ├── ffnn_hypparam_tune_main.py │ │ ├── ffnn_main.py │ │ └── ffnn_optimal_hyperparameters.txt │ ├── optimized_lstm │ │ ├── hyperparam_tune_main.py │ │ ├── hyperparameter_tunning.py │ │ ├── lstm.py │ │ ├── lstm_main.py │ │ └── optimal_hyperparameters.txt │ ├── original_evaluation.md │ ├── technical_indicators_dataset.py │ └── testing │ │ ├── analysis.py │ │ ├── companies.py │ │ ├── future_gap_test.py │ │ ├── lag_metric.py │ │ ├── results │ │ ├── amazon.md │ │ ├── analysis.md │ │ ├── apple.md │ │ ├── eval.md │ │ ├── facebook.md │ │ ├── future_gap.md │ │ ├── tesla.md │ │ └── window_and_ts.md │ │ ├── test.py │ │ └── window_plot_test.py ├── final │ ├── evaluation │ │ └── metrics.py │ ├── experiments │ │ ├── exp1.py │ │ ├── exp2.py │ │ ├── exp3.py │ │ ├── exp4.py │ │ └── exp5.py │ ├── models │ │ ├── ffnn.py │ │ ├── knn_reg.py │ │ ├── knn_wrapper.py │ │ ├── lin_reg.py │ │ └── lstm.py │ └── utils │ │ └── dataset.py └── readme.md ├── resources └── historical_data │ ├── AAPL.csv │ ├── AMZN.csv │ ├── FB.csv │ ├── GOOG.csv │ ├── IBM.csv │ ├── MSFT.csv │ ├── SPY.csv │ ├── TSLA.csv │ └── ^GSPC.csv ├── results ├── amazon.png ├── apple.png ├── experiments │ ├── exp1 │ │ ├── ffnn.png │ │ ├── knn.png │ │ ├── lin_reg.png │ │ └── lstm.png │ ├── exp2 │ │ ├── sudden_vs_normal.png │ │ ├── sudden_vs_normal_daily_lag.png │ │ ├── sudden_vs_normal_lag.png │ │ └── sudden_vs_normal_pal.png │ ├── exp3 │ │ ├── amazon.png │ │ ├── apple.png │ │ ├── facebook.png │ │ └── tesla.png │ ├── exp4 │ │ ├── gap1.png │ │ ├── gap2.png │ │ ├── gap3.png │ │ ├── gap4.png │ │ └── gap5.png │ └── exp5 │ │ ├── linreg_forecast.png │ │ ├── linreg_forecast_pal.png │ │ ├── linreg_pal.png │ │ ├── linreg_pal_daily.png │ │ ├── lstm_forecast.png │ │ ├── lstm_forecast_pal.png │ │ ├── lstm_pal.png │ │ └── lstm_pal_daily.png ├── facebook.png ├── ffnn_reg.png ├── future_gap_test.png ├── hyperparam_tune_ffnn1.png ├── hyperparam_tune_ffnn2.png ├── hyperparam_tune_lstm1.png ├── hyperparam_tune_lstm2.png ├── knn.png ├── lin_reg.png ├── lstm.png ├── new_lin_reg.png ├── optimized_ffnn.png ├── optimized_lstm.png ├── stable.png ├── stable_lag.png ├── sudden_vs_normal.png ├── sudden_vs_normal_daily_lag.png ├── sudden_vs_normal_lag.png ├── sudden_vs_normal_pal.png ├── sudden_vs_normal_pal_1.png ├── tesla.png ├── volatile.png ├── volatile_lag.png ├── window_test_1.png └── window_test_2.png ├── statistics_and_optimization ├── __init__.py ├── bollinger_bands.py ├── portfolio_optimization.py ├── portfolio_statistics.py └── readme.md └── utils ├── README.md ├── __init__.py └── util.py /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | __pycache__ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ML4T 2 | *Machine Learning for Trading* 3 | 4 | ## Project Overview 5 | *GUC 2018 Bachelor Thesis Project* 6 | 7 | Stock market prediction is an interesting realm to test the capabilities of machine learning 8 | on. 
The nature of the stock market is volatile, complex, and highly sensitive to external 9 | information, which makes it difficult to predict. Different machine learning models 10 | are developed to forecast future stock prices. Using historical stock market data, technical 11 | indicators are computed and used along with a stock’s price as features associated 12 | with a target output, which is the future stock price. This provides a dataset that the 13 | machine learning models train on, and thus the models become capable of predicting 14 | future prices. The models used are: linear regressor, kNN regressor, Feedforward 15 | Neural Network (FFNN), and Long Short Term Memory (LSTM) Recurrent Neural Network 16 | (RNN). The prediction models are compared and evaluated using different metrics. 17 | Several case studies are performed to evaluate the performance of the machine learning 18 | models. From the case studies, a few insights were gained: 19 | 20 | 1. The LSTM RNN outperformed all the other models. 21 | 2. The LSTM RNN model is capable of accurately predicting the next-day price, unless a major external event impacts the stock price suddenly. 22 | 3. The LSTM RNN model naturally lags in picking up on external events that impact the stock price suddenly. 23 | 24 | ## Algorithms Evaluation 25 | *Development Phase* 26 | 27 | * [Original](https://github.com/ahmedhamdi96/ML4T/blob/master/machine_learning/development/original_evaluation.md) 28 | * [New](https://github.com/ahmedhamdi96/ML4T/blob/master/machine_learning/development/new_evaluation.md) 29 | 30 | ## Testing 31 | *Testing Phase* 32 | 33 | * Since the LSTM model is regarded as the flagship machine learning model in this project, 34 | it is the one used in this testing section. 35 | 36 | * The model is trained on the period starting from a company's first public trading day until the day 37 | before the required testing period. 38 | 39 | ### Companies During Times of Change 40 | *Predicting stock prices for a portfolio of 4 companies during different interesting time periods* 41 | 42 | * **[Facebook](https://github.com/ahmedhamdi96/ML4T/blob/master/machine_learning/development/testing/results/facebook.md)** 43 | 44 | *Facebook started trading publicly on 18/05/2012.* 45 | 46 | * Facebook–Cambridge Analytica data scandal, [January/2018 - March/2018] 47 | 48 | Amid the scandal and Mark Zuckerberg's public hearing, Facebook's stock price fell. 49 | 50 | * **[Apple](https://github.com/ahmedhamdi96/ML4T/blob/master/machine_learning/development/testing/results/apple.md)** 51 | 52 | *Apple started trading publicly on 12/12/1980.* 53 | 54 | * Apple's first free fall, [September/2012 - June/2013] 55 | 56 | Apple faced multiple hardships during this period; earnings were no longer growing, 57 | low-priced phones were capturing most of the smartphone market share over the iPhone, 58 | and the company entered the "post-Steve Jobs" era, where the company's next generation 59 | of leaders and products were in question. 60 | 61 | * **[Tesla](https://github.com/ahmedhamdi96/ML4T/blob/master/machine_learning/development/testing/results/tesla.md)** 62 | 63 | *Tesla started trading publicly on 29/06/2010.* 64 | 65 | * Disappointing Q3 Reports, [September/2013 - November/2013] 66 | 67 | Tesla reported disappointing third-quarter financial results. In addition, 68 | a third widely reported fire involving a Model S in just two months was 69 | putting Tesla under pressure.
70 | 71 | * **[Amazon](https://github.com/ahmedhamdi96/ML4T/blob/master/machine_learning/development/testing/results/amazon.md)** 72 | 73 | *Amazon started trading publicly on 15/05/1997.* 74 | 75 | * Exceeding Q3 expectations, [September/2017 - February/2018] 76 | 77 | Amazon's Q3 reports showed an increase in profits, an acceleration in revenue growth, an increase 78 | in AWS' operating income, and the success of Alexa-enabled devices. 79 | 80 | ### Window and Time Steps Test 81 | A test to determine the optimal window and time steps. See results [here](https://github.com/ahmedhamdi96/ML4T/blob/master/machine_learning/development/testing/results/window_and_ts.md). 82 | 83 | ### Evaluation Metrics 84 | New metrics to evaluate the performance of the model over different future gaps. See results [here](https://github.com/ahmedhamdi96/ML4T/blob/master/machine_learning/development/testing/results/eval.md). 85 | 86 | ### Future Gap Test 87 | A test comparing the linear regressor, FFNN, and LSTM RNN over different future gaps. See results [here](https://github.com/ahmedhamdi96/ML4T/blob/master/machine_learning/development/testing/results/future_gap.md). 88 | 89 | ## Analysis 90 | *Analysing the tests using a novel metric* 91 | 92 | To analyse a forecast and evaluate how quickly the model predicts a price close to the actual one, a lag metric is created. 93 | The **_Prediction-Actual Lag (PAL)_** metric works as follows: 94 | The future gap chosen when making the forecast indicates how far into the future a prediction is; for example, if the future gap is set to 1, the forecast is a next-trading-day forecast. The actual prices are then traversed and compared with the predictions: each actual price datapoint is compared against a number of prediction datapoints equal to the future gap. If the future gap is set to 5, for instance, each actual datapoint is compared to the corresponding prediction datapoint and the 4 after it. See **_PAL_** in action [here](https://github.com/ahmedhamdi96/ML4T/blob/master/machine_learning/development/testing/results/analysis.md). 95 | 96 | ## Software and Libraries 97 | *This project uses the following software and Python libraries:* 98 | 99 | * [NumPy](http://www.numpy.org/) 100 | * [pandas](http://pandas.pydata.org/) 101 | * [matplotlib](https://matplotlib.org/index.html) 102 | * [SciPy](https://www.scipy.org/) 103 | * [TensorFlow](https://www.tensorflow.org) 104 | * [Keras](https://keras.io/) 105 | * [scikit-learn](http://scikit-learn.org/stable/) 106 | * [TA-Lib](https://mrjbq7.github.io/ta-lib/doc_index.html) 107 | -------------------------------------------------------------------------------- /doc/bachelor_thesis.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/doc/bachelor_thesis.pdf -------------------------------------------------------------------------------- /machine_learning/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/machine_learning/__init__.py -------------------------------------------------------------------------------- /machine_learning/development/dataset_preprocessing.py: -------------------------------------------------------------------------------- 1 | '''This file constructs a dataset to be used by the ML algorithms.
2 | The dataset consists of the past price and technical indicators as 3 | features, and the price as the output. The dataset is indexed by 4 | date; a row entry contains the price and technical indicators of 5 | a day prior to the date index, while the price column holds the actual 6 | price of the stock on the date marked by the index. 7 | ''' 8 | from utils.util import get_data 9 | import numpy as np 10 | import pandas as pd 11 | 12 | '''technical indicators computation functions 13 | 14 | *prices : adjusted closing stock prices 15 | *window : rolling statistics window 16 | ''' 17 | #BEGIN 18 | def compute_momentum_ratio(prices, window): 19 | #first window elements >> NA 20 | momentum_ratio = (prices/prices.shift(periods = window)) - 1 21 | return momentum_ratio 22 | 23 | def compute_sma_ratio(prices, window): 24 | #Simple Moving Average 25 | #first window-1 elements >> NA 26 | sma_ratio = (prices / prices.rolling(window = window).mean()) - 1 27 | return sma_ratio 28 | 29 | def compute_bollinger_bands_ratio(prices, window): 30 | #first window-1 elements >> NA 31 | bb_ratio = prices - prices.rolling(window = window).mean() 32 | bb_ratio = bb_ratio / (2 * prices.rolling(window = window).std()) 33 | return bb_ratio 34 | 35 | def compute_volatility_ratio(prices, window): 36 | #first window-1 elements >> NA 37 | volatility_ratio = ((prices/prices.shift(periods = 1)) - 1).rolling(window = window).std() 38 | return volatility_ratio 39 | 40 | def compute_vroc_ratio(volume, window): 41 | #Volume Rate of Change 42 | #first window-1 elements >> NA 43 | vroc_ratio = (volume/volume.shift(periods = window)) - 1 44 | return vroc_ratio 45 | 46 | def compute_daily_return_volatility(prices, window): 47 | #first window-1 elements >> NA 48 | daily_return = (prices/prices.shift(periods= 1)) - 1 49 | volatility = daily_return.rolling(window=window).std() 50 | return volatility 51 | #END 52 | 53 | '''dataset constructor function 54 | 55 | *start_date : start date for the entire dataset (training and testing) 56 | *end_date : end date for the entire dataset (training and testing) 57 | *stock : stock label to be used in the dataset 58 | ''' 59 | def get_dataset_dataframe(start_date='17/12/2014', end_date = '31/12/2017', stock='IBM'): 60 | #importing stock data 61 | stock_df = get_data([stock], start_date, end_date) 62 | date_range = pd.date_range(start_date, end_date) 63 | dataset_df = pd.DataFrame(index=date_range) 64 | 65 | #calculating technical indicators 66 | #make sure to include the last 2 weeks of 2014 to compensate for the calculation loss 67 | #1st week is lost in the preparation of the indicators 68 | #2nd week is lost to include the future gap 69 | future_gap = 5 #1 trading week 70 | dataset_df['price'] = stock_df[stock] 71 | dataset_df.dropna(subset=['price'], inplace=True) 72 | dataset_df['momentum'] = compute_momentum_ratio(stock_df[stock], future_gap) 73 | dataset_df['sma'] = compute_sma_ratio(stock_df[stock], future_gap) 74 | dataset_df['bolinger_band'] = compute_bollinger_bands_ratio(stock_df[stock], future_gap) 75 | #dataset_df['volatility'] = compute_daily_return_volatility(stock_df[stock], future_gap) 76 | dataset_df.dropna(subset=dataset_df.columns, inplace=True) 77 | dataset_df = dataset_df.shift(future_gap) 78 | shifted_columns_names = ['price(t-%d)' %(future_gap), 'moment(t-%d)' %(future_gap), 'sma(t-%d)' %(future_gap), 79 | 'b_band(t-%d)' %(future_gap)] 80 | dataset_df.columns = shifted_columns_names 81 | dataset_df.dropna(subset=shifted_columns_names, inplace=True) 82 | dataset_df['price'] =
stock_df[stock] 83 | 84 | return dataset_df -------------------------------------------------------------------------------- /machine_learning/development/keras_ffnn.py: -------------------------------------------------------------------------------- 1 | ''' this file uses a keras feed-forward-NN to predict stock prices 2 | one trading week in advance 3 | ''' 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from keras.models import Sequential 7 | from keras.layers import Dense 8 | from sklearn.preprocessing import MinMaxScaler 9 | from sklearn.metrics import mean_squared_error 10 | from machine_learning.development.dataset_preprocessing import get_dataset_dataframe 11 | 12 | '''a tester function 13 | ''' 14 | def main(): 15 | #getting the preprocessed dataset dataframe 16 | dataset_df = get_dataset_dataframe() 17 | #dataset preparation 18 | dataset = dataset_df.values 19 | #dataset scaling 20 | scaler = MinMaxScaler(feature_range=(0, 1)) 21 | dataset = scaler.fit_transform(dataset) 22 | #dataset splitting 23 | training_start_index = 0 24 | training_end_index = 503 25 | testing_start_index = 504 26 | testing_end_index = 755 27 | X_train = dataset[training_start_index:training_end_index+1, :-1] 28 | Y_train = dataset[training_start_index:training_end_index+1, -1] 29 | X_test = dataset[testing_start_index:testing_end_index+1, :-1] 30 | Y_test = dataset[testing_start_index:testing_end_index+1, -1] 31 | #Feed Forward NN model 32 | model = Sequential() 33 | model.add(Dense(20, input_dim=4, activation='relu', kernel_initializer='normal')) 34 | model.add(Dense(10, activation='relu', kernel_initializer='normal')) 35 | model.add(Dense(1, kernel_initializer='normal')) 36 | model.compile(loss='mse', optimizer='adam') 37 | #fitting the training data 38 | history = model.fit(X_train, Y_train, epochs=200, batch_size=int(X_train.shape[0]/8), 39 | validation_split=0.2, verbose=2, shuffle=False) 40 | #evaluating the testing data 41 | results = model.evaluate(X_test, Y_test) 42 | results_names = model.metrics_names 43 | print(results_names, ":", results) 44 | #predictions 45 | predictions_scaled = model.predict(X_test) 46 | test_dataset_scaled = np.concatenate((X_test, predictions_scaled), axis=1) 47 | test_dataset_unscaled = scaler.inverse_transform(test_dataset_scaled) 48 | predictions_unscaled = test_dataset_unscaled[:, -1] 49 | #actual values 50 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 51 | test_dataset_scaled = np.concatenate((X_test, Y_test), axis=1) 52 | test_dataset_unscaled = scaler.inverse_transform(test_dataset_scaled) 53 | Y_test_unscaled = test_dataset_unscaled[:, -1] 54 | #evaluation 55 | rmse = (mean_squared_error(predictions_unscaled, Y_test_unscaled) ** 0.5) 56 | print('Test RMSE: %.3f' %(rmse)) 57 | correlation = np.corrcoef(predictions_unscaled, Y_test_unscaled) 58 | print("Correlation: %.3f"%(correlation[0, 1])) 59 | #plotting 60 | _, (ax1, ax2) = plt.subplots(2,1) 61 | ax1.plot(history.history['loss'], label='Training') 62 | ax1.plot(history.history['val_loss'], label='Validation') 63 | ax1.set_xlabel('Epoch #') 64 | ax1.set_ylabel('Loss') 65 | ax1.legend(loc='best') 66 | ax1.grid(True) 67 | 68 | ax2.plot(range(len(predictions_unscaled)), predictions_unscaled, label='Prediction') 69 | ax2.plot(range(len(Y_test_unscaled)), Y_test_unscaled, label='Actual') 70 | ax2.set_xlabel('Trading Day') 71 | ax2.set_ylabel('Price') 72 | ax2.legend(loc='best') 73 | ax2.grid(True) 74 | 75 | plt.show() 76 | 77 | '''to ensure running the tester function only when this file is run, not imported 78 | ''' 79 | if __name__ == "__main__": 80 | main()
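 81 | 82 | '''a helper sketch (an addition for illustration, not in the original project): the 83 | unscaling steps in main() work by padding the scaled predictions back into a full-width 84 | matrix, so that scaler.inverse_transform sees the same shape it was fit on; the same 85 | idea as a small reusable function, with an illustrative name 86 | ''' 87 | def inverse_transform_target(scaler, scaled_target, scaled_features): 88 | #pad the (m, 1) scaled target back into an (m, features+1) matrix, 89 | #invert the scaling, then keep only the target (last) column 90 | padded = np.concatenate((scaled_features, scaled_target.reshape(-1, 1)), axis=1) 91 | return scaler.inverse_transform(padded)[:, -1] 92 | #e.g.: predictions_unscaled = inverse_transform_target(scaler, predictions_scaled, X_test)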
-------------------------------------------------------------------------------- /machine_learning/development/keras_lstm.py: -------------------------------------------------------------------------------- 1 | ''' this file uses a keras LSTM RNN to predict stock prices 2 | one trading week in advance 3 | ''' 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from keras.models import Sequential 7 | from keras.layers import LSTM, Dense 8 | from sklearn.preprocessing import MinMaxScaler 9 | from sklearn.metrics import mean_squared_error 10 | from machine_learning.development.dataset_preprocessing import get_dataset_dataframe 11 | 12 | '''a tester function 13 | ''' 14 | def main(): 15 | #getting the preprocessed dataset dataframe 16 | dataset_df = get_dataset_dataframe() 17 | #dataset preparation 18 | dataset = dataset_df.values 19 | #dataset scaling 20 | scaler = MinMaxScaler(feature_range=(0, 1)) 21 | dataset = scaler.fit_transform(dataset) 22 | training_start_index = 0 23 | training_end_index = 503 24 | testing_start_index = 504 25 | testing_end_index = 755 26 | #dataset splitting 27 | X_train = dataset[training_start_index:training_end_index+1, :-1] 28 | Y_train = dataset[training_start_index:training_end_index+1, -1] 29 | X_test = dataset[testing_start_index:testing_end_index+1, :-1] 30 | Y_test = dataset[testing_start_index:testing_end_index+1, -1] 31 | print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape) 32 | #reshaping the dataset for the LSTM RNN 33 | X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1])) 34 | X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1])) 35 | print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape) 36 | (samples, timesteps, features) = X_train.shape 37 | #LSTM RNN model 38 | model = Sequential() 39 | model.add(LSTM(100, input_shape=(timesteps, features))) 40 | model.add(Dense(1)) 41 | model.compile(loss='mse', optimizer='adam') 42 | #fitting the training data 43 | history = model.fit(X_train, Y_train, epochs=200, batch_size=int(samples/8), 44 | validation_split=0.2, verbose=2, shuffle=False) 45 | #evaluating the testing data 46 | results = model.evaluate(X_test, Y_test) 47 | results_names = model.metrics_names 48 | print("Test", results_names, ":", results) 49 | #predictions 50 | predictions_scaled = model.predict(X_test) 51 | X_test = X_test.reshape((X_test.shape[0], X_test.shape[2])) 52 | test_dataset_scaled = np.concatenate((X_test, predictions_scaled), axis=1) 53 | test_dataset_unscaled = scaler.inverse_transform(test_dataset_scaled) 54 | predictions_unscaled = test_dataset_unscaled[:, -1] 55 | #actual values 56 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 57 | test_dataset_scaled = np.concatenate((X_test, Y_test), axis=1) 58 | test_dataset_unscaled = scaler.inverse_transform(test_dataset_scaled) 59 | Y_test_unscaled = test_dataset_unscaled[:, -1] 60 | #evaluation 61 | rmse = (mean_squared_error(predictions_unscaled, Y_test_unscaled) ** 0.5) 62 | print('Test RMSE: %.3f' %(rmse)) 63 | correlation = np.corrcoef(predictions_unscaled, Y_test_unscaled) 64 | print("Correlation: %.3f"%(correlation[0, 1])) 65 | #plots 66 | _, (ax1, ax2) = plt.subplots(2,1) 67 | ax1.plot(history.history['loss'], label='Training') 68 | ax1.plot(history.history['val_loss'], label='Validation') 69 | ax1.set_xlabel('Epoch #') 70 | ax1.set_ylabel('Loss') 71 | ax1.legend(loc='best') 72 | ax1.grid(True) 73 | 74 | ax2.plot(range(len(predictions_unscaled)), predictions_unscaled, label='Prediction') 75 |
ax2.plot(range(len(Y_test_unscaled)), Y_test_unscaled, label='Actual') 76 | ax2.set_xlabel('Trading Day') 77 | ax2.set_ylabel('Price') 78 | ax2.legend(loc='best') 79 | ax2.grid(True) 80 | 81 | plt.show() 82 | 83 | '''to ensure running the tester function only when this file is run, not imported 84 | ''' 85 | if __name__ == "__main__": 86 | main() -------------------------------------------------------------------------------- /machine_learning/development/knn_regression.py: -------------------------------------------------------------------------------- 1 | ''' this file shows an implementation of kNN regression to 2 | predict stock prices one trading week in advance 3 | ''' 4 | from utils.util import get_data, plot_data 5 | from machine_learning.development.dataset_preprocessing import get_dataset_dataframe 6 | from machine_learning.development.linear_regression import calculate_rmse 7 | import machine_learning.development.knn_wrapper as knn 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | '''a tester function 12 | ''' 13 | def main(): 14 | #getting the preprocessed dataset dataframe 15 | dataset_df = get_dataset_dataframe() 16 | #dataset preparation 17 | dataset = dataset_df.values 18 | #dataset splitting 19 | training_start_index = 0 20 | training_end_index = 503 21 | testing_start_index = 504 22 | testing_end_index = 755 23 | X_train = dataset[training_start_index:training_end_index+1, :-1] 24 | Y_train = dataset[training_start_index:training_end_index+1, -1] 25 | X_test = dataset[testing_start_index:testing_end_index+1, :-1] 26 | Y_test = dataset[testing_start_index:testing_end_index+1, -1] 27 | #kNN model 28 | model = knn.knn(3) 29 | #fitting the training data 30 | model.train(X_train, Y_train) 31 | #predictions 32 | predictions = model.query(X_test) 33 | #evaluation 34 | rmse = calculate_rmse(predictions, Y_test) #calculate_rmse already takes the square root 35 | print('Test RMSE: %.3f' %(rmse)) 36 | correlation = np.corrcoef(predictions, Y_test) 37 | print("Correlation: %.3f"%(correlation[0, 1])) 38 | #plotting 39 | _, ax = plt.subplots() 40 | ax.plot(range(len(predictions)), predictions, label='Prediction') 41 | ax.plot(range(len(Y_test)), Y_test, label='Actual') 42 | ax.set_xlabel('Trading Day') 43 | ax.set_ylabel('Price') 44 | ax.legend(loc='best') 45 | ax.grid(True) 46 | 47 | plt.show() 48 | 49 | '''to ensure running the tester function only when this file is run, not imported 50 | ''' 51 | if __name__ == "__main__": 52 | main() -------------------------------------------------------------------------------- /machine_learning/development/knn_wrapper.py: -------------------------------------------------------------------------------- 1 | ''' this file contains an implementation of kNN regression 2 | ''' 3 | import numpy as np 4 | 5 | '''kNN wrapper class 6 | 7 | *k : k nearest neighbors to be considered 8 | *dataset : training dataset including the features and the output 9 | ''' 10 | class knn: 11 | __k = 0 12 | __dataset = None 13 | 14 | '''constructor function 15 | 16 | *k : k nearest neighbors to be considered 17 | ''' 18 | def __init__(self, k): 19 | self.__k = k 20 | 21 | '''training function 22 | 23 | *data_x : training dataset features 24 | *data_y : training dataset output 25 | ''' 26 | def train(self, data_x, data_y): 27 | data_y_reshaped = data_y.reshape((data_y.shape[0], 1)) 28 | self.__dataset = np.concatenate((data_x, data_y_reshaped), axis=1) 29 | 30 | '''querying/evaluating function 31 | 32 | *features : test dataset features 33 | ''' 34 | def query(self, features,
normalize=True, addDiff=True): 35 | dataset_price_normed = self.__dataset[:, 0] 36 | features_price_normed = features[:, 0] 37 | 38 | if normalize: 39 | dataset_price_normed = (self.__dataset[:, 0]/self.__dataset[0, 0]) - 1 40 | features_price_normed = (features[:, 0]/features[0, 0]) - 1 41 | 42 | cumm_difference = np.zeros(features.shape[0]) 43 | predicted_price = np.zeros(features.shape[0]) 44 | 45 | for i in range(0, features.shape[0]): 46 | 47 | price_normed_difference = np.absolute(dataset_price_normed - features_price_normed[i]) 48 | moment_difference = np.absolute(self.__dataset[:, 1] - features[i, 1]) 49 | sma_difference = np.absolute(self.__dataset[:, 2] - features[i, 2]) 50 | b_band_difference = np.absolute(self.__dataset[:, 3] - features[i, 3]) 51 | 52 | cumm_difference = price_normed_difference + moment_difference + sma_difference + b_band_difference 53 | difference_op = np.asarray([cumm_difference, self.__dataset[:, -1]]).T 54 | sorting_index = np.argsort(difference_op[:, 0]) 55 | difference_sorted = difference_op[sorting_index] 56 | 57 | k_mean = np.mean(difference_sorted[:self.__k, 1]) 58 | predicted_price[i] = k_mean 59 | 60 | if addDiff: 61 | predicted_price += (features[0, 0] - self.__dataset[0, 0]) 62 | return predicted_price -------------------------------------------------------------------------------- /machine_learning/development/linear_regression.py: -------------------------------------------------------------------------------- 1 | ''' this file shows an implementation of linear regression to 2 | predict stock prices one trading week in advance. SciPy's 3 | minimize function is used to optimize the fitted linear line 4 | coefficients 5 | ''' 6 | from utils.util import get_data 7 | import numpy as np 8 | import scipy.optimize as spo 9 | import matplotlib.pyplot as plt 10 | from machine_learning.development.dataset_preprocessing import get_dataset_dataframe 11 | 12 | '''computes and returns the root mean squared error 13 | 14 | *x : a dynamic variable: (value, array, ...) 15 | *y : a dynamic variable: (value, array, ...) 
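 e.g. (an illustrative, editor-added example): for x = np.array([1., 2., 3.]) and y = np.array([1., 2., 5.]), the squared errors are [0, 0, 4], MSE = 4/3, and RMSE = sqrt(4/3) ≈ 1.155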
16 | ''' 17 | def calculate_rmse(x, y): 18 | #squared error 19 | se = (x-y) ** 2 20 | #mean squared error 21 | mse = np.mean(se) 22 | #root mean squared error 23 | rmse = mse ** 0.5 24 | return rmse 25 | 26 | '''given the fitted line coefficients and the dataset, this 27 | function computes the rmse between the actual values and 28 | the predicted values of the six-feature linear regression (price, momentum, sma, bollinger band, volatility, vroc) 29 | 30 | *coefficients : fitted line coefficients array 31 | *data : dataset containing the features and the output 32 | ''' 33 | def new_error_fun(coefficients, data): 34 | price = coefficients[0]*data[:, 0] 35 | moment = coefficients[1]*data[:, 1] 36 | sma = coefficients[2]*data[:, 2] 37 | b_band = coefficients[3]*data[:, 3] 38 | std = coefficients[4]*data[:, 4] 39 | vroc = coefficients[5]*data[:, 5] 40 | constant = coefficients[6] 41 | predicted_values = price+moment+sma+b_band+std+vroc+constant 42 | actual_values = data[:, -1] 43 | rmse = calculate_rmse(predicted_values, actual_values) 44 | return rmse 45 | 46 | '''given the fitted line coefficients and the dataset, this 47 | function computes the rmse between the actual values and 48 | the predicted values of the four-feature linear regression (price, momentum, sma, bollinger band) 49 | 50 | *coefficients : fitted line coefficients array 51 | *data : dataset containing the features and the output 52 | ''' 53 | def error_fun(coefficients, data): 54 | price = coefficients[0]*data[:, 0] 55 | moment = coefficients[1]*data[:, 1] 56 | sma = coefficients[2]*data[:, 2] 57 | b_band = coefficients[3]*data[:, 3] 58 | constant = coefficients[4] 59 | predicted_values = price+moment+sma+b_band+constant 60 | actual_values = data[:, -1] 61 | rmse = calculate_rmse(predicted_values, actual_values) 62 | return rmse 63 | 64 | '''given the data to be passed to the error fcn, this function 65 | computes an initial guess of the coefficients and uses SciPy's 66 | minimize fcn and the error fcn to find the optimal coefficients 67 | 68 | *data : dataset containing the features and the output 69 | *err_fun : error function to be minimized by SciPy's minimizer 70 | ''' 71 | def minimize_new_err_fun(data, err_fun): 72 | price = np.mean(data[:, 0]) 73 | moment = np.mean(data[:, 1]) 74 | sma = np.mean(data[:, 2]) 75 | b_band = np.mean(data[:, 3]) 76 | std = np.mean(data[:, 4]) 77 | vroc = np.mean(data[:, 5]) 78 | constant = 0 79 | coefficients_guess = [price, moment, sma, b_band, std, vroc, constant] 80 | result = spo.minimize(err_fun, coefficients_guess, args=(data, ), method="SLSQP", options= {'disp' : True}) 81 | return result.x 82 | 83 | '''given the data to be passed to the error fcn, this function 84 | computes an initial guess of the coefficients and uses SciPy's 85 | minimize fcn and the error fcn to find the optimal coefficients 86 | 87 | *data : dataset containing the features and the output 88 | *err_fun : error function to be minimized by SciPy's minimizer 89 | ''' 90 | def minimize_err_fun(data, err_fun): 91 | price = np.mean(data[:, 0]) 92 | moment = np.mean(data[:, 1]) 93 | sma = np.mean(data[:, 2]) 94 | b_band = np.mean(data[:, 3]) 95 | constant = 0 96 | coefficients_guess = [price, moment, sma, b_band, constant] 97 | result = spo.minimize(err_fun, coefficients_guess, args=(data, ), method="SLSQP", options= {'disp' : True}) 98 | return result.x 99 | 100 | '''a normalization fcn 101 | 102 | *values : values to be normalized 103 | *mean : mean of the values 104 | *std : standard deviation of the values 105 | ''' 106 | def normalize(values, mean, std): 107 | return (values - mean) / std 108 | 109 | '''an inverse-normalization fcn 110 | 111 | *values :
normalized values 112 | *mean : mean of the normalized values 113 | *std : standard deviation of the normalized values 114 | ''' 115 | def inverse_normalize(normalized_values, mean, std): 116 | return (normalized_values * std) + mean 117 | 118 | '''a tester function 119 | ''' 120 | def main(): 121 | #getting the preprocessed dataset dataframe 122 | dataset_df = get_dataset_dataframe() 123 | #dataset preparation 124 | dataset = dataset_df.values 125 | #dataset normalization 126 | '''mean = np.mean(dataset, axis=0) 127 | std = np.std(dataset, axis=0) 128 | dataset_normalized = normalize(dataset, mean, std) 129 | ''' 130 | #dataset splitting 131 | training_start_index = 0 132 | training_end_index = 503 133 | testing_start_index = 504 134 | testing_end_index = 755 135 | training_set = dataset[training_start_index:training_end_index+1, :] 136 | X_test = dataset[testing_start_index:testing_end_index+1, :-1] 137 | Y_test = dataset[testing_start_index:testing_end_index+1, -1] 138 | #training 139 | fitted_line_coefficients = minimize_err_fun(training_set, error_fun) 140 | print("Line Coefficients:", fitted_line_coefficients) 141 | #testing 142 | price = fitted_line_coefficients[0]*X_test[:, 0] 143 | moment = fitted_line_coefficients[1]*X_test[:, 1] 144 | sma = fitted_line_coefficients[2]*X_test[:, 2] 145 | b_band = fitted_line_coefficients[3]*X_test[:, 3] 146 | constant = fitted_line_coefficients[4] 147 | predicted_values = price+moment+sma+b_band+constant 148 | #evaluation 149 | rmse = calculate_rmse(predicted_values, Y_test) 150 | print('RMSE: %.3f' %(rmse)) 151 | correlation = np.corrcoef(predicted_values, Y_test) 152 | print("Correlation: %.3f"%(correlation[0, 1])) 153 | #plots 154 | _, ax = plt.subplots() 155 | ax.plot(range(len(predicted_values)), predicted_values, label='Prediction') 156 | ax.plot(range(len(Y_test)), Y_test, label='Actual') 157 | ax.set_xlabel('Trading Day') 158 | ax.set_ylabel('Price') 159 | ax.legend(loc='best') 160 | ax.grid(True) 161 | plt.show() 162 | 163 | '''to ensure running the tester function only when this file is run, not imported 164 | ''' 165 | if __name__ == "__main__": 166 | main() -------------------------------------------------------------------------------- /machine_learning/development/new_evaluation.md: -------------------------------------------------------------------------------- 1 | ## Algorithms Evaluation 2 | 3 | | Variable | Value | Description | 4 | | :--------- | :---------- | :---------- | 5 | | stock | ^GSPC | S&P 500 | 6 | | start date | 1950-01-01 | stock historical data start date | 7 | | end date | 2017-12-31 | stock historical data end date | 8 | | window | 2 | window for computing rolling statistics | 9 | | future gap | 1, 5, 20 | how far (trading days) into the future is the prediction | 10 | | split | 0.8 | training-testing dataset split | 11 | 12 | ### Evaluation metrics 13 | *metrics are applied on the normalized dataset, where the values are in the range [0, 1]* 14 | 15 | 1. Loss 16 | *RMSE : accumulation of all errors, RMSE value represents dollar value 17 | *MAPE : accumulation of all error percentages, MAPE value represents percentage value 18 | 19 | 2. 
Accuracy 20 | *Correlation : linear relationship between predictions and actual values, range: [-1, 1] 21 | *r-squared : how close predictions are to actual prices, range: [0, 1] 22 | 23 | * Optimized LSTM 24 | ```sh 25 | python -m machine_learning.development.optimized_lstm.lstm_main 26 | ``` 27 | | Future Gap | RMSE | MAPE | Corr | R^2 | 28 | | :--------: | :--: | :--: | :--: | :--: | 29 | | 1 day | 0.007| 1.033| 0.999| 0.998| 30 | | 1 week | 0.012| 1.642| 0.998| 0.995| 31 | | 1 month | 0.026| 3.708| 0.992| 0.972| 32 | 33 | *shown below is a 1 trading day future gap* 34 | 35 | ![Optimized LSTM](https://github.com/ahmedhamdi96/ML4T/blob/master/results/optimized_lstm.png) 36 | 37 | * Optimized FFNN 38 | ```sh 39 | python -m machine_learning.development.optimized_ffnn.ffnn_main 40 | ``` 41 | | Future Gap | RMSE | MAPE | Corr | R^2 | 42 | | :--------: | :--: | :--: | :--: | :-: | 43 | | 1 day | 0.009| 1.401| 0.999| 0.997| 44 | | 1 week | 0.015| 2.108| 0.998| 0.992| 45 | | 1 month | 0.021| 3.014| 0.992| 0.984| 46 | 47 | *shown below is a 1 trading day future gap* 48 | 49 | ![Optimized FFNN](https://github.com/ahmedhamdi96/ML4T/blob/master/results/optimized_ffnn.png) 50 | 51 | ## Hyperparameter Tuning 52 | 53 | * LSTM 54 | ```sh 55 | python -m machine_learning.development.optimized_lstm.hyperparam_tune_main 56 | ``` 57 | *Time Elapsed: 25 hours* 58 | 59 | | Hyperparameter | Optimal Value | 60 | | :------------: | :-----------: | 61 | | Dropout | 0.2 | 62 | | Neurons | [256, 256, 32, 1] | 63 | | Decay | 0.1 | 64 | | Time Steps | 5 | 65 | | Batch Size | 2048 | 66 | | Epochs | 300 | 67 | 68 | ![LSTM Hyperparam Tune 1](https://github.com/ahmedhamdi96/ML4T/blob/master/results/hyperparam_tune_lstm1.png) 69 | ![LSTM Hyperparam Tune 2](https://github.com/ahmedhamdi96/ML4T/blob/master/results/hyperparam_tune_lstm2.png) 70 | 71 | * FFNN 72 | ```sh 73 | python -m machine_learning.development.optimized_ffnn.ffnn_hyperparam_tune_main 74 | ``` 75 | *Time Elapsed: 5.3 minutes* 76 | 77 | | Hyperparameter | Optimal Value | 78 | | :------------: | :-----------: | 79 | | Dropout | 0.8 | 80 | | Neurons | [256, 256, 64, 1] | 81 | | Decay | 0.1 | 82 | | Batch Size | 128 | 83 | | Epochs | 200 | 84 | 85 | ![FFNN Hyperparam Tune 1](https://github.com/ahmedhamdi96/ML4T/blob/master/results/hyperparam_tune_ffnn1.png) 86 | ![FFNN Hyperparam Tune 2](https://github.com/ahmedhamdi96/ML4T/blob/master/results/hyperparam_tune_ffnn2.png) -------------------------------------------------------------------------------- /machine_learning/development/new_regression/lin_reg.py: -------------------------------------------------------------------------------- 1 | from utils.util import get_stock_data 2 | import machine_learning.development.dataset_preprocessing as dpp 3 | import machine_learning.development.linear_regression as lin_reg 4 | from machine_learning.development.new_regression.new_dataset import compute_mape 5 | from sklearn.preprocessing import MinMaxScaler 6 | import numpy as np 7 | from sklearn.metrics import mean_squared_error, mean_absolute_error 8 | from sklearn.metrics import r2_score 9 | 10 | def bulid_new_TIs_dataset(stock_symbol, start_date, end_date, window, normalize=True): 11 | cols = ["Date", "Adj Close", "Volume"] 12 | df = get_stock_data(stock_symbol, start_date, end_date, cols) 13 | df.rename(columns={"Adj Close" : 'price'}, inplace=True) 14 | df['momentum'] = dpp.compute_momentum_ratio(df['price'], window) 15 | df['sma'] = dpp.compute_sma_ratio(df['price'], window) 16 | df['bolinger_band'] = 
dpp.compute_bollinger_bands_ratio(df['price'], window) 17 | df['volatility'] = dpp.compute_volatility_ratio(df['price'], window) 18 | df['vroc'] = dpp.compute_vroc_ratio(df['Volume'], window) 19 | df['actual_price'] = df['price'] 20 | df.drop(columns=["Volume"], inplace=True) 21 | df = df[window:] 22 | df.replace([np.inf, -np.inf], np.nan, inplace=True) 23 | df.fillna(method='ffill', inplace=True) 24 | df.fillna(method='bfill', inplace=True) 25 | scaler = None 26 | 27 | if normalize: 28 | scaler = MinMaxScaler() 29 | df['price'] = scaler.fit_transform(df['price'].values.reshape(-1,1)) 30 | df['momentum'] = scaler.fit_transform(df['momentum'].values.reshape(-1,1)) 31 | df['sma'] = scaler.fit_transform(df['sma'].values.reshape(-1,1)) 32 | df['bolinger_band'] = scaler.fit_transform(df['bolinger_band'].values.reshape(-1,1)) 33 | df['volatility'] = scaler.fit_transform(df['volatility'].values.reshape(-1,1)) 34 | df['vroc'] = scaler.fit_transform(df['vroc'].values.reshape(-1,1)) 35 | df['actual_price'] = scaler.fit_transform(df['actual_price'].values.reshape(-1,1)) 36 | 37 | print(df.head()) 38 | print(df.tail()) 39 | return df, scaler 40 | 41 | def dataset_reshape(dataset, future_gap, split): 42 | print("Dataset Shape:", dataset.shape) 43 | X = dataset[:, :-1] 44 | Y = dataset[:, -1] 45 | print("X Shape:", X.shape) 46 | print("Y Shape:", Y.shape) 47 | 48 | print("Applying Future Gap...") 49 | X = X[:-future_gap] 50 | Y = Y[future_gap:] 51 | print("X Shape:", X.shape) 52 | print("Y Shape:", Y.shape) 53 | 54 | if split != None: 55 | print("Applying training, testing split...") 56 | split_index = int(split*X.shape[0]) 57 | X_train = X[:split_index] 58 | X_test = X[split_index:] 59 | Y_train = Y[:split_index] 60 | Y_test = Y[split_index:] 61 | print("(X_train, Y_train, X_test, Y_test) Shapes:") 62 | print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape) 63 | return X_train, Y_train, X_test, Y_test 64 | 65 | return X, Y 66 | 67 | def evaluate(Y_test, predictions, Y_test_inv_scaled, predictions_inv_scaled): 68 | rmse = (mean_squared_error(Y_test, predictions) ** 0.5) 69 | print('\nNormalized RMSE: %.3f' %(rmse)) 70 | nrmse = ((mean_squared_error(Y_test, predictions) ** 0.5))/np.mean(Y_test) 71 | print('Normalized NRMSE: %.3f' %(nrmse)) 72 | mae = mean_absolute_error(Y_test, predictions) 73 | print('Normalized MAE: %.3f' %(mae)) 74 | mape = compute_mape(Y_test, predictions) 75 | print('Normalized MAPE: %.3f' %(mape)) 76 | correlation = np.corrcoef(Y_test.T, predictions.T) 77 | print("Normalized Correlation: %.3f"%(correlation[0, 1])) 78 | r2 = r2_score(Y_test, predictions) 79 | print("Normalized r^2: %.3f"%(r2)) 80 | normalized_metrics = [rmse, nrmse, mae, mape, correlation[0, 1], r2] 81 | 82 | #evaluating the model on the inverse-normalized dataset 83 | rmse = (mean_squared_error(Y_test_inv_scaled, predictions_inv_scaled) ** 0.5) 84 | print('\nInverse-Normalized Outsample RMSE: %.3f' %(rmse)) 85 | nrmse = ((mean_squared_error(Y_test_inv_scaled, predictions_inv_scaled) ** 0.5))/np.mean(Y_test) 86 | print('Normalized NRMSE: %.3f' %(nrmse)) 87 | mae = mean_absolute_error(Y_test_inv_scaled, predictions_inv_scaled) 88 | print('Normalized MAE: %.3f' %(mae)) 89 | mape = compute_mape(Y_test_inv_scaled, predictions_inv_scaled) 90 | print('Inverse-Normalized Outsample MAPE: %.3f' %(mape)) 91 | correlation = np.corrcoef(Y_test_inv_scaled.T, predictions_inv_scaled.T) 92 | print("Inverse-Normalized Outsample Correlation: %.3f"%(correlation[0, 1])) 93 | r2 = r2_score(Y_test_inv_scaled, 
predictions_inv_scaled) 94 | print("Inverse-Normalized Outsample r^2: %.3f"%(r2)) 95 | inv_normalized_metrics = [rmse, nrmse, mae, mape, correlation[0, 1], r2] 96 | 97 | return normalized_metrics, inv_normalized_metrics 98 | 99 | def final_test_linreg(stock_symbol, start_date, end_date, window, future_gap): 100 | #building the dataset 101 | print("> building the dataset...") 102 | df_train, _ = bulid_new_TIs_dataset(stock_symbol, None, start_date, window) 103 | df_test, scaler = bulid_new_TIs_dataset(stock_symbol, start_date, end_date, window) 104 | #reshaping the dataset for LinReg 105 | print("\n> reshaping the dataset for LinReg...") 106 | ds_train = df_train.values 107 | ds_test = df_test.values 108 | X_train, Y_train = dataset_reshape(ds_train, future_gap, None) 109 | X_test, Y_test = dataset_reshape(ds_test, future_gap, None) 110 | #fitting the training data 111 | print("\n> fitting the training data...") 112 | Y_train = Y_train.reshape((Y_train.shape[0], 1)) 113 | training_set = np.concatenate((X_train, Y_train), axis=1) 114 | fitted_line_coefficients = lin_reg.minimize_new_err_fun(training_set, lin_reg.new_error_fun) #six-feature error function 115 | print("Line Coefficients:", fitted_line_coefficients) 116 | #predictions 117 | price = fitted_line_coefficients[0]*X_test[:, 0] 118 | moment = fitted_line_coefficients[1]*X_test[:, 1] 119 | sma = fitted_line_coefficients[2]*X_test[:, 2] 120 | b_band = fitted_line_coefficients[3]*X_test[:, 3] 121 | std = fitted_line_coefficients[4]*X_test[:, 4] 122 | vroc = fitted_line_coefficients[5]*X_test[:, 5] 123 | constant = fitted_line_coefficients[6] #the constant is the last of the seven coefficients 124 | predictions = price+moment+sma+b_band+std+vroc+constant 125 | #inverse-scaling 126 | print("\n> inverse-scaling the scaled values...") 127 | predictions = predictions.reshape((predictions.shape[0], 1)) 128 | predictions_inv_scaled = scaler.inverse_transform(predictions) 129 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 130 | Y_test_inv_scaled = scaler.inverse_transform(Y_test) 131 | #evaluation 132 | normalized_metrics, inv_normalized_metrics = evaluate(Y_test, predictions, 133 | Y_test_inv_scaled, predictions_inv_scaled) 134 | #grouping the actual prices and predictions 135 | print("\n> grouping the actual prices and predictions...") 136 | feature_cols = df_test.columns.tolist() 137 | feature_cols.remove("actual_price") 138 | df_test.drop(columns=feature_cols, inplace=True) 139 | df_test.rename(columns={"actual_price" : 'Actual'}, inplace=True) 140 | df_test = df_test.iloc[future_gap:] 141 | df_test['Actual'] = Y_test_inv_scaled 142 | df_test['Prediction'] = predictions_inv_scaled 143 | 144 | return normalized_metrics, inv_normalized_metrics, df_test -------------------------------------------------------------------------------- /machine_learning/development/new_regression/new_dataset.py: -------------------------------------------------------------------------------- 1 | from utils.util import get_stock_data 2 | import machine_learning.development.dataset_preprocessing as dpp 3 | from sklearn.preprocessing import MinMaxScaler 4 | import numpy as np 5 | 6 | def compute_mape(y_true, y_pred): 7 | return np.mean(np.abs((y_true - y_pred) / y_true)) * 100 8 | 9 | def bulid_TIs_dataset(stock_symbol, start_date, end_date, window, normalize=True): 10 | cols = ["Date", "Adj Close"] 11 | df = get_stock_data(stock_symbol, start_date, end_date, cols) 12 | df.rename(columns={"Adj Close" : 'price'}, inplace=True) 13 | df['momentum'] = dpp.compute_momentum_ratio(df['price'], window) 14 | df['sma'] = dpp.compute_sma_ratio(df['price'],
window) 15 | df['bolinger_band'] = dpp.compute_bollinger_bands_ratio(df['price'], window) 16 | df['actual_price'] = df['price'] 17 | df = df[window:] 18 | scaler = None 19 | 20 | if normalize: 21 | scaler = MinMaxScaler() 22 | df['price'] = scaler.fit_transform(df['price'].values.reshape(-1,1)) 23 | df['momentum'] = scaler.fit_transform(df['momentum'].values.reshape(-1,1)) 24 | df['sma'] = scaler.fit_transform(df['sma'].values.reshape(-1,1)) 25 | df['bolinger_band'] = scaler.fit_transform(df['bolinger_band'].values.reshape(-1,1)) 26 | df['actual_price'] = scaler.fit_transform(df['actual_price'].values.reshape(-1,1)) 27 | 28 | print(df.head(10)) 29 | print(df.tail(10)) 30 | return df, scaler 31 | 32 | def dataset_reshape(dataset, future_gap, split): 33 | print("Dataset Shape:", dataset.shape) 34 | X = dataset[:, :-1] 35 | Y = dataset[:, -1] 36 | print("X Shape:", X.shape) 37 | print("Y Shape:", Y.shape) 38 | 39 | print("Applying Future Gap...") 40 | X = X[:-future_gap] 41 | Y = Y[future_gap:] 42 | print("X Shape:", X.shape) 43 | print("Y Shape:", Y.shape) 44 | 45 | print("Applying training, testing split...") 46 | split_index = int(split*X.shape[0]) 47 | X_train = X[:split_index] 48 | X_test = X[split_index:] 49 | Y_train = Y[:split_index] 50 | Y_test = Y[split_index:] 51 | print("(X_train, Y_train, X_test, Y_test) Shapes:") 52 | print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape) 53 | return X_train, Y_train, X_test, Y_test -------------------------------------------------------------------------------- /machine_learning/development/new_regression/new_knn_regression.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.new_regression import new_dataset as ds 2 | from machine_learning.development.linear_regression import calculate_rmse 3 | import machine_learning.development.knn_wrapper as knn 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from sklearn.metrics import r2_score 7 | 8 | #building the dataset 9 | print("> building the dataset...") 10 | stock_symbol = '^GSPC' 11 | start_date = '1950-01-01' 12 | end_date = '2017-12-31' 13 | window = 5 14 | dataframe, scaler = ds.bulid_TIs_dataset(stock_symbol, start_date, end_date, window) 15 | 16 | #reshaping the dataset 17 | print("\n> reshaping the dataset...") 18 | dataset = dataframe.values 19 | future_gap = 5 #1 trading week 20 | split = 0.8 #80% of the dataset 21 | X_train, Y_train, X_test, Y_test = ds.dataset_reshape(dataset, future_gap, split) 22 | 23 | #kNN model 24 | model = knn.knn(5) 25 | 26 | #fitting the training data 27 | model.train(X_train, Y_train) 28 | 29 | #predictions 30 | predictions = model.query(X_test, normalize=False, addDiff=False) 31 | 32 | #evaluating the model on the normalized dataset 33 | rmse = calculate_rmse(predictions, Y_test) 34 | print('Normalized Test RMSE: %.3f' %(rmse)) 35 | mape = ds.compute_mape(Y_test, predictions) 36 | print('Normalized Outsample MAPE: %.3f' %(mape)) 37 | correlation = np.corrcoef(predictions, Y_test) 38 | print("Normalized Correlation: %.3f"%(correlation[0, 1])) 39 | r2 = r2_score(predictions, Y_test) 40 | print("Normalized Outsample r^2: %.3f"%(r2)) 41 | 42 | #evaluating the model on the inverse-normalized dataset 43 | predictions = predictions.reshape((predictions.shape[0], 1)) 44 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 45 | 46 | predictions_inv_scaled = scaler.inverse_transform(predictions) 47 | Y_test_inv_scaled = scaler.inverse_transform(Y_test) 48 | 49 | rmse = 
calculate_rmse(predictions_inv_scaled, Y_test_inv_scaled) 50 | print('\nInverse-Normalized Outsample RMSE: %.3f' %(rmse)) 51 | mape = ds.compute_mape(Y_test_inv_scaled, predictions_inv_scaled) 52 | print('Inverse-Normalized Outsample MAPE: %.3f' %(mape)) 53 | correlation = np.corrcoef(predictions_inv_scaled.T, Y_test_inv_scaled.T) 54 | print("Inverse-Normalized Outsample Correlation: %.3f"%(correlation[0, 1])) 55 | r2 = r2_score(predictions_inv_scaled, Y_test_inv_scaled) 56 | print("Inverse-Normalized Outsample r^2: %.3f"%(r2)) 57 | 58 | #plotting 59 | _, ax = plt.subplots() 60 | ax.plot(range(len(predictions_inv_scaled)), predictions_inv_scaled, label='Prediction') 61 | ax.plot(range(len(Y_test_inv_scaled)), Y_test_inv_scaled, label='Actual') 62 | ax.set_xlabel('Trading Day') 63 | ax.set_ylabel('Price') 64 | ax.legend(loc='best') 65 | ax.grid(True) 66 | 67 | plt.show() -------------------------------------------------------------------------------- /machine_learning/development/new_regression/new_linear_regression.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.new_regression import new_dataset as ds 2 | import machine_learning.development.linear_regression as lin_reg 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | from sklearn.metrics import r2_score 6 | 7 | #building the dataset 8 | print("> building the dataset...") 9 | stock_symbol = '^GSPC' 10 | start_date = '1950-01-01' 11 | end_date = '2017-12-31' 12 | window = 5 13 | dataframe, scaler = ds.bulid_TIs_dataset(stock_symbol, start_date, end_date, window) 14 | 15 | #reshaping the dataset 16 | print("\n> reshaping the dataset...") 17 | dataset = dataframe.values 18 | future_gap = 5 #1 trading week 19 | split = 0.8 #80% of the dataset 20 | X_train, Y_train, X_test, Y_test = ds.dataset_reshape(dataset, future_gap, split) 21 | 22 | #training 23 | Y_train = Y_train.reshape((Y_train.shape[0], 1)) 24 | training_set = np.concatenate((X_train, Y_train), axis=1) 25 | fitted_line_coefficients = lin_reg.minimize_err_fun(training_set, lin_reg.error_fun) 26 | print("Line Coefficients:", fitted_line_coefficients) 27 | 28 | #testing 29 | price = fitted_line_coefficients[0]*X_test[:, 0] 30 | moment = fitted_line_coefficients[1]*X_test[:, 1] 31 | sma = fitted_line_coefficients[2]*X_test[:, 2] 32 | b_band = fitted_line_coefficients[3]*X_test[:, 3] 33 | constant = fitted_line_coefficients[4] 34 | predictions = price+moment+sma+b_band+constant 35 | 36 | #evaluating the model on the normalized dataset 37 | rmse = lin_reg.calculate_rmse(predictions, Y_test) 38 | print('\nNormalized Outsample RMSE: %.3f' %(rmse)) 39 | mape = ds.compute_mape(Y_test, predictions) 40 | print('Normalized Outsample MAPE: %.3f' %(mape)) 41 | correlation = np.corrcoef(predictions, Y_test) 42 | print("Normalized Outsample Correlation: %.3f"%(correlation[0, 1])) 43 | r2 = r2_score(predictions, Y_test) 44 | print("Normalized Outsample r^2: %.3f"%(r2)) 45 | 46 | #evaluating the model on the inverse-normalized dataset 47 | predictions = predictions.reshape((predictions.shape[0], 1)) 48 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 49 | 50 | predictions_inv_scaled = scaler.inverse_transform(predictions) 51 | Y_test_inv_scaled = scaler.inverse_transform(Y_test) 52 | 53 | rmse = lin_reg.calculate_rmse(predictions_inv_scaled, Y_test_inv_scaled) 54 | print('\nInverse-Normalized Outsample RMSE: %.3f' %(rmse)) 55 | mape = ds.compute_mape(Y_test_inv_scaled, predictions_inv_scaled) 56 | 
print('Inverse-Normalized Outsample MAPE: %.3f' %(mape)) 57 | correlation = np.corrcoef(predictions_inv_scaled.T, Y_test_inv_scaled.T) 58 | print("Inverse-Normalized Outsample Correlation: %.3f"%(correlation[0, 1])) 59 | r2 = r2_score(predictions_inv_scaled, Y_test_inv_scaled) 60 | print("Inverse-Normalized Outsample r^2: %.3f"%(r2)) 61 | 62 | #plotting 63 | _, ax = plt.subplots() 64 | ax.plot(range(len(predictions_inv_scaled)), predictions_inv_scaled, label='Prediction') 65 | ax.plot(range(len(Y_test_inv_scaled)), Y_test_inv_scaled, label='Actual') 66 | ax.set_xlabel('Trading Day') 67 | ax.set_ylabel('Price') 68 | ax.legend(loc='best') 69 | ax.grid(True) 70 | 71 | plt.show() -------------------------------------------------------------------------------- /machine_learning/development/optimized_ffnn/ffnn.py: -------------------------------------------------------------------------------- 1 | from utils.util import get_stock_data 2 | import machine_learning.development.dataset_preprocessing as dpp 3 | import numpy as np 4 | from keras.models import Sequential 5 | from keras.layers.core import Dense, Dropout 6 | from keras.optimizers import Adam 7 | from sklearn.preprocessing import MinMaxScaler 8 | from machine_learning.development.new_regression.new_dataset import compute_mape 9 | from sklearn.metrics import mean_squared_error, mean_absolute_error 10 | from sklearn.metrics import r2_score 11 | 12 | def bulid_TIs_dataset(stock_symbol, start_date, end_date, window, normalize=True): 13 | cols = ["Date", "Adj Close"] 14 | df = get_stock_data(stock_symbol, start_date, end_date, cols) 15 | df.rename(columns={"Adj Close" : 'price'}, inplace=True) 16 | df['momentum'] = dpp.compute_momentum_ratio(df['price'], window) 17 | df['sma'] = dpp.compute_sma_ratio(df['price'], window) 18 | df['bolinger_band'] = dpp.compute_bollinger_bands_ratio(df['price'], window) 19 | df['actual_price'] = df['price'] 20 | df = df[window:] 21 | scaler = None 22 | 23 | if normalize: 24 | scaler = MinMaxScaler() 25 | df['price'] = scaler.fit_transform(df['price'].values.reshape(-1,1)) 26 | df['momentum'] = scaler.fit_transform(df['momentum'].values.reshape(-1,1)) 27 | df['sma'] = scaler.fit_transform(df['sma'].values.reshape(-1,1)) 28 | df['bolinger_band'] = scaler.fit_transform(df['bolinger_band'].values.reshape(-1,1)) 29 | df['actual_price'] = scaler.fit_transform(df['actual_price'].values.reshape(-1,1)) 30 | 31 | print(df.head()) 32 | print(df.tail()) 33 | return df, scaler 34 | 35 | def bulid_new_TIs_dataset(stock_symbol, start_date, end_date, window, normalize=True): 36 | cols = ["Date", "Adj Close", "Volume"] 37 | df = get_stock_data(stock_symbol, start_date, end_date, cols) 38 | df.rename(columns={"Adj Close" : 'price'}, inplace=True) 39 | df['momentum'] = dpp.compute_momentum_ratio(df['price'], window) 40 | df['sma'] = dpp.compute_sma_ratio(df['price'], window) 41 | df['bolinger_band'] = dpp.compute_bollinger_bands_ratio(df['price'], window) 42 | df['volatility'] = dpp.compute_volatility_ratio(df['price'], window) 43 | df['vroc'] = dpp.compute_vroc_ratio(df['Volume'], window) 44 | df['actual_price'] = df['price'] 45 | df.drop(columns=["Volume"], inplace=True) 46 | df = df[window:] 47 | df.replace([np.inf, -np.inf], np.nan, inplace=True) 48 | df.fillna(method='ffill', inplace=True) 49 | df.fillna(method='bfill', inplace=True) 50 | scaler = None 51 | 52 | if normalize: 53 | scaler = MinMaxScaler() 54 | df['price'] = scaler.fit_transform(df['price'].values.reshape(-1,1)) 55 | df['momentum'] = 
scaler.fit_transform(df['momentum'].values.reshape(-1,1)) 56 | df['sma'] = scaler.fit_transform(df['sma'].values.reshape(-1,1)) 57 | df['bolinger_band'] = scaler.fit_transform(df['bolinger_band'].values.reshape(-1,1)) 58 | df['volatility'] = scaler.fit_transform(df['volatility'].values.reshape(-1,1)) 59 | df['vroc'] = scaler.fit_transform(df['vroc'].values.reshape(-1,1)) 60 | df['actual_price'] = scaler.fit_transform(df['actual_price'].values.reshape(-1,1)) 61 | 62 | print(df.head()) 63 | print(df.tail()) 64 | return df, scaler 65 | 66 | def ffnn_dataset_reshape(dataset, future_gap, split): 67 | print("Dataset Shape:", dataset.shape) 68 | X = dataset[:, :-1] 69 | Y = dataset[:, -1] 70 | print("X Shape:", X.shape) 71 | print("Y Shape:", Y.shape) 72 | 73 | print("Applying Future Gap...") 74 | X = X[:-future_gap] 75 | Y = Y[future_gap:] 76 | 77 | if split != None: 78 | print("Applying training, testing split...") 79 | split_index = int(split*X.shape[0]) 80 | X_train = X[:split_index] 81 | X_test = X[split_index:] 82 | Y_train = Y[:split_index] 83 | Y_test = Y[split_index:] 84 | print("(X_train, Y_train, X_test, Y_test) Shapes:") 85 | print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape) 86 | return X_train, Y_train, X_test, Y_test 87 | 88 | return X, Y 89 | 90 | def build_model(features, neurons, drop_out, decay=0.0): 91 | model = Sequential() 92 | 93 | model.add(Dense(neurons[0], input_dim=features, activation='relu',)) 94 | model.add(Dropout(drop_out)) 95 | 96 | model.add(Dense(neurons[1], activation='relu')) 97 | model.add(Dropout(drop_out)) 98 | 99 | model.add(Dense(neurons[2], activation='relu')) 100 | model.add(Dense(neurons[3], activation='linear')) 101 | 102 | adam = Adam(decay=decay) 103 | model.compile(loss='mse',optimizer=adam) 104 | model.summary() 105 | return model 106 | 107 | def model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, verbose, callbacks): 108 | 109 | history = model.fit( 110 | X_train, 111 | Y_train, 112 | batch_size = batch_size, 113 | epochs = epochs, 114 | validation_split = validation_split, 115 | verbose = verbose, 116 | callbacks = callbacks 117 | ) 118 | 119 | return history 120 | 121 | def evaluate_model(model, X_train, Y_train, X_test, Y_test, verbose): 122 | train_mse = model.evaluate(X_train, Y_train, verbose=verbose) 123 | print('Insample Testing: %.5f MSE (%.3f RMSE)' % (train_mse, (train_mse ** 0.5))) 124 | 125 | test_mse = model.evaluate(X_test, Y_test, verbose=verbose) 126 | print('Outsample Testing: %.5f MSE (%.3f RMSE)' % (test_mse, (test_mse ** 0.5))) 127 | 128 | return train_mse, test_mse 129 | 130 | def evaluate(Y_test, predictions, Y_test_inv_scaled, predictions_inv_scaled): 131 | rmse = (mean_squared_error(Y_test, predictions) ** 0.5) 132 | print('\nNormalized RMSE: %.3f' %(rmse)) 133 | nrmse = ((mean_squared_error(Y_test, predictions) ** 0.5))/np.mean(Y_test) 134 | print('Normalized NRMSE: %.3f' %(nrmse)) 135 | mae = mean_absolute_error(Y_test, predictions) 136 | print('Normalized MAE: %.3f' %(mae)) 137 | mape = compute_mape(Y_test, predictions) 138 | print('Normalized MAPE: %.3f' %(mape)) 139 | correlation = np.corrcoef(Y_test.T, predictions.T) 140 | print("Normalized Correlation: %.3f"%(correlation[0, 1])) 141 | r2 = r2_score(Y_test, predictions) 142 | print("Normalized r^2: %.3f"%(r2)) 143 | normalized_metrics = [rmse, nrmse, mae, mape, correlation[0, 1], r2] 144 | 145 | #evaluating the model on the inverse-normalized dataset 146 | rmse = (mean_squared_error(Y_test_inv_scaled, predictions_inv_scaled) ** 0.5) 
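 #NB: the NRMSE below divides this price-scale RMSE by np.mean(Y_test), the mean of the scaled target, rather than by np.mean(Y_test_inv_scaled), so its printout is not directly comparable to the normalized NRMSE computed above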
147 | print('\nInverse-Normalized Outsample RMSE: %.3f' %(rmse)) 148 | nrmse = ((mean_squared_error(Y_test_inv_scaled, predictions_inv_scaled) ** 0.5))/np.mean(Y_test_inv_scaled) 149 | print('Inverse-Normalized Outsample NRMSE: %.3f' %(nrmse)) 150 | mae = mean_absolute_error(Y_test_inv_scaled, predictions_inv_scaled) 151 | print('Inverse-Normalized Outsample MAE: %.3f' %(mae)) 152 | mape = compute_mape(Y_test_inv_scaled, predictions_inv_scaled) 153 | print('Inverse-Normalized Outsample MAPE: %.3f' %(mape)) 154 | correlation = np.corrcoef(Y_test_inv_scaled.T, predictions_inv_scaled.T) 155 | print("Inverse-Normalized Outsample Correlation: %.3f"%(correlation[0, 1])) 156 | r2 = r2_score(Y_test_inv_scaled, predictions_inv_scaled) 157 | print("Inverse-Normalized Outsample r^2: %.3f"%(r2)) 158 | inv_normalized_metrics = [rmse, nrmse, mae, mape, correlation[0, 1], r2] 159 | 160 | return normalized_metrics, inv_normalized_metrics 161 | 162 | def final_test_ffnn(stock_symbol, start_date, end_date, window, future_gap, neurons, 163 | drop_out, batch_size, epochs, validation_split, verbose, callbacks): 164 | #building the dataset 165 | print("> building the dataset...") 166 | df_train, _ = bulid_new_TIs_dataset(stock_symbol, None, start_date, window) 167 | df_test, scaler = bulid_new_TIs_dataset(stock_symbol, start_date, end_date, window) 168 | #reshaping the dataset for FFNN 169 | print("\n> reshaping the dataset for FFNN...") 170 | ds_train = df_train.values 171 | ds_test = df_test.values 172 | X_train, Y_train = ffnn_dataset_reshape(ds_train, future_gap, None) 173 | X_test, Y_test = ffnn_dataset_reshape(ds_test, future_gap, None) 174 | #building the FFNN model 175 | print("\n> building the FFNN model...") 176 | features = X_train.shape[1] 177 | model = build_model(features, neurons, drop_out) 178 | #fitting the training data 179 | print("\n> fitting the training data...") 180 | model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, verbose, callbacks) 181 | #predictions 182 | print("\n> testing the model for predictions...") 183 | predictions = model.predict(X_test) 184 | #inverse-scaling 185 | print("\n> inverse-scaling the scaled values...") 186 | predictions = predictions.reshape((predictions.shape[0], 1)) 187 | predictions_inv_scaled = scaler.inverse_transform(predictions) 188 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 189 | Y_test_inv_scaled = scaler.inverse_transform(Y_test) 190 | #evaluation 191 | normalized_metrics, inv_normalized_metrics = evaluate(Y_test, predictions, 192 | Y_test_inv_scaled, predictions_inv_scaled) 193 | #grouping the actual prices and predictions 194 | print("\n> grouping the actual prices and predictions...") 195 | feature_cols = df_test.columns.tolist() 196 | feature_cols.remove("actual_price") 197 | df_test.drop(columns=feature_cols, inplace=True) 198 | df_test.rename(columns={"actual_price" : 'Actual'}, inplace=True) 199 | df_test = df_test.iloc[future_gap:] 200 | df_test['Actual'] = Y_test_inv_scaled 201 | df_test['Prediction'] = predictions_inv_scaled 202 | 203 | return normalized_metrics, inv_normalized_metrics, df_test -------------------------------------------------------------------------------- /machine_learning/development/optimized_ffnn/ffnn_hyperparam_tune.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.optimized_ffnn import ffnn 2 | from keras.callbacks import EarlyStopping 3 | 4 | def evaluate_ffnn(stock, start_date, end_date, window, future_gap, split, dropout, 5 | neurons, batch_size, epochs, 
validation_split, verbose, decay=0.0): 6 | 7 | dataframe, _ = ffnn.bulid_TIs_dataset(stock, start_date, end_date, window) 8 | dataset = dataframe.values 9 | X_train, Y_train, X_test, Y_test = ffnn.ffnn_dataset_reshape(dataset, future_gap, split) 10 | features = X_train.shape[1] 11 | model = ffnn.build_model(features, neurons, dropout, decay) 12 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 13 | patience=50, verbose=verbose, mode='auto') 14 | callbacks = [early_stopping_callback] 15 | ffnn.model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, verbose, callbacks) 16 | train_mse, test_mse = ffnn.evaluate_model(model, X_train, Y_train, X_test, Y_test, verbose) 17 | return train_mse, test_mse 18 | 19 | def optimal_dropout(stock, start_date, end_date, window, future_gap, split, neurons, 20 | batch_size, epochs, validation_split, verbose, dropout_list): 21 | dropout_result = {} 22 | for dropout in dropout_list: 23 | print("\n> testing dropout: (%.1f)..." %(dropout)) 24 | _, testScore = evaluate_ffnn(stock, start_date, end_date, window, future_gap, split, dropout, 25 | neurons, batch_size, epochs, validation_split, verbose) 26 | dropout_result[dropout] = testScore 27 | return dropout_result 28 | 29 | def optimal_epochs(stock, start_date, end_date, window, future_gap, split, dropout, 30 | neurons, batch_size, validation_split, verbose, epochs_list): 31 | epochs_result = {} 32 | for epochs in epochs_list: 33 | print("\n> testing epochs: (%d)..." %(epochs)) 34 | _, testScore = evaluate_ffnn(stock, start_date, end_date, window, future_gap, split, dropout, 35 | neurons, batch_size, epochs, validation_split, verbose) 36 | epochs_result[epochs] = testScore 37 | return epochs_result 38 | 39 | def optimal_neurons(stock, start_date, end_date, window, future_gap, split, dropout, 40 | batch_size, epochs, validation_split, verbose, neurons_list1, neurons_list2): 41 | neurons_result = {} 42 | for ffnn_neuron in neurons_list1: 43 | neurons = [ffnn_neuron, ffnn_neuron] 44 | for dense_neuron in neurons_list2: 45 | neurons.append(dense_neuron) 46 | neurons.append(1) 47 | print("\n> testing neurons: (%s)..." %(str(neurons))) 48 | _, testScore = evaluate_ffnn(stock, start_date, end_date, window, future_gap, split, dropout, 49 | neurons, batch_size, epochs, validation_split, verbose) 50 | neurons_result[str(neurons)] = testScore 51 | neurons = neurons[:2] 52 | return neurons_result 53 | 54 | def optimal_decay(stock, start_date, end_date, window, future_gap, split, dropout, 55 | neurons, batch_size, epochs, validation_split, verbose, decay_list): 56 | decay_result = {} 57 | for decay in decay_list: 58 | print("\n> testing decay: (%.1f)..." %(decay)) 59 | _, testScore = evaluate_ffnn(stock, start_date, end_date, window, future_gap, split, dropout, 60 | neurons, batch_size, epochs, validation_split, verbose, decay) 61 | decay_result[decay] = testScore 62 | return decay_result 63 | 64 | def optimal_batch_size(stock, start_date, end_date, window, future_gap, split, dropout, neurons, 65 | epochs, validation_split, verbose, decay, batch_size_list): 66 | batch_size_result = {} 67 | for batch_size in batch_size_list: 68 | print("\n> testing batch size: (%d)..." 
%(batch_size)) 69 | _, testScore = evaluate_ffnn(stock, start_date, end_date, window, future_gap, split, dropout, 70 | neurons, batch_size, epochs, validation_split, verbose, decay) 71 | batch_size_result[batch_size] = testScore 72 | return batch_size_result -------------------------------------------------------------------------------- /machine_learning/development/optimized_ffnn/ffnn_hypparam_tune_main.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.optimized_ffnn import ffnn_hyperparam_tune as hpt 2 | import matplotlib.pyplot as plt 3 | import time 4 | 5 | #start time 6 | start_time = time.time() 7 | 8 | #initial hyperparameters 9 | stock = '^GSPC' 10 | start_date = '1950-01-01' 11 | end_date = '2017-12-31' 12 | window = 5 13 | future_gap = 5 14 | split = 0.8 15 | dropout = None 16 | neurons = [64, 64, 32, 1] 17 | batch_size = 4026 18 | epochs = 1 19 | validation_split = 0.1 20 | verbose = 1 21 | 22 | #optimal hyperparameters txt file 23 | print("\n> finding the optimal hyperparameters...") 24 | file = open("machine_learning/development/optimized_ffnn/ffnn_optimal_hyperparameters.txt", "wb") #"wb" overwrites the results file on each run 25 | fig1, (ax1, ax2, ax3) = plt.subplots(3, 1) 26 | fig2, (ax4, ax5) = plt.subplots(2, 1) 27 | 28 | #finding the optimal dropout 29 | print("\n> finding the optimal dropout...") 30 | dropout_list = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8] 31 | dropout_result = hpt.optimal_dropout(stock, start_date, end_date, window, future_gap, split, neurons, 32 | batch_size, epochs, validation_split, verbose, dropout_list) 33 | 34 | min_loss = min(dropout_result.values()) 35 | optimal_dropout = -1.0 36 | for dout, loss in dropout_result.items(): 37 | if loss == min_loss: 38 | optimal_dropout = dout 39 | 40 | file.write(bytes("dropout: %.1f, " %(optimal_dropout), 'UTF-8')) 41 | print("\nDropout:", optimal_dropout) 42 | dropout = optimal_dropout 43 | 44 | items = dropout_result.items() 45 | x, y = zip(*items) 46 | ax1.plot(x, y) 47 | ax1.set_xlabel('Dropout') 48 | ax1.set_ylabel('MSE') 49 | ax1.grid(True) 50 | 51 | #finding the optimal neurons 52 | print("\n> finding the optimal neurons...") 53 | neuronlist1 = [64, 128, 256] 54 | neuronlist2 = [16, 32, 64] 55 | neurons_result = hpt.optimal_neurons(stock, start_date, end_date, window, future_gap, split, dropout, 56 | batch_size, epochs, validation_split, verbose, neuronlist1, neuronlist2) 57 | 58 | min_loss = min(neurons_result.values()) 59 | optimal_neurons = "" 60 | for n, loss in neurons_result.items(): 61 | if loss == min_loss: 62 | optimal_neurons = n 63 | 64 | file.write(bytes("neurons: %s, " %(str(optimal_neurons)), 'UTF-8')) 65 | print("\nNeurons:", optimal_neurons) 66 | neurons = optimal_neurons 67 | neurons = neurons[1:-1] 68 | neurons = neurons.split(", ") 69 | neurons = [int(neuron_str) for neuron_str in neurons] 70 | 71 | items = neurons_result.items() 72 | x, y = zip(*items) 73 | ax2.bar(range(len(items)), y, align='center') 74 | plt.sca(ax2) 75 | plt.xticks(range(len(items)), x, rotation=25) 76 | ax2.set_xlabel('Neurons') 77 | ax2.set_ylabel('MSE') 78 | ax2.grid(True) 79 | 80 | #finding the optimal decay 81 | print("\n> finding the optimal decay...") 82 | decay_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] 83 | decay_result = hpt.optimal_decay(stock, start_date, end_date, window, future_gap, split, dropout, 84 | neurons, batch_size, epochs, validation_split, verbose, decay_list) 85 | 86 | min_loss = min(decay_result.values()) 87 | optimal_decay = -1.0 88 | 
for d, loss in decay_result.items(): 89 | if loss == min_loss: 90 | optimal_decay = d 91 | 92 | file.write(bytes("decay: %.1f, " %(optimal_decay), 'UTF-8')) 93 | print("\nDecay:", optimal_decay) 94 | decay = optimal_decay 95 | 96 | items = decay_result.items() 97 | x, y = zip(*items) 98 | ax3.plot(x, y) 99 | ax3.set_xlabel('Decay') 100 | ax3.set_ylabel('MSE') 101 | ax3.grid(True) 102 | 103 | #finding the optimal batch size 104 | print("\n> finding the optimal batch size...") 105 | batch_size_list = [128, 256, 512, 1024, 2048, 4096] 106 | batch_size_result = hpt.optimal_batch_size(stock, start_date, end_date, window, future_gap, split, dropout, 107 | neurons, epochs, validation_split, verbose, decay, batch_size_list) 108 | 109 | min_loss = min(batch_size_result.values()) 110 | optimal_batch_size = -1 111 | for bs, loss in batch_size_result.items(): 112 | if loss == min_loss: 113 | optimal_batch_size = bs 114 | 115 | file.write(bytes("batch_size: %d, " %(optimal_batch_size), 'UTF-8')) 116 | print("\nBatch Size:", optimal_batch_size) 117 | batch_size = optimal_batch_size 118 | 119 | items = batch_size_result.items() 120 | x, y = zip(*items) 121 | ax4.plot(x, y) 122 | ax4.set_xlabel('Batch Size') 123 | ax4.set_ylabel('MSE') 124 | ax4.grid(True) 125 | 126 | #finding the optimal epochs 127 | print("\n> finding the optimal epochs...") 128 | epochs_list = [50, 60, 70, 80, 90, 100, 200, 300] 129 | epochs_result = hpt.optimal_epochs(stock, start_date, end_date, window, future_gap, split, dropout, 130 | neurons, batch_size, validation_split, verbose, epochs_list) 131 | 132 | min_loss = min(epochs_result.values()) 133 | optimal_epochs = -1 134 | for ep, loss in epochs_result.items(): 135 | if loss == min_loss: 136 | optimal_epochs = ep 137 | 138 | file.write(bytes("epochs: %d, " %(optimal_epochs), 'UTF-8')) 139 | print("\nEpochs:", optimal_epochs) 140 | epochs = optimal_epochs 141 | 142 | items = epochs_result.items() 143 | x, y = zip(*items) 144 | ax5.plot(x, y) 145 | ax5.set_xlabel('Epochs') 146 | ax5.set_ylabel('MSE') 147 | ax5.grid(True) 148 | 149 | #end time 150 | end_time = time.time() 151 | time = end_time - start_time 152 | file.write(bytes("time elapsed: %.3fs." 
%(time), 'UTF-8')) 153 | 154 | #closing the file and showing the plot 155 | print("\nOptimal Hyperparameters") 156 | print("Dropout:", optimal_dropout) 157 | print("Neurons:", optimal_neurons) 158 | print("Decay:", optimal_decay) 159 | print("Batch Size:", optimal_batch_size) 160 | print("Epochs:", optimal_epochs) 161 | print("Time Elapsed (s):", time) 162 | 163 | file.close() 164 | fig1.tight_layout() 165 | fig2.tight_layout() 166 | plt.show() -------------------------------------------------------------------------------- /machine_learning/development/optimized_ffnn/ffnn_main.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.optimized_ffnn import ffnn 2 | from machine_learning.development.new_regression.new_dataset import compute_mape 3 | from keras.callbacks import EarlyStopping 4 | from sklearn.metrics import mean_squared_error 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | from sklearn.metrics import r2_score 8 | 9 | def main(internal_eval=False): 10 | #building the dataset 11 | print("> building the dataset...") 12 | stock_symbol = '^GSPC' 13 | start_date = '1950-01-01' 14 | end_date = '2017-12-31' 15 | window = 2 16 | dataframe, scaler = ffnn.bulid_new_TIs_dataset(stock_symbol, start_date, end_date, window) 17 | 18 | #reshaping the dataset for FFNN 19 | print("\n> reshaping the dataset for FFNN...") 20 | dataset = dataframe.values 21 | future_gap = 1 #1 trading day 22 | split = 0.8 #80% of the dataset 23 | X_train, Y_train, X_test, Y_test = ffnn.ffnn_dataset_reshape(dataset, future_gap, split) 24 | 25 | #building the FFNN model 26 | print("\n> building the FFNN model...") 27 | features = X_train.shape[1] 28 | neurons = [256, 256, 16, 1] 29 | drop_out = 0.3 30 | verbose = 1 31 | model = ffnn.build_model(features, neurons, drop_out) 32 | 33 | #fitting the training data 34 | print("\n> fitting the training data...") 35 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 36 | patience=50, verbose=verbose, mode='auto') 37 | batch_size = 4096 38 | epochs = 200 39 | validation_split = 0.1 40 | _ = ffnn.model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, 41 | verbose, [early_stopping_callback]) 42 | 43 | #internal evaluation 44 | if internal_eval: 45 | print("\n> internal evaluation...") 46 | _, _ = ffnn.evaluate_model(model, X_train, Y_train, X_test, Y_test, verbose) 47 | 48 | #predictions 49 | predictions = model.predict(X_test) 50 | predictions = predictions.reshape((predictions.shape[0], 1)) 51 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 52 | 53 | #evaluating the model on the normalized dataset 54 | rmse = (mean_squared_error(predictions, Y_test) ** 0.5) 55 | print('\nNormalized Test RMSE: %.3f' %(rmse)) 56 | mape = compute_mape(Y_test, predictions) 57 | print('Normalized Outsample MAPE: %.3f' %(mape)) 58 | correlation = np.corrcoef(predictions.T, Y_test.T) 59 | print("Normalized Correlation: %.3f"%(correlation[0, 1])) 60 | r2 = r2_score(Y_test, predictions) 61 | print("Normalized Outsample r^2: %.3f"%(r2)) 62 | 63 | #evaluating the model on the inverse-normalized dataset 64 | predictions_inv_scaled = scaler.inverse_transform(predictions) 65 | Y_test_inv_scaled = scaler.inverse_transform(Y_test) 66 | 67 | rmse = (mean_squared_error(predictions_inv_scaled, Y_test_inv_scaled) ** 0.5) 68 | print('\nInverse-Normalized Outsample RMSE: %.3f' %(rmse)) 69 | mape = compute_mape(Y_test_inv_scaled, predictions_inv_scaled) 70 | print('Inverse-Normalized Outsample 
MAPE: %.3f' %(mape)) 71 | correlation = np.corrcoef(predictions_inv_scaled.T, Y_test_inv_scaled.T) 72 | print("Inverse-Normalized Outsample Correlation: %.3f"%(correlation[0, 1])) 73 | r2 = r2_score(Y_test_inv_scaled, predictions_inv_scaled) 74 | print("Inverse-Normalized Outsample r^2: %.3f"%(r2)) 75 | 76 | #plotting the results 77 | print("\n> plotting the results...") 78 | _, ax2 = plt.subplots() 79 | '''ax1.plot(history.history['loss'], label='Training') 80 | ax1.plot(history.history['val_loss'], label='Validation') 81 | ax1.set_xlabel('Epoch #') 82 | ax1.set_ylabel('Loss') 83 | ax1.legend(loc='best') 84 | ax1.grid(True) 85 | ''' 86 | ax2.plot(range(len(predictions_inv_scaled)), predictions_inv_scaled, label='Prediction') 87 | ax2.plot(range(len(Y_test_inv_scaled)), Y_test_inv_scaled, label='Actual') 88 | ax2.set_xlabel('Trading Day') 89 | ax2.set_ylabel('Price') 90 | ax2.legend(loc='best') 91 | ax2.grid(True) 92 | 93 | plt.show() 94 | 95 | main() 96 | 97 | 98 | #to be stored temporarily 99 | '''#evaluating the model on the *dataset* 100 | print("\n> evaluating the model on the *dataset*...") 101 | predictions = model.predict(X_test) 102 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 103 | 104 | predictions_inv_scaled = scaler.inverse_transform(predictions) 105 | Y_test_inv_scaled = scaler.inverse_transform(Y_test) 106 | 107 | rmse = (mean_squared_error(predictions_inv_scaled, Y_test_inv_scaled) ** 0.5) 108 | print('Outsample RMSE: %.3f' %(rmse)) 109 | #correlation = np.corrcoef(predictions_inv_scaled, Y_test_inv_scaled) 110 | #print("Outsample Correlation: %.3f"%(correlation[0, 1]))# 111 | # ''' -------------------------------------------------------------------------------- /machine_learning/development/optimized_ffnn/ffnn_optimal_hyperparameters.txt: -------------------------------------------------------------------------------- 1 | dropout: 0.8, neurons: [256, 256, 64, 1], decay: 0.1, batch_size: 128, epochs: 200, time elapsed: 317.959s. 
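The tuning script records its results as the single `key: value, ...` line shown in `ffnn_optimal_hyperparameters.txt`, while `ffnn_main.py` re-types the chosen values by hand. A minimal sketch of reading the recorded optima back into a dict follows; `load_optimal_hyperparameters` is a hypothetical helper, not part of this repository, and it assumes exactly the format written by `ffnn_hypparam_tune_main.py` above.

```python
#hypothetical helper, not part of this repository: parses the one-line
#"key: value, ..." summary written by the hyperparameter tuning script
import ast
import re

def load_optimal_hyperparameters(path):
    with open(path) as f:
        text = f.read()
    params = {}
    #the neurons value is a Python-style list, so capture it separately
    match = re.search(r"neurons: (\[[^\]]*\])", text)
    if match:
        params["neurons"] = ast.literal_eval(match.group(1))
    #the remaining values are plain scalars
    for key, cast in [("dropout", float), ("decay", float),
                      ("batch_size", int), ("epochs", int)]:
        match = re.search(r"%s: ([0-9.]+)" % key, text)
        if match:
            params[key] = cast(match.group(1))
    return params

#against the file above, this prints:
#{'neurons': [256, 256, 64, 1], 'dropout': 0.8, 'decay': 0.1, 'batch_size': 128, 'epochs': 200}
print(load_optimal_hyperparameters(
    "machine_learning/development/optimized_ffnn/ffnn_optimal_hyperparameters.txt"))
```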
-------------------------------------------------------------------------------- /machine_learning/development/optimized_lstm/hyperparam_tune_main.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.optimized_lstm import hyperparameter_tunning as hpt 2 | import matplotlib.pyplot as plt 3 | import time 4 | 5 | #start time 6 | start_time = time.time() 7 | 8 | #initial hyperparameters 9 | stock = '^GSPC' 10 | start_date = '1950-01-01' 11 | end_date = '2017-12-31' 12 | future_gap = 1 13 | time_steps = 20 14 | split = 0.9 15 | dropout = None 16 | neurons = [128, 128, 32, 1] 17 | batch_size = 512 18 | epochs = 50 19 | validation_split = 0.1 20 | verbose = 1 21 | 22 | #optimal hyperparameters txt file 23 | print("\n> finding the optimal hyperparameters...") 24 | file = open("machine_learning/development/optimized_lstm/optimal_hyperparameters.txt", "wb") #"wb" overwrites the results file on each run 25 | fig1, (ax1, ax2, ax3) = plt.subplots(3, 1) 26 | fig2, (ax4, ax5, ax6) = plt.subplots(3, 1) 27 | 28 | #finding the optimal dropout 29 | print("\n> finding the optimal dropout...") 30 | dropout_list = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8] 31 | dropout_result = hpt.optimal_dropout(stock, start_date, end_date, future_gap, time_steps, split, neurons, 32 | batch_size, epochs, validation_split, verbose, dropout_list) 33 | 34 | min_loss = min(dropout_result.values()) 35 | optimal_dropout = -1.0 36 | for dout, loss in dropout_result.items(): 37 | if loss == min_loss: 38 | optimal_dropout = dout 39 | 40 | file.write(bytes("dropout: %.1f, " %(optimal_dropout), 'UTF-8')) 41 | print("\nDropout:", optimal_dropout) 42 | dropout = optimal_dropout 43 | 44 | items = dropout_result.items() 45 | x, y = zip(*items) 46 | ax1.plot(x, y) 47 | ax1.set_xlabel('Dropout') 48 | ax1.set_ylabel('MSE') 49 | ax1.grid(True) 50 | 51 | #finding the optimal neurons 52 | print("\n> finding the optimal neurons...") 53 | neuronlist1 = [64, 128, 256] 54 | neuronlist2 = [16, 32, 64] 55 | neurons_result = hpt.optimal_neurons(stock, start_date, end_date, future_gap, time_steps, split, dropout, 56 | batch_size, epochs, validation_split, verbose, neuronlist1, neuronlist2) 57 | 58 | min_loss = min(neurons_result.values()) 59 | optimal_neurons = "" 60 | for n, loss in neurons_result.items(): 61 | if loss == min_loss: 62 | optimal_neurons = n 63 | 64 | file.write(bytes("neurons: %s, " %(str(optimal_neurons)), 'UTF-8')) 65 | print("\nNeurons:", optimal_neurons) 66 | neurons = optimal_neurons 67 | neurons = neurons[1:-1] 68 | neurons = neurons.split(", ") 69 | neurons = [int(neuron_str) for neuron_str in neurons] 70 | 71 | items = neurons_result.items() 72 | x, y = zip(*items) 73 | ax2.bar(range(len(items)), y, align='center') 74 | plt.sca(ax2) 75 | plt.xticks(range(len(items)), x, rotation=25) 76 | ax2.set_xlabel('Neurons') 77 | ax2.set_ylabel('MSE') 78 | ax2.grid(True) 79 | 80 | #finding the optimal decay 81 | print("\n> finding the optimal decay...") 82 | decay_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] 83 | decay_result = hpt.optimal_decay(stock, start_date, end_date, future_gap, time_steps, split, dropout, 84 | neurons, batch_size, epochs, validation_split, verbose, decay_list) 85 | 86 | min_loss = min(decay_result.values()) 87 | optimal_decay = -1.0 88 | for d, loss in decay_result.items(): 89 | if loss == min_loss: 90 | optimal_decay = d 91 | 92 | file.write(bytes("decay: %.1f, " %(optimal_decay), 'UTF-8')) 93 | print("\nDecay:", optimal_decay) 94 | decay = optimal_decay 95 | 96 | items = 
decay_result.items() 97 | x, y = zip(*items) 98 | ax3.plot(x, y) 99 | ax3.set_xlabel('Decay') 100 | ax3.set_ylabel('MSE') 101 | ax3.grid(True) 102 | 103 | #finding the optimal time steps 104 | print("\n> finding the optimal time steps...") 105 | time_steps_list = [5, 10, 15, 20, 40, 80, 100] 106 | time_steps_result = hpt.optimal_time_steps(stock, start_date, end_date, future_gap, split, dropout, neurons, 107 | batch_size, epochs, validation_split, verbose, decay, time_steps_list) 108 | 109 | min_loss = min(time_steps_result.values()) 110 | optimal_time_steps = -1 111 | for ts, loss in time_steps_result.items(): 112 | if loss == min_loss: 113 | optimal_time_steps = ts 114 | 115 | file.write(bytes("time_steps: %d, " %(optimal_time_steps), 'UTF-8')) 116 | print("\nTime Steps:", optimal_time_steps) 117 | time_steps = optimal_time_steps 118 | 119 | items = time_steps_result.items() 120 | x, y = zip(*items) 121 | ax4.plot(x, y) 122 | ax4.set_xlabel('Time Steps') 123 | ax4.set_ylabel('MSE') 124 | ax4.grid(True) 125 | 126 | #finding the optimal batch size 127 | print("\n> finding the optimal batch size...") 128 | batch_size_list = [128, 256, 512, 1024, 2048, 4096] 129 | batch_size_result = hpt.optimal_batch_size(stock, start_date, end_date, future_gap, time_steps, split, dropout, 130 | neurons, epochs, validation_split, verbose, decay, batch_size_list) 131 | 132 | min_loss = min(batch_size_result.values()) 133 | optimal_batch_size = -1 134 | for bs, loss in batch_size_result.items(): 135 | if loss == min_loss: 136 | optimal_batch_size = bs 137 | 138 | file.write(bytes("batch_size: %d, " %(optimal_batch_size), 'UTF-8')) 139 | print("\nBatch Size:", optimal_batch_size) 140 | batch_size = optimal_batch_size 141 | 142 | items = batch_size_result.items() 143 | x, y = zip(*items) 144 | ax5.plot(x, y) 145 | ax5.set_xlabel('Batch Size') 146 | ax5.set_ylabel('MSE') 147 | ax5.grid(True) 148 | 149 | #finding the optimal epochs 150 | print("\n> finding the optimal epochs...") 151 | epochs_list = [50, 60, 70, 80, 90, 100, 200, 300] 152 | epochs_result = hpt.optimal_epochs(stock, start_date, end_date, future_gap, time_steps, split, dropout, 153 | neurons, batch_size, validation_split, verbose, epochs_list) 154 | 155 | min_loss = min(epochs_result.values()) 156 | optimal_epochs = -1 157 | for ep, loss in epochs_result.items(): 158 | if loss == min_loss: 159 | optimal_epochs = ep 160 | 161 | file.write(bytes("epochs: %d, " %(optimal_epochs), 'UTF-8')) 162 | print("\nEpochs:", optimal_epochs) 163 | epochs = optimal_epochs 164 | 165 | items = epochs_result.items() 166 | x, y = zip(*items) 167 | ax6.plot(x, y) 168 | ax6.set_xlabel('Epochs') 169 | ax6.set_ylabel('MSE') 170 | ax6.grid(True) 171 | 172 | #end time 173 | end_time = time.time() 174 | time = end_time - start_time 175 | file.write(bytes("time elapsed: %.3fs." 
%(time), 'UTF-8')) 176 | 177 | #closing the file and showing the plot 178 | print("\nOptimal Hyperparameters") 179 | print("Dropout:", optimal_dropout) 180 | print("Neurons:", optimal_neurons) 181 | print("Decay:", optimal_decay) 182 | print("Time Steps:", optimal_time_steps) 183 | print("Batch Size:", optimal_batch_size) 184 | print("Epochs:", optimal_epochs) 185 | print("Time Elapsed (s):", time) 186 | 187 | file.close() 188 | fig1.tight_layout() 189 | fig2.tight_layout() 190 | plt.show() 191 | -------------------------------------------------------------------------------- /machine_learning/development/optimized_lstm/hyperparameter_tunning.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.optimized_lstm import lstm 2 | from keras.callbacks import EarlyStopping 3 | 4 | def evaluate_lstm(stock, start_date, end_date, future_gap, time_steps, split, dropout, 5 | neurons, batch_size, epochs, validation_split, verbose, decay=0.0): 6 | 7 | dataframe, _ = lstm.bulid_dataset(stock, start_date, end_date) 8 | dataset = dataframe.values 9 | X_train, Y_train, X_test, Y_test = lstm.lstm_dataset_reshape(dataset, time_steps, future_gap, split) 10 | features = X_train.shape[2] 11 | model = lstm.build_model(time_steps, features, neurons, dropout, decay) 12 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 13 | patience=50, verbose=verbose, mode='auto') 14 | callbacks = [early_stopping_callback] 15 | lstm.model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, verbose, callbacks) 16 | train_mse, test_mse = lstm.evaluate_model(model, X_train, Y_train, X_test, Y_test, verbose) 17 | return train_mse, test_mse 18 | 19 | def optimal_dropout(stock, start_date, end_date, future_gap, time_steps, split, neurons, 20 | batch_size, epochs, validation_split, verbose, dropout_list): 21 | dropout_result = {} 22 | for dropout in dropout_list: 23 | print("\n> testing dropout: (%.1f)..." %(dropout)) 24 | _, testScore = evaluate_lstm(stock, start_date, end_date, future_gap, time_steps, split, dropout, 25 | neurons, batch_size, epochs, validation_split, verbose) 26 | dropout_result[dropout] = testScore 27 | return dropout_result 28 | 29 | def optimal_epochs(stock, start_date, end_date, future_gap, time_steps, split, dropout, 30 | neurons, batch_size, validation_split, verbose, epochs_list): 31 | epochs_result = {} 32 | for epochs in epochs_list: 33 | print("\n> testing epochs: (%d)..." %(epochs)) 34 | _, testScore = evaluate_lstm(stock, start_date, end_date, future_gap, time_steps, split, dropout, 35 | neurons, batch_size, epochs, validation_split, verbose) 36 | epochs_result[epochs] = testScore 37 | return epochs_result 38 | 39 | def optimal_neurons(stock, start_date, end_date, future_gap, time_steps, split, dropout, 40 | batch_size, epochs, validation_split, verbose, neurons_list1, neurons_list2): 41 | neurons_result = {} 42 | for lstm_neuron in neurons_list1: 43 | neurons = [lstm_neuron, lstm_neuron] 44 | for dense_neuron in neurons_list2: 45 | neurons.append(dense_neuron) 46 | neurons.append(1) 47 | print("\n> testing neurons: (%s)..." 
%(str(neurons))) 48 | _, testScore = evaluate_lstm(stock, start_date, end_date, future_gap, time_steps, split, dropout, 49 | neurons, batch_size, epochs, validation_split, verbose) 50 | neurons_result[str(neurons)] = testScore 51 | neurons = neurons[:2] 52 | return neurons_result 53 | 54 | def optimal_decay(stock, start_date, end_date, future_gap, time_steps, split, dropout, 55 | neurons, batch_size, epochs, validation_split, verbose, decay_list): 56 | decay_result = {} 57 | for decay in decay_list: 58 | print("\n> testing decay: (%.1f)..." %(decay)) 59 | _, testScore = evaluate_lstm(stock, start_date, end_date, future_gap, time_steps, split, dropout, 60 | neurons, batch_size, epochs, validation_split, verbose, decay) 61 | decay_result[decay] = testScore 62 | return decay_result 63 | 64 | def optimal_time_steps(stock, start_date, end_date, future_gap, split, dropout, neurons, batch_size, 65 | epochs, validation_split, verbose, decay, time_steps_list): 66 | timesteps_result = {} 67 | for time_steps in time_steps_list: 68 | print("\n> testing time steps: (%d)..." %(time_steps)) 69 | _, testScore = evaluate_lstm(stock, start_date, end_date, future_gap, time_steps, split, dropout, 70 | neurons, batch_size, epochs, validation_split, verbose, decay) 71 | timesteps_result[time_steps] = testScore 72 | return timesteps_result 73 | 74 | def optimal_batch_size(stock, start_date, end_date, future_gap, time_steps, split, dropout, neurons, 75 | epochs, validation_split, verbose, decay, batch_size_list): 76 | batch_size_result = {} 77 | for batch_size in batch_size_list: 78 | print("\n> testing batch size: (%d)..." %(batch_size)) 79 | _, testScore = evaluate_lstm(stock, start_date, end_date, future_gap, time_steps, split, dropout, 80 | neurons, batch_size, epochs, validation_split, verbose, decay) 81 | batch_size_result[batch_size] = testScore 82 | return batch_size_result -------------------------------------------------------------------------------- /machine_learning/development/optimized_lstm/lstm.py: -------------------------------------------------------------------------------- 1 | from utils.util import get_stock_data, plot_data 2 | from machine_learning.development.testing.lag_metric import compute_lag_metric 3 | import machine_learning.development.dataset_preprocessing as dpp 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | from keras.models import Sequential 7 | from keras.layers.core import Dense, Dropout 8 | from keras.layers.recurrent import LSTM 9 | from keras.optimizers import Adam 10 | from sklearn.preprocessing import MinMaxScaler 11 | from machine_learning.development.new_regression.new_dataset import compute_mape 12 | from sklearn.metrics import mean_squared_error, mean_absolute_error 13 | from sklearn.metrics import r2_score 14 | 15 | def bulid_dataset(stock_symbol, start_date, end_date, normalize=True): 16 | cols = ["Date", "Open", "Low", "High", "Adj Close"] 17 | df = get_stock_data(stock_symbol, start_date, end_date, cols) 18 | df.replace([np.inf, -np.inf], np.nan, inplace=True) 19 | df.fillna(method='ffill', inplace=True) 20 | df.fillna(method='bfill', inplace=True) 21 | scaler = None 22 | 23 | if normalize: 24 | scaler = MinMaxScaler() 25 | df['Open'] = scaler.fit_transform(df['Open'].values.reshape(-1,1)) 26 | df['Low'] = scaler.fit_transform(df['Low'].values.reshape(-1,1)) 27 | df['High'] = scaler.fit_transform(df['High'].values.reshape(-1,1)) 28 | df['Adj Close'] = scaler.fit_transform(df['Adj Close'].values.reshape(-1,1)) 29 | 30 | print(df.head()) 31 | 
print(df.tail()) 32 | return df, scaler 33 | 34 | def bulid_TIs_dataset(stock_symbol, start_date, end_date, window, normalize=True): 35 | cols = ["Date", "Adj Close", "Volume"] 36 | df = get_stock_data(stock_symbol, start_date, end_date, cols) 37 | df.rename(columns={"Adj Close" : 'price'}, inplace=True) 38 | df['momentum'] = dpp.compute_momentum_ratio(df['price'], window) 39 | df['sma'] = dpp.compute_sma_ratio(df['price'], window) 40 | df['bolinger_band'] = dpp.compute_bollinger_bands_ratio(df['price'], window) 41 | df['volatility'] = dpp.compute_volatility_ratio(df['price'], window) 42 | df['vroc'] = dpp.compute_vroc_ratio(df['Volume'], window) 43 | df['actual_price'] = df['price'] 44 | df.drop(columns=["Volume"], inplace=True) 45 | df = df[window:] 46 | df.replace([np.inf, -np.inf], np.nan, inplace=True) 47 | df.fillna(method='ffill', inplace=True) 48 | df.fillna(method='bfill', inplace=True) 49 | scaler = None 50 | 51 | if normalize: 52 | scaler = MinMaxScaler() 53 | df['price'] = scaler.fit_transform(df['price'].values.reshape(-1,1)) 54 | df['momentum'] = scaler.fit_transform(df['momentum'].values.reshape(-1,1)) 55 | df['sma'] = scaler.fit_transform(df['sma'].values.reshape(-1,1)) 56 | df['bolinger_band'] = scaler.fit_transform(df['bolinger_band'].values.reshape(-1,1)) 57 | df['volatility'] = scaler.fit_transform(df['volatility'].values.reshape(-1,1)) 58 | df['vroc'] = scaler.fit_transform(df['vroc'].values.reshape(-1,1)) 59 | df['actual_price'] = scaler.fit_transform(df['actual_price'].values.reshape(-1,1)) 60 | 61 | print(df.head()) 62 | print(df.tail()) 63 | return df, scaler 64 | 65 | def lstm_dataset_reshape(dataset, time_steps, future_gap, split): 66 | print("Dataset Shape:", dataset.shape) 67 | X = dataset[:, :-1] 68 | Y = dataset[:, -1] 69 | print("X Shape:", X.shape) 70 | print("Y Shape:", Y.shape) 71 | 72 | X_sampled = [] 73 | for i in range(X.shape[0] - time_steps + 1): 74 | X_sampled.append(X[i : i+time_steps]) 75 | X_sampled = np.array(X_sampled) 76 | print("Sampled X Shape:", X_sampled.shape) 77 | 78 | future_gap_index = future_gap - 1 79 | X_sampled = X_sampled[:-future_gap] 80 | Y_sampled = Y[time_steps+future_gap_index: ] 81 | print("Applying Future Gap...") 82 | print("Sampled X Shape:", X_sampled.shape) 83 | print("Sampled Y Shape:", Y_sampled.shape) 84 | 85 | if split != None: 86 | split_index = int(split*X_sampled.shape[0]) 87 | X_train = X_sampled[:split_index] 88 | X_test = X_sampled[split_index:] 89 | Y_train = Y_sampled[:split_index] 90 | Y_test = Y_sampled[split_index:] 91 | print("(X_train, Y_train, X_test, Y_test) Shapes:") 92 | print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape) 93 | return X_train, Y_train, X_test, Y_test 94 | 95 | return X_sampled, Y_sampled 96 | 97 | def build_model(time_steps, features, neurons, drop_out, decay=0.0): 98 | model = Sequential() 99 | 100 | model.add(LSTM(neurons[0], input_shape=(time_steps, features), return_sequences=True)) 101 | model.add(Dropout(drop_out)) 102 | 103 | model.add(LSTM(neurons[1], input_shape=(time_steps, features), return_sequences=False)) 104 | model.add(Dropout(drop_out)) 105 | 106 | model.add(Dense(neurons[2], activation='relu')) 107 | model.add(Dense(neurons[3], activation='linear')) 108 | 109 | adam = Adam(decay=decay) 110 | model.compile(loss='mse',optimizer=adam) 111 | model.summary() 112 | return model 113 | 114 | def model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, verbose, callbacks): 115 | 116 | history = model.fit( 117 | X_train, 118 | Y_train, 119 | 
batch_size = batch_size, 120 | epochs = epochs, 121 | validation_split = validation_split, 122 | verbose = verbose, 123 | callbacks = callbacks 124 | ) 125 | 126 | return history 127 | 128 | def evaluate_model(model, X_train, Y_train, X_test, Y_test, verbose): 129 | train_mse = model.evaluate(X_train, Y_train, verbose=verbose) 130 | print('Insample Testing: %.5f MSE (%.3f RMSE)' % (train_mse, (train_mse ** 0.5))) 131 | 132 | test_mse = model.evaluate(X_test, Y_test, verbose=verbose) 133 | print('Outsample Testing: %.5f MSE (%.3f RMSE)' % (test_mse, (test_mse ** 0.5))) 134 | 135 | return train_mse, test_mse 136 | 137 | def evaluate(Y_test, predictions, Y_test_inv_scaled, predictions_inv_scaled): 138 | rmse = (mean_squared_error(Y_test, predictions) ** 0.5) 139 | print('\nNormalized RMSE: %.3f' %(rmse)) 140 | nrmse = ((mean_squared_error(Y_test, predictions) ** 0.5))/np.mean(Y_test) 141 | print('Normalized NRMSE: %.3f' %(nrmse)) 142 | mae = mean_absolute_error(Y_test, predictions) 143 | print('Normalized MAE: %.3f' %(mae)) 144 | mape = compute_mape(Y_test, predictions) 145 | print('Normalized MAPE: %.3f' %(mape)) 146 | correlation = np.corrcoef(Y_test.T, predictions.T) 147 | print("Normalized Correlation: %.3f"%(correlation[0, 1])) 148 | r2 = r2_score(Y_test, predictions) 149 | print("Normalized r^2: %.3f"%(r2)) 150 | normalized_metrics = [rmse, nrmse, mae, mape, correlation[0, 1], r2] 151 | 152 | #evaluating the model on the inverse-normalized dataset 153 | rmse = (mean_squared_error(Y_test_inv_scaled, predictions_inv_scaled) ** 0.5) 154 | print('\nInverse-Normalized Outsample RMSE: %.3f' %(rmse)) 155 | nrmse = ((mean_squared_error(Y_test_inv_scaled, predictions_inv_scaled) ** 0.5))/np.mean(Y_test_inv_scaled) 156 | print('Inverse-Normalized Outsample NRMSE: %.3f' %(nrmse)) 157 | mae = mean_absolute_error(Y_test_inv_scaled, predictions_inv_scaled) 158 | print('Inverse-Normalized Outsample MAE: %.3f' %(mae)) 159 | mape = compute_mape(Y_test_inv_scaled, predictions_inv_scaled) 160 | print('Inverse-Normalized Outsample MAPE: %.3f' %(mape)) 161 | correlation = np.corrcoef(Y_test_inv_scaled.T, predictions_inv_scaled.T) 162 | print("Inverse-Normalized Outsample Correlation: %.3f"%(correlation[0, 1])) 163 | r2 = r2_score(Y_test_inv_scaled, predictions_inv_scaled) 164 | print("Inverse-Normalized Outsample r^2: %.3f"%(r2)) 165 | inv_normalized_metrics = [rmse, nrmse, mae, mape, correlation[0, 1], r2] 166 | 167 | return normalized_metrics, inv_normalized_metrics 168 | 169 | def test_lstm(stock_symbol, start_date, end_date, window, future_gap, time_steps, 170 | neurons, drop_out, batch_size, epochs, validation_split, verbose, callbacks, show_plot_flg): 171 | #building the dataset 172 | print("> building the dataset...") 173 | df_train, _ = bulid_TIs_dataset(stock_symbol, None, start_date, window) 174 | df_test, scaler = bulid_TIs_dataset(stock_symbol, start_date, end_date, window) 175 | #reshaping the dataset for LSTM 176 | print("\n> reshaping the dataset for LSTM...") 177 | ds_train = df_train.values 178 | ds_test = df_test.values 179 | X_train, Y_train = lstm_dataset_reshape(ds_train, time_steps, future_gap, None) 180 | X_test, Y_test = lstm_dataset_reshape(ds_test, time_steps, future_gap, None) 181 | #building the LSTM model 182 | print("\n> building the LSTM model...") 183 | features = X_train.shape[2] 184 | model = build_model(time_steps, features, neurons, drop_out) 185 | #fitting the training data 186 | print("\n> fitting the training data...") 187 | model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, verbose, 
callbacks) 188 | #predictions 189 | print("\n> testing the model for predictions...") 190 | predictions = model.predict(X_test) 191 | #inverse-scaling 192 | print("\n> inverse-scaling the scaled values...") 193 | predictions = predictions.reshape((predictions.shape[0], 1)) 194 | predictions_inv_scaled = scaler.inverse_transform(predictions) 195 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 196 | Y_test_inv_scaled = scaler.inverse_transform(Y_test) 197 | #grouping the actual prices and predictions 198 | print("\n> grouping the actual prices and predictions...") 199 | feature_cols = df_test.columns.tolist() 200 | feature_cols.remove("actual_price") 201 | df_test.drop(columns=feature_cols, inplace=True) 202 | df_test.rename(columns={"actual_price" : 'Actual'}, inplace=True) 203 | df_test = df_test.iloc[time_steps+future_gap-1:] 204 | df_test['Actual'] = Y_test_inv_scaled 205 | df_test['Prediction'] = predictions_inv_scaled 206 | #ploting the forecast vs the actual 207 | print("\n> plotting the results...") 208 | lookup = 5 209 | lag_list = compute_lag_metric(df_test['Actual'], df_test['Prediction'], lookup, stock_symbol) 210 | 211 | df_test = df_test[:len(df_test)-lookup+1] 212 | plot_data(df_test, stock_symbol+" Price Forecast", "Date", "Price", show_plot=False) 213 | 214 | ax = df_test.plot(title=stock_symbol+" Price Forecast and PAL Overlay") 215 | ax.set_xlabel("Date") 216 | ax.set_ylabel("Price") 217 | ax.legend(loc="best") 218 | ax.grid(True) 219 | #sudden vs normal plot annotation 220 | ax.annotate('Normal Movement', xy=('2013-02-15', 40), xytext=('2013-03-05', 50), fontsize=10, 221 | arrowprops=dict(facecolor='black', shrink=0.1, headwidth=8)) 222 | ax.annotate('Sudden Change', xy=('2013-05-10', 55), xytext=('2013-03-05', 70), fontsize=10, 223 | arrowprops=dict(facecolor='black', shrink=0.1, headwidth=8)) 224 | ax1 = ax.twinx() 225 | ax1.scatter(df_test.index, lag_list, c='g') 226 | ax1.set_ylabel("PAL") 227 | 228 | if show_plot_flg: 229 | plt.show() 230 | 231 | def final_test_lstm(stock_symbol, start_date, end_date, window, future_gap, time_steps, 232 | neurons, drop_out, batch_size, epochs, validation_split, verbose, callbacks): 233 | #building the dataset 234 | print("> building the dataset...") 235 | df_train, _ = bulid_TIs_dataset(stock_symbol, None, start_date, window) 236 | df_test, scaler = bulid_TIs_dataset(stock_symbol, start_date, end_date, window) 237 | #reshaping the dataset for LSTM 238 | print("\n> reshaping the dataset for LSTM...") 239 | ds_train = df_train.values 240 | ds_test = df_test.values 241 | X_train, Y_train = lstm_dataset_reshape(ds_train, time_steps, future_gap, None) 242 | X_test, Y_test = lstm_dataset_reshape(ds_test, time_steps, future_gap, None) 243 | #building the LSTM model 244 | print("\n> building the LSTM model...") 245 | features = X_train.shape[2] 246 | model = build_model(time_steps, features, neurons, drop_out) 247 | #fitting the training data 248 | print("\n> fitting the training data...") 249 | model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, verbose, callbacks) 250 | #predictions 251 | print("\n> testing the model for predictions...") 252 | predictions = model.predict(X_test) 253 | #inverse-scaling 254 | print("\n> inverse-scaling the scaled values...") 255 | predictions = predictions.reshape((predictions.shape[0], 1)) 256 | predictions_inv_scaled = scaler.inverse_transform(predictions) 257 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 258 | Y_test_inv_scaled = scaler.inverse_transform(Y_test) 259 | #evaluation 
260 | normalized_metrics, inv_normalized_metrics = evaluate(Y_test, predictions, 261 | Y_test_inv_scaled, predictions_inv_scaled) 262 | #grouping the actual prices and predictions 263 | print("\n> grouping the actual prices and predictions...") 264 | feature_cols = df_test.columns.tolist() 265 | feature_cols.remove("actual_price") 266 | df_test.drop(columns=feature_cols, inplace=True) 267 | df_test.rename(columns={"actual_price" : 'Actual'}, inplace=True) 268 | df_test = df_test.iloc[time_steps+future_gap-1:] 269 | df_test['Actual'] = Y_test_inv_scaled 270 | df_test['Prediction'] = predictions_inv_scaled 271 | 272 | return normalized_metrics, inv_normalized_metrics, df_test -------------------------------------------------------------------------------- /machine_learning/development/optimized_lstm/lstm_main.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.optimized_lstm import lstm 2 | from machine_learning.development.new_regression.new_dataset import compute_mape 3 | from keras.callbacks import EarlyStopping 4 | from sklearn.metrics import mean_squared_error 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | from sklearn.metrics import r2_score 8 | 9 | def main(internal_eval=False): 10 | #building the dataset 11 | print("> building the dataset...") 12 | stock_symbol = '^GSPC' 13 | start_date = '1950-01-01' 14 | end_date = '2017-12-31' 15 | window = 2 16 | dataframe, scaler = lstm.bulid_TIs_dataset(stock_symbol, start_date, end_date, window) 17 | 18 | #reshaping the dataset for LSTM 19 | print("\n> reshaping the dataset for LSTM...") 20 | dataset = dataframe.values 21 | time_steps = 1 #1 trading day 22 | future_gap = 1 #1 trading day 23 | split = 0.8 #80% of the dataset 24 | X_train, Y_train, X_test, Y_test = lstm.lstm_dataset_reshape(dataset, time_steps, future_gap, split) 25 | 26 | #building the LSTM model 27 | print("\n> building the LSTM model...") 28 | features = X_train.shape[2] 29 | neurons = [256, 256, 32, 1] 30 | drop_out = 0.2 31 | verbose = 1 32 | model = lstm.build_model(time_steps, features, neurons, drop_out) 33 | 34 | #fitting the training data 35 | print("\n> fitting the training data...") 36 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 37 | patience=50, verbose=verbose, mode='auto') 38 | batch_size = 2048 39 | epochs = 300 40 | validation_split = 0.1 41 | history = lstm.model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, 42 | verbose, [early_stopping_callback]) 43 | 44 | #internal evaluation 45 | if internal_eval: 46 | print("\n> internal evaluation...") 47 | _, _ = lstm.evaluate_model(model, X_train, Y_train, X_test, Y_test, verbose) 48 | 49 | #predictions 50 | predictions = model.predict(X_test) 51 | predictions = predictions.reshape((predictions.shape[0], 1)) 52 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 53 | 54 | #evaluating the model on the normalized dataset 55 | rmse = (mean_squared_error(predictions, Y_test) ** 0.5) 56 | print('\nNormalized Test RMSE: %.3f' %(rmse)) 57 | mape = compute_mape(Y_test, predictions) 58 | print('Normalized Outsample MAPE: %.3f' %(mape)) 59 | correlation = np.corrcoef(predictions.T, Y_test.T) 60 | print("Normalized Correlation: %.3f"%(correlation[0, 1])) 61 | r2 = r2_score(Y_test, predictions) 62 | print("Normalized Outsample r^2: %.3f"%(r2)) 63 | 64 | #evaluating the model on the inverse-normalized dataset 65 | predictions_inv_scaled = scaler.inverse_transform(predictions) 66 | Y_test_inv_scaled = 
scaler.inverse_transform(Y_test) 67 | 68 | rmse = (mean_squared_error(predictions_inv_scaled, Y_test_inv_scaled) ** 0.5) 69 | print('\nInverse-Normalized Outsample RMSE: %.3f' %(rmse)) 70 | mape = compute_mape(Y_test_inv_scaled, predictions_inv_scaled) 71 | print('Inverse-Normalized Outsample MAPE: %.3f' %(mape)) 72 | correlation = np.corrcoef(predictions_inv_scaled.T, Y_test_inv_scaled.T) 73 | print("Inverse-Normalized Outsample Correlation: %.3f"%(correlation[0, 1])) 74 | r2 = r2_score(Y_test_inv_scaled, predictions_inv_scaled) 75 | print("Inverse-Normalized Outsample r^2: %.3f"%(r2)) 76 | 77 | #plotting the results 78 | print("\n> plotting the results...") 79 | _, ax2 = plt.subplots() 80 | '''ax1.plot(history.history['loss'], label='Training') 81 | ax1.plot(history.history['val_loss'], label='Validation') 82 | ax1.set_xlabel('Epoch #') 83 | ax1.set_ylabel('Loss') 84 | ax1.legend(loc='best') 85 | ax1.grid(True) 86 | ''' 87 | ax2.plot(range(len(predictions_inv_scaled)), predictions_inv_scaled, label='Prediction') 88 | ax2.plot(range(len(Y_test_inv_scaled)), Y_test_inv_scaled, label='Actual') 89 | ax2.set_xlabel('Trading Day') 90 | ax2.set_ylabel('Price') 91 | ax2.legend(loc='best') 92 | ax2.grid(True) 93 | 94 | plt.show() 95 | 96 | main() 97 | 98 | 99 | #to be stored temporarily 100 | '''#evaluating the model on the *inverse-normalized dataset* 101 | if print_flag: 102 | print("\n> evaluating the model on the *dataset*...") 103 | predictions = model.predict(X_test) 104 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 105 | 106 | predictions_inv_scaled = scaler.inverse_transform(predictions) 107 | Y_test_inv_scaled = scaler.inverse_transform(Y_test) 108 | 109 | rmse = (mean_squared_error(predictions_inv_scaled, Y_test_inv_scaled) ** 0.5) 110 | if print_flag: 111 | print('Outsample RMSE: %.3f' %(rmse)) 112 | #correlation = np.corrcoef(predictions_inv_scaled, Y_test_inv_scaled) 113 | #print("Outsample Correlation: %.3f"%(correlation[0, 1])) 114 | ''' -------------------------------------------------------------------------------- /machine_learning/development/optimized_lstm/optimal_hyperparameters.txt: -------------------------------------------------------------------------------- 1 | dropout: 0.2, neurons: [256, 256, 32, 1], decay: 0.1, time_steps: 5, batch_size: 2048, epochs: 300, time elapsed: 90345.647s. 
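The index arithmetic in `lstm.lstm_dataset_reshape` is the easiest part of the LSTM pipeline to misread, so a minimal, self-contained sketch of the same windowing on toy data may help. This is an illustration only: the toy array is not repository data, and only the `time_steps`/`future_gap` alignment logic mirrors the function defined in `optimized_lstm/lstm.py` above.

```python
#illustration only: the sliding-window reshape used by lstm_dataset_reshape,
#on a toy dataset of 10 "days" with one feature column and one price column
import numpy as np

dataset = np.arange(20, dtype=float).reshape(10, 2)
X, Y = dataset[:, :-1], dataset[:, -1]

time_steps, future_gap = 3, 1
#sample i holds the features of days i..i+time_steps-1
X_sampled = np.array([X[i:i + time_steps] for i in range(X.shape[0] - time_steps + 1)])
#drop the trailing windows that have no label, then label each remaining
#window with the price future_gap days after its last day
X_sampled = X_sampled[:-future_gap]
Y_sampled = Y[time_steps + future_gap - 1:]

print(X_sampled.shape, Y_sampled.shape)  #(7, 3, 1) (7,)
```

With `time_steps=3` and `future_gap=1`, the window covering days 0-2 is labelled with the day-3 price, which is exactly the alignment the LSTM training and testing code above relies on.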
-------------------------------------------------------------------------------- /machine_learning/development/original_evaluation.md: -------------------------------------------------------------------------------- 1 | ## Algorithms Evaluation 2 | *Algorithm: (RMSE, Correlation)* 3 | 4 | * Linear Regression: (3.328, 0.948) 5 | ```sh 6 | python -m machine_learning.development.linear_regression 7 | ``` 8 | ![Linear Regression](https://github.com/ahmedhamdi96/ML4T/blob/master/results/lin_reg.png) 9 | 10 | * kNN Regression: (2.142, 0.905) 11 | ```sh 12 | python -m machine_learning.development.knn_regression 13 | ``` 14 | ![kNN Regression](https://github.com/ahmedhamdi96/ML4T/blob/master/results/knn.png) 15 | 16 | * Keras Regression: (3.360, 0.947) 17 | ```sh 18 | python -m machine_learning.development.keras_ffnn 19 | ``` 20 | ![Keras Regression](https://github.com/ahmedhamdi96/ML4T/blob/master/results/ffnn_reg.png) 21 | 22 | * Keras RNN LSTM: (3.405, 0.949) 23 | ```sh 24 | python -m machine_learning.development.keras_lstm 25 | ``` 26 | ![Keras RNN LSTM](https://github.com/ahmedhamdi96/ML4T/blob/master/results/lstm.png) -------------------------------------------------------------------------------- /machine_learning/development/technical_indicators_dataset.py: -------------------------------------------------------------------------------- 1 | '''This file constructs a dataset to be used by the ML algorithms. 2 | The dataset consists of the past price and technical indicators as 3 | features, and the price as the output. The dataset is indexed by 4 | date, a row entry contains the price and techincal indicators of 5 | some day prior to the date index, and the price is the actual 6 | price of the stock at the date marked by the index. 7 | ''' 8 | from utils.util import get_stock_data 9 | import numpy as np 10 | import pandas as pd 11 | import talib as ta 12 | 13 | '''technical indicators computation functions 14 | 15 | *prices : adjusted closing stock prices 16 | *window : rolling statistics window 17 | ''' 18 | #BEGIN 19 | def compute_momentum_ratio(prices, window): 20 | #first window elements >> NA 21 | momentum_ratio = (prices/prices.shift(periods = window)) - 1 22 | return momentum_ratio 23 | 24 | def compute_sma_ratio(prices, window): 25 | #Simple Moving Average 26 | #first window-1 elements >> NA 27 | sma_ratio = (prices / prices.rolling(window = window).mean()) - 1 28 | return sma_ratio 29 | 30 | def compute_bollinger_bands_ratio(prices, window): 31 | #first window-1 elements >> NA 32 | bb_ratio = prices - prices.rolling(window = window).mean() 33 | bb_ratio = bb_ratio / (2 * prices.rolling(window = window).std()) 34 | return bb_ratio 35 | 36 | def compute_daily_return_volatility(prices, window): 37 | #first window-1 elements >> NA 38 | daily_return = (prices/prices.shift(periods= 1)) - 1 39 | volatility = daily_return.rolling(window=window).std() 40 | return volatility 41 | #END 42 | 43 | '''dataset constructor function 44 | 45 | *start_date : start date for the entire dataset (training and testing) 46 | *end_date : end date for the entire dataset (training and testing) 47 | *stock : stock label to be used in the dataset 48 | ''' 49 | def get_dataset_dataframe(start_date='17/12/2014', end_date = '31/12/2017', stock='IBM'): 50 | #importing stock data 51 | columns = ["Date", "Adj Close", "High", "Low", "Volume"] 52 | stock_df = get_stock_data(stock, start_date, end_date, columns=columns) 53 | date_range = pd.date_range(start_date, end_date) 54 | dataset_df = 
pd.DataFrame(index=date_range) 55 | #calculating technical indicators 56 | #make sure to include the last 2 weeks of 2014 to compensate calculations loss 57 | #1st week is lost in the preparation of the indicators 58 | #2nd week is lost to include the future gap 59 | future_gap = 5 #1 trading week 60 | dataset_df['price'] = stock_df["Adj Close"] 61 | dataset_df.dropna(subset=['price'], inplace=True) 62 | dataset_df['momentum'] = compute_momentum_ratio(stock_df["Adj Close"], future_gap) 63 | dataset_df['sma'] = compute_sma_ratio(stock_df["Adj Close"], future_gap) 64 | dataset_df['bolinger_band'] = compute_bollinger_bands_ratio(stock_df["Adj Close"], future_gap) 65 | dataset_df['sar'] = ta.SAR(stock_df["High"], stock_df["Low"]) 66 | dataset_df['rsi'] = ta.RSI(stock_df["Adj Close"], timeperiod=future_gap) 67 | dataset_df['obv'] = ta.OBV(stock_df["Adj Close"], stock_df["Volume"]) 68 | dataset_df['adosc'] = ta.ADOSC(stock_df["High"], stock_df["Low"], stock_df["Adj Close"], stock_df["Volume"], 69 | fastperiod=2, slowperiod=3) 70 | dataset_df['macd'], _, _ = ta.MACD(stock_df["Adj Close"], fastperiod=2, slowperiod=3, signalperiod=3) 71 | dataset_df['slowk '], dataset_df['slowd'] = ta.STOCH(stock_df["High"], stock_df["Low"], stock_df["Adj Close"], 72 | fastk_period=3, slowk_period=2, slowd_period=3) 73 | dataset_df['cci'] = ta.CCI(stock_df["High"], stock_df["Low"], stock_df["Adj Close"], timeperiod=future_gap) 74 | dataset_df['volatility'] = compute_daily_return_volatility(stock_df["Adj Close"], future_gap) 75 | dataset_df.dropna(subset=dataset_df.columns, inplace=True) 76 | dataset_df = dataset_df.shift(future_gap) 77 | shifted_columns_names = ['price(t-%d)' %(future_gap), 'moment(t-%d)' %(future_gap), 'sma(t-%d)' %(future_gap), 78 | 'b_band(t-%d)' %(future_gap), 'sar(t-%d)' %(future_gap), 'rsi(t-%d)' %(future_gap), 79 | 'obv(t-%d)' %(future_gap), 'adosc(t-%d)' %(future_gap), 'macd(t-%d)' %(future_gap), 80 | 'slowk(t-%d)' %(future_gap), 'slowd(t-%d)' %(future_gap), 'cci(t-%d)' %(future_gap), 81 | 'volatility(t-%d)' %(future_gap)] 82 | dataset_df.columns = shifted_columns_names 83 | dataset_df.dropna(subset=shifted_columns_names, inplace=True) 84 | dataset_df['price'] = stock_df["Adj Close"] 85 | 86 | return dataset_df -------------------------------------------------------------------------------- /machine_learning/development/testing/analysis.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.optimized_lstm import lstm 2 | from keras.callbacks import EarlyStopping 3 | 4 | #sudden vs normal 5 | stocks_list = ['TSLA'] 6 | show_plot = len(stocks_list) 7 | dates_dic = { 8 | 'TSLA': ['2013-01-01', '2013-06-01'], 9 | } 10 | 11 | window = 2 12 | future_gap = 1 13 | time_steps = 1 14 | neurons = [256, 256, 32, 1] 15 | drop_out = 0.2 16 | batch_size = 2048 17 | epochs = 300 18 | validation_split = 0.1 19 | verbose = 1 20 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 21 | patience=50, verbose=verbose, mode='auto') 22 | callbacks = [early_stopping_callback] 23 | 24 | for stock in stocks_list: 25 | show_plot -= 1 26 | show_plot_flg = True if show_plot == 0 else False 27 | start_date = dates_dic[stock][0] 28 | end_date = dates_dic[stock][1] 29 | lstm.test_lstm(stock, start_date, end_date, window, future_gap, time_steps, 30 | neurons, drop_out, batch_size, epochs, validation_split, verbose, callbacks, show_plot_flg) 31 | 32 | #sudden vs normal forecast annotations 33 | ''' 34 | ax.annotate('Normal Movement', 
xy=('2013-02-15', 40), xytext=('2013-03-05', 50), fontsize=10, 35 | arrowprops=dict(facecolor='black', shrink=0.1, headwidth=8)) 36 | ax.annotate('Sudden Change', xy=('2013-05-10', 55), xytext=('2013-03-05', 70), fontsize=10, 37 | arrowprops=dict(facecolor='black', shrink=0.1, headwidth=8)) 38 | ''' -------------------------------------------------------------------------------- /machine_learning/development/testing/companies.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.optimized_lstm import lstm 2 | from keras.callbacks import EarlyStopping 3 | 4 | stocks_list = ['FB', 'AAPL', 'TSLA', 'AMZN'] 5 | show_plot = len(stocks_list) 6 | dates_dic = { 7 | 'FB' : ['2017-12-01', '2018-05-01'], 8 | 'AAPL': ['2012-08-01', '2013-08-01'], 9 | 'TSLA': ['2013-08-01', '2014-01-01'], 10 | 'AMZN': ['2017-08-01', '2018-04-01'], 11 | } 12 | 13 | window = 2 14 | future_gap = 1 15 | time_steps = 1 16 | neurons = [256, 256, 32, 1] 17 | drop_out = 0.2 18 | batch_size = 2048 19 | epochs = 300 20 | validation_split = 0.1 21 | verbose = 1 22 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 23 | patience=50, verbose=verbose, mode='auto') 24 | callbacks = [early_stopping_callback] 25 | 26 | for stock in stocks_list: 27 | show_plot -= 1 28 | show_plot_flg = True if show_plot == 0 else False 29 | start_date = dates_dic[stock][0] 30 | end_date = dates_dic[stock][1] 31 | lstm.test_lstm(stock, start_date, end_date, window, future_gap, time_steps, 32 | neurons, drop_out, batch_size, epochs, validation_split, verbose, callbacks, show_plot_flg) -------------------------------------------------------------------------------- /machine_learning/development/testing/future_gap_test.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.optimized_lstm import lstm 2 | from keras.callbacks import EarlyStopping 3 | import matplotlib.pyplot as plt 4 | 5 | stock = 'AAPL' 6 | dates_dic = { 7 | 'AAPL' : ['2017-01-01', '2018-01-01'] 8 | } 9 | 10 | window = 2 11 | future_gap = 1 12 | time_steps = 1 13 | neurons = [256, 256, 32, 1] 14 | drop_out = 0.2 15 | batch_size = 2048 16 | epochs = 300 17 | validation_split = 0.1 18 | verbose = 1 19 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 20 | patience=50, verbose=verbose, mode='auto') 21 | callbacks = [early_stopping_callback] 22 | 23 | #future_gap test 24 | future_gap_list = [1, 5, 20] 25 | future_gap_dic = { 26 | 1 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], 27 | 5 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], 28 | 20 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 29 | } 30 | future_gap_plots = { 31 | 1 : None, 32 | 5 : None, 33 | 20 : None 34 | } 35 | 36 | for future_gap in future_gap_list: 37 | start_date = dates_dic[stock][0] 38 | end_date = dates_dic[stock][1] 39 | normalized_metrics, inv_normalized_metrics, df = lstm.final_test_lstm(stock, start_date, 40 | end_date, window, future_gap, time_steps, neurons, drop_out, batch_size, epochs, validation_split, 41 | verbose, callbacks) 42 | future_gap_dic[future_gap][0] = normalized_metrics 43 | future_gap_dic[future_gap][1] = inv_normalized_metrics 44 | future_gap_plots[future_gap] = df 45 | 46 | print(future_gap_dic) 47 | 48 | fig, (ax1, ax2, ax3) = plt.subplots(3, 1) 49 | 50 | df = future_gap_plots[1] 51 | ax1.plot(df.index, df["Actual"], label='Actual') 52 | ax1.plot(df.index, 
df["Prediction"], label='Prediction')
53 | ax1.set_title('Future Gap = 1')
54 | ax1.set_xlabel('Date')
55 | ax1.set_ylabel('Price')
56 | ax1.legend(loc="best")
57 | ax1.grid(True)
58 | 
59 | df = future_gap_plots[5]
60 | ax2.plot(df.index, df["Actual"], label='Actual')
61 | ax2.plot(df.index, df["Prediction"], label='Prediction')
62 | ax2.set_title('Future Gap = 5')
63 | ax2.set_xlabel('Date')
64 | ax2.set_ylabel('Price')
65 | ax2.legend(loc="best")
66 | ax2.grid(True)
67 | 
68 | df = future_gap_plots[20]
69 | ax3.plot(df.index, df["Actual"], label='Actual')
70 | ax3.plot(df.index, df["Prediction"], label='Prediction')
71 | ax3.set_title('Future Gap = 20')
72 | ax3.set_xlabel('Date')
73 | ax3.set_ylabel('Price')
74 | ax3.legend(loc="best")
75 | ax3.grid(True)
76 | 
77 | fig.tight_layout()
78 | plt.show()
--------------------------------------------------------------------------------
/machine_learning/development/testing/lag_metric.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | 
4 | def compute_lag_metric(actual, prediction, lookup, symbol):
5 |     diff_list = [None] * lookup
6 |     lag_list = [None] * (len(actual)-lookup+1)
7 | 
8 |     for i in range(len(actual)-lookup+1):
9 |         for j in range(lookup):
10 |             diff_list[j] = abs(actual[i] - prediction[i+j])
11 |         lag_list[i] = diff_list.index(min(diff_list))
12 | 
13 |     max_diff_count = [0] * lookup
14 | 
15 |     for i in range(len(lag_list)):
16 |         max_diff_count[lag_list[i]] += 1
17 | 
18 |     _, ax = plt.subplots()
19 |     ax.bar(range(len(max_diff_count)), max_diff_count, align='center')
20 |     plt.sca(ax)
21 |     plt.title(symbol+" Lag Test")
22 |     ax.set_xlabel('Day Lag')
23 |     ax.set_ylabel('Frequency')
24 |     ax.grid(True)
25 | 
26 |     _, ax1 = plt.subplots()
27 |     ax1.scatter(range(len(lag_list)), lag_list)
28 |     plt.title(symbol+" Daily Lag Test")
29 |     ax1.set_xlabel('Trading Day')
30 |     ax1.set_ylabel('Lag')
31 |     ax1.grid(True)
32 | 
33 |     return lag_list
--------------------------------------------------------------------------------
/machine_learning/development/testing/results/amazon.md:
--------------------------------------------------------------------------------
1 | ## Amazon
2 | 
3 | ### Exceeding Q3 expectations, [September/2017 - February/2018]
4 | 
5 | ![Amazon](https://github.com/ahmedhamdi96/ML4T/blob/master/results/amazon.png)
--------------------------------------------------------------------------------
/machine_learning/development/testing/results/analysis.md:
--------------------------------------------------------------------------------
1 | ## Analysis with PAL
2 | 
3 | ### Sudden Changes vs Normal Movements
4 | 
5 | This forecast predicts the Tesla stock price between 01/01/2013 and 01/06/2013. PAL is also used to analyze the behaviour of the model during the two different periods a stock usually goes through: a normal movement, where the stock price fluctuates with no dramatic change, and a sudden change, where the stock moves violently upwards, downwards, or up and down with high volatility.
6 | 
7 | ![SvN](https://github.com/ahmedhamdi96/ML4T/blob/master/results/sudden_vs_normal.png)
8 | 
9 | > Up until 07/05/2013, the stock price exhibits normal movement; no violent trajectories appear.
10 | > This is when the model performs best. The forecast does not lag the actual price, and follows the same trend
11 | > and movement of the actual price. Starting from 07/05/2013, the stock moves up with a steep trajectory, and
12 | > during that sudden change is when the model performs poorly. Upon researching news about Tesla in May/2013,
13 | > it was discovered that the company reported its first quarterly profit and that its flagship at the time, the
14 | > Model S, received the best review of any car in Consumer Reports magazine's history, see the report
15 | > [here](http://money.cnn.com/2013/05/10/investing/tesla-stock). This positive news caused an unexpected and
16 | > sudden surge in Tesla's stock price. A hypothesis that can be proposed from this is that the model is capable
17 | > of predicting the price and the fluctuations in price caused by the stock market movement, but when external
18 | > events impact the stock price suddenly, the model naturally does not pick up on these events.
19 | 
20 | ![Lag](https://github.com/ahmedhamdi96/ML4T/blob/master/results/sudden_vs_normal_lag.png)
21 | 
22 | > This plot shows how often the prediction was closest to the actual price at each lag; the day lag indicates
23 | > the number of days it took for the forecast to best match the actual price.
24 | 
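To make the metric concrete, below is a small worked example of the lag computation implemented in `lag_metric.py` (the values are illustrative, not taken from the Tesla run):

```python
# PAL: for each day t, find the offset j in [0, lookup) that minimizes
# |actual[t] - prediction[t+j]|, i.e. how many days the forecast trails
actual     = [10.0, 11.0, 12.0]
prediction = [9.0, 10.1, 11.0, 11.9, 12.2]
lookup = 3
for t in range(len(actual)):
    diffs = [abs(actual[t] - prediction[t+j]) for j in range(lookup)]
    print(t, diffs.index(min(diffs)))  # prints a lag of 1 on every day
# the forecast trails the actual series by one day throughout
```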
25 | ![Daily Lag](https://github.com/ahmedhamdi96/ML4T/blob/master/results/sudden_vs_normal_daily_lag.png)
26 | 
27 | > This plot follows the same timeline as the forecast on the x-axis, plotted against the lag on the y-axis. It
28 | > supports the hypothesis mentioned earlier: the model finds the closest prediction to the actual price early
29 | > on during the normal movement phase, and lags at the end of the timeline during the sudden change phase.
30 | 
31 | ![SvN w/ PAL](https://github.com/ahmedhamdi96/ML4T/blob/master/results/sudden_vs_normal_pal.png)
32 | 
33 | ![SvN w/ PAL 1](https://github.com/ahmedhamdi96/ML4T/blob/master/results/sudden_vs_normal_pal_1.png)
--------------------------------------------------------------------------------
/machine_learning/development/testing/results/apple.md:
--------------------------------------------------------------------------------
1 | ## Apple
2 | 
3 | ### Apple's first free fall, [September/2012 - June/2013]
4 | 
5 | ![Apple](https://github.com/ahmedhamdi96/ML4T/blob/master/results/apple.png)
--------------------------------------------------------------------------------
/machine_learning/development/testing/results/eval.md:
--------------------------------------------------------------------------------
1 | ## Evaluation Metrics
2 | 
3 | ### Apple 2017 Stock Price Forecast
4 | 
5 | | Future Gap | RMSE | NRMSE | MAE | MAPE | Corr | R^2 |
6 | | :--------: | :--: | :--: | :--: | :--: | :--: | :--: |
7 | | 1 day | 0.0281 | 0.0492 | 0.0196 | 4.72 | 0.993 | 0.986 |
8 | | 1 week | 0.0672 | 0.116 | 0.0524 | 11.3 | 0.967 | 0.915 |
9 | | 1 month | 0.1539 | 0.252 | 0.129 | 23.7 | 0.827 | 0.396 |
10 | 
11 | ![Future Gap](https://github.com/ahmedhamdi96/ML4T/blob/master/results/future_gap_test.png)
--------------------------------------------------------------------------------
/machine_learning/development/testing/results/facebook.md:
--------------------------------------------------------------------------------
1 | ## Facebook
2 | 
3 | ### Facebook–Cambridge Analytica data scandal, [January/2018 - March/2018]
4 | 
5 | ![Facebook](https://github.com/ahmedhamdi96/ML4T/blob/master/results/facebook.png)
--------------------------------------------------------------------------------
/machine_learning/development/testing/results/future_gap.md:
-------------------------------------------------------------------------------- 1 | ## Future Gap Test 2 | 3 | ### Microsoft 2017 Stock Price Forecast 4 | 5 | #### LSTM 6 | | Future Gap | RMSE | NRMSE | MAE | Corr | R^2 | 7 | | :--------: | :--: | :--: | :--: | :--: | :--: | 8 | | 1 Day | 0.0273 | 0.0676 | 0.0184 | 0.995 | 0.991 | 9 | | 2 Days | 0.0369 | 0.0909 | 0.0254 | 0.992 | 0.983 | 10 | | 3 Days | 0.0437 | 0.1070 | 0.0314 | 0.989 | 0.976 | 11 | | 4 Days | 0.0496 | 0.1210 | 0.0363 | 0.985 | 0.969 | 12 | | 5 Days | 0.0568 | 0.1380 | 0.0421 | 0.981 | 0.959 | 13 | 14 | #### Linear Regressor 15 | | Future Gap | RMSE | NRMSE | MAE | Corr | R^2 | 16 | | :--------: | :--: | :--: | :--: | :--: | :--: | 17 | | 1 Day | 0.0275 | 0.0679 | 0.0185 | 0.993 | 0.990 | 18 | | 2 Days | 0.0372 | 0.0917 | 0.0260 | 0.992 | 0.983 | 19 | | 3 Days | 0.0441 | 0.1080 | 0.0317 | 0.989 | 0.976 | 20 | | 4 Days | 0.0504 | 0.1230 | 0.0366 | 0.985 | 0.968 | 21 | | 5 Days | 0.0572 | 0.1390 | 0.0422 | 0.981 | 0.958 | 22 | 23 | #### FFNN 24 | | Future Gap | RMSE | NRMSE | MAE | Corr | R^2 | 25 | | :--------: | :--: | :--: | :--: | :--: | :--: | 26 | | 1 Day | 0.0376 | 0.0931 | 0.0278 | 0.994 | 0.982 | 27 | | 2 Days | 0.0474 | 0.1170 | 0.0335 | 0.991 | 0.972 | 28 | | 3 Days | 0.0691 | 0.1700 | 0.0501 | 0.984 | 0.939 | 29 | | 4 Days | 0.0535 | 0.1310 | 0.0389 | 0.982 | 0.964 | 30 | | 5 Days | 0.0709 | 0.1729 | 0.0512 | 0.972 | 0.936 | 31 | 32 | *Shown below are the forecasts of the LSTM RNN model* 33 | 34 | ![Future Gap](https://github.com/ahmedhamdi96/ML4T/blob/master/results/experiments/exp4/gap1.png) 35 | 36 | ![Future Gap](https://github.com/ahmedhamdi96/ML4T/blob/master/results/experiments/exp4/gap2.png) 37 | 38 | ![Future Gap](https://github.com/ahmedhamdi96/ML4T/blob/master/results/experiments/exp4/gap3.png) 39 | 40 | ![Future Gap](https://github.com/ahmedhamdi96/ML4T/blob/master/results/experiments/exp4/gap4.png) 41 | 42 | ![Future Gap](https://github.com/ahmedhamdi96/ML4T/blob/master/results/experiments/exp4/gap5.png) -------------------------------------------------------------------------------- /machine_learning/development/testing/results/tesla.md: -------------------------------------------------------------------------------- 1 | ## Tesla 2 | 3 | ### Analysts downgrades, [September/2013 - November/2013] 4 | 5 | ![Tesla](https://github.com/ahmedhamdi96/ML4T/blob/master/results/tesla.png) -------------------------------------------------------------------------------- /machine_learning/development/testing/results/window_and_ts.md: -------------------------------------------------------------------------------- 1 | ## Window and Time Steps Test 2 | 3 | ### Time Steps Test 4 | 5 | | Time Steps | RMSE | MAPE | Corr | R^2 | 6 | | :--------: | :--: | :--: | :--: | :--: | 7 | | 1 | 0.0317 | 5.26 | 0.993 | 0.982 | 8 | | 2 | 0.0338 | 5.45 | 0.990 | 0.979 | 9 | | 3 | 0.0452 | 7.89 | 0.988 | 0.961 | 10 | | 4 | 0.0462 | 7.77 | 0.985 | 0.959 | 11 | | 5 | 0.0538 | 8.91 | 0.982 | 0.942 | 12 | 13 | Winner: 1 14 | 15 | ### Window Test 16 | 17 | | Window | RMSE | MAPE | Corr | R^2 | 18 | | :----: | :--: | :--: | :--: | :--: | 19 | | 2 | 0.0299 | 4.84 | 0.994 | 0.984 | 20 | | 3 | 0.0294 | 5.31 | 0.993 | 0.985 | 21 | | 4 | 0.0336 | 7.85 | 0.992 | 0.981 | 22 | | 5 | 0.0287 | inf | 0.993 | 0.986 | 23 | 24 | Winner: The metrics are not decisive enough, so a plot test could help. 
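A note on the `inf` MAPE for window 5: it is most likely an artifact of evaluating on Min-Max-scaled prices, since the minimum price in the test span scales to exactly 0 and MAPE divides by the true value. A minimal illustration, using `compute_mape` as defined in `machine_learning/final/evaluation/metrics.py`:

```python
import numpy as np

def compute_mape(y_true, y_pred):
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

y_true = np.array([0.0, 0.5, 1.0])  # MinMax-scaled prices: the minimum maps to 0
y_pred = np.array([0.1, 0.4, 0.9])
print(compute_mape(y_true, y_pred))  # division by zero -> inf
```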
25 | 26 | ![Window 2,3](https://github.com/ahmedhamdi96/ML4T/blob/master/results/window_test_1.png) 27 | ![Window 4,5](https://github.com/ahmedhamdi96/ML4T/blob/master/results/window_test_2.png) 28 | 29 | -------------------------------------------------------------------------------- /machine_learning/development/testing/test.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.optimized_lstm import lstm 2 | from keras.callbacks import EarlyStopping 3 | 4 | stock = 'AAPL' 5 | dates_dic = { 6 | 'AAPL' : ['2017-01-01', '2018-01-01'] 7 | } 8 | 9 | window = 2 10 | future_gap = 1 11 | time_steps = 1 12 | neurons = [256, 256, 32, 1] 13 | drop_out = 0.2 14 | batch_size = 2048 15 | epochs = 300 16 | validation_split = 0.1 17 | verbose = 1 18 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 19 | patience=50, verbose=verbose, mode='auto') 20 | callbacks = [early_stopping_callback] 21 | 22 | #window test 23 | window_list = [2,3,4,5] 24 | window_dic = { 25 | 2 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], 26 | 3 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], 27 | 4 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], 28 | 5 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 29 | } 30 | 31 | #time_steps test 32 | time_steps_list = [1,2,3,4,5] 33 | time_steps_dic = { 34 | 1 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], 35 | 2 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], 36 | 3 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], 37 | 4 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]], 38 | 5 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]] 39 | } 40 | 41 | for window in window_list: 42 | start_date = dates_dic[stock][0] 43 | end_date = dates_dic[stock][1] 44 | normalized_metrics, inv_normalized_metrics, df = lstm.final_test_lstm(stock, start_date, 45 | end_date, window, future_gap, time_steps, neurons, drop_out, batch_size, epochs, validation_split, 46 | verbose, callbacks) 47 | window_dic[window][0] = normalized_metrics 48 | window_dic[window][1] = inv_normalized_metrics 49 | 50 | print(window_dic) -------------------------------------------------------------------------------- /machine_learning/development/testing/window_plot_test.py: -------------------------------------------------------------------------------- 1 | from machine_learning.development.optimized_lstm import lstm 2 | from keras.callbacks import EarlyStopping 3 | import matplotlib.pyplot as plt 4 | 5 | stock = 'AAPL' 6 | dates_dic = { 7 | 'AAPL' : ['2017-01-01', '2018-01-01'] 8 | } 9 | 10 | window = 2 11 | future_gap = 1 12 | time_steps = 1 13 | neurons = [256, 256, 32, 1] 14 | drop_out = 0.2 15 | batch_size = 2048 16 | epochs = 300 17 | validation_split = 0.1 18 | verbose = 1 19 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 20 | patience=50, verbose=verbose, mode='auto') 21 | callbacks = [early_stopping_callback] 22 | 23 | #window test 24 | window_list = [2,3,4,5] 25 | window_dic = { 26 | 2 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0], None], 27 | 3 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0], None], 28 | 4 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0], None], 29 | 5 : [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0], None] 30 | } 31 | 32 | window = 5 33 | 34 | for 
window in window_list:
35 |     start_date = dates_dic[stock][0]
36 |     end_date = dates_dic[stock][1]
37 |     normalized_metrics, inv_normalized_metrics, df = lstm.final_test_lstm(stock, start_date, 
38 |     end_date, window, future_gap, time_steps, neurons, drop_out, batch_size, epochs, validation_split, 
39 |     verbose, callbacks)
40 |     window_dic[window][0] = normalized_metrics
41 |     window_dic[window][1] = inv_normalized_metrics
42 |     window_dic[window][2] = df
43 | 
44 | fig1, (ax1, ax2) = plt.subplots(2, 1)
45 | fig2, (ax3, ax4) = plt.subplots(2, 1)
46 | 
47 | df = window_dic[2][2]
48 | ax1.plot(df.index, df["Actual"], label='Actual')
49 | ax1.plot(df.index, df["Prediction"], label='Prediction')
50 | ax1.set_title('Window = 2')
51 | ax1.set_xlabel('Date')
52 | ax1.set_ylabel('Price')
53 | ax1.legend(loc="best")
54 | ax1.grid(True)
55 | 
56 | df = window_dic[3][2]
57 | ax2.plot(df.index, df["Actual"], label='Actual')
58 | ax2.plot(df.index, df["Prediction"], label='Prediction')
59 | ax2.set_title('Window = 3')
60 | ax2.set_xlabel('Date')
61 | ax2.set_ylabel('Price')
62 | ax2.legend(loc="best")
63 | ax2.grid(True)
64 | 
65 | df = window_dic[4][2]
66 | ax3.plot(df.index, df["Actual"], label='Actual')
67 | ax3.plot(df.index, df["Prediction"], label='Prediction')
68 | ax3.set_title('Window = 4')
69 | ax3.set_xlabel('Date')
70 | ax3.set_ylabel('Price')
71 | ax3.legend(loc="best")
72 | ax3.grid(True)
73 | 
74 | df = window_dic[5][2]
75 | ax4.plot(df.index, df["Actual"], label='Actual')
76 | ax4.plot(df.index, df["Prediction"], label='Prediction')
77 | ax4.set_title('Window = 5')
78 | ax4.set_xlabel('Date')
79 | ax4.set_ylabel('Price')
80 | ax4.legend(loc="best")
81 | ax4.grid(True)
82 | 
83 | fig1.tight_layout()
84 | fig2.tight_layout()
85 | plt.show()
--------------------------------------------------------------------------------
/machine_learning/final/evaluation/metrics.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 | from sklearn.metrics import mean_squared_error, mean_absolute_error
4 | from sklearn.metrics import r2_score
5 | 
6 | def compute_mape(y_true, y_pred):
7 |     return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
8 | 
9 | def evaluate(Y_test, predictions, Y_test_inv_scaled, predictions_inv_scaled):
10 |     rmse = (mean_squared_error(Y_test, predictions) ** 0.5)
11 |     print('\nNormalized RMSE: %.3f' %(rmse))
12 |     nrmse = ((mean_squared_error(Y_test, predictions) ** 0.5))/np.mean(Y_test)
13 |     print('Normalized NRMSE: %.3f' %(nrmse))
14 |     mae = mean_absolute_error(Y_test, predictions)
15 |     print('Normalized MAE: %.3f' %(mae))
16 |     mape = compute_mape(Y_test, predictions)
17 |     print('Normalized MAPE: %.3f' %(mape))
18 |     correlation = np.corrcoef(Y_test.T, predictions.T)
19 |     print("Normalized Correlation: %.3f"%(correlation[0, 1]))
20 |     r2 = r2_score(Y_test, predictions)
21 |     print("Normalized r^2: %.3f"%(r2))
22 |     normalized_metrics = [rmse, nrmse, mae, mape, correlation[0, 1], r2]
23 | 
24 |     #evaluating the model on the inverse-normalized dataset
25 |     rmse = (mean_squared_error(Y_test_inv_scaled, predictions_inv_scaled) ** 0.5)
26 |     print('\nInverse-Normalized Outsample RMSE: %.3f' %(rmse))
27 |     nrmse = ((mean_squared_error(Y_test_inv_scaled, predictions_inv_scaled) ** 0.5))/np.mean(Y_test_inv_scaled)
28 |     print('Inverse-Normalized Outsample NRMSE: %.3f' %(nrmse))
29 |     mae = mean_absolute_error(Y_test_inv_scaled, predictions_inv_scaled)
30 |     print('Inverse-Normalized Outsample MAE: %.3f' %(mae))
31 |     mape = compute_mape(Y_test_inv_scaled, 
predictions_inv_scaled) 32 | print('Inverse-Normalized Outsample MAPE: %.3f' %(mape)) 33 | correlation = np.corrcoef(Y_test_inv_scaled.T, predictions_inv_scaled.T) 34 | print("Inverse-Normalized Outsample Correlation: %.3f"%(correlation[0, 1])) 35 | r2 = r2_score(Y_test_inv_scaled, predictions_inv_scaled) 36 | print("Inverse-Normalized Outsample r^2: %.3f"%(r2)) 37 | inv_normalized_metrics = [rmse, nrmse, mae, mape, correlation[0, 1], r2] 38 | 39 | return normalized_metrics, inv_normalized_metrics 40 | 41 | def compute_lag_metric(actual, prediction, lookup, symbol): 42 | diff_list = [None] * lookup 43 | lag_list = [None] * (len(actual)-lookup+1) 44 | 45 | for i in range(len(actual)-lookup+1): 46 | for j in range(lookup): 47 | diff_list[j] = abs(actual[i] - prediction[i+j]) 48 | lag_list[i] = diff_list.index(min(diff_list)) 49 | 50 | max_diff_count = [0] * lookup 51 | 52 | for i in range(len(lag_list)): 53 | max_diff_count[lag_list[i]] += 1 54 | 55 | _, ax = plt.subplots() 56 | ax.bar(range(len(max_diff_count)), max_diff_count, align='center') 57 | plt.sca(ax) 58 | plt.title(symbol+" Lag Test") 59 | ax.set_xlabel('Day Lag') 60 | ax.set_ylabel('Frequency') 61 | ax.grid(True) 62 | 63 | _, ax1 = plt.subplots() 64 | index = actual[:len(actual)-lookup+1].index 65 | ax1.scatter(index, lag_list) 66 | plt.title(symbol+" Daily Lag Test") 67 | ax1.set_xlabel('Date') 68 | ax1.set_ylabel('Lag') 69 | ax1.grid(True) 70 | 71 | return lag_list -------------------------------------------------------------------------------- /machine_learning/final/experiments/exp1.py: -------------------------------------------------------------------------------- 1 | from utils.util import plot_data 2 | from machine_learning.final.models import lstm 3 | from machine_learning.final.models import ffnn 4 | from machine_learning.final.models import lin_reg 5 | from machine_learning.final.models import knn_reg 6 | from keras.callbacks import EarlyStopping 7 | import matplotlib.pyplot as plt 8 | 9 | #models comparison 10 | stock = 'AAPL' 11 | dates_dic = { 12 | 'AAPL' : ['2017-01-01', '2018-01-01'] 13 | } 14 | metrics_dic = { 15 | 'LSTM' : [], 16 | 'FFNN' : [], 17 | 'LinReg' : [], 18 | 'kNNReg' : [] 19 | } 20 | 21 | window = 2 22 | future_gap = 1 23 | time_steps = 1 24 | neurons = [256, 256, 32, 1] 25 | drop_out = 0.2 26 | batch_size = 2048 27 | epochs = 300 28 | validation_split = 0.1 29 | verbose = 1 30 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 31 | patience=50, verbose=verbose, mode='auto') 32 | callbacks = [early_stopping_callback] 33 | start_date = dates_dic[stock][0] 34 | end_date = dates_dic[stock][1] 35 | 36 | #LSTM 37 | normalized_metrics, inv_normalized_metrics, df = lstm.final_test_lstm(stock, start_date, 38 | end_date, window, future_gap, time_steps, neurons, drop_out, batch_size, epochs, validation_split, 39 | verbose, callbacks) 40 | metrics_dic['LSTM'] = normalized_metrics 41 | plot_data(df, stock+" 2017 Price Forecast (LSTM)", "Date", "Price", show_plot=False) 42 | 43 | #FFNN 44 | neurons = [256, 256, 64, 1] 45 | batch_size = 128 46 | epochs = 200 47 | 48 | normalized_metrics, inv_normalized_metrics, df = ffnn.final_test_ffnn(stock, start_date, 49 | end_date, window, future_gap, neurons, drop_out, batch_size, epochs, validation_split, 50 | verbose, callbacks) 51 | metrics_dic['FFNN'] = normalized_metrics 52 | plot_data(df, stock+" 2017 Price Forecast (FFNN)", "Date", "Price", show_plot=False) 53 | 54 | #LinReg 55 | normalized_metrics, inv_normalized_metrics, df = 
lin_reg.final_test_linreg(stock, start_date, 56 | end_date, window, future_gap) 57 | metrics_dic['LinReg'] = normalized_metrics 58 | plot_data(df, stock+" 2017 Price Forecast (LinReg)", "Date", "Price", show_plot=False) 59 | 60 | #kNNReg 61 | k = 100 62 | 63 | normalized_metrics, inv_normalized_metrics, df = knn_reg.final_test_knnreg(stock, start_date, 64 | end_date, window, future_gap, k) 65 | metrics_dic['kNNReg'] = normalized_metrics 66 | plot_data(df, stock+" 2017 Price Forecast (kNNReg)", "Date", "Price", show_plot=False) 67 | 68 | print(metrics_dic) 69 | plt.show() -------------------------------------------------------------------------------- /machine_learning/final/experiments/exp2.py: -------------------------------------------------------------------------------- 1 | from machine_learning.final.models import lstm 2 | from machine_learning.final.evaluation.metrics import compute_lag_metric 3 | from keras.callbacks import EarlyStopping 4 | import matplotlib.pyplot as plt 5 | 6 | #sudden vs normal 7 | stock = 'TSLA' 8 | dates_dic = { 9 | 'TSLA': ['2013-01-01', '2013-06-01'], 10 | } 11 | 12 | window = 2 13 | future_gap = 1 14 | time_steps = 1 15 | neurons = [256, 256, 32, 1] 16 | drop_out = 0.2 17 | batch_size = 2048 18 | epochs = 300 19 | validation_split = 0.1 20 | verbose = 1 21 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 22 | patience=50, verbose=verbose, mode='auto') 23 | callbacks = [early_stopping_callback] 24 | start_date = dates_dic[stock][0] 25 | end_date = dates_dic[stock][1] 26 | #LSTM 27 | normalized_metrics, inv_normalized_metrics, df = lstm.final_test_lstm(stock, start_date, 28 | end_date, window, future_gap, time_steps, neurons, drop_out, batch_size, epochs, 29 | validation_split, verbose, callbacks) 30 | #PAL 31 | lookup = 5 32 | lag_list = compute_lag_metric(df['Actual'], df['Prediction'], lookup, stock) 33 | #Price Forecast Plot 34 | df = df[:len(df)-lookup+1] 35 | ax = df.plot(title=stock+" Price Forecast") 36 | ax.set_xlabel("Date") 37 | ax.set_ylabel("Price") 38 | ax.legend(loc="best") 39 | ax.grid(True) 40 | ax.annotate('Normal Movement', xy=('2013-02-15', 40), xytext=('2013-03-05', 50), fontsize=10, 41 | arrowprops=dict(facecolor='black', shrink=0.1, headwidth=8)) 42 | ax.annotate('Sudden Change', xy=('2013-05-10', 55), xytext=('2013-03-05', 70), fontsize=10, 43 | arrowprops=dict(facecolor='black', shrink=0.1, headwidth=8)) 44 | #Price Forecast and PAL Overlay Plot 45 | ax = df.plot(title=stock+" Price Forecast and PAL Overlay") 46 | ax.set_xlabel("Date") 47 | ax.set_ylabel("Price") 48 | ax.legend(loc="best") 49 | ax.grid(True) 50 | ax.annotate('Normal Movement', xy=('2013-02-15', 40), xytext=('2013-03-05', 50), fontsize=10, 51 | arrowprops=dict(facecolor='black', shrink=0.1, headwidth=8)) 52 | ax.annotate('Sudden Change', xy=('2013-05-10', 55), xytext=('2013-03-05', 70), fontsize=10, 53 | arrowprops=dict(facecolor='black', shrink=0.1, headwidth=8)) 54 | ax1 = ax.twinx() 55 | ax1.scatter(df.index, lag_list, c='g') 56 | ax1.set_ylabel("PAL") 57 | 58 | plt.show() -------------------------------------------------------------------------------- /machine_learning/final/experiments/exp3.py: -------------------------------------------------------------------------------- 1 | from utils.util import plot_data 2 | from machine_learning.final.models import lstm 3 | from keras.callbacks import EarlyStopping 4 | import matplotlib.pyplot as plt 5 | 6 | #companies 7 | stocks_list = ['FB', 'AAPL', 'TSLA', 'AMZN'] 8 | dates_dic = { 9 | 'FB' : 
['2017-12-01', '2018-05-01'], 10 | 'AAPL': ['2012-08-01', '2013-08-01'], 11 | 'TSLA': ['2013-08-01', '2014-01-01'], 12 | 'AMZN': ['2017-08-01', '2018-04-01'], 13 | } 14 | metrics_dic = { 15 | 'FB' : [], 16 | 'AAPL' : [], 17 | 'TSLA' : [], 18 | 'AMZN' : [] 19 | } 20 | 21 | window = 2 22 | future_gap = 1 23 | time_steps = 1 24 | neurons = [256, 256, 32, 1] 25 | drop_out = 0.2 26 | batch_size = 2048 27 | epochs = 300 28 | validation_split = 0.1 29 | verbose = 1 30 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 31 | patience=50, verbose=verbose, mode='auto') 32 | callbacks = [early_stopping_callback] 33 | 34 | for stock in stocks_list: 35 | start_date = dates_dic[stock][0] 36 | end_date = dates_dic[stock][1] 37 | normalized_metrics,_, df = lstm.final_test_lstm(stock, start_date, end_date, window, future_gap, time_steps, 38 | neurons, drop_out, batch_size, epochs, validation_split, verbose, callbacks) 39 | metrics_dic[stock] = normalized_metrics 40 | plot_data(df, stock+" Price Forecast", "Date", "Price", show_plot=False) 41 | 42 | print(metrics_dic) 43 | plt.show() -------------------------------------------------------------------------------- /machine_learning/final/experiments/exp4.py: -------------------------------------------------------------------------------- 1 | from utils.util import plot_data 2 | from machine_learning.final.models import lstm 3 | from machine_learning.final.models import ffnn 4 | from machine_learning.final.models import lin_reg 5 | from keras.callbacks import EarlyStopping 6 | import matplotlib.pyplot as plt 7 | 8 | #future gap 9 | stock = 'MSFT' 10 | dates_dic = { 11 | 'MSFT' : ['2017-01-01', '2018-01-01'] 12 | } 13 | future_gap_list = [1, 2, 3, 4, 5] 14 | 15 | #LSTM 16 | window = 2 17 | future_gap = 1 18 | time_steps = 1 19 | neurons = [256, 256, 32, 1] 20 | drop_out = 0.2 21 | batch_size = 2048 22 | epochs = 300 23 | validation_split = 0.1 24 | verbose = 1 25 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 26 | patience=50, verbose=verbose, mode='auto') 27 | callbacks = [early_stopping_callback] 28 | 29 | lstm_future_gap_metrics = { 30 | 1 : [], 31 | 2 : [], 32 | 3 : [], 33 | 4 : [], 34 | 5 : [] 35 | } 36 | 37 | 38 | for future_gap in future_gap_list: 39 | start_date = dates_dic[stock][0] 40 | end_date = dates_dic[stock][1] 41 | normalized_metrics, _, df = lstm.final_test_lstm(stock, start_date, 42 | end_date, window, future_gap, time_steps, neurons, drop_out, batch_size, epochs, validation_split, 43 | verbose, callbacks) 44 | lstm_future_gap_metrics[future_gap] = normalized_metrics 45 | plot_data(df, 'Future Gap = '+str(future_gap), "Date", "Price", show_plot=False) 46 | 47 | 48 | #FFNN 49 | neurons = [256, 256, 64, 1] 50 | batch_size = 128 51 | epochs = 200 52 | 53 | ffnn_future_gap_metrics = { 54 | 1 : [], 55 | 2 : [], 56 | 3 : [], 57 | 4 : [], 58 | 5 : [] 59 | } 60 | 61 | for future_gap in future_gap_list: 62 | start_date = dates_dic[stock][0] 63 | end_date = dates_dic[stock][1] 64 | normalized_metrics, _, df = ffnn.final_test_ffnn(stock, start_date, 65 | end_date, window, future_gap, neurons, drop_out, batch_size, epochs, validation_split, 66 | verbose, callbacks) 67 | ffnn_future_gap_metrics[future_gap] = normalized_metrics 68 | plot_data(df, 'Future Gap = '+str(future_gap), "Date", "Price", show_plot=False) 69 | 70 | linreg_future_gap_metrics = { 71 | 1 : [], 72 | 2 : [], 73 | 3 : [], 74 | 4 : [], 75 | 5 : [] 76 | } 77 | 78 | for future_gap in future_gap_list: 79 | start_date = dates_dic[stock][0] 80 
| end_date = dates_dic[stock][1] 81 | normalized_metrics, _, df = lin_reg.final_test_linreg(stock, start_date, 82 | end_date, window, future_gap) 83 | linreg_future_gap_metrics[future_gap] = normalized_metrics 84 | plot_data(df, 'Future Gap = '+str(future_gap), "Date", "Price", show_plot=False) 85 | 86 | print(lstm_future_gap_metrics) 87 | print(ffnn_future_gap_metrics) 88 | print(linreg_future_gap_metrics) 89 | plt.show() -------------------------------------------------------------------------------- /machine_learning/final/experiments/exp5.py: -------------------------------------------------------------------------------- 1 | from utils.util import plot_data 2 | from machine_learning.final.evaluation.metrics import compute_lag_metric 3 | from machine_learning.final.models import lstm 4 | from machine_learning.final.models import ffnn 5 | from machine_learning.final.models import lin_reg 6 | from machine_learning.final.models import knn_reg 7 | from keras.callbacks import EarlyStopping 8 | import matplotlib.pyplot as plt 9 | 10 | #LSTM and LinReg PAL 11 | stock = 'AAPL' 12 | dates_dic = { 13 | 'AAPL' : ['2017-01-01', '2018-01-01'] 14 | } 15 | metrics_dic = { 16 | 'LSTM' : [], 17 | 'LinReg' : [] 18 | } 19 | 20 | window = 2 21 | future_gap = 1 22 | time_steps = 1 23 | neurons = [256, 256, 32, 1] 24 | drop_out = 0.2 25 | batch_size = 2048 26 | epochs = 300 27 | validation_split = 0.1 28 | verbose = 1 29 | early_stopping_callback = EarlyStopping(monitor='val_loss', min_delta=0, 30 | patience=50, verbose=verbose, mode='auto') 31 | callbacks = [early_stopping_callback] 32 | start_date = dates_dic[stock][0] 33 | end_date = dates_dic[stock][1] 34 | 35 | #LSTM 36 | normalized_metrics, inv_normalized_metrics, df = lstm.final_test_lstm(stock, start_date, 37 | end_date, window, future_gap, time_steps, neurons, drop_out, batch_size, epochs, validation_split, 38 | verbose, callbacks) 39 | metrics_dic['LSTM'] = normalized_metrics 40 | #PAL 41 | lookup = 5 42 | lag_list = compute_lag_metric(df['Actual'], df['Prediction'], lookup, stock) 43 | df = df[:len(df)-lookup+1] 44 | #Price Forecast Plot 45 | plot_data(df, stock+" 2017 Price Forecast (LSTM)", "Date", "Price", show_plot=False) 46 | #Price Forecast and PAL Overlay Plot 47 | ax = df.plot(title=stock+" 2017 Price Forecast and PAL Overlay") 48 | ax.set_xlabel("Date") 49 | ax.set_ylabel("Price") 50 | ax.legend(loc="best") 51 | ax.grid(True) 52 | ax1 = ax.twinx() 53 | ax1.scatter(df.index, lag_list, c='g') 54 | ax1.set_ylabel("PAL") 55 | 56 | #LinReg 57 | normalized_metrics, inv_normalized_metrics, df = lin_reg.final_test_linreg(stock, start_date, 58 | end_date, window, future_gap) 59 | metrics_dic['LinReg'] = normalized_metrics 60 | #PAL 61 | lookup = 5 62 | lag_list = compute_lag_metric(df['Actual'], df['Prediction'], lookup, stock) 63 | df = df[:len(df)-lookup+1] 64 | #Price Forecast Plot 65 | plot_data(df, stock+" 2017 Price Forecast (LinReg)", "Date", "Price", show_plot=False) 66 | #Price Forecast and PAL Overlay Plot 67 | ax = df.plot(title=stock+" 2017 Price Forecast and PAL Overlay") 68 | ax.set_xlabel("Date") 69 | ax.set_ylabel("Price") 70 | ax.legend(loc="best") 71 | ax.grid(True) 72 | ax1 = ax.twinx() 73 | ax1.scatter(df.index, lag_list, c='g') 74 | ax1.set_ylabel("PAL") 75 | 76 | print(metrics_dic) 77 | plt.show() -------------------------------------------------------------------------------- /machine_learning/final/models/ffnn.py: -------------------------------------------------------------------------------- 1 | from 
machine_learning.final.utils.dataset import bulid_TIs_dataset, dataset_split 2 | from machine_learning.final.evaluation.metrics import evaluate 3 | from keras.models import Sequential 4 | from keras.layers.core import Dense, Dropout 5 | from keras.optimizers import Adam 6 | 7 | def build_model(features, neurons, drop_out, decay=0.0): 8 | model = Sequential() 9 | 10 | model.add(Dense(neurons[0], input_dim=features, activation='relu',)) 11 | model.add(Dropout(drop_out)) 12 | 13 | model.add(Dense(neurons[1], activation='relu')) 14 | model.add(Dropout(drop_out)) 15 | 16 | model.add(Dense(neurons[2], activation='relu')) 17 | model.add(Dense(neurons[3], activation='linear')) 18 | 19 | adam = Adam(decay=decay) 20 | model.compile(loss='mse',optimizer=adam) 21 | model.summary() 22 | return model 23 | 24 | def model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, verbose, callbacks): 25 | 26 | history = model.fit( 27 | X_train, 28 | Y_train, 29 | batch_size = batch_size, 30 | epochs = epochs, 31 | validation_split = validation_split, 32 | verbose = verbose, 33 | callbacks = callbacks 34 | ) 35 | 36 | return history 37 | 38 | def final_test_ffnn(stock_symbol, start_date, end_date, window, future_gap, neurons, 39 | drop_out, batch_size, epochs, validation_split, verbose, callbacks): 40 | #building the dataset 41 | print("> building the dataset...") 42 | df_train, _ = bulid_TIs_dataset(stock_symbol, None, start_date, window) 43 | df_test, scaler = bulid_TIs_dataset(stock_symbol, start_date, end_date, window) 44 | #reshaping the dataset for FFNN 45 | print("\n> reshaping the dataset for FFNN...") 46 | ds_train = df_train.values 47 | ds_test = df_test.values 48 | X_train, Y_train = dataset_split(ds_train, future_gap, None) 49 | X_test, Y_test = dataset_split(ds_test, future_gap, None) 50 | #building the FFNN model 51 | print("\n> building the FFNN model...") 52 | features = X_train.shape[1] 53 | model = build_model(features, neurons, drop_out) 54 | #fitting the training data 55 | print("\n> fitting the training data...") 56 | model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, verbose, callbacks) 57 | #predictions 58 | print("\n> testing the model for predictions...") 59 | predictions = model.predict(X_test) 60 | #inverse-scaling 61 | print("\n> inverse-scaling the scaled values...") 62 | predictions = predictions.reshape((predictions.shape[0], 1)) 63 | predictions_inv_scaled = scaler.inverse_transform(predictions) 64 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 65 | Y_test_inv_scaled = scaler.inverse_transform(Y_test) 66 | #evaluation 67 | normalized_metrics, inv_normalized_metrics = evaluate(Y_test, predictions, 68 | Y_test_inv_scaled, predictions_inv_scaled) 69 | #grouping the actual prices and predictions 70 | print("\n> grouping the actual prices and predictions...") 71 | feature_cols = df_test.columns.tolist() 72 | feature_cols.remove("actual_price") 73 | df_test.drop(columns=feature_cols, inplace=True) 74 | df_test.rename(columns={"actual_price" : 'Actual'}, inplace=True) 75 | df_test = df_test.iloc[future_gap:] 76 | df_test['Actual'] = Y_test_inv_scaled 77 | df_test['Prediction'] = predictions_inv_scaled 78 | 79 | return normalized_metrics, inv_normalized_metrics, df_test -------------------------------------------------------------------------------- /machine_learning/final/models/knn_reg.py: -------------------------------------------------------------------------------- 1 | from machine_learning.final.utils.dataset import bulid_TIs_dataset, 
dataset_split
2 | import machine_learning.final.models.knn_wrapper as knn
3 | from machine_learning.final.evaluation.metrics import evaluate
4 | 
5 | def final_test_knnreg(stock_symbol, start_date, end_date, window, future_gap, k):
6 |     #building the dataset
7 |     print("> building the dataset...")
8 |     df_train, _ = bulid_TIs_dataset(stock_symbol, None, start_date, window)
9 |     df_test, scaler = bulid_TIs_dataset(stock_symbol, start_date, end_date, window)
10 |     #reshaping the dataset for kNN
11 |     print("\n> reshaping the dataset for kNN...")
12 |     ds_train = df_train.values
13 |     ds_test = df_test.values
14 |     X_train, Y_train = dataset_split(ds_train, future_gap, None)
15 |     X_test, Y_test = dataset_split(ds_test, future_gap, None)
16 |     #kNN model
17 |     model = knn.knn(k)
18 |     #fitting the training data
19 |     model.train(X_train, Y_train)
20 |     #predictions
21 |     predictions = model.query(X_test, normalize=False, addDiff=False)
22 |     #inverse-scaling
23 |     print("\n> inverse-scaling the scaled values...")
24 |     predictions = predictions.reshape((predictions.shape[0], 1))
25 |     predictions_inv_scaled = scaler.inverse_transform(predictions)
26 |     Y_test = Y_test.reshape((Y_test.shape[0], 1))
27 |     Y_test_inv_scaled = scaler.inverse_transform(Y_test)
28 |     #evaluation
29 |     normalized_metrics, inv_normalized_metrics = evaluate(Y_test, predictions, 
30 |     Y_test_inv_scaled, predictions_inv_scaled)
31 |     #grouping the actual prices and predictions
32 |     print("\n> grouping the actual prices and predictions...")
33 |     feature_cols = df_test.columns.tolist()
34 |     feature_cols.remove("actual_price")
35 |     df_test.drop(columns=feature_cols, inplace=True)
36 |     df_test.rename(columns={"actual_price" : 'Actual'}, inplace=True)
37 |     df_test = df_test.iloc[future_gap:]
38 |     df_test['Actual'] = Y_test_inv_scaled
39 |     df_test['Prediction'] = predictions_inv_scaled
40 | 
41 |     return normalized_metrics, inv_normalized_metrics, df_test
--------------------------------------------------------------------------------
/machine_learning/final/models/knn_wrapper.py:
--------------------------------------------------------------------------------
1 | ''' this file contains an implementation of kNN regression
2 | '''
3 | import numpy as np
4 | 
5 | '''kNN wrapper class
6 | 
7 | *k : k nearest neighbors to be considered
8 | *dataset : training dataset including the features and the output
9 | '''
10 | class knn:
11 |     __k = 0
12 |     __dataset = None
13 | 
14 |     '''constructor function
15 | 
16 |     *k : k nearest neighbors to be considered
17 |     '''
18 |     def __init__(self, k):
19 |         self.__k = k
20 | 
21 |     '''training function
22 | 
23 |     *data_x : training dataset features
24 |     *data_y : training dataset output
25 |     '''
26 |     def train(self, data_x, data_y):
27 |         data_y_reshaped = data_y.reshape((data_y.shape[0], 1))
28 |         self.__dataset = np.concatenate((data_x, data_y_reshaped), axis=1)
29 | 
30 |     '''querying/evaluating function
31 | 
32 |     *features : test dataset features
33 |     '''
34 |     def query(self, features, normalize=True, addDiff=True):
35 |         dataset_price_normed = self.__dataset[:, 0]
36 |         features_price_normed = features[:, 0]
37 | 
38 |         if normalize:
39 |             dataset_price_normed = (self.__dataset[:, 0]/self.__dataset[0, 0]) - 1
40 |             features_price_normed = (features[:, 0]/features[0, 0]) - 1
41 | 
42 |         cumm_difference = np.zeros(features.shape[0])
43 |         predicted_price = np.zeros(features.shape[0])
44 | 
45 |         for i in range(0, features.shape[0]):
46 | 
47 |             price_normed_diff = np.absolute(dataset_price_normed - features_price_normed[i])
48 |             moment_diff = np.absolute(self.__dataset[:, 1] - features[i, 1])
49 |             sma_diff = np.absolute(self.__dataset[:, 2] - features[i, 2])
50 |             b_band_diff = np.absolute(self.__dataset[:, 3] - features[i, 3])
51 |             std_diff = np.absolute(self.__dataset[:, 4] - features[i, 4])
52 |             vroc_diff = np.absolute(self.__dataset[:, 5] - features[i, 5])
53 | 
54 |             cumm_difference = price_normed_diff + moment_diff + sma_diff + b_band_diff + std_diff + vroc_diff
55 |             difference_op = np.asarray([cumm_difference, self.__dataset[:, -1]]).T
56 |             sorting_index = np.argsort(difference_op[:, 0])
57 |             difference_sorted = difference_op[sorting_index]
58 | 
59 |             k_mean = np.mean(difference_sorted[:self.__k, 1])
60 |             predicted_price[i] = k_mean
61 | 
62 |         if addDiff:
63 |             predicted_price += (features[0, 0] - self.__dataset[0, 0])
64 |         return predicted_price
--------------------------------------------------------------------------------
/machine_learning/final/models/lin_reg.py:
--------------------------------------------------------------------------------
1 | from machine_learning.final.utils.dataset import bulid_TIs_dataset
2 | from machine_learning.final.evaluation.metrics import evaluate
3 | import numpy as np
4 | import scipy.optimize as spo
5 | 
6 | '''computes and returns the root mean squared error
7 | 
8 | *x : a dynamic variable: (value, array, ...)
9 | *y : a dynamic variable: (value, array, ...)
10 | '''
11 | def calculate_rmse(x, y):
12 |     #squared error
13 |     se = (x-y) ** 2
14 |     #mean squared error
15 |     mse = np.mean(se)
16 |     #root mean squared error
17 |     rmse = mse ** 0.5
18 |     return rmse
19 | 
20 | '''given the fitted line coefficients and the dataset, this
21 | function computes the rmse between the actual values and
22 | the predicted values of the linear regression
23 | 
24 | *coefficients : fitted line coefficients array
25 | *data : dataset containing the features and the output
26 | '''
27 | def error_fun(coefficients, data):
28 |     price = coefficients[0]*data[:, 0]
29 |     moment = coefficients[1]*data[:, 1]
30 |     sma = coefficients[2]*data[:, 2]
31 |     b_band = coefficients[3]*data[:, 3]
32 |     std = coefficients[4]*data[:, 4]
33 |     vroc = coefficients[5]*data[:, 5]
34 |     constant = coefficients[6]
35 |     predicted_values = price+moment+sma+b_band+std+vroc+constant
36 |     actual_values = data[:, -1]
37 |     rmse = calculate_rmse(predicted_values, actual_values)
38 |     return rmse
39 | 
40 | '''given the data to be passed to the error fcn, this function
41 | computes an initial guess of the coefficients and uses SciPy's
42 | minimize fcn and the error fcn to find the optimal coefficients
43 | 
44 | *data : dataset containing the features and the output
45 | *err_fun : error function to be minimized by SciPy's minimizer
46 | '''
47 | def minimize_new_err_fun(data, err_fun):
48 |     price = np.mean(data[:, 0])
49 |     moment = np.mean(data[:, 1])
50 |     sma = np.mean(data[:, 2])
51 |     b_band = np.mean(data[:, 3])
52 |     std = np.mean(data[:, 4])
53 |     vroc = np.mean(data[:, 5])
54 |     constant = 0
55 |     coefficients_guess = [price, moment, sma, b_band, std, vroc, constant]
56 |     result = spo.minimize(err_fun, coefficients_guess, args=(data, ), method="SLSQP", options= {'disp' : True})
57 |     return result.x
58 | 
59 | def dataset_reshape(dataset, future_gap, split):
60 |     print("Dataset Shape:", dataset.shape)
61 |     X = dataset[:, :-1]
62 |     Y = dataset[:, -1]
63 |     print("X Shape:", X.shape)
64 |     print("Y Shape:", Y.shape)
65 | 
66 |     print("Applying Future Gap...")
67 |     X = X[:-future_gap]
68 |     Y = Y[future_gap:]
69 |     print("X Shape:", X.shape)
70 |     print("Y Shape:", Y.shape)
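    # alignment example: with future_gap = 5 (one trading week), feature row
    # X[t] is paired with target Y[t+5], so the indicators observed on day t
    # are used to predict the (scaled) adjusted closing price 5 trading days later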
71 | 72 | if split != None: 73 | print("Applying training, testing split...") 74 | split_index = int(split*X.shape[0]) 75 | X_train = X[:split_index] 76 | X_test = X[split_index:] 77 | Y_train = Y[:split_index] 78 | Y_test = Y[split_index:] 79 | print("(X_train, Y_train, X_test, Y_test) Shapes:") 80 | print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape) 81 | return X_train, Y_train, X_test, Y_test 82 | 83 | return X, Y 84 | 85 | def final_test_linreg(stock_symbol, start_date, end_date, window, future_gap): 86 | #building the dataset 87 | print("> building the dataset...") 88 | df_train, _ = bulid_TIs_dataset(stock_symbol, None, start_date, window) 89 | df_test, scaler = bulid_TIs_dataset(stock_symbol, start_date, end_date, window) 90 | #reshaping the dataset for LinReg 91 | print("\n> reshaping the dataset for LinReg...") 92 | ds_train = df_train.values 93 | ds_test = df_test.values 94 | X_train, Y_train = dataset_reshape(ds_train, future_gap, None) 95 | X_test, Y_test = dataset_reshape(ds_test, future_gap, None) 96 | #fitting the training data 97 | print("\n> fitting the training data...") 98 | Y_train = Y_train.reshape((Y_train.shape[0], 1)) 99 | training_set = np.concatenate((X_train, Y_train), axis=1) 100 | fitted_line_coefficients = minimize_new_err_fun(training_set, error_fun) 101 | print("Line Coefficients:", fitted_line_coefficients) 102 | #predictions 103 | price = fitted_line_coefficients[0]*X_test[:, 0] 104 | moment = fitted_line_coefficients[1]*X_test[:, 1] 105 | sma = fitted_line_coefficients[2]*X_test[:, 2] 106 | b_band = fitted_line_coefficients[3]*X_test[:, 3] 107 | std = fitted_line_coefficients[4]*X_test[:, 4] 108 | vroc = fitted_line_coefficients[5]*X_test[:, 5] 109 | constant = fitted_line_coefficients[6] 110 | predictions = price+moment+sma+b_band+std+vroc+constant 111 | #inverse-scaling 112 | print("\n> inverse-scaling the scaled values...") 113 | predictions = predictions.reshape((predictions.shape[0], 1)) 114 | predictions_inv_scaled = scaler.inverse_transform(predictions) 115 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 116 | Y_test_inv_scaled = scaler.inverse_transform(Y_test) 117 | #evaluation 118 | normalized_metrics, inv_normalized_metrics = evaluate(Y_test, predictions, 119 | Y_test_inv_scaled, predictions_inv_scaled) 120 | #grouping the actual prices and predictions 121 | print("\n> grouping the actual prices and predictions...") 122 | feature_cols = df_test.columns.tolist() 123 | feature_cols.remove("actual_price") 124 | df_test.drop(columns=feature_cols, inplace=True) 125 | df_test.rename(columns={"actual_price" : 'Actual'}, inplace=True) 126 | df_test = df_test.iloc[future_gap:] 127 | df_test['Actual'] = Y_test_inv_scaled 128 | df_test['Prediction'] = predictions_inv_scaled 129 | 130 | return normalized_metrics, inv_normalized_metrics, df_test -------------------------------------------------------------------------------- /machine_learning/final/models/lstm.py: -------------------------------------------------------------------------------- 1 | from machine_learning.final.utils.dataset import bulid_TIs_dataset 2 | from machine_learning.final.evaluation.metrics import evaluate 3 | import numpy as np 4 | from keras.models import Sequential 5 | from keras.layers.core import Dense, Dropout 6 | from keras.layers.recurrent import LSTM 7 | from keras.optimizers import Adam 8 | 9 | def lstm_dataset_reshape(dataset, time_steps, future_gap, split): 10 | print("Dataset Shape:", dataset.shape) 11 | X = dataset[:, :-1] 12 | Y = dataset[:, -1] 13 | 
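    # the last column of the dataset is the target (actual_price); the other
    # columns are the feature vector. The sampling below stacks time_steps
    # consecutive feature rows per sample, so with time_steps=1 and
    # future_gap=1 the features of day t predict the price of day t+1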
print("X Shape:", X.shape) 14 | print("Y Shape:", Y.shape) 15 | 16 | X_sampled = [] 17 | for i in range(X.shape[0] - time_steps + 1): 18 | X_sampled.append(X[i : i+time_steps]) 19 | X_sampled = np.array(X_sampled) 20 | print("Sampled X Shape:", X_sampled.shape) 21 | 22 | future_gap_index = future_gap - 1 23 | X_sampled = X_sampled[:-future_gap] 24 | Y_sampled = Y[time_steps+future_gap_index: ] 25 | print("Applying Future Gap...") 26 | print("Sampled X Shape:", X_sampled.shape) 27 | print("Sampled Y Shape:", Y_sampled.shape) 28 | 29 | if split != None: 30 | split_index = int(split*X_sampled.shape[0]) 31 | X_train = X_sampled[:split_index] 32 | X_test = X_sampled[split_index:] 33 | Y_train = Y_sampled[:split_index] 34 | Y_test = Y_sampled[split_index:] 35 | print("(X_train, Y_train, X_test, Y_test) Shapes:") 36 | print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape) 37 | return X_train, Y_train, X_test, Y_test 38 | 39 | return X_sampled, Y_sampled 40 | 41 | def build_model(time_steps, features, neurons, drop_out, decay=0.0): 42 | model = Sequential() 43 | 44 | model.add(LSTM(neurons[0], input_shape=(time_steps, features), return_sequences=True)) 45 | model.add(Dropout(drop_out)) 46 | 47 | model.add(LSTM(neurons[1], input_shape=(time_steps, features), return_sequences=False)) 48 | model.add(Dropout(drop_out)) 49 | 50 | model.add(Dense(neurons[2], activation='relu')) 51 | model.add(Dense(neurons[3], activation='linear')) 52 | 53 | adam = Adam(decay=decay) 54 | model.compile(loss='mse',optimizer=adam) 55 | model.summary() 56 | return model 57 | 58 | def model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, verbose, callbacks): 59 | 60 | history = model.fit( 61 | X_train, 62 | Y_train, 63 | batch_size = batch_size, 64 | epochs = epochs, 65 | validation_split = validation_split, 66 | verbose = verbose, 67 | callbacks = callbacks 68 | ) 69 | 70 | return history 71 | 72 | def final_test_lstm(stock_symbol, start_date, end_date, window, future_gap, time_steps, 73 | neurons, drop_out, batch_size, epochs, validation_split, verbose, callbacks): 74 | #building the dataset 75 | print("> building the dataset...") 76 | df_train, _ = bulid_TIs_dataset(stock_symbol, None, start_date, window) 77 | df_test, scaler = bulid_TIs_dataset(stock_symbol, start_date, end_date, window) 78 | #reshaping the dataset for LSTM 79 | print("\n> reshaping the dataset for LSTM...") 80 | ds_train = df_train.values 81 | ds_test = df_test.values 82 | X_train, Y_train = lstm_dataset_reshape(ds_train, time_steps, future_gap, None) 83 | X_test, Y_test = lstm_dataset_reshape(ds_test, time_steps, future_gap, None) 84 | #building the LSTM model 85 | print("\n> building the LSTM model...") 86 | features = X_train.shape[2] 87 | model = build_model(time_steps, features, neurons, drop_out) 88 | #fitting the training data 89 | print("\n> fitting the training data...") 90 | model_fit(model, X_train, Y_train, batch_size, epochs, validation_split, verbose, callbacks) 91 | #predictions 92 | print("\n> testing the model for predictions...") 93 | predictions = model.predict(X_test) 94 | #inverse-scaling 95 | print("\n> inverse-scaling the scaled values...") 96 | predictions = predictions.reshape((predictions.shape[0], 1)) 97 | predictions_inv_scaled = scaler.inverse_transform(predictions) 98 | Y_test = Y_test.reshape((Y_test.shape[0], 1)) 99 | Y_test_inv_scaled = scaler.inverse_transform(Y_test) 100 | #evaluation 101 | normalized_metrics, inv_normalized_metrics = evaluate(Y_test, predictions, 102 | Y_test_inv_scaled, 
predictions_inv_scaled) 103 | #grouping the actual prices and predictions 104 | print("\n> grouping the actual prices and predictions...") 105 | feature_cols = df_test.columns.tolist() 106 | feature_cols.remove("actual_price") 107 | df_test.drop(columns=feature_cols, inplace=True) 108 | df_test.rename(columns={"actual_price" : 'Actual'}, inplace=True) 109 | df_test = df_test.iloc[time_steps+future_gap-1:] 110 | df_test['Actual'] = Y_test_inv_scaled 111 | df_test['Prediction'] = predictions_inv_scaled 112 | 113 | return normalized_metrics, inv_normalized_metrics, df_test -------------------------------------------------------------------------------- /machine_learning/final/utils/dataset.py: -------------------------------------------------------------------------------- 1 | from utils.util import get_stock_data 2 | import numpy as np 3 | from sklearn.preprocessing import MinMaxScaler 4 | 5 | '''technical indicators computation functions 6 | 7 | *prices : adjusted closing stock prices 8 | *window : rolling statistics window 9 | ''' 10 | #BEGIN 11 | def compute_momentum_ratio(prices, window): 12 | #first window elements >> NA 13 | momentum_ratio = (prices/prices.shift(periods = 1)) - 1 14 | return momentum_ratio 15 | 16 | def compute_sma_ratio(prices, window): 17 | #Simple Moving Average 18 | #first window-1 elements >> NA 19 | sma_ratio = (prices / prices.rolling(window = window).mean()) - 1 20 | return sma_ratio 21 | 22 | def compute_bollinger_bands_ratio(prices, window): 23 | #first window-1 elements >> NA 24 | bb_ratio = prices - prices.rolling(window = window).mean() 25 | bb_ratio = bb_ratio / (2 * prices.rolling(window = window).std()) 26 | return bb_ratio 27 | 28 | def compute_volatility_ratio(prices, window): 29 | #first window-1 elements >> NA 30 | volatility_ratio = ((prices/prices.shift(periods = 1)) - 1).rolling(window = window).std() 31 | return volatility_ratio 32 | 33 | def compute_vroc_ratio(volume, window): 34 | #Volume Rate of Change 35 | #first window-1 elements >> NA 36 | vroc_ratio = (volume/volume.shift(periods = window)) - 1 37 | return vroc_ratio 38 | #END 39 | 40 | def bulid_TIs_dataset(stock_symbol, start_date, end_date, window, normalize=True): 41 | cols = ["Date", "Adj Close", "Volume"] 42 | df = get_stock_data(stock_symbol, start_date, end_date, cols) 43 | df.rename(columns={"Adj Close" : 'price'}, inplace=True) 44 | df['momentum'] = compute_momentum_ratio(df['price'], window) 45 | df['sma'] = compute_sma_ratio(df['price'], window) 46 | df['bolinger_band'] = compute_bollinger_bands_ratio(df['price'], window) 47 | df['volatility'] = compute_volatility_ratio(df['price'], window) 48 | df['vroc'] = compute_vroc_ratio(df['Volume'], window) 49 | df['actual_price'] = df['price'] 50 | df.drop(columns=["Volume"], inplace=True) 51 | df = df[window:] 52 | df.replace([np.inf, -np.inf], np.nan, inplace=True) 53 | df.fillna(method='ffill', inplace=True) 54 | df.fillna(method='bfill', inplace=True) 55 | scaler = None 56 | 57 | if normalize: 58 | scaler = MinMaxScaler() 59 | df['price'] = scaler.fit_transform(df['price'].values.reshape(-1,1)) 60 | df['momentum'] = scaler.fit_transform(df['momentum'].values.reshape(-1,1)) 61 | df['sma'] = scaler.fit_transform(df['sma'].values.reshape(-1,1)) 62 | df['bolinger_band'] = scaler.fit_transform(df['bolinger_band'].values.reshape(-1,1)) 63 | df['volatility'] = scaler.fit_transform(df['volatility'].values.reshape(-1,1)) 64 | df['vroc'] = scaler.fit_transform(df['vroc'].values.reshape(-1,1)) 65 | df['actual_price'] = 
scaler.fit_transform(df['actual_price'].values.reshape(-1,1))
66 | 
67 |     print(df.head())
68 |     print(df.tail())
69 |     return df, scaler
70 | 
71 | def dataset_split(dataset, future_gap, split):
72 |     print("Dataset Shape:", dataset.shape)
73 |     X = dataset[:, :-1]
74 |     Y = dataset[:, -1]
75 |     print("X Shape:", X.shape)
76 |     print("Y Shape:", Y.shape)
77 | 
78 |     print("Applying Future Gap...")
79 |     X = X[:-future_gap]
80 |     Y = Y[future_gap:]
81 |     print("X Shape:", X.shape)
82 |     print("Y Shape:", Y.shape)
83 | 
84 |     if split != None:
85 |         print("Applying training, testing split...")
86 |         split_index = int(split*X.shape[0])
87 |         X_train = X[:split_index]
88 |         X_test = X[split_index:]
89 |         Y_train = Y[:split_index]
90 |         Y_test = Y[split_index:]
91 |         print("(X_train, Y_train, X_test, Y_test) Shapes:")
92 |         print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
93 |         return X_train, Y_train, X_test, Y_test
94 | 
95 |     return X, Y
--------------------------------------------------------------------------------
/machine_learning/readme.md:
--------------------------------------------------------------------------------
1 | This directory contains projects applying different machine learning algorithms on historical data of different companies to predict stock prices. The development directory is where the models were developed. It includes initial try-outs, improvements, and multiple
2 | changes. The final directory includes the finalized versions of the models, the evaluation metrics, and the final experiments.
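For orientation, a minimal sketch of driving one finalized model end to end, mirroring `machine_learning/final/experiments/exp1.py` (it assumes the repository root is on `PYTHONPATH` and the historical CSVs are in `resources/historical_data`):

```python
from machine_learning.final.models import lstm
from keras.callbacks import EarlyStopping

early_stop = EarlyStopping(monitor='val_loss', patience=50, mode='auto')
normalized_metrics, inv_normalized_metrics, df = lstm.final_test_lstm(
    'AAPL', '2017-01-01', '2018-01-01',    # symbol, test start, test end
    window=2, future_gap=1, time_steps=1,  # dataset/reshaping parameters
    neurons=[256, 256, 32, 1], drop_out=0.2,
    batch_size=2048, epochs=300, validation_split=0.1,
    verbose=1, callbacks=[early_stop])
print(normalized_metrics)  # [RMSE, NRMSE, MAE, MAPE, Corr, R^2]
```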
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/optimized_ffnn.png -------------------------------------------------------------------------------- /results/optimized_lstm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/optimized_lstm.png -------------------------------------------------------------------------------- /results/stable.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/stable.png -------------------------------------------------------------------------------- /results/stable_lag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/stable_lag.png -------------------------------------------------------------------------------- /results/sudden_vs_normal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/sudden_vs_normal.png -------------------------------------------------------------------------------- /results/sudden_vs_normal_daily_lag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/sudden_vs_normal_daily_lag.png -------------------------------------------------------------------------------- /results/sudden_vs_normal_lag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/sudden_vs_normal_lag.png -------------------------------------------------------------------------------- /results/sudden_vs_normal_pal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/sudden_vs_normal_pal.png -------------------------------------------------------------------------------- /results/sudden_vs_normal_pal_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/sudden_vs_normal_pal_1.png -------------------------------------------------------------------------------- /results/tesla.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/tesla.png -------------------------------------------------------------------------------- /results/volatile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/volatile.png -------------------------------------------------------------------------------- /results/volatile_lag.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/volatile_lag.png
--------------------------------------------------------------------------------
/results/window_test_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/window_test_1.png
--------------------------------------------------------------------------------
/results/window_test_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/results/window_test_2.png
--------------------------------------------------------------------------------
/statistics_and_optimization/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/statistics_and_optimization/__init__.py
--------------------------------------------------------------------------------
/statistics_and_optimization/bollinger_bands.py:
--------------------------------------------------------------------------------
1 | ''' this file calculates and visualizes a company's stock
2 | price Bollinger Bands, which can serve as a trading strategy
3 | '''
4 | from utils.util import get_data, plot_data
5 | 
6 | '''given the rolling mean and std, calculate the upper and
7 | lower Bollinger Bands (two standard deviations from the mean)
8 | 
9 | *mean : the rolling mean of a stock price
10 | *std : the rolling standard deviation of a stock price
11 | '''
12 | def get_bollinger_bands(mean, std):
13 |     upper_band = mean + (2*std)
14 |     lower_band = mean - (2*std)
15 |     return upper_band, lower_band
16 | 
17 | '''a tester function
18 | '''
19 | def main():
20 |     start_date = "01/01/2017"
21 |     end_date = "31/12/2017"
22 |     symbols = ["FB"]
23 |     stock_symbol = "FB"
24 |     df = get_data(symbols, start_date, end_date, include_SPY=False)
25 |     print(df.head())
26 |     print(df.tail())
27 | 
28 |     window = 20
29 |     rolling_mean = df[stock_symbol].rolling(window=window).mean()
30 |     rolling_std = df[stock_symbol].rolling(window=window).std()
31 |     df["Rolling Mean"] = rolling_mean
32 |     df["Upper Bollinger Band"], df["Lower Bollinger Band"] = get_bollinger_bands(rolling_mean, rolling_std)
33 |     plot_data(df, stock_symbol+" Bollinger Bands", "Date", "Price")
34 | 
35 | '''to ensure running the tester function only when this file is run, not imported
36 | '''
37 | if __name__ == "__main__":
38 |     main()
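39 | 
40 | '''a minimal strategy sketch (an assumption, not part of the original file):
41 | a classic Bollinger Band signal sells when the price crosses back below the
42 | upper band and buys when it crosses back above the lower band
43 | 
44 | *price : the stock price series
45 | *upper_band, lower_band : the band series returned by get_bollinger_bands
46 | '''
47 | def get_bollinger_signals(price, upper_band, lower_band):
48 |     sell = (price.shift(1) > upper_band.shift(1)) & (price <= upper_band)
49 |     buy = (price.shift(1) < lower_band.shift(1)) & (price >= lower_band)
50 |     return buy, sell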
--------------------------------------------------------------------------------
/statistics_and_optimization/portfolio_optimization.py:
--------------------------------------------------------------------------------
1 | ''' this file finds the optimal portfolio allocation to maximize a
2 | chosen portfolio statistic
3 | '''
4 | from utils.util import get_data, plot_data
5 | from statistics_and_optimization.portfolio_statistics import compute_daily_portfolio_value, compute_portfolio_statistics
6 | import pandas as pd
7 | import numpy as np
8 | import scipy.optimize as spo
9 | 
10 | '''this function returns a portfolio statistic to be maximized;
11 | the value is multiplied by negative one because it will be
12 | passed to a minimizer in the compute_optimal_allocations function
13 | 
14 | *allocations : given allocations to a portfolio
15 | *df_portfolio : the portfolio dataframe
16 | '''
17 | def portfolio_statistic(allocations, df_portfolio):
18 |     #Daily Portfolio Value
19 |     daily_portfolio_value = compute_daily_portfolio_value(df_portfolio, 1, allocations)
20 | 
21 |     #Portfolio Statistics
22 |     cummulative_portfolio_return, _, _, _ = compute_portfolio_statistics(daily_portfolio_value)
23 | 
24 |     return -1*cummulative_portfolio_return
25 | 
26 | '''this function uses SciPy's minimizer with the portfolio_statistic
27 | function to minimize the negated portfolio statistic, thus maximizing it;
28 | it returns the optimal allocations for maximizing the statistic
29 | 
30 | *dataframe : the portfolio dataframe
31 | '''
32 | def compute_optimal_allocations(dataframe):
33 |     guess = 1.0/dataframe.shape[1]
34 |     allocations_guess = [guess] * dataframe.shape[1]
35 |     bounds = [[0, 1]] * dataframe.shape[1]
36 |     constraints = {
37 |         'type':'eq',
38 |         'fun': lambda allocations_guess : 1.0 - np.sum(allocations_guess) #equality constraint: allocations must sum to 1
39 |     }
40 |     minimum = spo.minimize(portfolio_statistic, allocations_guess, args=(dataframe,),
41 |                            method="SLSQP", bounds=bounds, constraints=constraints,
42 |                            options={'disp':True})
43 |     return minimum.x
44 | 
45 | '''a tester function
46 | '''
47 | def main():
48 |     symbols = ["AAPL", "FB", "GOOG", "SPY"]
49 |     start_date = "01/01/2017"
50 |     end_date = "31/12/2017"
51 | 
52 |     #Portfolio and SPY Dataframes
53 |     df_portfolio = get_data(symbols, start_date, end_date)
54 |     df_SPY = df_portfolio.loc[:, "SPY"]
55 |     df_SPY = df_SPY/df_SPY.iloc[0]
56 | 
57 |     #Optimized Allocations
58 |     optimized_allocations = compute_optimal_allocations(df_portfolio)
59 |     optimized_portfolio = compute_daily_portfolio_value(df_portfolio, 100000, optimized_allocations)
60 |     optimized_portfolio = optimized_portfolio/optimized_portfolio.iloc[0]
61 | 
62 |     #Default Allocations
63 |     default_allocations = [0.25, 0.25, 0.25, 0.25]
64 |     default_portfolio = compute_daily_portfolio_value(df_portfolio, 100000, default_allocations)
65 |     default_portfolio = default_portfolio/default_portfolio.iloc[0]
66 | 
67 |     df_comparison = pd.concat([optimized_portfolio, default_portfolio, df_SPY],
68 |                               keys=["Optimized Portfolio", "Default Portfolio", "S&P500"], axis=1)
69 | 
70 |     plot_data(df_comparison, "Portfolio Optimization", "Date", "Price")
71 | 
72 | '''to ensure running the tester function only when this file is run, not imported
73 | '''
74 | if __name__ == "__main__":
75 |     main()
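76 | 
77 | '''a minimal sketch (an assumption, not part of the original file): the same
78 | minimizer can maximize the Sharpe ratio instead of the cumulative return by
79 | passing this function to spo.minimize in place of portfolio_statistic
80 | 
81 | *allocations : given allocations to a portfolio
82 | *df_portfolio : the portfolio dataframe
83 | '''
84 | def sharpe_statistic(allocations, df_portfolio):
85 |     daily_portfolio_value = compute_daily_portfolio_value(df_portfolio, 1, allocations)
86 |     _, _, _, daily_sampled_sharpe_ratio = compute_portfolio_statistics(daily_portfolio_value)
87 |     return -1*daily_sampled_sharpe_ratio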
--------------------------------------------------------------------------------
/statistics_and_optimization/portfolio_statistics.py:
--------------------------------------------------------------------------------
1 | ''' this file constructs a portfolio and computes some
2 | portfolio statistics
3 | '''
4 | import pandas as pd
5 | from utils.util import get_data, plot_data
6 | 
7 | '''this helper function computes the daily value
8 | of a portfolio
9 | 
10 | *df : dataframe containing the stocks to be included
11 | *capital : starting portfolio capital
12 | *allocations : allocations to the chosen stocks
13 | '''
14 | def compute_daily_portfolio_value(df, capital, allocations):
15 |     #normalization
16 |     normalized = df/df.iloc[0, :]
17 |     #allocation
18 |     allocated = normalized*allocations
19 |     #capital/position value
20 |     pos_val = allocated*capital
21 |     #portfolio value
22 |     port_val = pos_val.sum(axis=1)
23 |     return port_val
24 | 
25 | '''five helper functions, each computing and returning a portfolio statistic
26 | '''
27 | #BEGIN
28 | def compute_daily_portfolio_return(daily_portfolio_value):
29 |     return daily_portfolio_value[1:] / daily_portfolio_value[:-1].values - 1
30 | 
31 | def compute_cummulative_portfolio_return(daily_portfolio_value):
32 |     return daily_portfolio_value.iloc[-1] / daily_portfolio_value.iloc[0] - 1
33 | 
34 | def compute_mean_daily_portfolio_return(daily_portfolio_return):
35 |     return daily_portfolio_return.mean()
36 | 
37 | def compute_std_daily_portfolio_return(daily_portfolio_return):
38 |     return daily_portfolio_return.std()
39 | 
40 | def compute_daily_sampled_sharpe_ratio(mean_daily_portfolio_return, std_daily_portfolio_return):
41 |     return (252**0.5) * mean_daily_portfolio_return/std_daily_portfolio_return #sqrt(252) annualizes the daily-sampled ratio (252 trading days per year)
42 | #END
43 | 
44 | '''this helper function wraps all the helper functions that compute
45 | the daily statistics of a portfolio and returns all the statistics
46 | 
47 | *daily_portfolio_value : dataframe containing the daily values of a portfolio
48 | '''
49 | def compute_portfolio_statistics(daily_portfolio_value):
50 |     daily_portfolio_return = compute_daily_portfolio_return(daily_portfolio_value)
51 |     cummulative_portfolio_return = compute_cummulative_portfolio_return(daily_portfolio_value)
52 |     mean_daily_portfolio_return = compute_mean_daily_portfolio_return(daily_portfolio_return)
53 |     std_daily_portfolio_return = compute_std_daily_portfolio_return(daily_portfolio_return)
54 |     daily_sampled_sharpe_ratio = compute_daily_sampled_sharpe_ratio(mean_daily_portfolio_return, std_daily_portfolio_return)
55 | 
56 |     return cummulative_portfolio_return, mean_daily_portfolio_return, std_daily_portfolio_return, daily_sampled_sharpe_ratio
57 | 
58 | '''a tester function
59 | '''
60 | def main():
61 |     capital = 100000
62 |     symbols = ["AAPL", "FB", "GOOG", "SPY"]
63 |     allocations = [0.25, 0.25, 0.25, 0.25]
64 |     start_date = "01/01/2017"
65 |     end_date = "31/12/2017"
66 | 
67 |     #Portfolio Dataframe
68 |     df_portfolio = get_data(symbols, start_date, end_date)
69 |     df_SPY = df_portfolio.loc[:, "SPY"]
70 | 
71 |     #Daily Portfolio Value
72 |     daily_portfolio_value = compute_daily_portfolio_value(df_portfolio, capital, allocations)
73 |     print(daily_portfolio_value.head())
74 | 
75 |     #Daily Portfolio Return
76 |     daily_portfolio_return = compute_daily_portfolio_return(daily_portfolio_value)
77 | 
78 |     #Cumulative Portfolio Return
79 |     cummulative_portfolio_return = compute_cummulative_portfolio_return(daily_portfolio_value)
80 |     print("Cumulative Portfolio Return:", cummulative_portfolio_return)
81 | 
82 |     #Daily Portfolio Return Mean
83 |     mean_daily_portfolio_return = compute_mean_daily_portfolio_return(daily_portfolio_return)
84 |     print("Daily Portfolio Return Mean:", mean_daily_portfolio_return)
85 | 
86 |     #Daily Portfolio Return Standard Deviation
87 |     std_daily_portfolio_return = compute_std_daily_portfolio_return(daily_portfolio_return)
88 |     print("Daily Portfolio Return Standard Deviation:", std_daily_portfolio_return)
89 | 
90 |     #Daily Sampled Sharpe Ratio
91 |     daily_sampled_sharpe_ratio = compute_daily_sampled_sharpe_ratio(mean_daily_portfolio_return, std_daily_portfolio_return)
92 |     print("Daily Sampled Sharpe Ratio:", daily_sampled_sharpe_ratio)
93 | 
94 |     #Comparing the portfolio and the S&P500
95 |     daily_portfolio_value_normalized = daily_portfolio_value/daily_portfolio_value.iloc[0]
96 |     df_SPY_normalized = df_SPY/df_SPY.iloc[0]
97 |     df_comparison = pd.concat([daily_portfolio_value_normalized, df_SPY_normalized], keys=["Portfolio", "SPY"], axis=1)
98 |     plot_data(df_comparison, "Portfolio 2017 Normalized Price", "Date", "Price")
99 | 
100 | '''to ensure running the tester function only when this file is run, not imported
101 | '''
102 | if __name__ == "__main__":
103 |     main()
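104 | 
105 | '''a minimal sketch (an assumption, not part of the original file): the Sharpe
106 | ratio above implicitly assumes a zero risk-free rate; with a nonzero daily
107 | risk-free rate, subtract it from the daily returns before annualizing
108 | 
109 | *daily_portfolio_return : the daily returns of a portfolio
110 | *daily_risk_free : the daily risk-free rate of return
111 | '''
112 | def compute_sharpe_ratio_with_risk_free(daily_portfolio_return, daily_risk_free=0.0):
113 |     excess_return = daily_portfolio_return - daily_risk_free
114 |     return (252**0.5) * excess_return.mean()/excess_return.std()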
--------------------------------------------------------------------------------
/statistics_and_optimization/readme.md:
--------------------------------------------------------------------------------
1 | This directory contains files that compute statistics and visualize data.
2 | It also includes files for portfolio management, statistics, and optimization.
--------------------------------------------------------------------------------
/utils/README.md:
--------------------------------------------------------------------------------
1 | This directory contains utility files with the basic, essential functions used throughout the project.
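2 | 
3 | A minimal usage sketch (an assumption, not part of the original file); the symbols below correspond to CSVs in resources/historical_data:
4 | 
5 | ```python
6 | from utils.util import get_data, plot_data
7 | 
8 | #adjusted closing prices for the chosen symbols over 2017 (SPY is included by default)
9 | df = get_data(["GOOG", "AAPL"], "01/01/2017", "31/12/2017")
10 | plot_data(df, "Selected Stock Prices", "Date", "Price")
11 | ```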
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahmedhamdi96/ML4T/e91b3678864d6f37a0b9545cee50d74d5de18a59/utils/__init__.py
--------------------------------------------------------------------------------
/utils/util.py:
--------------------------------------------------------------------------------
1 | ''' this file contains utility functions, used in most files
2 | of this project, to read and plot historical
3 | stock data
4 | '''
5 | import os
6 | import pandas as pd
7 | import matplotlib.pyplot as plt
8 | 
9 | '''this helper function builds the path to the file containing
10 | a stock's historical data
11 | 
12 | *symbol : stock symbol
13 | *depth : directory depth from the root
14 | '''
15 | def symbol_to_path(symbol, depth=1):
16 |     base = os.path.dirname(__file__)
17 | 
18 |     while depth > 0:
19 |         base = os.path.dirname(base)
20 |         depth -= 1
21 | 
22 |     path = os.path.join(base, "resources", "historical_data", "{}.csv".format(symbol))
23 |     return path
24 | 
25 | '''this function creates a dataframe of chosen stocks with
26 | dates as the index and the adjusted closing price of each
27 | stock as the columns
28 | 
29 | *symbols : list of stock symbols
30 | *start_date : start date of the dataframe's date index
31 | *end_date : end date of the dataframe's date index
32 | *include_SPY : boolean to indicate whether to include the
33 | S&P500 index stock
34 | '''
35 | def get_data(symbols, start_date, end_date, include_SPY=True):
36 |     if include_SPY and "SPY" not in symbols:
37 |         symbols.insert(0, "SPY")
38 | 
39 |     dates_index = pd.date_range(start=start_date, end=end_date)
40 |     df = pd.DataFrame(index=dates_index)
41 | 
42 |     for symbol in symbols:
43 |         df_temp = pd.read_csv(symbol_to_path(symbol), index_col="Date",
44 |                               parse_dates=True, usecols=["Date", "Adj Close"],
45 |                               na_values="nan")
46 |         df_temp = df_temp.rename(columns={"Adj Close" : symbol})
47 |         df = df.join(df_temp, how="inner")
48 | 
49 |     return df
50 | 
51 | '''this function creates a dataframe for a selected stock with
52 | dates as the index and the chosen columns
53 | 
54 | *symbol : stock symbol
55 | *start_date : start date of the dataframe's date index
56 | *end_date : end date of the dataframe's date index
57 | *columns : columns to include in the dataframe
58 | '''
59 | def get_stock_data(symbol, start_date=None, end_date=None, columns=["Date", "Adj Close"]):
60 | 
61 |     df = pd.read_csv(symbol_to_path(symbol), index_col="Date",
62 |                      parse_dates=True, usecols=columns,
63 |                      na_values="nan")
64 |     return df[start_date:end_date]
65 | 
66 | '''this function plots a given dataframe
67 | 
68 | *dataframe : dataframe to be plotted
69 | *plot_title : the plot title
70 | *xlabel : the horizontal axis label
71 | *ylabel : the vertical axis label
72 | *leg_loc : legend location
73 | *show_plot : boolean to indicate whether to display the plot
74 | '''
75 | def plot_data(dataframe, plot_title, xlabel, ylabel, leg_loc="best", show_plot=True):
76 |     ax = dataframe.plot(title=plot_title)
77 |     ax.set_xlabel(xlabel)
78 |     ax.set_ylabel(ylabel)
79 |     ax.legend(loc=leg_loc)
80 |     ax.grid(True)
81 |     if show_plot:
82 |         plt.show()
83 | 
84 | '''a tester function
85 | '''
86 | def main():
87 |     start_date = "01/01/2017"
88 |     end_date = "31/12/2017"
89 |     symbols = ["GOOG", "AAPL", "FB"]
90 |     df = get_data(symbols, start_date, end_date)
91 |     print(df)
92 | 
93 |     column_slicing = ['SPY', 'GOOG']
94 |     dataframe_sliced = df.loc[:, column_slicing]
95 |     plot_data(dataframe_sliced, "Selected Stock Prices", "Date", "Price")
96 | 
97 | '''to ensure running the tester function only when this file is run, not imported
98 | '''
99 | if __name__ == "__main__":
100 |     main()
--------------------------------------------------------------------------------