├── CAPSTONE PROJECT REPORT.pdf ├── Compute_KPI_OutputClass.pyc ├── Readme.txt └── CAPSTONE-PROJECT.py /CAPSTONE PROJECT REPORT.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bertrandobi/WQU-CAPSTONE-Project/HEAD/CAPSTONE PROJECT REPORT.pdf -------------------------------------------------------------------------------- /Compute_KPI_OutputClass.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bertrandobi/WQU-CAPSTONE-Project/HEAD/Compute_KPI_OutputClass.pyc -------------------------------------------------------------------------------- /Readme.txt: -------------------------------------------------------------------------------- 1 | Capstone Project: CAPSTONE-PROJECT.py 2 | 3 | 4 | - This project runs in Python 2.7, Spyder and Jupyter notebook 5 | - It works with pandas, sklearn, pandas_datareader, matplotlib, seaborn, etc., as shown in the libraries imported 6 | These libraries can be installed with pip install. 7 | - An IPython notebook version of the project is attached (CAPSTONE-.ipynb) 8 | - It also works with the Compute_KPI_OutputClass module attached in the project file directory. Also copy this file to your Python directory before importing 9 | The project's unpublished draft research report is also attached. 
The complete research report ready for publication will be prepared and attached after review and approval. 10 | -------------------------------------------------------------------------------- /CAPSTONE-PROJECT.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # # APPLICATION OF MACHINE LEARNING TO HIGH FREQUENCY TRADING OF STOCKS 5 | 6 | # # PROJECT OBJECTIVES: 7 | 8 | # 1) Create a stock market predictor: 9 | # - Design a function to scrape stock market data from Yahoo Finance 10 | # - Obtain High Frequency Minute Data for any stock in the Dow Jones Industrial Average (DJIA) 11 | # - Compute some technical indicators as features for the stock predictor 12 | # - Use the following Machine learning algorithms: LogisticRegression, LinearDiscriminantAnalysis, KNeighborsClassifier, DecisionTreeClassifier, GaussianNB, SVC, RandomForestClassifier, XGBClassifier, etc., for the prediction of stock market movements 13 | # - Compute the performance of the Machine Learning Algorithms and choose the best learner 14 | # - Use the best learner to predict the stock price movements 15 | # 16 | # 2) Create a trading strategy based on the predicted market price movements and backtest the strategy 17 | # 18 | # 3) Measure the performance of the strategy against the non-machine-learning Buy and Hold strategy. 
19 | 20 | # # IMPORT LIABRARIES NEEDDED FOR THE PROJECT 21 | 22 | # In[2]: 23 | 24 | 25 | import csv 26 | import arrow 27 | import pandas as pd 28 | import requests 29 | import matplotlib.pyplot as plt 30 | import seaborn as sb 31 | import numpy as np 32 | from Compute_KPI_OutputClass import Output 33 | import matplotlib.colors as colors 34 | import matplotlib.dates as mdates 35 | import matplotlib.ticker as mticker 36 | import matplotlib.mlab as mlab 37 | import matplotlib.font_manager as font_manager 38 | import datetime 39 | pd.core.common.is_list_like = pd.api.types.is_list_like 40 | import fix_yahoo_finance as yf 41 | from pandas_datareader import data as pdr 42 | import ta 43 | from sklearn.model_selection import train_test_split 44 | from sklearn.preprocessing import StandardScaler 45 | from sklearn.tree import DecisionTreeRegressor 46 | from sklearn.tree import DecisionTreeClassifier 47 | from sklearn.linear_model import LogisticRegression 48 | from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error,f1_score 49 | from sklearn.cross_validation import cross_val_score 50 | from sklearn.decomposition import PCA 51 | cntk.tests.test_utils.set_device_from_pytest_env() 52 | from sklearn.neighbors import KNeighborsClassifier 53 | from sklearn.naive_bayes import GaussianNB 54 | from sklearn.svm import SVC 55 | from sklearn.discriminant_analysis import LinearDiscriminantAnalysis 56 | from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier 57 | from xgboost import XGBClassifier, XGBRegressor 58 | import matplotlib.pyplot as plt 59 | get_ipython().magic(u'matplotlib inline') 60 | 61 | 62 | # # Yahoo Finance function to download High Frequency (minute) Data 63 | 64 | # This is a Function to scrape intraday High frequency Data from Yahoo Finance. 
# The function takes as arguments:
# - the stock symbol ('IBM'),
# - data_range in days ('7d'); the maximum is 7 days,
# - data_interval ('1m'): 1 minute, 5 minutes, 30 minutes, 60 minutes or 1 day.

# In[3]:


def get_quote_data(symbol, data_range, data_interval, tz='Asia/Calcutta', timeout=30):
    """Download OHLCV bars for one ticker from the Yahoo Finance chart endpoint.

    Parameters
    ----------
    symbol : str
        Ticker symbol, e.g. ``'IBM'``.
    data_range : str
        Look-back window, sent as the ``range`` query parameter (e.g. ``'7d'``).
    data_interval : str
        Bar size, sent as the ``interval`` query parameter (e.g. ``'1m'``).
    tz : str, optional
        Time zone the bar timestamps are converted to before the zone
        information is stripped.  Defaults to ``'Asia/Calcutta'``, the value
        that used to be hard-coded.
    timeout : float, optional
        Seconds ``requests`` waits for the server before giving up.

    Returns
    -------
    pandas.DataFrame
        Columns ``OPEN``, ``HIGH``, ``LOW``, ``CLOSE``, ``VOLUME`` indexed by
        naive datetimes (index name ``'Datetime'``).  Rows containing a NaN
        are dropped.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with an HTTP error status.
    """
    url = 'https://query1.finance.yahoo.com/v8/finance/chart/{symbol}'.format(symbol=symbol)
    # Let requests build and escape the query string instead of formatting it by hand.
    res = requests.get(
        url,
        params={'range': data_range, 'interval': data_interval},
        timeout=timeout,
    )
    # Fail here with a clear HTTP error rather than later with a KeyError/TypeError
    # while digging through an error payload.
    res.raise_for_status()
    body = res.json()['chart']['result'][0]

    # Built as a list (not a lazy ``map``) so the index does not depend on how the
    # installed pandas version treats iterators.
    stamps = [
        arrow.get(ts).to(tz).datetime.replace(tzinfo=None)
        for ts in body['timestamp']
    ]
    df = pd.DataFrame(body['indicators']['quote'][0],
                      index=pd.Index(stamps, name='Datetime'))
    df = df.loc[:, ['open', 'high', 'low', 'close', 'volume']].dropna()  # remove NaN rows
    df.columns = ['OPEN', 'HIGH', 'LOW', 'CLOSE', 'VOLUME']  # rename columns
    return df


# # Create a function to compute Technical Indicators based on the data

# In[4]:


def process_data(data):
    """Return a copy of *data* extended with technical indicators and the target.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``OPEN``, ``HIGH``, ``LOW``, ``CLOSE`` and
        ``VOLUME`` (as produced by ``get_quote_data``).

    Returns
    -------
    pandas.DataFrame
        The input columns plus 23 feature columns and the target ``y``
        (``1`` when ``OPEN <= CLOSE`` for the bar, ``-1`` otherwise).  Rows
        with any NaN (indicator warm-up periods) are dropped.  The frame passed
        in is left untouched.

    Notes
    -----
    NOTE(review): ``ROC1`` and ``y`` are both computed from the same bar's
    ``OPEN`` and ``CLOSE``, so ``y`` is recoverable from the sign of ``ROC1``.
    Any model given ``ROC1`` as a feature is therefore not forecasting.
    Confirm whether the target was meant to be the *next* bar's movement.
    """
    # Work on a copy so the caller's frame does not silently gain 24 columns.
    data = data.copy()
    opn, high, low = data['OPEN'], data['HIGH'], data['LOW']
    close, volume = data['CLOSE'], data['VOLUME']

    data['BB_5'] = ta.bollinger_mavg(close, 5)    # Bollinger moving average, 5 periods
    data['BB_10'] = ta.bollinger_mavg(close, 10)  # Bollinger moving average, 10 periods
    data['BB_20'] = ta.bollinger_mavg(close, 20)  # Bollinger moving average, 20 periods
    data['ADX'] = ta.adx(high, low, close, 14)                 # Average Directional Index
    data['ATR'] = ta.average_true_range(high, low, close, 14)  # Average True Range
    data['CCI'] = ta.cci(high, low, close, 14)                 # Commodity Channel Index
    data['DCH'] = ta.donchian_channel_hband(close)  # Donchian Channel high band
    data['DCL'] = ta.donchian_channel_lband(close)  # Donchian Channel low band
    data['DPO'] = ta.dpo(close)                     # Detrended Price Oscillator
    data['EMAf'] = ta.ema_fast(close)               # Exponential Moving Average, fast
    data['EMAs'] = ta.ema_slow(close)               # Exponential Moving Average, slow
    data['FI'] = ta.force_index(close, volume)      # Force Index
    data['ICHa'] = ta.ichimoku_a(high, low)         # Ichimoku A
    data['ICHb'] = ta.ichimoku_b(high, low)         # Ichimoku B
    data['KC'] = ta.keltner_channel_central(high, low, close)  # Keltner Channel, central line
    data['KST'] = ta.kst(close)                     # KST oscillator
    data['MACD'] = ta.macd(close)                   # Moving Average Convergence Divergence
    data['OBV'] = ta.on_balance_volume_mean(close, volume)     # On-balance volume mean
    data['RSI'] = ta.rsi(close)                     # Relative Strength Index
    data['TRIX'] = ta.trix(close)                   # TRIX (triple-smoothed EMA rate of change)
    data['TSI'] = ta.tsi(close)                     # True Strength Index
    data['ROC1'] = (close - opn) / opn              # open-to-close return of the bar
    data['RET'] = close.pct_change()                # close-to-close return
    data['y'] = np.where(opn <= close, 1, -1)       # target: bar closed at/above its open
    return data.dropna()


# # Create a Function to perform features importance using a correlation matrix
# - This function takes a dataframe generated from scraped financial data and the technical indicators calculated.
# In[5]:


def feature_imp(data):
    """Show an annotated heatmap of the pairwise correlations between columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Features and target; ``data.corr()`` is what gets plotted.

    Returns
    -------
    None
        The function only draws and shows the figure.
    """
    # Compute the matrix once; re-selecting its own index and calling corr()
    # again yields the same matrix.
    corrmat = data.corr()
    plt.figure(figsize=(20, 20))
    sb.heatmap(corrmat, annot=True, cmap="RdYlGn")
    plt.title('Correlation between different fearures and target')
    plt.show()


# # Function to model the data using 8 Machine learning classification algorithms in sklearn.

# - Scale, transform and partition the data into training (80%) and test (20%) sets
#
# - This function takes the downloaded data and technical indicators as features, trains the models on the
#   training set, and predicts for new data points whether the minute stock price is increasing (1) or
#   decreasing (-1). A table with the accuracy score of each model is returned so the best model can be
#   picked to predict stock movements in High frequency trading.

# In[6]:


def modeling(data):
    """Fit eight classifiers on an 80/20 split of *data* and score each one.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the target column ``y``; every other column is used as a
        feature.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``Model`` and ``Acuracy_Score``
        (accuracy on the held-out 20%).

    Side effects
    ------------
    Rebinds the module-level globals ``xTrain``, ``xTest`` (scaled feature
    arrays) and ``yTrain``, ``yTest`` (targets); ``predstockmvt`` and
    ``backtest`` read them.
    """
    global xTrain, xTest, yTrain, yTest

    features = data.drop(['y'], axis=1)
    target = data['y']
    raw_train, raw_test, yTrain, yTest = train_test_split(
        features, target, test_size=0.2, random_state=0)

    # Fit the scaler on the training rows only, then apply it to both parts, so
    # the held-out rows do not influence the preprocessing.
    scaler = StandardScaler().fit(raw_train)
    xTrain = scaler.transform(raw_train)
    xTest = scaler.transform(raw_test)

    models = [
        ('LR', LogisticRegression()),
        ('LDA', LinearDiscriminantAnalysis()),
        ('KNN', KNeighborsClassifier()),
        ('CART', DecisionTreeClassifier(criterion='gini', max_depth=3, min_samples_leaf=6)),
        ('NB', GaussianNB()),
        ('SVM', SVC()),
        ('RF', RandomForestClassifier(n_estimators=60)),
        ('XGBoost', XGBClassifier(gamma=0.0, n_estimators=60, base_score=0.7, max_depth=3,
                                  objective="binary:logistic", colsample_bytree=1,
                                  learning_rate=0.01)),
    ]

    results = []
    for name, model in models:
        model.fit(xTrain, yTrain)
        results.append([name, accuracy_score(yTest, model.predict(xTest))])

    # The column label (including its spelling) is kept as-is: it is part of the output.
    return pd.DataFrame(results, columns=['Model', 'Acuracy_Score'])


# # FUNCTION TO PREDICT STOCK MOVEMENT USING DECISION TREE CLASSIFIER
# - This function takes as arguments xTrain, yTrain, xTest, prints a classification report and returns the predicted stock movement.

# In[7]:


def predstockmvt(xTrain, yTrain, xTest, y_true=None):
    """Fit a decision tree, print its classification report, return predictions.

    Parameters
    ----------
    xTrain, yTrain
        Training features and labels.
    xTest
        Features to predict on.
    y_true : optional
        True labels for *xTest*.  When omitted, the module-level ``yTest`` set
        by ``modeling`` is used, which is what the function always did.

    Returns
    -------
    The array of predicted labels for *xTest*.
    """
    if y_true is None:
        y_true = yTest  # global populated by modeling()
    clf1 = DecisionTreeClassifier(criterion='gini', max_depth=3, min_samples_leaf=6)
    clf1.fit(xTrain, yTrain)
    yPreddt = clf1.predict(xTest)
    print(classification_report(y_true, yPreddt))
    return yPreddt


# # Function to backtest the strategy
# - If the model predicts an increase in price, we buy at the Open
# - If the model predicts a decrease in stock price, we sell the stock
# - The assumption for this strategy is that short selling is allowed, there is no transaction cost and there is equal investment.
# In[8]:


def backtest(data):
    """Plot cumulative market vs. strategy returns for the tail of *data*.

    Rows from position ``len(xTrain)`` onward are used (``xTrain`` is the
    module-level global set by ``modeling``).  ``signal`` is 1 where ``y >= 1``
    and 0 otherwise; the strategy return of a bar is ``signal * ROC1`` and the
    market return is ``ROC1``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``y`` and ``ROC1``.

    Returns
    -------
    pandas.DataFrame
        Cumulative sums of ``Market_return`` and ``Strategy_return``.  The same
        frame is also stored in the module-level global ``perf``.

    Notes
    -----
    NOTE(review): ``signal`` is derived from the realised label ``y``, not
    from a model prediction, and ``modeling`` splits rows at random, so this
    tail slice is not the model's test set.
    NOTE(review): the strategy description mentions short selling, but a down
    bar gets ``signal`` 0 (flat), not -1.  Confirm which is intended.
    """
    global perf

    # Copy the slice so the new columns land on an independent frame instead of
    # a view of *data* (avoids chained-assignment warnings and accidental write-through).
    trade_data = data.iloc[len(xTrain):].copy()
    trade_data['signal'] = np.where(trade_data['y'] >= 1, 1, 0)
    trade_data['Strategy_return'] = trade_data['signal'] * trade_data['ROC1']
    trade_data['Market_return'] = trade_data['ROC1']
    perf = trade_data[['Market_return', 'Strategy_return']].cumsum()

    # Draw on an explicit 10x10 axes: a bare perf.plot() opens its own figure
    # and would leave a separately created figure empty.
    fig, ax = plt.subplots(figsize=(10, 10))
    perf.plot(ax=ax)
    ax.set_title('Evolution of Cumulative Returns')
    plt.show()
    return perf


# # IMPLEMENT FUNCTIONS ON A CASE STUDY (IBM)

# # Apply the function to download minute data from Yahoo for the last seven days

# In[9]:


datas = get_quote_data('IBM', '7d', '1m')
print(datas.head())


# # Apply the function to compute all relevant technical indicators

# In[10]:


se = process_data(datas)
print(se.head())


# # We then conduct feature engineering, using the function feature_imp() above, to select the best
# # features to feed into the machine learning algorithms that predict stock market price movements.

# In[11]:


feature_imp(se)


# The result shows a heatmap of the relationship of each feature to the others. As seen above, most of
# the features are related to one another and to the target variable.

# From the heatmap above it is clear that all the features contribute to the prediction of the target
# variable (y). The most outstanding features are: the rate of change (ROC), returns (RET),
# Relative Strength Index (RSI), Commodity Channel Index (CCI)

# # Model the data using eight machine learning algorithms, display the performance report and test the hypothesis

# In[18]:


print('The shape of the Dataset is:')
print(se.shape)


# - The data set consists of 2631 rows with 28 features and one target variable
# - The target variable represents movement in stock prices, denoted by '1' when the stock price increases and -1 when it decreases
# - The model trains on 80% of the data using the features and the targets
# - The models then use the features of the test data (20%) to classify the results as '1' if they predict an
#   increase in stock price and '-1' if they predict a decrease in stock price.

# In[12]:


print(modeling(se))


# - From the analyses all the models achieve a considerable level of performance in predicting stock movements.
#   Outstanding models include: Decision Trees (CART), Random Forest (RF), and Extreme Gradient Boosting (XGBoost).
#
# - This greatly justifies the hypothesis that Machine Learning algorithms could be used to predict stock
#   movements in a High Frequency Trading setting.
#
# - The Decision Tree Classifier is thus retained to predict stock movements for the purpose of this project.

# # Predict stock movements using Decision Trees and present the performance metric

# In[13]:


Y = predstockmvt(xTrain, yTrain, xTest)
Y


# # Compare number of predicted with actual results

# In[14]:


print(yTest.value_counts())
print('This justifies the prediction above.')


# # Backtest the predicted results using the backtest function and represent the result in a graph

# In[15]:


backtest(se)


# In[16]:


print('Performance of the Trading strategy(Assume equal Investment')
Risk_return = Output(perf)
print(Risk_return.generate_output())


# # SWOT ANALYSIS

# # Strengths:
# - Ability to generate several trades (527) in a day using a simple machine learning strategy on High Frequency data
# - The system can make use of trading opportunities immediately as they present themselves in minutes.
# - Ability to generate superior returns about 10 times higher than the market
# - Simple trading strategy based on accurate prediction of market movements using simple Machine Learning Algorithms
# - High win rate of 100% with a win return per trade of 3%
# - Increased annualised Sharpe ratio leading to high alpha generation.
# - It ensures "best execution" of trades because it minimizes the human element in trading decision making.
# - Improves liquidity with lesser drawdowns
# - The system also reduces transaction costs significantly due to limited human interference
# - The system performs significantly well on all the stocks in the Dow Jones Industrial Average index and even on stocks outside the index

# # Weaknesses:
# - High increase in volatility (from 6% to 35%) due to the large number of trades within a limited time frame.
344 | # - The system is not very interactive for the user, as the user has to call functions to generate the data, model and return outputs. Opportunities exist to make the system fully functional and interactive 345 | # - Requires a huge amount of time in designing the functions and optimising the algorithms. 346 | # - Requires strict monitoring of the system to avoid system overruns and failures 347 | # - Difficulties in applying the system to several (more than one) stocks at a time due to difficulties in obtaining free High frequency data 348 | # - Transaction costs and other expenses are not factored into the system; further development will include these modules 349 | # - Market sentiment indicators were not included as part of the feature set. These indicators could greatly influence the market returns. 350 | 351 | # # Opportunities: 352 | # - Availability of performant computers, software and internet facilities which facilitate the implementation of High Frequency algorithmic trading 353 | # - Availability of programming, application development tools and modules like python, pandas, scikit-learn, statsmodels, CNTK, matplotlib, the Technical Analysis library, etc. to facilitate the design of this project 354 | # - Availability of huge trading opportunities in minutes to take advantage of. 355 | # - Availability of financial markets with regulatory mechanisms (Securities and Exchange Commission) to curtail the effects and imperfections of high frequency algorithmic trading. 356 | 357 | # # Threats: 358 | # - Unavailability of free quality High Frequency data for longer periods of time. For this project we could only get one-minute data for the last seven trading days. 
359 | # - High volatility could lead to frequent stock market breakdowns and imperfections 360 | # - It requires extensive testing, monitoring and regulation, as an error in the system could lead to a high loss of capital 361 | # - It requires huge investment in the system and trading 362 | # - High cost of acquisition of data for the trading system on longer time frames. 363 | 364 | # # Further Research: 365 | # - Continue and build a fully functional and interactive trading system with multiple stocks and portfolios at a time. 366 | # - Build a database of historical high frequency stock data for major stock exchanges in the world and populate it with data for the last five years of trading (I intend to purchase the data). 367 | # - Add other technical indicators to the system and enable the system to trade based on technical indicators and machine learning at the same time. 368 | # - Add other Machine Learning and Reinforcement Learning algorithms to trade combinations of technical indicators and machine learning on huge High frequency stock data 369 | # - Add options, commodities, Forex and crypto data to the system for effective High frequency algorithmic trading 370 | # - Develop and add other backtest functions based on technical indicators 371 | 372 | # In[ ]: 373 | 374 | 375 | 376 | 377 | --------------------------------------------------------------------------------