├── README.md
├── authtoken.p
├── backtest.py
├── markets.py
└── pystocks.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Predicting Stock Market Returns
========

This repository contains the code for the portfolio project I worked on at **Data Science Retreat** (Berlin).

The aim of the project is to build a model to predict **Stock Market Prices**, using a combination of Machine Learning algorithms.

The output of the prediction is the daily return of the S&P-500 index.

* **Binary Classification Problem**: predict a positive (**Up**) or negative (**Down**) return with respect to the previous day.

The language I picked to implement the analysis is Python (numpy, scipy, pandas, matplotlib, scikit-learn).

The main file is **markets.py**. The script calls several functions contained in **pystocks.py**.

--------------------------------------------------------------------------------
/authtoken.p:
--------------------------------------------------------------------------------
S'mCDHcSdN9mQ_Hubid1Uq'
p0
.
--------------------------------------------------------------------------------
/backtest.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 13 18:22:41 2014

@author: francesco
"""

from abc import ABCMeta, abstractmethod

class Strategy(object):
    """Strategy is an abstract base class providing an interface for
    all subsequent (inherited) trading strategies.

    The goal of a (derived) Strategy object is to output a list of signals,
    which has the form of a time-series-indexed pandas DataFrame.

    In this instance only a single symbol/instrument is supported."""

    __metaclass__ = ABCMeta

    @abstractmethod
    def generate_signals(self):
        """An implementation is required to return the DataFrame of symbols
        containing the signals to go long, short or hold (1, -1 or 0)."""
        raise NotImplementedError("Should implement generate_signals()!")

class Portfolio(object):
    """An abstract base class representing a portfolio of
    positions (including both instruments and cash), determined
    on the basis of a set of signals provided by a Strategy."""

    __metaclass__ = ABCMeta

    @abstractmethod
    def generate_positions(self):
        """Provides the logic to determine how the portfolio
        positions are allocated on the basis of forecasting
        signals and available cash."""
        raise NotImplementedError("Should implement generate_positions()!")

    @abstractmethod
    def backtest_portfolio(self):
        """Provides the logic to generate the trading orders
        and subsequent equity curve (i.e. growth of total equity),
        as a sum of holdings and cash, and the bar-period returns
        associated with this curve based on the 'positions' DataFrame.

        Produces a portfolio object that can be examined by
        other classes/functions."""
        raise NotImplementedError("Should implement backtest_portfolio()!")
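
# --- Added example (not part of the original file) ---------------------------
# A minimal sketch of a concrete Strategy, mirroring what markets.py builds
# inline: it wraps a vector of binary Up/Down class predictions (1/0) into the
# long/short signal DataFrame that a Portfolio expects. The class name and
# constructor arguments are hypothetical.
import pandas as pd

class PredictionStrategy(Strategy):
    """Turns a binary prediction vector into +1/-1 trading signals."""

    def __init__(self, index, prediction):
        self.index = index            # DatetimeIndex of the price bars
        self.prediction = prediction  # array of 1 (Up) / 0 (Down) predictions

    def generate_signals(self):
        signals = pd.DataFrame(index=self.index)
        signals['signal'] = self.prediction
        signals.signal[signals.signal == 0] = -1     # Down -> go short
        signals['positions'] = signals['signal'].diff()
        return signals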
--------------------------------------------------------------------------------
/markets.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 18 00:29:18 2014

@author: francesco
"""

import pystocks
import datetime
import sys
import pandas as pd
import matplotlib.pyplot as plt
import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
from pylab import *

fout = 'sp500'
method = 'RF'
best_model = 'sp500_57.pickle'
############ SET PARAMETERS ##################################################
path_datasets = 'path to datasets'
cut = datetime.datetime(1993, 1, 1)
start_test = datetime.datetime(2014, 4, 1)
parameters = []
##############################################################################
# IN CASE OF FEATURE AND MODEL SELECTION
maxlags = 10
maxdeltas = 10
folds = 10
#grid = {'C': [0.01, 0.1], 'gamma': [0, 1]}
##############################################################################
# AFTER BEST MODEL SELECTION
bestlags = 9
bestdelta = 9
savemodel = False
##############################################################################

if __name__ == "__main__":

    ### PIPELINE
    ##########################################################################
    ## 1- PERFORM FEATURE SELECTION APPLYING RANDOM FOREST TO THE DATA SET.
    ## THE FUNCTION CAN LOAD DATA FROM THE WEB OR FROM CSV FILES PREVIOUSLY
    ## SAVED TO DISK. THE OUTPUT IS A LOG FILE WITH THE RESULTS OF CROSS
    ## VALIDATION ON THE TRAIN SET.

    sys.stdout = open('path to log txt file', 'w')
    pystocks.performFeatureSelection(maxdeltas, maxlags, fout, cut, start_test, path_datasets, savemodel, method, folds, parameters)

    ##########################################################################
    # 2- CHECK BEST PARAMETERS
    # NOTE: stdout is still redirected, so this print lands in the log file
    print pystocks.checkModel('path to log txt file')

    ##########################################################################
    # 3- AFTER HAVING SELECTED THE TWO PARAMETERS THAT MAXIMIZE THE ACCURACY
    # ON CV (MAXDELTA, MAXLAGS), RUN THE MODEL ON THE WHOLE TRAIN SET AND GET
    # A RESULT ON THE TEST SET. OPTIONALLY, SAVE THE MODEL WITH cPICKLE.

    print pystocks.performSingleShotClassification(bestdelta, bestlags, fout, cut, start_test, path_datasets, savemodel, method, parameters)

    ##########################################################################
    # 4- RUN THE TRADING ALGORITHM ON TOP OF THE PREDICTION AND GET THE
    # RETURNS OF THE BACKTEST.

    # end_period is the last day of backtesting. The data for the prediction
    # were collected in the time frame 1/1/1990 - 31/08/2014; the first 3
    # years were then cut, so the final dataframe covers 1/1/1993 - 31/08/2014.
    # Since 30-31/08/2014 fell on a Saturday and Sunday, the data effectively
    # ends on 29/08/2014. As we are predicting what happens tomorrow (the
    # percentage variation of today's closing price with respect to
    # yesterday's), end_period's last day is one day before the actual last
    # trading day.
    end_period = datetime.datetime(2014, 8, 28)

    ###### SP500
    symbol = 'S&P-500'
    name = path_datasets + '/sp500.csv'
    prediction = pystocks.getPredictionFromBestModel(bestdelta, bestlags, fout, cut, start_test, path_datasets, best_model)[0]

    bars = pd.read_csv(name, index_col=0, parse_dates=True)
    bars = bars[start_test:end_period]

    signals = pd.DataFrame(index=bars.index)
    signals['signal'] = 0.0
    signals['signal'] = prediction
    signals.signal[signals.signal == 0] = -1     # Down prediction -> go short
    signals['positions'] = signals['signal'].diff()

    # Create the portfolio based on the forecaster
    amount_of_shares = 500
    portfolio = pystocks.MarketIntradayPortfolio(symbol, bars, signals, initial_capital=100000.0, shares=amount_of_shares)
    returns = portfolio.backtest_portfolio()
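
    # Worked example (added comment, not original code): the portfolio trades
    # 500 shares at the open and unwinds them at the close of each bar, so a
    # day's profit is shares * signal * (Close - Open). If the S&P-500 opens
    # at 1900.0 and closes at 1904.0, a +1 signal earns 500 * 4.0 = 2000
    # dollars, while a -1 signal loses the same amount.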

    # Plot results
    f, ax = plt.subplots(2, sharex=True)
    f.patch.set_facecolor('white')
    ylabel = symbol + ' Close Price in $'
    bars['Close_Out'].plot(ax=ax[0], color='r', lw=3.)
    ax[0].set_ylabel(ylabel, fontsize=18)
    ax[0].set_xlabel('', fontsize=18)
    ax[0].legend(('Close Price S&P-500',), loc='upper left', prop={"size": 18})
    ax[0].set_title('S&P 500 Close Price VS Portfolio Performance (1 April 2014 - 28 August 2014)', fontsize=20, fontweight="bold")

    returns['total'].plot(ax=ax[1], color='b', lw=3.)
    ax[1].set_ylabel('Portfolio value in $', fontsize=18)
    ax[1].set_xlabel('Date', fontsize=18)
    ax[1].legend(('Portfolio Performance. Capital Invested: 100k $. Shares Traded per day: 500+500',), loc='upper left', prop={"size": 18})
    plt.tick_params(axis='both', which='major', labelsize=14)
    loc = ax[1].xaxis.get_major_locator()
    loc.maxticks[DAILY] = 24    # DAILY comes from pylab (matplotlib.dates)

    figManager = plt.get_current_fig_manager()
    figManager.window.showMaximized()

    plt.show()

--------------------------------------------------------------------------------
/pystocks.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 17 23:46:41 2014

@author: francesco
"""
import cPickle
import numpy as np
import pandas as pd
import datetime
from sklearn import preprocessing
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
import operator
import pandas.io.data
from sklearn.qda import QDA
from dateutil import parser
from backtest import Strategy, Portfolio

###############################################################################

def loadDatasets(path_directory, fout):
    """
    imports into dataframes all the datasets saved in path_directory
    """
    name = path_directory + '/' + fout + '.csv'
    out = pd.read_csv(name, index_col=0, parse_dates=True)

    name = path_directory + '/nasdaq.csv'
    nasdaq = pd.read_csv(name, index_col=0, parse_dates=True)

    name = path_directory + '/djia.csv'
    djia = pd.read_csv(name, index_col=0, parse_dates=True)

    name = path_directory + '/hkong.csv'
    hkong = pd.read_csv(name, index_col=0, parse_dates=True)

    name = path_directory + '/frankfurt.csv'
    frankfurt = pd.read_csv(name, index_col=0, parse_dates=True)

    name = path_directory + '/paris.csv'
    paris = pd.read_csv(name, index_col=0, parse_dates=True)

    name = path_directory + '/nikkei.csv'
    nikkei = pd.read_csv(name, index_col=0, parse_dates=True)

    name = path_directory + '/london.csv'
    london = pd.read_csv(name, index_col=0, parse_dates=True)

    name = path_directory + '/australia.csv'
    australia = pd.read_csv(name, index_col=0, parse_dates=True)

    return [out, nasdaq, djia, frankfurt, london, paris, hkong, nikkei, australia]

###############################################################################

def getStock(symbol, start, end):
    """
    downloads stock data from Yahoo Finance
    """
    df = pd.io.data.get_data_yahoo(symbol, start, end)

    df.columns.values[-1] = 'AdjClose'
    df.columns = df.columns + '_' + symbol
    df['Return_%s' % symbol] = df['AdjClose_%s' % symbol].pct_change()

    return df
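
# Example (added comment): getStock('^IXIC', start, end) returns a frame whose
# Yahoo columns are suffixed with the symbol ('Open_^IXIC', 'High_^IXIC', ...,
# 'AdjClose_^IXIC') plus a derived 'Return_^IXIC' daily percentage change.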

###############################################################################

def getStockFromQuandl(symbol, name, start, end):
    """
    downloads stock data from Quandl
    """
    import Quandl
    df = Quandl.get(symbol, trim_start=start, trim_end=end, authtoken="mCDHcSdN9mQ_Hubid1Uq")

    df.columns.values[-1] = 'AdjClose'
    df.columns = df.columns + '_' + name
    df['Return_%s' % name] = df['AdjClose_%s' % name].pct_change()

    return df

###############################################################################

def getStockDataFromWeb(fout, start_string, end_string):
    """
    collects stock data from Yahoo Finance and Quandl
    """
    start = parser.parse(start_string)
    end = parser.parse(end_string)

    nasdaq = getStock('^IXIC', start, end)
    frankfurt = getStock('^GDAXI', start, end)
    london = getStock('^FTSE', start, end)
    paris = getStock('^FCHI', start, end)
    hkong = getStock('^HSI', start, end)
    nikkei = getStock('^N225', start, end)
    australia = getStock('^AXJO', start, end)

    djia = getStockFromQuandl("YAHOO/INDEX_DJI", 'Djia', start_string, end_string)

    out = pd.io.data.get_data_yahoo(fout, start, end)
    out.columns.values[-1] = 'AdjClose'
    out.columns = out.columns + '_Out'
    out['Return_Out'] = out['AdjClose_Out'].pct_change()

    return [out, nasdaq, djia, frankfurt, london, paris, hkong, nikkei, australia]

###############################################################################

def count_missing(dataframe):
    """
    counts the number of NaNs in a dataframe
    """
    return (dataframe.shape[0] * dataframe.shape[1]) - dataframe.count().sum()

###############################################################################

def addFeatures(dataframe, adjclose, returns, n):
    """
    operates on two columns of the dataframe (n >= 2):
    - given AdjClose_* it computes the return of day i with respect to day i-n
    - given Return_* it computes the n-day rolling mean of the daily returns
    """

    return_n = adjclose[9:] + "Time" + str(n)
    dataframe[return_n] = dataframe[adjclose].pct_change(n)

    roll_n = returns[7:] + "RolMean" + str(n)
    dataframe[roll_n] = pd.rolling_mean(dataframe[returns], n)
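
# Worked example (added comment): with adjclose = 'AdjClose_Out' and
# returns = 'Return_Out', the slices adjclose[9:] and returns[7:] both strip
# the prefix and leave 'Out', so n = 5 creates the columns 'OutTime5'
# (5-day return) and 'OutRolMean5' (5-day rolling mean of the daily return).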

###############################################################################

def applyRollMeanDelayedReturns(datasets, delta):
    """
    applies rolling means and delayed returns to each dataframe in the list
    """
    for dataset in datasets:
        columns = dataset.columns
        adjclose = columns[-2]
        returns = columns[-1]
        for n in delta:
            addFeatures(dataset, adjclose, returns, n)

    return datasets

###############################################################################

def mergeDataframes(datasets, index, cut):
    """
    merges the datasets in the list into a single dataframe, keeping only
    the rows after the cut date
    """
    subset = [dataset.iloc[:, index:] for dataset in datasets[1:]]

    first = subset[0].join(subset[1:], how='outer')
    finance = datasets[0].iloc[:, index:].join(first, how='left')
    finance = finance[finance.index > cut]
    return finance

###############################################################################

def checkModel(filename):
    """
    checks the maximum accuracy reached after CV with a specific algorithm
    """
    txt = open(filename, "r")
    lines = txt.readlines()
    # parse the accuracies as floats so that e.g. 0.6 ranks above 0.57
    accuracies = [float(line.strip()) for line in lines if line.startswith('0.')]
    txt.close()
    return max(accuracies)

###############################################################################

def applyTimeLag(dataset, lags, delta):
    """
    applies a time lag to the return columns selected according to delta.
    The days to lag are contained in the lags list passed as argument.
    Returns a NaN-free dataset obtained by cutting the lagged dataset
    at head and tail.
    """
    # shift the target one day back so that the features of day t are
    # aligned with the return to be predicted for day t+1
    dataset.Return_Out = dataset.Return_Out.shift(-1)
    maxLag = max(lags)

    columns = dataset.columns[::(2 * max(delta) - 1)]
    for column in columns:
        for lag in lags:
            newcolumn = column + str(lag)
            dataset[newcolumn] = dataset[column].shift(lag)

    return dataset.iloc[maxLag:-1, :]

###############################################################################

def performCV(X_train, y_train, folds, method, parameters, fout, savemodel):
    """
    walk-forward cross validation on the train set: the train set is split
    into `folds` chunks of k rows each, then
    - train on chunk 1, test on chunk 2
    - train on chunks 1-2, test on chunk 3
    - train on chunks 1-2-3, test on chunk 4
    ...
    and the mean of the test accuracies is returned
    """
    print ''
    print 'Parameters --------------------------------> ', parameters
    print 'Size train set: ', X_train.shape

    k = int(np.floor(float(X_train.shape[0]) / folds))

    print 'Size of each fold: ', k

    acc = np.zeros(folds - 1)
    for i in range(2, folds + 1):
        print ''
        split = float(i - 1) / i
        print 'Splitting the first ' + str(i) + ' chunks at ' + str(i - 1) + '/' + str(i)
        data = X_train[:(k * i)]
        output = y_train[:(k * i)]
        print 'Size of train+test: ', data.shape
        index = int(np.floor(data.shape[0] * split))
        X_tr = data[:index]
        y_tr = output[:index]

        # the test window starts right where the train window ends
        X_te = data[index:]
        y_te = output[index:]

        acc[i - 2] = performClassification(X_tr, y_tr, X_te, y_te, method, parameters, fout, savemodel)
        print 'Accuracy on fold ' + str(i) + ': ', acc[i - 2]

    return acc.mean()
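
# Worked example (added comment): with 1000 training rows and folds = 10,
# k = 100. Iteration i = 2 takes the first 200 rows, splits them at 1/2 and
# trains on rows 0-99 / tests on rows 100-199; iteration i = 10 trains on
# rows 0-899 and tests on rows 900-999, so each pass grows the training
# window by one fold.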

###############################################################################

def performTimeSeriesSearchGrid(X_train, y_train, folds, method, grid, fout, savemodel):
    """
    grid is a dictionary with: keys --> parameter names, values --> lists of
    parameter values; one- and two-parameter grids are supported
    """
    print ''
    print 'Performing Search Grid CV...'
    print 'Algorithm: ', method
    param = grid.keys()
    finalGrid = {}
    if len(param) == 1:
        for value_0 in grid[param[0]]:
            parameters = [value_0]
            accuracy = performCV(X_train, y_train, folds, method, parameters, fout, savemodel)
            # accuracy is used as the dict key, so two parameter sets with
            # identical accuracy overwrite each other
            finalGrid[accuracy] = parameters
        final = sorted(finalGrid.iteritems(), key=operator.itemgetter(0), reverse=True)
        print ''
        print finalGrid
        print ''
        print 'Final CV Results: ', final
        return final[0]

    elif len(param) == 2:
        for value_0 in grid[param[0]]:
            for value_1 in grid[param[1]]:
                parameters = [value_0, value_1]
                accuracy = performCV(X_train, y_train, folds, method, parameters, fout, savemodel)
                finalGrid[accuracy] = parameters
        final = sorted(finalGrid.iteritems(), key=operator.itemgetter(0), reverse=True)
        print ''
        print finalGrid
        print ''
        print 'Final CV Results: ', final
        return final[0]

###############################################################################

def mergeSentimenToStocks(stocks):
    """
    joins a (hard-coded) daily sentiment CSV onto the stocks dataframe
    """
    df = pd.read_csv('/home/francesco/BigData/Project/CSV/sentiment.csv', index_col='date')
    final = stocks.join(df, how='left')
    return final

###############################################################################

def prepareDataForClassification(dataset, start_test):
    """
    generates the categorical column to be predicted, attaches it to the
    dataframe, labels the categories and splits into train and test sets
    """
    le = preprocessing.LabelEncoder()

    dataset['UpDown'] = dataset['Return_Out']
    dataset.UpDown[dataset.UpDown >= 0] = 'Up'
    dataset.UpDown[dataset.UpDown < 0] = 'Down'
    dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown)

    features = dataset.columns[1:-1]
    X = dataset[features]
    y = dataset.UpDown

    X_train = X[X.index < start_test]
    y_train = y[y.index < start_test]

    X_test = X[X.index >= start_test]
    y_test = y[y.index >= start_test]

    return X_train, y_train, X_test, y_test
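
# Example (added comment): LabelEncoder assigns integer labels to the classes
# in sorted order, so here 'Down' -> 0 and 'Up' -> 1.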

###############################################################################

def performFeatureSelection(maxdeltas, maxlags, fout, cut, start_test, path_datasets, savemodel, method, folds, parameters):
    """
    loops over all the (lags, delta) feature configurations, builds the
    corresponding dataframe and prints the walk-forward CV accuracy of each
    """

    for maxlag in range(3, maxlags + 2):
        lags = range(2, maxlag)
        print ''
        print '============================================================='
        print 'Maximum time lag applied', max(lags)
        print ''
        for maxdelta in range(3, maxdeltas + 2):
            datasets = loadDatasets(path_datasets, fout)
            delta = range(2, maxdelta)
            print 'Delta days accounted: ', max(delta)
            datasets = applyRollMeanDelayedReturns(datasets, delta)
            finance = mergeDataframes(datasets, 6, cut)
            print 'Size of data frame: ', finance.shape
            print 'Number of NaN after merging: ', count_missing(finance)
            finance = finance.interpolate(method='linear')
            print 'Number of NaN after time interpolation: ', count_missing(finance)
            finance = finance.fillna(finance.mean())
            print 'Number of NaN after mean interpolation: ', count_missing(finance)
            finance = applyTimeLag(finance, lags, delta)
            print 'Number of NaN after temporal shifting: ', count_missing(finance)
            print 'Size of data frame after feature creation: ', finance.shape
            X_train, y_train, X_test, y_test = prepareDataForClassification(finance, start_test)

            print performCV(X_train, y_train, folds, method, parameters, fout, savemodel)
            print ''

###############################################################################

def performParameterSelection(bestdelta, bestlags, fout, cut, start_test, path_datasets, savemodel, method, folds, parameters, grid):
    """
    builds the dataframe with the best (lags, delta) configuration and runs
    a search-grid CV over the hyperparameters in grid
    """

    lags = range(2, bestlags + 1)
    print 'Maximum time lag applied', max(lags)
    datasets = loadDatasets(path_datasets, fout)
    delta = range(2, bestdelta + 1)
    print 'Delta days accounted: ', max(delta)
    datasets = applyRollMeanDelayedReturns(datasets, delta)
    finance = mergeDataframes(datasets, 6, cut)
    print 'Size of data frame: ', finance.shape
    print 'Number of NaN after merging: ', count_missing(finance)
    finance = finance.interpolate(method='linear')
    print 'Number of NaN after time interpolation: ', count_missing(finance)
    finance = finance.fillna(finance.mean())
    print 'Number of NaN after mean interpolation: ', count_missing(finance)
    finance = applyTimeLag(finance, lags, delta)
    print 'Number of NaN after temporal shifting: ', count_missing(finance)
    print 'Size of data frame after feature creation: ', finance.shape
    X_train, y_train, X_test, y_test = prepareDataForClassification(finance, start_test)

    return performTimeSeriesSearchGrid(X_train, y_train, folds, method, grid, fout, savemodel)

###############################################################################

def performSingleShotClassification(bestdelta, bestlags, fout, cut, start_test, path_datasets, savemodel, method, parameters):
    """
    builds the dataframe with the best (lags, delta) configuration, trains on
    the whole train set and returns the accuracy on the test set
    """
    #start_string = '1990-1-1'
    #end_string = '2014-8-31'
    #datasets = getStockDataFromWeb(fout, start_string, end_string)

    lags = range(2, bestlags + 1)
    print 'Maximum time lag applied', max(lags)
    datasets = loadDatasets(path_datasets, fout)
    delta = range(2, bestdelta + 1)
    print 'Delta days accounted: ', max(delta)
    datasets = applyRollMeanDelayedReturns(datasets, delta)
    finance = mergeDataframes(datasets, 6, cut)
    print 'Size of data frame: ', finance.shape
    print 'Number of NaN after merging: ', count_missing(finance)
    finance = finance.interpolate(method='linear')
    print 'Number of NaN after time interpolation: ', count_missing(finance)
    finance = finance.fillna(finance.mean())
    print 'Number of NaN after mean interpolation: ', count_missing(finance)
    finance = applyTimeLag(finance, lags, delta)
    print 'Number of NaN after temporal shifting: ', count_missing(finance)
    print 'Size of data frame after feature creation: ', finance.shape
    X_train, y_train, X_test, y_test = prepareDataForClassification(finance, start_test)

    return performClassification(X_train, y_train, X_test, y_test, method, parameters, fout, savemodel)
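
# Usage sketch (added comment): the grid commented out in markets.py can be
# fed to performParameterSelection, e.g. for an SVM:
#   grid = {'C': [0.01, 0.1], 'gamma': [0, 1]}
#   performParameterSelection(9, 9, 'sp500', cut, start_test, path_datasets,
#                             False, 'SVM', 10, [], grid)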

###############################################################################

def getPredictionFromBestModel(bestdelta, bestlags, fout, cut, start_test, path_datasets, best_model):
    """
    rebuilds the dataframe with the best (lags, delta) configuration, loads
    the pickled best model and returns its prediction and accuracy on the
    test set
    """
    lags = range(2, bestlags + 1)
    datasets = loadDatasets(path_datasets, fout)
    delta = range(2, bestdelta + 1)
    datasets = applyRollMeanDelayedReturns(datasets, delta)
    finance = mergeDataframes(datasets, 6, cut)
    finance = finance.interpolate(method='linear')
    finance = finance.fillna(finance.mean())
    finance = applyTimeLag(finance, lags, delta)
    X_train, y_train, X_test, y_test = prepareDataForClassification(finance, start_test)
    with open(best_model, 'rb') as fin:
        model = cPickle.load(fin)

    return model.predict(X_test), model.score(X_test, y_test)

###############################################################################

def performClassification(X_train, y_train, X_test, y_test, method, parameters, fout, savemodel):
    """
    performs classification on the daily returns using several algorithms
    """

    print 'Performing ' + method + ' Classification...'
    print 'Size of train set: ', X_train.shape
    print 'Size of test set: ', X_test.shape

    if method == 'RF':
        return performRFClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel)

    elif method == 'KNN':
        return performKNNClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel)

    elif method == 'SVM':
        return performSVMClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel)

    elif method == 'ADA':
        return performAdaBoostClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel)

    elif method == 'GTB':
        return performGTBClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel)

    elif method == 'QDA':
        return performQDAClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel)
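
# Usage sketch (added comment): markets.py unpacks only the prediction,
#   prediction = pystocks.getPredictionFromBestModel(9, 9, 'sp500', cut,
#                    start_test, path_datasets, 'sp500_57.pickle')[0]
# while index [1] gives the accuracy of the stored model on the test set.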

###############################################################################

def performRFClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    Random Forest binary classification
    """
    clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
    clf.fit(X_train, y_train)

    if savemodel:
        fname_out = '{}.pickle'.format(fout)
        with open(fname_out, 'wb') as f:
            cPickle.dump(clf, f, -1)

    accuracy = clf.score(X_test, y_test)

    return accuracy

###############################################################################

def performKNNClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    KNN binary classification
    """
    clf = neighbors.KNeighborsClassifier()
    clf.fit(X_train, y_train)

    if savemodel:
        fname_out = '{}-{}.pickle'.format(fout, datetime.now())
        with open(fname_out, 'wb') as f:
            cPickle.dump(clf, f, -1)

    accuracy = clf.score(X_test, y_test)

    return accuracy

###############################################################################

def performSVMClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    SVM binary classification
    """
    c = parameters[0]
    g = parameters[1]
    clf = SVC(C=c, gamma=g)
    clf.fit(X_train, y_train)

    if savemodel:
        fname_out = '{}-{}.pickle'.format(fout, datetime.now())
        with open(fname_out, 'wb') as f:
            cPickle.dump(clf, f, -1)

    accuracy = clf.score(X_test, y_test)

    return accuracy

###############################################################################

def performAdaBoostClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    AdaBoost binary classification
    """
    n = parameters[0]
    l = parameters[1]
    clf = AdaBoostClassifier(n_estimators=n, learning_rate=l)
    clf.fit(X_train, y_train)

    if savemodel:
        fname_out = '{}-{}.pickle'.format(fout, datetime.now())
        with open(fname_out, 'wb') as f:
            cPickle.dump(clf, f, -1)

    accuracy = clf.score(X_test, y_test)

    return accuracy

###############################################################################

def performGTBClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    Gradient Tree Boosting binary classification
    """
    clf = GradientBoostingClassifier(n_estimators=100)
    clf.fit(X_train, y_train)

    if savemodel:
        fname_out = '{}-{}.pickle'.format(fout, datetime.now())
        with open(fname_out, 'wb') as f:
            cPickle.dump(clf, f, -1)

    accuracy = clf.score(X_test, y_test)

    return accuracy
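
# Note on 'parameters' (added comment): the helpers unpack it positionally.
# performSVMClass expects [C, gamma] and performAdaBoostClass expects
# [n_estimators, learning_rate], while the RF, KNN and GTB helpers ignore it
# and use fixed hyperparameters.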

###############################################################################

def performQDAClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    Quadratic Discriminant Analysis binary classification
    """
    def replaceTiny(x):
        # floor tiny magnitudes so the covariance estimate stays stable
        if abs(x) < 0.0001:
            return 0.0001
        return x

    # apply replaceTiny element-wise across both frames
    X_train = X_train.applymap(replaceTiny)
    X_test = X_test.applymap(replaceTiny)

    clf = QDA()
    clf.fit(X_train, y_train)

    if savemodel:
        fname_out = '{}-{}.pickle'.format(fout, datetime.now())
        with open(fname_out, 'wb') as f:
            cPickle.dump(clf, f, -1)

    accuracy = clf.score(X_test, y_test)

    return accuracy

###############################################################################

class MarketIntradayPortfolio(Portfolio):
    """Buys or sells 500 shares of an asset at the opening price of
    every bar, depending upon the direction of the forecast, closing
    out the trade at the close of the bar.

    Requires:
    symbol - A stock symbol which forms the basis of the portfolio.
    bars - A DataFrame of bars for a symbol set.
    signals - A pandas DataFrame of signals (1, 0, -1) for each symbol.
    initial_capital - The amount in cash at the start of the portfolio."""

    def __init__(self, symbol, bars, signals, initial_capital=100000.0, shares=500):
        self.symbol = symbol
        self.bars = bars
        self.signals = signals
        self.initial_capital = float(initial_capital)
        self.shares = int(shares)
        self.positions = self.generate_positions()

    def generate_positions(self):
        """Generate the positions DataFrame, based on the signals
        provided by the 'signals' DataFrame."""
        positions = pd.DataFrame(index=self.signals.index).fillna(0.0)

        positions[self.symbol] = self.shares * self.signals['signal']
        return positions

    def backtest_portfolio(self):
        """Backtest the portfolio and return a DataFrame containing
        the equity curve and the percentage returns."""

        portfolio = pd.DataFrame(index=self.positions.index)

        # intraday price move of each bar; the first five bars are zeroed out
        portfolio['price_diff'] = self.bars['Close_Out'] - self.bars['Open_Out']
        portfolio['price_diff'][0:5] = 0.0
        portfolio['profit'] = self.positions[self.symbol] * portfolio['price_diff']

        portfolio['total'] = self.initial_capital + portfolio['profit'].cumsum()
        portfolio['returns'] = portfolio['total'].pct_change()
        return portfolio

--------------------------------------------------------------------------------