├── README.md
├── authtoken.p
├── backtest.py
├── markets.py
└── pystocks.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
Predicting Stock Market Returns
========

This repository contains the code for the portfolio project I worked on at **Data Science Retreat** (Berlin).

The aim of the project is to build a model to predict **Stock Market Prices**, using a combination of Machine Learning algorithms.

The output of the prediction is the daily return of the S&P-500 index.

* **Binary Classification Problem**: predict a positive (**Up**) or negative (**Down**) return with respect to the previous day.

The language I picked to implement the analysis is Python (numpy, scipy, pandas, matplotlib, scikit-learn).

The main file is **markets.py**. The script calls several functions contained in **pystocks.py**.

--------------------------------------------------------------------------------
/authtoken.p:
--------------------------------------------------------------------------------
S'mCDHcSdN9mQ_Hubid1Uq'
p0
.
--------------------------------------------------------------------------------
/backtest.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 13 18:22:41 2014

@author: francesco
"""

from abc import ABCMeta, abstractmethod

class Strategy(object):
    """Strategy is an abstract base class providing an interface for
    all subsequent (inherited) trading strategies.

    The goal of a (derived) Strategy object is to output a list of signals,
    which has the form of a time-series-indexed pandas DataFrame.

    In this instance only a single symbol/instrument is supported."""

    __metaclass__ = ABCMeta

    @abstractmethod
    def generate_signals(self):
        """An implementation is required to return the DataFrame of symbols
        containing the signals to go long, short or hold (1, -1 or 0)."""
        raise NotImplementedError("Should implement generate_signals()!")

class Portfolio(object):
    """An abstract base class representing a portfolio of
    positions (including both instruments and cash), determined
    on the basis of a set of signals provided by a Strategy."""

    __metaclass__ = ABCMeta

    @abstractmethod
    def generate_positions(self):
        """Provides the logic to determine how the portfolio
        positions are allocated on the basis of forecasting
        signals and available cash."""
        raise NotImplementedError("Should implement generate_positions()!")

    @abstractmethod
    def backtest_portfolio(self):
        """Provides the logic to generate the trading orders
        and subsequent equity curve (i.e. growth of total equity),
        as a sum of holdings and cash, and the bar-period returns
        associated with this curve based on the 'positions' DataFrame.

        Produces a portfolio object that can be examined by
        other classes/functions."""
        raise NotImplementedError("Should implement backtest_portfolio()!")
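
# --- Added example (not part of the original file) ---------------------------
# A minimal sketch of a concrete Strategy, mirroring what markets.py builds
# inline: it wraps a vector of binary Up/Down class predictions (1/0) into the
# long/short signal DataFrame that a Portfolio expects. The class name and
# constructor arguments are hypothetical.
import pandas as pd

class PredictionStrategy(Strategy):
    """Turns a binary prediction vector into +1/-1 trading signals."""

    def __init__(self, index, prediction):
        self.index = index            # DatetimeIndex of the price bars
        self.prediction = prediction  # array of 1 (Up) / 0 (Down) predictions

    def generate_signals(self):
        signals = pd.DataFrame(index=self.index)
        signals['signal'] = self.prediction
        signals.signal[signals.signal == 0] = -1     # Down -> go short
        signals['positions'] = signals['signal'].diff()
        return signals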
--------------------------------------------------------------------------------
/markets.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 18 00:29:18 2014

@author: francesco
"""

import pystocks
import datetime
import sys
import pandas as pd
import matplotlib.pyplot as plt
import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
from pylab import *

fout = 'sp500'
method = 'RF'
best_model = 'sp500_57.pickle'
############ SET PARAMETERS ##################################################
path_datasets = 'path to datasets'
cut = datetime.datetime(1993, 1, 1)
start_test = datetime.datetime(2014, 4, 1)
parameters = []
##############################################################################
# IN CASE OF FEATURE AND MODEL SELECTION
maxlags = 10
maxdeltas = 10
folds = 10
#grid = {'C': [0.01, 0.1], 'gamma': [0, 1]}
##############################################################################
# AFTER BEST MODEL SELECTION
bestlags = 9
bestdelta = 9
savemodel = False
##############################################################################

if __name__ == "__main__":

    ### PIPELINE
    ##########################################################################
    ## 1- PERFORM FEATURE SELECTION APPLYING RANDOM FOREST TO THE DATA SET.
    ## THE FUNCTION CAN LOAD DATA FROM THE WEB OR FROM CSV FILES PREVIOUSLY
    ## SAVED TO DISK. THE OUTPUT IS A LOG FILE WITH THE RESULTS OF CROSS
    ## VALIDATION ON THE TRAIN SET.

    sys.stdout = open('path to log txt file', 'w')
    pystocks.performFeatureSelection(maxdeltas, maxlags, fout, cut, start_test, path_datasets, savemodel, method, folds, parameters)

    ##########################################################################
    # 2- CHECK BEST PARAMETERS
    # NOTE: stdout is still redirected, so this print lands in the log file
    print pystocks.checkModel('path to log txt file')

    ##########################################################################
    # 3- AFTER HAVING SELECTED THE TWO PARAMETERS THAT MAXIMIZE THE ACCURACY
    # ON CV (MAXDELTA, MAXLAGS), RUN THE MODEL ON THE WHOLE TRAIN SET AND GET
    # A RESULT ON THE TEST SET. OPTIONALLY, SAVE THE MODEL WITH cPICKLE.

    print pystocks.performSingleShotClassification(bestdelta, bestlags, fout, cut, start_test, path_datasets, savemodel, method, parameters)

    ##########################################################################
    # 4- RUN THE TRADING ALGORITHM ON TOP OF THE PREDICTION AND GET THE
    # RETURNS OF THE BACKTEST.

    # end_period is the last day of backtesting. The data for the prediction
    # were collected in the time frame 1/1/1990 - 31/08/2014; the first 3
    # years were then cut, so the final dataframe covers 1/1/1993 - 31/08/2014.
    # Since 30-31/08/2014 fell on a Saturday and Sunday, the data effectively
    # ends on 29/08/2014. As we are predicting what happens tomorrow (the
    # percentage variation of today's closing price with respect to
    # yesterday's), end_period's last day is one day before the actual last
    # trading day.
    end_period = datetime.datetime(2014, 8, 28)

    ###### SP500
    symbol = 'S&P-500'
    name = path_datasets + '/sp500.csv'
    prediction = pystocks.getPredictionFromBestModel(bestdelta, bestlags, fout, cut, start_test, path_datasets, best_model)[0]

    bars = pd.read_csv(name, index_col=0, parse_dates=True)
    bars = bars[start_test:end_period]

    signals = pd.DataFrame(index=bars.index)
    signals['signal'] = 0.0
    signals['signal'] = prediction
    signals.signal[signals.signal == 0] = -1     # Down prediction -> go short
    signals['positions'] = signals['signal'].diff()

    # Create the portfolio based on the forecaster
    amount_of_shares = 500
    portfolio = pystocks.MarketIntradayPortfolio(symbol, bars, signals, initial_capital=100000.0, shares=amount_of_shares)
    returns = portfolio.backtest_portfolio()
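
    # Worked example (added comment, not original code): the portfolio trades
    # 500 shares at the open and unwinds them at the close of each bar, so a
    # day's profit is shares * signal * (Close - Open). If the S&P-500 opens
    # at 1900.0 and closes at 1904.0, a +1 signal earns 500 * 4.0 = 2000
    # dollars, while a -1 signal loses the same amount.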

    # Plot results
    f, ax = plt.subplots(2, sharex=True)
    f.patch.set_facecolor('white')
    ylabel = symbol + ' Close Price in $'
    bars['Close_Out'].plot(ax=ax[0], color='r', lw=3.)
    ax[0].set_ylabel(ylabel, fontsize=18)
    ax[0].set_xlabel('', fontsize=18)
    ax[0].legend(('Close Price S&P-500',), loc='upper left', prop={"size": 18})
    ax[0].set_title('S&P 500 Close Price VS Portfolio Performance (1 April 2014 - 28 August 2014)', fontsize=20, fontweight="bold")

    returns['total'].plot(ax=ax[1], color='b', lw=3.)
    ax[1].set_ylabel('Portfolio value in $', fontsize=18)
    ax[1].set_xlabel('Date', fontsize=18)
    ax[1].legend(('Portfolio Performance. Capital Invested: 100k $. Shares Traded per day: 500+500',), loc='upper left', prop={"size": 18})
    plt.tick_params(axis='both', which='major', labelsize=14)
    loc = ax[1].xaxis.get_major_locator()
    loc.maxticks[DAILY] = 24    # DAILY comes from pylab (matplotlib.dates)

    figManager = plt.get_current_fig_manager()
    figManager.window.showMaximized()

    plt.show()

--------------------------------------------------------------------------------
/pystocks.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 17 23:46:41 2014

@author: francesco
"""
import cPickle
import numpy as np
import pandas as pd
import datetime
from sklearn import preprocessing
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
import operator
import pandas.io.data
from sklearn.qda import QDA
from dateutil import parser
from backtest import Strategy, Portfolio

###############################################################################

def loadDatasets(path_directory, fout):
    """
    imports into dataframes all the datasets saved in path_directory
    """
    name = path_directory + '/' + fout + '.csv'
    out = pd.read_csv(name, index_col=0, parse_dates=True)

    name = path_directory + '/nasdaq.csv'
    nasdaq = pd.read_csv(name, index_col=0, parse_dates=True)

    name = path_directory + '/djia.csv'
    djia = pd.read_csv(name, index_col=0, parse_dates=True)

    name = path_directory + '/hkong.csv'
    hkong = pd.read_csv(name, index_col=0, parse_dates=True)

    name = path_directory + '/frankfurt.csv'
    frankfurt = pd.read_csv(name, index_col=0, parse_dates=True)

    name = path_directory + '/paris.csv'
    paris = pd.read_csv(name, index_col=0, parse_dates=True)

    name = path_directory + '/nikkei.csv'
    nikkei = pd.read_csv(name, index_col=0, parse_dates=True)

    name = path_directory + '/london.csv'
    london = pd.read_csv(name, index_col=0, parse_dates=True)

    name = path_directory + '/australia.csv'
    australia = pd.read_csv(name, index_col=0, parse_dates=True)

    return [out, nasdaq, djia, frankfurt, london, paris, hkong, nikkei, australia]

###############################################################################

def getStock(symbol, start, end):
    """
    downloads stock data from Yahoo Finance
    """
    df = pd.io.data.get_data_yahoo(symbol, start, end)

    df.columns.values[-1] = 'AdjClose'
    df.columns = df.columns + '_' + symbol
    df['Return_%s' % symbol] = df['AdjClose_%s' % symbol].pct_change()

    return df
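
# Example (added comment): getStock('^IXIC', start, end) returns a frame whose
# Yahoo columns are suffixed with the symbol ('Open_^IXIC', 'High_^IXIC', ...,
# 'AdjClose_^IXIC') plus a derived 'Return_^IXIC' daily percentage change.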

###############################################################################

def getStockFromQuandl(symbol, name, start, end):
    """
    downloads stock data from Quandl
    """
    import Quandl
    df = Quandl.get(symbol, trim_start=start, trim_end=end, authtoken="mCDHcSdN9mQ_Hubid1Uq")

    df.columns.values[-1] = 'AdjClose'
    df.columns = df.columns + '_' + name
    df['Return_%s' % name] = df['AdjClose_%s' % name].pct_change()

    return df

###############################################################################

def getStockDataFromWeb(fout, start_string, end_string):
    """
    collects stock data from Yahoo Finance and Quandl
    """
    start = parser.parse(start_string)
    end = parser.parse(end_string)

    nasdaq = getStock('^IXIC', start, end)
    frankfurt = getStock('^GDAXI', start, end)
    london = getStock('^FTSE', start, end)
    paris = getStock('^FCHI', start, end)
    hkong = getStock('^HSI', start, end)
    nikkei = getStock('^N225', start, end)
    australia = getStock('^AXJO', start, end)

    djia = getStockFromQuandl("YAHOO/INDEX_DJI", 'Djia', start_string, end_string)

    out = pd.io.data.get_data_yahoo(fout, start, end)
    out.columns.values[-1] = 'AdjClose'
    out.columns = out.columns + '_Out'
    out['Return_Out'] = out['AdjClose_Out'].pct_change()

    return [out, nasdaq, djia, frankfurt, london, paris, hkong, nikkei, australia]

###############################################################################

def count_missing(dataframe):
    """
    counts the number of NaNs in a dataframe
    """
    return (dataframe.shape[0] * dataframe.shape[1]) - dataframe.count().sum()

###############################################################################

def addFeatures(dataframe, adjclose, returns, n):
    """
    operates on two columns of the dataframe (n >= 2):
    - given AdjClose_* it computes the return of day i with respect to day i-n
    - given Return_* it computes the n-day rolling mean of the daily returns
    """

    return_n = adjclose[9:] + "Time" + str(n)
    dataframe[return_n] = dataframe[adjclose].pct_change(n)

    roll_n = returns[7:] + "RolMean" + str(n)
    dataframe[roll_n] = pd.rolling_mean(dataframe[returns], n)
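
# Worked example (added comment): with adjclose = 'AdjClose_Out' and
# returns = 'Return_Out', the slices adjclose[9:] and returns[7:] both strip
# the prefix and leave 'Out', so n = 5 creates the columns 'OutTime5'
# (5-day return) and 'OutRolMean5' (5-day rolling mean of the daily return).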

###############################################################################

def applyRollMeanDelayedReturns(datasets, delta):
    """
    applies rolling means and delayed returns to each dataframe in the list
    """
    for dataset in datasets:
        columns = dataset.columns
        adjclose = columns[-2]
        returns = columns[-1]
        for n in delta:
            addFeatures(dataset, adjclose, returns, n)

    return datasets

###############################################################################

def mergeDataframes(datasets, index, cut):
    """
    merges the datasets in the list into a single dataframe, keeping only
    the rows after the cut date
    """
    subset = [dataset.iloc[:, index:] for dataset in datasets[1:]]

    first = subset[0].join(subset[1:], how='outer')
    finance = datasets[0].iloc[:, index:].join(first, how='left')
    finance = finance[finance.index > cut]
    return finance

###############################################################################

def checkModel(filename):
    """
    checks the maximum accuracy reached after CV with a specific algorithm
    """
    txt = open(filename, "r")
    lines = txt.readlines()
    # parse the accuracies as floats so that e.g. 0.6 ranks above 0.57
    accuracies = [float(line.strip()) for line in lines if line.startswith('0.')]
    txt.close()
    return max(accuracies)

###############################################################################

def applyTimeLag(dataset, lags, delta):
    """
    applies a time lag to the return columns selected according to delta.
    The days to lag are contained in the lags list passed as argument.
    Returns a NaN-free dataset obtained by cutting the lagged dataset
    at head and tail.
    """
    # shift the target one day back so that the features of day t are
    # aligned with the return to be predicted for day t+1
    dataset.Return_Out = dataset.Return_Out.shift(-1)
    maxLag = max(lags)

    columns = dataset.columns[::(2 * max(delta) - 1)]
    for column in columns:
        for lag in lags:
            newcolumn = column + str(lag)
            dataset[newcolumn] = dataset[column].shift(lag)

    return dataset.iloc[maxLag:-1, :]

###############################################################################

def performCV(X_train, y_train, folds, method, parameters, fout, savemodel):
    """
    walk-forward cross validation on the train set: the train set is split
    into `folds` chunks of k rows each, then
    - train on chunk 1, test on chunk 2
    - train on chunks 1-2, test on chunk 3
    - train on chunks 1-2-3, test on chunk 4
    ...
    and the mean of the test accuracies is returned
    """
    print ''
    print 'Parameters --------------------------------> ', parameters
    print 'Size train set: ', X_train.shape

    k = int(np.floor(float(X_train.shape[0]) / folds))

    print 'Size of each fold: ', k

    acc = np.zeros(folds - 1)
    for i in range(2, folds + 1):
        print ''
        split = float(i - 1) / i
        print 'Splitting the first ' + str(i) + ' chunks at ' + str(i - 1) + '/' + str(i)
        data = X_train[:(k * i)]
        output = y_train[:(k * i)]
        print 'Size of train+test: ', data.shape
        index = int(np.floor(data.shape[0] * split))
        X_tr = data[:index]
        y_tr = output[:index]

        # the test window starts right where the train window ends
        X_te = data[index:]
        y_te = output[index:]

        acc[i - 2] = performClassification(X_tr, y_tr, X_te, y_te, method, parameters, fout, savemodel)
        print 'Accuracy on fold ' + str(i) + ': ', acc[i - 2]

    return acc.mean()
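
# Worked example (added comment): with 1000 training rows and folds = 10,
# k = 100. Iteration i = 2 takes the first 200 rows, splits them at 1/2 and
# trains on rows 0-99 / tests on rows 100-199; iteration i = 10 trains on
# rows 0-899 and tests on rows 900-999, so each pass grows the training
# window by one fold.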

###############################################################################

def performTimeSeriesSearchGrid(X_train, y_train, folds, method, grid, fout, savemodel):
    """
    grid is a dictionary with: keys --> parameter names, values --> lists of
    parameter values; one- and two-parameter grids are supported
    """
    print ''
    print 'Performing Search Grid CV...'
    print 'Algorithm: ', method
    param = grid.keys()
    finalGrid = {}
    if len(param) == 1:
        for value_0 in grid[param[0]]:
            parameters = [value_0]
            accuracy = performCV(X_train, y_train, folds, method, parameters, fout, savemodel)
            # accuracy is used as the dict key, so two parameter sets with
            # identical accuracy overwrite each other
            finalGrid[accuracy] = parameters
        final = sorted(finalGrid.iteritems(), key=operator.itemgetter(0), reverse=True)
        print ''
        print finalGrid
        print ''
        print 'Final CV Results: ', final
        return final[0]

    elif len(param) == 2:
        for value_0 in grid[param[0]]:
            for value_1 in grid[param[1]]:
                parameters = [value_0, value_1]
                accuracy = performCV(X_train, y_train, folds, method, parameters, fout, savemodel)
                finalGrid[accuracy] = parameters
        final = sorted(finalGrid.iteritems(), key=operator.itemgetter(0), reverse=True)
        print ''
        print finalGrid
        print ''
        print 'Final CV Results: ', final
        return final[0]

###############################################################################

def mergeSentimenToStocks(stocks):
    """
    joins a (hard-coded) daily sentiment CSV onto the stocks dataframe
    """
    df = pd.read_csv('/home/francesco/BigData/Project/CSV/sentiment.csv', index_col='date')
    final = stocks.join(df, how='left')
    return final

###############################################################################

def prepareDataForClassification(dataset, start_test):
    """
    generates the categorical column to be predicted, attaches it to the
    dataframe, labels the categories and splits into train and test sets
    """
    le = preprocessing.LabelEncoder()

    dataset['UpDown'] = dataset['Return_Out']
    dataset.UpDown[dataset.UpDown >= 0] = 'Up'
    dataset.UpDown[dataset.UpDown < 0] = 'Down'
    dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown)

    features = dataset.columns[1:-1]
    X = dataset[features]
    y = dataset.UpDown

    X_train = X[X.index < start_test]
    y_train = y[y.index < start_test]

    X_test = X[X.index >= start_test]
    y_test = y[y.index >= start_test]

    return X_train, y_train, X_test, y_test
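
# Example (added comment): LabelEncoder assigns integer labels to the classes
# in sorted order, so here 'Down' -> 0 and 'Up' -> 1.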

###############################################################################

def performFeatureSelection(maxdeltas, maxlags, fout, cut, start_test, path_datasets, savemodel, method, folds, parameters):
    """
    loops over all the (lags, delta) feature configurations, builds the
    corresponding dataframe and prints the walk-forward CV accuracy of each
    """

    for maxlag in range(3, maxlags + 2):
        lags = range(2, maxlag)
        print ''
        print '============================================================='
        print 'Maximum time lag applied', max(lags)
        print ''
        for maxdelta in range(3, maxdeltas + 2):
            datasets = loadDatasets(path_datasets, fout)
            delta = range(2, maxdelta)
            print 'Delta days accounted: ', max(delta)
            datasets = applyRollMeanDelayedReturns(datasets, delta)
            finance = mergeDataframes(datasets, 6, cut)
            print 'Size of data frame: ', finance.shape
            print 'Number of NaN after merging: ', count_missing(finance)
            finance = finance.interpolate(method='linear')
            print 'Number of NaN after time interpolation: ', count_missing(finance)
            finance = finance.fillna(finance.mean())
            print 'Number of NaN after mean interpolation: ', count_missing(finance)
            finance = applyTimeLag(finance, lags, delta)
            print 'Number of NaN after temporal shifting: ', count_missing(finance)
            print 'Size of data frame after feature creation: ', finance.shape
            X_train, y_train, X_test, y_test = prepareDataForClassification(finance, start_test)

            print performCV(X_train, y_train, folds, method, parameters, fout, savemodel)
            print ''

###############################################################################

def performParameterSelection(bestdelta, bestlags, fout, cut, start_test, path_datasets, savemodel, method, folds, parameters, grid):
    """
    builds the dataframe with the best (lags, delta) configuration and runs
    a search-grid CV over the hyperparameters in grid
    """

    lags = range(2, bestlags + 1)
    print 'Maximum time lag applied', max(lags)
    datasets = loadDatasets(path_datasets, fout)
    delta = range(2, bestdelta + 1)
    print 'Delta days accounted: ', max(delta)
    datasets = applyRollMeanDelayedReturns(datasets, delta)
    finance = mergeDataframes(datasets, 6, cut)
    print 'Size of data frame: ', finance.shape
    print 'Number of NaN after merging: ', count_missing(finance)
    finance = finance.interpolate(method='linear')
    print 'Number of NaN after time interpolation: ', count_missing(finance)
    finance = finance.fillna(finance.mean())
    print 'Number of NaN after mean interpolation: ', count_missing(finance)
    finance = applyTimeLag(finance, lags, delta)
    print 'Number of NaN after temporal shifting: ', count_missing(finance)
    print 'Size of data frame after feature creation: ', finance.shape
    X_train, y_train, X_test, y_test = prepareDataForClassification(finance, start_test)

    return performTimeSeriesSearchGrid(X_train, y_train, folds, method, grid, fout, savemodel)

###############################################################################

def performSingleShotClassification(bestdelta, bestlags, fout, cut, start_test, path_datasets, savemodel, method, parameters):
    """
    builds the dataframe with the best (lags, delta) configuration, trains on
    the whole train set and returns the accuracy on the test set
    """
    #start_string = '1990-1-1'
    #end_string = '2014-8-31'
    #datasets = getStockDataFromWeb(fout, start_string, end_string)

    lags = range(2, bestlags + 1)
    print 'Maximum time lag applied', max(lags)
    datasets = loadDatasets(path_datasets, fout)
    delta = range(2, bestdelta + 1)
    print 'Delta days accounted: ', max(delta)
    datasets = applyRollMeanDelayedReturns(datasets, delta)
    finance = mergeDataframes(datasets, 6, cut)
    print 'Size of data frame: ', finance.shape
    print 'Number of NaN after merging: ', count_missing(finance)
    finance = finance.interpolate(method='linear')
    print 'Number of NaN after time interpolation: ', count_missing(finance)
    finance = finance.fillna(finance.mean())
    print 'Number of NaN after mean interpolation: ', count_missing(finance)
    finance = applyTimeLag(finance, lags, delta)
    print 'Number of NaN after temporal shifting: ', count_missing(finance)
    print 'Size of data frame after feature creation: ', finance.shape
    X_train, y_train, X_test, y_test = prepareDataForClassification(finance, start_test)

    return performClassification(X_train, y_train, X_test, y_test, method, parameters, fout, savemodel)
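
# Usage sketch (added comment): the grid commented out in markets.py can be
# fed to performParameterSelection, e.g. for an SVM:
#   grid = {'C': [0.01, 0.1], 'gamma': [0, 1]}
#   performParameterSelection(9, 9, 'sp500', cut, start_test, path_datasets,
#                             False, 'SVM', 10, [], grid)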

###############################################################################

def getPredictionFromBestModel(bestdelta, bestlags, fout, cut, start_test, path_datasets, best_model):
    """
    rebuilds the dataframe with the best (lags, delta) configuration, loads
    the pickled best model and returns its prediction and accuracy on the
    test set
    """
    lags = range(2, bestlags + 1)
    datasets = loadDatasets(path_datasets, fout)
    delta = range(2, bestdelta + 1)
    datasets = applyRollMeanDelayedReturns(datasets, delta)
    finance = mergeDataframes(datasets, 6, cut)
    finance = finance.interpolate(method='linear')
    finance = finance.fillna(finance.mean())
    finance = applyTimeLag(finance, lags, delta)
    X_train, y_train, X_test, y_test = prepareDataForClassification(finance, start_test)
    with open(best_model, 'rb') as fin:
        model = cPickle.load(fin)

    return model.predict(X_test), model.score(X_test, y_test)

###############################################################################

def performClassification(X_train, y_train, X_test, y_test, method, parameters, fout, savemodel):
    """
    performs classification on the daily returns using several algorithms
    """

    print 'Performing ' + method + ' Classification...'
    print 'Size of train set: ', X_train.shape
    print 'Size of test set: ', X_test.shape

    if method == 'RF':
        return performRFClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel)

    elif method == 'KNN':
        return performKNNClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel)

    elif method == 'SVM':
        return performSVMClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel)

    elif method == 'ADA':
        return performAdaBoostClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel)

    elif method == 'GTB':
        return performGTBClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel)

    elif method == 'QDA':
        return performQDAClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel)
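
# Usage sketch (added comment): markets.py unpacks only the prediction,
#   prediction = pystocks.getPredictionFromBestModel(9, 9, 'sp500', cut,
#                    start_test, path_datasets, 'sp500_57.pickle')[0]
# while index [1] gives the accuracy of the stored model on the test set.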

###############################################################################

def performRFClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    Random Forest binary classification
    """
    clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
    clf.fit(X_train, y_train)

    if savemodel:
        fname_out = '{}.pickle'.format(fout)
        with open(fname_out, 'wb') as f:
            cPickle.dump(clf, f, -1)

    accuracy = clf.score(X_test, y_test)

    return accuracy

###############################################################################

def performKNNClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    KNN binary classification
    """
    clf = neighbors.KNeighborsClassifier()
    clf.fit(X_train, y_train)

    if savemodel:
        fname_out = '{}-{}.pickle'.format(fout, datetime.now())
        with open(fname_out, 'wb') as f:
            cPickle.dump(clf, f, -1)

    accuracy = clf.score(X_test, y_test)

    return accuracy

###############################################################################

def performSVMClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    SVM binary classification
    """
    c = parameters[0]
    g = parameters[1]
    clf = SVC(C=c, gamma=g)
    clf.fit(X_train, y_train)

    if savemodel:
        fname_out = '{}-{}.pickle'.format(fout, datetime.now())
        with open(fname_out, 'wb') as f:
            cPickle.dump(clf, f, -1)

    accuracy = clf.score(X_test, y_test)

    return accuracy

###############################################################################

def performAdaBoostClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    AdaBoost binary classification
    """
    n = parameters[0]
    l = parameters[1]
    clf = AdaBoostClassifier(n_estimators=n, learning_rate=l)
    clf.fit(X_train, y_train)

    if savemodel:
        fname_out = '{}-{}.pickle'.format(fout, datetime.now())
        with open(fname_out, 'wb') as f:
            cPickle.dump(clf, f, -1)

    accuracy = clf.score(X_test, y_test)

    return accuracy

###############################################################################

def performGTBClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    Gradient Tree Boosting binary classification
    """
    clf = GradientBoostingClassifier(n_estimators=100)
    clf.fit(X_train, y_train)

    if savemodel:
        fname_out = '{}-{}.pickle'.format(fout, datetime.now())
        with open(fname_out, 'wb') as f:
            cPickle.dump(clf, f, -1)

    accuracy = clf.score(X_test, y_test)

    return accuracy
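
# Note on 'parameters' (added comment): the helpers unpack it positionally.
# performSVMClass expects [C, gamma] and performAdaBoostClass expects
# [n_estimators, learning_rate], while the RF, KNN and GTB helpers ignore it
# and use fixed hyperparameters.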

###############################################################################

def performQDAClass(X_train, y_train, X_test, y_test, parameters, fout, savemodel):
    """
    Quadratic Discriminant Analysis binary classification
    """
    def replaceTiny(x):
        # floor tiny magnitudes so the covariance estimate stays stable
        if abs(x) < 0.0001:
            return 0.0001
        return x

    # apply replaceTiny element-wise across both frames
    X_train = X_train.applymap(replaceTiny)
    X_test = X_test.applymap(replaceTiny)

    clf = QDA()
    clf.fit(X_train, y_train)

    if savemodel:
        fname_out = '{}-{}.pickle'.format(fout, datetime.now())
        with open(fname_out, 'wb') as f:
            cPickle.dump(clf, f, -1)

    accuracy = clf.score(X_test, y_test)

    return accuracy

###############################################################################

class MarketIntradayPortfolio(Portfolio):
    """Buys or sells 500 shares of an asset at the opening price of
    every bar, depending upon the direction of the forecast, closing
    out the trade at the close of the bar.

    Requires:
    symbol - A stock symbol which forms the basis of the portfolio.
    bars - A DataFrame of bars for a symbol set.
    signals - A pandas DataFrame of signals (1, 0, -1) for each symbol.
    initial_capital - The amount in cash at the start of the portfolio."""

    def __init__(self, symbol, bars, signals, initial_capital=100000.0, shares=500):
        self.symbol = symbol
        self.bars = bars
        self.signals = signals
        self.initial_capital = float(initial_capital)
        self.shares = int(shares)
        self.positions = self.generate_positions()

    def generate_positions(self):
        """Generate the positions DataFrame, based on the signals
        provided by the 'signals' DataFrame."""
        positions = pd.DataFrame(index=self.signals.index).fillna(0.0)

        positions[self.symbol] = self.shares * self.signals['signal']
        return positions

    def backtest_portfolio(self):
        """Backtest the portfolio and return a DataFrame containing
        the equity curve and the percentage returns."""

        portfolio = pd.DataFrame(index=self.positions.index)

        # intraday price move of each bar; the first five bars are zeroed out
        portfolio['price_diff'] = self.bars['Close_Out'] - self.bars['Open_Out']
        portfolio['price_diff'][0:5] = 0.0
        portfolio['profit'] = self.positions[self.symbol] * portfolio['price_diff']

        portfolio['total'] = self.initial_capital + portfolio['profit'].cumsum()
        portfolio['returns'] = portfolio['total'].pct_change()
        return portfolio

--------------------------------------------------------------------------------