├── PYTHON
│   ├── downloader.py
│   ├── my_regression.py
│   ├── portfolio.py
│   └── regression.py
├── README.md
└── matlab
    ├── est_A.m
    ├── labor.m
    ├── maxeig.m
    └── mr_opt_w.m

/PYTHON/downloader.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# Python 2 script built against the legacy Quandl client and pandas < 0.18.
import Quandl
import datetime as dt
import cPickle
import pandas
from numpy import log, polyfit, sqrt, std, subtract

def hurst(ts):
    """Estimate the Hurst exponent of a time series."""
    # Range of lags to evaluate
    lags = range(2, 100)
    # Standard deviation of the lagged differences at each lag
    tau = [sqrt(std(subtract(ts[lag:], ts[:-lag]))) for lag in lags]
    # Fit a line to the log-log plot; the Hurst exponent is twice the slope
    poly = polyfit(log(lags), log(tau), 1)
    return poly[0] * 2.0

instruments = ['EURUSD', 'JPYUSD', 'GBPUSD', 'AUDUSD', 'CHFUSD', 'CADUSD',
               'HKDUSD', 'SEKUSD', 'NZDUSD', 'KRWUSD', 'SGDUSD', 'NOKUSD',
               'MXNUSD', 'INRUSD', 'JPYEUR', 'GBPEUR', 'AUDEUR', 'CHFEUR',
               'CADEUR', 'HKDEUR', 'SEKEUR', 'NZDEUR', 'KRWEUR', 'SGDEUR',
               'NOKEUR', 'MXNEUR', 'INREUR', 'GBPJPY', 'AUDJPY', 'CHFJPY',
               'CADJPY', 'SEKJPY', 'NZDJPY', 'NOKJPY', 'INRJPY', 'AUDGBP',
               'CHFGBP', 'CADGBP', 'HKDGBP', 'SEKGBP', 'NZDGBP', 'KRWGBP',
               'SGDGBP', 'NOKGBP', 'MXNGBP', 'INRGBP', 'CHFAUD', 'CADAUD',
               'HKDAUD', 'SEKAUD', 'NZDAUD', 'KRWAUD', 'SGDAUD', 'NOKAUD',
               'MXNAUD', 'INRAUD', 'CADCHF', 'SEKCHF', 'NZDCHF', 'NOKCHF',
               'INRCHF', 'SEKCAD', 'NZDCAD', 'NOKCAD', 'INRCAD', 'INRHKD',
               'NZDSEK', 'NOKSEK', 'INRSEK', 'NOKNZD', 'INRNZD', 'INRKRW',
               'INRSGD', 'INRNOK', 'INRMXN']

start = dt.datetime(1990, 1, 1)
end = dt.datetime(2010, 12, 31)
data = pandas.DataFrame()

print 'Download started...'
number_of_assets = 0
for symbol in instruments:
    fx = Quandl.get("CURRFX/" + symbol, authtoken="DHRfTADW3mz8jee-sRcb",
                    trim_start=start, trim_end=end).Rate
    print ". " + symbol + " downloaded"
    # Keep only the series that look mean-reverting (Hurst exponent < 0.5)
    if hurst(fx) < 0.5:
        data[symbol] = fx
        number_of_assets += 1
        print ".. " + symbol + " passed Hurst-test"
print 'Download completed.'
data.to_csv("fx_data_mean_rev.csv", header=True)
print 'Data modification...'

# Exponential moving averages over three horizons
# (pandas.stats.moments.ewma is the pre-0.18 API; newer pandas spells it
# data.ewm(span=...).mean())
emas = []
for span in (200, 50, 10):
    ema = pandas.stats.moments.ewma(data, span=span)
    ema.columns = [name + '_ema%d' % span for name in ema.columns]
    emas.append(ema)
    print '... ema(%d) calculated' % span

for ema in emas:
    data = data.join(ema)

data = data.dropna()
print '.. NaN values dropped'
data = data.as_matrix()  # convert to a numpy array
# Targets are the next-day values of the asset columns; the inputs are
# trimmed by one row so that data[i] predicts target[i]
target = data[1:, 0:number_of_assets]
data = data[:-1, :]
print '.. target variable set-up'
print 'Data modification completed'
# Truncate both arrays to the same multiple-of-50 length so that the
# minibatches divide evenly
n = data.shape[0] - data.shape[0] % 50
data = data[:n, :]
target = target[:n, :]
print data.shape
print target.shape
dataset = (data, target)
# The same split is reused as train/validation/test
full_dataset = (dataset, dataset, dataset)
print 'Dump output...'
filename = 'full_data.save'
out_file = file(filename, 'wb')
cPickle.dump(full_dataset, out_file)
out_file.close()
print 'Dump completed to ' + filename
--------------------------------------------------------------------------------
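The Hurst filter above keeps only series that look mean-reverting (H < 0.5). A minimal, self-contained sketch of the same estimator on synthetic data — numpy only, not part of the repository — showing how it separates a diffusive random walk (H near 0.5) from a strongly mean-reverting series (H near 0):

import numpy as np

def hurst(ts, max_lag=100):
    # Same estimator as in downloader.py: regress log(tau) on log(lag);
    # twice the slope approximates the Hurst exponent
    lags = np.arange(2, max_lag)
    tau = [np.sqrt(np.std(ts[lag:] - ts[:-lag])) for lag in lags]
    slope = np.polyfit(np.log(lags), np.log(tau), 1)[0]
    return 2.0 * slope

rng = np.random.RandomState(0)
random_walk = np.cumsum(rng.randn(2000))   # diffusive: H should be near 0.5
white_noise = rng.randn(2000)              # mean-reverting: H should be near 0
print('random walk  H = %.2f' % hurst(random_walk))
print('white noise  H = %.2f' % hurst(white_noise))

For a random walk the lagged differences grow like sqrt(lag), so the fitted slope is about 0.25 and H is about 0.5; for white noise the differences do not grow with the lag at all, which is what makes the 0.5 threshold a usable mean-reversion filter.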
/PYTHON/my_regression.py:
--------------------------------------------------------------------------------
import cPickle
import numpy
import theano
import theano.tensor as T

class Regression(object):
    """Linear regression y = xW + b, trained by minibatch gradient descent."""
    def __init__(self, input, n_in, n_out):
        self.W = theano.shared(value=numpy.zeros((n_in, n_out), dtype=theano.config.floatX), name='W', borrow=True)
        self.b = theano.shared(value=numpy.zeros((n_out,), dtype=theano.config.floatX), name='b', borrow=True)
        self.y_pred = T.dot(input, self.W) + self.b
        self.params = [self.W, self.b]

    def errors(self, y):
        # Sum of squared errors; flatten the (batch, n_out) prediction so it
        # broadcasts correctly against the target vector
        return T.sum((y - self.y_pred.flatten()) ** 2)

def load_data(dataset):
    print '... loading data'

    f = file(dataset, 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()
    # Each of train_set, valid_set and test_set is a tuple (input, target):
    # input is a 2-D numpy.ndarray whose rows are examples; target is a
    # numpy.ndarray of the same length giving the target for the example
    # with the same row index in the input.

    def shared_dataset(data_xy, borrow=True):
        # Load the dataset into Theano shared variables to reduce memory
        # read time
        data_x, data_y = data_xy
        shared_x = theano.shared(numpy.asarray(data_x, dtype=theano.config.floatX), borrow=borrow)
        shared_y = theano.shared(numpy.asarray(data_y, dtype=theano.config.floatX), borrow=borrow)
        return shared_x, shared_y

    test_set_x, test_set_y = shared_dataset(test_set)
    valid_set_x, valid_set_y = shared_dataset(valid_set)
    train_set_x, train_set_y = shared_dataset(train_set)

    rval = [(train_set_x, train_set_y), (valid_set_x, valid_set_y), (test_set_x, test_set_y)]
    return rval

def optimization(learning_rate=0.1, n_epochs=10, input_data='full_data.save', batch_size=10):
    datasets = load_data(input_data)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    n_train_batches = train_set_x.get_value(borrow=True).shape[0] / batch_size
    n_valid_batches = valid_set_x.get_value(borrow=True).shape[0] / batch_size
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    print '... building the model'
    index = T.lscalar()  # index of the minibatch

    x = T.fmatrix('x')  # input data
    y = T.fvector('y')  # target

    # n_in must match the number of input columns in the dataset
    classifier = Regression(input=x, n_in=4, n_out=1)
    cost = classifier.errors(y)

    test_model = theano.function(inputs=[index], outputs=classifier.errors(y), givens={x: test_set_x[index * batch_size: (index + 1) * batch_size], y: test_set_y[index * batch_size: (index + 1) * batch_size]})

    validate_model = theano.function(inputs=[index], outputs=classifier.errors(y), givens={x: valid_set_x[index * batch_size: (index + 1) * batch_size], y: valid_set_y[index * batch_size: (index + 1) * batch_size]})

    # compute the gradient of the cost with respect to theta = (W, b)
    g_W = T.grad(cost=cost, wrt=classifier.W)
    g_b = T.grad(cost=cost, wrt=classifier.b)

    # gradient-descent updates for the weights
    updates = [(classifier.W, classifier.W - learning_rate * g_W), (classifier.b, classifier.b - learning_rate * g_b)]

    # train the model with the updated weights
    train_model = theano.function(inputs=[index], outputs=cost, updates=updates, givens={x: train_set_x[index * batch_size: (index + 1) * batch_size], y: train_set_y[index * batch_size: (index + 1) * batch_size]})

    print '... training the model'
    patience = 5000
    patience_increase = 2
    improvement_threshold = 0.995
    validation_frequency = min(n_train_batches, patience / 2)
    best_validation_loss = numpy.inf
    test_score = 0.0
    done_looping = False
    epoch = 0
    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in xrange(n_train_batches):
            minibatch_avg_cost = train_model(minibatch_index)
            iter = (epoch - 1) * n_train_batches + minibatch_index
            if (iter + 1) % validation_frequency == 0:
                validation_losses = [validate_model(i) for i in xrange(n_valid_batches)]
                this_validation_loss = numpy.mean(validation_losses)
                print ('epoch %i, minibatch %i/%i, validation error %f %%' % (epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.))

                if this_validation_loss < best_validation_loss:
                    # a significant improvement buys more patience
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter * patience_increase)
                    best_validation_loss = this_validation_loss
                    test_losses = [test_model(i) for i in xrange(n_test_batches)]
                    test_score = numpy.mean(test_losses)
                    print((' epoch %i, minibatch %i/%i, test error of best model %f %%') % (epoch, minibatch_index + 1, n_train_batches, test_score * 100.))
            if patience <= iter:
                done_looping = True
                break
    print(('Optimization complete with best validation score of %f %%, ' 'with test performance %f %%') % (best_validation_loss * 100., test_score * 100.))

if __name__ == '__main__':
    optimization()
--------------------------------------------------------------------------------
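The Theano graph above compiles down to the plain gradient-descent updates W ← W − η·∇W and b ← b − η·∇b on a summed squared-error cost. A NumPy sketch of the same minibatch loop on synthetic data (function names and shapes here are illustrative, not the repository's API):

import numpy as np

def sgd_epoch(X, y, W, b, learning_rate=0.01, batch_size=10):
    """One pass of minibatch gradient descent on sum((XW + b - y)^2)."""
    n_batches = X.shape[0] // batch_size
    for i in range(n_batches):
        xb = X[i * batch_size:(i + 1) * batch_size]
        yb = y[i * batch_size:(i + 1) * batch_size]
        err = xb.dot(W) + b - yb            # forward pass, residuals
        g_W = 2.0 * xb.T.dot(err)           # d cost / d W
        g_b = 2.0 * err.sum(axis=0)         # d cost / d b
        W -= learning_rate * g_W
        b -= learning_rate * g_b
    return W, b

rng = np.random.RandomState(0)
X = rng.randn(100, 4)
true_W = rng.randn(4, 1)
y = X.dot(true_W) + 0.01 * rng.randn(100, 1)
W, b = np.zeros((4, 1)), np.zeros(1)
for epoch in range(10):
    W, b = sgd_epoch(X, y, W, b)
print('max |W - true_W| = %.4f' % np.abs(W - true_W).max())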
/PYTHON/portfolio.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
# MATLAB reference implementation:
#
# load('s.mat');
# G = cov(s);
# [T, N] = size(s);
# A = zeros(N, N);
# for t=2:1:T
#     A = A + pinv(s(t-1,:).'*s(t-1,:))*(s(t-1,:).'*s(t,:));
# end
# pinvg = pinv(G)^0.5;
# [v1, l1] = maxeig(pinvg.'*A*G*A.'*pinvg);
# w = real(pinvg*v1);

import csv
import time

import numpy as np
import pandas
from scipy import linalg

start_time = time.clock()

portfolio = 1000
fee = 1
reader = csv.reader(open(r"C:\s980.csv", "rb"), delimiter=',')
S = np.matrix(list(reader)).astype('double')
[T, N] = S.shape
G = np.cov(S, rowvar=False)

# Least-squares estimate of the VAR(1) transition matrix A,
# summed over t = 2..T
A = np.zeros((N, N))
for t in range(1, T):
    A = A + (linalg.pinv2((S[t-1, :].T).dot(S[t-1, :]))).dot((S[t-1, :].T).dot(S[t, :]))

# Whiten by G^(-1/2); the top eigenvector of the symmetric matrix X then
# gives the most predictable (mean-reverting) portfolio weights
pinvg = linalg.sqrtm(linalg.pinv2(G))
X = (pinvg.T).dot(A).dot(G).dot(A.T).dot(pinvg)
[maxEigenValue, maxEigenVector] = linalg.eigh(X, eigvals_only=False, eigvals=(N-1, N-1))
w = pinvg.real.dot(maxEigenVector.real)

# Trade on the predicted move of the weighted portfolio relative to the most
# recent value of its 10-period exponential moving average, paying a flat
# fee per trade
signal = float(w.T.dot(A[:, N-1]))
ema10 = pandas.stats.moments.ewma(A[:, N-1], span=10)[-1]
if signal > ema10:
    portfolio = portfolio + portfolio * signal - fee
else:
    portfolio = portfolio + portfolio * -signal - fee
end_time = time.clock()
print end_time - start_time
--------------------------------------------------------------------------------
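The eigen-step above maximizes the predictability ratio nu(w) = (w' A G A' w) / (w' G w): substituting v = G^(1/2) w turns this into an ordinary eigenproblem for X = G^(-1/2) A G A' G^(-1/2), which is what the eigh call solves. A self-contained numpy/scipy sketch on synthetic data (no CSV file needed; all names here are illustrative) checking that this whitened route agrees with solving the generalized eigenproblem B w = lambda G w directly:

import numpy as np
from scipy import linalg

rng = np.random.RandomState(0)
s = rng.randn(500, 5)                  # synthetic stand-in for the price matrix
G = np.cov(s, rowvar=False)
# VAR(1) transition estimate, mirroring est_A.m and the loop in portfolio.py
A = sum(np.linalg.pinv(np.outer(s[t-1], s[t-1])).dot(np.outer(s[t-1], s[t]))
        for t in range(1, s.shape[0]))

B = A.dot(G).dot(A.T)
# Direct generalized eigenproblem B w = lambda G w (eigh sorts ascending)
lam, W = linalg.eigh(B, G)
w_direct = W[:, -1]
# Whitened route: top eigenvector of G^(-1/2) B G^(-1/2), mapped back
pinvg = linalg.sqrtm(np.linalg.pinv(G)).real
v = linalg.eigh(pinvg.dot(B).dot(pinvg))[1][:, -1]
w_whitened = pinvg.dot(v)
# Both maximize the same predictability ratio (up to scale and sign)
nu = lambda w: w.dot(B).dot(w) / w.dot(G).dot(w)
print(nu(w_direct), nu(w_whitened))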
/PYTHON/regression.py:
--------------------------------------------------------------------------------
import theano
import numpy as N
from theano import tensor as T
from theano.tensor import nnet as NN
from theano.compile import module as M

class RegressionLayer(M.Module):
    def __init__(self, input = None, target = None, regularize = True):
        super(RegressionLayer, self).__init__()  # boilerplate
        # MODEL CONFIGURATION
        self.regularize = regularize
        # ACQUIRE/MAKE INPUT AND TARGET
        if not input:
            input = T.matrix('input')
        if not target:
            target = T.matrix('target')
        # HYPER-PARAMETERS
        self.stepsize = T.scalar()  # a stepsize for gradient descent
        # PARAMETERS
        self.w = T.matrix()  # the linear transform to apply to our input points
        self.b = T.vector()  # a vector of biases, which makes our transform affine instead of linear
        # REGRESSION MODEL
        self.activation = T.dot(input, self.w) + self.b
        self.prediction = self.build_prediction()
        # CLASSIFICATION COST
        self.classification_cost = self.build_classification_cost(target)
        # REGULARIZATION COST
        self.regularization = self.build_regularization()
        # TOTAL COST
        self.cost = self.classification_cost
        if self.regularize:
            self.cost = self.cost + self.regularization
        # GET THE GRADIENTS NECESSARY TO FIT OUR PARAMETERS
        self.grad_w, self.grad_b, grad_act = T.grad(self.cost, [self.w, self.b, self.prediction])
        print 'grads', self.grad_w, self.grad_b
        # INTERFACE METHODS
        self.update = M.Method([input, target], [self.cost, self.grad_w, self.grad_b, grad_act], updates={self.w: self.w - self.stepsize * self.grad_w, self.b: self.b - self.stepsize * self.grad_b})
        self.apply = M.Method(input, self.prediction)

    def params(self):
        return self.w, self.b

    def _instance_initialize(self, obj, input_size = None, target_size = None, seed = 1827, **init):
        # obj is an "instance" of this module holding values for each member
        # and functions for each method
        if input_size and target_size:
            # initialize w and b in a special way using input_size and target_size
            sz = (input_size, target_size)
            rng = N.random.RandomState(seed)
            obj.w = rng.uniform(size = sz, low = -0.5, high = 0.5)
            obj.b = N.zeros(target_size)
            obj.stepsize = 0.01
        # default_initialize takes all the name: value pairs in init and sets
        # the property with that name to the provided value; this covers
        # setting stepsize and l2_coef (w and b can be set that way too).
        # We call it last so that the passed-in parameters supersede the
        # defaults above.
        M.default_initialize(obj, **init)

    def build_regularization(self):
        return T.zero()  # no regularization!


class SpecifiedRegressionLayer(RegressionLayer):
    """Sigmoid regression with a summed squared-error cost."""
    def build_prediction(self):
        # return NN.softmax(self.activation)  # use this line to expose a
        # slow subtensor implementation
        return NN.sigmoid(self.activation)
    def build_classification_cost(self, target):
        self.classification_cost_matrix = (target - self.prediction) ** 2
        self.classification_costs = T.sum(self.classification_cost_matrix, axis=1)
        return T.sum(self.classification_costs)
    def build_regularization(self):
        self.l2_coef = T.scalar()  # we can add a hyper-parameter if we need to
        return self.l2_coef * T.sum(self.w * self.w)


class PrintEverythingMode(theano.Mode):
    def __init__(self, linker, optimizer=None):
        def print_eval(i, node, fn):
            print i, node, [input[0] for input in fn.inputs],
            fn()
            print [output[0] for output in fn.outputs]
        wrap_linker = theano.gof.WrapLinkerMany([linker], [print_eval])
        super(PrintEverythingMode, self).__init__(wrap_linker, optimizer)


def test_module_advanced_example():
    profmode = theano.ProfileMode(optimizer='fast_run', linker=theano.gof.OpWiseCLinker())
    # the verbose mode below overrides the profiling mode set above
    profmode = PrintEverythingMode(theano.gof.OpWiseCLinker(), 'fast_run')

    data_x = N.random.randn(4, 10)
    data_y = [[int(x)] for x in (N.random.randn(4) > 0)]

    model = SpecifiedRegressionLayer(regularize = False).make(input_size = 10, target_size = 1, stepsize = 0.1, mode=profmode)

    for i in xrange(1000):
        xe, gw, gb, ga = model.update(data_x, data_y)
        if i % 100 == 0:
            print i, xe
    # for inputs, targets in my_training_set():
    #     print "cost:", model.update(inputs, targets)

    print "final weights:", model.w
    print "final biases:", model.b

    profmode.print_summary()

if __name__ == '__main__':
    test_module_advanced_example()
--------------------------------------------------------------------------------
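SpecifiedRegressionLayer's cost is the summed squared error of a sigmoid output plus an optional L2 penalty; Theano derives the gradients symbolically. A NumPy sketch of the same cost with hand-derived gradients, using the module's default shapes and seed (the helper names here are illustrative, not the module's API):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def cost_and_grads(X, y, w, b, l2_coef=0.0):
    """sum((y - sigmoid(Xw + b))^2) + l2_coef * sum(w^2) and its gradients."""
    p = sigmoid(X.dot(w) + b)              # prediction, as in build_prediction
    err = p - y
    cost = (err ** 2).sum() + l2_coef * (w ** 2).sum()
    # chain rule: d cost / d activation = 2 * err * p * (1 - p)
    d_act = 2.0 * err * p * (1.0 - p)
    g_w = X.T.dot(d_act) + 2.0 * l2_coef * w
    g_b = d_act.sum(axis=0)
    return cost, g_w, g_b

rng = np.random.RandomState(1827)          # same seed as the module default
X = rng.randn(4, 10)
y = (rng.randn(4, 1) > 0).astype(float)
w = rng.uniform(-0.5, 0.5, size=(10, 1))
b = np.zeros(1)
for i in range(1000):
    cost, g_w, g_b = cost_and_grads(X, y, w, b)
    w -= 0.1 * g_w                         # stepsize 0.1, as in the test
    b -= 0.1 * g_b
    if i % 100 == 0:
        print(i, cost)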
/README.md:
--------------------------------------------------------------------------------
# Stock-Market-Time-Series-Data-Mining-Using-Deep-Learning-Algorithms
In this project we try to predict future prices from historical prices using data mining and time-series analysis methods. Based on the retrieved information, a portfolio is then assembled that provides the maximal expected profit. Deep neural networks are used for prediction.
--------------------------------------------------------------------------------
/matlab/est_A.m:
--------------------------------------------------------------------------------
function A = est_A(s)
% Least-squares estimate of the VAR(1) transition matrix of the series s
[T, N] = size(s);
A = zeros(N, N);
for t=2:1:T
    A = A + pinv(s(t-1,:).'*s(t-1,:))*(s(t-1,:).'*s(t,:));
end
end
--------------------------------------------------------------------------------
/matlab/labor.m:
--------------------------------------------------------------------------------
tic;
load('s.mat');
G = cov(s);          % asset covariance
A = est_A(s);        % VAR(1) transition matrix
w = mr_opt_w(A, G);  % most predictable portfolio weights
toc
--------------------------------------------------------------------------------
/matlab/maxeig.m:
--------------------------------------------------------------------------------
function [v1, l1] = maxeig(M)
% Largest eigenvalue l1 of M and its eigenvector v1
[V, D] = eig(M);
[l1, l1place] = max(diag(D));
v1 = V(:, l1place);
end
--------------------------------------------------------------------------------
/matlab/mr_opt_w.m:
--------------------------------------------------------------------------------
function w = mr_opt_w(A, G)
% Portfolio weights maximizing predictability: the top eigenvector of
% G^(-1/2)*A*G*A.'*G^(-1/2), mapped back through G^(-1/2)
pinvg = pinv(G)^0.5;
[v1, l1] = maxeig(pinvg.'*A*G*A.'*pinvg);
w = real(pinvg*v1);
end
--------------------------------------------------------------------------------
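maxeig.m picks the largest eigenvalue off the diagonal of D and returns the matching column of V. A quick NumPy counterpart on a synthetic symmetric matrix — an illustration only, not part of the repository — verifying that the selected pair satisfies the eigenvector equation:

import numpy as np

rng = np.random.RandomState(0)
M = rng.randn(4, 4)
M = (M + M.T) / 2.0                   # symmetric, so the eigenvalues are real
vals, vecs = np.linalg.eig(M)
idx = np.argmax(vals.real)            # counterpart of max(diag(D)) in maxeig.m
l1, v1 = vals[idx].real, vecs[:, idx].real
print(np.allclose(M.dot(v1), l1 * v1))   # the pair satisfies M*v1 = l1*v1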