├── input
│   ├── symbols.txt
│   └── params.txt
├── Report.pdf
├── screenshots
│   └── presentation.gif
├── Documents
│   ├── SMAIProjectAbstract.pdf
│   └── StockPricePrediction.pdf
├── requirements.txt
├── scripts
│   ├── normalization.py
│   ├── interpolation.py
│   ├── main.py
│   ├── add_s_and_p_index.py
│   ├── feature_selection.py
│   ├── preprocessing.py
│   ├── Algorithms
│   │   ├── svm.py
│   │   ├── regression_models.py
│   │   ├── LSTN-RNN.py
│   │   ├── rnn_lstm.py
│   │   ├── regression_helpers.py
│   │   └── Neural_Network.py
│   └── fetch_stock_data.py
├── .gitignore
├── LICENSE
└── README.md
/input/symbols.txt: -------------------------------------------------------------------------------- 1 | FB 2 | GOOG 3 | -------------------------------------------------------------------------------- /Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scorpionhiccup/StockPricePrediction/HEAD/Report.pdf -------------------------------------------------------------------------------- /screenshots/presentation.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scorpionhiccup/StockPricePrediction/HEAD/screenshots/presentation.gif -------------------------------------------------------------------------------- /Documents/SMAIProjectAbstract.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scorpionhiccup/StockPricePrediction/HEAD/Documents/SMAIProjectAbstract.pdf -------------------------------------------------------------------------------- /Documents/StockPricePrediction.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scorpionhiccup/StockPricePrediction/HEAD/Documents/StockPricePrediction.pdf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cycler==0.10.0 2 | matplotlib==1.5.1 3 | numpy==1.11.0 4 | pandas==0.18.0 5 | pyparsing==2.1.1 6 | python-dateutil==2.5.3 7 | pytz==2016.4 8 | scikit-learn==0.17.1 9 | scipy==0.17.0 10 | six==1.10.0 11 | -------------------------------------------------------------------------------- /scripts/normalization.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | ''' 3 | Data Normalization 4 | ''' 5 | 6 | from sklearn import preprocessing 7 | 8 | def normalize(file_dataframe, cols): 9 | ''' 10 | Scale each column in cols to unit L2 norm. 11 | ''' 12 | 13 | for col in cols: 14 | file_dataframe[col] = preprocessing.normalize( \ 15 | file_dataframe[col].values.reshape(1, -1), norm='l2')[0] 16 | 17 | return file_dataframe -------------------------------------------------------------------------------- /scripts/interpolation.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | ''' 3 | Data Interpolation 4 | ''' 5 | 6 | import os, sys 7 | import pandas as pd 8 | 9 | def interpolate(dataframe, cols_to_interpolate): 10 | 11 | for col in cols_to_interpolate: 12 | dataframe[col] = dataframe[col].interpolate('spline', order=2) 13 | 14 | return dataframe 15 | 16 | 17 | def main(dir_path): 18 | files = os.listdir(dir_path) 19 | for file_name in files: 20 | dataframe = pd.read_csv(os.path.join(dir_path, file_name)) 21 | dataframe = interpolate(dataframe, \ 22 | ['high', 'open', 'low', 'close', 'volume', 'adj_close']) 23 | print dataframe 24 | 25 | break 26 | 27 | 28 | if __name__=="__main__": 29 | main(sys.argv[1]) 30 | -------------------------------------------------------------------------------- /scripts/main.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | ''' 3 | Main File. 4 | ''' 5 | import os 6 | import sys 7 | import pandas as pd 8 | 9 | from interpolation import interpolate 10 | from normalization import normalize 11 | 12 | 13 | def main(dir_path, output_dir): 14 | ''' 15 | Run Pipeline of processes on file one by one. 16 | ''' 17 | files = os.listdir(dir_path) 18 | 19 | for file_name in files: 20 | 21 | file_dataframe = pd.read_csv(os.path.join(dir_path, file_name)) 22 | 23 | cols = ['high', 'open', 'low', 'close', 'volume', 'adj_close'] 24 | 25 | file_dataframe = interpolate(file_dataframe, cols) 26 | 27 | file_dataframe = normalize(file_dataframe, cols) 28 | 29 | file_dataframe.to_csv( 30 | os.path.join(output_dir, file_name), encoding='utf-8') 31 | 32 | if __name__ == '__main__': 33 | main(sys.argv[1], sys.argv[2]) 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | input/non_params.txt 2 | scripts/.scrapy/* 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *,cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | 58 | # Sphinx documentation 59 | docs/_build/ 60 | 61 | # PyBuilder 62 | target/ 63 | 64 | #Ipython Notebook 65 | .ipynb_checkpoints 66 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Sharvil Katariya 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /scripts/add_s_and_p_index.py: -------------------------------------------------------------------------------- 1 | import os, sys, csv 2 | import math 3 | import pprint 4 | 5 | def roundup(var): 6 | return float(format(var, '.6f')) 7 | 8 | def main(dir_path, sp_index_file, output_dir): 9 | files = os.listdir(dir_path) 10 | 11 | for file_name in files: 12 | with open( os.path.join(dir_path, file_name), 'r') as textfile: 13 | 14 | new_file = open(os.path.join(output_dir, file_name), 'w+') 15 | 16 | new_list = [] 17 | new_list.append(['symbol','date','open','high','low','close','volume','adj_close', 'prev_day_diff', '50_day_moving_avg', '10_day_volatility', 18 | 's&p_index_open', 's&p_index_high', 's&p_index_low', 's&p_index_close', 's&p_index_volume', 's&p_index_adj_close']) 19 | 20 | dict_mapping = {} 21 | 22 | for count, row in enumerate(reversed(list(csv.reader(textfile)))): 23 | if str(row[0])=="symbol": 24 | break 25 | 26 | date = str(row[1]) 27 | dict_mapping[date] = row 28 | 29 | """ 30 | Extend to Existing Key-Value in dict_mapping dictionary. 
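Each stock row is keyed by its date above; the S&P 500 row with the same date is appended below, which fills the s&p_index_* columns declared in the header row.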
31 | """ 32 | 33 | with open(sp_index_file, 'r') as sp_index_fp: 34 | for count2, row2 in enumerate(reversed(list(csv.reader(sp_index_fp)))): 35 | if str(row2[0]) in dict_mapping: 36 | dict_mapping[str(row2[0])].extend(row2[1:]) 37 | 38 | #pprint.pprint(dict_mapping, width=1) 39 | 40 | for key in sorted(dict_mapping): 41 | new_list.append(dict_mapping[key]) 42 | 43 | writer = csv.writer(new_file) 44 | writer.writerows(new_list) 45 | new_file.close() 46 | textfile.close() 47 | 48 | if __name__ == '__main__': 49 | main(str(sys.argv[1]), str(sys.argv[2]), str(sys.argv[3])) -------------------------------------------------------------------------------- /input/params.txt: -------------------------------------------------------------------------------- 1 | enterprise_value 2 | pe_ratio 3 | pe_10 4 | peg_ratio 5 | earning_yield 6 | ps_ratio 7 | price_to_book_value 8 | ev_revenues 9 | ev_ebit 10 | operating_earning_yield 11 | shares_outstanding 12 | dividend 13 | dividend_yield 14 | cash_dividend_payout_ratio 15 | payout_ratio 16 | gross_profit_margin 17 | profit_margin 18 | ebitda_margin_ttm 19 | operating_margin_ttm 20 | asset_utilization 21 | days_sales_outstanding 22 | days_payables_outstanding 23 | receivables_turnover 24 | return_on_assets 25 | return_on_equity 26 | return_on_invested_capital 27 | altman_z_score 28 | current_ratio 29 | debt_equity_ratio 30 | free_cash_flow 31 | kz_index 32 | tangible_common_equity_ratio 33 | times_interest_earned 34 | total_employee_number 35 | revenue_per_employee_annual 36 | ni_per_employee_annual 37 | market_beta_60_month 38 | one_month_return 39 | three_month_return 40 | six_month_return 41 | ytd_return 42 | one_year_return 43 | three_year_return 44 | year_high 45 | year_low 46 | revenues_ttm 47 | revenues_per_share 48 | revenues_growth 49 | eps_ttm 50 | eps_growth 51 | net_income_ttm 52 | cash_financing_ttm 53 | cash_investing_ttm 54 | cash_operations_ttm 55 | capex 56 | cash_on_hand 57 | long_term_debt 58 | assets 59 | liabilities 60 | shareholders_equity 61 | book_value_of_equity_per_share 62 | book_value_of_tangible_equity_per_share 63 | accruals 64 | eps_est_0q 65 | eps_est_0y 66 | forward_pe_ratio 67 | forward_pe_ratio_1y 68 | forward_ps_ratio 69 | forward_ps_ratio_1y 70 | net_income_cs_rev 71 | net_income_annual_cs_rev 72 | max_drawdown_all 73 | historical_daily_var_1_all 74 | historical_daily_var_5_all 75 | historical_monthly_var_5_all 76 | historical_monthly_var_1_all 77 | ca_score 78 | f_score_ttm 79 | fulmer_h_score 80 | graham_number 81 | ncavps 82 | ohlson_score 83 | quality_ratio 84 | springate_score 85 | sustainable_growth_rate 86 | tobin_q 87 | market_cap_fractile 88 | quality_ratio_fractile -------------------------------------------------------------------------------- /scripts/feature_selection.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | import sys 3 | import os 4 | import csv 5 | import pandas 6 | 7 | from sklearn.feature_selection import RFECV 8 | from sklearn.svm import SVR 9 | from sklearn.linear_model import LinearRegression 10 | from sklearn.cross_validation import StratifiedKFold 11 | import numpy as np 12 | 13 | def conv(s): 14 | try: 15 | s=float(s) 16 | except ValueError: 17 | pass 18 | return s 19 | 20 | def main(dir_path): 21 | 22 | files = os.listdir(dir_path) 23 | 24 | X = [] 25 | y = [] 26 | 27 | ranking = [] 28 | 29 | for file_name in files: 30 | with open( os.path.join(dir_path, file_name), 'r') as textfile: 31 | data = pandas.read_csv(os.path.join(dir_path, file_name), header=0) 32 | #reader = csv.reader(textfile) 33 | #next(reader, None) 34 | 35 | start_test = datetime.datetime(2005, 1, 1) 36 | 37 | col = list(data.adj_close) 38 | 39 | X = [ col[:2] ] 40 | y = [ col[-1], col[-2] ] 41 | 42 | print X 43 | print y 44 | 45 | ''' 46 | 47 | for row in reader: 48 | 49 | if any(row[key] in (None, "") for key in range(len(row))): 50 | continue 51 | 52 | temp = row[2:7] + row[9:] 53 | 54 | for i in range(len(temp)): 55 | try: 56 | temp[i] = float(temp[i]) 57 | except Exception, e: 58 | print temp[i] 59 | print temp 60 | print file_name 61 | print row 62 | raise e 63 | 64 | X.append(temp) 65 | y.append(float(row[8])) 66 | ''' 67 | 68 | X=np.array(X, np.float64) 69 | y=np.array(y, np.float64) 70 | 71 | estimator = LinearRegression() 72 | selector = RFECV(estimator, step=1, cv=StratifiedKFold(y, 2)) 73 | 74 | selector = selector.fit(X, y) 75 | 76 | ''' 77 | except Exception, e: 78 | print X 79 | print y 80 | raise e 81 | ''' 82 | 83 | X = [] 84 | y = [] 85 | 86 | if len(ranking)!=0: 87 | ranking = [sum(x) for x in zip(ranking, selector.ranking_)] 88 | else: 89 | ranking = selector.ranking_ 90 | 91 | print ranking 92 | 93 | print ranking 94 | 95 | if __name__ == '__main__': 96 | main(sys.argv[1]) 97 | -------------------------------------------------------------------------------- /scripts/preprocessing.py: -------------------------------------------------------------------------------- 1 | import os, sys, csv 2 | import math 3 | 4 | def roundup(var): 5 | return float(format(var, '.6f')) 6 | 7 | def main(dir_path, output_dir): 8 | files = os.listdir(dir_path) 9 | for file_name in files: 10 | with open( os.path.join(dir_path, file_name), 'r') as textfile: 11 | new_file = open(os.path.join(output_dir, file_name), 'w+') 12 | new_list = [] 13 | 14 | prev = 0.0 15 | diff = 0.0 16 | avg = 0.0 17 | num_moving_avg = 50 18 | volatile_avg = 0.0 19 | num_volatile = 10 20 | curr_volatility = 0.0 21 | 22 | for count, row in enumerate(reversed(list(csv.reader(textfile)))): 23 | if not count: 24 | try: 25 | row[8]=prev 26 | except Exception, e: 27 | row.append(prev) 28 | else: 29 | diff = roundup(float(row[7]) - float(prev)) 30 | try: 31 | row[8]=diff 32 | except Exception, e: 33 | row.append(diff) 34 | 35 | if count 11 | ``` 12 | 13 | Download the Dataset needed for running the code from [here](https://drive.google.com/open?id=0B2lCmt16L_r3SUtrTjBlRHk3d1E). 14 | 15 | ## Project Concept Video 16 | [![Project Concept Video](screenshots/presentation.gif)](https://www.youtube.com/watch?v=z6U0OKGrhy0) 17 | 18 | ### Methodology 19 | 1. Preprocessing and Cleaning 20 | 2. Feature Extraction 21 | 3. Twitter Sentiment Analysis and Score 22 | 4. Data Normalization 23 | 5. Analysis of various supervised learning methods 24 | 6. 
Conclusions 25 | 26 | ### Research Paper 27 | - [Machine Learning in Stock Price Trend Forecasting. Yuqing Dai, Yuning Zhang](http://cs229.stanford.edu/proj2013/DaiZhang-MachineLearningInStockPriceTrendForecasting.pdf) 28 | - [Stock Market Forecasting Using Machine Learning Algorithms. Shunrong Shen, Haomiao Jiang. Department of Electrical Engineering. Stanford University](http://cs229.stanford.edu/proj2012/ShenJiangZhang-StockMarketForecastingusingMachineLearningAlgorithms.pdf) 29 | - [How can machine learning help stock investment?, Xin Guo](http://cs229.stanford.edu/proj2015/009_report.pdf) 30 | 31 | 32 | ### Datasets used 33 | 1. http://www.nasdaq.com/ 34 | 2. https://in.finance.yahoo.com 35 | 3. https://www.google.com/finance 36 | 37 | 38 | ### Useful Links 39 | - **Slides**: http://www.slideshare.net/SharvilKatariya/stock-price-trend-forecasting-using-supervised-learning 40 | - **Video**: https://www.youtube.com/watch?v=z6U0OKGrhy0 41 | - **Report**: https://github.com/scorpionhiccup/StockPricePrediction/blob/master/Report.pdf 42 | 43 | ### References 44 | - [Recurrent Neural Networks - LSTM Models](http://colah.github.io/posts/2015-08-Understanding-LSTMs/) 45 | - [ARIMA Models](http://people.duke.edu/~rnau/411arim.htm) 46 | - https://github.com/dv-lebedev/google-quote-downloader 47 | - [Book Value](http://www.investopedia.com/terms/b/bookvalue.asp) 48 | - http://www.investopedia.com/articles/basics/09/simplified-measuring-interpreting-volatility.asp 49 | - [Volatility](http://www.stock-options-made-easy.com/volatility-index.html) 50 | - https://github.com/dzitkowskik/StockPredictionRNN 51 | - [Scikit-Learn](http://scikit-learn.org/stable/) 52 | - [Theano](http://deeplearning.net/software/theano/) 53 | -------------------------------------------------------------------------------- /scripts/Algorithms/svm.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | ''' 3 | Running Support Vector Regression Model. 
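Usage: python svm.py <dir_path>, where dir_path holds per-symbol CSV files with at least 'date' and 'open' columns.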
4 | ''' 5 | from __future__ import print_function 6 | 7 | import os 8 | import sys 9 | import pandas as pd 10 | from sklearn.svm import SVR 11 | from sklearn import cross_validation 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | from datetime import datetime 15 | from sklearn.cross_validation import train_test_split 16 | 17 | def convert_to_integer(dt_time): 18 | return 10000*dt_time.year + 1000*dt_time.month + dt_time.day 19 | 20 | 21 | def preprocess(file_dataframe, cols=['date', 'open']): 22 | 23 | if 'date' in cols: 24 | file_dataframe['date'] = file_dataframe['date'].apply(convert_to_integer) 25 | 26 | X = file_dataframe['open'] 27 | y = file_dataframe['date'] 28 | 29 | return X, y 30 | 31 | 32 | def svm(file_dataframe, test_size=0.2, cols=['date', 'open']): 33 | ''' 34 | Run Support Vector Regression 35 | ''' 36 | 37 | print('Loading data...') 38 | 39 | if 'date' in file_dataframe: 40 | file_dataframe['new_col'] = pd.to_datetime(file_dataframe['date']).astype(datetime) 41 | #file_dataframe['date'] = pd.to_datetime(file_dataframe['date']) 42 | file_dataframe['new_col'] = file_dataframe['new_col'].apply(lambda dt_time:10000*dt_time.year + 1000*dt_time.month + dt_time.day).astype(int) 43 | 44 | print(file_dataframe['new_col']) 45 | 46 | X = file_dataframe['open'] 47 | y = file_dataframe['new_col'] 48 | 49 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size) 50 | 51 | #svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1) 52 | svr_lin = SVR(kernel='linear', C=1e3) 53 | #svr_poly = SVR(kernel='poly', C=1e3, degree=2) 54 | 55 | #parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]} 56 | 57 | loo = cross_validation.LeaveOneOut(len(y_test)) 58 | #clf = grid_search.GridSearchCV(svr_rbf, parameters) 59 | scores = [] 60 | 61 | #svr_rbf.fit(X_train, y_train) 62 | svr_lin.fit(X_train, y_train) 63 | #svr_poly.fit(X_train, y_train) 64 | 65 | #scores.append(cross_validation.cross_val_score(svr_rbf, \ 66 | # X_test, y_test, scoring='mean_squared_error', cv=loo).mean()) 67 | scores.append(cross_validation.cross_val_score(svr_lin, \ 68 | X_test, y_test, scoring='mean_squared_error', cv=loo).mean()) 69 | #scores.append(cross_validation.cross_val_score(svr_poly, \ 70 | # X_test, y_test, scoring='mean_squared_error', cv=loo).mean()) 71 | 72 | return scores 73 | 74 | def main(dir_path): 75 | ''' 76 | Run Pipeline of processes on file one by one. 77 | ''' 78 | files = os.listdir(dir_path) 79 | 80 | for file_name in files: 81 | print(file_name) 82 | 83 | file_dataframe = pd.read_csv(os.path.join(dir_path, file_name), parse_dates=[1]) 84 | 85 | print(svm(file_dataframe, 0.2, 'high')) 86 | 87 | break 88 | 89 | if __name__ == '__main__': 90 | main(sys.argv[1]) 91 | -------------------------------------------------------------------------------- /scripts/Algorithms/regression_models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Run Stock-Regression Algorithms 4 | """ 5 | from __future__ import print_function 6 | from regression_helpers import load_dataset, addFeatures, \ 7 | mergeDataframes, count_missing, applyTimeLag, performRegression 8 | import sys 9 | import os 10 | import pickle 11 | import traceback 12 | 13 | def main(dir_path, output_dir): 14 | ''' 15 | Run Pipeline of processes on file one by one. 
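Usage: python regression_models.py <input_dir> <output_dir>; every CSV in input_dir is processed, and scores.pickle plus the per-model plots are written to output_dir.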
16 | ''' 17 | 18 | scores = {} 19 | 20 | files = os.listdir(dir_path) 21 | 22 | maxdelta = 30 23 | 24 | delta = range(8, maxdelta) 25 | print('Delta days accounted: ', max(delta)) 26 | 27 | for file_name in files: 28 | try: 29 | symbol = file_name.split('.')[0] 30 | print(symbol) 31 | 32 | datasets = load_dataset(dir_path, file_name) 33 | 34 | for dataset in datasets: 35 | columns = dataset.columns 36 | adjclose = columns[-2] 37 | returns = columns[-1] 38 | for dele in delta: 39 | addFeatures(dataset, adjclose, returns, dele) 40 | dataset = dataset.iloc[max(delta):,:] # computation of returns and moving means introduces NaN which are nor removed 41 | 42 | finance = mergeDataframes(datasets) 43 | 44 | high_value = 365 45 | high_value = min(high_value, finance.shape[0] - 1) 46 | 47 | lags = range(high_value, 30) 48 | print('Maximum time lag applied', high_value) 49 | 50 | if 'symbol' in finance.columns: 51 | finance.drop('symbol', axis=1, inplace=True) 52 | 53 | print('Size of data frame: ', finance.shape) 54 | print('Number of NaN after merging: ', count_missing(finance)) 55 | 56 | finance = finance.interpolate(method='time') 57 | print('Number of NaN after time interpolation: ', finance.shape[0]*finance.shape[1] - finance.count().sum()) 58 | 59 | finance = finance.fillna(finance.mean()) 60 | print('Number of NaN after mean interpolation: ', (finance.shape[0]*finance.shape[1] - finance.count().sum())) 61 | 62 | finance.columns = [str(col.replace('&', '_and_')) for col in finance.columns] 63 | 64 | #Move the Open Values behind by one dataset. 65 | finance.open = finance.open.shift(-1) 66 | 67 | print(high_value) 68 | finance = applyTimeLag(finance, [high_value], delta) 69 | 70 | print('Number of NaN after temporal shifting: ', count_missing(finance)) 71 | print('Size of data frame after feature creation: ', finance.shape) 72 | 73 | mean_squared_errors, r2_scores = performRegression(finance, 0.95, \ 74 | symbol, output_dir) 75 | 76 | scores[symbol] = [mean_squared_errors, r2_scores] 77 | except Exception, e: 78 | pass 79 | traceback.print_exc() 80 | 81 | with open(os.path.join(output_dir, 'scores.pickle'), 'wb') as handle: 82 | pickle.dump(scores, handle) 83 | 84 | if __name__ == '__main__': 85 | main(sys.argv[1], sys.argv[2]) 86 | -------------------------------------------------------------------------------- /scripts/fetch_stock_data.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | import sys 3 | import os 4 | import csv 5 | import time 6 | import datetime 7 | 8 | from ychartspy.client import YChartsClient 9 | 10 | def convert(timestamp): 11 | return datetime.datetime.fromtimestamp(int(timestamp) / 1e3).strftime('%Y-%m-%d') 12 | 13 | def main(symbol_file, parameter_file, output_dir): 14 | ''' 15 | 16 | Params: 17 | ------------------------ 18 | parameter_file: This is the path to the file that contains the information that you want to fetch via the stock prediction api like ps_ratio, dividend_yield, etc. An example for this file is in the input directory of the project source directory. 19 | symbol_file: This is path to text file with the symbols that you want the output of seperated by newlines. 20 | output_dir: path to output directory. In this directory the files will be created and saved as symbol_1.csv, symbol_2.csv, etc. 
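Example (run from the scripts directory): python fetch_stock_data.py ../input/symbols.txt ../input/params.txt <output_dir>. Note that the symbol file is the first command-line argument.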
21 | ''' 22 | param_fp = open(parameter_file, 'r') 23 | 24 | param_list = [] 25 | 26 | count = {} 27 | 28 | for parameter in param_fp: 29 | param_list.append(parameter.strip()) 30 | count[parameter.strip()]=0 31 | 32 | client = YChartsClient() 33 | 34 | error_count = {} 35 | 36 | with open(symbol_file, 'r') as sym_fp: 37 | for symbol in list(csv.reader(sym_fp)): 38 | row_info = {} 39 | symbol = symbol[0].strip() 40 | 41 | to_write = [] 42 | to_write.append(['symbol', 'timestamp']) 43 | 44 | non_params = [] 45 | 46 | print symbol 47 | 48 | for parameter in param_list: 49 | parameter=parameter.strip() 50 | to_write[0].append(parameter) 51 | 52 | try: 53 | row = client.get_security_metric(symbol, parameter, start_date="01/01/1900") 54 | except Exception, e: 55 | if parameter in error_count: 56 | error_count[parameter]+=1 57 | else: 58 | error_count[parameter]=1 59 | non_params.append(parameter) 60 | continue 61 | 62 | for row_obj in row: 63 | if row_obj[0] not in row_info: 64 | row_info[row_obj[0]] = {} 65 | row_info[row_obj[0]][str(parameter)]=row_obj[1] 66 | 67 | if count[parameter]==0: 68 | count[parameter]=1 69 | 70 | 71 | new_file = open(os.path.join(output_dir, str(symbol) + '.csv'), 'w+') 72 | 73 | for key in sorted(row_info): 74 | temp = [] 75 | temp.append(str(symbol)) 76 | temp.append(convert(key)) 77 | 78 | for parameter in param_list: 79 | parameter=str(parameter) 80 | 81 | if count[parameter]==0: 82 | param_list.remove(parameter) 83 | to_write[0].remove(parameter) 84 | continue 85 | 86 | if parameter in row_info[key]: 87 | #print 'HERE', parameter, key 88 | temp.append(row_info[key][parameter]) 89 | #to_write[-1].append(row_info[key][str(parameter)]) 90 | else: 91 | #print 'NOT ', parameter, row_info[key] 92 | temp.append('NaN') 93 | 94 | to_write.append(temp) 95 | 96 | #print to_write[-1], len(to_write[-1]) 97 | 98 | #row_info[key].insert(0, convert(key)) 99 | #row_info[key].insert(0, str(symbol)) 100 | #to_write.append(row_info[key]) 101 | 102 | writer = csv.writer(new_file) 103 | writer.writerows(to_write) 104 | new_file.close() 105 | 106 | ''' 107 | for key in error_count: 108 | if error_count[key]==7: 109 | print key 110 | ''' 111 | #print non_params 112 | 113 | if __name__ == '__main__': 114 | main(str(sys.argv[1]), str(sys.argv[2]), str(sys.argv[3])) 115 | -------------------------------------------------------------------------------- /scripts/Algorithms/LSTN-RNN.py: -------------------------------------------------------------------------------- 1 | ### Incomplete 2 | 3 | import copy, numpy as np 4 | np.random.seed(0) 5 | 6 | # compute sigmoid nonlinearity 7 | def sigmoid(x): 8 | output = 1/(1+np.exp(-x)) 9 | return output 10 | 11 | # convert output of sigmoid function to its derivative 12 | def sigmoid_output_to_derivative(output): 13 | return output*(1-output) 14 | 15 | 16 | # training dataset generation 17 | int2binary = {} 18 | binary_dim = 8 19 | 20 | largest_number = pow(2,binary_dim) 21 | binary = np.unpackbits( 22 | np.array([range(largest_number)],dtype=np.uint8).T,axis=1) 23 | for i in range(largest_number): 24 | int2binary[i] = binary[i] 25 | 26 | 27 | # input variables 28 | alpha = 0.1 29 | input_dim = 2 30 | hidden_dim = 16 31 | output_dim = 1 32 | 33 | 34 | # initialize neural network weights 35 | synapse_0 = 2*np.random.random((input_dim,hidden_dim)) - 1 36 | synapse_1 = 2*np.random.random((hidden_dim,output_dim)) - 1 37 | synapse_h = 2*np.random.random((hidden_dim,hidden_dim)) - 1 38 | 39 | synapse_0_update = np.zeros_like(synapse_0) 40 | synapse_1_update 
= np.zeros_like(synapse_1) 41 | synapse_h_update = np.zeros_like(synapse_h) 42 | 43 | # training logic 44 | for j in range(10000): 45 | 46 | # generate a simple addition problem (a + b = c) 47 | a_int = np.random.randint(largest_number/2) # int version 48 | a = int2binary[a_int] # binary encoding 49 | 50 | b_int = np.random.randint(largest_number/2) # int version 51 | b = int2binary[b_int] # binary encoding 52 | 53 | # true answer 54 | c_int = a_int + b_int 55 | c = int2binary[c_int] 56 | 57 | # where we'll store our best guess (binary encoded) 58 | d = np.zeros_like(c) 59 | 60 | overallError = 0 61 | 62 | layer_2_deltas = list() 63 | layer_1_values = list() 64 | layer_1_values.append(np.zeros(hidden_dim)) 65 | 66 | # moving along the positions in the binary encoding 67 | for position in range(binary_dim): 68 | 69 | # generate input and output 70 | X = np.array([[a[binary_dim - position - 1],b[binary_dim - position - 1]]]) 71 | y = np.array([[c[binary_dim - position - 1]]]).T 72 | 73 | # hidden layer (input ~+ prev_hidden) 74 | layer_1 = sigmoid(np.dot(X,synapse_0) + np.dot(layer_1_values[-1],synapse_h)) 75 | 76 | # output layer (new binary representation) 77 | layer_2 = sigmoid(np.dot(layer_1,synapse_1)) 78 | 79 | # did we miss?... if so, by how much? 80 | layer_2_error = y - layer_2 81 | layer_2_deltas.append((layer_2_error)*sigmoid_output_to_derivative(layer_2)) 82 | overallError += np.abs(layer_2_error[0]) 83 | 84 | # decode estimate so we can print it out 85 | d[binary_dim - position - 1] = np.round(layer_2[0][0]) 86 | 87 | # store hidden layer so we can use it in the next timestep 88 | layer_1_values.append(copy.deepcopy(layer_1)) 89 | 90 | future_layer_1_delta = np.zeros(hidden_dim) 91 | 92 | for position in range(binary_dim): 93 | 94 | X = np.array([[a[position],b[position]]]) 95 | layer_1 = layer_1_values[-position-1] 96 | prev_layer_1 = layer_1_values[-position-2] 97 | 98 | # error at output layer 99 | layer_2_delta = layer_2_deltas[-position-1] 100 | # error at hidden layer 101 | layer_1_delta = (future_layer_1_delta.dot(synapse_h.T) + layer_2_delta.dot(synapse_1.T)) * sigmoid_output_to_derivative(layer_1) 102 | 103 | # let's update all our weights so we can try again 104 | synapse_1_update += np.atleast_2d(layer_1).T.dot(layer_2_delta) 105 | synapse_h_update += np.atleast_2d(prev_layer_1).T.dot(layer_1_delta) 106 | synapse_0_update += X.T.dot(layer_1_delta) 107 | 108 | future_layer_1_delta = layer_1_delta 109 | 110 | 111 | synapse_0 += synapse_0_update * alpha 112 | synapse_1 += synapse_1_update * alpha 113 | synapse_h += synapse_h_update * alpha 114 | 115 | synapse_0_update *= 0 116 | synapse_1_update *= 0 117 | synapse_h_update *= 0 118 | 119 | # print out progress 120 | if(j % 1000 == 0): 121 | print "Error:" + str(overallError) 122 | print "Pred:" + str(d) 123 | print "True:" + str(c) 124 | out = 0 125 | for index,x in enumerate(reversed(d)): 126 | out += x*pow(2,index) 127 | print str(a_int) + " + " + str(b_int) + " = " + str(out) 128 | print "------------" 129 | 130 | -------------------------------------------------------------------------------- /scripts/Algorithms/rnn_lstm.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | ''' 3 | Running LSTM Algorithm. 
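Usage: python rnn_lstm.py <data_dir>; as written, the script loads GOOGL.csv from that directory and trains on its 'high' column.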
4 | ''' 5 | from __future__ import print_function 6 | import numpy as np 7 | np.random.seed(1337) # for reproducibility 8 | 9 | from keras.preprocessing import sequence 10 | from keras.utils import np_utils 11 | from keras.models import Sequential 12 | from keras.layers.core import Dense, Dropout, Activation 13 | from keras.layers.embeddings import Embedding 14 | from keras.layers.recurrent import LSTM, SimpleRNN, GRU 15 | from keras.layers.core import * 16 | 17 | max_features = 5883 18 | maxlen = 80 19 | batch_size = 32 20 | 21 | in_out_neurons = 2 22 | hidden_neurons = 300 23 | 24 | import os 25 | import sys 26 | import pandas as pd 27 | 28 | 29 | def _load_data(data, n_prev=100): 30 | """ 31 | data should be pd.DataFrame() 32 | """ 33 | 34 | docX, docY = [], [] 35 | for i in range(len(data)-n_prev): 36 | docX.append(data.iloc[i:i+n_prev]) 37 | docY.append(data.iloc[i+n_prev]) 38 | 39 | all_X = np.array(docX) 40 | all_Y = np.array(docY) 41 | 42 | return all_X, all_Y 43 | 44 | 45 | def train_test_split(dataframe, test_size=0.2): 46 | """ 47 | This just splits data to training and testing parts 48 | """ 49 | ntrn = int(round(len(dataframe) * (1 - test_size))) 50 | 51 | X_train, y_train = _load_data(dataframe.iloc[0:ntrn]) 52 | X_test, y_test = _load_data(dataframe.iloc[ntrn:]) 53 | 54 | print(X_train, y_train) 55 | 56 | return (X_train, y_train), (X_test, y_test) 57 | 58 | 59 | def rnn_lstm(file_dataframe, test_size=0.2, col="high"): 60 | print('Loading data...') 61 | (X_train, y_train), (X_test, y_test) = train_test_split( 62 | file_dataframe[col], test_size=0.2) 63 | 64 | ''' 65 | 66 | X_train = np.array([[ 360, 7, 19, 256, 82, 7], \ 67 | [ 6, 102, 37, 5, 1324, 7]]) 68 | 69 | y_train = np.array([1, 0]) 70 | 71 | X_test = X_train 72 | 73 | y_test = y_train 74 | 75 | print(X_train.shape, y_train.shape) 76 | print(len(X_train), 'train sequences') 77 | print(len(X_test), 'test sequences') 78 | 79 | ''' 80 | 81 | print('Pad sequences (samples x time)') 82 | X_train = sequence.pad_sequences(X_train, maxlen=maxlen) 83 | X_test = sequence.pad_sequences(X_test, maxlen=maxlen) 84 | print('X_train shape:', X_train.shape) 85 | print('X_test shape:', X_test.shape) 86 | 87 | print('Build model...') 88 | ''' 89 | model = Sequential() 90 | model.add(Embedding(max_features, hidden_neurons, \ 91 | input_length=maxlen, dropout=0.2)) 92 | model.add(LSTM(hidden_neurons, dropout_W=0.2, dropout_U=0.2)) 93 | model.add(Dense(1)) 94 | model.add(Activation('sigmoid')) 95 | 96 | model.compile(loss='binary_crossentropy', 97 | optimizer='adam', 98 | metrics=['accuracy']) 99 | 100 | #model.compile(loss="mean_squared_error", \ 101 | # optimizer="rmsprop", metrics=['accuracy']) 102 | ''' 103 | 104 | input_dim = 32 105 | hidden = 32 106 | step = 10 107 | 108 | #The LSTM model - output_shape = (batch, step, hidden) 109 | model1 = Sequential() 110 | model1.add(LSTM(input_dim=input_dim, output_dim=hidden, input_length=step, return_sequences=True)) 111 | 112 | #The weight model - actual output shape = (batch, step) 113 | # after reshape : output_shape = (batch, step, hidden) 114 | model2 = Sequential() 115 | model2.add(Dense(input_dim=input_dim, output_dim=step)) 116 | model2.add(Activation('softmax')) # Learn a probability distribution over each step. 117 | #Reshape to match LSTM's output shape, so that we can do element-wise multiplication. 
118 | model2.add(RepeatVector(hidden)) 119 | model2.add(Permute((2, 1))) 120 | 121 | #The final model which gives the weighted sum: 122 | model = Sequential() 123 | model.add(Merge([model1, model2], 'mul', concat_axis=1)) # Multiply each element with corresponding weight a[i][j][k] * b[i][j] 124 | model.add(Merge([model1, model2], mode='sum', concat_axis=1)) # Sum the weighted elements. 125 | 126 | model.compile(loss='mse', optimizer='sgd') 127 | 128 | print('Train...') 129 | print(X_train.shape, X_test.shape) 130 | print(y_train.shape, y_test.shape) 131 | 132 | model.fit(X_train, y_train, batch_size=batch_size, \ 133 | validation_data=(X_test, y_test), nb_epoch=5) 134 | score, accuracy = model.evaluate(X_test, y_test, 135 | batch_size=batch_size) 136 | print('Test score:', score) 137 | print('Test accuracy:', accuracy) 138 | 139 | return (score, accuracy) 140 | 141 | 142 | def main(dir_path): 143 | ''' 144 | Run Pipeline of processes on file one by one. 145 | ''' 146 | files = os.listdir(dir_path) 147 | 148 | #for file_name in files: 149 | file_name="GOOGL.csv" 150 | print(file_name) 151 | 152 | file_dataframe = pd.read_csv(os.path.join(dir_path, file_name)) 153 | 154 | print(rnn_lstm(file_dataframe, 0.1, 'high')) 155 | 156 | #break 157 | 158 | if __name__ == '__main__': 159 | main(sys.argv[1]) 160 | -------------------------------------------------------------------------------- /scripts/Algorithms/regression_helpers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Miscellaneous Functions for Regression File. 4 | """ 5 | 6 | from __future__ import print_function 7 | import numpy as np 8 | import pandas as pd 9 | from sklearn import preprocessing 10 | from sklearn.metrics import mean_squared_error, r2_score 11 | from sklearn.ensemble import RandomForestRegressor 12 | import matplotlib.pyplot as plt 13 | from sklearn.ensemble import BaggingRegressor 14 | from sklearn.ensemble import AdaBoostRegressor 15 | from sklearn.ensemble import GradientBoostingRegressor 16 | from sklearn.neighbors import KNeighborsRegressor 17 | from sklearn.ensemble import RandomForestClassifier 18 | from sklearn import neighbors 19 | from sklearn.ensemble import AdaBoostClassifier 20 | from sklearn.ensemble import GradientBoostingClassifier 21 | #from sklearn.svm import SVR 22 | from sklearn.feature_selection import SelectKBest, chi2 23 | from sklearn.svm import SVC, SVR 24 | from sklearn.qda import QDA 25 | import os 26 | from sklearn.grid_search import GridSearchCV 27 | from Neural_Network import NeuralNet 28 | 29 | def load_dataset(path_directory, symbol): 30 | """ 31 | Import DataFrame from Dataset. 
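Returns a one-element list so that mergeDataframes() can concatenate additional index datasets (S&P 500, NASDAQ, ...) if the commented-out loaders below are re-enabled.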
32 | """ 33 | 34 | path = os.path.join(path_directory, symbol) 35 | 36 | out = pd.read_csv(path, index_col=2, parse_dates=[2]) 37 | out.drop(out.columns[0], axis=1, inplace=True) 38 | 39 | #name = path_directory + '/sp.csv' 40 | #sp = pd.read_csv(name, index_col=0, parse_dates=[1]) 41 | 42 | #name = path_directory + '/GOOGL.csv' 43 | #nasdaq = pd.read_csv(name, index_col=1, parse_dates=[1]) 44 | 45 | #name = path_directory + '/treasury.csv' 46 | #treasury = pd.read_csv(name, index_col=0, parse_dates=[1]) 47 | 48 | #return [sp, nasdaq, djia, treasury, hkong, frankfurt, paris, nikkei, london, australia] 49 | #return [out, nasdaq, djia, frankfurt, hkong, nikkei, australia] 50 | return [out] 51 | 52 | def count_missing(dataframe): 53 | """ 54 | count number of NaN in dataframe 55 | """ 56 | return (dataframe.shape[0] * dataframe.shape[1]) - dataframe.count().sum() 57 | 58 | 59 | def addFeatures(dataframe, adjclose, returns, n): 60 | """ 61 | operates on two columns of dataframe: 62 | - n >= 2 63 | - given Return_* computes the return of day i respect to day i-n. 64 | - given AdjClose_* computes its moving average on n days 65 | 66 | """ 67 | 68 | return_n = adjclose[9:] + "Time" + str(n) 69 | dataframe[return_n] = dataframe[adjclose].pct_change(n) 70 | 71 | roll_n = returns[7:] + "RolMean" + str(n) 72 | dataframe[roll_n] = pd.rolling_mean(dataframe[returns], n) 73 | 74 | exp_ma = returns[7:] + "ExponentMovingAvg" + str(n) 75 | dataframe[exp_ma] = pd.ewma(dataframe[returns], halflife=n) 76 | 77 | def mergeDataframes(datasets): 78 | """ 79 | Merge Datasets into Dataframe. 80 | """ 81 | return pd.concat(datasets) 82 | 83 | 84 | def applyTimeLag(dataset, lags, delta): 85 | """ 86 | apply time lag to return columns selected according to delta. 87 | Days to lag are contained in the lads list passed as argument. 88 | Returns a NaN free dataset obtained cutting the lagged dataset 89 | at head and tail 90 | """ 91 | maxLag = max(lags) 92 | 93 | columns = dataset.columns[::(2*max(delta)-1)] 94 | for column in columns: 95 | newcolumn = column + str(maxLag) 96 | dataset[newcolumn] = dataset[column].shift(maxLag) 97 | 98 | return dataset.iloc[maxLag:-1, :] 99 | 100 | # CLASSIFICATION 101 | def prepareDataForClassification(dataset, start_test): 102 | """ 103 | generates categorical to be predicted column, attach to dataframe 104 | and label the categories 105 | """ 106 | le = preprocessing.LabelEncoder() 107 | 108 | dataset['UpDown'] = dataset['Return_Out'] 109 | dataset.UpDown[dataset.UpDown >= 0] = 'Up' 110 | dataset.UpDown[dataset.UpDown < 0] = 'Down' 111 | dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown) 112 | 113 | features = dataset.columns[1:-1] 114 | X = dataset[features] 115 | y = dataset.UpDown 116 | 117 | X_train = X[X.index < start_test] 118 | y_train = y[y.index < start_test] 119 | 120 | X_test = X[X.index >= start_test] 121 | y_test = y[y.index >= start_test] 122 | 123 | return X_train, y_train, X_test, y_test 124 | 125 | def prepareDataForModelSelection(X_train, y_train, start_validation): 126 | """ 127 | gets train set and generates a validation set splitting the train. 128 | The validation set is mandatory for feature and model selection. 
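start_validation is compared against the date index, so it should be a date that falls inside the training period.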
129 | """ 130 | X = X_train[X_train.index < start_validation] 131 | y = y_train[y_train.index < start_validation] 132 | 133 | X_val = X_train[X_train.index >= start_validation] 134 | y_val = y_train[y_train.index >= start_validation] 135 | 136 | return X, y, X_val, y_val 137 | 138 | 139 | def performClassification(X_train, y_train, X_test, y_test, method, parameters={}): 140 | """ 141 | Perform Classification with the help of several Algorithms. 142 | """ 143 | 144 | print('Performing ' + method + ' Classification...') 145 | print('Size of train set: ', X_train.shape) 146 | print('Size of test set: ', X_test.shape) 147 | print('Size of train labels: ', y_train.shape) 148 | print('Size of test labels: ', y_test.shape) 149 | 150 | 151 | classifiers = [ 152 | RandomForestClassifier(n_estimators=100, n_jobs=-1), 153 | neighbors.KNeighborsClassifier(), 154 | SVC(C=10000), 155 | AdaBoostRegressor(), 156 | AdaBoostClassifier(**parameters), 157 | GradientBoostingClassifier(n_estimators=100), 158 | QDA(), 159 | ] 160 | 161 | scores = [] 162 | 163 | for classifier in classifiers: 164 | scores.append(benchmark_classifier(classifier, \ 165 | X_train, y_train, X_test, y_test)) 166 | 167 | print(scores) 168 | 169 | def benchmark_classifier(clf, X_train, y_train, X_test, y_test): 170 | clf.fit(X_train, y_train) 171 | accuracy = clf.score(X_test, y_test) 172 | #auc = roc_auc_score(y_test, clf.predict(X_test)) 173 | return accuracy 174 | 175 | # REGRESSION 176 | 177 | def getFeatures(X_train, y_train, X_test, num_features): 178 | ch2 = SelectKBest(chi2, k=num_features) 179 | X_train = ch2.fit_transform(X_train, y_train) 180 | X_test = ch2.transform(X_test) 181 | return X_train, X_test 182 | 183 | def performRegression(dataset, split, symbol, output_dir): 184 | """ 185 | Performing Regression on 186 | Various algorithms 187 | """ 188 | 189 | features = dataset.columns[1:] 190 | index = int(np.floor(dataset.shape[0]*split)) 191 | train, test = dataset[:index], dataset[index:] 192 | print('Size of train set: ', train.shape) 193 | print('Size of test set: ', test.shape) 194 | 195 | #train, test = getFeatures(train[features], \ 196 | # train[output], test[features], 16) 197 | 198 | out_params = (symbol, output_dir) 199 | 200 | output = dataset.columns[0] 201 | 202 | predicted_values = [] 203 | 204 | classifiers = [ 205 | RandomForestRegressor(n_estimators=10, n_jobs=-1), 206 | SVR(C=100000, kernel='rbf', epsilon=0.1, gamma=1, degree=2), 207 | BaggingRegressor(), 208 | AdaBoostRegressor(), 209 | KNeighborsRegressor(), 210 | GradientBoostingRegressor(), 211 | ] 212 | 213 | for classifier in classifiers: 214 | 215 | predicted_values.append(benchmark_model(classifier, \ 216 | train, test, features, output, out_params)) 217 | 218 | maxiter = 1000 219 | batch = 150 220 | 221 | classifier = NeuralNet(50, learn_rate=1e-2) 222 | 223 | predicted_values.append(benchmark_model(classifier, \ 224 | train, test, features, output, out_params, \ 225 | fine_tune=False, maxiter=maxiter, SGD=True, batch=batch, rho=0.9)) 226 | 227 | 228 | print('-'*80) 229 | 230 | mean_squared_errors = [] 231 | 232 | r2_scores = [] 233 | 234 | for pred in predicted_values: 235 | mean_squared_errors.append(mean_squared_error(test[output].as_matrix(), \ 236 | pred)) 237 | r2_scores.append(r2_score(test[output].as_matrix(), pred)) 238 | 239 | print(mean_squared_errors, r2_scores) 240 | 241 | return mean_squared_errors, r2_scores 242 | 243 | def benchmark_model(model, train, test, features, output, \ 244 | 
output_params, *args, **kwargs): 245 | ''' 246 | Performs Training and Testing of the Data on the Model. 247 | ''' 248 | 249 | print('-'*80) 250 | model_name = model.__str__().split('(')[0].replace('Regressor', ' Regressor') 251 | print(model_name) 252 | 253 | ''' 254 | if 'SVR' in model.__str__(): 255 | tuned_parameters = [{'kernel': ['rbf', 'polynomial'], 'gamma': [1e-3, 1e-4], 256 | 'C': [1, 10, 100, 1000]}, 257 | {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}] 258 | model = GridSearchCV(SVC(C=1), tuned_parameters, cv=5, 259 | scoring='%s_weighted' % 'recall') 260 | ''' 261 | 262 | symbol, output_dir = output_params 263 | 264 | model.fit(train[features].as_matrix(), train[output].as_matrix(), *args, **kwargs) 265 | predicted_value = model.predict(test[features].as_matrix()) 266 | 267 | plt.plot(test[output].as_matrix(), color='g', ls='-', label='Actual Value') 268 | plt.plot(predicted_value, color='b', ls='--', label='predicted_value Value') 269 | 270 | plt.xlabel('Number of Set') 271 | plt.ylabel('Output Value') 272 | 273 | plt.title(model_name) 274 | plt.legend(loc='best') 275 | plt.tight_layout() 276 | plt.savefig(os.path.join(output_dir, str(symbol) + '_' \ 277 | + model_name + '.png'), dpi=100) 278 | #plt.show() 279 | plt.clf() 280 | 281 | return predicted_value 282 | -------------------------------------------------------------------------------- /scripts/Algorithms/Neural_Network.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | import numpy as np 4 | 5 | ''' 6 | Neural Network Implementation 7 | ''' 8 | class NeuralNet(): 9 | 10 | def __init__(self, num_nodes, weights_=[], classification=True, auto_encoder=False, penalty=0., learn_rate=0.01): 11 | self.num_nodes = num_nodes 12 | self.weights_ = weights_ 13 | self.is_fit = False 14 | self.classification = classification 15 | self.auto_encoder = auto_encoder 16 | if auto_encoder: 17 | self.classification = False 18 | self.K = 0 19 | self.penalty = penalty 20 | self.learn_rate = learn_rate 21 | np.seterr(all='warn') 22 | 23 | def __str__(self): 24 | return "Neural Networks(" 25 | 26 | def initWeights(self, X, nclass): 27 | bias = np.ones((X.shape[0],1)) 28 | X = np.hstack((bias,X)) #add constant bias to each observation, X now N by P+1 29 | sizeX = X.shape[1] 30 | node_weights_ = np.random.uniform(-0.08,0.08,size=(self.num_nodes, sizeX)) #M+1 by P+1 31 | output_weights_ = np.random.uniform(-0.08,0.08,size=(nclass, self.num_nodes+1)) #K by M+1 32 | return X, [node_weights_, output_weights_] #nrows = n_nodes, ncols = sizeX 33 | 34 | def sigmoid(self, alpha_, X_): 35 | v_ = alpha_.dot(X_.T) 36 | v_[v_ < -300] = -300 37 | v_[v_ > 300] = 300 38 | return 1./(1+np.exp(-v_)) 39 | 40 | def relu(self, alpha_, X_): 41 | try: 42 | v_ = alpha_.dot(X_.T) 43 | except ValueError: 44 | v_ = X_.dot(alpha_) 45 | v_ = v_.T 46 | v_[v_ < -300] = -300 47 | v_[v_ > 300] = 300 48 | return np.maximum(v_,np.zeros(v_.shape)) 49 | 50 | def drelu(self, alpha_, X_): 51 | try: 52 | v_ = alpha_.dot(X_.T) 53 | except ValueError: 54 | v_ = X_.dot(alpha_) 55 | v_ = v_.T 56 | v_[v_ <= 0.] = 0 57 | v_[v_ > 0.] = 1. 
58 | return v_ 59 | 60 | def tanh(self, alpha_, X_): 61 | try: 62 | v_ = alpha_.dot(X_.T) 63 | except ValueError: 64 | v_ = X_.dot(alpha_) 65 | v_ = v_.T 66 | v_[v_ < -300] = -300 67 | v_[v_ > 300] = 300 68 | return np.tanh(v_) 69 | 70 | def dtanh(self, alpha_, X_): 71 | return 1 - np.multiply(self.tanh(alpha_, X_), self.tanh(alpha_, X_)) 72 | 73 | def softmax(self, T): 74 | T[T < -300] = -300 75 | T[T > 300] = 300 76 | return (np.exp(T)/np.sum(np.exp(T), axis=0)).T #(K by N) / elementwise(1 by N) 77 | 78 | def initNodes(self, X, Y): 79 | K = self.K 80 | if self.weights_ == []: 81 | X, weights = self.initWeights(X, K) 82 | else: 83 | weights = self.weights_ 84 | bias = np.ones((X.shape[0],1)) 85 | X = np.hstack((bias,X)) 86 | return X, weights 87 | 88 | def backPropagate(self, weights, next_weights, X, Y, rho, old_del_alpha, old_del_beta, _dropout, back_delta=0., fine_tune=False): 89 | """ 90 | feed forward then back propagate error, update weights 91 | """ 92 | learn_rate = self.learn_rate 93 | # beta = 6. 94 | # sparsity = 0.05 95 | if _dropout: 96 | drop = np.random.uniform(0, 1, weights[0].shape[0]) 97 | if self.auto_encoder and not fine_tune: 98 | sig = self.relu(weights[0], X) 99 | dsig = self.drelu(weights[0], X) 100 | if _dropout: 101 | sig[drop>=0.5,:] = 0. 102 | dsig[drop>=0.5,:] = 0. 103 | # avg_sparsity = np.mean(sig, axis=1) 104 | elif not self.auto_encoder and not fine_tune: 105 | sig = self.tanh(weights[0], X) 106 | dsig = self.dtanh(weights[0], X) 107 | if _dropout: 108 | sig[drop>=0.5,:] = 0. 109 | dsig[drop>=0.5,:] = 0. 110 | 111 | if not fine_tune: 112 | bias = np.ones((1,sig.shape[1])) 113 | sig = np.vstack((bias,sig)) 114 | hidden_out = weights[1].dot(sig) 115 | if self.classification: 116 | h = self.softmax(hidden_out) 117 | forward_error = h - Y 118 | else: 119 | h = hidden_out.T 120 | forward_error = h - Y[:,1:] #both N by K 121 | dRdBeta = sig.dot(forward_error)/forward_error.shape[0] #(M+1 by N)*(N by K) = M+1 by K gradient-force for each neuron 122 | 123 | if fine_tune: 124 | prop_back = np.multiply(back_delta.dot(next_weights[:,1:]),dsig.T) 125 | dRdAlpha = prop_back.T.dot(X)/X.shape[0] 126 | elif not fine_tune and not self.auto_encoder: 127 | back_error = np.multiply((forward_error.dot(weights[1][:,1:])),(dsig.T)) #((N by K)*(K by M+1))*ewise(N by M+1) = N by M+1 128 | prop_back = back_error 129 | elif not fine_tune and self.auto_encoder: 130 | back_error = np.multiply((forward_error.dot(weights[1][:,1:])),(dsig.T)) 131 | prop_back = 0. 132 | # back_error = np.multiply((forward_error.dot(weights[1][:,1:])),(dsig.T)) + beta*(-sparsity/avg_sparsity+(1-sparsity)/(1-avg_sparsity)) 133 | if not fine_tune: 134 | dRdAlpha = (back_error.T).dot(X)/X.shape[0] 135 | del_beta = rho*old_del_beta - learn_rate*dRdBeta.T 136 | else: 137 | del_beta = 0. 
138 | del_alpha = rho*old_del_alpha - learn_rate*dRdAlpha 139 | 140 | """Bias weights do not get penalized""" 141 | if not fine_tune: 142 | bias1 = np.zeros((weights[1].shape[0], 1)) 143 | weights[1] = weights[1] + del_beta + np.hstack((bias1,self.penalty*weights[1][:,1:])) #M+1 by K 144 | bias0 = np.zeros((weights[0].shape[0], 1)) 145 | weights[0] = weights[0] + del_alpha + np.hstack((bias0,self.penalty*weights[0][:,1:])) #M+1 by P+1 146 | 147 | return weights, del_alpha, del_beta, prop_back 148 | 149 | def feedForward(self,X,layers): 150 | activations = [] 151 | for i,layer in enumerate(layers): 152 | if i == 0: 153 | if layer.auto_encoder: 154 | if layer._dropout: 155 | sig = layer.relu(layer.weights_[0]/2.,X) 156 | else: 157 | sig = layer.relu(layer.weights_[0], X) 158 | # sig = layer.tanh(layer.weights_[0], X) 159 | else: 160 | if layer._dropout: 161 | sig = layer.tanh(layer.weights_[0]/2.,X) 162 | else: 163 | sig = layer.tanh(layer.weights_[0], X) 164 | else: 165 | if layer.auto_encoder: 166 | bias = np.ones((1,sig.shape[1])) 167 | sig = np.vstack((bias,sig)) 168 | if layer._dropout: 169 | sig = layer.relu(layer.weights_[0]/2., sig.T) 170 | else: 171 | sig = layer.relu(layer.weights_[0], sig.T) 172 | else: 173 | if layer._dropout: 174 | sig = layer.tanh(layer.weights_[0]/2., sig.T) 175 | else: 176 | sig = layer.tanh(layer.weights_[0], sig.T) 177 | activations.append(sig.T) 178 | return activations 179 | 180 | def fit(self, X, Y, rho=0., maxiter=300, tol=0.000001, anneal=False, t_0=50, dropout=False, batch=40, SGD=True, layers=[], fine_tune=False): 181 | self._dropout = dropout 182 | grad_alpha, grad_beta = 0., 0. 183 | layer_alphas = [0. for i in range(len(layers))] 184 | layer_betas = layer_alphas 185 | self.is_fit = True 186 | if self.classification: 187 | #one-hot encode Y 188 | try: 189 | #if already one-hot encoded, pass Y as Y_new 190 | if Y.shape[1] > 1: 191 | Y_new = Y 192 | self.K = Y.shape[1] 193 | #else one-hot encode Y as Y_new 194 | else: 195 | self.K = len(set(Y.flatten())) 196 | Y_new = np.zeros((len(Y),self.K)) 197 | for i,v in enumerate(Y): 198 | Y_new[i,v] = 1. 199 | #if Y.shape[1] null (1D array), one-hot encode it as Y_new 200 | except IndexError: 201 | self.K = len(set(Y.flatten())) #ditto 202 | Y_new = np.zeros((len(Y),self.K)) 203 | for i,v in enumerate(Y): 204 | Y_new[i,v] = 1. 
205 | else: 206 | Y_new = Y 207 | if not self.auto_encoder: 208 | self.K = 1 209 | else: 210 | self.K = Y.shape[1] 211 | if layers == []: 212 | X, w = self.initNodes(X, Y_new) 213 | else: 214 | bias = np.ones((X.shape[0],1)) 215 | X = np.hstack((bias,X)) #add constant bias to each observation, X now N by P+1 216 | X_ = self.feedForward(X,layers) 217 | X_[-1], w = self.initNodes(X_[-1], Y_new) 218 | 219 | for i in range(maxiter): 220 | if anneal and i != 0 and i % t_0 == 0: 221 | self.learn_rate /= (float(i)/t_0) 222 | if not SGD: 223 | if fine_tune and layers != []: 224 | X_hidden = self.feedForward(X,layers) 225 | bias = np.ones((X_hidden[-1].shape[0],1)) 226 | X_hidden[-1] = np.hstack((bias,X_hidden[-1])) 227 | 228 | w, grad_alpha, grad_beta, back_error = self.backPropagate(w, 0., X_hidden[-1], Y_new, rho, grad_alpha, grad_beta, dropout, back_delta=0., fine_tune=False) 229 | next_weights = w[0] 230 | for i,layer in enumerate(layers[::-1]): 231 | if len(layers)-i-2 >= 0: 232 | activations = X_hidden[len(layers)-i-2] 233 | bias = np.ones((activations.shape[0], 1)) 234 | activations = np.hstack((bias,activations)) 235 | else: 236 | activations = X 237 | layer.weights_, layer_alphas[i], layer_betas[i], back_error = layer.backPropagate(layer.weights_, next_weights, activations, Y_new, rho, layer_alphas[i], layer_betas[i], dropout, back_delta=back_error, fine_tune=True) 238 | next_weights = layer.weights_[0] 239 | elif not fine_tune: 240 | if self.auto_encoder: 241 | choose = np.random.binomial(1, 0.9, size=X.shape) 242 | X_noisy = np.multiply(choose, X) 243 | else: 244 | X_noisy = X 245 | w, grad_alpha, grad_beta, back_error = self.backPropagate(w, 0., X_noisy, Y_new, rho, grad_alpha, grad_beta, dropout, back_delta=0., fine_tune=fine_tune) 246 | 247 | else: 248 | samples = np.random.choice(range(len(X)),size=batch,replace=False) 249 | if fine_tune and layers != []: 250 | try: 251 | X_hidden = self.feedForward(X[samples,:],layers) 252 | except TypeError: 253 | X_samples = [X[z] for z in samples] 254 | X_hidden = self.feedForward(X_samples,layers) 255 | bias = np.ones((X_hidden[-1].shape[0],1)) 256 | X_hidden[-1] = np.hstack((bias,X_hidden[-1])) 257 | 258 | w, grad_alpha, grad_beta, back_error = self.backPropagate(w, 0., X_hidden[-1], Y_new[samples,:], rho, grad_alpha, grad_beta, dropout, back_delta=0., fine_tune=False) 259 | next_weights = w[0] 260 | for i,layer in enumerate(layers[::-1]): 261 | if len(layers)-i-2 >= 0: 262 | activations = X_hidden[len(layers)-i-2] 263 | bias = np.ones((activations.shape[0],1)) 264 | activations = np.hstack((bias,activations)) 265 | else: 266 | try: 267 | activations = X[samples,:] 268 | except TypeError: 269 | activations = [X[z] for z in samples] 270 | layer.weights_, layer_alphas[i], layer_betas[i], back_error = layer.backPropagate(layer.weights_, next_weights, activations, Y_new[samples,:], rho, layer_alphas[i], layer_betas[i], dropout, back_delta=back_error, fine_tune=True) 271 | next_weights = layer.weights_[0] 272 | elif not fine_tune: 273 | if self.auto_encoder: 274 | choose = np.random.binomial(1,0.9,size=X[samples,:].shape) 275 | X_noisy = np.multiply(choose, X[samples,:]) 276 | Y_test = X[samples,:] 277 | else: 278 | X_noisy = X[samples,:] 279 | if self.classification: 280 | Y_test = Y_new[samples,:] 281 | else: 282 | Y_test = Y_new[samples] 283 | w, grad_alpha, grad_beta, back_error = self.backPropagate(w, 0., X_noisy, Y_test, rho, grad_alpha, grad_beta, dropout, back_delta=0., fine_tune=fine_tune) 284 | 285 | self.weights_ = w 286 | return layers 
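# Note (inferred from fit()/feedForward() above): the `layers` argument is a list of
# already-fitted NeuralNet auto-encoders whose hidden activations feed this network;
# calling fit() with fine_tune=True also back-propagates the error through those
# layers' weights, i.e. the usual stacked-auto-encoder fine-tuning pass.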
287 | 288 | def predict(self, X, proba=True, layers=[]): 289 | if self.is_fit: 290 | self.predictions = [] 291 | if layers==[]: 292 | bias = np.ones((X.shape[0],1)) 293 | X = np.hstack((bias,X)) #add constant bias to each observation, X now N by P+1 294 | if layers == []: 295 | if self.auto_encoder: 296 | if self._dropout: 297 | activation = self.relu(self.weights_[0]/2, X) 298 | else: 299 | activation = self.relu(self.weights_[0], X) 300 | else: 301 | if self._dropout: 302 | activation = self.tanh(self.weights_[0]/2., X) 303 | else: 304 | activation = self.tanh(self.weights_[0], X) 305 | bias = np.ones((1,activation.shape[1])) 306 | activation = np.vstack((bias,activation)) 307 | response = self.weights_[1].dot(activation) 308 | else: 309 | activation = self.feedForward(X,layers) 310 | activation = activation[-1] 311 | # print activation.shape 312 | # if layers != []: 313 | bias = np.ones((activation.shape[0],1)) 314 | activation = np.hstack((bias,activation)) 315 | if self._dropout: 316 | response = self.tanh(self.weights_[0]/2., activation) 317 | else: 318 | response = self.tanh(self.weights_[0], activation) 319 | bias = np.ones((1,response.shape[1])) 320 | response = np.vstack((bias,response)) 321 | response = self.weights_[1].dot(response) 322 | if self.classification: 323 | predictions = self.softmax(response) 324 | if not proba: 325 | predictions = np.argmax(predictions, axis=1) 326 | else: 327 | predictions = response 328 | self.predictions = predictions 329 | return self.predictions 330 | else: 331 | return "Cannot predict without fitting data first!!" 332 | 333 | def hidden_activations(self, X): 334 | if self.is_fit: 335 | bias = np.ones((X.shape[0],1)) 336 | X = np.hstack((bias,X)) 337 | if self.auto_encoder: 338 | if self._dropout: 339 | activations = self.relu(self.weights_[0]/2., X) 340 | else: 341 | activations = self.relu(self.weights_[0], X) 342 | # activations = self.tanh(self.weights_[0], X) 343 | else: 344 | if self._dropout: 345 | activations = self.tanh(self.weights_[0]/2., X) 346 | else: 347 | activations = self.tanh(self.weights_[0], X) 348 | return activations.T 349 | else: 350 | return "Method 'hidden_activations' can only be called for auto encoders" 351 | 352 | def score(self, X_test, Y_test, layers=[]): 353 | predictions = self.predict(X_test, proba=False, layers=layers) 354 | if self.classification: 355 | try: 356 | if Y_test.shape[1] > 1: 357 | num_correct = predictions == np.argmax(Y_test, axis=1) 358 | return float(len(Y_test[num_correct]))/len(Y_test) 359 | else: 360 | num_correct = predictions == np.array(Y_test).flatten() 361 | return float(len(Y_test.flatten()[num_correct]))/len(Y_test) 362 | except IndexError: 363 | num_correct = predictions == np.array(Y_test).flatten() 364 | return float(len(Y_test.flatten()[num_correct]))/len(Y_test) 365 | else: 366 | n = len(Y_test) 367 | diff = predictions.T - Y_test 368 | MSE = 1. - sum(np.multiply(diff,diff))/n 369 | return MSE --------------------------------------------------------------------------------
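For reference, a minimal sketch of how the NeuralNet class above can be driven directly on a toy classification problem. The data, node count and hyper-parameters are illustrative only (regression_helpers.py drives the class through benchmark_model instead), and the snippet assumes it is run from scripts/Algorithms/ so the module is importable.

```python
import numpy as np
from Neural_Network import NeuralNet

np.random.seed(0)
X = np.random.randn(200, 5)                      # 200 samples, 5 features (toy data)
y = (X[:, 0] + X[:, 1] > 0).astype(int)          # two classes, integer labels 0/1

net = NeuralNet(num_nodes=20, classification=True, learn_rate=0.05)
net.fit(X, y, maxiter=200, SGD=True, batch=40)   # mini-batch updates, 200 iterations
labels = net.predict(X, proba=False)             # hard class predictions via argmax
print(labels[:10])
print('training accuracy: %.3f' % net.score(X, y))
```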