├── input
│   ├── symbols.txt
│   └── params.txt
├── Report.pdf
├── screenshots
│   └── presentation.gif
├── Documents
│   ├── SMAIProjectAbstract.pdf
│   └── StockPricePrediction.pdf
├── requirements.txt
├── scripts
│   ├── normalization.py
│   ├── interpolation.py
│   ├── main.py
│   ├── add_s_and_p_index.py
│   ├── feature_selection.py
│   ├── preprocessing.py
│   ├── Algorithms
│   │   ├── svm.py
│   │   ├── regression_models.py
│   │   ├── LSTN-RNN.py
│   │   ├── rnn_lstm.py
│   │   ├── regression_helpers.py
│   │   └── Neural_Network.py
│   └── fetch_stock_data.py
├── .gitignore
├── LICENSE
└── README.md
/input/symbols.txt: -------------------------------------------------------------------------------- 1 | FB 2 | GOOG 3 | -------------------------------------------------------------------------------- /Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scorpionhiccup/StockPricePrediction/HEAD/Report.pdf -------------------------------------------------------------------------------- /screenshots/presentation.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scorpionhiccup/StockPricePrediction/HEAD/screenshots/presentation.gif -------------------------------------------------------------------------------- /Documents/SMAIProjectAbstract.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scorpionhiccup/StockPricePrediction/HEAD/Documents/SMAIProjectAbstract.pdf -------------------------------------------------------------------------------- /Documents/StockPricePrediction.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scorpionhiccup/StockPricePrediction/HEAD/Documents/StockPricePrediction.pdf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | cycler==0.10.0 2 | matplotlib==1.5.1 3 | numpy==1.11.0 4 | pandas==0.18.0 5 | pyparsing==2.1.1 6 | python-dateutil==2.5.3 7 | pytz==2016.4 8 | scikit-learn==0.17.1 9 | scipy==0.17.0 10 | six==1.10.0 11 | -------------------------------------------------------------------------------- /scripts/normalization.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | ''' 3 | Data Normalization 4 | ''' 5 | 6 | from sklearn import preprocessing 7 | 8 | def normalize(file_dataframe, cols): 9 | ''' 10 | Scale each column in cols to unit L2 norm. 11 | ''' 12 | 13 | for col in cols: 14 | file_dataframe[col] = preprocessing.normalize( \ 15 | file_dataframe[col].values.reshape(1, -1), norm='l2')[0] 16 | 17 | return file_dataframe -------------------------------------------------------------------------------- /scripts/interpolation.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | ''' 3 | Data Interpolation 4 | ''' 5 | 6 | import os, sys 7 | import pandas as pd 8 | 9 | def interpolate(dataframe, cols_to_interpolate): 10 | 11 | for col in cols_to_interpolate: 12 | dataframe[col] = dataframe[col].interpolate('spline', order=2) 13 | 14 | return dataframe 15 | 16 | 17 | def main(dir_path): 18 | files = os.listdir(dir_path) 19 | for file_name in files: 20 | dataframe = pd.read_csv(os.path.join(dir_path, file_name)) 21 | dataframe = interpolate(dataframe, \ 22 | ['high', 'open', 'low', 'close', 'volume', 'adj_close']) 23 | print dataframe 24 | 25 | break 26 | 27 | 28 | if __name__=="__main__": 29 | main(sys.argv[1]) 30 | -------------------------------------------------------------------------------- /scripts/main.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | ''' 3 | Main File. 4 | ''' 5 | import os 6 | import sys 7 | import pandas as pd 8 | 9 | from interpolation import interpolate 10 | from normalization import normalize 11 | 12 | 13 | def main(dir_path, output_dir): 14 | ''' 15 | Run Pipeline of processes on file one by one. 16 | ''' 17 | files = os.listdir(dir_path) 18 | 19 | for file_name in files: 20 | 21 | file_dataframe = pd.read_csv(os.path.join(dir_path, file_name)) 22 | 23 | cols = ['high', 'open', 'low', 'close', 'volume', 'adj_close'] 24 | 25 | file_dataframe = interpolate(file_dataframe, cols) 26 | 27 | file_dataframe = normalize(file_dataframe, cols) 28 | 29 | file_dataframe.to_csv( 30 | os.path.join(output_dir, file_name), encoding='utf-8') 31 | 32 | if __name__ == '__main__': 33 | main(sys.argv[1], sys.argv[2]) 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | input/non_params.txt 2 | scripts/.scrapy/* 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | env/ 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *,cover 49 | .hypothesis/ 50 | 51 | # Translations 52 | *.mo 53 | *.pot 54 | 55 | # Django stuff: 56 | *.log 57 | 58 | # Sphinx documentation 59 | docs/_build/ 60 | 61 | # PyBuilder 62 | target/ 63 | 64 | #Ipython Notebook 65 | .ipynb_checkpoints 66 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Sharvil Katariya 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /scripts/add_s_and_p_index.py: -------------------------------------------------------------------------------- 1 | import os, sys, csv 2 | import math 3 | import pprint 4 | 5 | def roundup(var): 6 | return float(format(var, '.6f')) 7 | 8 | def main(dir_path, sp_index_file, output_dir): 9 | files = os.listdir(dir_path) 10 | 11 | for file_name in files: 12 | with open( os.path.join(dir_path, file_name), 'r') as textfile: 13 | 14 | new_file = open(os.path.join(output_dir, file_name), 'w+') 15 | 16 | new_list = [] 17 | new_list.append(['symbol','date','open','high','low','close','volume','adj_close', 'prev_day_diff', '50_day_moving_avg', '10_day_volatility', 18 | 's&p_index_open', 's&p_index_high', 's&p_index_low', 's&p_index_close', 's&p_index_volume', 's&p_index_adj_close']) 19 | 20 | dict_mapping = {} 21 | 22 | for count, row in enumerate(reversed(list(csv.reader(textfile)))): 23 | if str(row[0])=="symbol": 24 | break 25 | 26 | date = str(row[1]) 27 | dict_mapping[date] = row 28 | 29 | """ 30 | Extend to Existing Key-Value in dict_mapping dictionary. 
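Each stock row is keyed by its date above; the S&P 500 row with the same date is appended below, which fills the s&p_index_* columns declared in the header row.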
31 | """ 32 | 33 | with open(sp_index_file, 'r') as sp_index_fp: 34 | for count2, row2 in enumerate(reversed(list(csv.reader(sp_index_fp)))): 35 | if str(row2[0]) in dict_mapping: 36 | dict_mapping[str(row2[0])].extend(row2[1:]) 37 | 38 | #pprint.pprint(dict_mapping, width=1) 39 | 40 | for key in sorted(dict_mapping): 41 | new_list.append(dict_mapping[key]) 42 | 43 | writer = csv.writer(new_file) 44 | writer.writerows(new_list) 45 | new_file.close() 46 | textfile.close() 47 | 48 | if __name__ == '__main__': 49 | main(str(sys.argv[1]), str(sys.argv[2]), str(sys.argv[3])) -------------------------------------------------------------------------------- /input/params.txt: -------------------------------------------------------------------------------- 1 | enterprise_value 2 | pe_ratio 3 | pe_10 4 | peg_ratio 5 | earning_yield 6 | ps_ratio 7 | price_to_book_value 8 | ev_revenues 9 | ev_ebit 10 | operating_earning_yield 11 | shares_outstanding 12 | dividend 13 | dividend_yield 14 | cash_dividend_payout_ratio 15 | payout_ratio 16 | gross_profit_margin 17 | profit_margin 18 | ebitda_margin_ttm 19 | operating_margin_ttm 20 | asset_utilization 21 | days_sales_outstanding 22 | days_payables_outstanding 23 | receivables_turnover 24 | return_on_assets 25 | return_on_equity 26 | return_on_invested_capital 27 | altman_z_score 28 | current_ratio 29 | debt_equity_ratio 30 | free_cash_flow 31 | kz_index 32 | tangible_common_equity_ratio 33 | times_interest_earned 34 | total_employee_number 35 | revenue_per_employee_annual 36 | ni_per_employee_annual 37 | market_beta_60_month 38 | one_month_return 39 | three_month_return 40 | six_month_return 41 | ytd_return 42 | one_year_return 43 | three_year_return 44 | year_high 45 | year_low 46 | revenues_ttm 47 | revenues_per_share 48 | revenues_growth 49 | eps_ttm 50 | eps_growth 51 | net_income_ttm 52 | cash_financing_ttm 53 | cash_investing_ttm 54 | cash_operations_ttm 55 | capex 56 | cash_on_hand 57 | long_term_debt 58 | assets 59 | liabilities 60 | shareholders_equity 61 | book_value_of_equity_per_share 62 | book_value_of_tangible_equity_per_share 63 | accruals 64 | eps_est_0q 65 | eps_est_0y 66 | forward_pe_ratio 67 | forward_pe_ratio_1y 68 | forward_ps_ratio 69 | forward_ps_ratio_1y 70 | net_income_cs_rev 71 | net_income_annual_cs_rev 72 | max_drawdown_all 73 | historical_daily_var_1_all 74 | historical_daily_var_5_all 75 | historical_monthly_var_5_all 76 | historical_monthly_var_1_all 77 | ca_score 78 | f_score_ttm 79 | fulmer_h_score 80 | graham_number 81 | ncavps 82 | ohlson_score 83 | quality_ratio 84 | springate_score 85 | sustainable_growth_rate 86 | tobin_q 87 | market_cap_fractile 88 | quality_ratio_fractile -------------------------------------------------------------------------------- /scripts/feature_selection.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/python 2 | import sys 3 | import os 4 | import csv 5 | import pandas 6 | 7 | from sklearn.feature_selection import RFECV 8 | from sklearn.svm import SVR 9 | from sklearn.linear_model import LinearRegression 10 | from sklearn.cross_validation import StratifiedKFold 11 | import numpy as np 12 | 13 | def conv(s): 14 | try: 15 | s=float(s) 16 | except ValueError: 17 | pass 18 | return s 19 | 20 | def main(dir_path): 21 | 22 | files = os.listdir(dir_path) 23 | 24 | X = [] 25 | y = [] 26 | 27 | ranking = [] 28 | 29 | for file_name in files: 30 | with open( os.path.join(dir_path, file_name), 'r') as textfile: 31 | data = pandas.read_csv(os.path.join(dir_path, file_name), header=0) 32 | #reader = csv.reader(textfile) 33 | #next(reader, None) 34 | 35 | start_test = datetime.datetime(2005, 1, 1) 36 | 37 | col = list(data.adj_close) 38 | 39 | X = [ col[:2] ] 40 | y = [ col[-1], col[-2] ] 41 | 42 | print X 43 | print y 44 | 45 | ''' 46 | 47 | for row in reader: 48 | 49 | if any(row[key] in (None, "") for key in range(len(row))): 50 | continue 51 | 52 | temp = row[2:7] + row[9:] 53 | 54 | for i in range(len(temp)): 55 | try: 56 | temp[i] = float(temp[i]) 57 | except Exception, e: 58 | print temp[i] 59 | print temp 60 | print file_name 61 | print row 62 | raise e 63 | 64 | X.append(temp) 65 | y.append(float(row[8])) 66 | ''' 67 | 68 | X=np.array(X, np.float64) 69 | y=np.array(y, np.float64) 70 | 71 | estimator = LinearRegression() 72 | selector = RFECV(estimator, step=1, cv=StratifiedKFold(y, 2)) 73 | 74 | selector = selector.fit(X, y) 75 | 76 | ''' 77 | except Exception, e: 78 | print X 79 | print y 80 | raise e 81 | ''' 82 | 83 | X = [] 84 | y = [] 85 | 86 | if len(ranking)!=0: 87 | ranking = [sum(x) for x in zip(ranking, selector.ranking_)] 88 | else: 89 | ranking = selector.ranking_ 90 | 91 | print ranking 92 | 93 | print ranking 94 | 95 | if __name__ == '__main__': 96 | main(sys.argv[1]) 97 | -------------------------------------------------------------------------------- /scripts/preprocessing.py: -------------------------------------------------------------------------------- 1 | import os, sys, csv 2 | import math 3 | 4 | def roundup(var): 5 | return float(format(var, '.6f')) 6 | 7 | def main(dir_path, output_dir): 8 | files = os.listdir(dir_path) 9 | for file_name in files: 10 | with open( os.path.join(dir_path, file_name), 'r') as textfile: 11 | new_file = open(os.path.join(output_dir, file_name), 'w+') 12 | new_list = [] 13 | 14 | prev = 0.0 15 | diff = 0.0 16 | avg = 0.0 17 | num_moving_avg = 50 18 | volatile_avg = 0.0 19 | num_volatile = 10 20 | curr_volatility = 0.0 21 | 22 | for count, row in enumerate(reversed(list(csv.reader(textfile)))): 23 | if not count: 24 | try: 25 | row[8]=prev 26 | except Exception, e: 27 | row.append(prev) 28 | else: 29 | diff = roundup(float(row[7]) - float(prev)) 30 | try: 31 | row[8]=diff 32 | except Exception, e: 33 | row.append(diff) 34 | 35 | if count 11 | ``` 12 | 13 | Download the Dataset needed for running the code from [here](https://drive.google.com/open?id=0B2lCmt16L_r3SUtrTjBlRHk3d1E). 14 | 15 | ## Project Concept Video 16 | [![Project Concept Video](screenshots/presentation.gif)](https://www.youtube.com/watch?v=z6U0OKGrhy0) 17 | 18 | ### Methodology 19 | 1. Preprocessing and Cleaning 20 | 2. Feature Extraction 21 | 3. Twitter Sentiment Analysis and Score 22 | 4. Data Normalization 23 | 5. Analysis of various supervised learning methods 24 | 6. 
Conclusions 25 | 26 | ### Research Paper 27 | - [Machine Learning in Stock Price Trend Forecasting. Yuqing Dai, Yuning Zhang](http://cs229.stanford.edu/proj2013/DaiZhang-MachineLearningInStockPriceTrendForecasting.pdf) 28 | - [Stock Market Forecasting Using Machine Learning Algorithms. Shunrong Shen, Haomiao Jiang. Department of Electrical Engineering. Stanford University](http://cs229.stanford.edu/proj2012/ShenJiangZhang-StockMarketForecastingusingMachineLearningAlgorithms.pdf) 29 | - [How can machine learning help stock investment?, Xin Guo](http://cs229.stanford.edu/proj2015/009_report.pdf) 30 | 31 | 32 | ### Datasets used 33 | 1. http://www.nasdaq.com/ 34 | 2. https://in.finance.yahoo.com 35 | 3. https://www.google.com/finance 36 | 37 | 38 | ### Useful Links 39 | - **Slides**: http://www.slideshare.net/SharvilKatariya/stock-price-trend-forecasting-using-supervised-learning 40 | - **Video**: https://www.youtube.com/watch?v=z6U0OKGrhy0 41 | - **Report**: https://github.com/scorpionhiccup/StockPricePrediction/blob/master/Report.pdf 42 | 43 | ### References 44 | - [Recurrent Neural Networks - LSTM Models](http://colah.github.io/posts/2015-08-Understanding-LSTMs/) 45 | - [ARIMA Models](http://people.duke.edu/~rnau/411arim.htm) 46 | - https://github.com/dv-lebedev/google-quote-downloader 47 | - [Book Value](http://www.investopedia.com/terms/b/bookvalue.asp) 48 | - http://www.investopedia.com/articles/basics/09/simplified-measuring-interpreting-volatility.asp 49 | - [Volatility](http://www.stock-options-made-easy.com/volatility-index.html) 50 | - https://github.com/dzitkowskik/StockPredictionRNN 51 | - [Scikit-Learn](http://scikit-learn.org/stable/) 52 | - [Theano](http://deeplearning.net/software/theano/) 53 | -------------------------------------------------------------------------------- /scripts/Algorithms/svm.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | ''' 3 | Running Support Vector Regression Model. 
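Usage: python svm.py <dir_path>, where dir_path holds per-symbol CSV files with at least 'date' and 'open' columns.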
4 | ''' 5 | from __future__ import print_function 6 | 7 | import os 8 | import sys 9 | import pandas as pd 10 | from sklearn.svm import SVR 11 | from sklearn import cross_validation 12 | import numpy as np 13 | import matplotlib.pyplot as plt 14 | from datetime import datetime 15 | from sklearn.cross_validation import train_test_split 16 | 17 | def convert_to_integer(dt_time): 18 | return 10000*dt_time.year + 1000*dt_time.month + dt_time.day 19 | 20 | 21 | def preprocess(file_dataframe, cols=['date', 'open']): 22 | 23 | if 'date' in cols: 24 | file_dataframe['date'] = file_dataframe['date'].apply(convert_to_integer) 25 | 26 | X = file_dataframe['open'] 27 | y = file_dataframe['date'] 28 | 29 | return X, y 30 | 31 | 32 | def svm(file_dataframe, test_size=0.2, cols=['date', 'open']): 33 | ''' 34 | Run Support Vector Regression 35 | ''' 36 | 37 | print('Loading data...') 38 | 39 | if 'date' in file_dataframe: 40 | file_dataframe['new_col'] = pd.to_datetime(file_dataframe['date']).astype(datetime) 41 | #file_dataframe['date'] = pd.to_datetime(file_dataframe['date']) 42 | file_dataframe['new_col'] = file_dataframe['new_col'].apply(lambda dt_time:10000*dt_time.year + 1000*dt_time.month + dt_time.day).astype(int) 43 | 44 | print(file_dataframe['new_col']) 45 | 46 | X = file_dataframe['open'] 47 | y = file_dataframe['new_col'] 48 | 49 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size) 50 | 51 | #svr_rbf = SVR(kernel='rbf', C=1e3, gamma=0.1) 52 | svr_lin = SVR(kernel='linear', C=1e3) 53 | #svr_poly = SVR(kernel='poly', C=1e3, degree=2) 54 | 55 | #parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]} 56 | 57 | loo = cross_validation.LeaveOneOut(len(y_test)) 58 | #clf = grid_search.GridSearchCV(svr_rbf, parameters) 59 | scores = [] 60 | 61 | #svr_rbf.fit(X_train, y_train) 62 | svr_lin.fit(X_train, y_train) 63 | #svr_poly.fit(X_train, y_train) 64 | 65 | #scores.append(cross_validation.cross_val_score(svr_rbf, \ 66 | # X_test, y_test, scoring='mean_squared_error', cv=loo).mean()) 67 | scores.append(cross_validation.cross_val_score(svr_lin, \ 68 | X_test, y_test, scoring='mean_squared_error', cv=loo).mean()) 69 | #scores.append(cross_validation.cross_val_score(svr_poly, \ 70 | # X_test, y_test, scoring='mean_squared_error', cv=loo).mean()) 71 | 72 | return scores 73 | 74 | def main(dir_path): 75 | ''' 76 | Run Pipeline of processes on file one by one. 77 | ''' 78 | files = os.listdir(dir_path) 79 | 80 | for file_name in files: 81 | print(file_name) 82 | 83 | file_dataframe = pd.read_csv(os.path.join(dir_path, file_name), parse_dates=[1]) 84 | 85 | print(svm(file_dataframe, 0.2, 'high')) 86 | 87 | break 88 | 89 | if __name__ == '__main__': 90 | main(sys.argv[1]) 91 | -------------------------------------------------------------------------------- /scripts/Algorithms/regression_models.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Run Stock-Regression Algorithms 4 | """ 5 | from __future__ import print_function 6 | from regression_helpers import load_dataset, addFeatures, \ 7 | mergeDataframes, count_missing, applyTimeLag, performRegression 8 | import sys 9 | import os 10 | import pickle 11 | import traceback 12 | 13 | def main(dir_path, output_dir): 14 | ''' 15 | Run Pipeline of processes on file one by one. 
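Usage: python regression_models.py <input_dir> <output_dir>; every CSV in input_dir is processed, and scores.pickle plus the per-model plots are written to output_dir.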
16 | ''' 17 | 18 | scores = {} 19 | 20 | files = os.listdir(dir_path) 21 | 22 | maxdelta = 30 23 | 24 | delta = range(8, maxdelta) 25 | print('Delta days accounted: ', max(delta)) 26 | 27 | for file_name in files: 28 | try: 29 | symbol = file_name.split('.')[0] 30 | print(symbol) 31 | 32 | datasets = load_dataset(dir_path, file_name) 33 | 34 | for dataset in datasets: 35 | columns = dataset.columns 36 | adjclose = columns[-2] 37 | returns = columns[-1] 38 | for dele in delta: 39 | addFeatures(dataset, adjclose, returns, dele) 40 | dataset = dataset.iloc[max(delta):,:] # computation of returns and moving means introduces NaN which are nor removed 41 | 42 | finance = mergeDataframes(datasets) 43 | 44 | high_value = 365 45 | high_value = min(high_value, finance.shape[0] - 1) 46 | 47 | lags = range(high_value, 30) 48 | print('Maximum time lag applied', high_value) 49 | 50 | if 'symbol' in finance.columns: 51 | finance.drop('symbol', axis=1, inplace=True) 52 | 53 | print('Size of data frame: ', finance.shape) 54 | print('Number of NaN after merging: ', count_missing(finance)) 55 | 56 | finance = finance.interpolate(method='time') 57 | print('Number of NaN after time interpolation: ', finance.shape[0]*finance.shape[1] - finance.count().sum()) 58 | 59 | finance = finance.fillna(finance.mean()) 60 | print('Number of NaN after mean interpolation: ', (finance.shape[0]*finance.shape[1] - finance.count().sum())) 61 | 62 | finance.columns = [str(col.replace('&', '_and_')) for col in finance.columns] 63 | 64 | #Move the Open Values behind by one dataset. 65 | finance.open = finance.open.shift(-1) 66 | 67 | print(high_value) 68 | finance = applyTimeLag(finance, [high_value], delta) 69 | 70 | print('Number of NaN after temporal shifting: ', count_missing(finance)) 71 | print('Size of data frame after feature creation: ', finance.shape) 72 | 73 | mean_squared_errors, r2_scores = performRegression(finance, 0.95, \ 74 | symbol, output_dir) 75 | 76 | scores[symbol] = [mean_squared_errors, r2_scores] 77 | except Exception, e: 78 | pass 79 | traceback.print_exc() 80 | 81 | with open(os.path.join(output_dir, 'scores.pickle'), 'wb') as handle: 82 | pickle.dump(scores, handle) 83 | 84 | if __name__ == '__main__': 85 | main(sys.argv[1], sys.argv[2]) 86 | -------------------------------------------------------------------------------- /scripts/fetch_stock_data.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | import sys 3 | import os 4 | import csv 5 | import time 6 | import datetime 7 | 8 | from ychartspy.client import YChartsClient 9 | 10 | def convert(timestamp): 11 | return datetime.datetime.fromtimestamp(int(timestamp) / 1e3).strftime('%Y-%m-%d') 12 | 13 | def main(symbol_file, parameter_file, output_dir): 14 | ''' 15 | 16 | Params: 17 | ------------------------ 18 | parameter_file: This is the path to the file that contains the information that you want to fetch via the stock prediction api like ps_ratio, dividend_yield, etc. An example for this file is in the input directory of the project source directory. 19 | symbol_file: This is path to text file with the symbols that you want the output of seperated by newlines. 20 | output_dir: path to output directory. In this directory the files will be created and saved as symbol_1.csv, symbol_2.csv, etc. 
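Example (run from the scripts directory): python fetch_stock_data.py ../input/symbols.txt ../input/params.txt <output_dir>. Note that the symbol file is the first command-line argument.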
21 | ''' 22 | param_fp = open(parameter_file, 'r') 23 | 24 | param_list = [] 25 | 26 | count = {} 27 | 28 | for parameter in param_fp: 29 | param_list.append(parameter.strip()) 30 | count[parameter.strip()]=0 31 | 32 | client = YChartsClient() 33 | 34 | error_count = {} 35 | 36 | with open(symbol_file, 'r') as sym_fp: 37 | for symbol in list(csv.reader(sym_fp)): 38 | row_info = {} 39 | symbol = symbol[0].strip() 40 | 41 | to_write = [] 42 | to_write.append(['symbol', 'timestamp']) 43 | 44 | non_params = [] 45 | 46 | print symbol 47 | 48 | for parameter in param_list: 49 | parameter=parameter.strip() 50 | to_write[0].append(parameter) 51 | 52 | try: 53 | row = client.get_security_metric(symbol, parameter, start_date="01/01/1900") 54 | except Exception, e: 55 | if parameter in error_count: 56 | error_count[parameter]+=1 57 | else: 58 | error_count[parameter]=1 59 | non_params.append(parameter) 60 | continue 61 | 62 | for row_obj in row: 63 | if row_obj[0] not in row_info: 64 | row_info[row_obj[0]] = {} 65 | row_info[row_obj[0]][str(parameter)]=row_obj[1] 66 | 67 | if count[parameter]==0: 68 | count[parameter]=1 69 | 70 | 71 | new_file = open(os.path.join(output_dir, str(symbol) + '.csv'), 'w+') 72 | 73 | for key in sorted(row_info): 74 | temp = [] 75 | temp.append(str(symbol)) 76 | temp.append(convert(key)) 77 | 78 | for parameter in param_list: 79 | parameter=str(parameter) 80 | 81 | if count[parameter]==0: 82 | param_list.remove(parameter) 83 | to_write[0].remove(parameter) 84 | continue 85 | 86 | if parameter in row_info[key]: 87 | #print 'HERE', parameter, key 88 | temp.append(row_info[key][parameter]) 89 | #to_write[-1].append(row_info[key][str(parameter)]) 90 | else: 91 | #print 'NOT ', parameter, row_info[key] 92 | temp.append('NaN') 93 | 94 | to_write.append(temp) 95 | 96 | #print to_write[-1], len(to_write[-1]) 97 | 98 | #row_info[key].insert(0, convert(key)) 99 | #row_info[key].insert(0, str(symbol)) 100 | #to_write.append(row_info[key]) 101 | 102 | writer = csv.writer(new_file) 103 | writer.writerows(to_write) 104 | new_file.close() 105 | 106 | ''' 107 | for key in error_count: 108 | if error_count[key]==7: 109 | print key 110 | ''' 111 | #print non_params 112 | 113 | if __name__ == '__main__': 114 | main(str(sys.argv[1]), str(sys.argv[2]), str(sys.argv[3])) 115 | -------------------------------------------------------------------------------- /scripts/Algorithms/LSTN-RNN.py: -------------------------------------------------------------------------------- 1 | ### Incomplete 2 | 3 | import copy, numpy as np 4 | np.random.seed(0) 5 | 6 | # compute sigmoid nonlinearity 7 | def sigmoid(x): 8 | output = 1/(1+np.exp(-x)) 9 | return output 10 | 11 | # convert output of sigmoid function to its derivative 12 | def sigmoid_output_to_derivative(output): 13 | return output*(1-output) 14 | 15 | 16 | # training dataset generation 17 | int2binary = {} 18 | binary_dim = 8 19 | 20 | largest_number = pow(2,binary_dim) 21 | binary = np.unpackbits( 22 | np.array([range(largest_number)],dtype=np.uint8).T,axis=1) 23 | for i in range(largest_number): 24 | int2binary[i] = binary[i] 25 | 26 | 27 | # input variables 28 | alpha = 0.1 29 | input_dim = 2 30 | hidden_dim = 16 31 | output_dim = 1 32 | 33 | 34 | # initialize neural network weights 35 | synapse_0 = 2*np.random.random((input_dim,hidden_dim)) - 1 36 | synapse_1 = 2*np.random.random((hidden_dim,output_dim)) - 1 37 | synapse_h = 2*np.random.random((hidden_dim,hidden_dim)) - 1 38 | 39 | synapse_0_update = np.zeros_like(synapse_0) 40 | synapse_1_update 
= np.zeros_like(synapse_1) 41 | synapse_h_update = np.zeros_like(synapse_h) 42 | 43 | # training logic 44 | for j in range(10000): 45 | 46 | # generate a simple addition problem (a + b = c) 47 | a_int = np.random.randint(largest_number/2) # int version 48 | a = int2binary[a_int] # binary encoding 49 | 50 | b_int = np.random.randint(largest_number/2) # int version 51 | b = int2binary[b_int] # binary encoding 52 | 53 | # true answer 54 | c_int = a_int + b_int 55 | c = int2binary[c_int] 56 | 57 | # where we'll store our best guess (binary encoded) 58 | d = np.zeros_like(c) 59 | 60 | overallError = 0 61 | 62 | layer_2_deltas = list() 63 | layer_1_values = list() 64 | layer_1_values.append(np.zeros(hidden_dim)) 65 | 66 | # moving along the positions in the binary encoding 67 | for position in range(binary_dim): 68 | 69 | # generate input and output 70 | X = np.array([[a[binary_dim - position - 1],b[binary_dim - position - 1]]]) 71 | y = np.array([[c[binary_dim - position - 1]]]).T 72 | 73 | # hidden layer (input ~+ prev_hidden) 74 | layer_1 = sigmoid(np.dot(X,synapse_0) + np.dot(layer_1_values[-1],synapse_h)) 75 | 76 | # output layer (new binary representation) 77 | layer_2 = sigmoid(np.dot(layer_1,synapse_1)) 78 | 79 | # did we miss?... if so, by how much? 80 | layer_2_error = y - layer_2 81 | layer_2_deltas.append((layer_2_error)*sigmoid_output_to_derivative(layer_2)) 82 | overallError += np.abs(layer_2_error[0]) 83 | 84 | # decode estimate so we can print it out 85 | d[binary_dim - position - 1] = np.round(layer_2[0][0]) 86 | 87 | # store hidden layer so we can use it in the next timestep 88 | layer_1_values.append(copy.deepcopy(layer_1)) 89 | 90 | future_layer_1_delta = np.zeros(hidden_dim) 91 | 92 | for position in range(binary_dim): 93 | 94 | X = np.array([[a[position],b[position]]]) 95 | layer_1 = layer_1_values[-position-1] 96 | prev_layer_1 = layer_1_values[-position-2] 97 | 98 | # error at output layer 99 | layer_2_delta = layer_2_deltas[-position-1] 100 | # error at hidden layer 101 | layer_1_delta = (future_layer_1_delta.dot(synapse_h.T) + layer_2_delta.dot(synapse_1.T)) * sigmoid_output_to_derivative(layer_1) 102 | 103 | # let's update all our weights so we can try again 104 | synapse_1_update += np.atleast_2d(layer_1).T.dot(layer_2_delta) 105 | synapse_h_update += np.atleast_2d(prev_layer_1).T.dot(layer_1_delta) 106 | synapse_0_update += X.T.dot(layer_1_delta) 107 | 108 | future_layer_1_delta = layer_1_delta 109 | 110 | 111 | synapse_0 += synapse_0_update * alpha 112 | synapse_1 += synapse_1_update * alpha 113 | synapse_h += synapse_h_update * alpha 114 | 115 | synapse_0_update *= 0 116 | synapse_1_update *= 0 117 | synapse_h_update *= 0 118 | 119 | # print out progress 120 | if(j % 1000 == 0): 121 | print "Error:" + str(overallError) 122 | print "Pred:" + str(d) 123 | print "True:" + str(c) 124 | out = 0 125 | for index,x in enumerate(reversed(d)): 126 | out += x*pow(2,index) 127 | print str(a_int) + " + " + str(b_int) + " = " + str(out) 128 | print "------------" 129 | 130 | -------------------------------------------------------------------------------- /scripts/Algorithms/rnn_lstm.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python 2 | ''' 3 | Running LSTM Algorithm. 
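Usage: python rnn_lstm.py <data_dir>; as written, the script loads GOOGL.csv from that directory and trains on its 'high' column.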
4 | ''' 5 | from __future__ import print_function 6 | import numpy as np 7 | np.random.seed(1337) # for reproducibility 8 | 9 | from keras.preprocessing import sequence 10 | from keras.utils import np_utils 11 | from keras.models import Sequential 12 | from keras.layers.core import Dense, Dropout, Activation 13 | from keras.layers.embeddings import Embedding 14 | from keras.layers.recurrent import LSTM, SimpleRNN, GRU 15 | from keras.layers.core import * 16 | 17 | max_features = 5883 18 | maxlen = 80 19 | batch_size = 32 20 | 21 | in_out_neurons = 2 22 | hidden_neurons = 300 23 | 24 | import os 25 | import sys 26 | import pandas as pd 27 | 28 | 29 | def _load_data(data, n_prev=100): 30 | """ 31 | data should be pd.DataFrame() 32 | """ 33 | 34 | docX, docY = [], [] 35 | for i in range(len(data)-n_prev): 36 | docX.append(data.iloc[i:i+n_prev]) 37 | docY.append(data.iloc[i+n_prev]) 38 | 39 | all_X = np.array(docX) 40 | all_Y = np.array(docY) 41 | 42 | return all_X, all_Y 43 | 44 | 45 | def train_test_split(dataframe, test_size=0.2): 46 | """ 47 | This just splits data to training and testing parts 48 | """ 49 | ntrn = int(round(len(dataframe) * (1 - test_size))) 50 | 51 | X_train, y_train = _load_data(dataframe.iloc[0:ntrn]) 52 | X_test, y_test = _load_data(dataframe.iloc[ntrn:]) 53 | 54 | print(X_train, y_train) 55 | 56 | return (X_train, y_train), (X_test, y_test) 57 | 58 | 59 | def rnn_lstm(file_dataframe, test_size=0.2, col="high"): 60 | print('Loading data...') 61 | (X_train, y_train), (X_test, y_test) = train_test_split( 62 | file_dataframe[col], test_size=0.2) 63 | 64 | ''' 65 | 66 | X_train = np.array([[ 360, 7, 19, 256, 82, 7], \ 67 | [ 6, 102, 37, 5, 1324, 7]]) 68 | 69 | y_train = np.array([1, 0]) 70 | 71 | X_test = X_train 72 | 73 | y_test = y_train 74 | 75 | print(X_train.shape, y_train.shape) 76 | print(len(X_train), 'train sequences') 77 | print(len(X_test), 'test sequences') 78 | 79 | ''' 80 | 81 | print('Pad sequences (samples x time)') 82 | X_train = sequence.pad_sequences(X_train, maxlen=maxlen) 83 | X_test = sequence.pad_sequences(X_test, maxlen=maxlen) 84 | print('X_train shape:', X_train.shape) 85 | print('X_test shape:', X_test.shape) 86 | 87 | print('Build model...') 88 | ''' 89 | model = Sequential() 90 | model.add(Embedding(max_features, hidden_neurons, \ 91 | input_length=maxlen, dropout=0.2)) 92 | model.add(LSTM(hidden_neurons, dropout_W=0.2, dropout_U=0.2)) 93 | model.add(Dense(1)) 94 | model.add(Activation('sigmoid')) 95 | 96 | model.compile(loss='binary_crossentropy', 97 | optimizer='adam', 98 | metrics=['accuracy']) 99 | 100 | #model.compile(loss="mean_squared_error", \ 101 | # optimizer="rmsprop", metrics=['accuracy']) 102 | ''' 103 | 104 | input_dim = 32 105 | hidden = 32 106 | step = 10 107 | 108 | #The LSTM model - output_shape = (batch, step, hidden) 109 | model1 = Sequential() 110 | model1.add(LSTM(input_dim=input_dim, output_dim=hidden, input_length=step, return_sequences=True)) 111 | 112 | #The weight model - actual output shape = (batch, step) 113 | # after reshape : output_shape = (batch, step, hidden) 114 | model2 = Sequential() 115 | model2.add(Dense(input_dim=input_dim, output_dim=step)) 116 | model2.add(Activation('softmax')) # Learn a probability distribution over each step. 117 | #Reshape to match LSTM's output shape, so that we can do element-wise multiplication. 
118 | model2.add(RepeatVector(hidden)) 119 | model2.add(Permute((2, 1))) 120 | 121 | #The final model which gives the weighted sum: 122 | model = Sequential() 123 | model.add(Merge([model1, model2], 'mul', concat_axis=1)) # Multiply each element with corresponding weight a[i][j][k] * b[i][j] 124 | model.add(Merge([model1, model2], mode='sum', concat_axis=1)) # Sum the weighted elements. 125 | 126 | model.compile(loss='mse', optimizer='sgd') 127 | 128 | print('Train...') 129 | print(X_train.shape, X_test.shape) 130 | print(y_train.shape, y_test.shape) 131 | 132 | model.fit(X_train, y_train, batch_size=batch_size, \ 133 | validation_data=(X_test, y_test), nb_epoch=5) 134 | score, accuracy = model.evaluate(X_test, y_test, 135 | batch_size=batch_size) 136 | print('Test score:', score) 137 | print('Test accuracy:', accuracy) 138 | 139 | return (score, accuracy) 140 | 141 | 142 | def main(dir_path): 143 | ''' 144 | Run Pipeline of processes on file one by one. 145 | ''' 146 | files = os.listdir(dir_path) 147 | 148 | #for file_name in files: 149 | file_name="GOOGL.csv" 150 | print(file_name) 151 | 152 | file_dataframe = pd.read_csv(os.path.join(dir_path, file_name)) 153 | 154 | print(rnn_lstm(file_dataframe, 0.1, 'high')) 155 | 156 | #break 157 | 158 | if __name__ == '__main__': 159 | main(sys.argv[1]) 160 | -------------------------------------------------------------------------------- /scripts/Algorithms/regression_helpers.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Miscellaneous Functions for Regression File. 4 | """ 5 | 6 | from __future__ import print_function 7 | import numpy as np 8 | import pandas as pd 9 | from sklearn import preprocessing 10 | from sklearn.metrics import mean_squared_error, r2_score 11 | from sklearn.ensemble import RandomForestRegressor 12 | import matplotlib.pyplot as plt 13 | from sklearn.ensemble import BaggingRegressor 14 | from sklearn.ensemble import AdaBoostRegressor 15 | from sklearn.ensemble import GradientBoostingRegressor 16 | from sklearn.neighbors import KNeighborsRegressor 17 | from sklearn.ensemble import RandomForestClassifier 18 | from sklearn import neighbors 19 | from sklearn.ensemble import AdaBoostClassifier 20 | from sklearn.ensemble import GradientBoostingClassifier 21 | #from sklearn.svm import SVR 22 | from sklearn.feature_selection import SelectKBest, chi2 23 | from sklearn.svm import SVC, SVR 24 | from sklearn.qda import QDA 25 | import os 26 | from sklearn.grid_search import GridSearchCV 27 | from Neural_Network import NeuralNet 28 | 29 | def load_dataset(path_directory, symbol): 30 | """ 31 | Import DataFrame from Dataset. 
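Returns a one-element list so that mergeDataframes() can concatenate additional index datasets (S&P 500, NASDAQ, ...) if the commented-out loaders below are re-enabled.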
32 | """ 33 | 34 | path = os.path.join(path_directory, symbol) 35 | 36 | out = pd.read_csv(path, index_col=2, parse_dates=[2]) 37 | out.drop(out.columns[0], axis=1, inplace=True) 38 | 39 | #name = path_directory + '/sp.csv' 40 | #sp = pd.read_csv(name, index_col=0, parse_dates=[1]) 41 | 42 | #name = path_directory + '/GOOGL.csv' 43 | #nasdaq = pd.read_csv(name, index_col=1, parse_dates=[1]) 44 | 45 | #name = path_directory + '/treasury.csv' 46 | #treasury = pd.read_csv(name, index_col=0, parse_dates=[1]) 47 | 48 | #return [sp, nasdaq, djia, treasury, hkong, frankfurt, paris, nikkei, london, australia] 49 | #return [out, nasdaq, djia, frankfurt, hkong, nikkei, australia] 50 | return [out] 51 | 52 | def count_missing(dataframe): 53 | """ 54 | count number of NaN in dataframe 55 | """ 56 | return (dataframe.shape[0] * dataframe.shape[1]) - dataframe.count().sum() 57 | 58 | 59 | def addFeatures(dataframe, adjclose, returns, n): 60 | """ 61 | operates on two columns of dataframe: 62 | - n >= 2 63 | - given Return_* computes the return of day i respect to day i-n. 64 | - given AdjClose_* computes its moving average on n days 65 | 66 | """ 67 | 68 | return_n = adjclose[9:] + "Time" + str(n) 69 | dataframe[return_n] = dataframe[adjclose].pct_change(n) 70 | 71 | roll_n = returns[7:] + "RolMean" + str(n) 72 | dataframe[roll_n] = pd.rolling_mean(dataframe[returns], n) 73 | 74 | exp_ma = returns[7:] + "ExponentMovingAvg" + str(n) 75 | dataframe[exp_ma] = pd.ewma(dataframe[returns], halflife=n) 76 | 77 | def mergeDataframes(datasets): 78 | """ 79 | Merge Datasets into Dataframe. 80 | """ 81 | return pd.concat(datasets) 82 | 83 | 84 | def applyTimeLag(dataset, lags, delta): 85 | """ 86 | apply time lag to return columns selected according to delta. 87 | Days to lag are contained in the lads list passed as argument. 88 | Returns a NaN free dataset obtained cutting the lagged dataset 89 | at head and tail 90 | """ 91 | maxLag = max(lags) 92 | 93 | columns = dataset.columns[::(2*max(delta)-1)] 94 | for column in columns: 95 | newcolumn = column + str(maxLag) 96 | dataset[newcolumn] = dataset[column].shift(maxLag) 97 | 98 | return dataset.iloc[maxLag:-1, :] 99 | 100 | # CLASSIFICATION 101 | def prepareDataForClassification(dataset, start_test): 102 | """ 103 | generates categorical to be predicted column, attach to dataframe 104 | and label the categories 105 | """ 106 | le = preprocessing.LabelEncoder() 107 | 108 | dataset['UpDown'] = dataset['Return_Out'] 109 | dataset.UpDown[dataset.UpDown >= 0] = 'Up' 110 | dataset.UpDown[dataset.UpDown < 0] = 'Down' 111 | dataset.UpDown = le.fit(dataset.UpDown).transform(dataset.UpDown) 112 | 113 | features = dataset.columns[1:-1] 114 | X = dataset[features] 115 | y = dataset.UpDown 116 | 117 | X_train = X[X.index < start_test] 118 | y_train = y[y.index < start_test] 119 | 120 | X_test = X[X.index >= start_test] 121 | y_test = y[y.index >= start_test] 122 | 123 | return X_train, y_train, X_test, y_test 124 | 125 | def prepareDataForModelSelection(X_train, y_train, start_validation): 126 | """ 127 | gets train set and generates a validation set splitting the train. 128 | The validation set is mandatory for feature and model selection. 
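start_validation is compared against the date index, so it should be a date that falls inside the training period.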
129 | """ 130 | X = X_train[X_train.index < start_validation] 131 | y = y_train[y_train.index < start_validation] 132 | 133 | X_val = X_train[X_train.index >= start_validation] 134 | y_val = y_train[y_train.index >= start_validation] 135 | 136 | return X, y, X_val, y_val 137 | 138 | 139 | def performClassification(X_train, y_train, X_test, y_test, method, parameters={}): 140 | """ 141 | Perform Classification with the help of several Algorithms. 142 | """ 143 | 144 | print('Performing ' + method + ' Classification...') 145 | print('Size of train set: ', X_train.shape) 146 | print('Size of test set: ', X_test.shape) 147 | print('Size of train labels: ', y_train.shape) 148 | print('Size of test labels: ', y_test.shape) 149 | 150 | 151 | classifiers = [ 152 | RandomForestClassifier(n_estimators=100, n_jobs=-1), 153 | neighbors.KNeighborsClassifier(), 154 | SVC(C=10000), 155 | AdaBoostRegressor(), 156 | AdaBoostClassifier(**parameters), 157 | GradientBoostingClassifier(n_estimators=100), 158 | QDA(), 159 | ] 160 | 161 | scores = [] 162 | 163 | for classifier in classifiers: 164 | scores.append(benchmark_classifier(classifier, \ 165 | X_train, y_train, X_test, y_test)) 166 | 167 | print(scores) 168 | 169 | def benchmark_classifier(clf, X_train, y_train, X_test, y_test): 170 | clf.fit(X_train, y_train) 171 | accuracy = clf.score(X_test, y_test) 172 | #auc = roc_auc_score(y_test, clf.predict(X_test)) 173 | return accuracy 174 | 175 | # REGRESSION 176 | 177 | def getFeatures(X_train, y_train, X_test, num_features): 178 | ch2 = SelectKBest(chi2, k=num_features) 179 | X_train = ch2.fit_transform(X_train, y_train) 180 | X_test = ch2.transform(X_test) 181 | return X_train, X_test 182 | 183 | def performRegression(dataset, split, symbol, output_dir): 184 | """ 185 | Performing Regression on 186 | Various algorithms 187 | """ 188 | 189 | features = dataset.columns[1:] 190 | index = int(np.floor(dataset.shape[0]*split)) 191 | train, test = dataset[:index], dataset[index:] 192 | print('Size of train set: ', train.shape) 193 | print('Size of test set: ', test.shape) 194 | 195 | #train, test = getFeatures(train[features], \ 196 | # train[output], test[features], 16) 197 | 198 | out_params = (symbol, output_dir) 199 | 200 | output = dataset.columns[0] 201 | 202 | predicted_values = [] 203 | 204 | classifiers = [ 205 | RandomForestRegressor(n_estimators=10, n_jobs=-1), 206 | SVR(C=100000, kernel='rbf', epsilon=0.1, gamma=1, degree=2), 207 | BaggingRegressor(), 208 | AdaBoostRegressor(), 209 | KNeighborsRegressor(), 210 | GradientBoostingRegressor(), 211 | ] 212 | 213 | for classifier in classifiers: 214 | 215 | predicted_values.append(benchmark_model(classifier, \ 216 | train, test, features, output, out_params)) 217 | 218 | maxiter = 1000 219 | batch = 150 220 | 221 | classifier = NeuralNet(50, learn_rate=1e-2) 222 | 223 | predicted_values.append(benchmark_model(classifier, \ 224 | train, test, features, output, out_params, \ 225 | fine_tune=False, maxiter=maxiter, SGD=True, batch=batch, rho=0.9)) 226 | 227 | 228 | print('-'*80) 229 | 230 | mean_squared_errors = [] 231 | 232 | r2_scores = [] 233 | 234 | for pred in predicted_values: 235 | mean_squared_errors.append(mean_squared_error(test[output].as_matrix(), \ 236 | pred)) 237 | r2_scores.append(r2_score(test[output].as_matrix(), pred)) 238 | 239 | print(mean_squared_errors, r2_scores) 240 | 241 | return mean_squared_errors, r2_scores 242 | 243 | def benchmark_model(model, train, test, features, output, \ 244 | 
output_params, *args, **kwargs): 245 | ''' 246 | Performs Training and Testing of the Data on the Model. 247 | ''' 248 | 249 | print('-'*80) 250 | model_name = model.__str__().split('(')[0].replace('Regressor', ' Regressor') 251 | print(model_name) 252 | 253 | ''' 254 | if 'SVR' in model.__str__(): 255 | tuned_parameters = [{'kernel': ['rbf', 'polynomial'], 'gamma': [1e-3, 1e-4], 256 | 'C': [1, 10, 100, 1000]}, 257 | {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}] 258 | model = GridSearchCV(SVC(C=1), tuned_parameters, cv=5, 259 | scoring='%s_weighted' % 'recall') 260 | ''' 261 | 262 | symbol, output_dir = output_params 263 | 264 | model.fit(train[features].as_matrix(), train[output].as_matrix(), *args, **kwargs) 265 | predicted_value = model.predict(test[features].as_matrix()) 266 | 267 | plt.plot(test[output].as_matrix(), color='g', ls='-', label='Actual Value') 268 | plt.plot(predicted_value, color='b', ls='--', label='predicted_value Value') 269 | 270 | plt.xlabel('Number of Set') 271 | plt.ylabel('Output Value') 272 | 273 | plt.title(model_name) 274 | plt.legend(loc='best') 275 | plt.tight_layout() 276 | plt.savefig(os.path.join(output_dir, str(symbol) + '_' \ 277 | + model_name + '.png'), dpi=100) 278 | #plt.show() 279 | plt.clf() 280 | 281 | return predicted_value 282 | -------------------------------------------------------------------------------- /scripts/Algorithms/Neural_Network.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | import numpy as np 4 | 5 | ''' 6 | Neural Network Implementation 7 | ''' 8 | class NeuralNet(): 9 | 10 | def __init__(self, num_nodes, weights_=[], classification=True, auto_encoder=False, penalty=0., learn_rate=0.01): 11 | self.num_nodes = num_nodes 12 | self.weights_ = weights_ 13 | self.is_fit = False 14 | self.classification = classification 15 | self.auto_encoder = auto_encoder 16 | if auto_encoder: 17 | self.classification = False 18 | self.K = 0 19 | self.penalty = penalty 20 | self.learn_rate = learn_rate 21 | np.seterr(all='warn') 22 | 23 | def __str__(self): 24 | return "Neural Networks(" 25 | 26 | def initWeights(self, X, nclass): 27 | bias = np.ones((X.shape[0],1)) 28 | X = np.hstack((bias,X)) #add constant bias to each observation, X now N by P+1 29 | sizeX = X.shape[1] 30 | node_weights_ = np.random.uniform(-0.08,0.08,size=(self.num_nodes, sizeX)) #M+1 by P+1 31 | output_weights_ = np.random.uniform(-0.08,0.08,size=(nclass, self.num_nodes+1)) #K by M+1 32 | return X, [node_weights_, output_weights_] #nrows = n_nodes, ncols = sizeX 33 | 34 | def sigmoid(self, alpha_, X_): 35 | v_ = alpha_.dot(X_.T) 36 | v_[v_ < -300] = -300 37 | v_[v_ > 300] = 300 38 | return 1./(1+np.exp(-v_)) 39 | 40 | def relu(self, alpha_, X_): 41 | try: 42 | v_ = alpha_.dot(X_.T) 43 | except ValueError: 44 | v_ = X_.dot(alpha_) 45 | v_ = v_.T 46 | v_[v_ < -300] = -300 47 | v_[v_ > 300] = 300 48 | return np.maximum(v_,np.zeros(v_.shape)) 49 | 50 | def drelu(self, alpha_, X_): 51 | try: 52 | v_ = alpha_.dot(X_.T) 53 | except ValueError: 54 | v_ = X_.dot(alpha_) 55 | v_ = v_.T 56 | v_[v_ <= 0.] = 0 57 | v_[v_ > 0.] = 1. 
58 | return v_ 59 | 60 | def tanh(self, alpha_, X_): 61 | try: 62 | v_ = alpha_.dot(X_.T) 63 | except ValueError: 64 | v_ = X_.dot(alpha_) 65 | v_ = v_.T 66 | v_[v_ < -300] = -300 67 | v_[v_ > 300] = 300 68 | return np.tanh(v_) 69 | 70 | def dtanh(self, alpha_, X_): 71 | return 1 - np.multiply(self.tanh(alpha_, X_), self.tanh(alpha_, X_)) 72 | 73 | def softmax(self, T): 74 | T[T < -300] = -300 75 | T[T > 300] = 300 76 | return (np.exp(T)/np.sum(np.exp(T), axis=0)).T #(K by N) / elementwise(1 by N) 77 | 78 | def initNodes(self, X, Y): 79 | K = self.K 80 | if self.weights_ == []: 81 | X, weights = self.initWeights(X, K) 82 | else: 83 | weights = self.weights_ 84 | bias = np.ones((X.shape[0],1)) 85 | X = np.hstack((bias,X)) 86 | return X, weights 87 | 88 | def backPropagate(self, weights, next_weights, X, Y, rho, old_del_alpha, old_del_beta, _dropout, back_delta=0., fine_tune=False): 89 | """ 90 | feed forward then back propagate error, update weights 91 | """ 92 | learn_rate = self.learn_rate 93 | # beta = 6. 94 | # sparsity = 0.05 95 | if _dropout: 96 | drop = np.random.uniform(0, 1, weights[0].shape[0]) 97 | if self.auto_encoder and not fine_tune: 98 | sig = self.relu(weights[0], X) 99 | dsig = self.drelu(weights[0], X) 100 | if _dropout: 101 | sig[drop>=0.5,:] = 0. 102 | dsig[drop>=0.5,:] = 0. 103 | # avg_sparsity = np.mean(sig, axis=1) 104 | elif not self.auto_encoder and not fine_tune: 105 | sig = self.tanh(weights[0], X) 106 | dsig = self.dtanh(weights[0], X) 107 | if _dropout: 108 | sig[drop>=0.5,:] = 0. 109 | dsig[drop>=0.5,:] = 0. 110 | 111 | if not fine_tune: 112 | bias = np.ones((1,sig.shape[1])) 113 | sig = np.vstack((bias,sig)) 114 | hidden_out = weights[1].dot(sig) 115 | if self.classification: 116 | h = self.softmax(hidden_out) 117 | forward_error = h - Y 118 | else: 119 | h = hidden_out.T 120 | forward_error = h - Y[:,1:] #both N by K 121 | dRdBeta = sig.dot(forward_error)/forward_error.shape[0] #(M+1 by N)*(N by K) = M+1 by K gradient-force for each neuron 122 | 123 | if fine_tune: 124 | prop_back = np.multiply(back_delta.dot(next_weights[:,1:]),dsig.T) 125 | dRdAlpha = prop_back.T.dot(X)/X.shape[0] 126 | elif not fine_tune and not self.auto_encoder: 127 | back_error = np.multiply((forward_error.dot(weights[1][:,1:])),(dsig.T)) #((N by K)*(K by M+1))*ewise(N by M+1) = N by M+1 128 | prop_back = back_error 129 | elif not fine_tune and self.auto_encoder: 130 | back_error = np.multiply((forward_error.dot(weights[1][:,1:])),(dsig.T)) 131 | prop_back = 0. 132 | # back_error = np.multiply((forward_error.dot(weights[1][:,1:])),(dsig.T)) + beta*(-sparsity/avg_sparsity+(1-sparsity)/(1-avg_sparsity)) 133 | if not fine_tune: 134 | dRdAlpha = (back_error.T).dot(X)/X.shape[0] 135 | del_beta = rho*old_del_beta - learn_rate*dRdBeta.T 136 | else: 137 | del_beta = 0. 
138 | del_alpha = rho*old_del_alpha - learn_rate*dRdAlpha 139 | 140 | """Bias weights do not get penalized""" 141 | if not fine_tune: 142 | bias1 = np.zeros((weights[1].shape[0], 1)) 143 | weights[1] = weights[1] + del_beta + np.hstack((bias1,self.penalty*weights[1][:,1:])) #M+1 by K 144 | bias0 = np.zeros((weights[0].shape[0], 1)) 145 | weights[0] = weights[0] + del_alpha + np.hstack((bias0,self.penalty*weights[0][:,1:])) #M+1 by P+1 146 | 147 | return weights, del_alpha, del_beta, prop_back 148 | 149 | def feedForward(self,X,layers): 150 | activations = [] 151 | for i,layer in enumerate(layers): 152 | if i == 0: 153 | if layer.auto_encoder: 154 | if layer._dropout: 155 | sig = layer.relu(layer.weights_[0]/2.,X) 156 | else: 157 | sig = layer.relu(layer.weights_[0], X) 158 | # sig = layer.tanh(layer.weights_[0], X) 159 | else: 160 | if layer._dropout: 161 | sig = layer.tanh(layer.weights_[0]/2.,X) 162 | else: 163 | sig = layer.tanh(layer.weights_[0], X) 164 | else: 165 | if layer.auto_encoder: 166 | bias = np.ones((1,sig.shape[1])) 167 | sig = np.vstack((bias,sig)) 168 | if layer._dropout: 169 | sig = layer.relu(layer.weights_[0]/2., sig.T) 170 | else: 171 | sig = layer.relu(layer.weights_[0], sig.T) 172 | else: 173 | if layer._dropout: 174 | sig = layer.tanh(layer.weights_[0]/2., sig.T) 175 | else: 176 | sig = layer.tanh(layer.weights_[0], sig.T) 177 | activations.append(sig.T) 178 | return activations 179 | 180 | def fit(self, X, Y, rho=0., maxiter=300, tol=0.000001, anneal=False, t_0=50, dropout=False, batch=40, SGD=True, layers=[], fine_tune=False): 181 | self._dropout = dropout 182 | grad_alpha, grad_beta = 0., 0. 183 | layer_alphas = [0. for i in range(len(layers))] 184 | layer_betas = layer_alphas 185 | self.is_fit = True 186 | if self.classification: 187 | #one-hot encode Y 188 | try: 189 | #if already one-hot encoded, pass Y as Y_new 190 | if Y.shape[1] > 1: 191 | Y_new = Y 192 | self.K = Y.shape[1] 193 | #else one-hot encode Y as Y_new 194 | else: 195 | self.K = len(set(Y.flatten())) 196 | Y_new = np.zeros((len(Y),self.K)) 197 | for i,v in enumerate(Y): 198 | Y_new[i,v] = 1. 199 | #if Y.shape[1] null (1D array), one-hot encode it as Y_new 200 | except IndexError: 201 | self.K = len(set(Y.flatten())) #ditto 202 | Y_new = np.zeros((len(Y),self.K)) 203 | for i,v in enumerate(Y): 204 | Y_new[i,v] = 1. 
205 | else: 206 | Y_new = Y 207 | if not self.auto_encoder: 208 | self.K = 1 209 | else: 210 | self.K = Y.shape[1] 211 | if layers == []: 212 | X, w = self.initNodes(X, Y_new) 213 | else: 214 | bias = np.ones((X.shape[0],1)) 215 | X = np.hstack((bias,X)) #add constant bias to each observation, X now N by P+1 216 | X_ = self.feedForward(X,layers) 217 | X_[-1], w = self.initNodes(X_[-1], Y_new) 218 | 219 | for i in range(maxiter): 220 | if anneal and i != 0 and i % t_0 == 0: 221 | self.learn_rate /= (float(i)/t_0) 222 | if not SGD: 223 | if fine_tune and layers != []: 224 | X_hidden = self.feedForward(X,layers) 225 | bias = np.ones((X_hidden[-1].shape[0],1)) 226 | X_hidden[-1] = np.hstack((bias,X_hidden[-1])) 227 | 228 | w, grad_alpha, grad_beta, back_error = self.backPropagate(w, 0., X_hidden[-1], Y_new, rho, grad_alpha, grad_beta, dropout, back_delta=0., fine_tune=False) 229 | next_weights = w[0] 230 | for i,layer in enumerate(layers[::-1]): 231 | if len(layers)-i-2 >= 0: 232 | activations = X_hidden[len(layers)-i-2] 233 | bias = np.ones((activations.shape[0], 1)) 234 | activations = np.hstack((bias,activations)) 235 | else: 236 | activations = X 237 | layer.weights_, layer_alphas[i], layer_betas[i], back_error = layer.backPropagate(layer.weights_, next_weights, activations, Y_new, rho, layer_alphas[i], layer_betas[i], dropout, back_delta=back_error, fine_tune=True) 238 | next_weights = layer.weights_[0] 239 | elif not fine_tune: 240 | if self.auto_encoder: 241 | choose = np.random.binomial(1, 0.9, size=X.shape) 242 | X_noisy = np.multiply(choose, X) 243 | else: 244 | X_noisy = X 245 | w, grad_alpha, grad_beta, back_error = self.backPropagate(w, 0., X_noisy, Y_new, rho, grad_alpha, grad_beta, dropout, back_delta=0., fine_tune=fine_tune) 246 | 247 | else: 248 | samples = np.random.choice(range(len(X)),size=batch,replace=False) 249 | if fine_tune and layers != []: 250 | try: 251 | X_hidden = self.feedForward(X[samples,:],layers) 252 | except TypeError: 253 | X_samples = [X[z] for z in samples] 254 | X_hidden = self.feedForward(X_samples,layers) 255 | bias = np.ones((X_hidden[-1].shape[0],1)) 256 | X_hidden[-1] = np.hstack((bias,X_hidden[-1])) 257 | 258 | w, grad_alpha, grad_beta, back_error = self.backPropagate(w, 0., X_hidden[-1], Y_new[samples,:], rho, grad_alpha, grad_beta, dropout, back_delta=0., fine_tune=False) 259 | next_weights = w[0] 260 | for i,layer in enumerate(layers[::-1]): 261 | if len(layers)-i-2 >= 0: 262 | activations = X_hidden[len(layers)-i-2] 263 | bias = np.ones((activations.shape[0],1)) 264 | activations = np.hstack((bias,activations)) 265 | else: 266 | try: 267 | activations = X[samples,:] 268 | except TypeError: 269 | activations = [X[z] for z in samples] 270 | layer.weights_, layer_alphas[i], layer_betas[i], back_error = layer.backPropagate(layer.weights_, next_weights, activations, Y_new[samples,:], rho, layer_alphas[i], layer_betas[i], dropout, back_delta=back_error, fine_tune=True) 271 | next_weights = layer.weights_[0] 272 | elif not fine_tune: 273 | if self.auto_encoder: 274 | choose = np.random.binomial(1,0.9,size=X[samples,:].shape) 275 | X_noisy = np.multiply(choose, X[samples,:]) 276 | Y_test = X[samples,:] 277 | else: 278 | X_noisy = X[samples,:] 279 | if self.classification: 280 | Y_test = Y_new[samples,:] 281 | else: 282 | Y_test = Y_new[samples] 283 | w, grad_alpha, grad_beta, back_error = self.backPropagate(w, 0., X_noisy, Y_test, rho, grad_alpha, grad_beta, dropout, back_delta=0., fine_tune=fine_tune) 284 | 285 | self.weights_ = w 286 | return layers 
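# Note (inferred from fit()/feedForward() above): the `layers` argument is a list of
# already-fitted NeuralNet auto-encoders whose hidden activations feed this network;
# calling fit() with fine_tune=True also back-propagates the error through those
# layers' weights, i.e. the usual stacked-auto-encoder fine-tuning pass.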
287 | 288 | def predict(self, X, proba=True, layers=[]): 289 | if self.is_fit: 290 | self.predictions = [] 291 | if layers==[]: 292 | bias = np.ones((X.shape[0],1)) 293 | X = np.hstack((bias,X)) #add constant bias to each observation, X now N by P+1 294 | if layers == []: 295 | if self.auto_encoder: 296 | if self._dropout: 297 | activation = self.relu(self.weights_[0]/2, X) 298 | else: 299 | activation = self.relu(self.weights_[0], X) 300 | else: 301 | if self._dropout: 302 | activation = self.tanh(self.weights_[0]/2., X) 303 | else: 304 | activation = self.tanh(self.weights_[0], X) 305 | bias = np.ones((1,activation.shape[1])) 306 | activation = np.vstack((bias,activation)) 307 | response = self.weights_[1].dot(activation) 308 | else: 309 | activation = self.feedForward(X,layers) 310 | activation = activation[-1] 311 | # print activation.shape 312 | # if layers != []: 313 | bias = np.ones((activation.shape[0],1)) 314 | activation = np.hstack((bias,activation)) 315 | if self._dropout: 316 | response = self.tanh(self.weights_[0]/2., activation) 317 | else: 318 | response = self.tanh(self.weights_[0], activation) 319 | bias = np.ones((1,response.shape[1])) 320 | response = np.vstack((bias,response)) 321 | response = self.weights_[1].dot(response) 322 | if self.classification: 323 | predictions = self.softmax(response) 324 | if not proba: 325 | predictions = np.argmax(predictions, axis=1) 326 | else: 327 | predictions = response 328 | self.predictions = predictions 329 | return self.predictions 330 | else: 331 | return "Cannot predict without fitting data first!!" 332 | 333 | def hidden_activations(self, X): 334 | if self.is_fit: 335 | bias = np.ones((X.shape[0],1)) 336 | X = np.hstack((bias,X)) 337 | if self.auto_encoder: 338 | if self._dropout: 339 | activations = self.relu(self.weights_[0]/2., X) 340 | else: 341 | activations = self.relu(self.weights_[0], X) 342 | # activations = self.tanh(self.weights_[0], X) 343 | else: 344 | if self._dropout: 345 | activations = self.tanh(self.weights_[0]/2., X) 346 | else: 347 | activations = self.tanh(self.weights_[0], X) 348 | return activations.T 349 | else: 350 | return "Method 'hidden_activations' can only be called for auto encoders" 351 | 352 | def score(self, X_test, Y_test, layers=[]): 353 | predictions = self.predict(X_test, proba=False, layers=layers) 354 | if self.classification: 355 | try: 356 | if Y_test.shape[1] > 1: 357 | num_correct = predictions == np.argmax(Y_test, axis=1) 358 | return float(len(Y_test[num_correct]))/len(Y_test) 359 | else: 360 | num_correct = predictions == np.array(Y_test).flatten() 361 | return float(len(Y_test.flatten()[num_correct]))/len(Y_test) 362 | except IndexError: 363 | num_correct = predictions == np.array(Y_test).flatten() 364 | return float(len(Y_test.flatten()[num_correct]))/len(Y_test) 365 | else: 366 | n = len(Y_test) 367 | diff = predictions.T - Y_test 368 | MSE = 1. - sum(np.multiply(diff,diff))/n 369 | return MSE --------------------------------------------------------------------------------
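For reference, a minimal sketch of how the NeuralNet class above can be driven directly on a toy classification problem. The data, node count and hyper-parameters are illustrative only (regression_helpers.py drives the class through benchmark_model instead), and the snippet assumes it is run from scripts/Algorithms/ so the module is importable.

```python
import numpy as np
from Neural_Network import NeuralNet

np.random.seed(0)
X = np.random.randn(200, 5)                      # 200 samples, 5 features (toy data)
y = (X[:, 0] + X[:, 1] > 0).astype(int)          # two classes, integer labels 0/1

net = NeuralNet(num_nodes=20, classification=True, learn_rate=0.05)
net.fit(X, y, maxiter=200, SGD=True, batch=40)   # mini-batch updates, 200 iterations
labels = net.predict(X, proba=False)             # hard class predictions via argmax
print(labels[:10])
print('training accuracy: %.3f' % net.score(X, y))
```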