├── .gitignore
├── IE598_Final_Report.pdf
├── README.md
├── data
│   ├── GP1_CreditScore.xlsx
│   ├── GP2_EconCycle.xlsx
│   ├── bond_ratings.png
│   └── correlation matrix.png
└── source
    ├── GP1.final.py
    └── GP2.final.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### JetBrains template
3 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
4 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
5 | 
6 | # User-specific stuff
7 | .idea/**/tasks.xml
8 | .idea/**/usage.statistics.xml
9 | .idea/**/dictionaries
10 | .idea/**/shelf
11 | 
12 | # Generated files
13 | .idea/**/contentModel.xml
14 | 
15 | # Sensitive or high-churn files
16 | .idea/**/dataSources/
17 | .idea/**/dataSources.ids
18 | .idea/**/dataSources.local.xml
19 | .idea/**/sqlDataSources.xml
20 | .idea/**/dynamic.xml
21 | .idea/**/uiDesigner.xml
22 | .idea/**/dbnavigator.xml
23 | 
24 | # Gradle
25 | .idea/**/gradle.xml
26 | .idea/**/libraries
27 | 
28 | # Gradle and Maven with auto-import
29 | # When using Gradle or Maven with auto-import, you should exclude module files,
30 | # since they will be recreated, and may cause churn. Uncomment if using
31 | # auto-import.
32 | # .idea/modules.xml
33 | # .idea/*.iml
34 | # .idea/modules
35 | # *.iml
36 | # *.ipr
37 | 
38 | # CMake
39 | cmake-build-*/
40 | 
41 | # Mongo Explorer plugin
42 | .idea/**/mongoSettings.xml
43 | 
44 | # File-based project format
45 | *.iws
46 | 
47 | # IntelliJ
48 | out/
49 | 
50 | # mpeltonen/sbt-idea plugin
51 | .idea_modules/
52 | 
53 | # JIRA plugin
54 | atlassian-ide-plugin.xml
55 | 
56 | # Cursive Clojure plugin
57 | .idea/replstate.xml
58 | 
59 | # Crashlytics plugin (for Android Studio and IntelliJ)
60 | com_crashlytics_export_strings.xml
61 | crashlytics.properties
62 | crashlytics-build.properties
63 | fabric.properties
64 | 
65 | # Editor-based Rest Client
66 | .idea/httpRequests
67 | 
68 | # Android studio 3.1+ serialized cache file
69 | .idea/caches/build_file_checksums.ser
70 | 
71 | ### VisualStudioCode template
72 | .vscode/*
73 | !.vscode/settings.json
74 | !.vscode/tasks.json
75 | !.vscode/launch.json
76 | !.vscode/extensions.json
77 | 
78 | ### Python template
79 | # Byte-compiled / optimized / DLL files
80 | __pycache__/
81 | 
82 | *$py.class
83 | 
84 | # C extensions
85 | *.so
86 | 
87 | # Distribution / packaging
88 | build/
89 | develop-eggs/
90 | dist/
91 | downloads/
92 | eggs/
93 | .eggs/
94 | lib/
95 | lib64/
96 | parts/
97 | sdist/
98 | var/
99 | wheels/
100 | pip-wheel-metadata/
101 | share/python-wheels/
102 | *.egg-info/
103 | .installed.cfg
104 | *.egg
105 | MANIFEST
106 | 
107 | # PyInstaller
108 | # Usually these files are written by a python script from a template
109 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
110 | *.manifest
111 | *.spec
112 | 
113 | # Installer logs
114 | pip-log.txt
115 | pip-delete-this-directory.txt
116 | 
117 | # Unit test / coverage reports
118 | htmlcov/
119 | .tox/
120 | .nox/
121 | .coverage
122 | .coverage.*
123 | .cache
124 | nosetests.xml
125 | coverage.xml
126 | *.cover
127 | .hypothesis/
128 | .pytest_cache/
129 | 
130 | # Translations
131 | *.mo
132 | *.pot
133 | 
134 | # Django stuff:
135 | *.log
136 | local_settings.py
137 | db.sqlite3
138 | 
139 | # Flask stuff:
140 | instance/
141 | .webassets-cache
142 | 
143 | # Scrapy stuff:
144 | .scrapy
145 | 
146 | # Sphinx documentation
147 | docs/_build/
148 | 
149 | # PyBuilder
150 | target/
151 | 
152 | # Jupyter Notebook
153 | .ipynb_checkpoints
154 | 
155 | # IPython
156 | profile_default/
157 | ipython_config.py
158 | 
159 | # pyenv
160 | .python-version
161 | 
162 | # pipenv
163 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
164 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
165 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
166 | # install all needed dependencies.
167 | #Pipfile.lock
168 | 
169 | # celery beat schedule file
170 | celerybeat-schedule
171 | 
172 | # SageMath parsed files
173 | *.sage.py
174 | 
175 | # Environments
176 | .env
177 | .venv
178 | env/
179 | venv/
180 | ENV/
181 | env.bak/
182 | venv.bak/
183 | 
184 | # Spyder project settings
185 | .spyderproject
186 | .spyproject
187 | 
188 | # Rope project settings
189 | .ropeproject
190 | 
191 | # mkdocs documentation
192 | /site
193 | 
194 | # mypy
195 | .mypy_cache/
196 | .dmypy.json
197 | dmypy.json
198 | 
199 | # Pyre type checker
200 | .pyre/
201 | 
202 | ### Example user template template
203 | ### Example user template
204 | 
205 | # IntelliJ project files
206 | .idea
207 | *.iml
208 | out
209 | gen
210 | .idea/
--------------------------------------------------------------------------------
/IE598_Final_Report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chicago-joe/Python-ML-for-Financial-Applications/b756db319b9618907816d786fdcf99f08a05af15/IE598_Final_Report.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | IE598 Machine Learning in Finance, Fall 2018
2 | University of Illinois at Urbana-Champaign
3 | 
4 | Final Group Project
5 | 
6 | Authors: Joseph Loss, Ruozhong Yang, Fengkai Xu, Biao Feng, and Yuchen Duan
7 | 
8 | --------------------------------------------------------------------------------
9 | Model Outline:
10 | 1) Exploratory Data Analysis
11 | 2) Preprocessing, feature extraction, and feature selection
12 | 3) Model fitting and evaluation (at least three different machine learning models)
13 | 4) Hyperparameter tuning
14 | 5) Ensembling
15 | --------------------------------------------------------------------------------
16 | 
17 | The full write-up is available in [IE598_Final_Report.pdf](IE598_Final_Report.pdf).
--------------------------------------------------------------------------------
/data/GP1_CreditScore.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chicago-joe/Python-ML-for-Financial-Applications/b756db319b9618907816d786fdcf99f08a05af15/data/GP1_CreditScore.xlsx
--------------------------------------------------------------------------------
/data/GP2_EconCycle.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chicago-joe/Python-ML-for-Financial-Applications/b756db319b9618907816d786fdcf99f08a05af15/data/GP2_EconCycle.xlsx
--------------------------------------------------------------------------------
/data/bond_ratings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chicago-joe/Python-ML-for-Financial-Applications/b756db319b9618907816d786fdcf99f08a05af15/data/bond_ratings.png
--------------------------------------------------------------------------------
/data/correlation matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chicago-joe/Python-ML-for-Financial-Applications/b756db319b9618907816d786fdcf99f08a05af15/data/correlation matrix.png
--------------------------------------------------------------------------------
/source/GP1.final.py:
--------------------------------------------------------------------------------
1 | # IE598 Machine Learning in Finance, Fall 2018
2 | # University of Illinois at Urbana-Champaign
3 | #
4 | # Final Group Project
5 | #
6 | # Authors: Joseph Loss, Ruozhong Yang, Fengkai Xu, Biao Feng, and Yuchen Duan
7 | #
8 | # source code available at https://github.com/chicago-joe/Machine-Learning-in-Finance-Final-Project
9 | # --------------------------------------------------------------------------------
10 | # Model Outline:
11 | # 1) Exploratory Data Analysis
12 | # 2) Preprocessing, feature extraction, feature selection
13 | # 3) Model fitting and evaluation (at least three different machine learning models)
14 | # 4) Hyperparameter tuning
15 | # 5) Ensembling
16 | # --------------------------------------------------------------------------------
17 | 
18 | import matplotlib.pyplot as plt
19 | import numpy as np
20 | import pandas as pd
21 | import seaborn as sns
22 | from sklearn.base import BaseEstimator, ClassifierMixin, clone
23 | from sklearn.ensemble import RandomForestClassifier
24 | import six
25 | from sklearn.feature_selection import SelectFromModel
26 | from sklearn.linear_model import LogisticRegression
27 | from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
28 | from sklearn.neighbors import KNeighborsClassifier
29 | from sklearn.pipeline import Pipeline, _name_estimators
30 | from sklearn.preprocessing import StandardScaler, LabelEncoder
31 | from sklearn.tree import DecisionTreeClassifier
32 | 
33 | 
34 | # 1) Exploratory Data Analysis
35 | df = pd.read_excel('data/GP1_CreditScore.xlsx')
36 | 
37 | print(df.shape)
38 | df.info()
39 | print(df.head())
40 | print(df.describe())
41 | 
42 | cols = ['Sales/Revenues', 'Gross Margin', 'EBITDA', 'EBITDA Margin', 'Net Income Before Extras',
43 |         'Total Debt', 'Net Debt', 'LT Debt', 'ST Debt', 'Cash', 'Free Cash Flow', 'Total Debt/EBITDA',
44 |         'Net Debt/EBITDA', 'Total MV', 'Total Debt/MV', 'Net Debt/MV', 'CFO/Debt', 'CFO',
45 |         'Interest Coverage', 'Total Liquidity', 'Current Liquidity', 'Current Liabilities',
46 |         'EPS Before Extras', 'PE', 'ROA', 'ROE', 'InvGrd']
47 | 
48 | # correlation matrix
49 | cm = np.corrcoef(df[cols].values.T)
50 | sns.set(font_scale = 0.5)
51 | hm = sns.heatmap(cm,
52 |                  cbar = True,
53 |                  annot = True,
54 |                  square = True,
55 |                  fmt = '.2f',
56 |                  annot_kws = { 'size':3 },
57 |                  yticklabels = cols,
58 |                  xticklabels = cols)
59 | plt.savefig('correlation matrix.png', dpi = 960)
60 | plt.show()
61 | 
62 | X = df[['Sales/Revenues', 'Gross Margin', 'EBITDA', 'EBITDA Margin', 'Net Income Before Extras',
63 |         'Total Debt', 'Net Debt', 'LT Debt', 'ST Debt', 'Cash', 'Free Cash Flow', 'Total Debt/EBITDA',
64 |         'Net Debt/EBITDA', 'Total MV', 'Total Debt/MV', 'Net Debt/MV', 'CFO/Debt', 'CFO',
65 |         'Interest Coverage', 'Total Liquidity', 'Current Liquidity', 'Current Liabilities',
66 |         'EPS Before Extras', 'PE', 'ROA', 'ROE']].values
67 | 
68 | y = df['InvGrd'].values
69 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42, stratify = y)
70 | 
71 | sc = StandardScaler()
72 | sc.fit(X_train)
73 | X_train_std = sc.transform(X_train)
74 | X_test_std = sc.transform(X_test)
75 | 
76 | # 2) Preprocessing, feature extraction, feature selection
77 | # Select features based on random-forest feature importance
78 | forest = RandomForestClassifier(criterion = 'gini', n_estimators = 100, random_state = 42, n_jobs = 2)
79 | forest.fit(X_train_std, y_train)
80 | 
81 | print(forest.feature_importances_)
82 | print(X_train_std.shape)
83 | 
84 | model = SelectFromModel(forest, prefit = True)
85 | X_train_std = model.transform(X_train_std)
86 | X_test_std = model.transform(X_test_std)
87 | 
88 | print(X_test_std.shape)
89 | print(X_train_std.shape)
90 | 
91 | # 3) Model fitting and evaluation: the KNN, random forest, decision tree, and logistic regression models below are fit and evaluated together with the hyperparameter tuning in step 4
92 | 
93 | # 4) Hyperparameter tuning
94 | # KNN
95 | knn = KNeighborsClassifier()
96 | params_knn = {
97 |     'n_neighbors':range(1, 101)
98 | }
99 | grid_knn = GridSearchCV(estimator = knn,
100 |                         param_grid = params_knn,
101 |                         scoring = 'accuracy',
102 |                         cv = 10,
103 |                         n_jobs = -1)
104 | 
105 | grid_knn.fit(X_train_std, y_train)
106 | best_model_knn = grid_knn.best_estimator_
107 | 
108 | print(best_model_knn.score(X_test_std, y_test))
109 | 
110 | # Random Forest
111 | forest = RandomForestClassifier()
112 | params_forest = {
113 |     'criterion':['gini'],
114 |     'n_estimators':range(1, 101),
115 |     'random_state':[42]
116 | }
117 | grid_forest = GridSearchCV(estimator = forest,
118 |                            param_grid = params_forest,
119 |                            scoring = 'accuracy',
120 |                            cv = 10,
121 |                            n_jobs = -1)
122 | 
123 | grid_forest.fit(X_train_std, y_train)
124 | best_model_forest = grid_forest.best_estimator_
125 | 
126 | print(best_model_forest.score(X_test_std, y_test))
127 | 
128 | # Decision Tree
129 | tree = DecisionTreeClassifier()
130 | params_tree = {
131 |     'criterion':['gini'],
132 |     'max_depth':range(1, 101),
133 |     'random_state':[42]
134 | }
135 | grid_tree = GridSearchCV(estimator = tree,
136 |                          param_grid = params_tree,
137 |                          scoring = 'accuracy',
138 |                          cv = 10,
139 |                          n_jobs = -1)
140 | 
141 | grid_tree.fit(X_train_std, y_train)
142 | best_model_tree = grid_tree.best_estimator_
143 | 
144 | print(best_model_tree.score(X_test_std, y_test))
145 | 
146 | # Logistic Regression
147 | lr = LogisticRegression(max_iter = 1000, solver = 'lbfgs', multi_class = 'auto')
148 | params_lr = {
149 |     'C':range(1, 101),
150 |     'random_state':[42]
151 | }
152 | grid_lr = GridSearchCV(estimator = lr,
153 |                        param_grid = params_lr,
154 |                        scoring = 'accuracy',
155 |                        cv = 10,
156 |                        n_jobs = -1)
157 | 
158 | grid_lr.fit(X_train_std, y_train)
159 | best_model_lr =
grid_lr.best_estimator_ 160 | 161 | print(best_model_lr.score(X_test_std, y_test)) 162 | 163 | 164 | # 5) Ensembling 165 | # Majority Vote Classifier 166 | class MajorityVoteClassifier(BaseEstimator, 167 | ClassifierMixin): 168 | """ A majority vote ensemble classifier 169 | Parameters 170 | ---------- 171 | classifiers : array-like, shape = [n_classifiers] 172 | Different classifiers for the ensemble 173 | vote : str, {'classlabel', 'probability'} (default='label') 174 | If 'classlabel' the prediction is based on the argmax of 175 | class labels. Else if 'probability', the argmax of 176 | the sum of probabilities is used to predict the class label 177 | (recommended for calibrated classifiers). 178 | weights : array-like, shape = [n_classifiers], optional (default=None) 179 | If a list of `int` or `float` values are provided, the classifiers 180 | are weighted by importance; Uses uniform weights if `weights=None`. 181 | """ 182 | 183 | def __init__(self, classifiers, vote = 'classlabel', weights = None): 184 | 185 | self.classifiers = classifiers 186 | self.named_classifiers = { key:value for key, value 187 | in _name_estimators(classifiers) } 188 | self.vote = vote 189 | self.weights = weights 190 | 191 | def fit(self, X, y): 192 | """ Fit classifiers. 193 | Parameters 194 | ---------- 195 | X : {array-like, sparse matrix}, shape = [n_samples, n_features] 196 | Matrix of training samples. 197 | y : array-like, shape = [n_samples] 198 | Vector of target class labels. 199 | Returns 200 | ------- 201 | self : object 202 | """ 203 | if self.vote not in ('probability', 'classlabel'): 204 | raise ValueError("vote must be 'probability' or 'classlabel'" 205 | "; got (vote=%r)" 206 | % self.vote) 207 | 208 | if self.weights and len(self.weights) != len(self.classifiers): 209 | raise ValueError('Number of classifiers and weights must be equal' 210 | '; got %d weights, %d classifiers' 211 | % (len(self.weights), len(self.classifiers))) 212 | 213 | # Use LabelEncoder to ensure class labels start with 0, which 214 | # is important for np.argmax call in self.predict 215 | self.lablenc_ = LabelEncoder() 216 | self.lablenc_.fit(y) 217 | self.classes_ = self.lablenc_.classes_ 218 | self.classifiers_ = [] 219 | for clf in self.classifiers: 220 | fitted_clf = clone(clf).fit(X, self.lablenc_.transform(y)) 221 | self.classifiers_.append(fitted_clf) 222 | return self 223 | 224 | def predict(self, X): 225 | """ Predict class labels for X. 226 | Parameters 227 | ---------- 228 | X : {array-like, sparse matrix}, shape = [n_samples, n_features] 229 | Matrix of training samples. 230 | Returns 231 | ---------- 232 | maj_vote : array-like, shape = [n_samples] 233 | Predicted class labels. 234 | 235 | """ 236 | if self.vote == 'probability': 237 | maj_vote = np.argmax(self.predict_proba(X), axis = 1) 238 | else: # 'classlabel' vote 239 | 240 | # Collect results from clf.predict calls 241 | predictions = np.asarray([clf.predict(X) 242 | for clf in self.classifiers_]).T 243 | 244 | maj_vote = np.apply_along_axis( 245 | lambda x: 246 | np.argmax(np.bincount(x, 247 | weights = self.weights)), 248 | axis = 1, 249 | arr = predictions) 250 | maj_vote = self.lablenc_.inverse_transform(maj_vote) 251 | return maj_vote 252 | 253 | def predict_proba(self, X): 254 | """ Predict class probabilities for X. 255 | Parameters 256 | ---------- 257 | X : {array-like, sparse matrix}, shape = [n_samples, n_features] 258 | Training vectors, where n_samples is the number of samples and 259 | n_features is the number of features. 
260 |         Returns
261 |         ----------
262 |         avg_proba : array-like, shape = [n_samples, n_classes]
263 |             Weighted average probability for each class per sample.
264 |         """
265 |         probas = np.asarray([clf.predict_proba(X)
266 |                              for clf in self.classifiers_])
267 |         avg_proba = np.average(probas, axis = 0, weights = self.weights)
268 |         return avg_proba
269 | 
270 |     def get_params(self, deep = True):
271 |         """ Get classifier parameter names for GridSearch"""
272 |         if not deep:
273 |             return super(MajorityVoteClassifier, self).get_params(deep = False)
274 |         else:
275 |             out = self.named_classifiers.copy()
276 |             for name, step in six.iteritems(self.named_classifiers):
277 |                 for key, value in six.iteritems(step.get_params(deep = True)):
278 |                     out['%s__%s' % (name, key)] = value
279 |             return out
280 | 
281 | 
282 | # Ensembling
283 | clf1 = grid_knn.best_estimator_
284 | clf2 = grid_forest.best_estimator_
285 | clf3 = grid_tree.best_estimator_
286 | 
287 | pipe1 = Pipeline([['sc', StandardScaler()], ['clf', clf1]])
288 | pipe3 = Pipeline([['sc', StandardScaler()], ['clf', clf3]])
289 | clf_labels = ['KNN', 'RandomForest', 'Decision tree']
290 | 
291 | print('10-fold cross validation:\n')
292 | 
293 | mv_clf = MajorityVoteClassifier(classifiers = [pipe1, clf2, pipe3])
294 | clf_labels += ['Majority voting']
295 | all_clf = [pipe1, clf2, pipe3, mv_clf]
296 | 
297 | for clf, label in zip(all_clf, clf_labels):
298 |     scores = cross_val_score(estimator = clf, X = X_train_std, y = y_train, cv = 10, scoring = 'roc_auc')
299 | 
300 |     print("ROC AUC: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
301 | 
302 | # Multi-class classification: repeat the models using the ordinal 'Class' rating as the target
303 | X = df[['Sales/Revenues', 'Gross Margin', 'EBITDA', 'EBITDA Margin', 'Net Income Before Extras',
304 |         'Total Debt', 'Net Debt', 'LT Debt', 'ST Debt', 'Cash', 'Free Cash Flow', 'Total Debt/EBITDA',
305 |         'Net Debt/EBITDA', 'Total MV', 'Total Debt/MV', 'Net Debt/MV', 'CFO/Debt', 'CFO', 'Interest Coverage',
306 |         'Total Liquidity', 'Current Liquidity', 'Current Liabilities', 'EPS Before Extras', 'PE', 'ROA', 'ROE']].values
307 | 
308 | y = df['Class'].values
309 | 
310 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42, stratify = y)
311 | 
312 | sc = StandardScaler()
313 | sc.fit(X_train)
314 | 
315 | X_train_std = sc.transform(X_train)
316 | X_test_std = sc.transform(X_test)
317 | 
318 | # KNN
319 | knn = KNeighborsClassifier()
320 | params_knn = {
321 |     'n_neighbors':range(1, 101)
322 | }
323 | grid_knn = GridSearchCV(estimator = knn,
324 |                         param_grid = params_knn,
325 |                         scoring = 'accuracy',
326 |                         cv = 10,
327 |                         n_jobs = -1)
328 | 
329 | grid_knn.fit(X_train_std, y_train)
330 | best_model_knn_multi = grid_knn.best_estimator_
331 | 
332 | print('multi=' + str(best_model_knn_multi.score(X_test_std, y_test)))
333 | 
334 | # RandomForest
335 | forest = RandomForestClassifier()
336 | params_forest = {
337 |     'criterion':['gini'],
338 |     'n_estimators':range(1, 101),
339 |     'random_state':[42]
340 | }
341 | grid_forest = GridSearchCV(estimator = forest,
342 |                            param_grid = params_forest,
343 |                            scoring = 'accuracy',
344 |                            cv = 10,
345 |                            n_jobs = -1)
346 | 
347 | grid_forest.fit(X_train_std, y_train)
348 | best_model_forest_multi = grid_forest.best_estimator_
349 | 
350 | print('multi=' + str(best_model_forest_multi.score(X_test_std, y_test)))
351 | 
352 | # DecisionTree
353 | tree = DecisionTreeClassifier()
354 | params_tree = {
355 |     'criterion':['gini'],
356 |     'max_depth':range(1, 101),
357 |     'random_state':[42]
358 | }
359 | grid_tree = GridSearchCV(estimator = tree,
360 |                          param_grid = params_tree,
361 |                          scoring = 'accuracy',
362 |                          cv = 10,
363 |                          n_jobs = -1)
364 | 
365 | grid_tree.fit(X_train_std, y_train)
366 | best_model_tree_multi = grid_tree.best_estimator_
367 | 
368 | print('multi=' + str(best_model_tree_multi.score(X_test_std, y_test)))
369 | 
370 | # Logistic Regression
371 | lr = LogisticRegression(max_iter = 1000, solver = 'lbfgs', multi_class = 'auto')
372 | params_lr = {
373 |     'C':range(1, 101),
374 |     'random_state':[42]
375 | }
376 | grid_lr = GridSearchCV(estimator = lr,
377 |                        param_grid = params_lr,
378 |                        scoring = 'accuracy',
379 |                        cv = 10,
380 |                        n_jobs = -1)
381 | 
382 | grid_lr.fit(X_train_std, y_train)
383 | best_model_lr_multi = grid_lr.best_estimator_
384 | 
385 | print('multi=' + str(best_model_lr_multi.score(X_test_std, y_test)))
386 | 
387 | # Feature selection was also tried for this part, but it did not appear to improve the results.
388 | # To use it, run the selection step after fitting the model and before applying the StandardScaler.
389 | 
--------------------------------------------------------------------------------
/source/GP2.final.py:
--------------------------------------------------------------------------------
1 | # IE598 Machine Learning in Finance, Fall 2018
2 | # University of Illinois at Urbana-Champaign
3 | #
4 | # Final Group Project
5 | #
6 | # Authors: Joseph Loss, Ruozhong Yang, Fengkai Xu, Biao Feng, and Yuchen Duan
7 | #
8 | # source code available at https://github.com/chicago-joe/Machine-Learning-in-Finance-Final-Project
9 | # --------------------------------------------------------------------------------
10 | import matplotlib.pyplot as plt
11 | import numpy as np
12 | import pandas as pd
13 | import seaborn as sns
14 | from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
15 | from sklearn.feature_selection import SelectFromModel
16 | from sklearn.linear_model import LinearRegression, Lasso, Ridge
17 | from sklearn.metrics import mean_squared_error as MSE, r2_score, mean_squared_error
18 | from sklearn.model_selection import train_test_split
19 | from sklearn.preprocessing import StandardScaler
20 | 
21 | df = pd.read_excel('data/GP2_EconCycle.xlsx')
22 | 
23 | cols = ['T1Y Index', 'T2Y Index', 'T3Y Index', 'T5Y Index', 'T7Y Index', 'T10Y Index', 'CP1M', 'CP3M', 'CP6M',
24 |         'CP1M_T1Y', 'CP3M_T1Y', 'CP6M_T1Y', 'USPHCI', 'PCT 3MO FWD', 'PCT 6MO FWD', 'PCT 9MO FWD']
25 | 
26 | ## Exploratory Data Analysis
27 | df.dropna(inplace = True)
28 | print(df.shape, df.info(), df.describe(), df.head())
29 | 
30 | CPTcols = ['CP1M_T1Y', 'CP3M_T1Y', 'CP6M_T1Y', 'USPHCI']
31 | sns.pairplot(df[CPTcols], dropna = True, )
32 | 
33 | # plt.tight_layout()
34 | # plt.savefig('E:\Study\Courses\Fall 2018\IE 598\IE598 Homework\Group Project\scatter_GP2_.png',dpi = 500)
35 | plt.show()
36 | 
37 | cm = np.corrcoef(df[cols].values.T)
38 | hm = sns.heatmap(cm,
39 |                  cbar = False,
40 |                  annot = True,
41 |                  square = False,
42 |                  fmt = '.1f',
43 |                  annot_kws = { 'size':8 },
44 |                  yticklabels = cols,
45 |                  xticklabels = cols)
46 | 
47 | # plt.tight_layout()
48 | # plt.savefig('E:\Study\Courses\Fall 2018\IE 598\IE598 Homework\Group Project\heatmap_rate_GP2_.png',dpi = 15000)
49 | plt.show()
50 | 
51 | ## 3-month prediction and model
52 | X = np.array(df.drop(['USPHCI', 'PCT 3MO FWD', 'PCT 6MO FWD', 'PCT 9MO FWD'], 1))
53 | y = np.array(df['PCT 3MO FWD'])
54 | sc_x = StandardScaler()
55 | sc_y = StandardScaler()
56 | X_std = sc_x.fit_transform(X)
57 | y_std = sc_y.fit_transform(y[:,
np.newaxis]).flatten() 58 | 59 | X_train, X_test, y_train, y_test = train_test_split(X_std, y_std, test_size = 0.1, random_state = 42) 60 | 61 | # feature importance 62 | feat_labels = cols[:-4] 63 | forest = RandomForestRegressor(n_estimators = 500, random_state = 1) 64 | forest.fit(X_train, y_train) 65 | importances = forest.feature_importances_ 66 | 67 | indices = np.argsort(importances)[::-1] 68 | print("3MO FWD RATE - Feature Importance") 69 | for f in range(X_train.shape[1]): 70 | print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]])) 71 | 72 | print('\n') 73 | plt.title('Feature Importance PCT 3MO FWD') 74 | plt.bar(range(X_train.shape[1]), importances[indices], align = 'center') 75 | plt.xticks(range(X_train.shape[1]), feat_labels, rotation = 90) 76 | plt.xlim([-1, X_train.shape[1]]) 77 | plt.show() 78 | 79 | # Selection 80 | model = SelectFromModel(forest, prefit = True) 81 | X_train = model.transform(X_train) 82 | X_test = model.transform(X_test) 83 | print(X_test.shape) 84 | print(X_train.shape) 85 | 86 | # LinearRegression 87 | reg = LinearRegression() 88 | reg.fit(X_train, y_train) 89 | y_train_pred = reg.predict(X_train) 90 | y_test_pred = reg.predict(X_test) 91 | plt.scatter(y_train_pred, y_train_pred - y_train, 92 | c = 'steelblue', marker = 'o', edgecolor = 'white', 93 | label = 'Training data') 94 | plt.scatter(y_test_pred, y_test_pred - y_test, 95 | c = 'limegreen', marker = 's', edgecolor = 'white', 96 | label = 'Test data') 97 | plt.xlabel('Predicted values') 98 | plt.ylabel('Residuals') 99 | plt.legend(loc = 'upper left') 100 | plt.hlines(y = 0, xmin = 0, xmax = 1, color = 'black', lw = 2) 101 | plt.xlim([0, 1]) 102 | plt.savefig('LinearRegression.png', dpi = 300) 103 | plt.show() 104 | print('(LR)MSE train: %.3f, test: %.3f' % ( 105 | mean_squared_error(y_train, y_train_pred), 106 | mean_squared_error(y_test, y_test_pred))) 107 | print('(LR)R^2 train: %.3f, test: %.3f' % ( 108 | r2_score(y_train, y_train_pred), 109 | r2_score(y_test, y_test_pred))) 110 | print('(LR)Slope: %.3f' % reg.coef_[0]) 111 | print('(LR)Intercept: %.3f' % reg.intercept_) 112 | 113 | ## 6 month prediction and model 114 | X = np.array(df.drop(['USPHCI', 'PCT 3MO FWD', 'PCT 6MO FWD', 'PCT 9MO FWD'], 1)) 115 | y = np.array(df['PCT 6MO FWD']) 116 | sc_x = StandardScaler() 117 | sc_y = StandardScaler() 118 | X_std = sc_x.fit_transform(X) 119 | y_std = sc_y.fit_transform(y[:, np.newaxis]).flatten() 120 | 121 | X_train, X_test, y_train, y_test = train_test_split(X_std, y_std, test_size = 0.1, random_state = 42) 122 | feat_labels = cols[:-4] 123 | 124 | forest = RandomForestRegressor(n_estimators = 500, random_state = 1) 125 | forest.fit(X_train, y_train) 126 | importances = forest.feature_importances_ 127 | 128 | print("6MO FWD RATE - Feature Importance") 129 | indices = np.argsort(importances)[::-1] 130 | for f in range(X_train.shape[1]): 131 | print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]])) 132 | 133 | print('\n') 134 | plt.title('Feature Importance PCT 6MO FWD') 135 | plt.bar(range(X_train.shape[1]), importances[indices], align = 'center') 136 | plt.xticks(range(X_train.shape[1]), feat_labels, rotation = 90) 137 | plt.xlim([-1, X_train.shape[1]]) 138 | plt.show() 139 | 140 | # Selection 141 | model = SelectFromModel(forest, prefit = True) 142 | X_train = model.transform(X_train) 143 | X_test = model.transform(X_test) 144 | print(X_test.shape) 145 | print(X_train.shape) 146 | 147 | # RidgeRegression 148 | alpha_space = 
np.logspace(-3, 0, 4) 149 | ridge = Ridge(normalize = True) 150 | # Compute scores over range of alphas 151 | for alpha in alpha_space: 152 | 153 | # Specify the alpha value to use: ridge.alpha 154 | ridge.alpha = alpha 155 | ridge.fit(X_train, y_train) 156 | y_train_pred = ridge.predict(X_train) 157 | y_test_pred = ridge.predict(X_test) 158 | plt.scatter(y_train_pred, y_train_pred - y_train, 159 | c = 'steelblue', marker = 'o', edgecolor = 'white', 160 | label = 'Training data') 161 | plt.scatter(y_test_pred, y_test_pred - y_test, 162 | c = 'limegreen', marker = 's', edgecolor = 'white', 163 | label = 'Test data') 164 | plt.xlabel('Predicted values') 165 | plt.ylabel('Residuals') 166 | plt.legend(loc = 'upper left') 167 | plt.hlines(y = 0, xmin = 0, xmax = 1, color = 'black', lw = 2) 168 | plt.xlim([0, 1]) 169 | plt.savefig('Ridge(alpha=' + str(alpha) + ' ).png', dpi = 300) 170 | plt.show() 171 | print('Ridgealpha: %.3f' % (alpha)) 172 | print('MSE train: %.3f, test: %.3f' % ( 173 | mean_squared_error(y_train, y_train_pred), 174 | mean_squared_error(y_test, y_test_pred))) 175 | print('R^2 train: %.3f, test: %.3f' % ( 176 | r2_score(y_train, y_train_pred), 177 | r2_score(y_test, y_test_pred))) 178 | print('Slope: %.3f' % ridge.coef_[0]) 179 | print('Intercept: %.3f' % ridge.intercept_) 180 | 181 | ## 9-month prediction & model 182 | X = np.array(df.drop(['USPHCI', 'PCT 3MO FWD', 'PCT 6MO FWD', 'PCT 9MO FWD'], 1)) 183 | y = np.array(df['PCT 9MO FWD']) 184 | sc_x = StandardScaler() 185 | sc_y = StandardScaler() 186 | X_std = sc_x.fit_transform(X) 187 | y_std = sc_y.fit_transform(y[:, np.newaxis]).flatten() 188 | 189 | X_train, X_test, y_train, y_test = train_test_split(X_std, y_std, test_size = 0.1, random_state = 42) 190 | feat_labels = cols[:-4] 191 | 192 | forest = RandomForestRegressor(n_estimators = 500, random_state = 1) 193 | forest.fit(X_train, y_train) 194 | importances = forest.feature_importances_ 195 | 196 | print("9MO FWD RATE - Feature Importance") 197 | indices = np.argsort(importances)[::-1] 198 | for f in range(X_train.shape[1]): 199 | print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]])) 200 | print('\n') 201 | plt.title('Feature Importance: PCT 9MO FWD ') 202 | plt.bar(range(X_train.shape[1]), importances[indices], align = 'center') 203 | plt.xticks(range(X_train.shape[1]), feat_labels, rotation = 90) 204 | plt.xlim([-1, X_train.shape[1]]) 205 | plt.show() 206 | 207 | # Selection 208 | model = SelectFromModel(forest, prefit = True) 209 | X_train = model.transform(X_train) 210 | X_test = model.transform(X_test) 211 | print(X_test.shape) 212 | print(X_train.shape) 213 | 214 | # LassoRegression 215 | alpha_space = np.logspace(-6, -3, 4) 216 | lasso = Lasso(normalize = True) 217 | # Compute scores over range of alphas 218 | for alpha in alpha_space: 219 | 220 | # Specify the alpha value to use: ridge.alpha 221 | lasso.alpha = alpha 222 | lasso.fit(X_train, y_train) 223 | y_train_pred = lasso.predict(X_train) 224 | y_test_pred = lasso.predict(X_test) 225 | plt.scatter(y_train_pred, y_train_pred - y_train, 226 | c = 'steelblue', marker = 'o', edgecolor = 'white', 227 | label = 'Training data') 228 | plt.scatter(y_test_pred, y_test_pred - y_test, 229 | c = 'limegreen', marker = 's', edgecolor = 'white', 230 | label = 'Test data') 231 | plt.xlabel('Predicted values') 232 | plt.ylabel('Residuals') 233 | plt.legend(loc = 'upper left') 234 | plt.hlines(y = 0, xmin = 0, xmax = 1, color = 'black', lw = 2) 235 | plt.savefig('Lasso(alpha=' + str(alpha) + ' 
).png', dpi = 300)
236 |     plt.xlim([0, 1])
237 |     plt.show()
238 |     print('Lassoalpha: %.6f' % (lasso.alpha))
239 |     print('MSE train: %.3f, test: %.3f' % (
240 |         mean_squared_error(y_train, y_train_pred),
241 |         mean_squared_error(y_test, y_test_pred)))
242 |     print('R^2 train: %.3f, test: %.3f' % (
243 |         r2_score(y_train, y_train_pred),
244 |         r2_score(y_test, y_test_pred)))
245 |     print('Slope: %.3f' % lasso.coef_[0])
246 |     print('Intercept: %.3f' % lasso.intercept_)
247 | 
248 | ## Part 5 - Ensemble Learning
249 | # Set seed for reproducibility
250 | SEED = 1
251 | 
252 | # Split dataset into 90% train and 10% test
253 | X_train, X_test, y_train, y_test = train_test_split(X_std, y_std, test_size = 0.1, random_state = 42)
254 | 
255 | # Instantiate a GradientBoostingRegressor 'gbr'
256 | gbr = GradientBoostingRegressor(max_features = 4, learning_rate = 0.1, n_estimators = 500,
257 |                                 subsample = 0.3, random_state = 42)
258 | gbr.fit(X_train, y_train)
259 | # Predict the test set labels
260 | y_pred = gbr.predict(X_test)
261 | 
262 | # Evaluate the test set MSE and R-squared
263 | mse = MSE(y_test, y_pred)
264 | rsquared = r2_score(y_test, y_pred)
265 | 
266 | # Print the test set MSE and R-squared
267 | print('\n')
268 | print('Test set MSE: {:.2f}'.format(mse))
269 | print('Test set R-Squared: {:.2f}'.format(rsquared))
270 | 
271 | 
--------------------------------------------------------------------------------
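The three forward-horizon blocks in GP2.final.py repeat the same standardize / split / random-forest importance / select / fit / score sequence. The sketch below is not part of the original repository; it is a minimal illustration of how that repeated workflow could be factored into one helper, assuming GP2_EconCycle.xlsx sits under data/ with the column names used above. The function name evaluate_horizon and its parameters are illustrative choices, and the linear model stands in for whichever regressor is tried at a given horizon.

# sketch_gp2_horizons.py -- illustrative refactoring sketch, not part of the original project
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def evaluate_horizon(df, target, drop_cols, test_size = 0.1, seed = 42):
    """Standardize, select features with a random forest, fit a linear model,
    and report test MSE / R^2 for one forward-return column."""
    X = df.drop(columns = drop_cols).values
    y = df[target].values

    # standardize features and target, mirroring GP2.final.py
    X_std = StandardScaler().fit_transform(X)
    y_std = StandardScaler().fit_transform(y[:, np.newaxis]).flatten()

    X_train, X_test, y_train, y_test = train_test_split(
        X_std, y_std, test_size = test_size, random_state = seed)

    # rank features with a random forest, then keep only the important ones
    forest = RandomForestRegressor(n_estimators = 500, random_state = 1)
    forest.fit(X_train, y_train)
    selector = SelectFromModel(forest, prefit = True)
    X_train_sel = selector.transform(X_train)
    X_test_sel = selector.transform(X_test)

    # simple baseline regressor on the selected features
    reg = LinearRegression().fit(X_train_sel, y_train)
    y_pred = reg.predict(X_test_sel)
    return mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)


if __name__ == '__main__':
    df = pd.read_excel('data/GP2_EconCycle.xlsx').dropna()
    drop_cols = ['USPHCI', 'PCT 3MO FWD', 'PCT 6MO FWD', 'PCT 9MO FWD']
    for target in ['PCT 3MO FWD', 'PCT 6MO FWD', 'PCT 9MO FWD']:
        mse, r2 = evaluate_horizon(df, target, drop_cols)
        print('%s  MSE: %.3f  R^2: %.3f' % (target, mse, r2))

Keeping the per-horizon logic in one function also makes it straightforward to swap the final regressor (Ridge, Lasso, or the GradientBoostingRegressor used in Part 5) without duplicating the preprocessing steps.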