├── .gitignore
├── IE598_Final_Report.pdf
├── README.md
├── data
│   ├── GP1_CreditScore.xlsx
│   ├── GP2_EconCycle.xlsx
│   ├── bond_ratings.png
│   └── correlation matrix.png
└── source
    ├── GP1.final.py
    └── GP2.final.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Created by .ignore support plugin (hsz.mobi)
2 | ### JetBrains template
3 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
4 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
5 | 
6 | # User-specific stuff
7 | .idea/**/tasks.xml
8 | .idea/**/usage.statistics.xml
9 | .idea/**/dictionaries
10 | .idea/**/shelf
11 | 
12 | # Generated files
13 | .idea/**/contentModel.xml
14 | 
15 | # Sensitive or high-churn files
16 | .idea/**/dataSources/
17 | .idea/**/dataSources.ids
18 | .idea/**/dataSources.local.xml
19 | .idea/**/sqlDataSources.xml
20 | .idea/**/dynamic.xml
21 | .idea/**/uiDesigner.xml
22 | .idea/**/dbnavigator.xml
23 | 
24 | # Gradle
25 | .idea/**/gradle.xml
26 | .idea/**/libraries
27 | 
28 | # Gradle and Maven with auto-import
29 | # When using Gradle or Maven with auto-import, you should exclude module files,
30 | # since they will be recreated, and may cause churn. Uncomment if using
31 | # auto-import.
32 | # .idea/modules.xml
33 | # .idea/*.iml
34 | # .idea/modules
35 | # *.iml
36 | # *.ipr
37 | 
38 | # CMake
39 | cmake-build-*/
40 | 
41 | # Mongo Explorer plugin
42 | .idea/**/mongoSettings.xml
43 | 
44 | # File-based project format
45 | *.iws
46 | 
47 | # IntelliJ
48 | out/
49 | 
50 | # mpeltonen/sbt-idea plugin
51 | .idea_modules/
52 | 
53 | # JIRA plugin
54 | atlassian-ide-plugin.xml
55 | 
56 | # Cursive Clojure plugin
57 | .idea/replstate.xml
58 | 
59 | # Crashlytics plugin (for Android Studio and IntelliJ)
60 | com_crashlytics_export_strings.xml
61 | crashlytics.properties
62 | crashlytics-build.properties
63 | fabric.properties
64 | 
65 | # Editor-based Rest Client
66 | .idea/httpRequests
67 | 
68 | # Android studio 3.1+ serialized cache file
69 | .idea/caches/build_file_checksums.ser
70 | 
71 | ### VisualStudioCode template
72 | .vscode/*
73 | !.vscode/settings.json
74 | !.vscode/tasks.json
75 | !.vscode/launch.json
76 | !.vscode/extensions.json
77 | 
78 | ### Python template
79 | # Byte-compiled / optimized / DLL files
80 | __pycache__/
81 | 
82 | *$py.class
83 | 
84 | # C extensions
85 | *.so
86 | 
87 | # Distribution / packaging
88 | build/
89 | develop-eggs/
90 | dist/
91 | downloads/
92 | eggs/
93 | .eggs/
94 | lib/
95 | lib64/
96 | parts/
97 | sdist/
98 | var/
99 | wheels/
100 | pip-wheel-metadata/
101 | share/python-wheels/
102 | *.egg-info/
103 | .installed.cfg
104 | *.egg
105 | MANIFEST
106 | 
107 | # PyInstaller
108 | # Usually these files are written by a python script from a template
109 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
110 | *.manifest
111 | *.spec
112 | 
113 | # Installer logs
114 | pip-log.txt
115 | pip-delete-this-directory.txt
116 | 
117 | # Unit test / coverage reports
118 | htmlcov/
119 | .tox/
120 | .nox/
121 | .coverage
122 | .coverage.*
123 | .cache
124 | nosetests.xml
125 | coverage.xml
126 | *.cover
127 | .hypothesis/
128 | .pytest_cache/
129 | 
130 | # Translations
131 | *.mo
132 | *.pot
133 | 
134 | # Django stuff:
135 | *.log
136 | local_settings.py
137 | db.sqlite3
138 | 
139 | # Flask stuff:
140 | instance/
141 | .webassets-cache
142 | 
143 | # Scrapy stuff:
144 | .scrapy
145 | 
146 | # Sphinx documentation
147 | docs/_build/
148 | 
149 | # PyBuilder
150 | target/
151 | 
152 | # Jupyter Notebook
153 | .ipynb_checkpoints
154 | 
155 | # IPython
156 | profile_default/
157 | ipython_config.py
158 | 
159 | # pyenv
160 | .python-version
161 | 
162 | # pipenv
163 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
164 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
165 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
166 | # install all needed dependencies.
167 | #Pipfile.lock
168 | 
169 | # celery beat schedule file
170 | celerybeat-schedule
171 | 
172 | # SageMath parsed files
173 | *.sage.py
174 | 
175 | # Environments
176 | .env
177 | .venv
178 | env/
179 | venv/
180 | ENV/
181 | env.bak/
182 | venv.bak/
183 | 
184 | # Spyder project settings
185 | .spyderproject
186 | .spyproject
187 | 
188 | # Rope project settings
189 | .ropeproject
190 | 
191 | # mkdocs documentation
192 | /site
193 | 
194 | # mypy
195 | .mypy_cache/
196 | .dmypy.json
197 | dmypy.json
198 | 
199 | # Pyre type checker
200 | .pyre/
201 | 
202 | ### Example user template template
203 | ### Example user template
204 | 
205 | # IntelliJ project files
206 | .idea
207 | *.iml
208 | out
209 | gen
210 | .idea/
--------------------------------------------------------------------------------
/IE598_Final_Report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chicago-joe/Python-ML-for-Financial-Applications/b756db319b9618907816d786fdcf99f08a05af15/IE598_Final_Report.pdf
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | IE598 Machine Learning in Finance, Fall 2018
2 | University of Illinois at Urbana-Champaign
3 | 
4 | Final Group Project
5 | 
6 | Authors: Joseph Loss, Ruozhong Yang, Fengkai Xu, Biao Feng, and Yuchen Duan
7 | 
8 | --------------------------------------------------------------------------------
9 | Model Outline:
10 | 1) Exploratory Data Analysis
11 | 2) Preprocessing, feature extraction, and feature selection
12 | 3) Model fitting and evaluation (at least three different machine learning models)
13 | 4) Hyperparameter tuning
14 | 5) Ensembling
15 | --------------------------------------------------------------------------------
16 | 
17 | The full write-up is available in [IE598_Final_Report.pdf](IE598_Final_Report.pdf).
--------------------------------------------------------------------------------
/data/GP1_CreditScore.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chicago-joe/Python-ML-for-Financial-Applications/b756db319b9618907816d786fdcf99f08a05af15/data/GP1_CreditScore.xlsx
--------------------------------------------------------------------------------
/data/GP2_EconCycle.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chicago-joe/Python-ML-for-Financial-Applications/b756db319b9618907816d786fdcf99f08a05af15/data/GP2_EconCycle.xlsx
--------------------------------------------------------------------------------
/data/bond_ratings.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chicago-joe/Python-ML-for-Financial-Applications/b756db319b9618907816d786fdcf99f08a05af15/data/bond_ratings.png
--------------------------------------------------------------------------------
/data/correlation matrix.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/chicago-joe/Python-ML-for-Financial-Applications/b756db319b9618907816d786fdcf99f08a05af15/data/correlation matrix.png
--------------------------------------------------------------------------------
/source/GP1.final.py:
--------------------------------------------------------------------------------
1 | # IE598 Machine Learning in Finance, Fall 2018
2 | # University of Illinois at Urbana-Champaign
3 | #
4 | # Final Group Project
5 | #
6 | # Authors: Joseph Loss, Ruozhong Yang, Fengkai Xu, Biao Feng, and Yuchen Duan
7 | #
8 | # source code available at https://github.com/chicago-joe/Machine-Learning-in-Finance-Final-Project
9 | # --------------------------------------------------------------------------------
10 | # Model Outline:
11 | # 1) Exploratory Data Analysis
12 | # 2) Preprocessing, feature extraction, feature selection
13 | # 3) Model fitting and evaluation (at least three different machine learning models)
14 | # 4) Hyperparameter tuning
15 | # 5) Ensembling
16 | # --------------------------------------------------------------------------------
17 | 
18 | import matplotlib.pyplot as plt
19 | import numpy as np
20 | import pandas as pd
21 | import seaborn as sns
22 | from sklearn.base import BaseEstimator, ClassifierMixin, clone
23 | from sklearn.ensemble import RandomForestClassifier
24 | import six
25 | from sklearn.feature_selection import SelectFromModel
26 | from sklearn.linear_model import LogisticRegression
27 | from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
28 | from sklearn.neighbors import KNeighborsClassifier
29 | from sklearn.pipeline import Pipeline, _name_estimators
30 | from sklearn.preprocessing import StandardScaler, LabelEncoder
31 | from sklearn.tree import DecisionTreeClassifier
32 | 
33 | 
34 | # 1) Exploratory Data Analysis
35 | df = pd.read_excel('data/GP1_CreditScore.xlsx')
36 | 
37 | print(df.shape)
38 | df.info()
39 | print(df.head())
40 | print(df.describe())
41 | 
42 | cols = ['Sales/Revenues', 'Gross Margin', 'EBITDA', 'EBITDA Margin', 'Net Income Before Extras',
43 |         'Total Debt', 'Net Debt', 'LT Debt', 'ST Debt', 'Cash', 'Free Cash Flow', 'Total Debt/EBITDA',
44 |         'Net Debt/EBITDA', 'Total MV', 'Total Debt/MV', 'Net Debt/MV', 'CFO/Debt', 'CFO',
45 |         'Interest Coverage', 'Total Liquidity', 'Current Liquidity', 'Current Liabilities',
46 |         'EPS Before Extras', 'PE', 'ROA', 'ROE', 'InvGrd']
47 | 
48 | # correlation matrix
49 | cm = np.corrcoef(df[cols].values.T)
50 | sns.set(font_scale = 0.5)
51 | hm = sns.heatmap(cm,
52 |                  cbar = True,
53 |                  annot = True,
54 |                  square = True,
55 |                  fmt = '.2f',
56 |                  annot_kws = { 'size':3 },
57 |                  yticklabels = cols,
58 |                  xticklabels = cols)
59 | plt.savefig('correlation matrix.png', dpi = 960)
60 | plt.show()
61 | 
62 | X = df[['Sales/Revenues', 'Gross Margin', 'EBITDA', 'EBITDA Margin', 'Net Income Before Extras',
63 |         'Total Debt', 'Net Debt', 'LT Debt', 'ST Debt', 'Cash', 'Free Cash Flow', 'Total Debt/EBITDA',
64 |         'Net Debt/EBITDA', 'Total MV', 'Total Debt/MV', 'Net Debt/MV', 'CFO/Debt', 'CFO',
65 |         'Interest Coverage', 'Total Liquidity', 'Current Liquidity', 'Current Liabilities',
66 |         'EPS Before Extras', 'PE', 'ROA', 'ROE']].values
67 | 
68 | y = df['InvGrd'].values
69 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42, stratify = y)
70 | 
71 | sc = StandardScaler()
72 | sc.fit(X_train)
73 | X_train_std = sc.transform(X_train)
74 | X_test_std = sc.transform(X_test)
75 | 
76 | # 2) Preprocessing, feature extraction, feature selection
77 | # Select features based on random-forest feature importance
78 | forest = RandomForestClassifier(criterion = 'gini', n_estimators = 100, random_state = 42, n_jobs = 2)
79 | forest.fit(X_train_std, y_train)
80 | 
81 | print(forest.feature_importances_)
82 | print(X_train_std.shape)
83 | 
84 | model = SelectFromModel(forest, prefit = True)
85 | X_train_std = model.transform(X_train_std)
86 | X_test_std = model.transform(X_test_std)
87 | 
88 | print(X_test_std.shape)
89 | print(X_train_std.shape)
90 | 
91 | # 3) Model fitting and evaluation: the KNN, random forest, decision tree, and logistic regression models below are fit and evaluated together with the hyperparameter tuning in step 4
92 | 
93 | # 4) Hyperparameter tuning
94 | # KNN
95 | knn = KNeighborsClassifier()
96 | params_knn = {
97 |     'n_neighbors':range(1, 101)
98 | }
99 | grid_knn = GridSearchCV(estimator = knn,
100 |                         param_grid = params_knn,
101 |                         scoring = 'accuracy',
102 |                         cv = 10,
103 |                         n_jobs = -1)
104 | 
105 | grid_knn.fit(X_train_std, y_train)
106 | best_model_knn = grid_knn.best_estimator_
107 | 
108 | print(best_model_knn.score(X_test_std, y_test))
109 | 
110 | # Random Forest
111 | forest = RandomForestClassifier()
112 | params_forest = {
113 |     'criterion':['gini'],
114 |     'n_estimators':range(1, 101),
115 |     'random_state':[42]
116 | }
117 | grid_forest = GridSearchCV(estimator = forest,
118 |                            param_grid = params_forest,
119 |                            scoring = 'accuracy',
120 |                            cv = 10,
121 |                            n_jobs = -1)
122 | 
123 | grid_forest.fit(X_train_std, y_train)
124 | best_model_forest = grid_forest.best_estimator_
125 | 
126 | print(best_model_forest.score(X_test_std, y_test))
127 | 
128 | # Decision Tree
129 | tree = DecisionTreeClassifier()
130 | params_tree = {
131 |     'criterion':['gini'],
132 |     'max_depth':range(1, 101),
133 |     'random_state':[42]
134 | }
135 | grid_tree = GridSearchCV(estimator = tree,
136 |                          param_grid = params_tree,
137 |                          scoring = 'accuracy',
138 |                          cv = 10,
139 |                          n_jobs = -1)
140 | 
141 | grid_tree.fit(X_train_std, y_train)
142 | best_model_tree = grid_tree.best_estimator_
143 | 
144 | print(best_model_tree.score(X_test_std, y_test))
145 | 
146 | # Logistic Regression
147 | lr = LogisticRegression(max_iter = 1000, solver = 'lbfgs', multi_class = 'auto')
148 | params_lr = {
149 |     'C':range(1, 101),
150 |     'random_state':[42]
151 | }
152 | grid_lr = GridSearchCV(estimator = lr,
153 |                        param_grid = params_lr,
154 |                        scoring = 'accuracy',
155 |                        cv = 10,
156 |                        n_jobs = -1)
157 | 
158 | grid_lr.fit(X_train_std, y_train)
159 | best_model_lr =
grid_lr.best_estimator_ 160 | 161 | print(best_model_lr.score(X_test_std, y_test)) 162 | 163 | 164 | # 5) Ensembling 165 | # Majority Vote Classifier 166 | class MajorityVoteClassifier(BaseEstimator, 167 | ClassifierMixin): 168 | """ A majority vote ensemble classifier 169 | Parameters 170 | ---------- 171 | classifiers : array-like, shape = [n_classifiers] 172 | Different classifiers for the ensemble 173 | vote : str, {'classlabel', 'probability'} (default='label') 174 | If 'classlabel' the prediction is based on the argmax of 175 | class labels. Else if 'probability', the argmax of 176 | the sum of probabilities is used to predict the class label 177 | (recommended for calibrated classifiers). 178 | weights : array-like, shape = [n_classifiers], optional (default=None) 179 | If a list of `int` or `float` values are provided, the classifiers 180 | are weighted by importance; Uses uniform weights if `weights=None`. 181 | """ 182 | 183 | def __init__(self, classifiers, vote = 'classlabel', weights = None): 184 | 185 | self.classifiers = classifiers 186 | self.named_classifiers = { key:value for key, value 187 | in _name_estimators(classifiers) } 188 | self.vote = vote 189 | self.weights = weights 190 | 191 | def fit(self, X, y): 192 | """ Fit classifiers. 193 | Parameters 194 | ---------- 195 | X : {array-like, sparse matrix}, shape = [n_samples, n_features] 196 | Matrix of training samples. 197 | y : array-like, shape = [n_samples] 198 | Vector of target class labels. 199 | Returns 200 | ------- 201 | self : object 202 | """ 203 | if self.vote not in ('probability', 'classlabel'): 204 | raise ValueError("vote must be 'probability' or 'classlabel'" 205 | "; got (vote=%r)" 206 | % self.vote) 207 | 208 | if self.weights and len(self.weights) != len(self.classifiers): 209 | raise ValueError('Number of classifiers and weights must be equal' 210 | '; got %d weights, %d classifiers' 211 | % (len(self.weights), len(self.classifiers))) 212 | 213 | # Use LabelEncoder to ensure class labels start with 0, which 214 | # is important for np.argmax call in self.predict 215 | self.lablenc_ = LabelEncoder() 216 | self.lablenc_.fit(y) 217 | self.classes_ = self.lablenc_.classes_ 218 | self.classifiers_ = [] 219 | for clf in self.classifiers: 220 | fitted_clf = clone(clf).fit(X, self.lablenc_.transform(y)) 221 | self.classifiers_.append(fitted_clf) 222 | return self 223 | 224 | def predict(self, X): 225 | """ Predict class labels for X. 226 | Parameters 227 | ---------- 228 | X : {array-like, sparse matrix}, shape = [n_samples, n_features] 229 | Matrix of training samples. 230 | Returns 231 | ---------- 232 | maj_vote : array-like, shape = [n_samples] 233 | Predicted class labels. 234 | 235 | """ 236 | if self.vote == 'probability': 237 | maj_vote = np.argmax(self.predict_proba(X), axis = 1) 238 | else: # 'classlabel' vote 239 | 240 | # Collect results from clf.predict calls 241 | predictions = np.asarray([clf.predict(X) 242 | for clf in self.classifiers_]).T 243 | 244 | maj_vote = np.apply_along_axis( 245 | lambda x: 246 | np.argmax(np.bincount(x, 247 | weights = self.weights)), 248 | axis = 1, 249 | arr = predictions) 250 | maj_vote = self.lablenc_.inverse_transform(maj_vote) 251 | return maj_vote 252 | 253 | def predict_proba(self, X): 254 | """ Predict class probabilities for X. 255 | Parameters 256 | ---------- 257 | X : {array-like, sparse matrix}, shape = [n_samples, n_features] 258 | Training vectors, where n_samples is the number of samples and 259 | n_features is the number of features. 
260 |         Returns
261 |         ----------
262 |         avg_proba : array-like, shape = [n_samples, n_classes]
263 |             Weighted average probability for each class per sample.
264 |         """
265 |         probas = np.asarray([clf.predict_proba(X)
266 |                              for clf in self.classifiers_])
267 |         avg_proba = np.average(probas, axis = 0, weights = self.weights)
268 |         return avg_proba
269 | 
270 |     def get_params(self, deep = True):
271 |         """ Get classifier parameter names for GridSearch"""
272 |         if not deep:
273 |             return super(MajorityVoteClassifier, self).get_params(deep = False)
274 |         else:
275 |             out = self.named_classifiers.copy()
276 |             for name, step in six.iteritems(self.named_classifiers):
277 |                 for key, value in six.iteritems(step.get_params(deep = True)):
278 |                     out['%s__%s' % (name, key)] = value
279 |             return out
280 | 
281 | 
282 | # Ensembling
283 | clf1 = grid_knn.best_estimator_
284 | clf2 = grid_forest.best_estimator_
285 | clf3 = grid_tree.best_estimator_
286 | 
287 | pipe1 = Pipeline([['sc', StandardScaler()], ['clf', clf1]])
288 | pipe3 = Pipeline([['sc', StandardScaler()], ['clf', clf3]])
289 | clf_labels = ['KNN', 'RandomForest', 'Decision tree']
290 | 
291 | print('10-fold cross validation:\n')
292 | 
293 | mv_clf = MajorityVoteClassifier(classifiers = [pipe1, clf2, pipe3])
294 | clf_labels += ['Majority voting']
295 | all_clf = [pipe1, clf2, pipe3, mv_clf]
296 | 
297 | for clf, label in zip(all_clf, clf_labels):
298 |     scores = cross_val_score(estimator = clf, X = X_train_std, y = y_train, cv = 10, scoring = 'roc_auc')
299 | 
300 |     print("ROC AUC: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
301 | 
302 | # Multi-class classification: repeat the models using the ordinal 'Class' rating as the target
303 | X = df[['Sales/Revenues', 'Gross Margin', 'EBITDA', 'EBITDA Margin', 'Net Income Before Extras',
304 |         'Total Debt', 'Net Debt', 'LT Debt', 'ST Debt', 'Cash', 'Free Cash Flow', 'Total Debt/EBITDA',
305 |         'Net Debt/EBITDA', 'Total MV', 'Total Debt/MV', 'Net Debt/MV', 'CFO/Debt', 'CFO', 'Interest Coverage',
306 |         'Total Liquidity', 'Current Liquidity', 'Current Liabilities', 'EPS Before Extras', 'PE', 'ROA', 'ROE']].values
307 | 
308 | y = df['Class'].values
309 | 
310 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42, stratify = y)
311 | 
312 | sc = StandardScaler()
313 | sc.fit(X_train)
314 | 
315 | X_train_std = sc.transform(X_train)
316 | X_test_std = sc.transform(X_test)
317 | 
318 | # KNN
319 | knn = KNeighborsClassifier()
320 | params_knn = {
321 |     'n_neighbors':range(1, 101)
322 | }
323 | grid_knn = GridSearchCV(estimator = knn,
324 |                         param_grid = params_knn,
325 |                         scoring = 'accuracy',
326 |                         cv = 10,
327 |                         n_jobs = -1)
328 | 
329 | grid_knn.fit(X_train_std, y_train)
330 | best_model_knn_multi = grid_knn.best_estimator_
331 | 
332 | print('multi=' + str(best_model_knn_multi.score(X_test_std, y_test)))
333 | 
334 | # RandomForest
335 | forest = RandomForestClassifier()
336 | params_forest = {
337 |     'criterion':['gini'],
338 |     'n_estimators':range(1, 101),
339 |     'random_state':[42]
340 | }
341 | grid_forest = GridSearchCV(estimator = forest,
342 |                            param_grid = params_forest,
343 |                            scoring = 'accuracy',
344 |                            cv = 10,
345 |                            n_jobs = -1)
346 | 
347 | grid_forest.fit(X_train_std, y_train)
348 | best_model_forest_multi = grid_forest.best_estimator_
349 | 
350 | print('multi=' + str(best_model_forest_multi.score(X_test_std, y_test)))
351 | 
352 | # DecisionTree
353 | tree = DecisionTreeClassifier()
354 | params_tree = {
355 |     'criterion':['gini'],
356 |     'max_depth':range(1, 101),
357 |     'random_state':[42]
358 | }
359 | grid_tree = GridSearchCV(estimator = tree,
360 |                          param_grid = params_tree,
361 |                          scoring = 'accuracy',
362 |                          cv = 10,
363 |                          n_jobs = -1)
364 | 
365 | grid_tree.fit(X_train_std, y_train)
366 | best_model_tree_multi = grid_tree.best_estimator_
367 | 
368 | print('multi=' + str(best_model_tree_multi.score(X_test_std, y_test)))
369 | 
370 | # Logistic Regression
371 | lr = LogisticRegression(max_iter = 1000, solver = 'lbfgs', multi_class = 'auto')
372 | params_lr = {
373 |     'C':range(1, 101),
374 |     'random_state':[42]
375 | }
376 | grid_lr = GridSearchCV(estimator = lr,
377 |                        param_grid = params_lr,
378 |                        scoring = 'accuracy',
379 |                        cv = 10,
380 |                        n_jobs = -1)
381 | 
382 | grid_lr.fit(X_train_std, y_train)
383 | best_model_lr_multi = grid_lr.best_estimator_
384 | 
385 | print('multi=' + str(best_model_lr_multi.score(X_test_std, y_test)))
386 | 
387 | # Feature selection was also tried for this part, but it did not appear to improve the results.
388 | # To use it, run the selection step after fitting the model and before applying the StandardScaler.
389 | 
--------------------------------------------------------------------------------
/source/GP2.final.py:
--------------------------------------------------------------------------------
1 | # IE598 Machine Learning in Finance, Fall 2018
2 | # University of Illinois at Urbana-Champaign
3 | #
4 | # Final Group Project
5 | #
6 | # Authors: Joseph Loss, Ruozhong Yang, Fengkai Xu, Biao Feng, and Yuchen Duan
7 | #
8 | # source code available at https://github.com/chicago-joe/Machine-Learning-in-Finance-Final-Project
9 | # --------------------------------------------------------------------------------
10 | import matplotlib.pyplot as plt
11 | import numpy as np
12 | import pandas as pd
13 | import seaborn as sns
14 | from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
15 | from sklearn.feature_selection import SelectFromModel
16 | from sklearn.linear_model import LinearRegression, Lasso, Ridge
17 | from sklearn.metrics import mean_squared_error as MSE, r2_score, mean_squared_error
18 | from sklearn.model_selection import train_test_split
19 | from sklearn.preprocessing import StandardScaler
20 | 
21 | df = pd.read_excel('data/GP2_EconCycle.xlsx')
22 | 
23 | cols = ['T1Y Index', 'T2Y Index', 'T3Y Index', 'T5Y Index', 'T7Y Index', 'T10Y Index', 'CP1M', 'CP3M', 'CP6M',
24 |         'CP1M_T1Y', 'CP3M_T1Y', 'CP6M_T1Y', 'USPHCI', 'PCT 3MO FWD', 'PCT 6MO FWD', 'PCT 9MO FWD']
25 | 
26 | ## Exploratory Data Analysis
27 | df.dropna(inplace = True)
28 | print(df.shape, df.info(), df.describe(), df.head())
29 | 
30 | CPTcols = ['CP1M_T1Y', 'CP3M_T1Y', 'CP6M_T1Y', 'USPHCI']
31 | sns.pairplot(df[CPTcols], dropna = True, )
32 | 
33 | # plt.tight_layout()
34 | # plt.savefig('E:\Study\Courses\Fall 2018\IE 598\IE598 Homework\Group Project\scatter_GP2_.png',dpi = 500)
35 | plt.show()
36 | 
37 | cm = np.corrcoef(df[cols].values.T)
38 | hm = sns.heatmap(cm,
39 |                  cbar = False,
40 |                  annot = True,
41 |                  square = False,
42 |                  fmt = '.1f',
43 |                  annot_kws = { 'size':8 },
44 |                  yticklabels = cols,
45 |                  xticklabels = cols)
46 | 
47 | # plt.tight_layout()
48 | # plt.savefig('E:\Study\Courses\Fall 2018\IE 598\IE598 Homework\Group Project\heatmap_rate_GP2_.png',dpi = 15000)
49 | plt.show()
50 | 
51 | ## 3-month prediction and model
52 | X = np.array(df.drop(['USPHCI', 'PCT 3MO FWD', 'PCT 6MO FWD', 'PCT 9MO FWD'], 1))
53 | y = np.array(df['PCT 3MO FWD'])
54 | sc_x = StandardScaler()
55 | sc_y = StandardScaler()
56 | X_std = sc_x.fit_transform(X)
57 | y_std = sc_y.fit_transform(y[:,
np.newaxis]).flatten() 58 | 59 | X_train, X_test, y_train, y_test = train_test_split(X_std, y_std, test_size = 0.1, random_state = 42) 60 | 61 | # feature importance 62 | feat_labels = cols[:-4] 63 | forest = RandomForestRegressor(n_estimators = 500, random_state = 1) 64 | forest.fit(X_train, y_train) 65 | importances = forest.feature_importances_ 66 | 67 | indices = np.argsort(importances)[::-1] 68 | print("3MO FWD RATE - Feature Importance") 69 | for f in range(X_train.shape[1]): 70 | print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]])) 71 | 72 | print('\n') 73 | plt.title('Feature Importance PCT 3MO FWD') 74 | plt.bar(range(X_train.shape[1]), importances[indices], align = 'center') 75 | plt.xticks(range(X_train.shape[1]), feat_labels, rotation = 90) 76 | plt.xlim([-1, X_train.shape[1]]) 77 | plt.show() 78 | 79 | # Selection 80 | model = SelectFromModel(forest, prefit = True) 81 | X_train = model.transform(X_train) 82 | X_test = model.transform(X_test) 83 | print(X_test.shape) 84 | print(X_train.shape) 85 | 86 | # LinearRegression 87 | reg = LinearRegression() 88 | reg.fit(X_train, y_train) 89 | y_train_pred = reg.predict(X_train) 90 | y_test_pred = reg.predict(X_test) 91 | plt.scatter(y_train_pred, y_train_pred - y_train, 92 | c = 'steelblue', marker = 'o', edgecolor = 'white', 93 | label = 'Training data') 94 | plt.scatter(y_test_pred, y_test_pred - y_test, 95 | c = 'limegreen', marker = 's', edgecolor = 'white', 96 | label = 'Test data') 97 | plt.xlabel('Predicted values') 98 | plt.ylabel('Residuals') 99 | plt.legend(loc = 'upper left') 100 | plt.hlines(y = 0, xmin = 0, xmax = 1, color = 'black', lw = 2) 101 | plt.xlim([0, 1]) 102 | plt.savefig('LinearRegression.png', dpi = 300) 103 | plt.show() 104 | print('(LR)MSE train: %.3f, test: %.3f' % ( 105 | mean_squared_error(y_train, y_train_pred), 106 | mean_squared_error(y_test, y_test_pred))) 107 | print('(LR)R^2 train: %.3f, test: %.3f' % ( 108 | r2_score(y_train, y_train_pred), 109 | r2_score(y_test, y_test_pred))) 110 | print('(LR)Slope: %.3f' % reg.coef_[0]) 111 | print('(LR)Intercept: %.3f' % reg.intercept_) 112 | 113 | ## 6 month prediction and model 114 | X = np.array(df.drop(['USPHCI', 'PCT 3MO FWD', 'PCT 6MO FWD', 'PCT 9MO FWD'], 1)) 115 | y = np.array(df['PCT 6MO FWD']) 116 | sc_x = StandardScaler() 117 | sc_y = StandardScaler() 118 | X_std = sc_x.fit_transform(X) 119 | y_std = sc_y.fit_transform(y[:, np.newaxis]).flatten() 120 | 121 | X_train, X_test, y_train, y_test = train_test_split(X_std, y_std, test_size = 0.1, random_state = 42) 122 | feat_labels = cols[:-4] 123 | 124 | forest = RandomForestRegressor(n_estimators = 500, random_state = 1) 125 | forest.fit(X_train, y_train) 126 | importances = forest.feature_importances_ 127 | 128 | print("6MO FWD RATE - Feature Importance") 129 | indices = np.argsort(importances)[::-1] 130 | for f in range(X_train.shape[1]): 131 | print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]])) 132 | 133 | print('\n') 134 | plt.title('Feature Importance PCT 6MO FWD') 135 | plt.bar(range(X_train.shape[1]), importances[indices], align = 'center') 136 | plt.xticks(range(X_train.shape[1]), feat_labels, rotation = 90) 137 | plt.xlim([-1, X_train.shape[1]]) 138 | plt.show() 139 | 140 | # Selection 141 | model = SelectFromModel(forest, prefit = True) 142 | X_train = model.transform(X_train) 143 | X_test = model.transform(X_test) 144 | print(X_test.shape) 145 | print(X_train.shape) 146 | 147 | # RidgeRegression 148 | alpha_space = 
np.logspace(-3, 0, 4) 149 | ridge = Ridge(normalize = True) 150 | # Compute scores over range of alphas 151 | for alpha in alpha_space: 152 | 153 | # Specify the alpha value to use: ridge.alpha 154 | ridge.alpha = alpha 155 | ridge.fit(X_train, y_train) 156 | y_train_pred = ridge.predict(X_train) 157 | y_test_pred = ridge.predict(X_test) 158 | plt.scatter(y_train_pred, y_train_pred - y_train, 159 | c = 'steelblue', marker = 'o', edgecolor = 'white', 160 | label = 'Training data') 161 | plt.scatter(y_test_pred, y_test_pred - y_test, 162 | c = 'limegreen', marker = 's', edgecolor = 'white', 163 | label = 'Test data') 164 | plt.xlabel('Predicted values') 165 | plt.ylabel('Residuals') 166 | plt.legend(loc = 'upper left') 167 | plt.hlines(y = 0, xmin = 0, xmax = 1, color = 'black', lw = 2) 168 | plt.xlim([0, 1]) 169 | plt.savefig('Ridge(alpha=' + str(alpha) + ' ).png', dpi = 300) 170 | plt.show() 171 | print('Ridgealpha: %.3f' % (alpha)) 172 | print('MSE train: %.3f, test: %.3f' % ( 173 | mean_squared_error(y_train, y_train_pred), 174 | mean_squared_error(y_test, y_test_pred))) 175 | print('R^2 train: %.3f, test: %.3f' % ( 176 | r2_score(y_train, y_train_pred), 177 | r2_score(y_test, y_test_pred))) 178 | print('Slope: %.3f' % ridge.coef_[0]) 179 | print('Intercept: %.3f' % ridge.intercept_) 180 | 181 | ## 9-month prediction & model 182 | X = np.array(df.drop(['USPHCI', 'PCT 3MO FWD', 'PCT 6MO FWD', 'PCT 9MO FWD'], 1)) 183 | y = np.array(df['PCT 9MO FWD']) 184 | sc_x = StandardScaler() 185 | sc_y = StandardScaler() 186 | X_std = sc_x.fit_transform(X) 187 | y_std = sc_y.fit_transform(y[:, np.newaxis]).flatten() 188 | 189 | X_train, X_test, y_train, y_test = train_test_split(X_std, y_std, test_size = 0.1, random_state = 42) 190 | feat_labels = cols[:-4] 191 | 192 | forest = RandomForestRegressor(n_estimators = 500, random_state = 1) 193 | forest.fit(X_train, y_train) 194 | importances = forest.feature_importances_ 195 | 196 | print("9MO FWD RATE - Feature Importance") 197 | indices = np.argsort(importances)[::-1] 198 | for f in range(X_train.shape[1]): 199 | print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]])) 200 | print('\n') 201 | plt.title('Feature Importance: PCT 9MO FWD ') 202 | plt.bar(range(X_train.shape[1]), importances[indices], align = 'center') 203 | plt.xticks(range(X_train.shape[1]), feat_labels, rotation = 90) 204 | plt.xlim([-1, X_train.shape[1]]) 205 | plt.show() 206 | 207 | # Selection 208 | model = SelectFromModel(forest, prefit = True) 209 | X_train = model.transform(X_train) 210 | X_test = model.transform(X_test) 211 | print(X_test.shape) 212 | print(X_train.shape) 213 | 214 | # LassoRegression 215 | alpha_space = np.logspace(-6, -3, 4) 216 | lasso = Lasso(normalize = True) 217 | # Compute scores over range of alphas 218 | for alpha in alpha_space: 219 | 220 | # Specify the alpha value to use: ridge.alpha 221 | lasso.alpha = alpha 222 | lasso.fit(X_train, y_train) 223 | y_train_pred = lasso.predict(X_train) 224 | y_test_pred = lasso.predict(X_test) 225 | plt.scatter(y_train_pred, y_train_pred - y_train, 226 | c = 'steelblue', marker = 'o', edgecolor = 'white', 227 | label = 'Training data') 228 | plt.scatter(y_test_pred, y_test_pred - y_test, 229 | c = 'limegreen', marker = 's', edgecolor = 'white', 230 | label = 'Test data') 231 | plt.xlabel('Predicted values') 232 | plt.ylabel('Residuals') 233 | plt.legend(loc = 'upper left') 234 | plt.hlines(y = 0, xmin = 0, xmax = 1, color = 'black', lw = 2) 235 | plt.savefig('Lasso(alpha=' + str(alpha) + ' 
).png', dpi = 300)
236 |     plt.xlim([0, 1])
237 |     plt.show()
238 |     print('Lassoalpha: %.6f' % (lasso.alpha))
239 |     print('MSE train: %.3f, test: %.3f' % (
240 |         mean_squared_error(y_train, y_train_pred),
241 |         mean_squared_error(y_test, y_test_pred)))
242 |     print('R^2 train: %.3f, test: %.3f' % (
243 |         r2_score(y_train, y_train_pred),
244 |         r2_score(y_test, y_test_pred)))
245 |     print('Slope: %.3f' % lasso.coef_[0])
246 |     print('Intercept: %.3f' % lasso.intercept_)
247 | 
248 | ## Part 5 - Ensemble Learning
249 | # Set seed for reproducibility
250 | SEED = 1
251 | 
252 | # Split dataset into 90% train and 10% test
253 | X_train, X_test, y_train, y_test = train_test_split(X_std, y_std, test_size = 0.1, random_state = 42)
254 | 
255 | # Instantiate a GradientBoostingRegressor 'gbr'
256 | gbr = GradientBoostingRegressor(max_features = 4, learning_rate = 0.1, n_estimators = 500,
257 |                                 subsample = 0.3, random_state = 42)
258 | gbr.fit(X_train, y_train)
259 | # Predict the test set labels
260 | y_pred = gbr.predict(X_test)
261 | 
262 | # Evaluate the test set MSE and R-squared
263 | mse = MSE(y_test, y_pred)
264 | rsquared = r2_score(y_test, y_pred)
265 | 
266 | # Print the test set MSE and R-squared
267 | print('\n')
268 | print('Test set MSE: {:.2f}'.format(mse))
269 | print('Test set R-Squared: {:.2f}'.format(rsquared))
270 | 
271 | 
--------------------------------------------------------------------------------
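The three forward-horizon blocks in GP2.final.py repeat the same standardize / split / random-forest importance / select / fit / score sequence. The sketch below is not part of the original repository; it is a minimal illustration of how that repeated workflow could be factored into one helper, assuming GP2_EconCycle.xlsx sits under data/ with the column names used above. The function name evaluate_horizon and its parameters are illustrative choices, and the linear model stands in for whichever regressor is tried at a given horizon.

# sketch_gp2_horizons.py -- illustrative refactoring sketch, not part of the original project
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def evaluate_horizon(df, target, drop_cols, test_size = 0.1, seed = 42):
    """Standardize, select features with a random forest, fit a linear model,
    and report test MSE / R^2 for one forward-return column."""
    X = df.drop(columns = drop_cols).values
    y = df[target].values

    # standardize features and target, mirroring GP2.final.py
    X_std = StandardScaler().fit_transform(X)
    y_std = StandardScaler().fit_transform(y[:, np.newaxis]).flatten()

    X_train, X_test, y_train, y_test = train_test_split(
        X_std, y_std, test_size = test_size, random_state = seed)

    # rank features with a random forest, then keep only the important ones
    forest = RandomForestRegressor(n_estimators = 500, random_state = 1)
    forest.fit(X_train, y_train)
    selector = SelectFromModel(forest, prefit = True)
    X_train_sel = selector.transform(X_train)
    X_test_sel = selector.transform(X_test)

    # simple baseline regressor on the selected features
    reg = LinearRegression().fit(X_train_sel, y_train)
    y_pred = reg.predict(X_test_sel)
    return mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)


if __name__ == '__main__':
    df = pd.read_excel('data/GP2_EconCycle.xlsx').dropna()
    drop_cols = ['USPHCI', 'PCT 3MO FWD', 'PCT 6MO FWD', 'PCT 9MO FWD']
    for target in ['PCT 3MO FWD', 'PCT 6MO FWD', 'PCT 9MO FWD']:
        mse, r2 = evaluate_horizon(df, target, drop_cols)
        print('%s  MSE: %.3f  R^2: %.3f' % (target, mse, r2))

Keeping the per-horizon logic in one function also makes it straightforward to swap the final regressor (Ridge, Lasso, or the GradientBoostingRegressor used in Part 5) without duplicating the preprocessing steps.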