├── .DS_Store
├── .gitattributes
├── Data
├── 1-focasting_data
│ ├── .DS_Store
│ ├── sector10_clean.xlsx
│ ├── sector15_clean.xlsx
│ ├── sector20_clean.xlsx
│ ├── sector25_clean.xlsx
│ ├── sector30_clean.xlsx
│ ├── sector35_clean.xlsx
│ ├── sector40_clean.xlsx
│ ├── sector45_clean.xlsx
│ ├── sector50_clean.xlsx
│ ├── sector55_clean.xlsx
│ └── sector60_clean.xlsx
├── 1-sp500_adj_price.csv.zip
├── 1-spx_price.xlsx
├── 2-portfolio_data
│ ├── .DS_Store
│ ├── equally_weighted_user8.xlsx
│ ├── mean_weighted_user8.xlsx
│ ├── minimum_weighted_user8.xlsx
│ └── stocks_selected_total_user8.csv
├── all_return_table.pickle
├── all_stocks_info.pickle
├── fundamental_final_table.xlsx
└── stocks_weight_table.xlsx
├── README.md
├── code
├── .DS_Store
├── ml_model.py
└── old_Rcode
│ ├── .DS_Store
│ ├── fundamental_ML_model.R
│ ├── fundamental_run_model.R
│ └── fundamental_select_stock.R
├── figs
├── chart10_insample.PNG
├── chart11_overallPerformance.PNG
├── chart1_datasetPeriod.PNG
├── chart2_rolling_windows.PNG
├── chart3_modelError.PNG
├── chart4_predictedReturn1.PNG
├── chart4_predictedReturn2.PNG
├── chart5_coefficient.PNG
├── chart6_selectedStocks.PNG
├── chart7_efficient1.PNG
├── chart8_PnL.png
├── chart9_TotalValue.png
├── dataperiod.png
├── efficient1.jpg
├── pnl1.jpg
├── rolling_windows.vsdx
└── transaction cost.PNG
├── fundamental_back_testing.ipynb
├── fundamental_portfolio.ipynb
└── fundamental_run_model.py
/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/.DS_Store
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/Data/1-focasting_data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/.DS_Store
--------------------------------------------------------------------------------
/Data/1-focasting_data/sector10_clean.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector10_clean.xlsx
--------------------------------------------------------------------------------
/Data/1-focasting_data/sector15_clean.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector15_clean.xlsx
--------------------------------------------------------------------------------
/Data/1-focasting_data/sector20_clean.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector20_clean.xlsx
--------------------------------------------------------------------------------
/Data/1-focasting_data/sector25_clean.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector25_clean.xlsx
--------------------------------------------------------------------------------
/Data/1-focasting_data/sector30_clean.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector30_clean.xlsx
--------------------------------------------------------------------------------
/Data/1-focasting_data/sector35_clean.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector35_clean.xlsx
--------------------------------------------------------------------------------
/Data/1-focasting_data/sector40_clean.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector40_clean.xlsx
--------------------------------------------------------------------------------
/Data/1-focasting_data/sector45_clean.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector45_clean.xlsx
--------------------------------------------------------------------------------
/Data/1-focasting_data/sector50_clean.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector50_clean.xlsx
--------------------------------------------------------------------------------
/Data/1-focasting_data/sector55_clean.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector55_clean.xlsx
--------------------------------------------------------------------------------
/Data/1-focasting_data/sector60_clean.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector60_clean.xlsx
--------------------------------------------------------------------------------
/Data/1-sp500_adj_price.csv.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-sp500_adj_price.csv.zip
--------------------------------------------------------------------------------
/Data/1-spx_price.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-spx_price.xlsx
--------------------------------------------------------------------------------
/Data/2-portfolio_data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/2-portfolio_data/.DS_Store
--------------------------------------------------------------------------------
/Data/2-portfolio_data/equally_weighted_user8.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/2-portfolio_data/equally_weighted_user8.xlsx
--------------------------------------------------------------------------------
/Data/2-portfolio_data/mean_weighted_user8.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/2-portfolio_data/mean_weighted_user8.xlsx
--------------------------------------------------------------------------------
/Data/2-portfolio_data/minimum_weighted_user8.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/2-portfolio_data/minimum_weighted_user8.xlsx
--------------------------------------------------------------------------------
/Data/all_return_table.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/all_return_table.pickle
--------------------------------------------------------------------------------
/Data/all_stocks_info.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/all_stocks_info.pickle
--------------------------------------------------------------------------------
/Data/fundamental_final_table.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/fundamental_final_table.xlsx
--------------------------------------------------------------------------------
/Data/stocks_weight_table.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/stocks_weight_table.xlsx
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Dynamic-Stock-Recommendation-Machine_Learning
2 |
3 | ## First Author: Published paper on IEEE TrustCom 2018 (http://www.cloud-conf.net/trustcom18/)
4 | Hongyang Yang, Xiao-Yang Liu, Qingwei W. [A Practical Machine Learning Approach for Dynamic Stock Recommendation](https://ssrn.com/abstract=3302088). IEEE TrustCom 2018.
5 |
6 | ### IEEE Official Link of the paper (https://ieeexplore.ieee.org/abstract/document/8456121)
7 | ### SSRN Version: (https://ssrn.com/abstract=3302088)
8 |
9 | ## Abstract:
10 | Stock recommendation is vital to investment companies and investors. However, no single stock selection strategy will always win while analysts may not have enough time to check all S&P 500 stocks (the Standard & Poor’s 500). In this paper, we propose a practical scheme that recommends stocks from S&P 500 using machine learning. Our basic idea is to buy and hold the top 20% stocks dynamically. First, we select representative stock indicators with good explanatory power. Secondly, we take five frequently used machine learning methods, including linear regression, ridge regression, stepwise regression, random forest and generalized boosted regression, to model stock indicators and quarterly log-return in a rolling window. Thirdly, we choose the model with the lowest Mean Square Error in each period to rank stocks. Finally, we test the selected stocks by conducting portfolio allocation methods such as equally weighted, mean-variance, and minimum-variance. Our empirical results show that the proposed scheme outperforms the long-only strategy on the S&P 500 index in terms of Sharpe ratio and cumulative returns.
11 |
12 | ## Index Term:
13 | Stock recommendation, fundamental value investing, machine learning, model selection, risk management
14 |
15 | ## Project summary:
16 | + We developed a practical approach that uses machine-learning methods to select S&P 500 stocks based on financial ratios (e.g., EPS, ROA, ROE). It outperformed the S&P 500 index on out-of-sample data, achieving a Sharpe ratio of 0.5 (versus 0.19 for SPX).
17 | + We performed feature selection by 11 GICS sectors based on a rolling window to choose the lowest MSE model among Linear Regression, Stepwise Regression, Regression with Ridge, Random Forest, and GBM. Applied a model ensemble method.
18 |
19 |
20 |
21 |
22 |
23 | ## Data:
24 | Retrieved from __WRDS (Wharton Research Data Services)__, Compustat Industrial [27 years daily and quarterly Data]
25 |
26 |
27 |
28 |
29 | + __S&P 500 Fundamental Quarterly Data__ ([fundamental_final_table.xlsx](Data/fundamental_final_table.xlsx))
30 | + Database: Compustat North America (Fundamentals Quarterly) and (Index Constituents)
31 | + Timeline: 27 years (1990-2017)
32 | + Tickers: 1193 stocks (all historical S&P 500 component stocks)
33 | + Value: 20 financial ratios calculated from raw accounting report data
34 |
35 | + __S&P 500 Historical Component Stocks Adjusted Daily Price__ ([1-sp500_adj_price.csv.zip](Data/1-sp500_adj_price.csv.zip))
36 | + Database: Compustat North America (Security Daily)
37 | + Timeline: 27 years (1990-2017)
38 | + Tickers: 1193 stocks (all historical S&P 500 component stocks)
39 | + Value: Adjusted Daily Close Price
40 |
41 | + __S&P 500 Index Daily Price__ ([1-spx_price.xlsx](Data/1-spx_price.xlsx))
42 | + Database: Yahoo Finance
43 | + Timeline: 27 years (1990-2017)
44 | + Tickers: SPX
45 | + Value: Adjusted Daily Close Price
46 |
47 | ## Code:
48 |
49 | ### __Forecasting Model__:
50 | + __Input__: 11 Excel files of cleaned data about fundamental financial ratios (sector 10-Energy, sector 15-Materials, sector 20-Industrials, sector 25-Consumer Discretionary, sector 30-Consumer Staples, sector 35-Health Care, sector 40-Financials, sector 45-Information Technology, sector 50-Telecommunication Services, sector 55-Utilities, sector 60-Real Estate)
51 | + __Python Script__: 2 Scripts
52 | + [ml_model.py](code/ml_model.py): The forecasting function (cornerstone of this project)
53 | + [fundamental_run_model.py](fundamental_run_model.py): The main function to run the forecasting model
54 | ```shell
55 |
56 | python3 fundamental_run_model.py \
57 | -sector_name sector10 \
58 | -fundamental Data/fundamental_final_table.xlsx \
59 | -sector Data/1-focasting_data/sector10_clean.xlsx
60 | ```
61 |
62 |
63 | + __Old R Script__: 3 R Scripts
64 | + [fundamental_run_model.R](code/old_Rcode/fundamental_run_model.R): The main function to run the forecasting model
65 | + [fundamental_ML_model.R](code/old_Rcode/fundamental_ML_model.R): The forecasting function (cornerstone of this project)
66 | + [fundamental_select_stock.R](code/old_Rcode/fundamental_select_stock.R): The function to select top 20% stocks in each sector
67 | + __Output__: [a CSV file](Data/2-portfolio_data/stocks_selected_total_user8.csv) includes __tic__: the stock name, __predicted_return__: predicted return of next quarter by our model, __trade_date__: the date to execute the trades
68 |
69 |
70 |
71 |
72 |
73 | ### __Portfolio Allocation__:
74 |
75 | + __Input__: 2 files
76 | + The [CSV file](Data/2-portfolio_data/stocks_selected_total_user8.csv) generated by forecasting model
77 | + The [adjusted close price data of S&P 500 stocks](Data/1-sp500_adj_price.csv.zip) to calculate covariance matrix
78 |
79 | + __Script__: [fundamental_portfolio.ipynb](fundamental_portfolio.ipynb)
80 |
81 | + __Output__: 3 Excel files each with the following 4 columns
82 | 1. __tic__: the stock name
83 | 2. __predicted_return__: predicted return of next quarter by our model
84 | 3. __weights__: the weights to trade
85 | 4. __trade_date__: the date to execute the trades
86 |
87 |
88 |
89 | ### __Back-testing Model__:
90 |
91 | + __Input__: 5 files
92 | + [equally_weighted](Data/2-portfolio_data/equally_weighted_user8.xlsx): equally-weighted portfolio (Portfolio Benchmark)
93 | + [mean_weighted](Data/2-portfolio_data/mean_weighted_user8.xlsx): mean-variance portfolio
94 | + [minimum_weighted](Data/2-portfolio_data/minimum_weighted_user8.xlsx): minimum-variance portfolio (our model)
95 | + [adjusted daily close price of S&P 500 stocks](Data/1-sp500_adj_price.csv.zip): to calculate quarterly return
96 | + [SPX adjusted daily close price](Data/1-spx_price.xlsx): The Market Index (Overall Benchmark)
97 |
98 | + __Script__: 1 Python jupyter notebook Script
99 | + [fundamental_back_testing.ipynb](fundamental_back_testing.ipynb): The back-testing function
100 |
101 | + __Output__:
102 | 1. Quarterly return of our portfolio with transaction cost
103 | 2. Performance Evaluation: total return, annualized return and standard deviation, maximum drawdown, Sharpe ratio
104 |
--------------------------------------------------------------------------------
/code/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/code/.DS_Store
--------------------------------------------------------------------------------
/code/ml_model.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | import numpy as np
3 | import traceback
4 |
5 | from sklearn.model_selection import train_test_split
6 | from sklearn.svm import SVC
7 | from sklearn.svm import SVR
8 | from sklearn.metrics import confusion_matrix
9 | from sklearn.model_selection import cross_val_score, cross_val_predict
10 | from sklearn.linear_model import Ridge
11 |
12 | from sklearn.linear_model import LinearRegression
13 | from sklearn.feature_selection import RFE
14 | from sklearn.linear_model import Lasso
15 | from sklearn.ensemble import RandomForestRegressor
16 | from sklearn.ensemble import GradientBoostingRegressor
17 | from sklearn.ensemble import AdaBoostRegressor
18 |
19 | from sklearn.model_selection import TimeSeriesSplit, GridSearchCV,RandomizedSearchCV
20 |
21 | from keras.models import Sequential
22 | from keras.layers import Dense
23 | from keras.layers import LSTM
24 | from keras.layers import Dropout
25 |
26 | import os
27 | import errno
28 |
29 |
def prepare_rolling_train(df, features_column, label_column, date_column, unique_datetime, testing_windows, first_trade_date_index, max_rolling_window_index, current_index):
    """Slice the training rows for the period ending before the test window.

    The window is expanding while ``current_index`` is at most
    ``max_rolling_window_index`` (it starts at ``unique_datetime[0]``) and
    rolling afterwards (it starts ``max_rolling_window_index`` periods before
    ``current_index``). In both cases it ends, exclusively, at
    ``unique_datetime[current_index - testing_windows]``.

    ``first_trade_date_index`` is accepted but not used here.

    Returns:
        ``(X_train, y_train)``: the ``features_column`` and ``label_column``
        selections of the rows that fall inside the window.
    """
    if current_index <= max_rolling_window_index:
        start_position = 0
    else:
        start_position = current_index - max_rolling_window_index

    window_start = unique_datetime[start_position]
    window_end = unique_datetime[current_index - testing_windows]

    dates = df[date_column]
    in_window = (dates >= window_start) & (dates < window_end)
    train = df[in_window]

    return train[features_column], train[label_column]
41 |
def prepare_rolling_test(df, features_column, label_column, date_column, unique_datetime, testing_windows, fist_trade_date_index, current_index):
    """Slice the validation rows for the ``testing_windows`` periods before ``current_index``.

    Rows are kept when their date is on or after
    ``unique_datetime[current_index - testing_windows]`` and strictly before
    ``unique_datetime[current_index]``.

    ``fist_trade_date_index`` is accepted but not used here.

    Returns:
        ``(X_test, y_test)``: the ``features_column`` and ``label_column``
        selections of the matching rows.
    """
    dates = df[date_column]
    lower_bound = unique_datetime[current_index - testing_windows]
    upper_bound = unique_datetime[current_index]

    test = df[(dates >= lower_bound) & (dates < upper_bound)]
    return test[features_column], test[label_column]
48 |
def prepare_trade_data(df, features_column, label_column, date_column, tic_column, unique_datetime, testing_windows, fist_trade_date_index, current_index):
    """Select the rows dated exactly ``unique_datetime[current_index]``.

    ``testing_windows`` and ``fist_trade_date_index`` are accepted but not
    used here.

    Returns:
        ``(X_trade, y_trade, trade_tic)``: the feature and label selections of
        the matching rows, plus the ``tic_column`` values as an array.
    """
    on_trade_date = df[date_column] == unique_datetime[current_index]
    trade = df[on_trade_date]

    return trade[features_column], trade[label_column], trade[tic_column].values
55 |
56 |
def train_linear_regression(X_train, y_train):
    """Fit a default ``LinearRegression`` on the training data and return it."""
    return LinearRegression().fit(X_train, y_train)
63 |
def train_recursive_feature_elimination(X_train, y_train):
    """Fit recursive feature elimination around a linear regression.

    Args:
        X_train: training feature matrix.
        y_train: training target values.

    Returns:
        The fitted ``RFE`` selector, so it can be used like the other
        ``train_*`` results.
    """
    # LinearRegression takes no ``random_state`` argument; passing one raised
    # TypeError before any selector could be built.
    lr_regressor = LinearRegression()
    selector = RFE(lr_regressor)

    # Fit before returning: the other train_* helpers all return fitted models.
    model = selector.fit(X_train, y_train)
    return model
70 |
def train_lasso(X_train, y_train):
    """Grid-search the Lasso ``alpha`` and return the best estimator found.

    Candidates are scored with negative mean squared error using ``cv=3``.
    """
    alpha_grid = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}

    search = GridSearchCV(
        Lasso(random_state=42),
        alpha_grid,
        scoring='neg_mean_squared_error',
        cv=3,
    )
    search.fit(X_train, y_train)

    return search.best_estimator_
88 |
def train_ridge(X_train, y_train):
    """Grid-search the Ridge ``alpha`` and return the best estimator found.

    Candidates are scored with negative mean squared error using ``cv=3``.
    """
    alpha_grid = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}

    search = GridSearchCV(
        Ridge(random_state=42),
        alpha_grid,
        scoring='neg_mean_squared_error',
        cv=3,
    )
    search.fit(X_train, y_train)

    return search.best_estimator_
106 |
def train_random_forest(X_train, y_train):
    """Grid-search a ``RandomForestRegressor`` and return the best estimator found.

    The grid varies ``min_samples_leaf``, ``min_samples_split`` and
    ``n_estimators`` while pinning ``max_features`` to ``'sqrt'``. Candidates
    are scored with negative mean squared error using ``cv=3`` and
    ``n_jobs=-1``.
    """
    search_space = {
        'max_features': ['sqrt'],
        'min_samples_leaf': [0.05, 0.1, 0.2],
        'min_samples_split': np.linspace(0.1, 1, 10, endpoint=True),
        'n_estimators': [75, 100, 200],
    }

    forest = RandomForestRegressor(random_state=42)
    search = GridSearchCV(
        estimator=forest,
        param_grid=search_space,
        cv=3,
        n_jobs=-1,
        scoring='neg_mean_squared_error',
        verbose=0,
    )
    search.fit(X_train, y_train)

    return search.best_estimator_
149 |
150 |
def train_svm(X_train, y_train):
    """Grid-search an RBF-kernel ``SVR`` over ``C`` and ``gamma``.

    Candidates are scored with negative mean squared error using ``cv=3`` and
    ``n_jobs=-1``; the best estimator found is returned.
    """
    search_space = {'C': [0.001, 0.1, 1], 'gamma': [1e-7, 0.1]}

    search = GridSearchCV(
        estimator=SVR(kernel='rbf'),
        param_grid=search_space,
        cv=3,
        n_jobs=-1,
        scoring='neg_mean_squared_error',
        verbose=0,
    )
    search.fit(X_train, y_train)

    return search.best_estimator_
172 |
173 |
def train_gbm(X_train, y_train):
    """Grid-search a ``GradientBoostingRegressor`` and return the best estimator found.

    The grid varies ``learning_rate`` and ``n_estimators``. Candidates are
    scored with negative mean squared error using ``cv=3`` and ``n_jobs=-1``.
    """
    search_space = {
        'learning_rate': [0.1, 0.01, 0.001],
        'n_estimators': [100, 250, 500, 1000],
    }

    booster = GradientBoostingRegressor(random_state=42)
    search = GridSearchCV(
        estimator=booster,
        param_grid=search_space,
        cv=3,
        n_jobs=-1,
        scoring='neg_mean_squared_error',
        verbose=0,
    )
    search.fit(X_train, y_train)

    return search.best_estimator_
195 |
196 |
197 |
198 |
def train_ada(X_train, y_train):
    """Grid-search an ``AdaBoostRegressor`` and return the best estimator found.

    Args:
        X_train: training feature matrix.
        y_train: training target values.

    Returns:
        The ``best_estimator_`` of the grid search over ``n_estimators`` and
        ``learning_rate``.
    """
    # Seeded like the other tree-based trainers in this module so repeated
    # runs are reproducible.
    ada = AdaBoostRegressor(random_state=42)

    param_grid_ada = {'n_estimators': [20, 100],
                      'learning_rate': [0.01, 0.05, 1]}

    # Previously every assignment to scoring_method was commented out, so the
    # name was undefined when GridSearchCV was built. Use the same metric as
    # the other trainers.
    scoring_method = 'neg_mean_squared_error'

    # GridSearchCV takes ``param_grid``; ``param_distributions`` is the
    # RandomizedSearchCV keyword and raised TypeError here.
    ada_regressor = GridSearchCV(estimator=ada, param_grid=param_grid_ada,
                                 cv=3, n_jobs=-1, scoring=scoring_method, verbose=0)

    ada_regressor.fit(X_train, y_train)
    return ada_regressor.best_estimator_
222 |
223 |
def evaluate_model(model, X_test, y_test):
    """Return the mean squared error of ``model`` on the validation data.

    Args:
        model: a fitted estimator exposing ``predict``.
        X_test: validation feature matrix.
        y_test: validation target values.

    Returns:
        The mean squared error between ``y_test`` and ``model.predict(X_test)``.
    """
    from sklearn.metrics import mean_squared_error

    # Only MSE is returned; the MAE, explained-variance and R^2 values that
    # used to be computed here were never used, so they are no longer
    # calculated on every call.
    y_predict = model.predict(X_test)
    return mean_squared_error(y_test, y_predict)
243 |
244 |
def append_return_table(df_predict, unique_datetime, y_trade_return, trade_tic, current_index):
    """Write one trade date's predicted returns into ``df_predict`` in place.

    Args:
        df_predict: table indexed by trade date with one column per ticker.
        unique_datetime: sequence of dates; ``current_index`` selects the row.
        y_trade_return: predicted returns, in the same order as ``trade_tic``.
        trade_tic: ticker labels the predictions belong to.
        current_index: position of the trade date in ``unique_datetime``.

    Raises:
        KeyError: if the trade date is not a row of ``df_predict`` or a ticker
            is not one of its columns. The table is never enlarged.
    """
    trade_day = unique_datetime[current_index]
    tickers = list(trade_tic)

    # .loc assignment would silently add unknown rows/columns; fail loudly
    # instead so the prediction tables keep their declared shape.
    if trade_day not in df_predict.index:
        raise KeyError(trade_day)
    unknown = [tic for tic in tickers if tic not in df_predict.columns]
    if unknown:
        raise KeyError(unknown)

    # A single .loc[row, cols] assignment writes to the frame itself. The old
    # DataFrame.append + chained ``df.loc[row][cols] = ...`` form relied on an
    # API removed in pandas 2.0 and on the row selection being a view.
    df_predict.loc[trade_day, tickers] = np.asarray(y_trade_return)
249 |
250 |
def run_4model(df, features_column, label_column, date_column, tic_column,
               unique_ticker, unique_datetime, trade_date,
               first_trade_date_index=20,
               testing_windows=4,
               max_rolling_window_index=44):
    """Train, validate and apply four regressors for every trade date.

    For each position from ``first_trade_date_index`` to the end of
    ``unique_datetime`` the function builds the train/validation/trade slices,
    fits linear regression, random forest, ridge and GBM, scores each with
    ``evaluate_model``, and predicts on the trade slice. The model with the
    lowest score supplies the "best" prediction for that date.

    A failure on one date prints the traceback and moves on to the next date.

    Returns:
        An 8-tuple: the four per-model prediction tables (lr, rf, ridge, gbm),
        the best-model prediction table, the table of best model names, the
        dict of per-date evaluation tables, and the frame produced by
        ``get_model_evaluation_table``.
    """
    model_keys = ['lr', 'rf', 'ridge', 'gbm']
    trainers = {
        'lr': train_linear_regression,
        'rf': train_random_forest,
        'ridge': train_ridge,
        'gbm': train_gbm,
    }

    # Result tables: trade dates as the index, ticker names as the columns.
    predictions = {
        key: pd.DataFrame(columns=unique_ticker, index=trade_date)
        for key in model_keys
    }
    df_predict_best = pd.DataFrame(columns=unique_ticker, index=trade_date)
    df_best_model_name = pd.DataFrame(columns=['model_name'], index=trade_date)
    evaluation_record = {}

    for i in range(first_trade_date_index, len(unique_datetime)):
        try:
            X_train, y_train = prepare_rolling_train(
                df, features_column, label_column, date_column,
                unique_datetime, testing_windows, first_trade_date_index,
                max_rolling_window_index, current_index=i)

            X_test, y_test = prepare_rolling_test(
                df, features_column, label_column, date_column,
                unique_datetime, testing_windows, first_trade_date_index,
                current_index=i)

            X_trade, y_trade, trade_tic = prepare_trade_data(
                df, features_column, label_column, date_column, tic_column,
                unique_datetime, testing_windows, first_trade_date_index,
                current_index=i)

            # All models are trained, then all validated, then all applied;
            # nothing is written to the result tables until every step succeeded.
            fitted = {key: trainers[key](X_train, y_train) for key in model_keys}
            scores = {key: evaluate_model(fitted[key], X_test, y_test) for key in model_keys}
            forecasts = {key: fitted[key].predict(X_trade) for key in model_keys}

            eval_table = pd.DataFrame(
                [[scores[key], forecasts[key]] for key in model_keys],
                columns=['model_eval', 'model_predict_return'],
                index=model_keys)
            evaluation_record[unique_datetime[i]] = eval_table

            # Lowest error score wins; the first match is taken on ties.
            is_best = eval_table.model_eval == eval_table.model_eval.min()
            y_trade_best = eval_table.model_predict_return.values[is_best][0]
            best_model_name = eval_table.index.values[is_best][0]

            df_best_model_name.loc[unique_datetime[i]] = best_model_name

            for key in model_keys:
                append_return_table(predictions[key], unique_datetime,
                                    forecasts[key], trade_tic, current_index=i)
            append_return_table(df_predict_best, unique_datetime,
                                y_trade_best, trade_tic, current_index=i)

            print('Trade Date: ', unique_datetime[i])

        except Exception:
            traceback.print_exc()

    df_evaluation = get_model_evaluation_table(evaluation_record, trade_date)
    return (predictions['lr'],
            predictions['rf'],
            predictions['ridge'],
            predictions['gbm'],
            df_predict_best,
            df_best_model_name,
            evaluation_record,
            df_evaluation)
368 |
369 |
def get_model_evaluation_table(evaluation_record, trade_date):
    """Collect the per-date model scores into one table.

    Args:
        evaluation_record: Mapping from trade date to an evaluation table
            that exposes a ``'model_eval'`` column. Its four values are
            labelled, in order, linear_regression / random_forest / ridge /
            gbm.
        trade_date: Sized sequence of trade dates to look up; it also becomes
            the index of the result.

    Returns:
        DataFrame indexed by ``trade_date`` with one column per model. A
        trade date that has no usable entry in ``evaluation_record`` gets a
        row of NaN, so the rows always line up with ``trade_date``.
    """
    model_columns = ['linear_regression', 'random_forest', 'ridge', 'gbm']
    missing_row = [float('nan')] * len(model_columns)

    evaluation_list = []
    for d in trade_date:
        try:
            evaluation_list.append(evaluation_record[d]['model_eval'].values)
        except KeyError:
            # Previously a missing date was skipped, which made the row count
            # differ from len(trade_date) and broke the index assignment
            # below. Keep a placeholder row instead.
            print('error: no model evaluation recorded for trade date', d)
            evaluation_list.append(missing_row)

    df_evaluation = pd.DataFrame(evaluation_list, columns=model_columns)
    df_evaluation.index = trade_date
    return df_evaluation
380 |
def save_model_result(sector_result, sector_name):
    """Write one sector's model outputs as CSV files in ``results/<sector_name>/``.

    Args:
        sector_result: Sequence with at least eight entries. Entries 0-4 are
            the predicted-return tables (lr, rf, ridge, gbm, best) and are
            cast to float64 before writing. Entry 5 is the best-model-name
            table and entry 7 the model-score table. Entry 6 is not written.
        sector_name: Name of the sub-directory created under ``results/``.

    Returns:
        None. The directory is created if needed and seven CSV files are
        written into it.
    """
    out_dir = 'results/' + sector_name + '/'
    # exist_ok makes creation race-free without the exists()/EEXIST dance.
    os.makedirs(os.path.dirname(out_dir), exist_ok=True)

    prediction_names = ['df_predict_lr',
                        'df_predict_rf',
                        'df_predict_ridge',
                        'df_predict_gbm',
                        'df_predict_best']
    # Cast everything first so that a bad table fails before any file is
    # written, as in the original ordering.
    prediction_tables = [sector_result[position].astype(np.float64)
                         for position in range(len(prediction_names))]
    df_best_model_name = sector_result[5]
    df_model_score = sector_result[7]

    for name, table in zip(prediction_names, prediction_tables):
        table.to_csv(out_dir + name + '.csv')
    df_best_model_name.to_csv(out_dir + 'df_best_model_name.csv')
    df_model_score.to_csv(out_dir + 'df_model_score.csv')
411 |
412 |
413 |
def calculate_sector_daily_return(daily_price, unique_ticker, trade_date):
    """Return per-ticker daily percentage changes from the first trade date on.

    Args:
        daily_price: Long-format frame with ``datadate``, ``tic`` and
            ``adj_price`` columns.
        unique_ticker: Tickers (column labels) to keep after pivoting.
        trade_date: Sequence of trade dates; only its first element is used,
            as the inclusive lower bound of the returned rows.

    Returns:
        DataFrame indexed by ``datadate`` with one column per requested
        ticker, holding ``pct_change`` of the pivoted adjusted price.
    """
    # One row per date, one column per ticker; duplicate (date, ticker)
    # entries are averaged.
    price_by_ticker = daily_price.pivot_table(values='adj_price',
                                              index=['datadate'],
                                              columns=['tic'],
                                              aggfunc=np.mean)
    sector_returns = price_by_ticker[unique_ticker].pct_change()

    # Returns are computed on the full history first so the first kept row
    # still has a previous price to compare against.
    on_or_after_start = sector_returns.index >= trade_date[0]
    return sector_returns[on_or_after_start]
422 |
def calculate_sector_quarterly_return(daily_price, unique_ticker, trade_date_plus1):
    """Return per-ticker percentage changes between consecutive rebalance dates.

    Args:
        daily_price: Long-format frame with ``datadate``, ``tic`` and
            ``adj_price`` columns.
        unique_ticker: Tickers (column labels) to keep after pivoting.
        trade_date_plus1: Ordered rebalance dates. Prices are sampled on
            exactly these dates.

    Returns:
        DataFrame with one row per rebalance date after the first one and
        one column per requested ticker, holding the change in adjusted
        price since the previous rebalance date.
    """
    daily_price_pivot = pd.pivot_table(daily_price, values='adj_price', index=['datadate'],
                                       columns=['tic'], aggfunc=np.mean)
    daily_price_pivot = daily_price_pivot[unique_ticker]

    # ``DataFrame.ix`` no longer exists in pandas. ``reindex`` selects the
    # rebalance dates by label and, like ``.ix`` did for list lookups, yields
    # a NaN row for a date that has no price instead of raising.
    quarterly_price_pivot = daily_price_pivot.reindex(trade_date_plus1)

    quarterly_return = quarterly_price_pivot.pct_change()
    # The first rebalance date has no previous period, so it is dropped.
    quarterly_return = quarterly_return[quarterly_return.index > trade_date_plus1[0]]

    return quarterly_return
433 |
def pick_stocks_based_on_quantiles_old(df_predict_best):
    """Split every row of predicted returns into four quartile buckets.

    Args:
        df_predict_best: Frame with one row per trade date and one column
            per ticker, holding predicted returns.

    Returns:
        Tuple of four dicts ``(0-25%, 25-50%, 50-75%, 75-100%)``. Each dict
        maps a row label to the Series of that row's values falling in the
        bucket. A value equal to a cut-off belongs to the lower bucket.
    """
    bottom_bucket = {}
    lower_mid_bucket = {}
    upper_mid_bucket = {}
    top_bucket = {}

    for row_position, row_label in enumerate(df_predict_best.index):
        predicted = df_predict_best.iloc[row_position]
        cut_25 = predicted.quantile(0.25)
        cut_50 = predicted.quantile(0.5)
        cut_75 = predicted.quantile(0.75)

        bottom_bucket[row_label] = predicted[predicted <= cut_25]
        lower_mid_bucket[row_label] = predicted[(predicted > cut_25) & (predicted <= cut_50)]
        upper_mid_bucket[row_label] = predicted[(predicted > cut_50) & (predicted <= cut_75)]
        top_bucket[row_label] = predicted[predicted > cut_75]

    return (bottom_bucket, lower_mid_bucket, upper_mid_bucket, top_bucket)
455 |
def pick_stocks_based_on_quantiles(df_predict_best):
    """Select the bottom-30% and top-30% predicted returns of every row.

    Args:
        df_predict_best: Frame with one row per trade date and one column
            per ticker, holding predicted returns.

    Returns:
        Tuple ``(low, high)`` of dicts keyed by row label. ``low`` holds the
        Series of values at or below the row's 0.3 quantile, ``high`` the
        Series of values at or above its 0.7 quantile.
    """
    low_bucket = {}
    high_bucket = {}

    for row_position, row_label in enumerate(df_predict_best.index):
        predicted = df_predict_best.iloc[row_position]
        lower_cut = predicted.quantile(0.3)
        upper_cut = predicted.quantile(0.7)

        low_bucket[row_label] = predicted[predicted <= lower_cut]
        high_bucket[row_label] = predicted[predicted >= upper_cut]

    return (low_bucket, high_bucket)
472 |
def calculate_portfolio_return(daily_return, trade_date_plus1, long_dict, frequency_date):
    """Build the daily return series of an equally weighted long portfolio.

    For each pair of consecutive rebalance dates, the tickers selected on the
    first date are held from that date (inclusive) up to the next rebalance
    date (exclusive). The portfolio's daily return is the row-wise mean of
    the held tickers' daily returns.

    Args:
        daily_return: Frame of daily returns, indexed by date with one
            column per ticker.
        trade_date_plus1: Ordered rebalance dates.
        long_dict: Mapping from rebalance date to a Series whose index lists
            the tickers to hold for that period.
        frequency_date: Unused; kept so existing callers keep working.

    Returns:
        DataFrame indexed by date with the portfolio return in
        ``'daily_return'``. It also carries an all-NaN ``'portfolio_return'``
        column, because the original accumulator was created with that
        column name and callers may already expect it.
        NOTE(review): that column looks like a naming slip - confirm whether
        any caller relies on it before removing it.
        With fewer than two rebalance dates, an empty frame with only the
        ``'portfolio_return'`` column is returned.
    """
    period_frames = []
    rebalance_dates = list(trade_date_plus1)

    for period_start, period_end in zip(rebalance_dates, rebalance_dates[1:]):
        in_period = (daily_return.index >= period_start) & (daily_return.index < period_end)
        held_tickers = long_dict[period_start].index

        # Equal weighting: the plain mean across the held tickers.
        period_return = daily_return.loc[in_period, held_tickers].mean(axis=1)
        period_frames.append(period_return.to_frame('daily_return'))

    if not period_frames:
        return pd.DataFrame(columns=['portfolio_return'])

    # ``DataFrame.append`` no longer exists in pandas and copied the whole
    # frame on every iteration; concatenate the collected periods once.
    df_portfolio_return = pd.concat(period_frames)
    df_portfolio_return['portfolio_return'] = float('nan')
    return df_portfolio_return
491 |
def calculate_portfolio_quarterly_return(quarterly_return, trade_date_plus1, long_dict):
    """Build the per-period return series of an equally weighted long portfolio.

    For each pair of consecutive rebalance dates, the tickers selected on the
    first date earn the return that ``quarterly_return`` reports on the
    second date. The portfolio return is the mean across those tickers.

    Args:
        quarterly_return: Frame of period returns, indexed by rebalance date
            with one column per ticker.
        trade_date_plus1: Ordered rebalance dates.
        long_dict: Mapping from rebalance date to a Series whose index lists
            the tickers to hold for the following period.

    Returns:
        DataFrame with a single ``'portfolio_return'`` column, indexed by
        the date on which each period's return is realised. It is empty when
        there are fewer than two rebalance dates.
    """
    period_frames = []
    rebalance_dates = list(trade_date_plus1)

    for formation_date, realised_date in zip(rebalance_dates, rebalance_dates[1:]):
        held_tickers = long_dict[formation_date].index
        realised = quarterly_return.loc[quarterly_return.index == realised_date, held_tickers]

        # Equal weighting: the plain mean across the held tickers.
        period_frames.append(realised.mean(axis=1).to_frame('portfolio_return'))

    if not period_frames:
        return pd.DataFrame(columns=['portfolio_return'])

    # ``DataFrame.append`` no longer exists in pandas; concatenate once.
    return pd.concat(period_frames)
507 |
def long_only_strategy_daily(df_predict_return, daily_return, trade_month_plus1, top_quantile_threshold=0.75):
    """Back-test an equally weighted, long-only, top-quantile strategy daily.

    On every row of ``df_predict_return`` the tickers whose predicted return
    is at or above that row's ``top_quantile_threshold`` quantile are
    selected. Between two consecutive rebalance dates the selection made on
    the first date is held with equal weights.

    Args:
        df_predict_return: Predicted returns, one row per rebalance date and
            one column per ticker.
        daily_return: Daily returns, indexed by date with one column per
            ticker.
        trade_month_plus1: Ordered rebalance dates; each one except the last
            must be a row label of ``df_predict_return``.
        top_quantile_threshold: Quantile cut-off for the selection. The
            default 0.75 keeps roughly the top quarter of each row.

    Returns:
        DataFrame with a single ``'daily_return'`` column, indexed by date
        and covering the first rebalance date (inclusive) to the last
        (exclusive).

    Raises:
        ZeroDivisionError: If no ticker is selected for a rebalance date, for
            example when every prediction in that row is NaN.
    """
    long_dict = {}
    for row_position, row_label in enumerate(df_predict_return.index):
        predicted = df_predict_return.iloc[row_position]
        top_q = predicted.quantile(top_quantile_threshold)
        long_dict[row_label] = predicted[predicted >= top_q]

    period_frames = []
    rebalance_dates = list(trade_month_plus1)

    for period_start, period_end in zip(rebalance_dates, rebalance_dates[1:]):
        holdings = long_dict[period_start]
        # 1.0 keeps this a true division even under integer-division rules.
        long_normalize_weight = 1.0 / holdings.shape[0]

        in_period = (daily_return.index >= period_start) & (daily_return.index < period_end)
        weighted_return = daily_return.loc[in_period, holdings.index] * long_normalize_weight

        # sum() skips NaN, so a ticker with a missing return contributes 0
        # for that day rather than being re-weighted.
        period_frames.append(weighted_return.sum(axis=1).to_frame('daily_return'))

    if not period_frames:
        return pd.DataFrame(columns=['daily_return'])

    # ``DataFrame.append`` no longer exists in pandas and copied the whole
    # frame on every iteration; concatenate the collected periods once.
    return pd.concat(period_frames)
545 |
546 |
def long_only_strategy_monthly(df_predict_return, tic_monthly_return, trade_month, top_quantile_threshold=0.7):
    """Back-test a long-only, top-quantile strategy weighted by predicted return.

    On every row of ``df_predict_return`` the tickers whose predicted return
    is at or above that row's ``top_quantile_threshold`` quantile are
    selected. Each selected ticker is weighted by its predicted return
    divided by the sum of the selected predicted returns.

    Args:
        df_predict_return: Predicted returns, one row per period and one
            column per ticker.
        tic_monthly_return: Realised returns, indexed by period with one
            column per ticker.
        trade_month: Periods to evaluate; each must be a row label of
            ``df_predict_return``.
        top_quantile_threshold: Quantile cut-off for the selection.

    Returns:
        Series named ``'monthly_return'`` whose index is named
        ``'trade_month'``, holding the weighted portfolio return per period.
    """
    picks_by_period = {}
    for row_position, row_label in enumerate(df_predict_return.index):
        predicted = df_predict_return.iloc[row_position]
        cutoff = predicted.quantile(top_quantile_threshold)
        picks_by_period[row_label] = predicted[predicted >= cutoff]

    return_by_period = {}
    for position in range(len(trade_month)):
        period = trade_month[position]
        picks = picks_by_period[period]

        # Weights are proportional to the predicted return of each pick.
        weights = picks / sum(picks.values)
        realised = tic_monthly_return[tic_monthly_return.index == period][picks.index]
        return_by_period[period] = (realised * weights).values.sum()

    table = pd.DataFrame.from_dict(return_by_period, orient='index').reset_index()
    table.columns = ['trade_month', 'monthly_return']
    return table.set_index('trade_month', drop=False)['monthly_return']
583 |
584 |
585 |
586 |
587 |
def plot_predict_return_distribution(df_predict_best, sector_name, out_path):
    """Save one histogram of predicted returns per row of ``df_predict_best``.

    Args:
        df_predict_best: Predicted returns, one row per trade date and one
            column per ticker.
        sector_name: Text placed at the start of every plot title.
        out_path: Prefix of the output files. Each file is written to
            ``out_path + str(row_label) + ".png"``, so include a trailing
            path separator when ``out_path`` is a directory.

    Returns:
        None. Each figure is closed once it has been saved, so a long run
        does not keep every figure in memory.
    """
    import matplotlib.pyplot as plt

    for row_position, row_label in enumerate(df_predict_best.index):
        fig = plt.figure(figsize=(8, 5))
        try:
            df_predict_best.iloc[row_position].hist()
            plt.xlabel("predicted return", size=15)
            plt.ylabel("frequency", size=15)

            plt.title(sector_name + ": trade date - " + str(row_label), size=15)
            plt.savefig(out_path + str(row_label) + ".png")
        finally:
            # pyplot keeps every figure alive until it is closed explicitly.
            plt.close(fig)
599 |
600 |
601 |
602 |
603 |
604 |
605 |
606 |
607 |
608 |
609 |
--------------------------------------------------------------------------------
/code/old_Rcode/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/code/old_Rcode/.DS_Store
--------------------------------------------------------------------------------
/code/old_Rcode/fundamental_ML_model.R:
--------------------------------------------------------------------------------
1 | fundamental_ML_model <- function(sector_data,trade_date){
2 | #######################################################
3 | #1. model test error to select models
4 | #2. trade period predicted return to select stocks
5 | #3. linear regression features
6 | #4. random forest features
7 | #5. ridge features
8 | #6. stepwise regression features
9 | #7. gbm features
10 | #sector_data=sector45_data
11 |
12 | #look at the data determine the first factor column number
13 | start_column=12
14 |
15 | #set the rows to 89, because we have 89 stock selections
16 | #may need to adjust and put into function
17 |
18 | #model error to select model
19 | model_error=data.frame(MSE_linear=replicate(89,0))
20 | model_error[,2]=data.frame(MSE_RF=replicate(89,0))
21 | model_error[,3]=data.frame(MSE_ridge=replicate(89,0))
22 | model_error[,4]=data.frame(MSE_step=replicate(89,0))
23 | model_error[,5]=data.frame(MSE_gbm=replicate(89,0))
24 |
25 | #predicte return to select stocks
26 | predicted_return=list()
27 |
28 |
29 |
30 | #main model
31 | LR_features=list()
32 | RF_features=list()
33 | ridge_features=list()
34 |
35 | Step_features=list()
36 | GBM_features=list()
37 |
38 | #for(i in 1:(length(trade_date)-19)){RF_features[[i]]=c(1:i)}
39 |
40 | #understand rolling windows
41 | #for(i in 1:(length(trade_date)-19)){print(c(i,i+15,i+16,i+19,trade_date[i+20]))}
42 |
43 | for(i in 1:(length(trade_date)-21)){
44 |
45 | ###############################################
46 | ###########rolling window########################
47 |
48 | ####train the model based on 4 years, 16 quarters data
49 | #growing window 10 years
50 | if (i<=25) {
51 | data_train=sector_data[(sector_data$tradedate <= trade_date[i+15]),]
52 | train_x=data_train[,c(start_column:(dim(sector_data)[2]-1))]
53 | train_y=data_train[,dim(sector_data)[2]]
54 | } else{
55 | data_train=sector_data[(sector_data$tradedate <= trade_date[i+15]) & sector_data$tradedate >= trade_date[i-25],]
56 | train_x=data_train[,c(start_column:(dim(sector_data)[2]-1))]
57 | train_y=data_train[,dim(sector_data)[2]]
58 | }
59 |
60 | ####test the model based on 1 year, 4 quarters data
61 | data_test=sector_data[(sector_data$tradedate <= trade_date[i+19]) & (sector_data$tradedate >= trade_date[i+16]),]
62 | test_x=data_test[,c(start_column:(dim(sector_data)[2]-1))]
63 | test_y=data_test[,dim(sector_data)[2]]
64 |
65 | train=cbind(train_y,train_x)
66 | test=cbind(test_y,test_x)
67 |
68 |
69 | ####trade data for every quarter
70 | data_trade=sector_data[(sector_data$tradedate == trade_date[i+20]),]
71 | trade_x=data_trade[,c(start_column:(dim(sector_data)[2]-1))]
72 | trade_y=data_trade[,dim(sector_data)[2]]
73 | trade=cbind(trade_x,trade_y)
74 |
75 | row.names(trade_x)=data_trade$tic
76 |
77 | ###########################################
78 | ##############linear regression############
79 | ###########################################
80 | linear_model=lm(y_return~., data=train)
81 | linear_pre_y=predict(linear_model,test_x)
82 | MSE_linear=mean((test_y-linear_pre_y)^2,na.rm=TRUE)
83 | #MSE_linear
84 |
85 | #LR features
86 | LR_features[[i]]=summary(linear_model)
87 |
88 | ###########################################
89 | ################Random Forest##############
90 | ###########################################
91 | # Tune using algorithm tools
92 | # Tunning the mtry
93 | bestmtry <- tuneRF(train[,-1],train[,1], stepFactor=1.5, improve=1e-5, ntree=500,trace=0,plot = FALSE)
94 | #plot(bestmtry,type = "l")
95 | bestmtry=data.frame(bestmtry)
96 | mytry_optimal=bestmtry$mtry[which.min(bestmtry$OOBError)]
97 | #mytry_optimal
98 | RF_Model=randomForest(y_return~.,data = train,ntree=500,mtry=mytry_optimal,importance=TRUE, na.rm = T,trace=0)
99 |
100 | yhat_bag=predict(RF_Model,test_x)
101 | MSE_RF=mean((yhat_bag-test_y)^2)
102 | #MSE_RF
103 | #importance table
104 | #varImp(RF_Model)
105 | #varImpPlot(RF_Model,main='Random Forest Importance Table')
106 |
107 | ########RF features
108 | RF_features[[i]]=varImp(RF_Model)
109 |
110 | #####################################
111 | ################ridge################
112 | #####################################
113 | x_train_ridge=model.matrix(y_return~., train)[,-1]
114 | y_train_ridge=train$y_return
115 |
116 | x_test_ridge=model.matrix(y_return~.,test)[,-1]
117 | y_test_ridge=test$y_return
118 |
119 | #tunning for lambda
120 | #first run ridge on training set and pick the best lambda
121 | cv.out_ridge=cv.glmnet(x_train_ridge,y_train_ridge,alpha=1)
122 | bestlam_ridge=cv.out_ridge$lambda.min
123 |
124 | ridge_model=glmnet(x_train_ridge,y_train_ridge,alpha = 0,lambda = bestlam_ridge)
125 | ridge_pred_y=predict(ridge_model, newx = x_test_ridge)
126 |
127 | MSE_ridge=mean((ridge_pred_y-y_test_ridge)^2,na.rm=TRUE)
128 |
129 | #ridge features
130 | ridge_coeffs <- coef(ridge_model)
131 | ridge_coef=data.frame(name = ridge_coeffs@Dimnames[[1]][ridge_coeffs@i + 1], coefficient = ridge_coeffs@x)
132 |
133 | ridge_features[[i]]=ridge_coef
134 |
135 |
136 |
137 | ###########################################
138 | ##############stepwise regression##########
139 | ###########################################
140 | #based on linear regresion
141 | step_model=stepAIC(linear_model, direction="both",trace = 0)
142 | step_pre_y=predict(step_model,test_x)
143 |
144 | MSE_step=mean((test_y-step_pre_y)^2,na.rm=TRUE)
145 | #MSE_step
146 |
147 | #step features
148 | Step_features[[i]]=summary(step_model)
149 |
150 |
151 | ###################################
152 | ################GBM################
153 | ###################################
154 | #Generalized Boosted Regression Models
155 | gbm_model=gbm(y_return~.,data = train,
156 | dist="gaussian",
157 | n.tree = 400,
158 | shrinkage=0.1,
159 | cv.folds = 5)
160 |
161 | gbm_pred_y = predict(gbm_model, test, n.tree = 400, type = 'response')
162 | MSE_gbm=mean((gbm_pred_y-test_y)^2,na.rm=TRUE)
163 | #MSE_gbm
164 | ########GBM features
165 | GBM_features[[i]]= summary(gbm_model,plot=FALSE)
166 |
167 | ######################################
168 | #############get results#############
169 | ######################################
170 |
171 |
172 |
173 | #####################################
174 | #all model trade data
175 | #trade using linear regression
176 | trade_linear_y=predict(linear_model,trade_x)
177 | #trade using random forest
178 | trade_RF_y=predict(RF_Model,trade_x)
179 | #trade using ridge
180 | x_trade_ridge=model.matrix(y_return~.,trade)[,-1]
181 | row.names(x_trade_ridge)=data_trade$tic
182 | trade_ridge_y=predict(ridge_model,x_trade_ridge)
183 | colnames(trade_ridge_y)=c('trade_ridge_y')
184 |
185 | #trade stepwise regression
186 | trade_step_y=predict(step_model,trade_x)
187 | #trade using GBM
188 | trade_GBM_y=predict(gbm_model,trade_x)
189 |
190 | ###########store model error
191 | if (length(unique(trade_linear_y))0){
14 | install.packages(packages.needed, dependencies = TRUE)
15 | }
16 | library(readxl)
17 | library(MASS)
18 | library(ggplot2)
19 | library(glmnet)
20 | library(ISLR)
21 | library(tree)
22 | library(randomForest)
23 | library(gbm)
24 | library(e1071)
25 | library(caret)
26 |
27 | source("fundamental_ML_model.R")
28 | source("fundamental_select_stock.R")
29 |
30 | ####################get data############################
31 | fundamental_total=read_excel("fundamental_final_table.xlsx",1)
32 | trade_date=unique(fundamental_total$tradedate)
33 | trade_date=sort(trade_date)
34 |
35 | sector10_data=read_excel("sector10_clean.xlsx",1)
36 | dim(sector10_data)
37 |
38 | sector15_data=read_excel("sector15_clean.xlsx",1)
39 | dim(sector15_data)
40 |
41 |
42 | sector20_data=read_excel("sector20_clean.xlsx",1)
43 | dim(sector20_data)
44 |
45 | sector25_data=read_excel("sector25_clean.xlsx",1)
46 | dim(sector25_data)
47 |
48 |
49 | sector30_data=read_excel("sector30_clean.xlsx",1)
50 | dim(sector30_data)
51 |
52 | sector35_data=read_excel("sector35_clean.xlsx",1)
53 | dim(sector35_data)
54 |
55 | sector40_data=read_excel("sector40_clean.xlsx",1)
56 | dim(sector40_data)
57 |
58 | sector45_data=read_excel("sector45_clean.xlsx",1)
59 | dim(sector45_data)
60 |
61 | sector50_data=read_excel("sector50_clean.xlsx",1)
62 | dim(sector50_data)
63 |
64 |
65 | sector55_data=read_excel("sector55_clean.xlsx",1)
66 | dim(sector55_data)
67 |
68 | sector60_data=read_excel("sector60_clean.xlsx",1)
69 | dim(sector60_data)
70 |
71 |
72 | ###############################################################
73 | #####run model and save as RData
74 | ###############################################################
75 |
76 | ######################################
77 | ############sector 10 Energy (5238, 32)
78 | ######################################
79 | ##1.2 hours to run
80 | start.time=Sys.time()
81 | sector10_result=fundamental_ML_model(sector10_data,trade_date)
82 | end.time=Sys.time()
83 | end.time-start.time
84 | save(sector10_result,file = "sector10_result.RData")
85 |
86 | ######################################
87 | ############sector 15 Materials (5216, 32)
88 | ######################################
89 | ##1.2 hours to run
90 | start.time=Sys.time()
91 | sector15_result=fundamental_ML_model(sector15_data,trade_date)
92 | end.time=Sys.time()
93 | end.time-start.time
94 | save(sector15_result,file = "sector15_result.RData")
95 |
96 | ######################################
97 | ############sector 20 Industrials (9881, 26)
98 | ######################################
99 | #2 hours to run
100 | start.time=Sys.time()
101 | sector20_result=fundamental_ML_model(sector20_data,trade_date)
102 | end.time=Sys.time()
103 | end.time-start.time
104 | save(sector20_result,file = "sector20_result.RData")
105 |
106 | ######################################
107 | ############sector 25 Consumer Discretionary (12595, 26)
108 | ######################################
109 | #2.5 hours to run
110 | start.time=Sys.time()
111 | sector25_result=fundamental_ML_model(sector25_data,trade_date)
112 | end.time=Sys.time()
113 | end.time-start.time
114 | save(sector25_result,file = "sector25_result.RData")
115 |
116 | ######################################
117 | ############sector 30 Consumer Staples (5388, 29)
118 | ######################################
119 | #1.2 hours to run
120 | start.time=Sys.time()
121 | sector30_result=fundamental_ML_model(sector30_data,trade_date)
122 | end.time=Sys.time()
123 | end.time-start.time
124 | save(sector30_result,file = "sector30_result.RData")
125 |
126 | ######################################
127 | ############sector 35 Health Care (7615, 29)
128 | ######################################
129 | #2 hours to run
130 | start.time=Sys.time()
131 | sector35_result=fundamental_ML_model(sector35_data,trade_date)
132 | end.time=Sys.time()
133 | end.time-start.time
134 | save(sector35_result,file = "sector35_result.RData")
135 |
136 | ######################################
137 | ############sector 40 Financials (9480, 21)
138 | ######################################
139 | ##1.5 hours to run
140 | start.time=Sys.time()
141 | sector40_result=fundamental_ML_model(sector40_data,trade_date)
142 | end.time=Sys.time()
143 | end.time-start.time
144 | save(sector40_result,file = "sector40_result.RData")
145 |
146 | ######################################
147 | ############sector 45 Information Technology (10243, 29)
148 | ######################################
149 | ##2.5 hours to run
150 | start.time=Sys.time()
151 | sector45_result=fundamental_ML_model(sector45_data,trade_date)
152 | end.time=Sys.time()
153 | end.time-start.time
154 | save(sector45_result,file = "sector45_result.RData")
155 |
156 | ######################################
157 | ############sector 50 Telecommunication Services (1127, 32)
158 | ######################################
159 | #20 mins to run
160 | start.time=Sys.time()
161 | sector50_result=fundamental_ML_model(sector50_data,trade_date)
162 | end.time=Sys.time()
163 | end.time-start.time
164 | save(sector50_result,file = "sector50_result.RData")
165 |
166 | ######################################
167 | ############sector 55 Utilities (3903, 32)
168 | ######################################
169 | ##1.2 hours to run
170 | start.time=Sys.time()
171 | sector55_result=fundamental_ML_model(sector55_data,trade_date)
172 | end.time=Sys.time()
173 | end.time-start.time
174 | save(sector55_result,file = "sector55_result.RData")
175 |
176 | ######################################
177 | ############sector 60 Real Estate (3039, 32)
178 | ######################################
179 | #31 mins to run
180 | start.time=Sys.time()
181 | sector60_result=fundamental_ML_model(sector60_data,trade_date)
182 | end.time=Sys.time()
183 | end.time-start.time
184 | save(sector60_result,file = "sector60_result.RData")
185 |
186 | #############################################
187 | #############################################
188 | #############################################
189 | #############################################
190 |
191 | ###############################################################
192 | ################Stock Selection
193 | ###############################################################
194 |
195 | #########stock selection sector 10
196 | #load("sector10_result.RData")
197 | selector10_modelStock=select_modelStock(sector10_result)
198 | selector10_topStock=select_topStock(selector10_modelStock$selected_stocks)
199 | #########stock selection sector 15
200 | #load("sector15_result.RData")
201 | selector15_modelStock=select_modelStock(sector15_result)
202 | selector15_topStock=select_topStock(selector15_modelStock$selected_stocks)
203 | #########stock selection sector 20
204 | #load("sector20_result.RData")
205 | selector20_modelStock=select_modelStock(sector20_result)
206 | selector20_topStock=select_topStock(selector20_modelStock$selected_stocks)
207 | #########stock selection sector 25
208 | #load("sector25_result.RData")
209 | selector25_modelStock=select_modelStock(sector25_result)
210 | selector25_topStock=select_topStock(selector25_modelStock$selected_stocks)
211 | #########stock selection sector 30
212 | #load("sector30_result.RData")
213 | selector30_modelStock=select_modelStock(sector30_result)
214 | selector30_topStock=select_topStock(selector30_modelStock$selected_stocks)
215 | #########stock selection sector 35
216 | #load("sector35_result.RData")
217 | selector35_modelStock=select_modelStock(sector35_result)
218 | selector35_topStock=select_topStock(selector35_modelStock$selected_stocks)
219 | #########stock selection sector 40
220 | #load("sector40_result.RData")
221 | selector40_modelStock=select_modelStock(sector40_result)
222 | selector40_topStock=select_topStock(selector40_modelStock$selected_stocks)
223 | #########stock selection sector 45
224 | #load("sector45_result.RData")
225 | selector45_modelStock=select_modelStock(sector45_result)
226 | selector45_topStock=select_topStock(selector45_modelStock$selected_stocks)
227 | #########stock selection sector 50
228 | #load("sector50_result.RData")
229 | selector50_modelStock=select_modelStock(sector50_result)
230 | selector50_topStock=select_topStock(selector50_modelStock$selected_stocks)
231 | #selector50_topStock[[82]]=selector50_topStock[[81]]
232 | #########stock selection sector 55
233 | #load("sector55_result.RData")
234 | selector55_modelStock=select_modelStock(sector55_result)
235 | selector55_topStock=select_topStock(selector55_modelStock$selected_stocks)
236 | #########stock selection sector 60
237 | #load("sector60_result.RData")
238 | selector60_modelStock=select_modelStock(sector60_result)
239 | selector60_topStock=select_topStock(selector60_modelStock$selected_stocks)
240 |
241 |
242 |
243 | ###############combine stocks together
244 | stocks_selected_total=NULL
245 | for (i in 1:89){
246 |
247 | #sector 10
248 | sector10_temp=selector10_topStock[[i]]
249 | sector10_temp=cbind(names(sector10_temp),unname(sector10_temp),trade_date[i+20])
250 | colnames(sector10_temp)=c('tic','predicted_return','trade_date')
251 |
252 | #sector 15
253 | sector15_temp=selector15_topStock[[i]]
254 | sector15_temp=cbind(names(sector15_temp),unname(sector15_temp),trade_date[i+20])
255 | colnames(sector15_temp)=c('tic','predicted_return','trade_date')
256 |
257 | #sector 20
258 | sector20_temp=selector20_topStock[[i]]
259 | sector20_temp=cbind(names(sector20_temp),unname(sector20_temp),trade_date[i+20])
260 | colnames(sector20_temp)=c('tic','predicted_return','trade_date')
261 |
262 | #sector 25
263 | sector25_temp=selector25_topStock[[i]]
264 | sector25_temp=cbind(names(sector25_temp),unname(sector25_temp),trade_date[i+20])
265 | colnames(sector25_temp)=c('tic','predicted_return','trade_date')
266 |
267 | #sector 30
268 | sector30_temp=selector30_topStock[[i]]
269 | sector30_temp=cbind(names(sector30_temp),unname(sector30_temp),trade_date[i+20])
270 | colnames(sector30_temp)=c('tic','predicted_return','trade_date')
271 |
272 | #sector 35
273 | sector35_temp=selector35_topStock[[i]]
274 | sector35_temp=cbind(names(sector35_temp),unname(sector35_temp),trade_date[i+20])
275 | colnames(sector35_temp)=c('tic','predicted_return','trade_date')
276 |
277 | #sector 40
278 | sector40_temp=selector40_topStock[[i]]
279 | sector40_temp=cbind(names(sector40_temp),unname(sector40_temp),trade_date[i+20])
280 | colnames(sector40_temp)=c('tic','predicted_return','trade_date')
281 |
282 | #sector 45
283 | sector45_temp=selector45_topStock[[i]]
284 | sector45_temp=cbind(names(sector45_temp),unname(sector45_temp),trade_date[i+20])
285 | colnames(sector45_temp)=c('tic','predicted_return','trade_date')
286 |
287 | #sector 50
288 | sector50_temp=selector50_topStock[[i]]
289 | sector50_temp=cbind(names(sector50_temp),unname(sector50_temp),trade_date[i+20])
290 | colnames(sector50_temp)=c('tic','predicted_return','trade_date')
291 |
292 | #sector 55
293 | sector55_temp=selector55_topStock[[i]]
294 | sector55_temp=cbind(names(sector55_temp),unname(sector55_temp),trade_date[i+20])
295 | colnames(sector55_temp)=c('tic','predicted_return','trade_date')
296 |
297 |
298 | #sector 60
299 | sector60_temp=selector60_topStock[[i]]
300 | sector60_temp=cbind(names(sector60_temp),unname(sector60_temp),trade_date[i+20])
301 | colnames(sector60_temp)=c('tic','predicted_return','trade_date')
302 |
303 |
304 | stocks_bind=rbind(sector10_temp,
305 | sector15_temp,
306 | sector20_temp,
307 | sector25_temp,
308 | sector30_temp,
309 | sector35_temp,
310 | sector40_temp,
311 | sector45_temp,
312 | sector50_temp,
313 | sector55_temp,
314 | sector60_temp)
315 |
316 | stocks_selected_total=rbind(stocks_selected_total,stocks_bind)
317 |
318 | }
319 |
320 | stocks_selected_total=as.data.frame(stocks_selected_total)
321 |
322 |
323 | write.csv(stocks_selected_total,"stocks_selected_total.csv")
324 |
--------------------------------------------------------------------------------
/code/old_Rcode/fundamental_select_stock.R:
--------------------------------------------------------------------------------
select_modelStock = function(sector_result){
  # For every period, pick the model with the smallest error and keep the
  # predicted returns of that model.
  #
  # sector_result: list with
  #   $model_error      data frame, one row per period, one column per model
  #   $predicted_return list, one two-dimensional element per period, whose
  #                     columns are taken in the same order as the columns
  #                     of $model_error
  # Returns: list(selected_stocks = list of the winning model's predictions
  #               per period,
  #               selected_model  = name of the winning error column per
  #               period)
  model_error=sector_result$model_error

  # The period count comes from the error table instead of a fixed 89.
  n_period=nrow(model_error)
  selected_model=character(n_period)
  selected_stocks=vector("list",n_period)

  for (i in seq_len(n_period)){
    # Position of the smallest error in row i, computed once.
    get_minIndex=which.min(unlist(model_error[i,]))
    selected_model[i]=colnames(model_error)[get_minIndex]
    selected_stocks[[i]]=sector_result$predicted_return[[i]][,get_minIndex]
  }

  output=list(selected_stocks=selected_stocks,selected_model=selected_model)
  return(output)
}
16 |
17 |
18 |
select_topStock=function(selected_stocks){
  # Keep, for every period, the predictions at or above that period's
  # 0.8 quantile.
  #
  # selected_stocks: list with one numeric vector of predicted returns per
  #                  period
  # Returns: list of the same length holding the retained values; element
  #          names of the input vectors are kept.
  # The period count comes from the input instead of a fixed 89.
  n_period=length(selected_stocks)
  selected_topstocks=vector("list",n_period)

  for (i in seq_len(n_period)){
    predicted=selected_stocks[[i]]
    selected_topstocks[[i]]=predicted[predicted>=quantile(predicted,0.8)]
  }
  return(selected_topstocks)
}
--------------------------------------------------------------------------------
/figs/chart10_insample.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart10_insample.PNG
--------------------------------------------------------------------------------
/figs/chart11_overallPerformance.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart11_overallPerformance.PNG
--------------------------------------------------------------------------------
/figs/chart1_datasetPeriod.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart1_datasetPeriod.PNG
--------------------------------------------------------------------------------
/figs/chart2_rolling_windows.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart2_rolling_windows.PNG
--------------------------------------------------------------------------------
/figs/chart3_modelError.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart3_modelError.PNG
--------------------------------------------------------------------------------
/figs/chart4_predictedReturn1.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart4_predictedReturn1.PNG
--------------------------------------------------------------------------------
/figs/chart4_predictedReturn2.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart4_predictedReturn2.PNG
--------------------------------------------------------------------------------
/figs/chart5_coefficient.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart5_coefficient.PNG
--------------------------------------------------------------------------------
/figs/chart6_selectedStocks.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart6_selectedStocks.PNG
--------------------------------------------------------------------------------
/figs/chart7_efficient1.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart7_efficient1.PNG
--------------------------------------------------------------------------------
/figs/chart8_PnL.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart8_PnL.png
--------------------------------------------------------------------------------
/figs/chart9_TotalValue.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart9_TotalValue.png
--------------------------------------------------------------------------------
/figs/dataperiod.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/dataperiod.png
--------------------------------------------------------------------------------
/figs/efficient1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/efficient1.jpg
--------------------------------------------------------------------------------
/figs/pnl1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/pnl1.jpg
--------------------------------------------------------------------------------
/figs/rolling_windows.vsdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/rolling_windows.vsdx
--------------------------------------------------------------------------------
/figs/transaction cost.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/transaction cost.PNG
--------------------------------------------------------------------------------
/fundamental_portfolio.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Import packages"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 27,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import pandas as pd\n",
17 | "import numpy as np\n",
18 | "from pypfopt.efficient_frontier import EfficientFrontier\n",
19 | "from pypfopt import risk_models\n",
20 | "from pypfopt.risk_models import CovarianceShrinkage\n",
21 | "from pypfopt import expected_returns\n",
22 | "from datetime import datetime\n",
23 | "from pandas.tseries.offsets import BDay"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 28,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "import time\n",
33 | "import pickle"
34 | ]
35 | },
36 | {
37 | "cell_type": "markdown",
38 | "metadata": {},
39 | "source": [
40 | "# 1. Read Input Data"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 29,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "name": "stderr",
50 | "output_type": "stream",
51 | "text": [
52 | "/home/ubuntu/anaconda3/lib/python3.6/site-packages/numpy/lib/arraysetops.py:568: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n",
53 | " mask |= (ar1 == a)\n"
54 | ]
55 | }
56 | ],
57 | "source": [
58 | "df_price = pd.read_csv(\"Data/1-sp500_adj_price.csv\",index_col=0)"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 30,
64 | "metadata": {},
65 | "outputs": [
66 | {
67 | "data": {
68 | "text/plain": [
69 | "(6438964, 3)"
70 | ]
71 | },
72 | "execution_count": 30,
73 | "metadata": {},
74 | "output_type": "execute_result"
75 | }
76 | ],
77 | "source": [
78 | "df_price.shape"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 31,
84 | "metadata": {},
85 | "outputs": [
86 | {
87 | "data": {
88 | "text/html": [
89 | "\n",
90 | "\n",
103 | "
\n",
104 | " \n",
105 | " \n",
106 | " | \n",
107 | " datadate | \n",
108 | " tic | \n",
109 | " adj_price | \n",
110 | "
\n",
111 | " \n",
112 | " \n",
113 | " \n",
114 | " 1 | \n",
115 | " 19900102 | \n",
116 | " ADCT | \n",
117 | " 4.074244 | \n",
118 | "
\n",
119 | " \n",
120 | " 2 | \n",
121 | " 19900103 | \n",
122 | " ADCT | \n",
123 | " 4.046900 | \n",
124 | "
\n",
125 | " \n",
126 | " 3 | \n",
127 | " 19900104 | \n",
128 | " ADCT | \n",
129 | " 3.964869 | \n",
130 | "
\n",
131 | " \n",
132 | " 4 | \n",
133 | " 19900105 | \n",
134 | " ADCT | \n",
135 | " 3.992212 | \n",
136 | "
\n",
137 | " \n",
138 | " 5 | \n",
139 | " 19900108 | \n",
140 | " ADCT | \n",
141 | " 3.937525 | \n",
142 | "
\n",
143 | " \n",
144 | "
\n",
145 | "
"
146 | ],
147 | "text/plain": [
148 | " datadate tic adj_price\n",
149 | "1 19900102 ADCT 4.074244\n",
150 | "2 19900103 ADCT 4.046900\n",
151 | "3 19900104 ADCT 3.964869\n",
152 | "4 19900105 ADCT 3.992212\n",
153 | "5 19900108 ADCT 3.937525"
154 | ]
155 | },
156 | "execution_count": 31,
157 | "metadata": {},
158 | "output_type": "execute_result"
159 | }
160 | ],
161 | "source": [
162 | "df_price.head()"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 32,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "selected_stock = pd.read_csv(\"Data/2-portfolio_data/stocks_selected_total_user8.csv\")"
172 | ]
173 | },
174 | {
175 | "cell_type": "code",
176 | "execution_count": 33,
177 | "metadata": {},
178 | "outputs": [
179 | {
180 | "data": {
181 | "text/plain": [
182 | "(12932, 3)"
183 | ]
184 | },
185 | "execution_count": 33,
186 | "metadata": {},
187 | "output_type": "execute_result"
188 | }
189 | ],
190 | "source": [
191 | "selected_stock.shape"
192 | ]
193 | },
194 | {
195 | "cell_type": "code",
196 | "execution_count": 34,
197 | "metadata": {},
198 | "outputs": [
199 | {
200 | "data": {
201 | "text/html": [
202 | "\n",
203 | "\n",
216 | "
\n",
217 | " \n",
218 | " \n",
219 | " | \n",
220 | " tic | \n",
221 | " predicted_return | \n",
222 | " trade_date | \n",
223 | "
\n",
224 | " \n",
225 | " \n",
226 | " \n",
227 | " 0 | \n",
228 | " EOG | \n",
229 | " 0.033723 | \n",
230 | " 19950601 | \n",
231 | "
\n",
232 | " \n",
233 | " 1 | \n",
234 | " EQT | \n",
235 | " 0.037745 | \n",
236 | " 19950601 | \n",
237 | "
\n",
238 | " \n",
239 | " 2 | \n",
240 | " HES | \n",
241 | " 0.051450 | \n",
242 | " 19950601 | \n",
243 | "
\n",
244 | " \n",
245 | " 3 | \n",
246 | " NFX | \n",
247 | " 0.030283 | \n",
248 | " 19950601 | \n",
249 | "
\n",
250 | " \n",
251 | " 4 | \n",
252 | " OKE | \n",
253 | " 0.041020 | \n",
254 | " 19950601 | \n",
255 | "
\n",
256 | " \n",
257 | "
\n",
258 | "
"
259 | ],
260 | "text/plain": [
261 | " tic predicted_return trade_date\n",
262 | "0 EOG 0.033723 19950601\n",
263 | "1 EQT 0.037745 19950601\n",
264 | "2 HES 0.051450 19950601\n",
265 | "3 NFX 0.030283 19950601\n",
266 | "4 OKE 0.041020 19950601"
267 | ]
268 | },
269 | "execution_count": 34,
270 | "metadata": {},
271 | "output_type": "execute_result"
272 | }
273 | ],
274 | "source": [
275 | "selected_stock.head()"
276 | ]
277 | },
278 | {
279 | "cell_type": "markdown",
280 | "metadata": {},
281 | "source": [
282 | "# 2. Get trade date"
283 | ]
284 | },
285 | {
286 | "cell_type": "code",
287 | "execution_count": 35,
288 | "metadata": {},
289 | "outputs": [
290 | {
291 | "name": "stdout",
292 | "output_type": "stream",
293 | "text": [
294 | "Number of unique stocks selected: 982\n"
295 | ]
296 | }
297 | ],
298 | "source": [
299 | "print(\"Number of unique stocks selected: \", len(selected_stock.tic.unique()))"
300 | ]
301 | },
302 | {
303 | "cell_type": "code",
304 | "execution_count": null,
305 | "metadata": {},
306 | "outputs": [],
307 | "source": []
308 | },
309 | {
310 | "cell_type": "code",
311 | "execution_count": 36,
312 | "metadata": {},
313 | "outputs": [],
314 | "source": [
315 | "all_date=df_price.datadate.unique()"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 37,
321 | "metadata": {},
322 | "outputs": [
323 | {
324 | "data": {
325 | "text/plain": [
326 | "7155"
327 | ]
328 | },
329 | "execution_count": 37,
330 | "metadata": {},
331 | "output_type": "execute_result"
332 | }
333 | ],
334 | "source": [
335 | "len(all_date)"
336 | ]
337 | },
338 | {
339 | "cell_type": "code",
340 | "execution_count": 38,
341 | "metadata": {},
342 | "outputs": [],
343 | "source": [
344 | "trade_date=selected_stock.trade_date.unique()"
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": 39,
350 | "metadata": {},
351 | "outputs": [
352 | {
353 | "data": {
354 | "text/plain": [
355 | "array([19950601, 19950901, 19951201, 19960301, 19960603, 19960903,\n",
356 | " 19961202, 19970303, 19970602, 19970902, 19971201, 19980302,\n",
357 | " 19980601, 19980901, 19981201, 19990301, 19990601, 19990901,\n",
358 | " 19991201, 20000301, 20000601, 20000901, 20001201, 20010301,\n",
359 | " 20010601, 20010904, 20011203, 20020301, 20020603, 20020903,\n",
360 | " 20021202, 20030303, 20030602, 20030902, 20031201, 20040301,\n",
361 | " 20040601, 20040901, 20041201, 20050301, 20050601, 20050901,\n",
362 | " 20051201, 20060301, 20060601, 20060901, 20061201, 20070301,\n",
363 | " 20070601, 20070904, 20071203, 20080303, 20080602, 20080902,\n",
364 | " 20081201, 20090302, 20090601, 20090901, 20091201, 20100301,\n",
365 | " 20100601, 20100901, 20101201, 20110301, 20110601, 20110901,\n",
366 | " 20111201, 20120301, 20120601, 20120904, 20121203, 20130301,\n",
367 | " 20130603, 20130903, 20131202, 20140303, 20140602, 20140902,\n",
368 | " 20141201, 20150302, 20150601, 20150901, 20151201, 20160301,\n",
369 | " 20160601, 20160901, 20161201, 20170301, 20170601])"
370 | ]
371 | },
372 | "execution_count": 39,
373 | "metadata": {},
374 | "output_type": "execute_result"
375 | }
376 | ],
377 | "source": [
378 | "trade_date"
379 | ]
380 | },
381 | {
382 | "cell_type": "code",
383 | "execution_count": 40,
384 | "metadata": {},
385 | "outputs": [
386 | {
387 | "name": "stdout",
388 | "output_type": "stream",
389 | "text": [
390 | "Number of trade dates 89\n"
391 | ]
392 | }
393 | ],
394 | "source": [
395 | "print(\"Number of trade dates\", len(trade_date))"
396 | ]
397 | },
398 | {
399 | "cell_type": "markdown",
400 | "metadata": {},
401 | "source": [
402 | "# 3. Get the 1-year daily return table for each of the 89 trade periods"
403 | ]
404 | },
405 | {
406 | "cell_type": "code",
407 | "execution_count": 41,
408 | "metadata": {},
409 | "outputs": [
410 | {
411 | "data": {
412 | "text/html": [
413 | "\n",
414 | "\n",
427 | "
\n",
428 | " \n",
429 | " \n",
430 | " | \n",
431 | " tic | \n",
432 | " predicted_return | \n",
433 | " trade_date | \n",
434 | "
\n",
435 | " \n",
436 | " \n",
437 | " \n",
438 | " 0 | \n",
439 | " EOG | \n",
440 | " 0.033723 | \n",
441 | " 19950601 | \n",
442 | "
\n",
443 | " \n",
444 | " 1 | \n",
445 | " EQT | \n",
446 | " 0.037745 | \n",
447 | " 19950601 | \n",
448 | "
\n",
449 | " \n",
450 | " 2 | \n",
451 | " HES | \n",
452 | " 0.051450 | \n",
453 | " 19950601 | \n",
454 | "
\n",
455 | " \n",
456 | " 3 | \n",
457 | " NFX | \n",
458 | " 0.030283 | \n",
459 | " 19950601 | \n",
460 | "
\n",
461 | " \n",
462 | " 4 | \n",
463 | " OKE | \n",
464 | " 0.041020 | \n",
465 | " 19950601 | \n",
466 | "
\n",
467 | " \n",
468 | "
\n",
469 | "
"
470 | ],
471 | "text/plain": [
472 | " tic predicted_return trade_date\n",
473 | "0 EOG 0.033723 19950601\n",
474 | "1 EQT 0.037745 19950601\n",
475 | "2 HES 0.051450 19950601\n",
476 | "3 NFX 0.030283 19950601\n",
477 | "4 OKE 0.041020 19950601"
478 | ]
479 | },
480 | "execution_count": 41,
481 | "metadata": {},
482 | "output_type": "execute_result"
483 | }
484 | ],
485 | "source": [
486 | "selected_stock.head()"
487 | ]
488 | },
489 | {
490 | "cell_type": "code",
491 | "execution_count": 414,
492 | "metadata": {},
493 | "outputs": [
494 | {
495 | "name": "stdout",
496 | "output_type": "stream",
497 | "text": [
498 | "19950601\n",
499 | "19950901\n",
500 | "19951201\n",
501 | "19960301\n",
502 | "19960603\n",
503 | "19960903\n",
504 | "19961202\n",
505 | "19970303\n",
506 | "19970602\n",
507 | "19970902\n",
508 | "19971201\n",
509 | "19980302\n",
510 | "19980601\n",
511 | "19980901\n",
512 | "19981201\n",
513 | "19990301\n",
514 | "19990601\n",
515 | "19990901\n",
516 | "19991201\n",
517 | "20000301\n",
518 | "20000601\n",
519 | "20000901\n",
520 | "20001201\n",
521 | "20010301\n",
522 | "20010601\n",
523 | "20010904\n",
524 | "20011203\n",
525 | "20020301\n",
526 | "20020603\n",
527 | "20020903\n",
528 | "20021202\n",
529 | "20030303\n",
530 | "20030602\n",
531 | "20030902\n",
532 | "20031201\n",
533 | "20040301\n",
534 | "20040601\n",
535 | "20040901\n",
536 | "20041201\n",
537 | "20050301\n",
538 | "20050601\n",
539 | "20050901\n",
540 | "20051201\n",
541 | "20060301\n",
542 | "20060601\n",
543 | "20060901\n",
544 | "20061201\n",
545 | "20070301\n",
546 | "20070601\n",
547 | "20070904\n",
548 | "20071203\n",
549 | "20080303\n",
550 | "20080602\n",
551 | "20080902\n",
552 | "20081201\n",
553 | "20090302\n",
554 | "20090601\n",
555 | "20090901\n",
556 | "20091201\n",
557 | "20100301\n",
558 | "20100601\n",
559 | "20100901\n",
560 | "20101201\n",
561 | "20110301\n",
562 | "20110601\n",
563 | "20110901\n",
564 | "20111201\n",
565 | "20120301\n",
566 | "20120601\n",
567 | "20120904\n",
568 | "20121203\n",
569 | "20130301\n",
570 | "20130603\n",
571 | "20130903\n",
572 | "20131202\n",
573 | "20140303\n",
574 | "20140602\n",
575 | "20140902\n",
576 | "20141201\n",
577 | "20150302\n",
578 | "20150601\n",
579 | "20150901\n",
580 | "20151201\n",
581 | "20160301\n",
582 | "20160601\n",
583 | "20160901\n",
584 | "20161201\n",
585 | "20170301\n",
586 | "20170601\n",
587 | "Time consuming: 92.59127250512441 minutes\n"
588 | ]
589 | }
590 | ],
591 | "source": [
592 | "# took about 90 minutes to run\n",
593 | "start = time.time()\n",
594 | "all_return_table={}\n",
595 | "#all_predicted_return={}\n",
596 | "all_stocks_info = {}\n",
597 | "#for i in range(0,1):\n",
598 | "for i in range(len(trade_date)):\n",
599 | " #match trading date\n",
600 | " index = selected_stock.trade_date==trade_date[i]\n",
601 | " print(trade_date[i])\n",
602 | " #get the corresponding trade period's selected stocks' name\n",
603 | " stocks_name=selected_stock.tic[selected_stock.trade_date==trade_date[i]].values\n",
604 | " temp_info = selected_stock[selected_stock.trade_date==trade_date[i]]\n",
605 | " temp_info = temp_info.reset_index()\n",
606 | " del temp_info['index']\n",
607 | " all_stocks_info[trade_date[i]] = temp_info\n",
608 | " #get the corresponding trade period's selected stocks' predicted return\n",
609 | " asset_expected_return=selected_stock[index].predicted_return.values\n",
610 | " \n",
611 | " #get current trade date and calculate trade date last year, it has to be a business date\n",
612 | " last_year_tradedate=int((trade_date[i]-round(trade_date[i]/10000)*10000)+round(trade_date[i]/10000-1)*10000)\n",
613 | " convert_to_yyyymmdd=datetime.strptime(str(last_year_tradedate), '%Y%m%d').strftime('%Y-%m-%d')\n",
614 | " #determine the business date\n",
615 | " #print(convert_to_yyyymmdd)\n",
616 | " ts = pd.Timestamp(convert_to_yyyymmdd) \n",
617 | " bd = pd.tseries.offsets.BusinessDay(n =1) \n",
618 | " new_timestamp = ts - bd \n",
619 | " lastY_tradedate = int(new_timestamp.date().strftime('%Y%m%d'))\n",
620 | " get_date_index=(all_date<trade_date[i]) & (all_date>lastY_tradedate)\n",
621 | " get_date=all_date[get_date_index]\n",
622 | " #get adjusted price table\n",
623 | " return_table=pd.DataFrame()\n",
624 | " for m in range(len(stocks_name)):\n",
625 | " #get stocks's name\n",
626 | " index_tic=(df_price.tic==stocks_name[m])\n",
627 | " #get all of this stock's historical prices from sp500_price\n",
628 | " sp500_temp=df_price[index_tic]\n",
629 | " merge_left_data_table = pd.DataFrame(get_date)\n",
630 | " merge_left_data_table.columns = ['datadate']\n",
631 | " temp_price=merge_left_data_table.merge(sp500_temp, on=['datadate'], how='left')\n",
632 | " temp_price = temp_price.dropna()\n",
633 | " temp_price['daily_return']=temp_price.adj_price.pct_change()\n",
634 | "\n",
635 | " return_table=return_table.append(temp_price,ignore_index=True)\n",
636 | " all_return_table[trade_date[i]] = return_table\n",
637 | "end = time.time()\n",
638 | "print(\"Time consuming: \", (end-start)/60, \" minutes\")\n",
639 | " \n",
640 | " "
641 | ]
642 | },
643 | {
644 | "cell_type": "markdown",
645 | "metadata": {},
646 | "source": [
647 | "## Save to pickle"
648 | ]
649 | },
650 | {
651 | "cell_type": "code",
652 | "execution_count": 419,
653 | "metadata": {},
654 | "outputs": [],
655 | "source": [
656 | "#with open('Data/all_return_table.pickle', 'wb') as handle: \n",
657 | "# pickle.dump(all_return_table, handle, protocol=pickle.HIGHEST_PROTOCOL)"
658 | ]
659 | },
660 | {
661 | "cell_type": "code",
662 | "execution_count": 420,
663 | "metadata": {},
664 | "outputs": [],
665 | "source": [
666 | "#with open('Data/all_stocks_info.pickle', 'wb') as handle:\n",
667 | "# pickle.dump(all_stocks_info, handle, protocol=pickle.HIGHEST_PROTOCOL)"
668 | ]
669 | },
670 | {
671 | "cell_type": "code",
672 | "execution_count": 42,
673 | "metadata": {},
674 | "outputs": [],
675 | "source": [
676 | "#with open('Data/all_return_table.pickle', 'rb') as handle:\n",
677 | "# all_return_table = pickle.load(handle)\n",
678 | "\n",
679 | "#with open('Data/all_stocks_info.pickle', 'rb') as handle:\n",
680 | "# all_stocks_info = pickle.load(handle)\n"
681 | ]
682 | },
683 | {
684 | "cell_type": "markdown",
685 | "metadata": {},
686 | "source": [
687 | "# 4. Portfolio Optimization using pypfopt"
688 | ]
689 | },
690 | {
691 | "cell_type": "code",
692 | "execution_count": 44,
693 | "metadata": {},
694 | "outputs": [
695 | {
696 | "name": "stderr",
697 | "output_type": "stream",
698 | "text": [
699 | "/home/ubuntu/anaconda3/lib/python3.6/site-packages/pypfopt/objective_functions.py:61: RuntimeWarning: invalid value encountered in sqrt\n",
700 | " sigma = np.sqrt(np.dot(weights, np.dot(cov_matrix, weights.T)))\n"
701 | ]
702 | },
703 | {
704 | "name": "stdout",
705 | "output_type": "stream",
706 | "text": [
707 | "19950601 : Done\n",
708 | "19950901 : Done\n",
709 | "19951201 : Done\n",
710 | "19960301 : Done\n",
711 | "19960603 : Done\n",
712 | "19960903 : Done\n",
713 | "19961202 : Done\n",
714 | "19970303 : Done\n",
715 | "19970602 : Done\n",
716 | "19970902 : Done\n",
717 | "19971201 : Done\n",
718 | "19980302 : Done\n",
719 | "19980601 : Done\n",
720 | "19980901 : Done\n",
721 | "19981201 : Done\n",
722 | "19990301 : Done\n",
723 | "19990601 : Done\n",
724 | "19990901 : Done\n",
725 | "19991201 : Done\n",
726 | "20000301 : Done\n",
727 | "20000601 : Done\n",
728 | "20000901 : Done\n",
729 | "20001201 : Done\n",
730 | "20010301 : Done\n",
731 | "20010601 : Done\n",
732 | "20010904 : Done\n",
733 | "20011203 : Done\n",
734 | "20020301 : Done\n",
735 | "20020603 : Done\n",
736 | "20020903 : Done\n",
737 | "20021202 : Done\n",
738 | "20030303 : Done\n",
739 | "20030602 : Done\n",
740 | "20030902 : Done\n",
741 | "20031201 : Done\n",
742 | "20040301 : Done\n",
743 | "20040601 : Done\n",
744 | "20040901 : Done\n",
745 | "20041201 : Done\n",
746 | "20050301 : Done\n",
747 | "20050601 : Done\n",
748 | "20050901 : Done\n",
749 | "20051201 : Done\n",
750 | "20060301 : Done\n"
751 | ]
752 | },
753 | {
754 | "name": "stderr",
755 | "output_type": "stream",
756 | "text": [
757 | "/home/ubuntu/anaconda3/lib/python3.6/site-packages/pypfopt/base_optimizer.py:56: RuntimeWarning: invalid value encountered in less\n",
758 | " clean_weights[np.abs(clean_weights) < cutoff] = 0\n"
759 | ]
760 | },
761 | {
762 | "name": "stdout",
763 | "output_type": "stream",
764 | "text": [
765 | "20060601 : Done\n",
766 | "20060901 : Done\n",
767 | "20061201 : Done\n",
768 | "20070301 : Done\n",
769 | "20070601 : Done\n",
770 | "20070904 : Done\n",
771 | "20071203 : Done\n",
772 | "20080303 : Done\n",
773 | "20080602 : Done\n",
774 | "20080902 : Done\n",
775 | "20081201 : Done\n",
776 | "20090302 : Done\n",
777 | "20090601 : Done\n",
778 | "20090901 : Done\n",
779 | "20091201 : Done\n",
780 | "20100301 : Done\n",
781 | "20100601 : Done\n",
782 | "20100901 : Done\n",
783 | "20101201 : Done\n",
784 | "20110301 : Done\n",
785 | "20110601 : Done\n",
786 | "20110901 : Done\n",
787 | "20111201 : Done\n",
788 | "20120301 : Done\n",
789 | "20120601 : Done\n",
790 | "20120904 : Done\n",
791 | "20121203 : Done\n",
792 | "20130301 : Done\n",
793 | "20130603 : Done\n",
794 | "20130903 : Done\n",
795 | "20131202 : Done\n",
796 | "20140303 : Done\n",
797 | "20140602 : Done\n",
798 | "20140902 : Done\n",
799 | "20141201 : Done\n",
800 | "20150302 : Done\n",
801 | "20150601 : Done\n",
802 | "20150901 : Done\n",
803 | "20151201 : Done\n",
804 | "20160301 : Done\n",
805 | "20160601 : Done\n",
806 | "20160901 : Done\n",
807 | "20161201 : Done\n",
808 | "20170301 : Done\n",
809 | "20170601 : Done\n"
810 | ]
811 | }
812 | ],
813 | "source": [
814 | "# took under 5 minutes to run\n",
815 | "\n",
816 | "stocks_weight_table = pd.DataFrame([])\n",
817 | "\n",
818 | "for i in range(len(trade_date)):\n",
819 | " # get selected stocks information\n",
820 | " p1_alldata=(all_stocks_info[trade_date[i]])\n",
821 | " # sort it by tic\n",
822 | " p1_alldata=p1_alldata.sort_values('tic')\n",
823 | " p1_alldata = p1_alldata.reset_index()\n",
824 | " del p1_alldata['index']\n",
825 | " \n",
826 | " \n",
827 | " # get selected stocks tic\n",
828 | " p1_stock = p1_alldata.tic\n",
829 | " \n",
830 | " # get predicted return from selected stocks\n",
831 | " p1_predicted_return=p1_alldata.pivot_table(index = 'trade_date',columns = 'tic', values = 'predicted_return')\n",
832 | " # use the predicted returns as the Expected returns to feed into the portfolio object\n",
833 | " mu = p1_predicted_return.T.values\n",
834 | "\n",
835 | " # get the 1-year historical return\n",
836 | " p1_return_table=all_return_table[trade_date[i]]\n",
837 | " p1_return_table_pivot=p1_return_table.pivot_table(index = 'datadate',columns = 'tic', values = 'daily_return')\n",
838 | " # use the 1-year historical return table to calculate covariance matrix between selected stocks\n",
839 | " S = risk_models.sample_cov(p1_return_table_pivot)\n",
840 | " del S.index.name \n",
841 | " \n",
842 | " # mean variance\n",
843 | " ef_mean = EfficientFrontier(mu, S,weight_bounds=(0, 0.05))\n",
844 | " raw_weights_mean = ef_mean.max_sharpe()\n",
845 | " cleaned_weights_mean = ef_mean.clean_weights()\n",
846 | " #print(raw_weights_mean)\n",
847 | " #ef.portfolio_performance(verbose=True)\n",
848 | "\n",
849 | " # minimum variance\n",
850 | " ef_min = EfficientFrontier([0]*len(p1_stock), S,weight_bounds=(0, 0.05))\n",
851 | " raw_weights_min = ef_min.max_sharpe()\n",
852 | " cleaned_weights_min = ef_min.clean_weights()\n",
853 | " #print(cleaned_weights_min)\n",
854 | " \n",
855 | " p1_alldata['mean_weight'] = cleaned_weights_mean.values()\n",
856 | " p1_alldata['min_weight'] = cleaned_weights_min.values()\n",
857 | " \n",
858 | " #ef.portfolio_performance(verbose=True)\n",
859 | "\n",
860 | " \n",
861 | " stocks_weight_table = stocks_weight_table.append(pd.DataFrame(p1_alldata), ignore_index=True)\n",
862 | " print(trade_date[i], \": Done\")\n"
863 | ]
864 | },
865 | {
866 | "cell_type": "code",
867 | "execution_count": 45,
868 | "metadata": {},
869 | "outputs": [
870 | {
871 | "data": {
872 | "text/html": [
873 | "\n",
874 | "\n",
887 | "
\n",
888 | " \n",
889 | " \n",
890 | " | \n",
891 | " tic | \n",
892 | " predicted_return | \n",
893 | " trade_date | \n",
894 | " mean_weight | \n",
895 | " min_weight | \n",
896 | "
\n",
897 | " \n",
898 | " \n",
899 | " \n",
900 | " 0 | \n",
901 | " ACV.1 | \n",
902 | " 0.024449 | \n",
903 | " 19950601 | \n",
904 | " 0.00000 | \n",
905 | " 0.00000 | \n",
906 | "
\n",
907 | " \n",
908 | " 1 | \n",
909 | " AES | \n",
910 | " 0.096917 | \n",
911 | " 19950601 | \n",
912 | " 0.00000 | \n",
913 | " 0.00000 | \n",
914 | "
\n",
915 | " \n",
916 | " 2 | \n",
917 | " AHM.1 | \n",
918 | " 0.044516 | \n",
919 | " 19950601 | \n",
920 | " 0.01200 | \n",
921 | " 0.00522 | \n",
922 | "
\n",
923 | " \n",
924 | " 3 | \n",
925 | " AMH.1 | \n",
926 | " 0.105036 | \n",
927 | " 19950601 | \n",
928 | " 0.00000 | \n",
929 | " 0.00000 | \n",
930 | "
\n",
931 | " \n",
932 | " 4 | \n",
933 | " AMT.1 | \n",
934 | " 0.085373 | \n",
935 | " 19950601 | \n",
936 | " 0.00000 | \n",
937 | " 0.00000 | \n",
938 | "
\n",
939 | " \n",
940 | " 5 | \n",
941 | " AOS | \n",
942 | " 0.061494 | \n",
943 | " 19950601 | \n",
944 | " 0.00000 | \n",
945 | " 0.00000 | \n",
946 | "
\n",
947 | " \n",
948 | " 6 | \n",
949 | " APCC. | \n",
950 | " 0.160571 | \n",
951 | " 19950601 | \n",
952 | " 0.00872 | \n",
953 | " 0.02036 | \n",
954 | "
\n",
955 | " \n",
956 | " 7 | \n",
957 | " APH | \n",
958 | " 0.080985 | \n",
959 | " 19950601 | \n",
960 | " 0.01136 | \n",
961 | " 0.00000 | \n",
962 | "
\n",
963 | " \n",
964 | " 8 | \n",
965 | " ARG | \n",
966 | " 0.059334 | \n",
967 | " 19950601 | \n",
968 | " 0.00000 | \n",
969 | " 0.00000 | \n",
970 | "
\n",
971 | " \n",
972 | " 9 | \n",
973 | " ATI.1 | \n",
974 | " 0.170435 | \n",
975 | " 19950601 | \n",
976 | " 0.00926 | \n",
977 | " 0.00000 | \n",
978 | "
\n",
979 | " \n",
980 | " 10 | \n",
981 | " AVATQ | \n",
982 | " 0.051080 | \n",
983 | " 19950601 | \n",
984 | " 0.00020 | \n",
985 | " 0.00000 | \n",
986 | "
\n",
987 | " \n",
988 | " 11 | \n",
989 | " BAY.3 | \n",
990 | " 0.088882 | \n",
991 | " 19950601 | \n",
992 | " 0.00975 | \n",
993 | " 0.00797 | \n",
994 | "
\n",
995 | " \n",
996 | " 12 | \n",
997 | " BBBY | \n",
998 | " 0.108766 | \n",
999 | " 19950601 | \n",
1000 | " 0.00374 | \n",
1001 | " 0.05000 | \n",
1002 | "
\n",
1003 | " \n",
1004 | " 13 | \n",
1005 | " BBY | \n",
1006 | " 0.184360 | \n",
1007 | " 19950601 | \n",
1008 | " 0.01155 | \n",
1009 | " 0.05000 | \n",
1010 | "
\n",
1011 | " \n",
1012 | " 14 | \n",
1013 | " BEV | \n",
1014 | " 0.064850 | \n",
1015 | " 19950601 | \n",
1016 | " 0.00000 | \n",
1017 | " 0.00000 | \n",
1018 | "
\n",
1019 | " \n",
1020 | " 15 | \n",
1021 | " BF.B | \n",
1022 | " 0.036150 | \n",
1023 | " 19950601 | \n",
1024 | " 0.02235 | \n",
1025 | " 0.00000 | \n",
1026 | "
\n",
1027 | " \n",
1028 | " 16 | \n",
1029 | " BGEN | \n",
1030 | " 0.057776 | \n",
1031 | " 19950601 | \n",
1032 | " 0.01425 | \n",
1033 | " 0.05000 | \n",
1034 | "
\n",
1035 | " \n",
1036 | " 17 | \n",
1037 | " BGG | \n",
1038 | " 0.066459 | \n",
1039 | " 19950601 | \n",
1040 | " 0.00000 | \n",
1041 | " 0.05000 | \n",
1042 | "
\n",
1043 | " \n",
1044 | " 18 | \n",
1045 | " BIIB | \n",
1046 | " 0.081258 | \n",
1047 | " 19950601 | \n",
1048 | " 0.01236 | \n",
1049 | " 0.00000 | \n",
1050 | "
\n",
1051 | " \n",
1052 | " 19 | \n",
1053 | " BLL | \n",
1054 | " 0.039363 | \n",
1055 | " 19950601 | \n",
1056 | " 0.03366 | \n",
1057 | " 0.00000 | \n",
1058 | "
\n",
1059 | " \n",
1060 | "
\n",
1061 | "
"
1062 | ],
1063 | "text/plain": [
1064 | " tic predicted_return trade_date mean_weight min_weight\n",
1065 | "0 ACV.1 0.024449 19950601 0.00000 0.00000\n",
1066 | "1 AES 0.096917 19950601 0.00000 0.00000\n",
1067 | "2 AHM.1 0.044516 19950601 0.01200 0.00522\n",
1068 | "3 AMH.1 0.105036 19950601 0.00000 0.00000\n",
1069 | "4 AMT.1 0.085373 19950601 0.00000 0.00000\n",
1070 | "5 AOS 0.061494 19950601 0.00000 0.00000\n",
1071 | "6 APCC. 0.160571 19950601 0.00872 0.02036\n",
1072 | "7 APH 0.080985 19950601 0.01136 0.00000\n",
1073 | "8 ARG 0.059334 19950601 0.00000 0.00000\n",
1074 | "9 ATI.1 0.170435 19950601 0.00926 0.00000\n",
1075 | "10 AVATQ 0.051080 19950601 0.00020 0.00000\n",
1076 | "11 BAY.3 0.088882 19950601 0.00975 0.00797\n",
1077 | "12 BBBY 0.108766 19950601 0.00374 0.05000\n",
1078 | "13 BBY 0.184360 19950601 0.01155 0.05000\n",
1079 | "14 BEV 0.064850 19950601 0.00000 0.00000\n",
1080 | "15 BF.B 0.036150 19950601 0.02235 0.00000\n",
1081 | "16 BGEN 0.057776 19950601 0.01425 0.05000\n",
1082 | "17 BGG 0.066459 19950601 0.00000 0.05000\n",
1083 | "18 BIIB 0.081258 19950601 0.01236 0.00000\n",
1084 | "19 BLL 0.039363 19950601 0.03366 0.00000"
1085 | ]
1086 | },
1087 | "execution_count": 45,
1088 | "metadata": {},
1089 | "output_type": "execute_result"
1090 | }
1091 | ],
1092 | "source": [
1093 | "stocks_weight_table.head(20)\n"
1094 | ]
1095 | },
1096 | {
1097 | "cell_type": "code",
1098 | "execution_count": 46,
1099 | "metadata": {},
1100 | "outputs": [
1101 | {
1102 | "data": {
1103 | "text/plain": [
1104 | "(12932, 5)"
1105 | ]
1106 | },
1107 | "execution_count": 46,
1108 | "metadata": {},
1109 | "output_type": "execute_result"
1110 | }
1111 | ],
1112 | "source": [
1113 | "stocks_weight_table.shape"
1114 | ]
1115 | },
1116 | {
1117 | "cell_type": "markdown",
1118 | "metadata": {},
1119 | "source": [
1120 | "## save to excel or csv"
1121 | ]
1122 | },
1123 | {
1124 | "cell_type": "code",
1125 | "execution_count": 47,
1126 | "metadata": {},
1127 | "outputs": [],
1128 | "source": [
1129 | "stocks_weight_table.to_excel('Data/stocks_weight_table.xlsx','Sheet1')\n"
1130 | ]
1131 | },
1132 | {
1133 | "cell_type": "code",
1134 | "execution_count": null,
1135 | "metadata": {},
1136 | "outputs": [],
1137 | "source": []
1138 | },
1139 | {
1140 | "cell_type": "code",
1141 | "execution_count": null,
1142 | "metadata": {},
1143 | "outputs": [],
1144 | "source": []
1145 | },
1146 | {
1147 | "cell_type": "code",
1148 | "execution_count": null,
1149 | "metadata": {},
1150 | "outputs": [],
1151 | "source": []
1152 | },
1153 | {
1154 | "cell_type": "code",
1155 | "execution_count": null,
1156 | "metadata": {},
1157 | "outputs": [],
1158 | "source": []
1159 | }
1160 | ],
1161 | "metadata": {
1162 | "kernelspec": {
1163 | "display_name": "Python 3",
1164 | "language": "python",
1165 | "name": "python3"
1166 | },
1167 | "language_info": {
1168 | "codemirror_mode": {
1169 | "name": "ipython",
1170 | "version": 3
1171 | },
1172 | "file_extension": ".py",
1173 | "mimetype": "text/x-python",
1174 | "name": "python",
1175 | "nbconvert_exporter": "python",
1176 | "pygments_lexer": "ipython3",
1177 | "version": "3.6.5"
1178 | }
1179 | },
1180 | "nbformat": 4,
1181 | "nbformat_minor": 2
1182 | }
1183 |
--------------------------------------------------------------------------------
/fundamental_run_model.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | warnings.filterwarnings("ignore")
3 |
4 | import pandas as pd
5 | import numpy as np
6 | import time
7 | import traceback
8 | import sys
9 | sys.path.append('code')
10 | import ml_model
11 |
12 |
13 |
14 |
def build_arg_parser():
    """Build the command-line parser for a single-sector model run.

    Required options:
        -sector_name / --sector_name_input : sector name, e.g. ``sector10``.
        -fundamental / --fundamental_input : path of the fundamental table.
        -sector / --sector_input           : path of the individual sector file.

    Optional options (defaults in parentheses):
        -first_trade_index (20), -testing_window (4),
        -label_column ('y_return'), -date_column ('tradedate'),
        -tic_column ('tic'), -no_feature_column_names (list of non-feature
        column names; one or more space-separated names on the command line).

    Returns:
        argparse.ArgumentParser: the configured parser.
    """
    # Local import: the original script imported argparse inside the main
    # guard, so it is not guaranteed to exist at module level.
    import argparse

    parser = argparse.ArgumentParser()

    # sector name
    parser.add_argument('-sector_name', '--sector_name_input', type=str, required=True,
                        help='sector name: i.e. sector10')

    # file name
    parser.add_argument('-fundamental', '--fundamental_input', type=str, required=True,
                        help='inputfile name for fundamental table')
    parser.add_argument('-sector', '--sector_input', type=str, required=True,
                        help='inputfile name for individual sector')

    # rolling window variables
    parser.add_argument("-first_trade_index", default=20, type=int)
    parser.add_argument("-testing_window", default=4, type=int)

    # column name
    parser.add_argument("-label_column", default='y_return', type=str)
    parser.add_argument("-date_column", default='tradedate', type=str)
    parser.add_argument("-tic_column", default='tic', type=str)
    # nargs='+' collects each name as one list element. The previous
    # ``type=list`` split a single command-line value into characters.
    parser.add_argument("-no_feature_column_names",
                        default=['gvkey', 'tic', 'datadate', 'rdq', 'tradedate', 'fyearq', 'fqtr',
                                 'conm', 'datacqtr', 'datafqtr', 'gsector', 'y_return'],
                        nargs='+', type=str,
                        help='column names that are not fundamental features')
    return parser


if __name__ == '__main__':
    args = build_arg_parser().parse_args()

    # load fundamental table
    inputfile_fundamental = args.fundamental_input

    fundamental_total = pd.read_excel(inputfile_fundamental)
    # keep only rows whose tradedate is before 20170901
    fundamental_total = fundamental_total[fundamental_total['tradedate'] < 20170901]
    # get all unique quarterly date
    unique_datetime = sorted(fundamental_total.tradedate.unique())

    # load sector data
    inputfile_sector = args.sector_input
    sector_data = pd.read_excel(inputfile_sector)

    # get sector unique ticker
    unique_ticker = sorted(sector_data.tic.unique())

    # set rolling window
    # train: 4 years = 16 quarters
    # test: 1 year = 4 quarters
    # so first trade date = #20 quarter
    # first trade date is 1995-06-01
    first_trade_date_index = args.first_trade_index

    # testing window
    testing_windows = args.testing_window

    # get all backtesting period trade dates
    trade_date = unique_datetime[first_trade_date_index:]

    # variable column name
    label_column = args.label_column
    date_column = args.date_column
    tic_column = args.tic_column

    # features column: different base on sectors
    no_feature_column_names = args.no_feature_column_names
    features_column = [x for x in sector_data.columns.values if x not in no_feature_column_names]

    # sector name
    sector_name = args.sector_name_input

    try:
        start = time.time()
        model_result = ml_model.run_4model(sector_data,
                                           features_column,
                                           label_column,
                                           date_column,
                                           tic_column,
                                           unique_ticker,
                                           unique_datetime,
                                           trade_date,
                                           first_trade_date_index,
                                           testing_windows)
        end = time.time()
        print('Time Spent: ', (end - start) / 60, ' minutes')
        ml_model.save_model_result(model_result, sector_name)

    except Exception as e:
        # ``except e:`` referenced an unbound name and raised NameError,
        # hiding the real failure. Report the error and its traceback instead.
        print(e)
        traceback.print_exc()
98 |
99 |
100 |
101 | # python3 fundamental_run_model.py -sector_name sector10 -fundamental Data/fundamental_final_table.xlsx -sector Data/1-focasting_data/sector10_clean.xlsx
102 |
--------------------------------------------------------------------------------