├── .DS_Store ├── .gitattributes ├── Data ├── 1-focasting_data │ ├── .DS_Store │ ├── sector10_clean.xlsx │ ├── sector15_clean.xlsx │ ├── sector20_clean.xlsx │ ├── sector25_clean.xlsx │ ├── sector30_clean.xlsx │ ├── sector35_clean.xlsx │ ├── sector40_clean.xlsx │ ├── sector45_clean.xlsx │ ├── sector50_clean.xlsx │ ├── sector55_clean.xlsx │ └── sector60_clean.xlsx ├── 1-sp500_adj_price.csv.zip ├── 1-spx_price.xlsx ├── 2-portfolio_data │ ├── .DS_Store │ ├── equally_weighted_user8.xlsx │ ├── mean_weighted_user8.xlsx │ ├── minimum_weighted_user8.xlsx │ └── stocks_selected_total_user8.csv ├── all_return_table.pickle ├── all_stocks_info.pickle ├── fundamental_final_table.xlsx └── stocks_weight_table.xlsx ├── README.md ├── code ├── .DS_Store ├── ml_model.py └── old_Rcode │ ├── .DS_Store │ ├── fundamental_ML_model.R │ ├── fundamental_run_model.R │ └── fundamental_select_stock.R ├── figs ├── chart10_insample.PNG ├── chart11_overallPerformance.PNG ├── chart1_datasetPeriod.PNG ├── chart2_rolling_windows.PNG ├── chart3_modelError.PNG ├── chart4_predictedReturn1.PNG ├── chart4_predictedReturn2.PNG ├── chart5_coefficient.PNG ├── chart6_selectedStocks.PNG ├── chart7_efficient1.PNG ├── chart8_PnL.png ├── chart9_TotalValue.png ├── dataperiod.png ├── efficient1.jpg ├── pnl1.jpg ├── rolling_windows.vsdx └── transaction cost.PNG ├── fundamental_back_testing.ipynb ├── fundamental_portfolio.ipynb └── fundamental_run_model.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/.DS_Store -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 
-------------------------------------------------------------------------------- /Data/1-focasting_data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/.DS_Store -------------------------------------------------------------------------------- /Data/1-focasting_data/sector10_clean.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector10_clean.xlsx -------------------------------------------------------------------------------- /Data/1-focasting_data/sector15_clean.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector15_clean.xlsx -------------------------------------------------------------------------------- /Data/1-focasting_data/sector20_clean.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector20_clean.xlsx -------------------------------------------------------------------------------- /Data/1-focasting_data/sector25_clean.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector25_clean.xlsx -------------------------------------------------------------------------------- /Data/1-focasting_data/sector30_clean.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector30_clean.xlsx -------------------------------------------------------------------------------- /Data/1-focasting_data/sector35_clean.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector35_clean.xlsx -------------------------------------------------------------------------------- /Data/1-focasting_data/sector40_clean.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector40_clean.xlsx -------------------------------------------------------------------------------- /Data/1-focasting_data/sector45_clean.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector45_clean.xlsx -------------------------------------------------------------------------------- /Data/1-focasting_data/sector50_clean.xlsx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector50_clean.xlsx -------------------------------------------------------------------------------- /Data/1-focasting_data/sector55_clean.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector55_clean.xlsx -------------------------------------------------------------------------------- /Data/1-focasting_data/sector60_clean.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector60_clean.xlsx -------------------------------------------------------------------------------- /Data/1-sp500_adj_price.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-sp500_adj_price.csv.zip -------------------------------------------------------------------------------- /Data/1-spx_price.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-spx_price.xlsx -------------------------------------------------------------------------------- /Data/2-portfolio_data/.DS_Store: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/2-portfolio_data/.DS_Store -------------------------------------------------------------------------------- /Data/2-portfolio_data/equally_weighted_user8.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/2-portfolio_data/equally_weighted_user8.xlsx -------------------------------------------------------------------------------- /Data/2-portfolio_data/mean_weighted_user8.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/2-portfolio_data/mean_weighted_user8.xlsx -------------------------------------------------------------------------------- /Data/2-portfolio_data/minimum_weighted_user8.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/2-portfolio_data/minimum_weighted_user8.xlsx -------------------------------------------------------------------------------- /Data/all_return_table.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/all_return_table.pickle 
-------------------------------------------------------------------------------- /Data/all_stocks_info.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/all_stocks_info.pickle -------------------------------------------------------------------------------- /Data/fundamental_final_table.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/fundamental_final_table.xlsx -------------------------------------------------------------------------------- /Data/stocks_weight_table.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/stocks_weight_table.xlsx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dynamic-Stock-Recommendation-Machine_Learning 2 | 3 | ## First Author: Published paper on IEEE TrustCom 2018 (http://www.cloud-conf.net/trustcom18/) 4 | Hongyang Yang, Xiao-Yang Liu, Qingwei W. [A Practical Machine Learning Approach for Dynamic Stock Recommendation](https://ssrn.com/abstract=3302088). IEEE TrustCom 2018. 5 | 6 | ### IEEE Official Link of the paper (https://ieeexplore.ieee.org/abstract/document/8456121) 7 | ### SSRN Version: (https://ssrn.com/abstract=3302088) 8 | 9 | ## Abstract: 10 | Stock recommendation is vital to investment companies and investors. 
However, no single stock selection strategy will always win while analysts may not have enough time to check all S&P 500 stocks (the Standard & Poor’s 500). In this paper, we propose a practical scheme that recommends stocks from S&P 500 using machine learning. Our basic idea is to buy and hold the top 20% stocks dynamically. First, we select representative stock indicators with good explanatory power. Secondly, we take five frequently used machine learning methods, including linear regression, ridge regression, stepwise regression, random forest and generalized boosted regression, to model stock indicators and quarterly log-return in a rolling window. Thirdly, we choose the model with the lowest Mean Square Error in each period to rank stocks. Finally, we test the selected stocks by conducting portfolio allocation methods such as equally weighted, mean-variance, and minimum-variance. Our empirical results show that the proposed scheme outperforms the long-only strategy on the S&P 500 index in terms of Sharpe ratio and cumulative returns. 11 | 12 | ## Index Terms: 13 | Stock recommendation, fundamental value investing, machine learning, model selection, risk management 14 | 15 | ## Project summary: 16 | + We developed a practical approach that uses machine-learning methods to select S&P 500 stocks based on financial ratios (e.g., EPS, ROA, ROE). It outperformed the S&P 500 index on out-of-sample data, achieving a Sharpe ratio of 0.5 (0.19 on SPX). 17 | + We performed feature selection by 11 GICS sectors based on a rolling window to choose the lowest MSE model among Linear Regression, Stepwise Regression, Regression with Ridge, Random Forest, and GBM. Applied a model ensemble method. 
18 | 19 | 20 | 21 | 22 | 23 | ## Data: 24 | Retrieved from __WRDS (Wharton Research Data Services)__, Compustat Industrial [27 years daily and quarterly Data] 25 | 26 | 27 | 28 | 29 | + __S&P 500 Fundamental Quarterly Data__ ([fundamental_final_table.xlsx](Data/fundamental_final_table.xlsx)) 30 | + Database: Compustat North America (Fundamentals Quarterly) and (Index Constituents) 31 | + Timeline: 27 years (1990-2017) 32 | + Tickers: 1193 stocks (all historical S&P 500 component stocks) 33 | + Value: 20 financial ratios calculated from raw accounting report data 34 | 35 | + __S&P 500 Historical Component Stocks Adjusted Daily Price__ ([1-sp500_adj_price.csv.zip](Data/1-sp500_adj_price.csv.zip)) 36 | + Database: Compustat North America (Security Daily) 37 | + Timeline: 27 years (1990-2017) 38 | + Tickers: 1193 stocks (all historical S&P 500 component stocks) 39 | + Value: Adjusted Daily Close Price 40 | 41 | + __S&P 500 Index Daily Price__ ([1-spx_price.xlsx](Data/1-spx_price.xlsx)) 42 | + Database: Yahoo Finance 43 | + Timeline: 27 years (1990-2017) 44 | + Tickers: SPX 45 | + Value: Adjusted Daily Close Price 46 | 47 | ## Code: 48 | 49 | ### __Forecasting Model__: 50 | + __Input__: 11 Excel files of cleaned data about fundamental financial ratios (sector 10-Energy, sector 15-Materials, sector 20-Industrials, sector 25-Consumer Discretionary, sector 30-Consumer Staples, sector 35-Health Care, sector 40-Financials, sector 45-Information Technology, sector 50-Telecommunication Services, sector 55-Utilities, sector 60-Real Estate) 51 | + __Python Script__: 2 Scripts 52 | + [ml_model.py](code/ml_model.py): The forecasting function (cornerstone of this project) 53 | + [fundamental_run_model.py](fundamental_run_model.py): The main function to run the forecasting model 54 | ```shell 55 | 56 | python3 fundamental_run_model.py \ 57 | -sector_name sector10 \ 58 | -fundamental Data/fundamental_final_table.xlsx \ 59 | -sector Data/1-focasting_data/sector10_clean.xlsx 60 | ``` 61 | 
62 | 63 | + __Old R Script__: 3 R Scripts 64 | + [fundamental_run_model.R](code/old_Rcode/fundamental_run_model.R): The main function to run the forecasting model 65 | + [fundamental_ML_model.R](code/old_Rcode/fundamental_ML_model.R): The forecasting function (cornerstone of this project) 66 | + [fundamental_select_stock.R](code/old_Rcode/fundamental_select_stock.R): The function to select top 20% stocks in each sector 67 | + __Output__: [a CSV file](Data/2-portfolio_data/stocks_selected_total_user8.csv) that includes __tic__: the stock name, __predicted_return__: predicted return of next quarter by our model, __trade_date__: the date to execute the trades 68 | 69 | 70 | 71 | 72 | 73 | ### __Portfolio Allocation__: 74 | 75 | + __Input__: 2 files 76 | + The [CSV file](Data/2-portfolio_data/stocks_selected_total_user8.csv) generated by forecasting model 77 | + The [adjusted close price data of S&P 500 stocks](Data/1-sp500_adj_price.csv.zip) to calculate covariance matrix 78 | 79 | + __Script__: [fundamental_portfolio.ipynb](fundamental_portfolio.ipynb) 80 | 81 | + __Output__: 3 Excel files each with the following 4 columns 82 | 1. __tic__: the stock name 83 | 2. __predicted_return__: predicted return of next quarter by our model 84 | 3. __weights__: the weights to trade 85 | 4. 
__trade_date__: the date to execute the trades 86 | 87 | 88 | 89 | ### __Back-testing Model__: 90 | 91 | + __Input__: 5 files 92 | + [equally_weighted](Data/2-portfolio_data/equally_weighted_user8.xlsx): equally-weighted portfolio (Portfolio Benchmark) 93 | + [mean_weighted](Data/2-portfolio_data/mean_weighted_user8.xlsx): mean-variance portfolio 94 | + [minimum_weighted](Data/2-portfolio_data/minimum_weighted_user8.xlsx): minimum-variance portfolio (our model) 95 | + [adjusted daily close price of S&P 500 stocks](Data/1-sp500_adj_price.csv.zip): to calculate quarterly return 96 | + [SPX adjusted daily close price](Data/1-spx_price.xlsx): The Market Index (Overall Benchmark) 97 | 98 | + __Script__: 1 Python Jupyter notebook Script 99 | + [fundamental_back_testing.ipynb](fundamental_back_testing.ipynb): The back-testing function 100 | 101 | + __Output__: 102 | 1. Quarterly return of our portfolio with transaction cost 103 | 2. Performance Evaluation: total return, annualized return and standard deviation, maximum drawdown, Sharpe ratio 104 | -------------------------------------------------------------------------------- /code/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/code/.DS_Store -------------------------------------------------------------------------------- /code/ml_model.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import traceback 4 | 5 | from sklearn.model_selection import train_test_split 6 | from sklearn.svm import SVC 7 | from sklearn.svm import SVR 8 | from sklearn.metrics import confusion_matrix 9 | from sklearn.model_selection import cross_val_score, cross_val_predict 10 | from sklearn.linear_model import Ridge 11 | 12 | from sklearn.linear_model 
import LinearRegression 13 | from sklearn.feature_selection import RFE 14 | from sklearn.linear_model import Lasso 15 | from sklearn.ensemble import RandomForestRegressor 16 | from sklearn.ensemble import GradientBoostingRegressor 17 | from sklearn.ensemble import AdaBoostRegressor 18 | 19 | from sklearn.model_selection import TimeSeriesSplit, GridSearchCV,RandomizedSearchCV 20 | 21 | from keras.models import Sequential 22 | from keras.layers import Dense 23 | from keras.layers import LSTM 24 | from keras.layers import Dropout 25 | 26 | import os 27 | import errno 28 | 29 | 30 | def prepare_rolling_train(df,features_column,label_column,date_column,unique_datetime,testing_windows,first_trade_date_index, max_rolling_window_index,current_index): 31 | if current_index <=max_rolling_window_index: 32 | train=df[(df[date_column] >= unique_datetime[0]) \ 33 | & (df[date_column] < unique_datetime[current_index-testing_windows])] 34 | else: 35 | train=df[(df[date_column] >= unique_datetime[current_index-max_rolling_window_index]) \ 36 | & (df[date_column] < unique_datetime[current_index-testing_windows])] 37 | 38 | X_train=train[features_column] 39 | y_train=train[label_column] 40 | return X_train,y_train 41 | 42 | def prepare_rolling_test(df,features_column,label_column,date_column,unique_datetime,testing_windows,fist_trade_date_index, current_index): 43 | test=df[(df[date_column] >= unique_datetime[current_index-testing_windows]) \ 44 | & (df[date_column] < unique_datetime[current_index])] 45 | X_test=test[features_column] 46 | y_test=test[label_column] 47 | return X_test,y_test 48 | 49 | def prepare_trade_data(df,features_column,label_column,date_column,tic_column,unique_datetime,testing_windows,fist_trade_date_index, current_index): 50 | trade = df[df[date_column] == unique_datetime[current_index]] 51 | X_trade = trade[features_column] 52 | y_trade = trade[label_column] 53 | trade_tic = trade[tic_column].values 54 | return X_trade,y_trade,trade_tic 55 | 56 | 57 | def 
train_linear_regression(X_train,y_train): 58 | 59 | lr_regressor = LinearRegression() 60 | model = lr_regressor.fit(X_train, y_train) 61 | 62 | return model 63 | 64 | def train_recursive_feature_elimination(X_train,y_train): 65 | 66 | lr_regressor = LinearRegression(random_state = 42) 67 | model = RFE(lr_regressor) 68 | 69 | return model 70 | 71 | def train_lasso(X_train, y_train): 72 | # lasso_regressor = Lasso() 73 | # model = lasso_regressor.fit(X_train, y_train) 74 | 75 | lasso = Lasso(random_state = 42) 76 | # scoring_method = 'r2' 77 | # scoring_method = 'explained_variance' 78 | # scoring_method = 'neg_mean_absolute_error' 79 | scoring_method = 'neg_mean_squared_error' 80 | #scoring_method = 'neg_mean_squared_log_error' 81 | parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]} 82 | # my_cv_lasso = TimeSeriesSplit(n_splits=3).split(X_train_advanced) 83 | lasso_regressor = GridSearchCV(lasso, parameters, scoring=scoring_method, cv=3) 84 | lasso_regressor.fit(X_train, y_train) 85 | 86 | model = lasso_regressor.best_estimator_ 87 | return model 88 | 89 | def train_ridge(X_train, y_train): 90 | # lasso_regressor = Lasso() 91 | # model = lasso_regressor.fit(X_train, y_train) 92 | 93 | ridge = Ridge(random_state = 42) 94 | # scoring_method = 'r2' 95 | # scoring_method = 'explained_variance' 96 | # scoring_method = 'neg_mean_absolute_error' 97 | scoring_method = 'neg_mean_squared_error' 98 | #scoring_method = 'neg_mean_squared_log_error' 99 | parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]} 100 | # my_cv_lasso = TimeSeriesSplit(n_splits=3).split(X_train_advanced) 101 | ridge_regressor = GridSearchCV(ridge, parameters, scoring=scoring_method, cv=3) 102 | ridge_regressor.fit(X_train, y_train) 103 | 104 | model = ridge_regressor.best_estimator_ 105 | return model 106 | 107 | def train_random_forest(X_train, y_train): 108 | 109 | random_grid = { 110 | #'max_depth': [10, 20, 40, 80, 100, None], 111 | 'max_features': 
['sqrt'], 112 | 'min_samples_leaf': [0.05,0.1,0.2], 113 | 'min_samples_split': np.linspace(0.1, 1, 10, endpoint=True), 114 | 'n_estimators': [75,100,200]} 115 | # scoring_method = 'r2' 116 | # scoring_method = 'explained_variance' 117 | # scoring_method = 'neg_mean_absolute_error' 118 | scoring_method = 'neg_mean_squared_error' 119 | #scoring_method = 'neg_mean_squared_log_error' 120 | 121 | # my_cv_rf = TimeSeriesSplit(n_splits=5).split(X_train_rf) 122 | rf = RandomForestRegressor(random_state=42) 123 | #RandomizedSearchCV 124 | #randomforest_regressor = RandomizedSearchCV(estimator=rf, 125 | # param_distributions=random_grid, 126 | # n_iter = 100, 127 | # cv=3, 128 | # n_jobs=-1, 129 | # scoring=scoring_method, 130 | # verbose=0) 131 | #GridSearchCV 132 | randomforest_regressor = GridSearchCV(estimator=rf, 133 | param_grid=random_grid, 134 | cv=3, 135 | n_jobs=-1, 136 | scoring=scoring_method, 137 | verbose=0) 138 | 139 | randomforest_regressor.fit(X_train, y_train) 140 | #print(randomforest_regressor.best_params_ ) 141 | model = randomforest_regressor.best_estimator_ 142 | ''' 143 | randomforest_regressor = RandomForestRegressor(random_state = 42,n_estimators = 400, max_features='auto') 144 | #randomforest_regressor = RandomForestRegressor(random_state = 42,n_estimators = 300) 145 | 146 | model = randomforest_regressor.fit(X_train, y_train) 147 | ''' 148 | return model 149 | 150 | 151 | def train_svm(X_train, y_train): 152 | svr = SVR(kernel = 'rbf') 153 | 154 | param_grid_svm = {'C':[0.001, 0.1, 1],'gamma': [1e-7,0.1]} 155 | #param_grid_svm = {'kernel': ('linear', 'rbf','poly'), 'C':[0.001, 0.01, 0.1, 1, 10],'gamma': [1e-7, 1e-4,0.001,0.1],'epsilon':[0.1,0.2,0.5,0.3]} 156 | 157 | # scoring_method = 'r2' 158 | # scoring_method = 'explained_variance' 159 | # scoring_method = 'neg_mean_absolute_error' 160 | scoring_method = 'neg_mean_squared_error' 161 | #scoring_method = 'neg_mean_squared_log_error' 162 | 163 | svm_regressor = GridSearchCV(estimator=svr, 
param_grid =param_grid_svm, cv=3, n_jobs=-1, scoring=scoring_method, verbose=0) 164 | 165 | svm_regressor.fit(X_train, y_train) 166 | model = svm_regressor.best_estimator_ 167 | #estimator = svm_regressor.best_estimator_ 168 | #selector = RFE(estimator, 5, step=1) 169 | #model = selector.fit(X, y) 170 | 171 | return model 172 | 173 | 174 | def train_gbm(X_train, y_train): 175 | gbm = GradientBoostingRegressor(random_state = 42) 176 | # model = gbm.fit(X_train, y_train) 177 | 178 | param_grid_gbm = {'learning_rate': [0.1, 0.01, 0.001], 'n_estimators': [100, 250, 500,1000]} 179 | # scoring_method = 'r2' 180 | # scoring_method = 'explained_variance' 181 | # scoring_method = 'neg_mean_absolute_error' 182 | scoring_method = 'neg_mean_squared_error' 183 | #scoring_method = 'neg_mean_squared_log_error' 184 | gbm_regressor = GridSearchCV(estimator=gbm, param_grid=param_grid_gbm, 185 | cv=3, n_jobs=-1, scoring=scoring_method, verbose=0) 186 | 187 | gbm_regressor.fit(X_train, y_train) 188 | model = gbm_regressor.best_estimator_ 189 | ''' 190 | 191 | gbm_regressor = GradientBoostingRegressor() 192 | model = gbm_regressor.fit(X_train, y_train) 193 | ''' 194 | return model 195 | 196 | 197 | 198 | 199 | def train_ada(X_train, y_train): 200 | ada = AdaBoostRegressor() 201 | 202 | # model = ada.fit(X_train, y_train) 203 | 204 | param_grid_ada = {'n_estimators': [20, 100], 205 | 'learning_rate': [0.01, 0.05, 1]} 206 | # scoring_method = 'r2' 207 | # scoring_method = 'explained_variance' 208 | # scoring_method = 'neg_mean_absolute_error' 209 | # scoring_method = 'neg_mean_squared_error' 210 | #scoring_method = 'neg_mean_squared_log_error' 211 | 212 | ada_regressor = GridSearchCV(estimator=ada, param_distributions=param_grid_ada, 213 | cv=3, n_jobs=-1, scoring=scoring_method, verbose=0) 214 | 215 | ada_regressor.fit(X_train, y_train) 216 | model = ada_regressor.best_estimator_ 217 | ''' 218 | ada_regressor = AdaBoostRegressor() 219 | model = ada_regressor.fit(X_train, y_train) 220 | 
''' 221 | return model 222 | 223 | 224 | def evaluate_model(model, X_test, y_test): 225 | from sklearn.metrics import mean_squared_error 226 | #from sklearn.metrics import mean_squared_log_error 227 | 228 | from sklearn.metrics import mean_absolute_error 229 | from sklearn.metrics import explained_variance_score 230 | from sklearn.metrics import r2_score 231 | y_predict = model.predict(X_test) 232 | 233 | mae = mean_absolute_error(y_test, y_predict) 234 | 235 | 236 | mse = mean_squared_error(y_test, y_predict) 237 | #msle = mean_squared_log_error(y_test, y_predict) 238 | 239 | explained_variance = explained_variance_score(y_test, y_predict) 240 | r2 = r2_score(y_test, y_predict) 241 | 242 | return mse 243 | 244 | 245 | def append_return_table(df_predict, unique_datetime, y_trade_return, trade_tic, current_index): 246 | tmp_table = pd.DataFrame(columns=trade_tic) 247 | tmp_table = tmp_table.append(pd.Series(y_trade_return, index=trade_tic), ignore_index=True) 248 | df_predict.loc[unique_datetime[current_index]][tmp_table.columns] = tmp_table.loc[0] 249 | 250 | 251 | def run_4model(df,features_column, label_column,date_column,tic_column, 252 | unique_ticker, unique_datetime, trade_date, 253 | first_trade_date_index=20, 254 | testing_windows=4, 255 | max_rolling_window_index=44): 256 | ## initialize all the result tables 257 | ## need date as index and unique tic name as columns 258 | df_predict_lr = pd.DataFrame(columns=unique_ticker, index=trade_date) 259 | df_predict_rf = pd.DataFrame(columns=unique_ticker, index=trade_date) 260 | df_predict_ridge = pd.DataFrame(columns=unique_ticker, index=trade_date) 261 | df_predict_gbm = pd.DataFrame(columns=unique_ticker, index=trade_date) 262 | 263 | df_predict_best = pd.DataFrame(columns=unique_ticker, index=trade_date) 264 | df_best_model_name = pd.DataFrame(columns=['model_name'], index=trade_date) 265 | evaluation_record = {} 266 | # first trade date is 1995-06-01 267 | # fist_trade_date_index = 20 268 | # testing_windows 
= 6 269 | 270 | for i in range(first_trade_date_index, len(unique_datetime)): 271 | try: 272 | # prepare training data 273 | X_train, y_train = prepare_rolling_train(df, 274 | features_column, 275 | label_column, 276 | date_column, 277 | unique_datetime, 278 | testing_windows, 279 | first_trade_date_index, 280 | max_rolling_window_index, 281 | current_index=i 282 | ) 283 | 284 | # prepare testing data 285 | X_test, y_test = prepare_rolling_test(df, 286 | features_column, 287 | label_column, 288 | date_column, 289 | unique_datetime, 290 | testing_windows, 291 | first_trade_date_index, 292 | current_index=i) 293 | 294 | # prepare trade data 295 | X_trade, y_trade, trade_tic = prepare_trade_data(df, 296 | features_column, 297 | label_column, 298 | date_column, 299 | tic_column, 300 | unique_datetime, 301 | testing_windows, 302 | first_trade_date_index, 303 | current_index=i) 304 | 305 | # Training 306 | lr_model = train_linear_regression(X_train, y_train) 307 | rf_model = train_random_forest(X_train, y_train) 308 | ridge_model = train_ridge(X_train, y_train) 309 | gbm_model = train_gbm(X_train, y_train) 310 | 311 | 312 | # Validation 313 | lr_eval = evaluate_model(lr_model, X_test, y_test) 314 | rf_eval = evaluate_model(rf_model, X_test, y_test) 315 | ridge_eval = evaluate_model(ridge_model, X_test, y_test) 316 | gbm_eval = evaluate_model(gbm_model, X_test, y_test) 317 | 318 | # Trading 319 | y_trade_lr = lr_model.predict(X_trade) 320 | y_trade_rf = rf_model.predict(X_trade) 321 | y_trade_ridge = ridge_model.predict(X_trade) 322 | y_trade_gbm = gbm_model.predict(X_trade) 323 | 324 | 325 | # Decide the best model 326 | eval_data = [[lr_eval, y_trade_lr], 327 | [rf_eval, y_trade_rf] , 328 | [ridge_eval, y_trade_ridge], 329 | [gbm_eval, y_trade_gbm] 330 | ] 331 | eval_table = pd.DataFrame(eval_data, columns=['model_eval', 'model_predict_return'], 332 | index=['lr', 'rf','ridge','gbm']) 333 | 334 | 335 | evaluation_record[unique_datetime[i]]=eval_table 336 | 337 | # 
lowest error score model 338 | y_trade_best = eval_table.model_predict_return.values[eval_table.model_eval == eval_table.model_eval.min()][0] 339 | best_model_name = eval_table.index.values[eval_table.model_eval == eval_table.model_eval.min()][0] 340 | 341 | # Highest Explained Variance 342 | # y_trade_best = eval_table.model_predict_return.values[eval_table.model_eval==eval_table.model_eval.max()][0] 343 | # best_model_name = eval_table.index.values[eval_table.model_eval==eval_table.model_eval.max()][0] 344 | 345 | df_best_model_name.loc[unique_datetime[i]] = best_model_name 346 | 347 | # Prepare Predicted Return table 348 | append_return_table(df_predict_lr, unique_datetime, y_trade_lr, trade_tic, current_index=i) 349 | append_return_table(df_predict_rf, unique_datetime, y_trade_rf, trade_tic, current_index=i) 350 | append_return_table(df_predict_ridge, unique_datetime, y_trade_ridge, trade_tic, current_index=i) 351 | append_return_table(df_predict_gbm, unique_datetime, y_trade_gbm, trade_tic, current_index=i) 352 | 353 | append_return_table(df_predict_best, unique_datetime, y_trade_best, trade_tic, current_index=i) 354 | 355 | print('Trade Date: ', unique_datetime[i]) 356 | 357 | except Exception: 358 | traceback.print_exc() 359 | df_evaluation = get_model_evaluation_table(evaluation_record,trade_date) 360 | return (df_predict_lr, 361 | df_predict_rf, 362 | df_predict_ridge, 363 | df_predict_gbm, 364 | df_predict_best, 365 | df_best_model_name, 366 | evaluation_record, 367 | df_evaluation) 368 | 369 | 370 | def get_model_evaluation_table(evaluation_record,trade_date): 371 | evaluation_list = [] 372 | for d in trade_date: 373 | try: 374 | evaluation_list.append(evaluation_record[d]['model_eval'].values) 375 | except: 376 | print('error') 377 | df_evaluation = pd.DataFrame(evaluation_list,columns = ['linear_regression', 'random_forest','ridge','gbm']) 378 | df_evaluation.index = trade_date 379 | return df_evaluation 380 | 381 | def 
def save_model_result(sector_result, sector_name):
    """Write one sector's model outputs to CSV files under ``results/<sector_name>/``.

    Args:
        sector_result: Sequence read by position. Items 0-4 are the
            predicted-return tables (linear regression, random forest, ridge,
            GBM, best model) and are cast to float64 before saving; item 5 is
            the best-model-name table; item 6 (the evaluation record) is not
            saved; item 7 is the model score table.
        sector_name: Name of the sub-directory created under ``results``.
    """
    predict_names = ['df_predict_lr', 'df_predict_rf', 'df_predict_ridge',
                     'df_predict_gbm', 'df_predict_best']
    # Cast everything first so a bad table fails before anything is written.
    predictions = [sector_result[position].astype(np.float64)
                   for position in range(len(predict_names))]
    df_best_model_name = sector_result[5]
    df_model_score = sector_result[7]

    out_dir = os.path.join('results', sector_name)
    # exist_ok covers both "already there" and the create/create race.
    os.makedirs(out_dir, exist_ok=True)

    for table_name, table in zip(predict_names, predictions):
        table.to_csv(os.path.join(out_dir, table_name + '.csv'))
    df_best_model_name.to_csv(os.path.join(out_dir, 'df_best_model_name.csv'))
    df_model_score.to_csv(os.path.join(out_dir, 'df_model_score.csv'))


def calculate_sector_daily_return(daily_price, unique_ticker, trade_date):
    """Compute daily percentage returns per ticker from a long price table.

    Args:
        daily_price: DataFrame with ``datadate``, ``tic`` and ``adj_price``
            columns; duplicate (date, ticker) rows are averaged.
        unique_ticker: Tickers to keep, in the desired column order.
        trade_date: Sequence whose first element is the earliest date kept.

    Returns:
        DataFrame indexed by ``datadate`` with one column per ticker,
        restricted to dates on or after ``trade_date[0]``.
    """
    daily_price_pivot = pd.pivot_table(daily_price, values='adj_price',
                                       index=['datadate'], columns=['tic'],
                                       aggfunc='mean')
    daily_price_pivot = daily_price_pivot[unique_ticker]

    # Returns are computed on the full history so the first kept date still
    # has a previous price to compare against.
    daily_return = daily_price_pivot.pct_change()
    return daily_return[daily_return.index >= trade_date[0]]
def calculate_sector_quarterly_return(daily_price, unique_ticker, trade_date_plus1):
    """Compute period-over-period returns per ticker between rebalancing dates.

    Args:
        daily_price: DataFrame with ``datadate``, ``tic`` and ``adj_price``
            columns; duplicate (date, ticker) rows are averaged.
        unique_ticker: Tickers to keep, in the desired column order.
        trade_date_plus1: Ordered rebalancing dates; prices are sampled on
            exactly these dates.

    Returns:
        DataFrame indexed by the rebalancing dates after the first one, with
        one column per ticker.
    """
    daily_price_pivot = pd.pivot_table(daily_price, values='adj_price',
                                       index=['datadate'], columns=['tic'],
                                       aggfunc='mean')
    daily_price_pivot = daily_price_pivot[unique_ticker]
    # ``.ix`` no longer exists in pandas; ``reindex`` keeps its lenient
    # behaviour of yielding a NaN row for a date that has no price.
    quarterly_price_pivot = daily_price_pivot.reindex(trade_date_plus1)

    quarterly_return = quarterly_price_pivot.pct_change()
    # The first date has no previous period to compare with.
    return quarterly_return[quarterly_return.index > trade_date_plus1[0]]


def pick_stocks_based_on_quantiles_old(df_predict_best):
    """Split each period's predicted returns into four quartile buckets.

    Args:
        df_predict_best: DataFrame with one row per trade date and one column
            per ticker, holding predicted returns.

    Returns:
        Tuple of four dicts (0-25%, 25-50%, 50-75%, 75-100%), each mapping a
        trade date to the Series of tickers falling in that bucket. Bucket
        boundaries belong to the lower bucket.
    """
    quantile_0_25 = {}
    quantile_25_50 = {}
    quantile_50_75 = {}
    quantile_75_100 = {}

    for trade_date, predicted in df_predict_best.iterrows():
        q_25 = predicted.quantile(0.25)
        q_50 = predicted.quantile(0.5)
        q_75 = predicted.quantile(0.75)

        quantile_0_25[trade_date] = predicted[predicted <= q_25]
        quantile_25_50[trade_date] = predicted[(predicted > q_25) & (predicted <= q_50)]
        quantile_50_75[trade_date] = predicted[(predicted > q_50) & (predicted <= q_75)]
        quantile_75_100[trade_date] = predicted[predicted > q_75]
    return (quantile_0_25, quantile_25_50, quantile_50_75, quantile_75_100)


def pick_stocks_based_on_quantiles(df_predict_best):
    """Pick the bottom 30% and top 30% of predicted returns for each period.

    Args:
        df_predict_best: DataFrame with one row per trade date and one column
            per ticker, holding predicted returns.

    Returns:
        Tuple ``(quantile_0_30, quantile_70_100)`` of dicts mapping a trade
        date to the Series of tickers at or below the 30th percentile and at
        or above the 70th percentile respectively.
    """
    quantile_0_30 = {}
    quantile_70_100 = {}

    for trade_date, predicted in df_predict_best.iterrows():
        q_30 = predicted.quantile(0.3)
        q_70 = predicted.quantile(0.7)

        quantile_0_30[trade_date] = predicted[predicted <= q_30]
        quantile_70_100[trade_date] = predicted[predicted >= q_70]
    return (quantile_0_30, quantile_70_100)
def calculate_portfolio_return(daily_return, trade_date_plus1, long_dict, frequency_date):
    """Compute the daily return of an equally weighted long-only portfolio.

    For every holding period ``[trade_date_plus1[i], trade_date_plus1[i + 1])``
    the portfolio holds the tickers in ``long_dict[trade_date_plus1[i]]`` and
    earns the plain average of their daily returns.

    Args:
        daily_return: DataFrame of daily returns, indexed by date with one
            column per ticker.
        trade_date_plus1: Ordered rebalancing dates.
        long_dict: Mapping from rebalancing date to a Series whose index
            lists the tickers held from that date.
        frequency_date: Unused; kept so existing callers keep working.

    Returns:
        DataFrame indexed by date with the columns ``portfolio_return`` and
        ``daily_return``, both holding the same portfolio return. The
        original code declared the first name but filled only the second,
        leaving ``portfolio_return`` all NaN.
    """
    columns = ['portfolio_return', 'daily_return']
    period_means = []
    for i in range(len(trade_date_plus1) - 1):
        period_start = trade_date_plus1[i]
        period_end = trade_date_plus1[i + 1]
        in_period = (daily_return.index >= period_start) & (daily_return.index < period_end)
        held = long_dict[period_start].index
        period_means.append(daily_return.loc[in_period, held].mean(axis=1))

    if not period_means:
        return pd.DataFrame(columns=columns)
    # One concat at the end instead of the removed, quadratic DataFrame.append.
    portfolio = pd.concat(period_means)
    return pd.DataFrame({name: portfolio.values for name in columns},
                        index=portfolio.index)


def calculate_portfolio_quarterly_return(quarterly_return, trade_date_plus1, long_dict):
    """Compute the per-period return of an equally weighted long-only portfolio.

    The tickers chosen on ``trade_date_plus1[i]`` earn the return recorded in
    ``quarterly_return`` on ``trade_date_plus1[i + 1]``.

    Args:
        quarterly_return: DataFrame of period returns, indexed by date with
            one column per ticker.
        trade_date_plus1: Ordered rebalancing dates.
        long_dict: Mapping from rebalancing date to a Series whose index
            lists the tickers held from that date.

    Returns:
        DataFrame with a single ``portfolio_return`` column, indexed by the
        period-end dates found in ``quarterly_return``.
    """
    period_means = []
    for i in range(len(trade_date_plus1) - 1):
        held = long_dict[trade_date_plus1[i]].index
        at_period_end = quarterly_return.index == trade_date_plus1[i + 1]
        period_means.append(quarterly_return.loc[at_period_end, held].mean(axis=1))

    if not period_means:
        return pd.DataFrame(columns=['portfolio_return'])
    return pd.concat(period_means).to_frame(name='portfolio_return')
def long_only_strategy_daily(df_predict_return, daily_return, trade_month_plus1, top_quantile_threshold=0.75):
    """Back-test an equally weighted long-only strategy at daily frequency.

    On every rebalancing date the tickers whose predicted return is at or
    above the ``top_quantile_threshold`` quantile of that date's predictions
    are bought with equal weights and held until the next rebalancing date.

    Args:
        df_predict_return: DataFrame of predicted returns, one row per
            rebalancing date and one column per ticker.
        daily_return: DataFrame of realised daily returns, indexed by date
            with one column per ticker.
        trade_month_plus1: Ordered rebalancing dates; every date except the
            last must be a row label of ``df_predict_return``.
        top_quantile_threshold: Quantile cut-off for the long leg; the
            default 0.75 keeps the top quarter of predictions.

    Returns:
        DataFrame with a single ``daily_return`` column, indexed by date.

    Raises:
        ZeroDivisionError: If no ticker is selected for a holding period
            (for example when every prediction of that date is NaN).
    """
    long_dict = {}
    for trade_date, predicted in df_predict_return.iterrows():
        top_q = predicted.quantile(top_quantile_threshold)
        # NaN predictions compare False and are therefore never selected.
        long_dict[trade_date] = predicted[predicted >= top_q]

    period_returns = []
    for i in range(len(trade_month_plus1) - 1):
        period_start = trade_month_plus1[i]
        period_end = trade_month_plus1[i + 1]
        selected = long_dict[period_start]
        equal_weight = 1 / selected.shape[0]

        in_period = (daily_return.index >= period_start) & (daily_return.index < period_end)
        weighted = daily_return.loc[in_period, selected.index] * equal_weight
        # sum() skips NaN, so a missing return counts as zero for that ticker.
        period_returns.append(weighted.sum(axis=1))

    if not period_returns:
        return pd.DataFrame(columns=['daily_return'])
    # One concat at the end instead of the removed, quadratic DataFrame.append.
    return pd.concat(period_returns).to_frame(name='daily_return')
def long_only_strategy_monthly(df_predict_return, tic_monthly_return, trade_month, top_quantile_threshold=0.7):
    """Back-test a long-only strategy weighted by predicted return.

    For every period the tickers whose predicted return is at or above the
    ``top_quantile_threshold`` quantile are bought, each weighted by its
    share of the selected tickers' total predicted return.

    Args:
        df_predict_return: DataFrame of predicted returns, one row per
            period and one column per ticker.
        tic_monthly_return: DataFrame of realised period returns, indexed by
            period with one column per ticker.
        trade_month: Periods to evaluate; each must be a row label of
            ``df_predict_return``.
        top_quantile_threshold: Quantile cut-off for the long leg; the
            default 0.7 keeps the top 30% of predictions.

    Returns:
        Series named ``monthly_return`` whose index is named
        ``trade_month``; empty when ``trade_month`` is empty.
    """
    long_dict = {}
    for period, predicted in df_predict_return.iterrows():
        top_q = predicted.quantile(top_quantile_threshold)
        long_dict[period] = predicted[predicted >= top_q]

    portfolio_return_dic = {}
    for period in trade_month:
        selected = long_dict[period]
        # Weights are proportional to the predicted return of each pick.
        weights = selected / selected.sum()
        realised = tic_monthly_return[tic_monthly_return.index == period][selected.index]
        # DataFrame * Series aligns the weights on the ticker columns.
        portfolio_return_dic[period] = (realised * weights).values.sum()

    # Build the Series directly: the former DataFrame round-trip failed on
    # an empty ``trade_month`` because the two-column rename did not fit.
    return pd.Series(list(portfolio_return_dic.values()),
                     index=pd.Index(list(portfolio_return_dic.keys()), name='trade_month'),
                     name='monthly_return',
                     dtype='float64')


def plot_predict_return_distribution(df_predict_best, sector_name, out_path, close_figures=False):
    """Save one histogram of predicted returns per trade date as a PNG file.

    Args:
        df_predict_best: DataFrame of predicted returns, one row per trade
            date and one column per ticker.
        sector_name: Label used as the prefix of every plot title.
        out_path: String prefix of the output files; the trade date and
            ``.png`` are appended to it verbatim.
        close_figures: When True each figure is closed right after it is
            saved, so long runs do not accumulate open figures. The default
            keeps them open, as before.
    """
    import matplotlib.pyplot as plt

    for i in range(df_predict_best.shape[0]):
        trade_date = str(df_predict_best.index[i])
        fig = plt.figure(figsize=(8, 5))
        df_predict_best.iloc[i].hist()
        plt.xlabel("predicted return", size=15)
        plt.ylabel("frequency", size=15)

        plt.title(sector_name + ": trade date - " + trade_date, size=15)
        plt.savefig(out_path + trade_date + ".png")
        if close_figures:
            plt.close(fig)
gbm features 10 | #sector_data=sector45_data 11 | 12 | #look at the data determine the first factor column number 13 | start_column=12 14 | 15 | #set the rows to 89, because we have 89 stock selections 16 | #may need to adjust and put into function 17 | 18 | #model error to select model 19 | model_error=data.frame(MSE_linear=replicate(89,0)) 20 | model_error[,2]=data.frame(MSE_RF=replicate(89,0)) 21 | model_error[,3]=data.frame(MSE_ridge=replicate(89,0)) 22 | model_error[,4]=data.frame(MSE_step=replicate(89,0)) 23 | model_error[,5]=data.frame(MSE_gbm=replicate(89,0)) 24 | 25 | #predicte return to select stocks 26 | predicted_return=list() 27 | 28 | 29 | 30 | #main model 31 | LR_features=list() 32 | RF_features=list() 33 | ridge_features=list() 34 | 35 | Step_features=list() 36 | GBM_features=list() 37 | 38 | #for(i in 1:(length(trade_date)-19)){RF_features[[i]]=c(1:i)} 39 | 40 | #understand rolling windows 41 | #for(i in 1:(length(trade_date)-19)){print(c(i,i+15,i+16,i+19,trade_date[i+20]))} 42 | 43 | for(i in 1:(length(trade_date)-21)){ 44 | 45 | ############################################### 46 | ###########rolling window######################## 47 | 48 | ####train the model based on 4 years, 16 quarters data 49 | #growing window 10 years 50 | if (i<=25) { 51 | data_train=sector_data[(sector_data$tradedate <= trade_date[i+15]),] 52 | train_x=data_train[,c(start_column:(dim(sector_data)[2]-1))] 53 | train_y=data_train[,dim(sector_data)[2]] 54 | } else{ 55 | data_train=sector_data[(sector_data$tradedate <= trade_date[i+15]) & sector_data$tradedate >= trade_date[i-25],] 56 | train_x=data_train[,c(start_column:(dim(sector_data)[2]-1))] 57 | train_y=data_train[,dim(sector_data)[2]] 58 | } 59 | 60 | ####test the model based on 1 year, 4 quarters data 61 | data_test=sector_data[(sector_data$tradedate <= trade_date[i+19]) & (sector_data$tradedate >= trade_date[i+16]),] 62 | test_x=data_test[,c(start_column:(dim(sector_data)[2]-1))] 63 | 
test_y=data_test[,dim(sector_data)[2]] 64 | 65 | train=cbind(train_y,train_x) 66 | test=cbind(test_y,test_x) 67 | 68 | 69 | ####trade data for every quarter 70 | data_trade=sector_data[(sector_data$tradedate == trade_date[i+20]),] 71 | trade_x=data_trade[,c(start_column:(dim(sector_data)[2]-1))] 72 | trade_y=data_trade[,dim(sector_data)[2]] 73 | trade=cbind(trade_x,trade_y) 74 | 75 | row.names(trade_x)=data_trade$tic 76 | 77 | ########################################### 78 | ##############linear regression############ 79 | ########################################### 80 | linear_model=lm(y_return~., data=train) 81 | linear_pre_y=predict(linear_model,test_x) 82 | MSE_linear=mean((test_y-linear_pre_y)^2,na.rm=TRUE) 83 | #MSE_linear 84 | 85 | #LR features 86 | LR_features[[i]]=summary(linear_model) 87 | 88 | ########################################### 89 | ################Random Forest############## 90 | ########################################### 91 | # Tune using algorithm tools 92 | # Tunning the mtry 93 | bestmtry <- tuneRF(train[,-1],train[,1], stepFactor=1.5, improve=1e-5, ntree=500,trace=0,plot = FALSE) 94 | #plot(bestmtry,type = "l") 95 | bestmtry=data.frame(bestmtry) 96 | mytry_optimal=bestmtry$mtry[which.min(bestmtry$OOBError)] 97 | #mytry_optimal 98 | RF_Model=randomForest(y_return~.,data = train,ntree=500,mtry=mytry_optimal,importance=TRUE, na.rm = T,trace=0) 99 | 100 | yhat_bag=predict(RF_Model,test_x) 101 | MSE_RF=mean((yhat_bag-test_y)^2) 102 | #MSE_RF 103 | #importance table 104 | #varImp(RF_Model) 105 | #varImpPlot(RF_Model,main='Random Forest Importance Table') 106 | 107 | ########RF features 108 | RF_features[[i]]=varImp(RF_Model) 109 | 110 | ##################################### 111 | ################ridge################ 112 | ##################################### 113 | x_train_ridge=model.matrix(y_return~., train)[,-1] 114 | y_train_ridge=train$y_return 115 | 116 | x_test_ridge=model.matrix(y_return~.,test)[,-1] 117 | y_test_ridge=test$y_return 
118 | 119 | #tunning for lambda 120 | #first run ridge on training set and pick the best lambda 121 | cv.out_ridge=cv.glmnet(x_train_ridge,y_train_ridge,alpha=1) 122 | bestlam_ridge=cv.out_ridge$lambda.min 123 | 124 | ridge_model=glmnet(x_train_ridge,y_train_ridge,alpha = 0,lambda = bestlam_ridge) 125 | ridge_pred_y=predict(ridge_model, newx = x_test_ridge) 126 | 127 | MSE_ridge=mean((ridge_pred_y-y_test_ridge)^2,na.rm=TRUE) 128 | 129 | #ridge features 130 | ridge_coeffs <- coef(ridge_model) 131 | ridge_coef=data.frame(name = ridge_coeffs@Dimnames[[1]][ridge_coeffs@i + 1], coefficient = ridge_coeffs@x) 132 | 133 | ridge_features[[i]]=ridge_coef 134 | 135 | 136 | 137 | ########################################### 138 | ##############stepwise regression########## 139 | ########################################### 140 | #based on linear regresion 141 | step_model=stepAIC(linear_model, direction="both",trace = 0) 142 | step_pre_y=predict(step_model,test_x) 143 | 144 | MSE_step=mean((test_y-step_pre_y)^2,na.rm=TRUE) 145 | #MSE_step 146 | 147 | #step features 148 | Step_features[[i]]=summary(step_model) 149 | 150 | 151 | ################################### 152 | ################GBM################ 153 | ################################### 154 | #Generalized Boosted Regression Models 155 | gbm_model=gbm(y_return~.,data = train, 156 | dist="gaussian", 157 | n.tree = 400, 158 | shrinkage=0.1, 159 | cv.folds = 5) 160 | 161 | gbm_pred_y = predict(gbm_model, test, n.tree = 400, type = 'response') 162 | MSE_gbm=mean((gbm_pred_y-test_y)^2,na.rm=TRUE) 163 | #MSE_gbm 164 | ########GBM features 165 | GBM_features[[i]]= summary(gbm_model,plot=FALSE) 166 | 167 | ###################################### 168 | #############get results############# 169 | ###################################### 170 | 171 | 172 | 173 | ##################################### 174 | #all model trade data 175 | #trade using linear regression 176 | trade_linear_y=predict(linear_model,trade_x) 177 | #trade using 
random forest 178 | trade_RF_y=predict(RF_Model,trade_x) 179 | #trade using ridge 180 | x_trade_ridge=model.matrix(y_return~.,trade)[,-1] 181 | row.names(x_trade_ridge)=data_trade$tic 182 | trade_ridge_y=predict(ridge_model,x_trade_ridge) 183 | colnames(trade_ridge_y)=c('trade_ridge_y') 184 | 185 | #trade stepwise regression 186 | trade_step_y=predict(step_model,trade_x) 187 | #trade using GBM 188 | trade_GBM_y=predict(gbm_model,trade_x) 189 | 190 | ###########store model error 191 | if (length(unique(trade_linear_y))0){ 14 | install.packages(packages.needed, dependencies = TRUE) 15 | } 16 | library(readxl) 17 | library(MASS) 18 | library(ggplot2) 19 | library(glmnet) 20 | library(ISLR) 21 | library(tree) 22 | library(randomForest) 23 | library(gbm) 24 | library(e1071) 25 | library(caret) 26 | 27 | source("fundamental_ML_model.R") 28 | source("fundamental_select_stock.R") 29 | 30 | ####################get data############################ 31 | fundamental_total=read_excel("fundamental_final_table.xlsx",1) 32 | trade_date=unique(fundamental_total$tradedate) 33 | trade_date=sort(trade_date) 34 | 35 | sector10_data=read_excel("sector10_clean.xlsx",1) 36 | dim(sector10_data) 37 | 38 | sector15_data=read_excel("sector15_clean.xlsx",1) 39 | dim(sector15_data) 40 | 41 | 42 | sector20_data=read_excel("sector20_clean.xlsx",1) 43 | dim(sector20_data) 44 | 45 | sector25_data=read_excel("sector25_clean.xlsx",1) 46 | dim(sector25_data) 47 | 48 | 49 | sector30_data=read_excel("sector30_clean.xlsx",1) 50 | dim(sector30_data) 51 | 52 | sector35_data=read_excel("sector35_clean.xlsx",1) 53 | dim(sector35_data) 54 | 55 | sector40_data=read_excel("sector40_clean.xlsx",1) 56 | dim(sector40_data) 57 | 58 | sector45_data=read_excel("sector45_clean.xlsx",1) 59 | dim(sector45_data) 60 | 61 | sector50_data=read_excel("sector50_clean.xlsx",1) 62 | dim(sector50_data) 63 | 64 | 65 | sector55_data=read_excel("sector55_clean.xlsx",1) 66 | dim(sector55_data) 67 | 68 | 
sector60_data=read_excel("sector60_clean.xlsx",1) 69 | dim(sector60_data) 70 | 71 | 72 | ############################################################### 73 | #####run model and save as RData 74 | ############################################################### 75 | 76 | ###################################### 77 | ############sector 10 Energy (5238, 32) 78 | ###################################### 79 | ##1.2 hours to run 80 | start.time=Sys.time() 81 | sector10_result=fundamental_ML_model(sector10_data,trade_date) 82 | end.time=Sys.time() 83 | end.time-start.time 84 | save(sector10_result,file = "sector10_result.RData") 85 | 86 | ###################################### 87 | ############sector 15 Materials (5216, 32) 88 | ###################################### 89 | ##1.2 hours to run 90 | start.time=Sys.time() 91 | sector15_result=fundamental_ML_model(sector15_data,trade_date) 92 | end.time=Sys.time() 93 | end.time-start.time 94 | save(sector15_result,file = "sector15_result.RData") 95 | 96 | ###################################### 97 | ############sector 20 Industrials (9881, 26) 98 | ###################################### 99 | #2 hours to run 100 | start.time=Sys.time() 101 | sector20_result=fundamental_ML_model(sector20_data,trade_date) 102 | end.time=Sys.time() 103 | end.time-start.time 104 | save(sector20_result,file = "sector20_result.RData") 105 | 106 | ###################################### 107 | ############sector 25 Consumer Discretionary (12595, 26) 108 | ###################################### 109 | #2.5 hours to run 110 | start.time=Sys.time() 111 | sector25_result=fundamental_ML_model(sector25_data,trade_date) 112 | end.time=Sys.time() 113 | end.time-start.time 114 | save(sector25_result,file = "sector25_result.RData") 115 | 116 | ###################################### 117 | ############sector 30 Consumer Staples (5388, 29) 118 | ###################################### 119 | #1.2 hours to run 120 | start.time=Sys.time() 121 | 
sector30_result=fundamental_ML_model(sector30_data,trade_date) 122 | end.time=Sys.time() 123 | end.time-start.time 124 | save(sector30_result,file = "sector30_result.RData") 125 | 126 | ###################################### 127 | ############sector 35 Health Cares (7615, 29) 128 | ###################################### 129 | #2 hours to run 130 | start.time=Sys.time() 131 | sector35_result=fundamental_ML_model(sector35_data,trade_date) 132 | end.time=Sys.time() 133 | end.time-start.time 134 | save(sector35_result,file = "sector35_result.RData") 135 | 136 | ###################################### 137 | ############sector 40 Financials (9480, 21) 138 | ###################################### 139 | ##1.5 hours to run 140 | start.time=Sys.time() 141 | sector40_result=fundamental_ML_model(sector40_data,trade_date) 142 | end.time=Sys.time() 143 | end.time-start.time 144 | save(sector40_result,file = "sector40_result.RData") 145 | 146 | ###################################### 147 | ############sector 45 Information Technology (10243, 29) 148 | ###################################### 149 | ##2.5 hours to run 150 | start.time=Sys.time() 151 | sector45_result=fundamental_ML_model(sector45_data,trade_date) 152 | end.time=Sys.time() 153 | end.time-start.time 154 | save(sector45_result,file = "sector45_result.RData") 155 | 156 | ###################################### 157 | ############sector 50 Telecomminucation Services (1127, 32) 158 | ###################################### 159 | #20 mins to run 160 | start.time=Sys.time() 161 | sector50_result=fundamental_ML_model(sector50_data,trade_date) 162 | end.time=Sys.time() 163 | end.time-start.time 164 | save(sector50_result,file = "sector50_result.RData") 165 | 166 | ###################################### 167 | ############sector 55 Utilities (3903, 32) 168 | ###################################### 169 | ##1.2 hours to run 170 | start.time=Sys.time() 171 | sector55_result=fundamental_ML_model(sector55_data,trade_date) 172 | 
end.time=Sys.time() 173 | end.time-start.time 174 | save(sector55_result,file = "sector55_result.RData") 175 | 176 | ###################################### 177 | ############sector 60 Real Estate (3039, 32) 178 | ###################################### 179 | #31 mins to run 180 | start.time=Sys.time() 181 | sector60_result=fundamental_ML_model(sector60_data,trade_date) 182 | end.time=Sys.time() 183 | end.time-start.time 184 | save(sector60_result,file = "sector60_result.RData") 185 | 186 | ############################################# 187 | ############################################# 188 | ############################################# 189 | ############################################# 190 | 191 | ############################################################### 192 | ################Stock Selection 193 | ############################################################### 194 | 195 | #########stock selection sector 10 196 | #load("sector10_result.RData") 197 | selector10_modelStock=select_modelStock(sector10_result) 198 | selector10_topStock=select_topStock(selector10_modelStock$selected_stocks) 199 | #########stock selection sector 15 200 | #load("sector15_result.RData") 201 | selector15_modelStock=select_modelStock(sector15_result) 202 | selector15_topStock=select_topStock(selector15_modelStock$selected_stocks) 203 | #########stock selection sector 20 204 | #load("sector20_result.RData") 205 | selector20_modelStock=select_modelStock(sector20_result) 206 | selector20_topStock=select_topStock(selector20_modelStock$selected_stocks) 207 | #########stock selection sector 25 208 | #load("sector25_result.RData") 209 | selector25_modelStock=select_modelStock(sector25_result) 210 | selector25_topStock=select_topStock(selector25_modelStock$selected_stocks) 211 | #########stock selection sector 30 212 | #load("sector30_result.RData") 213 | selector30_modelStock=select_modelStock(sector30_result) 214 | selector30_topStock=select_topStock(selector30_modelStock$selected_stocks) 215 
| #########stock selection sector 35 216 | #load("sector35_result.RData") 217 | selector35_modelStock=select_modelStock(sector35_result) 218 | selector35_topStock=select_topStock(selector35_modelStock$selected_stocks) 219 | #########stock selection sector 40 220 | #load("sector40_result.RData") 221 | selector40_modelStock=select_modelStock(sector40_result) 222 | selector40_topStock=select_topStock(selector40_modelStock$selected_stocks) 223 | #########stock selection sector 45 224 | #load("sector45_result.RData") 225 | selector45_modelStock=select_modelStock(sector45_result) 226 | selector45_topStock=select_topStock(selector45_modelStock$selected_stocks) 227 | #########stock selection sector 50 228 | #load("sector50_result.RData") 229 | selector50_modelStock=select_modelStock(sector50_result) 230 | selector50_topStock=select_topStock(selector50_modelStock$selected_stocks) 231 | #selector50_topStock[[82]]=selector50_topStock[[81]] 232 | #########stock selection sector 55 233 | #load("sector55_result.RData") 234 | selector55_modelStock=select_modelStock(sector55_result) 235 | selector55_topStock=select_topStock(selector55_modelStock$selected_stocks) 236 | #########stock selection sector 60 237 | #load("sector60_result.RData") 238 | selector60_modelStock=select_modelStock(sector60_result) 239 | selector60_topStock=select_topStock(selector60_modelStock$selected_stocks) 240 | 241 | 242 | 243 | ###############combine stocks together 244 | stocks_selected_total=NULL 245 | for (i in 1:89){ 246 | 247 | #sector 10 248 | sector10_temp=selector10_topStock[[i]] 249 | sector10_temp=cbind(names(sector10_temp),unname(sector10_temp),trade_date[i+20]) 250 | colnames(sector10_temp)=c('tic','predicted_return','trade_date') 251 | 252 | #sector 15 253 | sector15_temp=selector15_topStock[[i]] 254 | sector15_temp=cbind(names(sector15_temp),unname(sector15_temp),trade_date[i+20]) 255 | colnames(sector15_temp)=c('tic','predicted_return','trade_date') 256 | 257 | #sector 20 258 | 
sector20_temp=selector20_topStock[[i]] 259 | sector20_temp=cbind(names(sector20_temp),unname(sector20_temp),trade_date[i+20]) 260 | colnames(sector20_temp)=c('tic','predicted_return','trade_date') 261 | 262 | #sector 25 263 | sector25_temp=selector25_topStock[[i]] 264 | sector25_temp=cbind(names(sector25_temp),unname(sector25_temp),trade_date[i+20]) 265 | colnames(sector25_temp)=c('tic','predicted_return','trade_date') 266 | 267 | #sector 30 268 | sector30_temp=selector30_topStock[[i]] 269 | sector30_temp=cbind(names(sector30_temp),unname(sector30_temp),trade_date[i+20]) 270 | colnames(sector30_temp)=c('tic','predicted_return','trade_date') 271 | 272 | #sector 35 273 | sector35_temp=selector35_topStock[[i]] 274 | sector35_temp=cbind(names(sector35_temp),unname(sector35_temp),trade_date[i+20]) 275 | colnames(sector35_temp)=c('tic','predicted_return','trade_date') 276 | 277 | #sector 40 278 | sector40_temp=selector40_topStock[[i]] 279 | sector40_temp=cbind(names(sector40_temp),unname(sector40_temp),trade_date[i+20]) 280 | colnames(sector40_temp)=c('tic','predicted_return','trade_date') 281 | 282 | #sector 45 283 | sector45_temp=selector45_topStock[[i]] 284 | sector45_temp=cbind(names(sector45_temp),unname(sector45_temp),trade_date[i+20]) 285 | colnames(sector45_temp)=c('tic','predicted_return','trade_date') 286 | 287 | #sector 50 288 | sector50_temp=selector50_topStock[[i]] 289 | sector50_temp=cbind(names(sector50_temp),unname(sector50_temp),trade_date[i+20]) 290 | colnames(sector50_temp)=c('tic','predicted_return','trade_date') 291 | 292 | #sector 55 293 | sector55_temp=selector55_topStock[[i]] 294 | sector55_temp=cbind(names(sector55_temp),unname(sector55_temp),trade_date[i+20]) 295 | colnames(sector55_temp)=c('tic','predicted_return','trade_date') 296 | 297 | 298 | #sector 60 299 | sector60_temp=selector60_topStock[[i]] 300 | sector60_temp=cbind(names(sector60_temp),unname(sector60_temp),trade_date[i+20]) 301 | 
# Pick, for every trading period, the model with the lowest test error and
# return that model's predicted returns.
#
# sector_result: list with
#   $model_error      - data frame, one row per period, one column per model
#   $predicted_return - list with one table per period; column j holds the
#                       predictions of the model in column j of $model_error
# Returns a list with
#   $selected_stocks - list of predicted-return vectors, one per period
#   $selected_model  - character vector with the winning model's column name
select_modelStock = function(sector_result){
  # Use the actual number of periods rather than a hard-coded 89.
  n_period = length(sector_result$predicted_return)
  selected_model = NULL
  selected_stocks = list()

  for (i in seq_len(n_period)){
    # Position of the smallest error in row i, computed once.
    min_index = which.min(unlist(sector_result$model_error[i,]))
    selected_model[i] = colnames(sector_result$model_error)[min_index]
    selected_stocks[[i]] = sector_result$predicted_return[[i]][,min_index]
  }

  output = list(selected_stocks=selected_stocks, selected_model=selected_model)
  return(output)
}


# Keep, for every period, the stocks whose predicted return is at or above
# the 80th percentile of that period's predictions.
#
# selected_stocks: list of named numeric vectors, one per period.
# Returns a list of the same length holding the filtered vectors.
select_topStock = function(selected_stocks){
  selected_topstocks = list()

  for (i in seq_along(selected_stocks)){
    period_return = selected_stocks[[i]]
    selected_topstocks[[i]] = period_return[period_return >= quantile(period_return, 0.8)]
  }
  return(selected_topstocks)
}
-------------------------------------------------------------------------------- /figs/chart11_overallPerformance.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart11_overallPerformance.PNG -------------------------------------------------------------------------------- /figs/chart1_datasetPeriod.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart1_datasetPeriod.PNG -------------------------------------------------------------------------------- /figs/chart2_rolling_windows.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart2_rolling_windows.PNG -------------------------------------------------------------------------------- /figs/chart3_modelError.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart3_modelError.PNG -------------------------------------------------------------------------------- /figs/chart4_predictedReturn1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart4_predictedReturn1.PNG 
-------------------------------------------------------------------------------- /figs/chart4_predictedReturn2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart4_predictedReturn2.PNG -------------------------------------------------------------------------------- /figs/chart5_coefficient.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart5_coefficient.PNG -------------------------------------------------------------------------------- /figs/chart6_selectedStocks.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart6_selectedStocks.PNG -------------------------------------------------------------------------------- /figs/chart7_efficient1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart7_efficient1.PNG -------------------------------------------------------------------------------- /figs/chart8_PnL.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart8_PnL.png -------------------------------------------------------------------------------- 
/figs/chart9_TotalValue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart9_TotalValue.png -------------------------------------------------------------------------------- /figs/dataperiod.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/dataperiod.png -------------------------------------------------------------------------------- /figs/efficient1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/efficient1.jpg -------------------------------------------------------------------------------- /figs/pnl1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/pnl1.jpg -------------------------------------------------------------------------------- /figs/rolling_windows.vsdx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/rolling_windows.vsdx -------------------------------------------------------------------------------- /figs/transaction cost.PNG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/transaction cost.PNG -------------------------------------------------------------------------------- /fundamental_portfolio.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Import packages" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 27, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "import numpy as np\n", 18 | "from pypfopt.efficient_frontier import EfficientFrontier\n", 19 | "from pypfopt import risk_models\n", 20 | "from pypfopt.risk_models import CovarianceShrinkage\n", 21 | "from pypfopt import expected_returns\n", 22 | "from datetime import datetime\n", 23 | "from pandas.tseries.offsets import BDay" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 28, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "import time\n", 33 | "import pickle" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "# 1. 
Read Input Data" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 29, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stderr", 50 | "output_type": "stream", 51 | "text": [ 52 | "/home/ubuntu/anaconda3/lib/python3.6/site-packages/numpy/lib/arraysetops.py:568: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n", 53 | " mask |= (ar1 == a)\n" 54 | ] 55 | } 56 | ], 57 | "source": [ 58 | "df_price = pd.read_csv(\"Data/1-sp500_adj_price.csv\",index_col=0)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 30, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/plain": [ 69 | "(6438964, 3)" 70 | ] 71 | }, 72 | "execution_count": 30, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "df_price.shape" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 31, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "data": { 88 | "text/html": [ 89 | "
\n", 90 | "\n", 103 | "\n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | "
datadateticadj_price
119900102ADCT4.074244
219900103ADCT4.046900
319900104ADCT3.964869
419900105ADCT3.992212
519900108ADCT3.937525
\n", 145 | "
" 146 | ], 147 | "text/plain": [ 148 | " datadate tic adj_price\n", 149 | "1 19900102 ADCT 4.074244\n", 150 | "2 19900103 ADCT 4.046900\n", 151 | "3 19900104 ADCT 3.964869\n", 152 | "4 19900105 ADCT 3.992212\n", 153 | "5 19900108 ADCT 3.937525" 154 | ] 155 | }, 156 | "execution_count": 31, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "df_price.head()" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 32, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [ 171 | "selected_stock = pd.read_csv(\"Data/2-portfolio_data/stocks_selected_total_user8.csv\")" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": 33, 177 | "metadata": {}, 178 | "outputs": [ 179 | { 180 | "data": { 181 | "text/plain": [ 182 | "(12932, 3)" 183 | ] 184 | }, 185 | "execution_count": 33, 186 | "metadata": {}, 187 | "output_type": "execute_result" 188 | } 189 | ], 190 | "source": [ 191 | "selected_stock.shape" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 34, 197 | "metadata": {}, 198 | "outputs": [ 199 | { 200 | "data": { 201 | "text/html": [ 202 | "
\n", 203 | "\n", 216 | "\n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | "
ticpredicted_returntrade_date
0EOG0.03372319950601
1EQT0.03774519950601
2HES0.05145019950601
3NFX0.03028319950601
4OKE0.04102019950601
\n", 258 | "
" 259 | ], 260 | "text/plain": [ 261 | " tic predicted_return trade_date\n", 262 | "0 EOG 0.033723 19950601\n", 263 | "1 EQT 0.037745 19950601\n", 264 | "2 HES 0.051450 19950601\n", 265 | "3 NFX 0.030283 19950601\n", 266 | "4 OKE 0.041020 19950601" 267 | ] 268 | }, 269 | "execution_count": 34, 270 | "metadata": {}, 271 | "output_type": "execute_result" 272 | } 273 | ], 274 | "source": [ 275 | "selected_stock.head()" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "# 2. Get trade date" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 35, 288 | "metadata": {}, 289 | "outputs": [ 290 | { 291 | "name": "stdout", 292 | "output_type": "stream", 293 | "text": [ 294 | "Number of unique stocks selected: 982\n" 295 | ] 296 | } 297 | ], 298 | "source": [ 299 | "print(\"Number of unique stocks selected: \", len(selected_stock.tic.unique()))" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": 36, 312 | "metadata": {}, 313 | "outputs": [], 314 | "source": [ 315 | "all_date=df_price.datadate.unique()" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 37, 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "data": { 325 | "text/plain": [ 326 | "7155" 327 | ] 328 | }, 329 | "execution_count": 37, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "len(all_date)" 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 38, 341 | "metadata": {}, 342 | "outputs": [], 343 | "source": [ 344 | "trade_date=selected_stock.trade_date.unique()" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": 39, 350 | "metadata": {}, 351 | "outputs": [ 352 | { 353 | "data": { 354 | "text/plain": [ 355 | "array([19950601, 19950901, 19951201, 
19960301, 19960603, 19960903,\n", 356 | " 19961202, 19970303, 19970602, 19970902, 19971201, 19980302,\n", 357 | " 19980601, 19980901, 19981201, 19990301, 19990601, 19990901,\n", 358 | " 19991201, 20000301, 20000601, 20000901, 20001201, 20010301,\n", 359 | " 20010601, 20010904, 20011203, 20020301, 20020603, 20020903,\n", 360 | " 20021202, 20030303, 20030602, 20030902, 20031201, 20040301,\n", 361 | " 20040601, 20040901, 20041201, 20050301, 20050601, 20050901,\n", 362 | " 20051201, 20060301, 20060601, 20060901, 20061201, 20070301,\n", 363 | " 20070601, 20070904, 20071203, 20080303, 20080602, 20080902,\n", 364 | " 20081201, 20090302, 20090601, 20090901, 20091201, 20100301,\n", 365 | " 20100601, 20100901, 20101201, 20110301, 20110601, 20110901,\n", 366 | " 20111201, 20120301, 20120601, 20120904, 20121203, 20130301,\n", 367 | " 20130603, 20130903, 20131202, 20140303, 20140602, 20140902,\n", 368 | " 20141201, 20150302, 20150601, 20150901, 20151201, 20160301,\n", 369 | " 20160601, 20160901, 20161201, 20170301, 20170601])" 370 | ] 371 | }, 372 | "execution_count": 39, 373 | "metadata": {}, 374 | "output_type": "execute_result" 375 | } 376 | ], 377 | "source": [ 378 | "trade_date" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 40, 384 | "metadata": {}, 385 | "outputs": [ 386 | { 387 | "name": "stdout", 388 | "output_type": "stream", 389 | "text": [ 390 | "Number of trade dates 89\n" 391 | ] 392 | } 393 | ], 394 | "source": [ 395 | "print(\"Number of trade dates\", len(trade_date))" 396 | ] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "metadata": {}, 401 | "source": [ 402 | "# 3. Get daily 1 year return table in each 89 trade period" 403 | ] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": 41, 408 | "metadata": {}, 409 | "outputs": [ 410 | { 411 | "data": { 412 | "text/html": [ 413 | "
\n", 414 | "\n", 427 | "\n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | "
ticpredicted_returntrade_date
0EOG0.03372319950601
1EQT0.03774519950601
2HES0.05145019950601
3NFX0.03028319950601
4OKE0.04102019950601
\n", 469 | "
" 470 | ], 471 | "text/plain": [ 472 | " tic predicted_return trade_date\n", 473 | "0 EOG 0.033723 19950601\n", 474 | "1 EQT 0.037745 19950601\n", 475 | "2 HES 0.051450 19950601\n", 476 | "3 NFX 0.030283 19950601\n", 477 | "4 OKE 0.041020 19950601" 478 | ] 479 | }, 480 | "execution_count": 41, 481 | "metadata": {}, 482 | "output_type": "execute_result" 483 | } 484 | ], 485 | "source": [ 486 | "selected_stock.head()" 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": 414, 492 | "metadata": {}, 493 | "outputs": [ 494 | { 495 | "name": "stdout", 496 | "output_type": "stream", 497 | "text": [ 498 | "19950601\n", 499 | "19950901\n", 500 | "19951201\n", 501 | "19960301\n", 502 | "19960603\n", 503 | "19960903\n", 504 | "19961202\n", 505 | "19970303\n", 506 | "19970602\n", 507 | "19970902\n", 508 | "19971201\n", 509 | "19980302\n", 510 | "19980601\n", 511 | "19980901\n", 512 | "19981201\n", 513 | "19990301\n", 514 | "19990601\n", 515 | "19990901\n", 516 | "19991201\n", 517 | "20000301\n", 518 | "20000601\n", 519 | "20000901\n", 520 | "20001201\n", 521 | "20010301\n", 522 | "20010601\n", 523 | "20010904\n", 524 | "20011203\n", 525 | "20020301\n", 526 | "20020603\n", 527 | "20020903\n", 528 | "20021202\n", 529 | "20030303\n", 530 | "20030602\n", 531 | "20030902\n", 532 | "20031201\n", 533 | "20040301\n", 534 | "20040601\n", 535 | "20040901\n", 536 | "20041201\n", 537 | "20050301\n", 538 | "20050601\n", 539 | "20050901\n", 540 | "20051201\n", 541 | "20060301\n", 542 | "20060601\n", 543 | "20060901\n", 544 | "20061201\n", 545 | "20070301\n", 546 | "20070601\n", 547 | "20070904\n", 548 | "20071203\n", 549 | "20080303\n", 550 | "20080602\n", 551 | "20080902\n", 552 | "20081201\n", 553 | "20090302\n", 554 | "20090601\n", 555 | "20090901\n", 556 | "20091201\n", 557 | "20100301\n", 558 | "20100601\n", 559 | "20100901\n", 560 | "20101201\n", 561 | "20110301\n", 562 | "20110601\n", 563 | "20110901\n", 564 | "20111201\n", 565 | "20120301\n", 566 | 
"20120601\n", 567 | "20120904\n", 568 | "20121203\n", 569 | "20130301\n", 570 | "20130603\n", 571 | "20130903\n", 572 | "20131202\n", 573 | "20140303\n", 574 | "20140602\n", 575 | "20140902\n", 576 | "20141201\n", 577 | "20150302\n", 578 | "20150601\n", 579 | "20150901\n", 580 | "20151201\n", 581 | "20160301\n", 582 | "20160601\n", 583 | "20160901\n", 584 | "20161201\n", 585 | "20170301\n", 586 | "20170601\n", 587 | "Time consuming: 92.59127250512441 minutes\n" 588 | ] 589 | } 590 | ], 591 | "source": [ 592 | "# took about 90 minutes to run\n", 593 | "start = time.time()\n", 594 | "all_return_table={}\n", 595 | "#all_predicted_return={}\n", 596 | "all_stocks_info = {}\n", 597 | "#for i in range(0,1):\n", 598 | "for i in range(len(trade_date)):\n", 599 | " #match trading date\n", 600 | " index = selected_stock.trade_date==trade_date[i]\n", 601 | " print(trade_date[i])\n", 602 | " #get the corresponding trade period's selected stocks' name\n", 603 | " stocks_name=selected_stock.tic[selected_stock.trade_date==trade_date[i]].values\n", 604 | " temp_info = selected_stock[selected_stock.trade_date==trade_date[i]]\n", 605 | " temp_info = temp_info.reset_index()\n", 606 | " del temp_info['index']\n", 607 | " all_stocks_info[trade_date[i]] = temp_info\n", 608 | " #get the corresponding trade period's selected stocks' predicted return\n", 609 | " asset_expected_return=selected_stock[index].predicted_return.values\n", 610 | " \n", 611 | " #get current trade date and calculate trade date last year, it has to be a business date\n", 612 | " last_year_tradedate=int((trade_date[i]-round(trade_date[i]/10000)*10000)+round(trade_date[i]/10000-1)*10000)\n", 613 | " convert_to_yyyymmdd=datetime.strptime(str(last_year_tradedate), '%Y%m%d').strftime('%Y-%m-%d')\n", 614 | " #determine the business date\n", 615 | " #print(convert_to_yyyymmdd)\n", 616 | " ts = pd.Timestamp(convert_to_yyyymmdd) \n", 617 | " bd = pd.tseries.offsets.BusinessDay(n =1) \n", 618 | " new_timestamp = ts - bd \n", 
619 | " lastY_tradedate = int(new_timestamp.date().strftime('%Y%m%d'))\n", 620 | " get_date_index=(all_date<trade_date[i]) & (all_date>lastY_tradedate)\n", 621 | " get_date=all_date[get_date_index]\n", 622 | " #get adjusted price table\n", 623 | " return_table=pd.DataFrame()\n", 624 | " for m in range(len(stocks_name)):\n", 625 | " #get stocks's name\n", 626 | " index_tic=(df_price.tic==stocks_name[m])\n", 627 | " #get all of this stock's historical prices from sp500_price\n", 628 | " sp500_temp=df_price[index_tic]\n", 629 | " merge_left_data_table = pd.DataFrame(get_date)\n", 630 | " merge_left_data_table.columns = ['datadate']\n", 631 | " temp_price=merge_left_data_table.merge(sp500_temp, on=['datadate'], how='left')\n", 632 | " temp_price = temp_price.dropna()\n", 633 | " temp_price['daily_return']=temp_price.adj_price.pct_change()\n", 634 | "\n", 635 | " return_table=return_table.append(temp_price,ignore_index=True)\n", 636 | " all_return_table[trade_date[i]] = return_table\n", 637 | "end = time.time()\n", 638 | "print(\"Time consuming: \", (end-start)/60, \" minutes\")\n", 639 | " \n", 640 | " " 641 | ] 642 | }, 643 | { 644 | "cell_type": "markdown", 645 | "metadata": {}, 646 | "source": [ 647 | "## Save to pickle" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": 419, 653 | "metadata": {}, 654 | "outputs": [], 655 | "source": [ 656 | "#with open('Data/all_return_table.pickle', 'wb') as handle: \n", 657 | "# pickle.dump(all_return_table, handle, protocol=pickle.HIGHEST_PROTOCOL)" 658 | ] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "execution_count": 420, 663 | "metadata": {}, 664 | "outputs": [], 665 | "source": [ 666 | "#with open('Data/all_stocks_info.pickle', 'wb') as handle:\n", 667 | "# pickle.dump(all_stocks_info, handle, protocol=pickle.HIGHEST_PROTOCOL)" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": 42, 673 | "metadata": {}, 674 | "outputs": [], 675 | "source": [ 676 | "#with open('Data/all_return_table.pickle', 'rb') 
as handle:\n", 677 | "# all_return_table = pickle.load(handle)\n", 678 | "\n", 679 | "#with open('Data/all_stocks_info.pickle', 'rb') as handle:\n", 680 | "# all_stocks_info = pickle.load(handle)\n" 681 | ] 682 | }, 683 | { 684 | "cell_type": "markdown", 685 | "metadata": {}, 686 | "source": [ 687 | "# 4. Portfolio Optimization using pypfopt" 688 | ] 689 | }, 690 | { 691 | "cell_type": "code", 692 | "execution_count": 44, 693 | "metadata": {}, 694 | "outputs": [ 695 | { 696 | "name": "stderr", 697 | "output_type": "stream", 698 | "text": [ 699 | "/home/ubuntu/anaconda3/lib/python3.6/site-packages/pypfopt/objective_functions.py:61: RuntimeWarning: invalid value encountered in sqrt\n", 700 | " sigma = np.sqrt(np.dot(weights, np.dot(cov_matrix, weights.T)))\n" 701 | ] 702 | }, 703 | { 704 | "name": "stdout", 705 | "output_type": "stream", 706 | "text": [ 707 | "19950601 : Done\n", 708 | "19950901 : Done\n", 709 | "19951201 : Done\n", 710 | "19960301 : Done\n", 711 | "19960603 : Done\n", 712 | "19960903 : Done\n", 713 | "19961202 : Done\n", 714 | "19970303 : Done\n", 715 | "19970602 : Done\n", 716 | "19970902 : Done\n", 717 | "19971201 : Done\n", 718 | "19980302 : Done\n", 719 | "19980601 : Done\n", 720 | "19980901 : Done\n", 721 | "19981201 : Done\n", 722 | "19990301 : Done\n", 723 | "19990601 : Done\n", 724 | "19990901 : Done\n", 725 | "19991201 : Done\n", 726 | "20000301 : Done\n", 727 | "20000601 : Done\n", 728 | "20000901 : Done\n", 729 | "20001201 : Done\n", 730 | "20010301 : Done\n", 731 | "20010601 : Done\n", 732 | "20010904 : Done\n", 733 | "20011203 : Done\n", 734 | "20020301 : Done\n", 735 | "20020603 : Done\n", 736 | "20020903 : Done\n", 737 | "20021202 : Done\n", 738 | "20030303 : Done\n", 739 | "20030602 : Done\n", 740 | "20030902 : Done\n", 741 | "20031201 : Done\n", 742 | "20040301 : Done\n", 743 | "20040601 : Done\n", 744 | "20040901 : Done\n", 745 | "20041201 : Done\n", 746 | "20050301 : Done\n", 747 | "20050601 : Done\n", 748 | "20050901 : Done\n", 
749 | "20051201 : Done\n", 750 | "20060301 : Done\n" 751 | ] 752 | }, 753 | { 754 | "name": "stderr", 755 | "output_type": "stream", 756 | "text": [ 757 | "/home/ubuntu/anaconda3/lib/python3.6/site-packages/pypfopt/base_optimizer.py:56: RuntimeWarning: invalid value encountered in less\n", 758 | " clean_weights[np.abs(clean_weights) < cutoff] = 0\n" 759 | ] 760 | }, 761 | { 762 | "name": "stdout", 763 | "output_type": "stream", 764 | "text": [ 765 | "20060601 : Done\n", 766 | "20060901 : Done\n", 767 | "20061201 : Done\n", 768 | "20070301 : Done\n", 769 | "20070601 : Done\n", 770 | "20070904 : Done\n", 771 | "20071203 : Done\n", 772 | "20080303 : Done\n", 773 | "20080602 : Done\n", 774 | "20080902 : Done\n", 775 | "20081201 : Done\n", 776 | "20090302 : Done\n", 777 | "20090601 : Done\n", 778 | "20090901 : Done\n", 779 | "20091201 : Done\n", 780 | "20100301 : Done\n", 781 | "20100601 : Done\n", 782 | "20100901 : Done\n", 783 | "20101201 : Done\n", 784 | "20110301 : Done\n", 785 | "20110601 : Done\n", 786 | "20110901 : Done\n", 787 | "20111201 : Done\n", 788 | "20120301 : Done\n", 789 | "20120601 : Done\n", 790 | "20120904 : Done\n", 791 | "20121203 : Done\n", 792 | "20130301 : Done\n", 793 | "20130603 : Done\n", 794 | "20130903 : Done\n", 795 | "20131202 : Done\n", 796 | "20140303 : Done\n", 797 | "20140602 : Done\n", 798 | "20140902 : Done\n", 799 | "20141201 : Done\n", 800 | "20150302 : Done\n", 801 | "20150601 : Done\n", 802 | "20150901 : Done\n", 803 | "20151201 : Done\n", 804 | "20160301 : Done\n", 805 | "20160601 : Done\n", 806 | "20160901 : Done\n", 807 | "20161201 : Done\n", 808 | "20170301 : Done\n", 809 | "20170601 : Done\n" 810 | ] 811 | } 812 | ], 813 | "source": [ 814 | "# took under 5 minutes to run\n", 815 | "\n", 816 | "stocks_weight_table = pd.DataFrame([])\n", 817 | "\n", 818 | "for i in range(len(trade_date)):\n", 819 | " # get selected stocks information\n", 820 | " p1_alldata=(all_stocks_info[trade_date[i]])\n", 821 | " # sort it by tic\n", 822 
| " p1_alldata=p1_alldata.sort_values('tic')\n", 823 | " p1_alldata = p1_alldata.reset_index()\n", 824 | " del p1_alldata['index']\n", 825 | " \n", 826 | " \n", 827 | " # get selected stocks tic\n", 828 | " p1_stock = p1_alldata.tic\n", 829 | " \n", 830 | " # get predicted return from selected stocks\n", 831 | " p1_predicted_return=p1_alldata.pivot_table(index = 'trade_date',columns = 'tic', values = 'predicted_return')\n", 832 | " # use the predicted returns as the Expected returns to feed into the portfolio object\n", 833 | " mu = p1_predicted_return.T.values\n", 834 | "\n", 835 | " # get the 1-year historical return\n", 836 | " p1_return_table=all_return_table[trade_date[i]]\n", 837 | " p1_return_table_pivot=p1_return_table.pivot_table(index = 'datadate',columns = 'tic', values = 'daily_return')\n", 838 | " # use the 1-year historical return table to calculate covariance matrix between selected stocks\n", 839 | " S = risk_models.sample_cov(p1_return_table_pivot)\n", 840 | " del S.index.name \n", 841 | " \n", 842 | " # mean variance\n", 843 | " ef_mean = EfficientFrontier(mu, S,weight_bounds=(0, 0.05))\n", 844 | " raw_weights_mean = ef_mean.max_sharpe()\n", 845 | " cleaned_weights_mean = ef_mean.clean_weights()\n", 846 | " #print(raw_weights_mean)\n", 847 | " #ef.portfolio_performance(verbose=True)\n", 848 | "\n", 849 | " # minimum variance\n", 850 | " ef_min = EfficientFrontier([0]*len(p1_stock), S,weight_bounds=(0, 0.05))\n", 851 | " raw_weights_min = ef_min.max_sharpe()\n", 852 | " cleaned_weights_min = ef_min.clean_weights()\n", 853 | " #print(cleaned_weights_min)\n", 854 | " \n", 855 | " p1_alldata['mean_weight'] = cleaned_weights_mean.values()\n", 856 | " p1_alldata['min_weight'] = cleaned_weights_min.values()\n", 857 | " \n", 858 | " #ef.portfolio_performance(verbose=True)\n", 859 | "\n", 860 | " \n", 861 | " stocks_weight_table = stocks_weight_table.append(pd.DataFrame(p1_alldata), ignore_index=True)\n", 862 | " print(trade_date[i], \": Done\")\n" 863 | ] 
864 | }, 865 | { 866 | "cell_type": "code", 867 | "execution_count": 45, 868 | "metadata": {}, 869 | "outputs": [ 870 | { 871 | "data": { 872 | "text/html": [ 873 | "
\n", 874 | "\n", 887 | "\n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " 
\n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | "
ticpredicted_returntrade_datemean_weightmin_weight
0ACV.10.024449199506010.000000.00000
1AES0.096917199506010.000000.00000
2AHM.10.044516199506010.012000.00522
3AMH.10.105036199506010.000000.00000
4AMT.10.085373199506010.000000.00000
5AOS0.061494199506010.000000.00000
6APCC.0.160571199506010.008720.02036
7APH0.080985199506010.011360.00000
8ARG0.059334199506010.000000.00000
9ATI.10.170435199506010.009260.00000
10AVATQ0.051080199506010.000200.00000
11BAY.30.088882199506010.009750.00797
12BBBY0.108766199506010.003740.05000
13BBY0.184360199506010.011550.05000
14BEV0.064850199506010.000000.00000
15BF.B0.036150199506010.022350.00000
16BGEN0.057776199506010.014250.05000
17BGG0.066459199506010.000000.05000
18BIIB0.081258199506010.012360.00000
19BLL0.039363199506010.033660.00000
\n", 1061 | "
" 1062 | ], 1063 | "text/plain": [ 1064 | " tic predicted_return trade_date mean_weight min_weight\n", 1065 | "0 ACV.1 0.024449 19950601 0.00000 0.00000\n", 1066 | "1 AES 0.096917 19950601 0.00000 0.00000\n", 1067 | "2 AHM.1 0.044516 19950601 0.01200 0.00522\n", 1068 | "3 AMH.1 0.105036 19950601 0.00000 0.00000\n", 1069 | "4 AMT.1 0.085373 19950601 0.00000 0.00000\n", 1070 | "5 AOS 0.061494 19950601 0.00000 0.00000\n", 1071 | "6 APCC. 0.160571 19950601 0.00872 0.02036\n", 1072 | "7 APH 0.080985 19950601 0.01136 0.00000\n", 1073 | "8 ARG 0.059334 19950601 0.00000 0.00000\n", 1074 | "9 ATI.1 0.170435 19950601 0.00926 0.00000\n", 1075 | "10 AVATQ 0.051080 19950601 0.00020 0.00000\n", 1076 | "11 BAY.3 0.088882 19950601 0.00975 0.00797\n", 1077 | "12 BBBY 0.108766 19950601 0.00374 0.05000\n", 1078 | "13 BBY 0.184360 19950601 0.01155 0.05000\n", 1079 | "14 BEV 0.064850 19950601 0.00000 0.00000\n", 1080 | "15 BF.B 0.036150 19950601 0.02235 0.00000\n", 1081 | "16 BGEN 0.057776 19950601 0.01425 0.05000\n", 1082 | "17 BGG 0.066459 19950601 0.00000 0.05000\n", 1083 | "18 BIIB 0.081258 19950601 0.01236 0.00000\n", 1084 | "19 BLL 0.039363 19950601 0.03366 0.00000" 1085 | ] 1086 | }, 1087 | "execution_count": 45, 1088 | "metadata": {}, 1089 | "output_type": "execute_result" 1090 | } 1091 | ], 1092 | "source": [ 1093 | "stocks_weight_table.head(20)\n" 1094 | ] 1095 | }, 1096 | { 1097 | "cell_type": "code", 1098 | "execution_count": 46, 1099 | "metadata": {}, 1100 | "outputs": [ 1101 | { 1102 | "data": { 1103 | "text/plain": [ 1104 | "(12932, 5)" 1105 | ] 1106 | }, 1107 | "execution_count": 46, 1108 | "metadata": {}, 1109 | "output_type": "execute_result" 1110 | } 1111 | ], 1112 | "source": [ 1113 | "stocks_weight_table.shape" 1114 | ] 1115 | }, 1116 | { 1117 | "cell_type": "markdown", 1118 | "metadata": {}, 1119 | "source": [ 1120 | "## save to excel or csv" 1121 | ] 1122 | }, 1123 | { 1124 | "cell_type": "code", 1125 | "execution_count": 47, 1126 | "metadata": {}, 1127 | 
"outputs": [], 1128 | "source": [ 1129 | "stocks_weight_table.to_excel('Data/stocks_weight_table.xlsx','Sheet1')\n" 1130 | ] 1131 | }, 1132 | { 1133 | "cell_type": "code", 1134 | "execution_count": null, 1135 | "metadata": {}, 1136 | "outputs": [], 1137 | "source": [] 1138 | }, 1139 | { 1140 | "cell_type": "code", 1141 | "execution_count": null, 1142 | "metadata": {}, 1143 | "outputs": [], 1144 | "source": [] 1145 | }, 1146 | { 1147 | "cell_type": "code", 1148 | "execution_count": null, 1149 | "metadata": {}, 1150 | "outputs": [], 1151 | "source": [] 1152 | }, 1153 | { 1154 | "cell_type": "code", 1155 | "execution_count": null, 1156 | "metadata": {}, 1157 | "outputs": [], 1158 | "source": [] 1159 | } 1160 | ], 1161 | "metadata": { 1162 | "kernelspec": { 1163 | "display_name": "Python 3", 1164 | "language": "python", 1165 | "name": "python3" 1166 | }, 1167 | "language_info": { 1168 | "codemirror_mode": { 1169 | "name": "ipython", 1170 | "version": 3 1171 | }, 1172 | "file_extension": ".py", 1173 | "mimetype": "text/x-python", 1174 | "name": "python", 1175 | "nbconvert_exporter": "python", 1176 | "pygments_lexer": "ipython3", 1177 | "version": "3.6.5" 1178 | } 1179 | }, 1180 | "nbformat": 4, 1181 | "nbformat_minor": 2 1182 | } 1183 | -------------------------------------------------------------------------------- /fundamental_run_model.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | warnings.filterwarnings("ignore") 3 | 4 | import pandas as pd 5 | import numpy as np 6 | import time 7 | import traceback 8 | import sys 9 | sys.path.append('code') 10 | import ml_model 11 | 12 | 13 | 14 | 15 | if __name__ == '__main__': 16 | import argparse 17 | parser = argparse.ArgumentParser() 18 | 19 | #sector name 20 | parser.add_argument('-sector_name','--sector_name_input', type=str, required=True,help='sector name: i.e. 
sector10') 21 | 22 | # file name 23 | parser.add_argument('-fundamental','--fundamental_input', type=str, required=True,help='inputfile name for fundamental table') 24 | parser.add_argument('-sector','--sector_input', type=str, required=True,help='inputfile name for individual sector') 25 | 26 | # rolling window variables 27 | parser.add_argument("-first_trade_index", default=20, type=int) 28 | parser.add_argument("-testing_window", default=4, type=int) 29 | 30 | # column name 31 | parser.add_argument("-label_column", default='y_return', type=str) 32 | parser.add_argument("-date_column", default='tradedate', type=str) 33 | parser.add_argument("-tic_column", default='tic', type=str) 34 | parser.add_argument("-no_feature_column_names", default = ['gvkey', 'tic', 'datadate', 'rdq', 'tradedate', 'fyearq', 'fqtr', 35 | 'conm', 'datacqtr', 'datafqtr', 'gsector','y_return'], type=list,help='column names that are not fundamental features') 36 | 37 | 38 | 39 | args = parser.parse_args() 40 | #load fundamental table 41 | inputfile_fundamental = args.fundamental_input 42 | 43 | fundamental_total=pd.read_excel(inputfile_fundamental) 44 | fundamental_total=fundamental_total[fundamental_total['tradedate'] < 20170901] 45 | #get all unique quarterly date 46 | unique_datetime = sorted(fundamental_total.tradedate.unique()) 47 | 48 | # load sector data 49 | inputfile_sector = args.sector_input 50 | sector_data=pd.read_excel(inputfile_sector) 51 | 52 | #get sector unique ticker 53 | unique_ticker=sorted(sector_data.tic.unique()) 54 | 55 | #set rolling window 56 | # train: 4 years = 16 quarters 57 | # test: 1 year = 4 quarters 58 | # so first trade date = #20 quarter 59 | #first trade date is 1995-06-01 60 | first_trade_date_index=args.first_trade_index 61 | 62 | #testing window 63 | testing_windows = args.testing_window 64 | 65 | #get all backtesting period trade dates 66 | trade_date=unique_datetime[first_trade_date_index:] 67 | 68 | #variable column name 69 | label_column = 
args.label_column 70 | date_column = args.date_column 71 | tic_column = args.tic_column 72 | 73 | # features column: different base on sectors 74 | no_feature_column_names = args.no_feature_column_names 75 | features_column = [x for x in sector_data.columns.values if x not in no_feature_column_names] 76 | 77 | #sector name 78 | sector_name = args.sector_name_input 79 | 80 | try: 81 | start = time.time() 82 | model_result=ml_model.run_4model(sector_data, 83 | features_column, 84 | label_column, 85 | date_column, 86 | tic_column, 87 | unique_ticker, 88 | unique_datetime, 89 | trade_date, 90 | first_trade_date_index, 91 | testing_windows) 92 | end = time.time() 93 | print('Time Spent: ',(end-start)/60,' minutes') 94 | ml_model.save_model_result(model_result,sector_name) 95 | 96 | except e: 97 | print(e) 98 | 99 | 100 | 101 | # python3 fundamental_run_model.py -sector_name sector10 -fundamental Data/fundamental_final_table.xlsx -sector Data/1-focasting_data/sector10_clean.xlsx 102 | --------------------------------------------------------------------------------