├── .DS_Store
├── .gitattributes
├── Data
    ├── 1-focasting_data
    │   ├── .DS_Store
    │   ├── sector10_clean.xlsx
    │   ├── sector15_clean.xlsx
    │   ├── sector20_clean.xlsx
    │   ├── sector25_clean.xlsx
    │   ├── sector30_clean.xlsx
    │   ├── sector35_clean.xlsx
    │   ├── sector40_clean.xlsx
    │   ├── sector45_clean.xlsx
    │   ├── sector50_clean.xlsx
    │   ├── sector55_clean.xlsx
    │   └── sector60_clean.xlsx
    ├── 1-sp500_adj_price.csv.zip
    ├── 1-spx_price.xlsx
    ├── 2-portfolio_data
    │   ├── .DS_Store
    │   ├── equally_weighted_user8.xlsx
    │   ├── mean_weighted_user8.xlsx
    │   ├── minimum_weighted_user8.xlsx
    │   └── stocks_selected_total_user8.csv
    ├── all_return_table.pickle
    ├── all_stocks_info.pickle
    ├── fundamental_final_table.xlsx
    └── stocks_weight_table.xlsx
├── README.md
├── code
    ├── .DS_Store
    ├── ml_model.py
    └── old_Rcode
    │   ├── .DS_Store
    │   ├── fundamental_ML_model.R
    │   ├── fundamental_run_model.R
    │   └── fundamental_select_stock.R
├── figs
    ├── chart10_insample.PNG
    ├── chart11_overallPerformance.PNG
    ├── chart1_datasetPeriod.PNG
    ├── chart2_rolling_windows.PNG
    ├── chart3_modelError.PNG
    ├── chart4_predictedReturn1.PNG
    ├── chart4_predictedReturn2.PNG
    ├── chart5_coefficient.PNG
    ├── chart6_selectedStocks.PNG
    ├── chart7_efficient1.PNG
    ├── chart8_PnL.png
    ├── chart9_TotalValue.png
    ├── dataperiod.png
    ├── efficient1.jpg
    ├── pnl1.jpg
    ├── rolling_windows.vsdx
    └── transaction cost.PNG
├── fundamental_back_testing.ipynb
├── fundamental_portfolio.ipynb
└── fundamental_run_model.py


/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/.DS_Store


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 


--------------------------------------------------------------------------------
/Data/1-focasting_data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/.DS_Store


--------------------------------------------------------------------------------
/Data/1-focasting_data/sector10_clean.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector10_clean.xlsx


--------------------------------------------------------------------------------
/Data/1-focasting_data/sector15_clean.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector15_clean.xlsx


--------------------------------------------------------------------------------
/Data/1-focasting_data/sector20_clean.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector20_clean.xlsx


--------------------------------------------------------------------------------
/Data/1-focasting_data/sector25_clean.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector25_clean.xlsx


--------------------------------------------------------------------------------
/Data/1-focasting_data/sector30_clean.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector30_clean.xlsx


--------------------------------------------------------------------------------
/Data/1-focasting_data/sector35_clean.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector35_clean.xlsx


--------------------------------------------------------------------------------
/Data/1-focasting_data/sector40_clean.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector40_clean.xlsx


--------------------------------------------------------------------------------
/Data/1-focasting_data/sector45_clean.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector45_clean.xlsx


--------------------------------------------------------------------------------
/Data/1-focasting_data/sector50_clean.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector50_clean.xlsx


--------------------------------------------------------------------------------
/Data/1-focasting_data/sector55_clean.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector55_clean.xlsx


--------------------------------------------------------------------------------
/Data/1-focasting_data/sector60_clean.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-focasting_data/sector60_clean.xlsx


--------------------------------------------------------------------------------
/Data/1-sp500_adj_price.csv.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-sp500_adj_price.csv.zip


--------------------------------------------------------------------------------
/Data/1-spx_price.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/1-spx_price.xlsx


--------------------------------------------------------------------------------
/Data/2-portfolio_data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/2-portfolio_data/.DS_Store


--------------------------------------------------------------------------------
/Data/2-portfolio_data/equally_weighted_user8.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/2-portfolio_data/equally_weighted_user8.xlsx


--------------------------------------------------------------------------------
/Data/2-portfolio_data/mean_weighted_user8.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/2-portfolio_data/mean_weighted_user8.xlsx


--------------------------------------------------------------------------------
/Data/2-portfolio_data/minimum_weighted_user8.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/2-portfolio_data/minimum_weighted_user8.xlsx


--------------------------------------------------------------------------------
/Data/all_return_table.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/all_return_table.pickle


--------------------------------------------------------------------------------
/Data/all_stocks_info.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/all_stocks_info.pickle


--------------------------------------------------------------------------------
/Data/fundamental_final_table.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/fundamental_final_table.xlsx


--------------------------------------------------------------------------------
/Data/stocks_weight_table.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/Data/stocks_weight_table.xlsx


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Dynamic-Stock-Recommendation-Machine_Learning
  2 | 
  3 | ## First Author: Published paper on IEEE TrustCom 2018 (http://www.cloud-conf.net/trustcom18/)
  4 | Hongyang Yang, Xiao-Yang Liu, Qingwei W. [A Practical Machine Learning Approach for Dynamic Stock Recommendation](https://ssrn.com/abstract=3302088). IEEE TrustCom 2018.
  5 | 
  6 | ### IEEE Official Link of the paper (https://ieeexplore.ieee.org/abstract/document/8456121)
  7 | ### SSRN Version: (https://ssrn.com/abstract=3302088)
  8 | 
  9 | ## Abstract:
 10 | Stock recommendation is vital to investment companies and investors. However, no single stock selection strategy will always win while analysts may not have enough time to check all S&P 500 stocks (the Standard & Poor’s 500). In this paper, we propose a practical scheme that recommends stocks from S&P 500 using machine learning. Our basic idea is to buy and hold the top 20% stocks dynamically. First, we select representative stock indicators with good explanatory power. Secondly, we take five frequently used machine learning methods, including linear regression, ridge regression, stepwise regression, random forest and generalized boosted regression, to model stock indicators and quarterly log-return in a rolling window. Thirdly, we choose the model with the lowest Mean Square Error in each period to rank stocks. Finally, we test the selected stocks by conducting portfolio allocation methods such as equally weighted, mean-variance, and minimum-variance. Our empirical results show that the proposed scheme outperforms the long-only strategy on the S&P 500 index in terms of Sharpe ratio and cumulative returns.
 11 | 
 12 | ## Index Term:
 13 | Stock recommendation, fundamental value investing, machine learning, model selection, risk management
 14 | 
 15 | ## Project summary:
 16 | + We developed a practical approach that uses machine-learning methods to select S&P 500 stocks based on financial ratios (e.g., EPS, ROA, ROE). It outperformed the S&P 500 index on out-of-sample data, achieving a Sharpe ratio of 0.5 (versus 0.19 for SPX).
 17 | + We performed feature selection by 11 GICS sectors based on a rolling window to choose the lowest MSE model among Linear Regression, Stepwise Regression, Regression with Ridge, Random Forest, and GBM. Applied a model ensemble method.
 18 | 
 19 | <img src=figs/chart10_insample.PNG width="500">
 20 | 
 21 | <img src=figs/chart11_overallPerformance.PNG width="500">
 22 | 
 23 | ## Data:
 24 | Retrieved from __WRDS (Wharton Research Data Services)__, Compustat Industrial [27 years daily and quarterly Data]
 25 | 
 26 | <img src=figs/chart1_datasetPeriod.PNG width="500">
 27 | 
 28 | 
 29 | + __S&P 500 Fundamental Quarterly Data__ ([fundamental_final_table.xlsx](Data/fundamental_final_table.xlsx))
 30 |   + Database: Compustat North America (Fundamentals Quarterly) and (Index Constituents)
 31 |   + Timeline: 27 years (1990-2017)
 32 |   + Tickers: 1193 stocks (all historical S&P 500 component stocks)
 33 |   + Value: 20 financial ratios calculated from raw accounting report data
 34 | 
 35 | + __S&P 500 Historical Component Stocks Adjusted Daily Price__ ([1-sp500_adj_price.csv.zip](Data/1-sp500_adj_price.csv.zip))
 36 |   + Database: Compustat North America (Security Daily)
 37 |   + Timeline: 27 years (1990-2017)
 38 |   + Tickers: 1193 stocks (all historical S&P 500 component stocks)
 39 |   + Value: Adjusted Daily Close Price
 40 | 
 41 | + __S&P 500 Index Daily Price__ ([1-spx_price.xlsx](Data/1-spx_price.xlsx))
 42 |   + Database: Yahoo Finance
 43 |   + Timeline: 27 years (1990-2017)
 44 |   + Tickers: SPX
 45 |   + Value: Adjusted Daily Close Price
 46 | 
 47 | ## Code:
 48 | 
 49 | ### __Forecasting Model__:
 50 | + __Input__: 11 Excel files of cleaned data about fundamental financial ratios (sector 10-Energy, sector 15-Materials, sector 20-Industrials, sector 25-Consumer Discretionary, sector 30-Consumer Staples, sector 35-Health Care, sector 40-Financials, sector 45-Information Technology, sector 50-Telecommunication Services, sector 55-Utilities, sector 60-Real Estate)
 51 | + __Python Script__: 2 Scripts
 52 |   + [ml_model.py](code/ml_model.py): The forecasting function (cornerstone of this project)
 53 |   + [fundamental_run_model.py](fundamental_run_model.py): The main function to run the forecasting model  
 54 | ```shell
 55 | 
 56 | python3 fundamental_run_model.py \
 57 |   -sector_name sector10 \
 58 |   -fundamental Data/fundamental_final_table.xlsx \
 59 |   -sector Data/1-focasting_data/sector10_clean.xlsx 
 60 | ```
 61 | 
 62 | 
 63 | + __Old R Script__: 3 R Scripts
 64 |   + [fundamental_run_model.R](code/old_Rcode/fundamental_run_model.R): The main function to run the forecasting model
 65 |   + [fundamental_ML_model.R](code/old_Rcode/fundamental_ML_model.R): The forecasting function (cornerstone of this project)
 66 |   + [fundamental_select_stock.R](code/old_Rcode/fundamental_select_stock.R): The function to select top 20% stocks in each sector
 67 | + __Output__: [a CSV file](Data/2-portfolio_data/stocks_selected_total_user8.csv) includes __tic__: the stock name, __predicted_return__: predicted return of next quarter by our model, __trade_date__: the date to execute the trades
 68 | 
 69 | 
 70 | 
 71 | 
 72 | 
 73 | ### __Portfolio Allocation__:
 74 | 
 75 | + __Input__: 2 files
 76 |   + The [CSV file](Data/2-portfolio_data/stocks_selected_total_user8.csv) generated by forecasting model
 77 |   + The [adjusted close price data of S&P 500 stocks](Data/1-sp500_adj_price.csv.zip) to calculate covariance matrix
 78 | 
 79 | + __Script__: [fundamental_portfolio.ipynb](fundamental_portfolio.ipynb)
 80 | 
 81 | + __Output__: 3 Excel files each with the following 4 columns
 82 |   1. __tic__: the stock name
 83 |   2. __predicted_return__: predicted return of next quarter by our model
 84 |   3. __weights__: the weights to trade
 85 |   4. __trade_date__: the date to execute the trades
 86 | 
 87 | 
 88 | 
 89 | ### __Back-testing Model__:
 90 | 
 91 | + __Input__: 5 files
 92 |   + [equally_weighted](Data/2-portfolio_data/equally_weighted_user8.xlsx): equally-weighted portfolio (Portfolio Benchmark)
 93 |   + [mean_weighted](Data/2-portfolio_data/mean_weighted_user8.xlsx): mean-variance portfolio
 94 |   + [minimum_weighted](Data/2-portfolio_data/minimum_weighted_user8.xlsx): minimum-variance portfolio (our model)
 95 |   + [adjusted daily close price of S&P 500 stocks](Data/1-sp500_adj_price.csv.zip): to calculate quarterly return
 96 |   + [SPX adjusted daily close price](Data/1-spx_price.xlsx): The Market Index (Overall Benchmark)
 97 | 
 98 | + __Script__: 1 Python Jupyter notebook
 99 |   + [fundamental_back_testing.ipynb](fundamental_back_testing.ipynb): The back-testing function
100 | 
101 | + __Output__:
102 |   1. Quarterly return of our portfolio with transaction cost
103 |   2. Performance Evaluation: total return, annualized return and standard deviation, maximum drawdown, Sharpe ratio
104 | 


--------------------------------------------------------------------------------
/code/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/code/.DS_Store


--------------------------------------------------------------------------------
/code/ml_model.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | import traceback
  4 | 
  5 | from sklearn.model_selection import train_test_split
  6 | from sklearn.svm import SVC
  7 | from sklearn.svm import SVR
  8 | from sklearn.metrics import confusion_matrix
  9 | from sklearn.model_selection import cross_val_score, cross_val_predict
 10 | from sklearn.linear_model import Ridge
 11 | 
 12 | from sklearn.linear_model import LinearRegression
 13 | from sklearn.feature_selection import RFE
 14 | from sklearn.linear_model import Lasso
 15 | from sklearn.ensemble import RandomForestRegressor
 16 | from sklearn.ensemble import GradientBoostingRegressor
 17 | from sklearn.ensemble import AdaBoostRegressor
 18 | 
 19 | from sklearn.model_selection import TimeSeriesSplit, GridSearchCV,RandomizedSearchCV
 20 | 
 21 | from keras.models import Sequential
 22 | from keras.layers import Dense
 23 | from keras.layers import LSTM
 24 | from keras.layers import Dropout
 25 | 
 26 | import os
 27 | import errno
 28 | 
 29 | 
 30 | def prepare_rolling_train(df,features_column,label_column,date_column,unique_datetime,testing_windows,first_trade_date_index, max_rolling_window_index,current_index):
 31 |     if current_index <=max_rolling_window_index:
 32 |         train=df[(df[date_column] >= unique_datetime[0]) \
 33 |                 & (df[date_column] < unique_datetime[current_index-testing_windows])]
 34 |     else:
 35 |         train=df[(df[date_column] >= unique_datetime[current_index-max_rolling_window_index]) \
 36 |                 & (df[date_column] < unique_datetime[current_index-testing_windows])]
 37 |         
 38 |     X_train=train[features_column]
 39 |     y_train=train[label_column]
 40 |     return X_train,y_train
 41 | 
 42 | def prepare_rolling_test(df,features_column,label_column,date_column,unique_datetime,testing_windows,fist_trade_date_index, current_index):
 43 |     test=df[(df[date_column] >= unique_datetime[current_index-testing_windows]) \
 44 |             & (df[date_column] < unique_datetime[current_index])]
 45 |     X_test=test[features_column]
 46 |     y_test=test[label_column]
 47 |     return X_test,y_test
 48 | 
 49 | def prepare_trade_data(df,features_column,label_column,date_column,tic_column,unique_datetime,testing_windows,fist_trade_date_index, current_index):
 50 |     trade  = df[df[date_column] == unique_datetime[current_index]]
 51 |     X_trade = trade[features_column]
 52 |     y_trade = trade[label_column]
 53 |     trade_tic = trade[tic_column].values
 54 |     return X_trade,y_trade,trade_tic
 55 | 
 56 | 
 57 | def train_linear_regression(X_train,y_train):
 58 | 
 59 |     lr_regressor = LinearRegression()
 60 |     model = lr_regressor.fit(X_train, y_train)
 61 |     
 62 |     return model
 63 | 
 64 | def train_recursive_feature_elimination(X_train,y_train):
 65 | 
 66 |     lr_regressor = LinearRegression(random_state = 42)
 67 |     model = RFE(lr_regressor)
 68 |     
 69 |     return model
 70 | 
 71 | def train_lasso(X_train, y_train):
 72 |     # lasso_regressor = Lasso()
 73 |     # model = lasso_regressor.fit(X_train, y_train)
 74 | 
 75 |     lasso = Lasso(random_state = 42)
 76 |     # scoring_method = 'r2'
 77 |     # scoring_method = 'explained_variance'
 78 |     # scoring_method = 'neg_mean_absolute_error'
 79 |     scoring_method = 'neg_mean_squared_error'
 80 |     #scoring_method = 'neg_mean_squared_log_error'
 81 |     parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}
 82 |     # my_cv_lasso = TimeSeriesSplit(n_splits=3).split(X_train_advanced)
 83 |     lasso_regressor = GridSearchCV(lasso, parameters, scoring=scoring_method, cv=3)
 84 |     lasso_regressor.fit(X_train, y_train)
 85 | 
 86 |     model = lasso_regressor.best_estimator_
 87 |     return model
 88 | 
 89 | def train_ridge(X_train, y_train):
 90 |     # lasso_regressor = Lasso()
 91 |     # model = lasso_regressor.fit(X_train, y_train)
 92 | 
 93 |     ridge = Ridge(random_state = 42)
 94 |     # scoring_method = 'r2'
 95 |     # scoring_method = 'explained_variance'
 96 |     # scoring_method = 'neg_mean_absolute_error'
 97 |     scoring_method = 'neg_mean_squared_error'
 98 |     #scoring_method = 'neg_mean_squared_log_error'
 99 |     parameters = {'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]}
100 |     # my_cv_lasso = TimeSeriesSplit(n_splits=3).split(X_train_advanced)
101 |     ridge_regressor = GridSearchCV(ridge, parameters, scoring=scoring_method, cv=3)
102 |     ridge_regressor.fit(X_train, y_train)
103 | 
104 |     model = ridge_regressor.best_estimator_
105 |     return model
106 | 
def train_random_forest(X_train, y_train):
    """Tune and fit a random-forest regressor.

    Exhaustively searches the hyper-parameter grid below with ``GridSearchCV``
    (``cv=3``, ``neg_mean_squared_error`` scoring, ``n_jobs=-1``) and returns
    the search's ``best_estimator_``.
    """
    hyperparameter_grid = {
        'max_features': ['sqrt'],
        # Float values: sklearn interprets these as fractions of the samples.
        'min_samples_leaf': [0.05, 0.1, 0.2],
        'min_samples_split': np.linspace(0.1, 1, 10, endpoint=True),
        'n_estimators': [75, 100, 200],
    }

    search = GridSearchCV(
        estimator=RandomForestRegressor(random_state=42),
        param_grid=hyperparameter_grid,
        scoring='neg_mean_squared_error',
        cv=3,
        n_jobs=-1,
        verbose=0,
    )
    search.fit(X_train, y_train)

    return search.best_estimator_
149 | 
150 | 
def train_svm(X_train, y_train):
    """Tune and fit an RBF-kernel support-vector regressor.

    Searches ``C`` and ``gamma`` with ``GridSearchCV`` (``cv=3``,
    ``neg_mean_squared_error`` scoring, ``n_jobs=-1``) and returns the
    search's ``best_estimator_``.
    """
    search_space = {
        'C': [0.001, 0.1, 1],
        'gamma': [1e-7, 0.1],
    }

    search = GridSearchCV(
        estimator=SVR(kernel='rbf'),
        param_grid=search_space,
        scoring='neg_mean_squared_error',
        cv=3,
        n_jobs=-1,
        verbose=0,
    )
    search.fit(X_train, y_train)

    return search.best_estimator_
172 | 
173 | 
def train_gbm(X_train, y_train):
    """Tune and fit a gradient-boosting regressor.

    Searches ``learning_rate`` and ``n_estimators`` with ``GridSearchCV``
    (``cv=3``, ``neg_mean_squared_error`` scoring, ``n_jobs=-1``) and returns
    the search's ``best_estimator_``.
    """
    search_space = {
        'learning_rate': [0.1, 0.01, 0.001],
        'n_estimators': [100, 250, 500, 1000],
    }

    search = GridSearchCV(
        estimator=GradientBoostingRegressor(random_state=42),
        param_grid=search_space,
        scoring='neg_mean_squared_error',
        cv=3,
        n_jobs=-1,
        verbose=0,
    )
    search.fit(X_train, y_train)

    return search.best_estimator_
195 | 
196 | 
197 | 
198 | 
def train_ada(X_train, y_train):
    """Tune and fit an AdaBoost regressor.

    Searches ``n_estimators`` and ``learning_rate`` with ``GridSearchCV``
    (``cv=3``, ``neg_mean_squared_error`` scoring, ``n_jobs=-1``) and returns
    the search's ``best_estimator_``.

    Parameters
    ----------
    X_train : training features.
    y_train : training labels.

    Returns
    -------
    The best AdaBoostRegressor found by the grid search.
    """
    # Fixed seed for reproducible boosting, matching the other seeded
    # estimators in this module.
    ada = AdaBoostRegressor(random_state=42)

    param_grid_ada = {'n_estimators': [20, 100],
                      'learning_rate': [0.01, 0.05, 1]}

    # Previously every assignment of ``scoring_method`` was commented out,
    # which made the GridSearchCV call fail with NameError. MSE is used to
    # stay consistent with the other train_* helpers.
    scoring_method = 'neg_mean_squared_error'

    # GridSearchCV takes ``param_grid``; ``param_distributions`` belongs to
    # RandomizedSearchCV and raises TypeError here.
    ada_regressor = GridSearchCV(estimator=ada, param_grid=param_grid_ada,
                                 cv=3, n_jobs=-1, scoring=scoring_method, verbose=0)

    ada_regressor.fit(X_train, y_train)
    model = ada_regressor.best_estimator_
    return model
222 | 
223 | 
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Parameters
    ----------
    model : object exposing ``predict(X)``.
    X_test : features to predict on.
    y_test : true labels for ``X_test``.

    Returns
    -------
    The mean squared error between ``y_test`` and ``model.predict(X_test)``.
    """
    from sklearn.metrics import mean_squared_error

    y_predict = model.predict(X_test)

    # Only MSE is returned to callers; the MAE, explained-variance and R^2
    # values that used to be computed here were never used, so they are no
    # longer calculated.
    mse = mean_squared_error(y_test, y_predict)

    return mse
243 | 
244 | 
def append_return_table(df_predict, unique_datetime, y_trade_return, trade_tic, current_index):
    """Write one period's predicted returns into the prediction table in place.

    Parameters
    ----------
    df_predict : DataFrame indexed by date with one column per ticker;
        modified in place.
    unique_datetime : indexable sequence of dates.
    y_trade_return : predicted values, one per entry of ``trade_tic``.
    trade_tic : ticker labels matching ``y_trade_return``.
    current_index : position in ``unique_datetime`` whose row is updated.

    Raises
    ------
    KeyError
        If ``unique_datetime[current_index]`` is not in ``df_predict.index``.

    Returns
    -------
    None
    """
    row_label = unique_datetime[current_index]

    # A ``.loc`` assignment with an unknown row label would silently add a new
    # row; keep the original contract of failing on an unknown date.
    if row_label not in df_predict.index:
        raise KeyError(row_label)

    # Building the Series validates that values and tickers have equal length.
    predicted = pd.Series(y_trade_return, index=trade_tic)

    # Single ``.loc[row, cols]`` assignment: the earlier chained form
    # ``df.loc[row][cols] = ...`` may write to a temporary copy, and it relied
    # on ``DataFrame.append``, which was removed in pandas 2.0.
    df_predict.loc[row_label, predicted.index] = predicted.values
249 | 
250 | 
251 | def run_4model(df,features_column, label_column,date_column,tic_column,
252 |               unique_ticker, unique_datetime, trade_date, 
253 |               first_trade_date_index=20,
254 |               testing_windows=4,
255 |               max_rolling_window_index=44):
256 |     ## initialize all the result tables
257 |     ## need date as index and unique tic name as columns
258 |     df_predict_lr = pd.DataFrame(columns=unique_ticker, index=trade_date)
259 |     df_predict_rf = pd.DataFrame(columns=unique_ticker, index=trade_date)
260 |     df_predict_ridge = pd.DataFrame(columns=unique_ticker, index=trade_date)
261 |     df_predict_gbm = pd.DataFrame(columns=unique_ticker, index=trade_date)
262 | 
263 |     df_predict_best = pd.DataFrame(columns=unique_ticker, index=trade_date)
264 |     df_best_model_name = pd.DataFrame(columns=['model_name'], index=trade_date)
265 |     evaluation_record = {}
266 |     # first trade date is 1995-06-01
267 |     # fist_trade_date_index = 20
268 |     # testing_windows = 6
269 | 
270 |     for i in range(first_trade_date_index, len(unique_datetime)):
271 |         try:
272 |             # prepare training data
273 |             X_train, y_train = prepare_rolling_train(df, 
274 |                                                      features_column,
275 |                                                      label_column,
276 |                                                      date_column, 
277 |                                                      unique_datetime, 
278 |                                                      testing_windows, 
279 |                                                      first_trade_date_index, 
280 |                                                      max_rolling_window_index,
281 |                                                      current_index=i
282 |                                                      )
283 | 
284 |             # prepare testing data
285 |             X_test, y_test = prepare_rolling_test(df, 
286 |                                                   features_column,
287 |                                                   label_column,
288 |                                                   date_column, 
289 |                                                   unique_datetime, 
290 |                                                   testing_windows, 
291 |                                                   first_trade_date_index,
292 |                                                   current_index=i)
293 | 
294 |             # prepare trade data
295 |             X_trade, y_trade, trade_tic = prepare_trade_data(df,
296 |                                                              features_column,
297 |                                                              label_column,
298 |                                                              date_column,
299 |                                                              tic_column, 
300 |                                                              unique_datetime, 
301 |                                                              testing_windows, 
302 |                                                              first_trade_date_index, 
303 |                                                              current_index=i)
304 | 
305 |             # Training
306 |             lr_model = train_linear_regression(X_train, y_train)
307 |             rf_model = train_random_forest(X_train, y_train)
308 |             ridge_model = train_ridge(X_train, y_train)
309 |             gbm_model = train_gbm(X_train, y_train)
310 | 
311 | 
312 |             # Validation
313 |             lr_eval = evaluate_model(lr_model, X_test, y_test)
314 |             rf_eval = evaluate_model(rf_model, X_test, y_test)
315 |             ridge_eval = evaluate_model(ridge_model, X_test, y_test)
316 |             gbm_eval = evaluate_model(gbm_model, X_test, y_test)
317 | 
318 |             # Trading
319 |             y_trade_lr = lr_model.predict(X_trade)
320 |             y_trade_rf = rf_model.predict(X_trade)
321 |             y_trade_ridge = ridge_model.predict(X_trade)
322 |             y_trade_gbm  = gbm_model.predict(X_trade)
323 | 
324 | 
325 |             # Decide the best model
326 |             eval_data = [[lr_eval, y_trade_lr], 
327 |                          [rf_eval, y_trade_rf] ,
328 |                          [ridge_eval, y_trade_ridge],
329 |                          [gbm_eval, y_trade_gbm]
330 |                                 ]
331 |             eval_table = pd.DataFrame(eval_data, columns=['model_eval', 'model_predict_return'],
332 |                                               index=['lr', 'rf','ridge','gbm'])        
333 | 
334 | 
335 |             evaluation_record[unique_datetime[i]]=eval_table
336 | 
337 |             # lowest error score model
338 |             y_trade_best = eval_table.model_predict_return.values[eval_table.model_eval == eval_table.model_eval.min()][0]
339 |             best_model_name = eval_table.index.values[eval_table.model_eval == eval_table.model_eval.min()][0]
340 | 
341 |             # Highest Explained Variance
342 |             # y_trade_best = eval_table.model_predict_return.values[eval_table.model_eval==eval_table.model_eval.max()][0]
343 |             # best_model_name = eval_table.index.values[eval_table.model_eval==eval_table.model_eval.max()][0]
344 | 
345 |             df_best_model_name.loc[unique_datetime[i]] = best_model_name
346 | 
347 |             # Prepare Predicted Return table
348 |             append_return_table(df_predict_lr, unique_datetime, y_trade_lr, trade_tic, current_index=i)
349 |             append_return_table(df_predict_rf, unique_datetime, y_trade_rf, trade_tic, current_index=i)
350 |             append_return_table(df_predict_ridge, unique_datetime, y_trade_ridge, trade_tic, current_index=i)
351 |             append_return_table(df_predict_gbm, unique_datetime, y_trade_gbm, trade_tic, current_index=i)
352 | 
353 |             append_return_table(df_predict_best, unique_datetime, y_trade_best, trade_tic, current_index=i)
354 | 
355 |             print('Trade Date: ', unique_datetime[i])
356 | 
357 |         except Exception:
358 |             traceback.print_exc()
359 |     df_evaluation = get_model_evaluation_table(evaluation_record,trade_date)
360 |     return (df_predict_lr, 
361 |             df_predict_rf, 
362 |             df_predict_ridge, 
363 |             df_predict_gbm,
364 |             df_predict_best,
365 |             df_best_model_name, 
366 |             evaluation_record,
367 |             df_evaluation)
368 | 
369 | 
def get_model_evaluation_table(evaluation_record,trade_date):
    """Collect the per-date model evaluation scores into a single table.

    Parameters
    ----------
    evaluation_record : dict
        Maps a trade date to a table with a ``'model_eval'`` column. The four
        values of that column are written, in order, to the
        ``linear_regression``, ``random_forest``, ``ridge`` and ``gbm``
        columns of the result.
    trade_date : sequence
        Dates to report; they become the index of the result, in this order.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``trade_date``. A date whose record is missing or
        unreadable gets a row of NaN, so the rows always line up with
        ``trade_date``.
    """
    model_columns = ['linear_regression', 'random_forest', 'ridge', 'gbm']
    missing_row = [float('nan')] * len(model_columns)

    evaluation_list = []
    for d in trade_date:
        try:
            evaluation_list.append(evaluation_record[d]['model_eval'].values)
        except Exception:
            # Keep the best-effort behaviour, but still emit a placeholder row:
            # silently dropping the date made the row count differ from
            # len(trade_date) and the index assignment below raised.
            print('error')
            evaluation_list.append(missing_row)

    df_evaluation = pd.DataFrame(evaluation_list, columns=model_columns)
    df_evaluation.index = trade_date
    return df_evaluation
380 | 
def save_model_result(sector_result,sector_name):
    """Write one sector's result tables to CSV files under ``results/<sector_name>/``.

    Parameters
    ----------
    sector_result : sequence
        Positions 0-4 are the prediction tables (lr, rf, ridge, gbm, best) and
        are cast to float64 before being written. Position 5 is the best model
        name table and position 7 the model score table. Position 6 is read
        but not written.
    sector_name : str
        Name of the sub-directory of ``results/`` that receives the files.
    """
    base = 'results/'+sector_name

    # Resolve every table before touching the file system, in positional order.
    prediction_suffixes = ['/df_predict_lr.csv',
                           '/df_predict_rf.csv',
                           '/df_predict_ridge.csv',
                           '/df_predict_gbm.csv',
                           '/df_predict_best.csv']
    outputs = []
    for position, suffix in enumerate(prediction_suffixes):
        outputs.append((suffix, sector_result[position].astype(np.float64)))
    outputs.append(('/df_best_model_name.csv', sector_result[5]))
    # Evaluation record: looked up like the others, but there is no CSV for it.
    df_evaluation_score = sector_result[6]
    outputs.append(('/df_model_score.csv', sector_result[7]))

    # Create the output directory; tolerate it appearing concurrently.
    target_dir = os.path.dirname(base+'/')
    if not os.path.exists(target_dir):
        try:
            os.makedirs(target_dir)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise

    for suffix, table in outputs:
        table.to_csv(base+suffix)
411 | 
412 | 
413 | 
def calculate_sector_daily_return(daily_price, unique_ticker,trade_date):
    """Build a date x ticker table of day-over-day percentage price changes.

    Parameters
    ----------
    daily_price : pandas.DataFrame
        Long-format prices with ``datadate``, ``tic`` and ``adj_price`` columns.
    unique_ticker : list-like
        Tickers (columns of the pivoted table) to keep.
    trade_date : sequence
        Only its first element is used: rows dated before it are dropped.

    Returns
    -------
    pandas.DataFrame
        Percentage changes indexed by ``datadate`` with one column per ticker.
    """
    # Wide layout: one row per date, one column per ticker (mean on duplicates).
    wide_prices = daily_price.pivot_table(values='adj_price',
                                          index=['datadate'],
                                          columns=['tic'],
                                          aggfunc=np.mean)
    sector_prices = wide_prices[unique_ticker]

    returns = sector_prices.pct_change()
    from_first_trade = returns.index >= trade_date[0]
    return returns[from_first_trade]
422 | 
def calculate_sector_quarterly_return(daily_price, unique_ticker,trade_date_plus1):
    """Build a date x ticker table of percentage price changes between trade dates.

    Parameters
    ----------
    daily_price : pandas.DataFrame
        Long-format prices with ``datadate``, ``tic`` and ``adj_price`` columns.
    unique_ticker : list-like
        Tickers (columns of the pivoted table) to keep.
    trade_date_plus1 : sequence
        Dates at which prices are sampled; the change is computed from one
        sampled date to the next.

    Returns
    -------
    pandas.DataFrame
        Percentage changes for every sampled date later than
        ``trade_date_plus1[0]``, one column per ticker.
    """
    daily_price_pivot = pd.pivot_table(daily_price, values='adj_price', index=['datadate'],
                       columns=['tic'], aggfunc=np.mean)
    daily_price_pivot = daily_price_pivot[unique_ticker]

    # `DataFrame.ix` was removed in pandas 1.0. `reindex` performs the same
    # label-based row lookup and, like the old `.ix` with a list of labels,
    # yields a NaN row for a date that has no price instead of raising.
    quarterly_price_pivot = daily_price_pivot.reindex(trade_date_plus1)

    quarterly_return = quarterly_price_pivot.pct_change()
    # The first sampled date has no predecessor, so its row is dropped.
    quarterly_return = quarterly_return[quarterly_return.index > trade_date_plus1[0]]

    return quarterly_return
433 | 
def pick_stocks_based_on_quantiles_old(df_predict_best):
    """Split every row of predicted returns into four quartile buckets.

    Parameters
    ----------
    df_predict_best : pandas.DataFrame
        One row per date, one column per ticker, values are predicted returns.

    Returns
    -------
    tuple of dict
        ``(quantile_0_25, quantile_25_50, quantile_50_75, quantile_75_100)``.
        Each dict maps a row label to the sub-Series of that row falling in
        the bucket; every bucket includes its upper bound and excludes its
        lower bound, except the first, which takes everything <= the 25% cut.
    """
    bottom_bucket = {}
    lower_mid_bucket = {}
    upper_mid_bucket = {}
    top_bucket = {}

    for position, label in enumerate(df_predict_best.index):
        row = df_predict_best.iloc[position]

        cut_25 = row.quantile(0.25)
        cut_50 = row.quantile(0.5)
        cut_75 = row.quantile(0.75)

        above_25 = row > cut_25
        above_50 = row > cut_50

        bottom_bucket[label] = row[row <= cut_25]
        lower_mid_bucket[label] = row[above_25 & (row <= cut_50)]
        upper_mid_bucket[label] = row[above_50 & (row <= cut_75)]
        top_bucket[label] = row[row > cut_75]

    return (bottom_bucket, lower_mid_bucket, upper_mid_bucket, top_bucket)
455 | 
def pick_stocks_based_on_quantiles(df_predict_best):
    """Pick the bottom-30% and top-30% predicted returns of every row.

    Parameters
    ----------
    df_predict_best : pandas.DataFrame
        One row per date, one column per ticker, values are predicted returns.

    Returns
    -------
    tuple of dict
        ``(quantile_0_30, quantile_70_100)``. Each dict maps a row label to
        the sub-Series of that row at or below the 30% quantile, respectively
        at or above the 70% quantile.
    """
    low_bucket = {}
    high_bucket = {}

    for position, label in enumerate(df_predict_best.index):
        row = df_predict_best.iloc[position]
        low_cut = row.quantile(0.3)
        high_cut = row.quantile(0.7)

        low_bucket[label] = row[row <= low_cut]
        high_bucket[label] = row[row >= high_cut]

    return (low_bucket, high_bucket)
472 | 
def calculate_portfolio_return(daily_return,trade_date_plus1,long_dict,frequency_date):
    """Compute the daily return of an equally weighted long portfolio.

    For each pair of consecutive dates in ``trade_date_plus1`` the daily
    returns in ``[trade_date_plus1[i], trade_date_plus1[i + 1])`` are averaged
    across the tickers selected for ``trade_date_plus1[i]``.

    Parameters
    ----------
    daily_return : pandas.DataFrame
        Daily returns indexed by date with one column per ticker.
    trade_date_plus1 : sequence
        Rebalancing dates; the last one only closes the final period.
    long_dict : dict
        Maps a rebalancing date to a Series whose index holds the selected
        tickers.
    frequency_date
        Not used by this function; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Indexed by date. The computed values are in ``daily_return``. A
        ``portfolio_return`` column that is never filled is kept as well,
        because the original implementation returned both columns.
        NOTE(review): the empty column looks unintended - confirm with callers
        before dropping it.

    Raises
    ------
    KeyError
        If a rebalancing date is missing from ``long_dict`` or a selected
        ticker is missing from ``daily_return``.
    """
    period_frames = []

    for i in range(len(trade_date_plus1) - 1):
        # Rows of the holding period: start inclusive, end exclusive.
        in_period = (daily_return.index >= trade_date_plus1[i]) & \
                    (daily_return.index < trade_date_plus1[i + 1])
        long_tic_return_daily = daily_return[in_period][long_dict[trade_date_plus1[i]].index]

        # Equal weighting: plain mean across the selected tickers.
        period_frames.append(long_tic_return_daily.mean(axis=1).to_frame(name='daily_return'))

    if not period_frames:
        return pd.DataFrame(columns=['portfolio_return'])

    # DataFrame.append was removed in pandas 2.0; one concat is also cheaper
    # than growing the frame inside the loop.
    df_portfolio_return = pd.concat(period_frames)
    return df_portfolio_return.reindex(columns=['portfolio_return', 'daily_return'])
491 | 
def calculate_portfolio_quarterly_return(quarterly_return,trade_date_plus1,long_dict):
    """Compute the per-period return of an equally weighted long portfolio.

    The tickers selected for ``trade_date_plus1[i]`` are evaluated on the
    return row dated ``trade_date_plus1[i + 1]`` and averaged.

    Parameters
    ----------
    quarterly_return : pandas.DataFrame
        Period returns indexed by date with one column per ticker.
    trade_date_plus1 : sequence
        Rebalancing dates.
    long_dict : dict
        Maps a rebalancing date to a Series whose index holds the selected
        tickers.

    Returns
    -------
    pandas.DataFrame
        Single ``portfolio_return`` column indexed by the period end dates
        found in ``quarterly_return``.

    Raises
    ------
    KeyError
        If a rebalancing date is missing from ``long_dict`` or a selected
        ticker is missing from ``quarterly_return``.
    """
    period_frames = []

    for i in range(len(trade_date_plus1) - 1):
        # Return realised at the end of the period started at trade_date_plus1[i].
        at_period_end = quarterly_return.index == trade_date_plus1[i + 1]
        long_tic_return = quarterly_return[at_period_end][long_dict[trade_date_plus1[i]].index]

        # Equal weighting: plain mean across the selected tickers.
        period_frames.append(long_tic_return.mean(axis=1).to_frame(name='portfolio_return'))

    if not period_frames:
        return pd.DataFrame(columns=['portfolio_return'])

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(period_frames)
507 | 
def long_only_strategy_daily(df_predict_return, daily_return, trade_month_plus1, top_quantile_threshold=0.75):
    """Daily returns of an equally weighted long-only portfolio of top predictions.

    For every row of ``df_predict_return`` the tickers whose predicted return
    is at or above the ``top_quantile_threshold`` quantile of that row are
    selected. Each selection is then held from ``trade_month_plus1[i]``
    (inclusive) to ``trade_month_plus1[i + 1]`` (exclusive) with equal weights.

    Parameters
    ----------
    df_predict_return : pandas.DataFrame
        Predicted returns, one row per rebalancing date, one column per ticker.
    daily_return : pandas.DataFrame
        Daily returns indexed by date with one column per ticker.
    trade_month_plus1 : sequence
        Rebalancing dates; each must be a row label of ``df_predict_return``
        (except the last one, which only closes the final period).
    top_quantile_threshold : float, default 0.75
        Row quantile at or above which a ticker is selected.

    Returns
    -------
    pandas.DataFrame
        Single ``daily_return`` column indexed by date.

    Raises
    ------
    KeyError
        If a rebalancing date has no row in ``df_predict_return``.
    ZeroDivisionError
        If no ticker is selected for a rebalancing date (for example when the
        whole prediction row is NaN).
    """
    long_dict = {}
    for i in range(df_predict_return.shape[0]):
        predicted = df_predict_return.iloc[i]
        top_q = predicted.quantile(top_quantile_threshold)
        # Keep every ticker at or above the quantile cut.
        long_dict[df_predict_return.index[i]] = predicted[predicted >= top_q]

    period_frames = []
    for i in range(len(trade_month_plus1) - 1):
        selected = long_dict[trade_month_plus1[i]]

        # Equal weights. The float literal keeps true division even when the
        # module runs under Python 2, where 1/n would truncate to 0.
        long_normalize_weight = 1.0 / selected.shape[0]

        # Rows of the holding period: start inclusive, end exclusive.
        in_period = (daily_return.index >= trade_month_plus1[i]) & \
                    (daily_return.index < trade_month_plus1[i + 1])
        long_tic_return_daily = daily_return[in_period][selected.index]

        # return * weight, summed across the selected tickers
        long_daily_return = long_tic_return_daily * long_normalize_weight
        period_frames.append(long_daily_return.sum(axis=1).to_frame(name='daily_return'))

    if not period_frames:
        return pd.DataFrame(columns=['daily_return'])

    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(period_frames)
545 | 
546 | 
def long_only_strategy_monthly(df_predict_return, tic_monthly_return, trade_month, top_quantile_threshold=0.7):
    """Per-period return of a long-only portfolio weighted by predicted return.

    For every row of ``df_predict_return`` the tickers at or above the
    ``top_quantile_threshold`` quantile of that row are selected. Each selected
    ticker is weighted by its predicted return divided by the sum of the
    selected predicted returns, and the weighted realised returns of the same
    date are summed.

    Parameters
    ----------
    df_predict_return : pandas.DataFrame
        Predicted returns, one row per trade date, one column per ticker.
    tic_monthly_return : pandas.DataFrame
        Realised returns indexed by trade date with one column per ticker.
    trade_month : sequence
        Trade dates to evaluate; each must be a row label of
        ``df_predict_return``.
    top_quantile_threshold : float, default 0.7
        Row quantile at or above which a ticker is selected.

    Returns
    -------
    pandas.Series
        Named ``monthly_return``, indexed by ``trade_month``.
    """
    # Step 1: selection per prediction row.
    selected_by_date = {}
    for position, label in enumerate(df_predict_return.index):
        predicted = df_predict_return.iloc[position]
        cutoff = predicted.quantile(top_quantile_threshold)
        selected_by_date[label] = predicted[predicted >= cutoff]

    # Step 2: prediction-weighted realised return per trade date.
    portfolio_return_dic = {}
    for k in range(len(trade_month)):
        month = trade_month[k]
        picks = selected_by_date[month]

        weights = picks / sum(picks.values)
        realised = tic_monthly_return[tic_monthly_return.index == month][picks.index]
        weighted = realised * weights
        portfolio_return_dic[month] = weighted.values.sum()

    # Step 3: reshape the dict into a Series keyed by trade date.
    result = pd.DataFrame.from_dict(portfolio_return_dic, orient='index').reset_index()
    result.columns = ['trade_month', 'monthly_return']
    return result.set_index('trade_month')['monthly_return']
583 | 
584 | 
585 | 
586 | 
587 | 
def plot_predict_return_distribution(df_predict_best,sector_name,out_path):
    """Save one histogram of predicted returns per row of ``df_predict_best``.

    Parameters
    ----------
    df_predict_best : pandas.DataFrame
        Predicted returns, one row per trade date.
    sector_name : str
        Prefix of every figure title.
    out_path : str
        Prefix of every output file name; ``str(<row label>) + ".png"`` is
        appended to it as is, so include a trailing separator when it is a
        directory.
    """
    import matplotlib.pyplot as plt

    for i in range(df_predict_best.shape[0]):
        fig=plt.figure(figsize=(8,5))
        df_predict_best.iloc[i].hist()
        plt.xlabel("predicted return",size=15)
        plt.ylabel("frequency",size=15)

        plt.title(sector_name+": trade date - "+str(df_predict_best.index[i]),size=15)
        # Save inside the loop: saving after it wrote only the last date's
        # figure and raised NameError for an empty table.
        fig.savefig(out_path+str(df_predict_best.index[i])+".png")
        # Release the figure so that long tables do not accumulate open figures.
        plt.close(fig)
599 | 
600 | 
601 | 
602 | 
603 | 
604 | 
605 | 
606 | 
607 | 
608 | 
609 | 


--------------------------------------------------------------------------------
/code/old_Rcode/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/code/old_Rcode/.DS_Store


--------------------------------------------------------------------------------
/code/old_Rcode/fundamental_ML_model.R:
--------------------------------------------------------------------------------
  1 | fundamental_ML_model <- function(sector_data,trade_date){
  2 |   #######################################################
  3 |   #1. model test error to select models
  4 |   #2. trade period predicted return to select stocks
  5 |   #3. linear regression features
  6 |   #4. random forest features
  7 |   #5. ridge features
  8 |   #6. stepwise regression features
  9 |   #7. gbm features
 10 |   #sector_data=sector45_data
 11 |   
 12 |   #look at the data determine the first factor column number
 13 |   start_column=12
 14 | 
 15 |   #set the rows to 89, because we have 89 stock selections  
 16 |   #may need to adjust and put into function
 17 |   
 18 |   #model error to select model
 19 |   model_error=data.frame(MSE_linear=replicate(89,0))
 20 |   model_error[,2]=data.frame(MSE_RF=replicate(89,0))
 21 |   model_error[,3]=data.frame(MSE_ridge=replicate(89,0))
 22 |   model_error[,4]=data.frame(MSE_step=replicate(89,0))
 23 |   model_error[,5]=data.frame(MSE_gbm=replicate(89,0))
 24 |   
 25 |   #predicte return to select stocks
 26 |   predicted_return=list()
 27 |   
 28 |   
 29 |   
 30 |   #main model
 31 |   LR_features=list()
 32 |   RF_features=list()
 33 |   ridge_features=list()
 34 |   
 35 |   Step_features=list()
 36 |   GBM_features=list()
 37 |   
 38 |   #for(i in 1:(length(trade_date)-19)){RF_features[[i]]=c(1:i)}
 39 |   
 40 |   #understand rolling windows
 41 |   #for(i in 1:(length(trade_date)-19)){print(c(i,i+15,i+16,i+19,trade_date[i+20]))}
 42 |   
 43 |   for(i in 1:(length(trade_date)-21)){
 44 |     
 45 |     ###############################################
 46 |     ###########rolling window########################
 47 |     
 48 |     ####train the model based on 4 years, 16 quarters data
 49 |     #growing window 10 years
 50 |     if (i<=25) {
 51 |       data_train=sector_data[(sector_data$tradedate <= trade_date[i+15]),]
 52 |       train_x=data_train[,c(start_column:(dim(sector_data)[2]-1))]
 53 |       train_y=data_train[,dim(sector_data)[2]]
 54 |     } else{
 55 |       data_train=sector_data[(sector_data$tradedate <= trade_date[i+15]) & sector_data$tradedate >= trade_date[i-25],]
 56 |       train_x=data_train[,c(start_column:(dim(sector_data)[2]-1))]
 57 |       train_y=data_train[,dim(sector_data)[2]]
 58 |     }
 59 |     
 60 |     ####test the model based on 1 year, 4 quarters data
 61 |     data_test=sector_data[(sector_data$tradedate <= trade_date[i+19]) & (sector_data$tradedate >= trade_date[i+16]),]
 62 |     test_x=data_test[,c(start_column:(dim(sector_data)[2]-1))]
 63 |     test_y=data_test[,dim(sector_data)[2]]
 64 |     
 65 |     train=cbind(train_y,train_x)
 66 |     test=cbind(test_y,test_x)
 67 |     
 68 |     
 69 |     ####trade data for every quarter
 70 |     data_trade=sector_data[(sector_data$tradedate == trade_date[i+20]),]
 71 |     trade_x=data_trade[,c(start_column:(dim(sector_data)[2]-1))]
 72 |     trade_y=data_trade[,dim(sector_data)[2]]
 73 |     trade=cbind(trade_x,trade_y)
 74 |     
 75 |     row.names(trade_x)=data_trade$tic
 76 |     
 77 |     ###########################################
 78 |     ##############linear regression############
 79 |     ###########################################
 80 |     linear_model=lm(y_return~., data=train)
 81 |     linear_pre_y=predict(linear_model,test_x)
 82 |     MSE_linear=mean((test_y-linear_pre_y)^2,na.rm=TRUE)
 83 |     #MSE_linear
 84 |     
 85 |     #LR features
 86 |     LR_features[[i]]=summary(linear_model)
 87 |     
 88 |     ###########################################
 89 |     ################Random Forest##############
 90 |     ###########################################
 91 |     # Tune using algorithm tools
 92 |     # Tunning the mtry
 93 |     bestmtry <- tuneRF(train[,-1],train[,1], stepFactor=1.5, improve=1e-5, ntree=500,trace=0,plot = FALSE)
 94 |     #plot(bestmtry,type = "l")
 95 |     bestmtry=data.frame(bestmtry)
 96 |     mytry_optimal=bestmtry$mtry[which.min(bestmtry$OOBError)]
 97 |     #mytry_optimal
 98 |     RF_Model=randomForest(y_return~.,data = train,ntree=500,mtry=mytry_optimal,importance=TRUE, na.rm = T,trace=0)
 99 |     
100 |     yhat_bag=predict(RF_Model,test_x)
101 |     MSE_RF=mean((yhat_bag-test_y)^2)
102 |     #MSE_RF
103 |     #importance table
104 |     #varImp(RF_Model)
105 |     #varImpPlot(RF_Model,main='Random Forest Importance Table')
106 |     
107 |     ########RF features
108 |     RF_features[[i]]=varImp(RF_Model)
109 |     
110 |     #####################################
111 |     ################ridge################
112 |     #####################################
113 |     x_train_ridge=model.matrix(y_return~., train)[,-1]
114 |     y_train_ridge=train$y_return
115 |     
116 |     x_test_ridge=model.matrix(y_return~.,test)[,-1]
117 |     y_test_ridge=test$y_return
118 |     
119 |     #tunning for lambda
120 |     #first run ridge on training set and pick the best lambda
121 |     cv.out_ridge=cv.glmnet(x_train_ridge,y_train_ridge,alpha=1)
122 |     bestlam_ridge=cv.out_ridge$lambda.min
123 |     
124 |     ridge_model=glmnet(x_train_ridge,y_train_ridge,alpha = 0,lambda = bestlam_ridge)
125 |     ridge_pred_y=predict(ridge_model, newx = x_test_ridge)
126 |     
127 |     MSE_ridge=mean((ridge_pred_y-y_test_ridge)^2,na.rm=TRUE)
128 |     
129 |     #ridge features
130 |     ridge_coeffs <- coef(ridge_model)
131 |     ridge_coef=data.frame(name = ridge_coeffs@Dimnames[[1]][ridge_coeffs@i + 1], coefficient = ridge_coeffs@x)
132 |     
133 |     ridge_features[[i]]=ridge_coef
134 |     
135 |     
136 |     
137 |     ###########################################
138 |     ##############stepwise regression##########
139 |     ###########################################
140 |     #based on linear regresion
141 |     step_model=stepAIC(linear_model, direction="both",trace = 0)
142 |     step_pre_y=predict(step_model,test_x)
143 |     
144 |     MSE_step=mean((test_y-step_pre_y)^2,na.rm=TRUE)
145 |     #MSE_step
146 |     
147 |     #step features
148 |     Step_features[[i]]=summary(step_model)
149 |     
150 |     
151 |     ###################################
152 |     ################GBM################
153 |     ###################################
154 |     #Generalized Boosted Regression Models
155 |     gbm_model=gbm(y_return~.,data = train,
156 |                   dist="gaussian",
157 |                   n.tree = 400,
158 |                   shrinkage=0.1, 
159 |                   cv.folds = 5)
160 |     
161 |     gbm_pred_y = predict(gbm_model, test, n.tree = 400, type = 'response')
162 |     MSE_gbm=mean((gbm_pred_y-test_y)^2,na.rm=TRUE)
163 |     #MSE_gbm
164 |     ########GBM features
165 |     GBM_features[[i]]=  summary(gbm_model,plot=FALSE)
166 |     
167 |     ######################################
168 |     #############get results#############
169 |     ######################################
170 | 
171 |     
172 |     
173 |     #####################################
174 |     #all model trade data
175 |     #trade using linear regression
176 |     trade_linear_y=predict(linear_model,trade_x)
177 |     #trade using random forest
178 |     trade_RF_y=predict(RF_Model,trade_x)
179 |     #trade using ridge
180 |     x_trade_ridge=model.matrix(y_return~.,trade)[,-1]
181 |     row.names(x_trade_ridge)=data_trade$tic
182 |     trade_ridge_y=predict(ridge_model,x_trade_ridge)
183 |     colnames(trade_ridge_y)=c('trade_ridge_y')
184 | 
185 |     #trade stepwise regression
186 |     trade_step_y=predict(step_model,trade_x)
187 |     #trade using GBM
188 |     trade_GBM_y=predict(gbm_model,trade_x)
189 |     
190 |     ###########store model error
191 |     if (length(unique(trade_linear_y))<length(trade_linear_y)*0.2){
192 |       MSE_linear=NA
193 |     }
194 |     
195 |     if(length(unique(trade_RF_y))<length(trade_RF_y)*0.2){
196 |       MSE_RF=NA
197 |     }
198 |     
199 |     if(length(unique(trade_ridge_y))<length(trade_ridge_y)*0.2){
200 |       MSE_ridge=NA
201 |     }
202 |     
203 |     
204 |     if(length(unique(trade_step_y))<length(trade_step_y)*0.2){
205 |       MSE_step=NA
206 |     }
207 |     
208 |     if(length(unique(trade_GBM_y))<length(trade_GBM_y)*0.2){
209 |       MSE_gbm=NA
210 |     }
211 |     
212 |     model_error[i,]=c(MSE_linear,MSE_RF,MSE_ridge,MSE_step,MSE_gbm)
213 |     
214 |     #store all the predicted returns
215 |     temp_return=cbind(trade_linear_y,trade_RF_y,trade_ridge_y,trade_step_y,trade_GBM_y)
216 |     predicted_return[[i]]=temp_return
217 |     
218 |   }
219 |   
220 |   output=list(model_error=model_error, 
221 |               predicted_return=predicted_return, 
222 |               
223 |               LR_features=LR_features, 
224 |               RF_features=RF_features, 
225 |               ridge_features=ridge_features,
226 |               
227 |               Step_features=Step_features, 
228 |               GBM_features=GBM_features
229 |               )
230 |   return(output)
231 |   
232 |   
233 |   
234 | }


--------------------------------------------------------------------------------
/code/old_Rcode/fundamental_run_model.R:
--------------------------------------------------------------------------------
  1 | 
  2 | 
  3 | #install and load package
  4 | packages.used=c("readxl", "MASS", "ggplot2", 
  5 |                 "glmnet", "ISLR", "tree",
  6 |                 "randomForest", "gbm", "e1071","caret")
  7 | 
  8 | # check packages that need to be installed.
  9 | packages.needed=setdiff(packages.used, 
 10 |                         intersect(installed.packages()[,1], 
 11 |                                   packages.used))
 12 | # install additional packages
 13 | if(length(packages.needed)>0){
 14 |   install.packages(packages.needed, dependencies = TRUE)
 15 | }
 16 | library(readxl)
 17 | library(MASS)
 18 | library(ggplot2)
 19 | library(glmnet)
 20 | library(ISLR)
 21 | library(tree)
 22 | library(randomForest)
 23 | library(gbm)
 24 | library(e1071)
 25 | library(caret)
 26 | 
 27 | source("fundamental_ML_model.R")
 28 | source("fundamental_select_stock.R")
 29 | 
 30 | ####################get data############################
 31 | fundamental_total=read_excel("fundamental_final_table.xlsx",1)
 32 | trade_date=unique(fundamental_total$tradedate)
 33 | trade_date=sort(trade_date)
 34 | 
 35 | sector10_data=read_excel("sector10_clean.xlsx",1)
 36 | dim(sector10_data)
 37 | 
 38 | sector15_data=read_excel("sector15_clean.xlsx",1)
 39 | dim(sector15_data)
 40 | 
 41 | 
 42 | sector20_data=read_excel("sector20_clean.xlsx",1)
 43 | dim(sector20_data)
 44 | 
 45 | sector25_data=read_excel("sector25_clean.xlsx",1)
 46 | dim(sector25_data)
 47 | 
 48 | 
 49 | sector30_data=read_excel("sector30_clean.xlsx",1)
 50 | dim(sector30_data)
 51 | 
 52 | sector35_data=read_excel("sector35_clean.xlsx",1)
 53 | dim(sector35_data)
 54 | 
 55 | sector40_data=read_excel("sector40_clean.xlsx",1)
 56 | dim(sector40_data)
 57 | 
 58 | sector45_data=read_excel("sector45_clean.xlsx",1)
 59 | dim(sector45_data)
 60 | 
 61 | sector50_data=read_excel("sector50_clean.xlsx",1)
 62 | dim(sector50_data)
 63 | 
 64 | 
 65 | sector55_data=read_excel("sector55_clean.xlsx",1)
 66 | dim(sector55_data)
 67 | 
 68 | sector60_data=read_excel("sector60_clean.xlsx",1)
 69 | dim(sector60_data)
 70 | 
 71 | 
 72 | ###############################################################
 73 | #####run model and save as RData
 74 | ###############################################################
 75 | 
 76 | ######################################
 77 | ############sector 10 Energy (5238, 32)
 78 | ######################################
 79 | ##1.2 hours to run
 80 | start.time=Sys.time()
 81 | sector10_result=fundamental_ML_model(sector10_data,trade_date)
 82 | end.time=Sys.time()
 83 | end.time-start.time
 84 | save(sector10_result,file = "sector10_result.RData")
 85 | 
 86 | ######################################
 87 | ############sector 15 Materials (5216, 32)
 88 | ######################################
 89 | ##1.2 hours to run
 90 | start.time=Sys.time()
 91 | sector15_result=fundamental_ML_model(sector15_data,trade_date)
 92 | end.time=Sys.time()
 93 | end.time-start.time
 94 | save(sector15_result,file = "sector15_result.RData")
 95 | 
 96 | ######################################
 97 | ############sector 20 Industrials (9881, 26)
 98 | ######################################
 99 | #2 hours to run
100 | start.time=Sys.time()
101 | sector20_result=fundamental_ML_model(sector20_data,trade_date)
102 | end.time=Sys.time()
103 | end.time-start.time
104 | save(sector20_result,file = "sector20_result.RData")
105 | 
106 | ######################################
107 | ############sector 25 Consumer Discretionary (12595, 26)
108 | ######################################
109 | #2.5 hours to run
110 | start.time=Sys.time()
111 | sector25_result=fundamental_ML_model(sector25_data,trade_date)
112 | end.time=Sys.time()
113 | end.time-start.time
114 | save(sector25_result,file = "sector25_result.RData")
115 | 
116 | ######################################
117 | ############sector 30 Consumer Staples (5388, 29)
118 | ######################################
119 | #1.2 hours to run
120 | start.time=Sys.time()
121 | sector30_result=fundamental_ML_model(sector30_data,trade_date)
122 | end.time=Sys.time()
123 | end.time-start.time
124 | save(sector30_result,file = "sector30_result.RData")
125 | 
126 | ######################################
127 | ############sector 35 Health Care (7615, 29)
128 | ######################################
129 | #2 hours to run
130 | start.time=Sys.time()
131 | sector35_result=fundamental_ML_model(sector35_data,trade_date)
132 | end.time=Sys.time()
133 | end.time-start.time
134 | save(sector35_result,file = "sector35_result.RData")
135 | 
136 | ######################################
137 | ############sector 40 Financials (9480, 21)
138 | ######################################
139 | ##1.5 hours to run
140 | start.time=Sys.time()
141 | sector40_result=fundamental_ML_model(sector40_data,trade_date)
142 | end.time=Sys.time()
143 | end.time-start.time
144 | save(sector40_result,file = "sector40_result.RData")
145 | 
146 | ######################################
147 | ############sector 45 Information Technology (10243, 29)
148 | ######################################
149 | ##2.5 hours to run
150 | start.time=Sys.time()
151 | sector45_result=fundamental_ML_model(sector45_data,trade_date)
152 | end.time=Sys.time()
153 | end.time-start.time
154 | save(sector45_result,file = "sector45_result.RData")
155 | 
156 | ######################################
157 | ############sector 50 Telecommunication Services (1127, 32)
158 | ######################################
159 | #20 mins to run
160 | start.time=Sys.time()
161 | sector50_result=fundamental_ML_model(sector50_data,trade_date)
162 | end.time=Sys.time()
163 | end.time-start.time
164 | save(sector50_result,file = "sector50_result.RData")
165 | 
166 | ######################################
167 | ############sector 55 Utilities (3903, 32)
168 | ######################################
169 | ##1.2 hours to run
170 | start.time=Sys.time()
171 | sector55_result=fundamental_ML_model(sector55_data,trade_date)
172 | end.time=Sys.time()
173 | end.time-start.time
174 | save(sector55_result,file = "sector55_result.RData")
175 | 
176 | ######################################
177 | ############sector 60 Real Estate (3039, 32)
178 | ######################################
179 | #31 mins to run
180 | start.time=Sys.time()
181 | sector60_result=fundamental_ML_model(sector60_data,trade_date)
182 | end.time=Sys.time()
183 | end.time-start.time
184 | save(sector60_result,file = "sector60_result.RData")
185 | 
186 | #############################################
187 | #############################################
188 | #############################################
189 | #############################################
190 | 
191 | ###############################################################
192 | ################Stock Selection
193 | ###############################################################
194 | 
195 | ######### stock selection: sector 10
196 | # load("sector10_result.RData")
197 | selector10_modelStock <- select_modelStock(sector10_result)
198 | selector10_topStock <- select_topStock(selector10_modelStock$selected_stocks)
199 | ######### stock selection: sector 15
200 | # load("sector15_result.RData")
201 | selector15_modelStock <- select_modelStock(sector15_result)
202 | selector15_topStock <- select_topStock(selector15_modelStock$selected_stocks)
203 | ######### stock selection: sector 20
204 | # load("sector20_result.RData")
205 | selector20_modelStock <- select_modelStock(sector20_result)
206 | selector20_topStock <- select_topStock(selector20_modelStock$selected_stocks)
207 | ######### stock selection: sector 25
208 | # load("sector25_result.RData")
209 | selector25_modelStock <- select_modelStock(sector25_result)
210 | selector25_topStock <- select_topStock(selector25_modelStock$selected_stocks)
211 | ######### stock selection: sector 30
212 | # load("sector30_result.RData")
213 | selector30_modelStock <- select_modelStock(sector30_result)
214 | selector30_topStock <- select_topStock(selector30_modelStock$selected_stocks)
215 | ######### stock selection: sector 35
216 | # load("sector35_result.RData")
217 | selector35_modelStock <- select_modelStock(sector35_result)
218 | selector35_topStock <- select_topStock(selector35_modelStock$selected_stocks)
219 | ######### stock selection: sector 40
220 | # load("sector40_result.RData")
221 | selector40_modelStock <- select_modelStock(sector40_result)
222 | selector40_topStock <- select_topStock(selector40_modelStock$selected_stocks)
223 | ######### stock selection: sector 45
224 | # load("sector45_result.RData")
225 | selector45_modelStock <- select_modelStock(sector45_result)
226 | selector45_topStock <- select_topStock(selector45_modelStock$selected_stocks)
227 | ######### stock selection: sector 50
228 | # load("sector50_result.RData")
229 | selector50_modelStock <- select_modelStock(sector50_result)
230 | selector50_topStock <- select_topStock(selector50_modelStock$selected_stocks)
231 | # selector50_topStock[[82]] <- selector50_topStock[[81]]
232 | ######### stock selection: sector 55
233 | # load("sector55_result.RData")
234 | selector55_modelStock <- select_modelStock(sector55_result)
235 | selector55_topStock <- select_topStock(selector55_modelStock$selected_stocks)
236 | ######### stock selection: sector 60
237 | # load("sector60_result.RData")
238 | selector60_modelStock <- select_modelStock(sector60_result)
239 | selector60_topStock <- select_topStock(selector60_modelStock$selected_stocks)
240 | 
241 | 
242 | 
243 | ###############combine stocks together
244 | # Stack every sector's selected stocks for every trading period into one table
245 | # (columns: tic, predicted_return, trade_date) and write it to CSV.
246 | # The eleven copy-pasted per-sector stanzas are replaced by one loop over this list.
247 | sector_topStocks=list(selector10_topStock,selector15_topStock,selector20_topStock,
248 |                       selector25_topStock,selector30_topStock,selector35_topStock,
249 |                       selector40_topStock,selector45_topStock,selector50_topStock,
250 |                       selector55_topStock,selector60_topStock)
251 | 
252 | # number of trading periods, taken from the data instead of the hard-coded 89
253 | n_period=length(selector10_topStock)
254 | # period i is labelled with trade_date[i+date_offset]
255 | # NOTE(review): the offset of 20 presumably skips the periods used for initial training - confirm
256 | date_offset=20
257 | 
258 | stocks_by_period=lapply(seq_len(n_period),function(i){
259 |   period_rows=lapply(sector_topStocks,function(sector_topStock){
260 |     picks=sector_topStock[[i]]
261 |     # cbind() ignores zero-length arguments, so an empty selection would yield a
262 |     # one-column matrix and make the colnames assignment below fail; skip it instead
263 |     if (length(picks)==0) return(NULL)
264 |     rows=cbind(names(picks),unname(picks),trade_date[i+date_offset])
265 |     colnames(rows)=c('tic','predicted_return','trade_date')
266 |     rows
267 |   })
268 |   # rbind() drops the NULL entries left by skipped sectors; sector order is preserved
269 |   do.call(rbind,period_rows)
270 | })
271 | 
272 | # bind once at the end instead of growing the matrix on every iteration
273 | stocks_selected_total=do.call(rbind,stocks_by_period)
274 | 
275 | stocks_selected_total=as.data.frame(stocks_selected_total)
276 | 
277 | 
278 | write.csv(stocks_selected_total,"stocks_selected_total.csv")
324 | 


--------------------------------------------------------------------------------
/code/old_Rcode/fundamental_select_stock.R:
--------------------------------------------------------------------------------
 1 | select_modelStock = function(sector_result, n_period=89){
 2 |   # For each of the first n_period trading periods, find the column of sector_result$model_error
 3 |   # with the smallest value and keep the matching column of sector_result$predicted_return[[i]].
 4 |   # n_period defaults to the formerly hard-coded 89. Returns list(selected_stocks=, selected_model=).
 5 |   selected_model=NULL
 6 |   selected_stocks=list()
 7 |   for (i in seq_len(n_period)){
 8 |     error_row=sector_result$model_error[i,]
 9 |     # position of the smallest error in this period's row; computed once and reused below
10 |     get_minIndex=apply(error_row,1,which.min)
11 |     selected_model[i]=colnames(error_row)[get_minIndex]
12 |     selected_stocks[[i]]=sector_result$predicted_return[[i]][,get_minIndex]
13 |   }
14 |   return(list(selected_stocks=selected_stocks,selected_model=selected_model))
15 | }
16 | 
17 | 
18 | 
19 | select_topStock=function(selected_stocks, n_period=89, top_quantile=0.8){
20 |   # Per period, keep the predictions at or above that period's top_quantile quantile (default 0.8, i.e. top 20%).
21 |   # n_period and top_quantile default to the formerly hard-coded 89 and 0.8; returns a list with one vector per period.
22 |   lapply(seq_len(n_period), function(i){
23 |     period_pred=selected_stocks[[i]]  # extracted once instead of three times
24 |     period_pred[period_pred>=quantile(period_pred,top_quantile)]
25 |   })
26 | }


--------------------------------------------------------------------------------
/figs/chart10_insample.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart10_insample.PNG


--------------------------------------------------------------------------------
/figs/chart11_overallPerformance.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart11_overallPerformance.PNG


--------------------------------------------------------------------------------
/figs/chart1_datasetPeriod.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart1_datasetPeriod.PNG


--------------------------------------------------------------------------------
/figs/chart2_rolling_windows.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart2_rolling_windows.PNG


--------------------------------------------------------------------------------
/figs/chart3_modelError.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart3_modelError.PNG


--------------------------------------------------------------------------------
/figs/chart4_predictedReturn1.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart4_predictedReturn1.PNG


--------------------------------------------------------------------------------
/figs/chart4_predictedReturn2.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart4_predictedReturn2.PNG


--------------------------------------------------------------------------------
/figs/chart5_coefficient.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart5_coefficient.PNG


--------------------------------------------------------------------------------
/figs/chart6_selectedStocks.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart6_selectedStocks.PNG


--------------------------------------------------------------------------------
/figs/chart7_efficient1.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart7_efficient1.PNG


--------------------------------------------------------------------------------
/figs/chart8_PnL.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart8_PnL.png


--------------------------------------------------------------------------------
/figs/chart9_TotalValue.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/chart9_TotalValue.png


--------------------------------------------------------------------------------
/figs/dataperiod.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/dataperiod.png


--------------------------------------------------------------------------------
/figs/efficient1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/efficient1.jpg


--------------------------------------------------------------------------------
/figs/pnl1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/pnl1.jpg


--------------------------------------------------------------------------------
/figs/rolling_windows.vsdx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/rolling_windows.vsdx


--------------------------------------------------------------------------------
/figs/transaction cost.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AI4Finance-Foundation/Dynamic-Stock-Recommendation-Machine_Learning-Published-Paper-IEEE/0f9d87da97edc1ee8cab3a882ff18e47bf7aaccd/figs/transaction cost.PNG


--------------------------------------------------------------------------------
/fundamental_portfolio.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {},
   6 |    "source": [
   7 |     "# Import packages"
   8 |    ]
   9 |   },
  10 |   {
  11 |    "cell_type": "code",
  12 |    "execution_count": 27,
  13 |    "metadata": {},
  14 |    "outputs": [],
  15 |    "source": [
  16 |     "import pandas as pd\n",
  17 |     "import numpy as np\n",
  18 |     "from pypfopt.efficient_frontier import EfficientFrontier\n",
  19 |     "from pypfopt import risk_models\n",
  20 |     "from pypfopt.risk_models import CovarianceShrinkage\n",
  21 |     "from pypfopt import expected_returns\n",
  22 |     "from datetime import datetime\n",
  23 |     "from pandas.tseries.offsets import BDay"
  24 |    ]
  25 |   },
  26 |   {
  27 |    "cell_type": "code",
  28 |    "execution_count": 28,
  29 |    "metadata": {},
  30 |    "outputs": [],
  31 |    "source": [
  32 |     "import time\n",
  33 |     "import pickle"
  34 |    ]
  35 |   },
  36 |   {
  37 |    "cell_type": "markdown",
  38 |    "metadata": {},
  39 |    "source": [
  40 |     "# 1. Read Input Data"
  41 |    ]
  42 |   },
  43 |   {
  44 |    "cell_type": "code",
  45 |    "execution_count": 29,
  46 |    "metadata": {},
  47 |    "outputs": [
  48 |     {
  49 |      "name": "stderr",
  50 |      "output_type": "stream",
  51 |      "text": [
  52 |       "/home/ubuntu/anaconda3/lib/python3.6/site-packages/numpy/lib/arraysetops.py:568: FutureWarning: elementwise comparison failed; returning scalar instead, but in the future will perform elementwise comparison\n",
  53 |       "  mask |= (ar1 == a)\n"
  54 |      ]
  55 |     }
  56 |    ],
  57 |    "source": [
  58 |     "df_price = pd.read_csv(\"Data/1-sp500_adj_price.csv\",index_col=0)"
  59 |    ]
  60 |   },
  61 |   {
  62 |    "cell_type": "code",
  63 |    "execution_count": 30,
  64 |    "metadata": {},
  65 |    "outputs": [
  66 |     {
  67 |      "data": {
  68 |       "text/plain": [
  69 |        "(6438964, 3)"
  70 |       ]
  71 |      },
  72 |      "execution_count": 30,
  73 |      "metadata": {},
  74 |      "output_type": "execute_result"
  75 |     }
  76 |    ],
  77 |    "source": [
  78 |     "df_price.shape"
  79 |    ]
  80 |   },
  81 |   {
  82 |    "cell_type": "code",
  83 |    "execution_count": 31,
  84 |    "metadata": {},
  85 |    "outputs": [
  86 |     {
  87 |      "data": {
  88 |       "text/html": [
  89 |        "<div>\n",
  90 |        "<style scoped>\n",
  91 |        "    .dataframe tbody tr th:only-of-type {\n",
  92 |        "        vertical-align: middle;\n",
  93 |        "    }\n",
  94 |        "\n",
  95 |        "    .dataframe tbody tr th {\n",
  96 |        "        vertical-align: top;\n",
  97 |        "    }\n",
  98 |        "\n",
  99 |        "    .dataframe thead th {\n",
 100 |        "        text-align: right;\n",
 101 |        "    }\n",
 102 |        "</style>\n",
 103 |        "<table border=\"1\" class=\"dataframe\">\n",
 104 |        "  <thead>\n",
 105 |        "    <tr style=\"text-align: right;\">\n",
 106 |        "      <th></th>\n",
 107 |        "      <th>datadate</th>\n",
 108 |        "      <th>tic</th>\n",
 109 |        "      <th>adj_price</th>\n",
 110 |        "    </tr>\n",
 111 |        "  </thead>\n",
 112 |        "  <tbody>\n",
 113 |        "    <tr>\n",
 114 |        "      <th>1</th>\n",
 115 |        "      <td>19900102</td>\n",
 116 |        "      <td>ADCT</td>\n",
 117 |        "      <td>4.074244</td>\n",
 118 |        "    </tr>\n",
 119 |        "    <tr>\n",
 120 |        "      <th>2</th>\n",
 121 |        "      <td>19900103</td>\n",
 122 |        "      <td>ADCT</td>\n",
 123 |        "      <td>4.046900</td>\n",
 124 |        "    </tr>\n",
 125 |        "    <tr>\n",
 126 |        "      <th>3</th>\n",
 127 |        "      <td>19900104</td>\n",
 128 |        "      <td>ADCT</td>\n",
 129 |        "      <td>3.964869</td>\n",
 130 |        "    </tr>\n",
 131 |        "    <tr>\n",
 132 |        "      <th>4</th>\n",
 133 |        "      <td>19900105</td>\n",
 134 |        "      <td>ADCT</td>\n",
 135 |        "      <td>3.992212</td>\n",
 136 |        "    </tr>\n",
 137 |        "    <tr>\n",
 138 |        "      <th>5</th>\n",
 139 |        "      <td>19900108</td>\n",
 140 |        "      <td>ADCT</td>\n",
 141 |        "      <td>3.937525</td>\n",
 142 |        "    </tr>\n",
 143 |        "  </tbody>\n",
 144 |        "</table>\n",
 145 |        "</div>"
 146 |       ],
 147 |       "text/plain": [
 148 |        "   datadate   tic  adj_price\n",
 149 |        "1  19900102  ADCT   4.074244\n",
 150 |        "2  19900103  ADCT   4.046900\n",
 151 |        "3  19900104  ADCT   3.964869\n",
 152 |        "4  19900105  ADCT   3.992212\n",
 153 |        "5  19900108  ADCT   3.937525"
 154 |       ]
 155 |      },
 156 |      "execution_count": 31,
 157 |      "metadata": {},
 158 |      "output_type": "execute_result"
 159 |     }
 160 |    ],
 161 |    "source": [
 162 |     "df_price.head()"
 163 |    ]
 164 |   },
 165 |   {
 166 |    "cell_type": "code",
 167 |    "execution_count": 32,
 168 |    "metadata": {},
 169 |    "outputs": [],
 170 |    "source": [
 171 |     "selected_stock = pd.read_csv(\"Data/2-portfolio_data/stocks_selected_total_user8.csv\")"
 172 |    ]
 173 |   },
 174 |   {
 175 |    "cell_type": "code",
 176 |    "execution_count": 33,
 177 |    "metadata": {},
 178 |    "outputs": [
 179 |     {
 180 |      "data": {
 181 |       "text/plain": [
 182 |        "(12932, 3)"
 183 |       ]
 184 |      },
 185 |      "execution_count": 33,
 186 |      "metadata": {},
 187 |      "output_type": "execute_result"
 188 |     }
 189 |    ],
 190 |    "source": [
 191 |     "selected_stock.shape"
 192 |    ]
 193 |   },
 194 |   {
 195 |    "cell_type": "code",
 196 |    "execution_count": 34,
 197 |    "metadata": {},
 198 |    "outputs": [
 199 |     {
 200 |      "data": {
 201 |       "text/html": [
 202 |        "<div>\n",
 203 |        "<style scoped>\n",
 204 |        "    .dataframe tbody tr th:only-of-type {\n",
 205 |        "        vertical-align: middle;\n",
 206 |        "    }\n",
 207 |        "\n",
 208 |        "    .dataframe tbody tr th {\n",
 209 |        "        vertical-align: top;\n",
 210 |        "    }\n",
 211 |        "\n",
 212 |        "    .dataframe thead th {\n",
 213 |        "        text-align: right;\n",
 214 |        "    }\n",
 215 |        "</style>\n",
 216 |        "<table border=\"1\" class=\"dataframe\">\n",
 217 |        "  <thead>\n",
 218 |        "    <tr style=\"text-align: right;\">\n",
 219 |        "      <th></th>\n",
 220 |        "      <th>tic</th>\n",
 221 |        "      <th>predicted_return</th>\n",
 222 |        "      <th>trade_date</th>\n",
 223 |        "    </tr>\n",
 224 |        "  </thead>\n",
 225 |        "  <tbody>\n",
 226 |        "    <tr>\n",
 227 |        "      <th>0</th>\n",
 228 |        "      <td>EOG</td>\n",
 229 |        "      <td>0.033723</td>\n",
 230 |        "      <td>19950601</td>\n",
 231 |        "    </tr>\n",
 232 |        "    <tr>\n",
 233 |        "      <th>1</th>\n",
 234 |        "      <td>EQT</td>\n",
 235 |        "      <td>0.037745</td>\n",
 236 |        "      <td>19950601</td>\n",
 237 |        "    </tr>\n",
 238 |        "    <tr>\n",
 239 |        "      <th>2</th>\n",
 240 |        "      <td>HES</td>\n",
 241 |        "      <td>0.051450</td>\n",
 242 |        "      <td>19950601</td>\n",
 243 |        "    </tr>\n",
 244 |        "    <tr>\n",
 245 |        "      <th>3</th>\n",
 246 |        "      <td>NFX</td>\n",
 247 |        "      <td>0.030283</td>\n",
 248 |        "      <td>19950601</td>\n",
 249 |        "    </tr>\n",
 250 |        "    <tr>\n",
 251 |        "      <th>4</th>\n",
 252 |        "      <td>OKE</td>\n",
 253 |        "      <td>0.041020</td>\n",
 254 |        "      <td>19950601</td>\n",
 255 |        "    </tr>\n",
 256 |        "  </tbody>\n",
 257 |        "</table>\n",
 258 |        "</div>"
 259 |       ],
 260 |       "text/plain": [
 261 |        "   tic  predicted_return  trade_date\n",
 262 |        "0  EOG          0.033723    19950601\n",
 263 |        "1  EQT          0.037745    19950601\n",
 264 |        "2  HES          0.051450    19950601\n",
 265 |        "3  NFX          0.030283    19950601\n",
 266 |        "4  OKE          0.041020    19950601"
 267 |       ]
 268 |      },
 269 |      "execution_count": 34,
 270 |      "metadata": {},
 271 |      "output_type": "execute_result"
 272 |     }
 273 |    ],
 274 |    "source": [
 275 |     "selected_stock.head()"
 276 |    ]
 277 |   },
 278 |   {
 279 |    "cell_type": "markdown",
 280 |    "metadata": {},
 281 |    "source": [
 282 |     "# 2. Get trade date"
 283 |    ]
 284 |   },
 285 |   {
 286 |    "cell_type": "code",
 287 |    "execution_count": 35,
 288 |    "metadata": {},
 289 |    "outputs": [
 290 |     {
 291 |      "name": "stdout",
 292 |      "output_type": "stream",
 293 |      "text": [
 294 |       "Number of unique stocks selected:  982\n"
 295 |      ]
 296 |     }
 297 |    ],
 298 |    "source": [
 299 |     "print(\"Number of unique stocks selected: \", len(selected_stock.tic.unique()))"
 300 |    ]
 301 |   },
 302 |   {
 303 |    "cell_type": "code",
 304 |    "execution_count": null,
 305 |    "metadata": {},
 306 |    "outputs": [],
 307 |    "source": []
 308 |   },
 309 |   {
 310 |    "cell_type": "code",
 311 |    "execution_count": 36,
 312 |    "metadata": {},
 313 |    "outputs": [],
 314 |    "source": [
 315 |     "all_date=df_price.datadate.unique()"
 316 |    ]
 317 |   },
 318 |   {
 319 |    "cell_type": "code",
 320 |    "execution_count": 37,
 321 |    "metadata": {},
 322 |    "outputs": [
 323 |     {
 324 |      "data": {
 325 |       "text/plain": [
 326 |        "7155"
 327 |       ]
 328 |      },
 329 |      "execution_count": 37,
 330 |      "metadata": {},
 331 |      "output_type": "execute_result"
 332 |     }
 333 |    ],
 334 |    "source": [
 335 |     "len(all_date)"
 336 |    ]
 337 |   },
 338 |   {
 339 |    "cell_type": "code",
 340 |    "execution_count": 38,
 341 |    "metadata": {},
 342 |    "outputs": [],
 343 |    "source": [
 344 |     "trade_date=selected_stock.trade_date.unique()"
 345 |    ]
 346 |   },
 347 |   {
 348 |    "cell_type": "code",
 349 |    "execution_count": 39,
 350 |    "metadata": {},
 351 |    "outputs": [
 352 |     {
 353 |      "data": {
 354 |       "text/plain": [
 355 |        "array([19950601, 19950901, 19951201, 19960301, 19960603, 19960903,\n",
 356 |        "       19961202, 19970303, 19970602, 19970902, 19971201, 19980302,\n",
 357 |        "       19980601, 19980901, 19981201, 19990301, 19990601, 19990901,\n",
 358 |        "       19991201, 20000301, 20000601, 20000901, 20001201, 20010301,\n",
 359 |        "       20010601, 20010904, 20011203, 20020301, 20020603, 20020903,\n",
 360 |        "       20021202, 20030303, 20030602, 20030902, 20031201, 20040301,\n",
 361 |        "       20040601, 20040901, 20041201, 20050301, 20050601, 20050901,\n",
 362 |        "       20051201, 20060301, 20060601, 20060901, 20061201, 20070301,\n",
 363 |        "       20070601, 20070904, 20071203, 20080303, 20080602, 20080902,\n",
 364 |        "       20081201, 20090302, 20090601, 20090901, 20091201, 20100301,\n",
 365 |        "       20100601, 20100901, 20101201, 20110301, 20110601, 20110901,\n",
 366 |        "       20111201, 20120301, 20120601, 20120904, 20121203, 20130301,\n",
 367 |        "       20130603, 20130903, 20131202, 20140303, 20140602, 20140902,\n",
 368 |        "       20141201, 20150302, 20150601, 20150901, 20151201, 20160301,\n",
 369 |        "       20160601, 20160901, 20161201, 20170301, 20170601])"
 370 |       ]
 371 |      },
 372 |      "execution_count": 39,
 373 |      "metadata": {},
 374 |      "output_type": "execute_result"
 375 |     }
 376 |    ],
 377 |    "source": [
 378 |     "trade_date"
 379 |    ]
 380 |   },
 381 |   {
 382 |    "cell_type": "code",
 383 |    "execution_count": 40,
 384 |    "metadata": {},
 385 |    "outputs": [
 386 |     {
 387 |      "name": "stdout",
 388 |      "output_type": "stream",
 389 |      "text": [
 390 |       "Number of trade dates 89\n"
 391 |      ]
 392 |     }
 393 |    ],
 394 |    "source": [
 395 |     "print(\"Number of trade dates\", len(trade_date))"
 396 |    ]
 397 |   },
 398 |   {
 399 |    "cell_type": "markdown",
 400 |    "metadata": {},
 401 |    "source": [
  402 |     "# 3. Get the daily 1-year return table for each of the 89 trade periods"
 403 |    ]
 404 |   },
 405 |   {
 406 |    "cell_type": "code",
 407 |    "execution_count": 41,
 408 |    "metadata": {},
 409 |    "outputs": [
 410 |     {
 411 |      "data": {
 412 |       "text/html": [
 413 |        "<div>\n",
 414 |        "<style scoped>\n",
 415 |        "    .dataframe tbody tr th:only-of-type {\n",
 416 |        "        vertical-align: middle;\n",
 417 |        "    }\n",
 418 |        "\n",
 419 |        "    .dataframe tbody tr th {\n",
 420 |        "        vertical-align: top;\n",
 421 |        "    }\n",
 422 |        "\n",
 423 |        "    .dataframe thead th {\n",
 424 |        "        text-align: right;\n",
 425 |        "    }\n",
 426 |        "</style>\n",
 427 |        "<table border=\"1\" class=\"dataframe\">\n",
 428 |        "  <thead>\n",
 429 |        "    <tr style=\"text-align: right;\">\n",
 430 |        "      <th></th>\n",
 431 |        "      <th>tic</th>\n",
 432 |        "      <th>predicted_return</th>\n",
 433 |        "      <th>trade_date</th>\n",
 434 |        "    </tr>\n",
 435 |        "  </thead>\n",
 436 |        "  <tbody>\n",
 437 |        "    <tr>\n",
 438 |        "      <th>0</th>\n",
 439 |        "      <td>EOG</td>\n",
 440 |        "      <td>0.033723</td>\n",
 441 |        "      <td>19950601</td>\n",
 442 |        "    </tr>\n",
 443 |        "    <tr>\n",
 444 |        "      <th>1</th>\n",
 445 |        "      <td>EQT</td>\n",
 446 |        "      <td>0.037745</td>\n",
 447 |        "      <td>19950601</td>\n",
 448 |        "    </tr>\n",
 449 |        "    <tr>\n",
 450 |        "      <th>2</th>\n",
 451 |        "      <td>HES</td>\n",
 452 |        "      <td>0.051450</td>\n",
 453 |        "      <td>19950601</td>\n",
 454 |        "    </tr>\n",
 455 |        "    <tr>\n",
 456 |        "      <th>3</th>\n",
 457 |        "      <td>NFX</td>\n",
 458 |        "      <td>0.030283</td>\n",
 459 |        "      <td>19950601</td>\n",
 460 |        "    </tr>\n",
 461 |        "    <tr>\n",
 462 |        "      <th>4</th>\n",
 463 |        "      <td>OKE</td>\n",
 464 |        "      <td>0.041020</td>\n",
 465 |        "      <td>19950601</td>\n",
 466 |        "    </tr>\n",
 467 |        "  </tbody>\n",
 468 |        "</table>\n",
 469 |        "</div>"
 470 |       ],
 471 |       "text/plain": [
 472 |        "   tic  predicted_return  trade_date\n",
 473 |        "0  EOG          0.033723    19950601\n",
 474 |        "1  EQT          0.037745    19950601\n",
 475 |        "2  HES          0.051450    19950601\n",
 476 |        "3  NFX          0.030283    19950601\n",
 477 |        "4  OKE          0.041020    19950601"
 478 |       ]
 479 |      },
 480 |      "execution_count": 41,
 481 |      "metadata": {},
 482 |      "output_type": "execute_result"
 483 |     }
 484 |    ],
 485 |    "source": [
 486 |     "selected_stock.head()"
 487 |    ]
 488 |   },
 489 |   {
 490 |    "cell_type": "code",
 491 |    "execution_count": 414,
 492 |    "metadata": {},
 493 |    "outputs": [
 494 |     {
 495 |      "name": "stdout",
 496 |      "output_type": "stream",
 497 |      "text": [
 498 |       "19950601\n",
 499 |       "19950901\n",
 500 |       "19951201\n",
 501 |       "19960301\n",
 502 |       "19960603\n",
 503 |       "19960903\n",
 504 |       "19961202\n",
 505 |       "19970303\n",
 506 |       "19970602\n",
 507 |       "19970902\n",
 508 |       "19971201\n",
 509 |       "19980302\n",
 510 |       "19980601\n",
 511 |       "19980901\n",
 512 |       "19981201\n",
 513 |       "19990301\n",
 514 |       "19990601\n",
 515 |       "19990901\n",
 516 |       "19991201\n",
 517 |       "20000301\n",
 518 |       "20000601\n",
 519 |       "20000901\n",
 520 |       "20001201\n",
 521 |       "20010301\n",
 522 |       "20010601\n",
 523 |       "20010904\n",
 524 |       "20011203\n",
 525 |       "20020301\n",
 526 |       "20020603\n",
 527 |       "20020903\n",
 528 |       "20021202\n",
 529 |       "20030303\n",
 530 |       "20030602\n",
 531 |       "20030902\n",
 532 |       "20031201\n",
 533 |       "20040301\n",
 534 |       "20040601\n",
 535 |       "20040901\n",
 536 |       "20041201\n",
 537 |       "20050301\n",
 538 |       "20050601\n",
 539 |       "20050901\n",
 540 |       "20051201\n",
 541 |       "20060301\n",
 542 |       "20060601\n",
 543 |       "20060901\n",
 544 |       "20061201\n",
 545 |       "20070301\n",
 546 |       "20070601\n",
 547 |       "20070904\n",
 548 |       "20071203\n",
 549 |       "20080303\n",
 550 |       "20080602\n",
 551 |       "20080902\n",
 552 |       "20081201\n",
 553 |       "20090302\n",
 554 |       "20090601\n",
 555 |       "20090901\n",
 556 |       "20091201\n",
 557 |       "20100301\n",
 558 |       "20100601\n",
 559 |       "20100901\n",
 560 |       "20101201\n",
 561 |       "20110301\n",
 562 |       "20110601\n",
 563 |       "20110901\n",
 564 |       "20111201\n",
 565 |       "20120301\n",
 566 |       "20120601\n",
 567 |       "20120904\n",
 568 |       "20121203\n",
 569 |       "20130301\n",
 570 |       "20130603\n",
 571 |       "20130903\n",
 572 |       "20131202\n",
 573 |       "20140303\n",
 574 |       "20140602\n",
 575 |       "20140902\n",
 576 |       "20141201\n",
 577 |       "20150302\n",
 578 |       "20150601\n",
 579 |       "20150901\n",
 580 |       "20151201\n",
 581 |       "20160301\n",
 582 |       "20160601\n",
 583 |       "20160901\n",
 584 |       "20161201\n",
 585 |       "20170301\n",
 586 |       "20170601\n",
 587 |       "Time consuming:  92.59127250512441  minutes\n"
 588 |      ]
 589 |     }
 590 |    ],
 591 |    "source": [
 592 |     "# took about 90 minutes to run\n",
 593 |     "start = time.time()\n",
 594 |     "all_return_table={}\n",
 595 |     "#all_predicted_return={}\n",
 596 |     "all_stocks_info = {}\n",
 597 |     "#for i in range(0,1):\n",
 598 |     "for i in range(len(trade_date)):\n",
 599 |     "    #match trading date\n",
 600 |     "    index = selected_stock.trade_date==trade_date[i]\n",
 601 |     "    print(trade_date[i])\n",
 602 |     "    #get the corresponding trade period's selected stocks' name\n",
 603 |     "    stocks_name=selected_stock.tic[selected_stock.trade_date==trade_date[i]].values\n",
 604 |     "    temp_info = selected_stock[selected_stock.trade_date==trade_date[i]]\n",
 605 |     "    temp_info = temp_info.reset_index()\n",
 606 |     "    del temp_info['index']\n",
 607 |     "    all_stocks_info[trade_date[i]] = temp_info\n",
 608 |     "    #get the corresponding trade period's selected stocks' predicted return\n",
 609 |     "    asset_expected_return=selected_stock[index].predicted_return.values\n",
 610 |     "    \n",
 611 |     "    #get current trade date and calculate trade date last year, it has to be a business date\n",
 612 |     "    last_year_tradedate=int((trade_date[i]-round(trade_date[i]/10000)*10000)+round(trade_date[i]/10000-1)*10000)\n",
 613 |     "    convert_to_yyyymmdd=datetime.strptime(str(last_year_tradedate), '%Y%m%d').strftime('%Y-%m-%d')\n",
 614 |     "    #determine the business date\n",
 615 |     "    #print(convert_to_yyyymmdd)\n",
 616 |     "    ts = pd.Timestamp(convert_to_yyyymmdd) \n",
 617 |     "    bd = pd.tseries.offsets.BusinessDay(n =1) \n",
 618 |     "    new_timestamp = ts - bd \n",
 619 |     "    lastY_tradedate = int(new_timestamp.date().strftime('%Y%m%d'))\n",
 620 |     "    get_date_index=(all_date<trade_date[i]) & (all_date>lastY_tradedate)\n",
 621 |     "    get_date=all_date[get_date_index]\n",
 622 |     "    #get adjusted price table\n",
 623 |     "    return_table=pd.DataFrame()\n",
 624 |     "    for m in range(len(stocks_name)):\n",
 625 |     "        #get stocks's name\n",
 626 |     "        index_tic=(df_price.tic==stocks_name[m])\n",
 627 |     "        #get all of this stock's historical prices from sp500_price\n",
 628 |     "        sp500_temp=df_price[index_tic]\n",
 629 |     "        merge_left_data_table = pd.DataFrame(get_date)\n",
 630 |     "        merge_left_data_table.columns = ['datadate']\n",
 631 |     "        temp_price=merge_left_data_table.merge(sp500_temp, on=['datadate'], how='left')\n",
 632 |     "        temp_price = temp_price.dropna()\n",
 633 |     "        temp_price['daily_return']=temp_price.adj_price.pct_change()\n",
 634 |     "\n",
 635 |     "        return_table=return_table.append(temp_price,ignore_index=True)\n",
 636 |     "    all_return_table[trade_date[i]] = return_table\n",
 637 |     "end = time.time()\n",
 638 |     "print(\"Time consuming: \", (end-start)/60, \" minutes\")\n",
 639 |     "    \n",
 640 |     "    "
 641 |    ]
 642 |   },
 643 |   {
 644 |    "cell_type": "markdown",
 645 |    "metadata": {},
 646 |    "source": [
 647 |     "## Save to pickle"
 648 |    ]
 649 |   },
 650 |   {
 651 |    "cell_type": "code",
 652 |    "execution_count": 419,
 653 |    "metadata": {},
 654 |    "outputs": [],
 655 |    "source": [
 656 |     "#with open('Data/all_return_table.pickle', 'wb') as handle: \n",
 657 |     "#    pickle.dump(all_return_table, handle, protocol=pickle.HIGHEST_PROTOCOL)"
 658 |    ]
 659 |   },
 660 |   {
 661 |    "cell_type": "code",
 662 |    "execution_count": 420,
 663 |    "metadata": {},
 664 |    "outputs": [],
 665 |    "source": [
 666 |     "#with open('Data/all_stocks_info.pickle', 'wb') as handle:\n",
 667 |     "#    pickle.dump(all_stocks_info, handle, protocol=pickle.HIGHEST_PROTOCOL)"
 668 |    ]
 669 |   },
 670 |   {
 671 |    "cell_type": "code",
 672 |    "execution_count": 42,
 673 |    "metadata": {},
 674 |    "outputs": [],
 675 |    "source": [
 676 |     "#with open('Data/all_return_table.pickle', 'rb') as handle:\n",
 677 |     "#    all_return_table = pickle.load(handle)\n",
 678 |     "\n",
 679 |     "#with open('Data/all_stocks_info.pickle', 'rb') as handle:\n",
 680 |     "#    all_stocks_info = pickle.load(handle)\n"
 681 |    ]
 682 |   },
 683 |   {
 684 |    "cell_type": "markdown",
 685 |    "metadata": {},
 686 |    "source": [
 687 |     "# 4. Portfolio Optimization using pypfopt"
 688 |    ]
 689 |   },
 690 |   {
 691 |    "cell_type": "code",
 692 |    "execution_count": 44,
 693 |    "metadata": {},
 694 |    "outputs": [
 695 |     {
 696 |      "name": "stderr",
 697 |      "output_type": "stream",
 698 |      "text": [
 699 |       "/home/ubuntu/anaconda3/lib/python3.6/site-packages/pypfopt/objective_functions.py:61: RuntimeWarning: invalid value encountered in sqrt\n",
 700 |       "  sigma = np.sqrt(np.dot(weights, np.dot(cov_matrix, weights.T)))\n"
 701 |      ]
 702 |     },
 703 |     {
 704 |      "name": "stdout",
 705 |      "output_type": "stream",
 706 |      "text": [
 707 |       "19950601 : Done\n",
 708 |       "19950901 : Done\n",
 709 |       "19951201 : Done\n",
 710 |       "19960301 : Done\n",
 711 |       "19960603 : Done\n",
 712 |       "19960903 : Done\n",
 713 |       "19961202 : Done\n",
 714 |       "19970303 : Done\n",
 715 |       "19970602 : Done\n",
 716 |       "19970902 : Done\n",
 717 |       "19971201 : Done\n",
 718 |       "19980302 : Done\n",
 719 |       "19980601 : Done\n",
 720 |       "19980901 : Done\n",
 721 |       "19981201 : Done\n",
 722 |       "19990301 : Done\n",
 723 |       "19990601 : Done\n",
 724 |       "19990901 : Done\n",
 725 |       "19991201 : Done\n",
 726 |       "20000301 : Done\n",
 727 |       "20000601 : Done\n",
 728 |       "20000901 : Done\n",
 729 |       "20001201 : Done\n",
 730 |       "20010301 : Done\n",
 731 |       "20010601 : Done\n",
 732 |       "20010904 : Done\n",
 733 |       "20011203 : Done\n",
 734 |       "20020301 : Done\n",
 735 |       "20020603 : Done\n",
 736 |       "20020903 : Done\n",
 737 |       "20021202 : Done\n",
 738 |       "20030303 : Done\n",
 739 |       "20030602 : Done\n",
 740 |       "20030902 : Done\n",
 741 |       "20031201 : Done\n",
 742 |       "20040301 : Done\n",
 743 |       "20040601 : Done\n",
 744 |       "20040901 : Done\n",
 745 |       "20041201 : Done\n",
 746 |       "20050301 : Done\n",
 747 |       "20050601 : Done\n",
 748 |       "20050901 : Done\n",
 749 |       "20051201 : Done\n",
 750 |       "20060301 : Done\n"
 751 |      ]
 752 |     },
 753 |     {
 754 |      "name": "stderr",
 755 |      "output_type": "stream",
 756 |      "text": [
 757 |       "/home/ubuntu/anaconda3/lib/python3.6/site-packages/pypfopt/base_optimizer.py:56: RuntimeWarning: invalid value encountered in less\n",
 758 |       "  clean_weights[np.abs(clean_weights) < cutoff] = 0\n"
 759 |      ]
 760 |     },
 761 |     {
 762 |      "name": "stdout",
 763 |      "output_type": "stream",
 764 |      "text": [
 765 |       "20060601 : Done\n",
 766 |       "20060901 : Done\n",
 767 |       "20061201 : Done\n",
 768 |       "20070301 : Done\n",
 769 |       "20070601 : Done\n",
 770 |       "20070904 : Done\n",
 771 |       "20071203 : Done\n",
 772 |       "20080303 : Done\n",
 773 |       "20080602 : Done\n",
 774 |       "20080902 : Done\n",
 775 |       "20081201 : Done\n",
 776 |       "20090302 : Done\n",
 777 |       "20090601 : Done\n",
 778 |       "20090901 : Done\n",
 779 |       "20091201 : Done\n",
 780 |       "20100301 : Done\n",
 781 |       "20100601 : Done\n",
 782 |       "20100901 : Done\n",
 783 |       "20101201 : Done\n",
 784 |       "20110301 : Done\n",
 785 |       "20110601 : Done\n",
 786 |       "20110901 : Done\n",
 787 |       "20111201 : Done\n",
 788 |       "20120301 : Done\n",
 789 |       "20120601 : Done\n",
 790 |       "20120904 : Done\n",
 791 |       "20121203 : Done\n",
 792 |       "20130301 : Done\n",
 793 |       "20130603 : Done\n",
 794 |       "20130903 : Done\n",
 795 |       "20131202 : Done\n",
 796 |       "20140303 : Done\n",
 797 |       "20140602 : Done\n",
 798 |       "20140902 : Done\n",
 799 |       "20141201 : Done\n",
 800 |       "20150302 : Done\n",
 801 |       "20150601 : Done\n",
 802 |       "20150901 : Done\n",
 803 |       "20151201 : Done\n",
 804 |       "20160301 : Done\n",
 805 |       "20160601 : Done\n",
 806 |       "20160901 : Done\n",
 807 |       "20161201 : Done\n",
 808 |       "20170301 : Done\n",
 809 |       "20170601 : Done\n"
 810 |      ]
 811 |     }
 812 |    ],
 813 |    "source": [
 814 |     "# took under 5 minutes to run\n",
 815 |     "\n",
 816 |     "stocks_weight_table = pd.DataFrame([])\n",
 817 |     "\n",
 818 |     "for i in range(len(trade_date)):\n",
 819 |     "    # get selected stocks information\n",
 820 |     "    p1_alldata=(all_stocks_info[trade_date[i]])\n",
 821 |     "    # sort it by tic\n",
 822 |     "    p1_alldata=p1_alldata.sort_values('tic')\n",
 823 |     "    p1_alldata = p1_alldata.reset_index()\n",
 824 |     "    del p1_alldata['index']\n",
 825 |     "    \n",
 826 |     "    \n",
 827 |     "    # get selected stocks tic\n",
 828 |     "    p1_stock = p1_alldata.tic\n",
 829 |     "    \n",
 830 |     "    # get predicted return from selected stocks\n",
 831 |     "    p1_predicted_return=p1_alldata.pivot_table(index = 'trade_date',columns = 'tic', values = 'predicted_return')\n",
 832 |     "    # use the predicted returns as the Expected returns to feed into the portfolio object\n",
 833 |     "    mu = p1_predicted_return.T.values\n",
 834 |     "\n",
 835 |     "    # get the 1-year historical return\n",
 836 |     "    p1_return_table=all_return_table[trade_date[i]]\n",
 837 |     "    p1_return_table_pivot=p1_return_table.pivot_table(index = 'datadate',columns = 'tic', values = 'daily_return')\n",
 838 |     "    # use the 1-year historical return table to calculate covariance matrix between selected stocks\n",
 839 |     "    S = risk_models.sample_cov(p1_return_table_pivot)\n",
 840 |     "    del S.index.name \n",
 841 |     "    \n",
 842 |     "    # mean variance\n",
 843 |     "    ef_mean = EfficientFrontier(mu, S,weight_bounds=(0, 0.05))\n",
 844 |     "    raw_weights_mean = ef_mean.max_sharpe()\n",
 845 |     "    cleaned_weights_mean = ef_mean.clean_weights()\n",
 846 |     "    #print(raw_weights_mean)\n",
 847 |     "    #ef.portfolio_performance(verbose=True)\n",
 848 |     "\n",
 849 |     "    # minimum variance\n",
 850 |     "    ef_min = EfficientFrontier([0]*len(p1_stock), S,weight_bounds=(0, 0.05))\n",
 851 |     "    raw_weights_min = ef_min.max_sharpe()\n",
 852 |     "    cleaned_weights_min = ef_min.clean_weights()\n",
 853 |     "    #print(cleaned_weights_min)\n",
 854 |     "    \n",
 855 |     "    p1_alldata['mean_weight'] = cleaned_weights_mean.values()\n",
 856 |     "    p1_alldata['min_weight'] = cleaned_weights_min.values()\n",
 857 |     "    \n",
 858 |     "    #ef.portfolio_performance(verbose=True)\n",
 859 |     "\n",
 860 |     "    \n",
 861 |     "    stocks_weight_table = stocks_weight_table.append(pd.DataFrame(p1_alldata), ignore_index=True)\n",
 862 |     "    print(trade_date[i], \": Done\")\n"
 863 |    ]
 864 |   },
 865 |   {
 866 |    "cell_type": "code",
 867 |    "execution_count": 45,
 868 |    "metadata": {},
 869 |    "outputs": [
 870 |     {
 871 |      "data": {
 872 |       "text/html": [
 873 |        "<div>\n",
 874 |        "<style scoped>\n",
 875 |        "    .dataframe tbody tr th:only-of-type {\n",
 876 |        "        vertical-align: middle;\n",
 877 |        "    }\n",
 878 |        "\n",
 879 |        "    .dataframe tbody tr th {\n",
 880 |        "        vertical-align: top;\n",
 881 |        "    }\n",
 882 |        "\n",
 883 |        "    .dataframe thead th {\n",
 884 |        "        text-align: right;\n",
 885 |        "    }\n",
 886 |        "</style>\n",
 887 |        "<table border=\"1\" class=\"dataframe\">\n",
 888 |        "  <thead>\n",
 889 |        "    <tr style=\"text-align: right;\">\n",
 890 |        "      <th></th>\n",
 891 |        "      <th>tic</th>\n",
 892 |        "      <th>predicted_return</th>\n",
 893 |        "      <th>trade_date</th>\n",
 894 |        "      <th>mean_weight</th>\n",
 895 |        "      <th>min_weight</th>\n",
 896 |        "    </tr>\n",
 897 |        "  </thead>\n",
 898 |        "  <tbody>\n",
 899 |        "    <tr>\n",
 900 |        "      <th>0</th>\n",
 901 |        "      <td>ACV.1</td>\n",
 902 |        "      <td>0.024449</td>\n",
 903 |        "      <td>19950601</td>\n",
 904 |        "      <td>0.00000</td>\n",
 905 |        "      <td>0.00000</td>\n",
 906 |        "    </tr>\n",
 907 |        "    <tr>\n",
 908 |        "      <th>1</th>\n",
 909 |        "      <td>AES</td>\n",
 910 |        "      <td>0.096917</td>\n",
 911 |        "      <td>19950601</td>\n",
 912 |        "      <td>0.00000</td>\n",
 913 |        "      <td>0.00000</td>\n",
 914 |        "    </tr>\n",
 915 |        "    <tr>\n",
 916 |        "      <th>2</th>\n",
 917 |        "      <td>AHM.1</td>\n",
 918 |        "      <td>0.044516</td>\n",
 919 |        "      <td>19950601</td>\n",
 920 |        "      <td>0.01200</td>\n",
 921 |        "      <td>0.00522</td>\n",
 922 |        "    </tr>\n",
 923 |        "    <tr>\n",
 924 |        "      <th>3</th>\n",
 925 |        "      <td>AMH.1</td>\n",
 926 |        "      <td>0.105036</td>\n",
 927 |        "      <td>19950601</td>\n",
 928 |        "      <td>0.00000</td>\n",
 929 |        "      <td>0.00000</td>\n",
 930 |        "    </tr>\n",
 931 |        "    <tr>\n",
 932 |        "      <th>4</th>\n",
 933 |        "      <td>AMT.1</td>\n",
 934 |        "      <td>0.085373</td>\n",
 935 |        "      <td>19950601</td>\n",
 936 |        "      <td>0.00000</td>\n",
 937 |        "      <td>0.00000</td>\n",
 938 |        "    </tr>\n",
 939 |        "    <tr>\n",
 940 |        "      <th>5</th>\n",
 941 |        "      <td>AOS</td>\n",
 942 |        "      <td>0.061494</td>\n",
 943 |        "      <td>19950601</td>\n",
 944 |        "      <td>0.00000</td>\n",
 945 |        "      <td>0.00000</td>\n",
 946 |        "    </tr>\n",
 947 |        "    <tr>\n",
 948 |        "      <th>6</th>\n",
 949 |        "      <td>APCC.</td>\n",
 950 |        "      <td>0.160571</td>\n",
 951 |        "      <td>19950601</td>\n",
 952 |        "      <td>0.00872</td>\n",
 953 |        "      <td>0.02036</td>\n",
 954 |        "    </tr>\n",
 955 |        "    <tr>\n",
 956 |        "      <th>7</th>\n",
 957 |        "      <td>APH</td>\n",
 958 |        "      <td>0.080985</td>\n",
 959 |        "      <td>19950601</td>\n",
 960 |        "      <td>0.01136</td>\n",
 961 |        "      <td>0.00000</td>\n",
 962 |        "    </tr>\n",
 963 |        "    <tr>\n",
 964 |        "      <th>8</th>\n",
 965 |        "      <td>ARG</td>\n",
 966 |        "      <td>0.059334</td>\n",
 967 |        "      <td>19950601</td>\n",
 968 |        "      <td>0.00000</td>\n",
 969 |        "      <td>0.00000</td>\n",
 970 |        "    </tr>\n",
 971 |        "    <tr>\n",
 972 |        "      <th>9</th>\n",
 973 |        "      <td>ATI.1</td>\n",
 974 |        "      <td>0.170435</td>\n",
 975 |        "      <td>19950601</td>\n",
 976 |        "      <td>0.00926</td>\n",
 977 |        "      <td>0.00000</td>\n",
 978 |        "    </tr>\n",
 979 |        "    <tr>\n",
 980 |        "      <th>10</th>\n",
 981 |        "      <td>AVATQ</td>\n",
 982 |        "      <td>0.051080</td>\n",
 983 |        "      <td>19950601</td>\n",
 984 |        "      <td>0.00020</td>\n",
 985 |        "      <td>0.00000</td>\n",
 986 |        "    </tr>\n",
 987 |        "    <tr>\n",
 988 |        "      <th>11</th>\n",
 989 |        "      <td>BAY.3</td>\n",
 990 |        "      <td>0.088882</td>\n",
 991 |        "      <td>19950601</td>\n",
 992 |        "      <td>0.00975</td>\n",
 993 |        "      <td>0.00797</td>\n",
 994 |        "    </tr>\n",
 995 |        "    <tr>\n",
 996 |        "      <th>12</th>\n",
 997 |        "      <td>BBBY</td>\n",
 998 |        "      <td>0.108766</td>\n",
 999 |        "      <td>19950601</td>\n",
1000 |        "      <td>0.00374</td>\n",
1001 |        "      <td>0.05000</td>\n",
1002 |        "    </tr>\n",
1003 |        "    <tr>\n",
1004 |        "      <th>13</th>\n",
1005 |        "      <td>BBY</td>\n",
1006 |        "      <td>0.184360</td>\n",
1007 |        "      <td>19950601</td>\n",
1008 |        "      <td>0.01155</td>\n",
1009 |        "      <td>0.05000</td>\n",
1010 |        "    </tr>\n",
1011 |        "    <tr>\n",
1012 |        "      <th>14</th>\n",
1013 |        "      <td>BEV</td>\n",
1014 |        "      <td>0.064850</td>\n",
1015 |        "      <td>19950601</td>\n",
1016 |        "      <td>0.00000</td>\n",
1017 |        "      <td>0.00000</td>\n",
1018 |        "    </tr>\n",
1019 |        "    <tr>\n",
1020 |        "      <th>15</th>\n",
1021 |        "      <td>BF.B</td>\n",
1022 |        "      <td>0.036150</td>\n",
1023 |        "      <td>19950601</td>\n",
1024 |        "      <td>0.02235</td>\n",
1025 |        "      <td>0.00000</td>\n",
1026 |        "    </tr>\n",
1027 |        "    <tr>\n",
1028 |        "      <th>16</th>\n",
1029 |        "      <td>BGEN</td>\n",
1030 |        "      <td>0.057776</td>\n",
1031 |        "      <td>19950601</td>\n",
1032 |        "      <td>0.01425</td>\n",
1033 |        "      <td>0.05000</td>\n",
1034 |        "    </tr>\n",
1035 |        "    <tr>\n",
1036 |        "      <th>17</th>\n",
1037 |        "      <td>BGG</td>\n",
1038 |        "      <td>0.066459</td>\n",
1039 |        "      <td>19950601</td>\n",
1040 |        "      <td>0.00000</td>\n",
1041 |        "      <td>0.05000</td>\n",
1042 |        "    </tr>\n",
1043 |        "    <tr>\n",
1044 |        "      <th>18</th>\n",
1045 |        "      <td>BIIB</td>\n",
1046 |        "      <td>0.081258</td>\n",
1047 |        "      <td>19950601</td>\n",
1048 |        "      <td>0.01236</td>\n",
1049 |        "      <td>0.00000</td>\n",
1050 |        "    </tr>\n",
1051 |        "    <tr>\n",
1052 |        "      <th>19</th>\n",
1053 |        "      <td>BLL</td>\n",
1054 |        "      <td>0.039363</td>\n",
1055 |        "      <td>19950601</td>\n",
1056 |        "      <td>0.03366</td>\n",
1057 |        "      <td>0.00000</td>\n",
1058 |        "    </tr>\n",
1059 |        "  </tbody>\n",
1060 |        "</table>\n",
1061 |        "</div>"
1062 |       ],
1063 |       "text/plain": [
1064 |        "      tic  predicted_return  trade_date  mean_weight  min_weight\n",
1065 |        "0   ACV.1          0.024449    19950601      0.00000     0.00000\n",
1066 |        "1     AES          0.096917    19950601      0.00000     0.00000\n",
1067 |        "2   AHM.1          0.044516    19950601      0.01200     0.00522\n",
1068 |        "3   AMH.1          0.105036    19950601      0.00000     0.00000\n",
1069 |        "4   AMT.1          0.085373    19950601      0.00000     0.00000\n",
1070 |        "5     AOS          0.061494    19950601      0.00000     0.00000\n",
1071 |        "6   APCC.          0.160571    19950601      0.00872     0.02036\n",
1072 |        "7     APH          0.080985    19950601      0.01136     0.00000\n",
1073 |        "8     ARG          0.059334    19950601      0.00000     0.00000\n",
1074 |        "9   ATI.1          0.170435    19950601      0.00926     0.00000\n",
1075 |        "10  AVATQ          0.051080    19950601      0.00020     0.00000\n",
1076 |        "11  BAY.3          0.088882    19950601      0.00975     0.00797\n",
1077 |        "12   BBBY          0.108766    19950601      0.00374     0.05000\n",
1078 |        "13    BBY          0.184360    19950601      0.01155     0.05000\n",
1079 |        "14    BEV          0.064850    19950601      0.00000     0.00000\n",
1080 |        "15   BF.B          0.036150    19950601      0.02235     0.00000\n",
1081 |        "16   BGEN          0.057776    19950601      0.01425     0.05000\n",
1082 |        "17    BGG          0.066459    19950601      0.00000     0.05000\n",
1083 |        "18   BIIB          0.081258    19950601      0.01236     0.00000\n",
1084 |        "19    BLL          0.039363    19950601      0.03366     0.00000"
1085 |       ]
1086 |      },
1087 |      "execution_count": 45,
1088 |      "metadata": {},
1089 |      "output_type": "execute_result"
1090 |     }
1091 |    ],
1092 |    "source": [
1093 |     "stocks_weight_table.head(20)\n"
1094 |    ]
1095 |   },
1096 |   {
1097 |    "cell_type": "code",
1098 |    "execution_count": 46,
1099 |    "metadata": {},
1100 |    "outputs": [
1101 |     {
1102 |      "data": {
1103 |       "text/plain": [
1104 |        "(12932, 5)"
1105 |       ]
1106 |      },
1107 |      "execution_count": 46,
1108 |      "metadata": {},
1109 |      "output_type": "execute_result"
1110 |     }
1111 |    ],
1112 |    "source": [
1113 |     "stocks_weight_table.shape"
1114 |    ]
1115 |   },
1116 |   {
1117 |    "cell_type": "markdown",
1118 |    "metadata": {},
1119 |    "source": [
1120 |     "## save to excel or csv"
1121 |    ]
1122 |   },
1123 |   {
1124 |    "cell_type": "code",
1125 |    "execution_count": 47,
1126 |    "metadata": {},
1127 |    "outputs": [],
1128 |    "source": [
1129 |     "stocks_weight_table.to_excel('Data/stocks_weight_table.xlsx','Sheet1')\n"
1130 |    ]
1131 |   },
1132 |   {
1133 |    "cell_type": "code",
1134 |    "execution_count": null,
1135 |    "metadata": {},
1136 |    "outputs": [],
1137 |    "source": []
1138 |   },
1139 |   {
1140 |    "cell_type": "code",
1141 |    "execution_count": null,
1142 |    "metadata": {},
1143 |    "outputs": [],
1144 |    "source": []
1145 |   },
1146 |   {
1147 |    "cell_type": "code",
1148 |    "execution_count": null,
1149 |    "metadata": {},
1150 |    "outputs": [],
1151 |    "source": []
1152 |   },
1153 |   {
1154 |    "cell_type": "code",
1155 |    "execution_count": null,
1156 |    "metadata": {},
1157 |    "outputs": [],
1158 |    "source": []
1159 |   }
1160 |  ],
1161 |  "metadata": {
1162 |   "kernelspec": {
1163 |    "display_name": "Python 3",
1164 |    "language": "python",
1165 |    "name": "python3"
1166 |   },
1167 |   "language_info": {
1168 |    "codemirror_mode": {
1169 |     "name": "ipython",
1170 |     "version": 3
1171 |    },
1172 |    "file_extension": ".py",
1173 |    "mimetype": "text/x-python",
1174 |    "name": "python",
1175 |    "nbconvert_exporter": "python",
1176 |    "pygments_lexer": "ipython3",
1177 |    "version": "3.6.5"
1178 |   }
1179 |  },
1180 |  "nbformat": 4,
1181 |  "nbformat_minor": 2
1182 | }
1183 | 


--------------------------------------------------------------------------------
/fundamental_run_model.py:
--------------------------------------------------------------------------------
  1 | import warnings
  2 | warnings.filterwarnings("ignore")
  3 | 
  4 | import pandas as pd
  5 | import numpy as np
  6 | import time
  7 | import traceback
  8 | import sys
  9 | sys.path.append('code')
 10 | import ml_model
 11 | 
 12 | 
 13 | 
 14 | 
 15 | if __name__ == '__main__':
 16 |     import argparse
 17 |     parser = argparse.ArgumentParser()
 18 |     
 19 |     #sector name
 20 |     parser.add_argument('-sector_name','--sector_name_input', type=str,  required=True,help='sector name: i.e. sector10')
 21 | 
 22 |     # file name
 23 |     parser.add_argument('-fundamental','--fundamental_input', type=str,  required=True,help='inputfile name for fundamental table')
 24 |     parser.add_argument('-sector','--sector_input', type=str,  required=True,help='inputfile name for individual sector')
 25 |     
 26 |     # rolling window variables
 27 |     parser.add_argument("-first_trade_index", default=20, type=int)
 28 |     parser.add_argument("-testing_window", default=4, type=int)
 29 |     
 30 |     # column name
 31 |     parser.add_argument("-label_column", default='y_return', type=str)
 32 |     parser.add_argument("-date_column", default='tradedate', type=str)
 33 |     parser.add_argument("-tic_column", default='tic', type=str)
 34 |     parser.add_argument("-no_feature_column_names", default = ['gvkey', 'tic', 'datadate', 'rdq', 'tradedate', 'fyearq', 'fqtr',
 35 |        'conm', 'datacqtr', 'datafqtr', 'gsector','y_return'], type=list,help='column names that are not fundamental features')
 36 | 
 37 |     
 38 | 
 39 |     args = parser.parse_args()
 40 |     #load fundamental table
 41 |     inputfile_fundamental = args.fundamental_input
 42 |     
 43 |     fundamental_total=pd.read_excel(inputfile_fundamental)
 44 |     fundamental_total=fundamental_total[fundamental_total['tradedate'] < 20170901]
 45 |     #get all unique quarterly date
 46 |     unique_datetime = sorted(fundamental_total.tradedate.unique())
 47 | 
 48 |     # load sector data
 49 |     inputfile_sector = args.sector_input
 50 |     sector_data=pd.read_excel(inputfile_sector)
 51 | 
 52 |     #get sector unique ticker
 53 |     unique_ticker=sorted(sector_data.tic.unique())
 54 | 
 55 |     #set rolling window
 56 |     # train: 4 years = 16 quarters
 57 |     # test: 1 year = 4 quarters
 58 |     # so first trade date = #20 quarter
 59 |     #first trade date is 1995-06-01
 60 |     first_trade_date_index=args.first_trade_index
 61 | 
 62 |     #testing window
 63 |     testing_windows = args.testing_window
 64 | 
 65 |     #get all backtesting period trade dates
 66 |     trade_date=unique_datetime[first_trade_date_index:]
 67 |     
 68 |     #variable column name
 69 |     label_column = args.label_column
 70 |     date_column = args.date_column
 71 |     tic_column = args.tic_column
 72 |     
 73 |     # features column: different base on sectors
 74 |     no_feature_column_names = args.no_feature_column_names
 75 |     features_column = [x for x in sector_data.columns.values if x not in no_feature_column_names]
 76 |     
 77 |     #sector name
 78 |     sector_name = args.sector_name_input
 79 |     
 80 |     try:
 81 |         start = time.time()
 82 |         model_result=ml_model.run_4model(sector_data,
 83 |                                             features_column, 
 84 |                                             label_column, 
 85 |                                             date_column,
 86 |                                             tic_column,
 87 |                                             unique_ticker, 
 88 |                                             unique_datetime, 
 89 |                                             trade_date,
 90 |                                             first_trade_date_index,
 91 |                                             testing_windows)
 92 |         end = time.time()
 93 |         print('Time Spent: ',(end-start)/60,' minutes')
 94 |         ml_model.save_model_result(model_result,sector_name)
 95 | 
 96 |     except e:
 97 |         print(e)
 98 | 
 99 |     
100 | 
101 | # python3 fundamental_run_model.py -sector_name sector10 -fundamental Data/fundamental_final_table.xlsx -sector Data/1-focasting_data/sector10_clean.xlsx 
102 | 


--------------------------------------------------------------------------------