├── images ├── strategy_overview.png ├── PortfolioPerformanceBetaCap_1.png ├── PortfolioPerformanceBetaCap_10.png ├── Distribution of Portfolio Returns.png ├── PortfolioPerformanceBetaCap_10_Split.png ├── PortfolioPerformanceBetaCap_1_Split.png ├── Distribution of Portfolio Mean Returns.png ├── Monthly Standard Deviations Line Chart.png ├── PorfolioPerformanceBetaCap_1_DataFrame.png ├── PortfolioPerformanceWithVaryingBetaCaps_3.png ├── PortfolioPerformanceBetaCap_1_AgainstMarket.png ├── PortfolioPerformanceBetaCap_1_withoutSentiment.png ├── PortfolioPerformanceBetaCap_1_withoutSentiment_stress.png ├── PorfolioPerformanceBetaCap_1_DataFrame_withoutSentiment.png ├── PorfolioPerformanceBetaCap_1_DataFrame_withoutSentiment_stress.png └── .ipynb_checkpoints │ └── PortfolioPerformanceBetaCap_1_withoutSentiment-checkpoint.png ├── HM Capital Management Pitchbook.pptx ├── README.md ├── HM Capital Management Strategy Write Up.ipynb └── strategy.py /images/strategy_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/strategy_overview.png -------------------------------------------------------------------------------- /HM Capital Management Pitchbook.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/HM Capital Management Pitchbook.pptx -------------------------------------------------------------------------------- /images/PortfolioPerformanceBetaCap_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/PortfolioPerformanceBetaCap_1.png -------------------------------------------------------------------------------- /images/PortfolioPerformanceBetaCap_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/PortfolioPerformanceBetaCap_10.png -------------------------------------------------------------------------------- /images/Distribution of Portfolio Returns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/Distribution of Portfolio Returns.png -------------------------------------------------------------------------------- /images/PortfolioPerformanceBetaCap_10_Split.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/PortfolioPerformanceBetaCap_10_Split.png -------------------------------------------------------------------------------- /images/PortfolioPerformanceBetaCap_1_Split.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/PortfolioPerformanceBetaCap_1_Split.png -------------------------------------------------------------------------------- /images/Distribution of Portfolio Mean Returns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/Distribution of Portfolio Mean Returns.png 
-------------------------------------------------------------------------------- /images/Monthly Standard Deviations Line Chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/Monthly Standard Deviations Line Chart.png -------------------------------------------------------------------------------- /images/PorfolioPerformanceBetaCap_1_DataFrame.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/PorfolioPerformanceBetaCap_1_DataFrame.png -------------------------------------------------------------------------------- /images/PortfolioPerformanceWithVaryingBetaCaps_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/PortfolioPerformanceWithVaryingBetaCaps_3.png -------------------------------------------------------------------------------- /images/PortfolioPerformanceBetaCap_1_AgainstMarket.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/PortfolioPerformanceBetaCap_1_AgainstMarket.png -------------------------------------------------------------------------------- /images/PortfolioPerformanceBetaCap_1_withoutSentiment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/PortfolioPerformanceBetaCap_1_withoutSentiment.png -------------------------------------------------------------------------------- /images/PortfolioPerformanceBetaCap_1_withoutSentiment_stress.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/PortfolioPerformanceBetaCap_1_withoutSentiment_stress.png -------------------------------------------------------------------------------- /images/PorfolioPerformanceBetaCap_1_DataFrame_withoutSentiment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/PorfolioPerformanceBetaCap_1_DataFrame_withoutSentiment.png -------------------------------------------------------------------------------- /images/PorfolioPerformanceBetaCap_1_DataFrame_withoutSentiment_stress.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/PorfolioPerformanceBetaCap_1_DataFrame_withoutSentiment_stress.png -------------------------------------------------------------------------------- /images/.ipynb_checkpoints/PortfolioPerformanceBetaCap_1_withoutSentiment-checkpoint.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/.ipynb_checkpoints/PortfolioPerformanceBetaCap_1_withoutSentiment-checkpoint.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Quantitative Investment Strategy 2 | Code can be found [`strategy.py`](strategy.py). 
3 | 
4 | ## Overview
5 | This project combines logistic regression, gradient boosting, and LSTMs to predict next-month equity returns using fundamental and sentiment features. See below for details.
6 | 
7 | 
8 | 
9 | 
10 | 
11 | 
12 | 
13 | *Notice: this analysis is in-sample inference only.*
14 | 
15 | 
16 | ### Motivation Behind Exploring this Strategy:
17 | 
18 | Our motivation is to explore investable opportunities in equities by analyzing their sentiment features. Quandl provided pre-computed sentiment features for equities that were easy to access and analyze. Additionally, we wanted to explore basic machine learning algorithms for predicting next-period returns and to measure performance against the market as a benchmark.
19 | 
20 | 
21 | 
22 | Zhang and Skiena (2010) use sentiment data to build simple long-short strategies. Their dataset contains information similar to ours: it uses features based on "polarity" (how positive or negative a mention is) and on the volume of news mentions. One difference is that they separate information from blogs and news, while our dataset combines all sources. They find that patterns from opinion in blogs persist longer than those from news, which makes sense since opinions are less likely to contain "true" information that will change investors' decisions. The paper provides evidence that sentiment-based investing, which we analyze in this write-up, is widespread.
23 | 
24 | Fundamental analysis is a widely known investing technique that takes many forms but is ultimately based on how a company is performing, as represented by its fundamental financial data. Charles Lee, the Henrietta J. Louis Professor at Cornell University, connects fundamental value and investor sentiment in what he calls "fusion investing" (2003). Lee states that "researchers are finding that even though returns are difficult to predict, prediction is not an impossible task" and that there are observable systematic patterns in price return series (Lee 2003). While Lee focuses on sentiment observed from investors, our strategy uses the available news sentiment data about the company itself and attempts to predict performance within those hypothesized systematic patterns.
25 | 
26 | ### Creating a Universe of Equities to Use:
27 | Using sentiment as the basis for our strategy requires a few components: a universe of equities, sentiment data for the strategy itself, end-of-day price data to measure performance, and fundamental financial ratios to complement the sentiment data. The fundamental data consists of a set of Zacks files from the Quandl database, and both the sentiment data and the end-of-day data were queried through the Quandl API.
28 | 
29 | Since the fundamental data comes as fixed CSV files, available in [Quandl's Zacks Collection B database](https://www.quandl.com/databases/ZFB/data "Quandl Zacks"), that is where the universe of equities started. The fundamental dataset contains **9107** unique tickers, which established the foundation for our universe. Its feature columns are: *Weighted Average Shares Outstanding, Current Ratio, Gross Margin, Operating Profit Margin, EBIT Margin, Pretax Profit Margin, Profit Margin, Free Cash Flow, Asset Turnover, Inventory Turnover, Receivables Turnover, Days Sales in Receivables, Return on Equity, Return on Tangible Equity, Return on Assets, Return on Investments, Free Cash Flow Per Share, Book Value Per Share, and Operating Cash Flow Per Share.*
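To make the seeding step concrete, the sketch below pulls a ticker universe out of a set of local Zacks CSV files. The file pattern and the `ticker` column name are illustrative assumptions, not the project's actual schema; the real logic lives in *ObtainDataFile.py*.

```python
# Illustrative sketch only: seed the universe from local Zacks fundamentals CSVs.
# The file pattern and the 'ticker' column name are assumptions, not the project's actual schema.
import glob
import pandas as pd

fundamental_files = glob.glob('ZACKS_*.csv')                 # hypothetical local file names
fundamentals = pd.concat((pd.read_csv(f) for f in fundamental_files), ignore_index=True)
universe = sorted(fundamentals['ticker'].dropna().unique())  # assumed ticker column
print(len(universe))                                         # roughly 9107 tickers in the raw universe
```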
30 | 
31 | Expecting end-of-day data to be easier to find than sentiment data, we next queried [Quandl's sentiment database](https://www.quandl.com/data/NS1-FinSentS-Web-News-Sentiment "Quandl FinSent") with the **9107** tickers. The database provides 5 different sentiment features: *Sentiment, Sentiment High, Sentiment Low, News Volume,* and *News Buzz*.
32 | - The *Sentiment* feature is a numeric measure of the bullishness / bearishness of news coverage of the stock.
33 | - The *Sentiment High/Low* features are the highest and lowest intraday sentiment scores.
34 | - The *News Volume* feature is the absolute number of news articles covering the stock.
35 | - The *News Buzz* feature is a numeric measure of the change in coverage volume for the stock.
36 | 
37 | After filtering out tickers that were not present in the database and handling errors and data quality issues, such as missing data, the sentiment dataset held **4753** unique tickers with the 5 feature sets.
38 | 
39 | We then took the smaller of the two ticker lists and queried [Quandl's EOD database](https://www.quandl.com/data/EOD-End-of-Day-US-Stock-Prices "Quandl EOD"). After handling data quality issues, the EOD dataset contained **2490** unique tickers.
40 | 
41 | To merge the three datasets, we selected tickers from the dataset with the smallest ticker count: the end-of-day data.
42 | 
43 | Running the concatenation loop, with a few additional filters to handle remaining data quality issues, leaves a total of **1130** tickers in our universe with the necessary fundamental, sentiment, and end-of-day data.
44 | 
45 | The output of the _ObtainDataFile.py_ script is a dictionary with two initial keys: *Raw_Data* and *Factors*. The *Factors* key contains a dictionary of [monthly](https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/f-f_5_factors_2x3.html "Fama-French 5 Factor Monthly Data") and [daily](https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/f-f_5_factors_2x3.html "Fama-French 5 Factor Daily Data") Five Factor datasets from [Fama and French](http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/index.html "Kenneth French's Website"). The *Raw_Data* value maps each of the **1130** tickers to the dataframe produced by the concatenation.
46 | 
47 | 
48 | 
49 | 
50 | 
51 | 
52 | ```python
53 | # from ObtainDataFile import *
54 | #from PerformanceMetrics import *
55 | # from strategy import *
56 | ```
57 | 
58 | Running the *start_running()* function starts the process of gathering and combining the data streams and returns a single dictionary data structure with the universe of tickers.
59 | 
60 | 
61 | ```python
62 | # all_data = start_running()
63 | # raw_data = pd.read_pickle('complete_dataset.pickle')
64 | ```
65 | 
66 | Data compilation is complete. 
67 | 1130 tickers in universe.
68 | File name is complete_dataset.pickle and is located in the local directory.
69 | 
70 | 
71 | The *.read_pickle()* call reads, rather than recompiles, the single dictionary data structure with the universe of tickers.
72 | 
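Reading the pickle back gives the structure described above. A minimal sketch of inspecting it follows; the key names *Raw_Data* and *Factors* come from the write-up, while everything else is illustrative.

```python
# Minimal sketch: inspect the compiled data structure described above.
import pandas as pd

raw_data = pd.read_pickle('complete_dataset.pickle')

print(list(raw_data.keys()))             # ['Raw_Data', 'Factors']
print(len(raw_data['Raw_Data']))         # 1130 tickers in the universe
print(list(raw_data['Factors'].keys()))  # the monthly and daily Fama-French five-factor tables

# Each ticker maps to a DataFrame of daily prices, sentiment, and fundamental features.
some_ticker = next(iter(raw_data['Raw_Data']))
print(raw_data['Raw_Data'][some_ticker].head())
```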
73 | ## Strategy Description:
74 | 
75 | This is a long-only equity strategy with monthly frequency. The equities for a given month are selected based on their predicted probability of having positive returns in the next month. The probability is predicted using three models averaged together:
76 | 1. logistic regression
77 | 2. gradient boosting
78 | 3. LSTM (long short-term memory)
79 | 
80 | #### Target Variable
81 | * 0: returns in the following month are _negative_
82 | * 1: returns in the following month are _positive_
83 | 
84 | In other words, we predict the binary target variable of returns being positive or negative in the following month using three separate models, then average their predictions (which are probabilities).
85 | 
86 | #### Equity selection
87 | * average the three models' predicted probabilities
88 | * select the top 20% and go long a random half of them (10% of the universe in total); a simplified sketch of this selection step appears below
89 | * go short the market by $\beta$ times the amount you go long each equity, to keep the portfolio approximately market neutral.
90 | 
91 | Beta is estimated by regressing the equity's daily returns on the market's (Nasdaq) daily returns over the previous month. We do _not_ go short any individual equities, to avoid the complexities of shorting single names.
92 | 
93 | Hyperparameters are selected using grid search for the logistic regression and gradient boosting models, and default values are used for the LSTM. The LSTM uses time windows of length three (i.e., predicting next month's return sign from the prior three months' values). The LSTM could be further tuned to possibly improve performance.
94 | 
95 | #### Input features
96 | * features arrive at daily frequency
97 | * they are aggregated to monthly frequency using quantiles
98 | * specifically, the 50%, 75%, and 90% quantiles of daily values over the previous month.
99 | 
100 | For example, five daily features turn into 15 monthly features (5 features * 3 quantiles). This aggregation method lets us use information about the _higher_ values from the previous month (90% quantile) as well as the _more typical_ values (50% quantile).
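As a concrete illustration of this daily-to-monthly aggregation, here is a generic pandas sketch. It follows the description above (50%/75%/90% quantiles over each month); column names are illustrative and the repository's own aggregation code may differ in detail.

```python
# Generic sketch of the daily-to-monthly quantile aggregation described above.
# Assumes a DataFrame of daily feature values with a DatetimeIndex; column names are illustrative.
import pandas as pd

def monthly_quantile_features(daily: pd.DataFrame,
                              quantiles=(0.50, 0.75, 0.90)) -> pd.DataFrame:
    """One row per month, one column per (feature, quantile) pair."""
    frames = []
    for q in quantiles:
        agg = daily.resample('M').quantile(q)   # e.g. the 90% quantile of each feature over the month
        agg.columns = [f'{col}_q{int(round(q * 100))}' for col in agg.columns]
        frames.append(agg)
    return pd.concat(frames, axis=1)

# Five daily sentiment columns become 15 monthly columns (5 features x 3 quantiles):
# monthly = monthly_quantile_features(daily_sentiment_df)
```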
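The prediction-averaging and selection rule described above can also be sketched in simplified form. Only the logistic regression and gradient boosting legs are shown (the LSTM leg is omitted for brevity), and the parameter grids, function name, and variable names are illustrative rather than the project's actual `develop_all_three_models` / `run_strategy` code.

```python
# Simplified sketch of the probability-averaging and top-20%/random-half selection described above.
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

def select_long_tickers(X_train, y_train, X_next, tickers, frac_top=0.20, seed=None):
    """Average two models' probabilities of a positive next-month return,
    keep the top `frac_top` of tickers, then go long a random half of them."""
    logit = GridSearchCV(LogisticRegression(max_iter=1000), {'C': [0.1, 1.0, 10.0]}, cv=3)
    gbm = GridSearchCV(GradientBoostingClassifier(), {'n_estimators': [100, 200]}, cv=3)
    logit.fit(X_train, y_train)
    gbm.fit(X_train, y_train)

    # Average the predicted probabilities of the positive class (the LSTM would be a third term here).
    p_avg = (logit.predict_proba(X_next)[:, 1] + gbm.predict_proba(X_next)[:, 1]) / 2.0

    top = np.argsort(p_avg)[::-1][: int(len(tickers) * frac_top)]  # top 20% by averaged probability
    rng = np.random.default_rng(seed)
    longs = rng.choice(top, size=len(top) // 2, replace=False)     # random half of the top bucket
    return [tickers[i] for i in longs]
```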
101 | 
102 | ### Data
103 | * 1130 tickers
104 | * 76 months (January 2013 to April 2019)
105 | * 69 features
106 |   * 23 base features, each with 3 quantiles
107 |     * 5 sentiment
108 |     * 17 fundamental
109 | 
110 | 
111 | 
112 | #### Generating Portfolios from our Randomly Selected Equity Positions in the top performing **20%** based on Performance Metrics
113 | 
114 | One idea was to take a random selection of positions from those the strategy ranks in the top **20%** quantile. We generated 100 such portfolios, and all of them produced similar results, as you can see below. The figure of 100 histograms shows the distribution of their returns, with a density line for each chart.
115 | 
116 | 
117 | 
118 | 
119 | Each portfolio also performed similarly across the months. Below, you can see a single histogram with a density line describing the mean returns across all portfolios for each month.
120 | 
121 | 
122 | 
123 | 
124 | 
125 | By taking the standard deviation of the monthly PnL across all the portfolios, we can observe the values increasing over time, as the figure below shows.
126 | 
127 | 
128 | 
129 | 
130 | 
131 | #### Analysis on Generating Portfolios from Random Equity Selections:
132 | While the initial idea seemed promising, the randomly populated portfolios showed that there was no outstanding value in randomly selecting from our top performing equities in the top **20%** quantile. This is easy to see in the **Distribution of Portfolio Returns** figure, which shows that the portfolios behaved more or less the same over the given time period.
133 | 
134 | ### Running the Strategy:
135 | 
136 | 
137 | ```python
138 | # i = 0.7 #fraction that is training data
139 | # b_cap = 10 #beta cap
140 | # s = True #includes sentiment features
141 | # df_test, results = develop_all_three_models(frac_training=i, use_sentiment=s)
142 | # results_summary = run_strategy(df_test, ticker_dict=raw_data['Raw_Data'], sort_column='y_pred_avg',
143 | # seed=False, frac_long_tickers=0.01, mult_extra=2, beta_cap_floor=-b_cap,
144 | # beta_cap_ceil=b_cap, plot=False)
145 | ```
146 | 
147 | ### Strategy Results and Performance Metrics:
148 | 
149 | 
150 | ```python
151 | # take input of strategy.
152 | # total_dict = getStatsAll(results_summary) #using a beta cap of +/- 10.
153 | ```
154 | 
155 | 
156 | 
157 | 
158 | 
159 | *Notice: this analysis is in-sample inference only.*
160 | 
161 | ### The longer testing periods are more reliable
162 | * The strategy is developed using progressively larger training sets. The orange line was trained on only 10% of months (starting January 2013) and tested on the remaining 90% of months (up to April 2019).
163 | * The strategy tested for the shortest period is the brown line, and its returns are not good: over its five test months it swings widely and finishes barely positive.
164 | * This variant (three models estimated on training data, then averaged) is also the least reliable because it was tested on the shortest period. It doesn't get much time to reveal its performance on test data, so we do not rely on this short test period.
165 | * On the other hand, using a shorter training set risks the model not getting enough data to find the patterns in the data. With shorter training sets, the model could be less predictive, but we are more likely to detect that it is unpredictive. With longer training sets, the model could be more predictive, but we are less likely to detect that it is unpredictive.
166 | * In short, the brown line is the least reliable, due to its short testing period.
167 | 
168 | ### The trend is up, but with a large drop at the end
169 | * By contrast, the orange line was tested on 90% of the data, giving it a long testing period to reveal its out-of-sample performance. The orange line has an upward trend and a sharp dip at the end. This suggests some optimism, but so do the market returns. The sudden drop, however, is a concern.
170 | 
171 | ### The large drop comes from a large net short position in the market moving against us
172 | * The drop is due to a large net short position in the market that happens to turn against us. This large short position comes from an unusually large individual beta. The large net short position (from one stock with an unusually large beta) is not unique, but the adverse price move while holding that position is. There are other periods where we hold large short market positions, but the price doesn't move as badly in those periods. In February and March, it does hurt us (the arithmetic below illustrates how a single large beta inflates the hedge).
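The size of that net short position scales one-for-one with the estimated beta, which is why a single outsized beta can dominate the book. A toy illustration follows; the numbers are made up, and this mirrors the hedging rule described earlier rather than the exact `calculate_mkt_positions` output.

```python
# Toy illustration: the market hedge is -beta times the long notional, so one
# unusually large beta estimate creates a very large net short index position.
long_notional = 100_000                # dollars long in one selected stock
for beta in (1.0, 5.0, 12.0, 30.0):    # typical vs. unusually large prior-month betas
    hedge = -beta * long_notional      # dollars short in the index against that one stock
    print(f'beta={beta:>5.1f}  index hedge = {hedge:>12,.0f}')
```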
173 | 
174 | ### Medium-range forecasts are similar, but with a slower start
175 | * The remaining three strategies (green, red, purple) show similar patterns: general upward trends with a large drop near the end (February and March). However, they begin flat for the first few months before starting their rise.
176 | 
177 | ### Let's address the large drop at the end
178 | * Again, the large drop is due to large short market positions moving against us. It is difficult to predict when the market will move against us, so instead we cap our short positions so that market drops don't hurt as much (_favorable_ market moves will also help us less under capping; it limits both downside and upside).
179 | * We now apply a hard cap to the short market positions that we take. The betas of individual stocks in any prior month are generally less than 5, but are sometimes larger than 10 or even 30. The strategy runs above used a cap of 10, so we now tighten the cap to 5 and then to 1 (applied in absolute value) to see if we can soften that large drop.
180 | 
181 | ## Analysis
182 | 
183 | 
184 | ```python
185 | # take input of strategy.
186 | # total_dict = getStatsAll(results_summary, beta_plot = True, training_split_value_to_plot=0.3) #using a beta cap of +/- 1.
187 | ```
188 | 
189 | 
190 | 
191 | 
192 | 
193 | 
194 | 
195 | Regarding the Top Figure above:
196 | * With the tight cap on beta (between -1 and 1), we again see a general upward trend. The strategy using the training size of 30% (green line) under the tightest beta constraint of 1.0 outperforms the other training sizes in the same beta cap grouping.
197 | 
198 | Regarding the Middle Figure of 5 plots above:
199 | * When the beta cap group of training sizes is separated into individual charts, the 30% and 50% splits outperform the market, but only the 30% split does so significantly.
200 | 
201 | Regarding the Bottom Figure above:
202 | * We fix the training fraction at 0.3 and show the strategy under different beta caps. The lines mostly overlap, but the strictest beta cap (1.0) shows the highest performance, especially by the end of the simulation.
203 | * Also, the sharp drops around November 2016 and March 2019 seem to be softened, as was expected from the capping. However, the looser caps (5 and 10) did not seem to curtail the drops; only the cap of 1 does.
204 | * For the first half of the strategy, capping seemed to have minimal effect, but by the end the strict cap led to the best performance.
205 | * While capping could plausibly improve PnL here, keep in mind that it would also reduce market-neutrality, which was the point of the index position in the first place. We do not explore whether market-neutrality was affected here.
206 | * The optimal strategy identified uses a training size of only 30% of the entire dataset with a beta cap of +/- 1.0. In the bottom figure above, the *Beta Cap: 1.0* line easily outperforms the others in the same training size group and outperforms the Market (the purple line). These results show that the strategy made sensible decisions, did not take too many losses, and capitalized on a rising market.
207 | 
208 | 
209 | 
210 | The image above shows the dataframe of strategy results using a beta cap of 1.0. Our best performing strategy, using the training split of 30%, had an **Annualized Return of 18.77%, Annualized Volatility of 27.73%, an Annualized Sharpe of 0.605, a Max Drawdown of 0.167, and an Alpha of 0.014.**
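For reference, the sketch below shows one standard way to derive such annualized figures from a monthly return series. It is a generic illustration, not the project's `getStatsAll` implementation; it assumes a zero risk-free rate, and alpha (which requires a market regression) is omitted.

```python
# Generic sketch: annualized performance metrics from monthly strategy returns.
# Not the project's getStatsAll; assumes a zero risk-free rate.
import numpy as np
import pandas as pd

def summarize_monthly_returns(r: pd.Series) -> dict:
    ann_return = (1 + r).prod() ** (12 / len(r)) - 1     # geometric annualized return
    ann_vol = r.std() * np.sqrt(12)                      # annualized volatility
    sharpe = ann_return / ann_vol                        # Sharpe ratio with rf = 0
    wealth = (1 + r).cumprod()                           # cumulative growth of $1
    max_drawdown = (1 - wealth / wealth.cummax()).max()  # worst peak-to-trough loss
    return {'ann_return': ann_return, 'ann_vol': ann_vol,
            'sharpe': sharpe, 'max_drawdown': max_drawdown}
```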
Below, you can see how the strategy performs against the market since the start of 2015. The strategy looks promising, but we need to stress test to be sure.
211 | 
212 | 
213 | 
214 | ### Stress Testing Against Simulated 2008 Performance Compared to Recent Performance Without Sentiment
215 | 
216 | Since the sentiment data does not extend back before 2013, we opt to stress test the strategy by observing its performance without the sentiment features from 2015 to 2019, and then observing its performance between 2005 and 2010. We then compare the differential performance of the annualized market returns and annualized strategy returns between the two time periods.
217 | 
218 | 
219 | 
220 | 
221 | 
222 | The strategy can be observed against the market over the 2015 to 2019 period above. Take note that the **Annualized Strategy Return is 2.74% against the Market's Annualized Return of 12.67%.** We will compare these, along with the other listed metrics, to the strategy's performance from 2005 to 2010.
223 | 
224 | 
225 | 
226 | 
227 | 
228 | ### Stress Testing Analysis
229 | Now we look at the performance of the stressed strategy from 2007 through 2010. The strategy produces a higher **Annualized Strategy Return of 5.09% against the Market's Annualized Return of 0.58%.** We do see a larger max drawdown and a larger portfolio beta for the stressed strategy, which is to be expected. It is surprising that the strategy not only performed positively during this period but also beat the market and beat the 2015-2019 run we compare it to.
230 | 
231 | Some of the other metrics were better in the 2015-2019 dataset, such as a max drawdown that is almost half as large at **0.259** and a much lower **portfolio beta of 0.332**. Volatility was higher for both the strategy and the market during the stress period, which is expected since this was a deliberate stress run through a known financial crisis.
232 | 
233 | ### Concluding Thoughts:
234 | 
235 | We conclude that there is proof-of-concept evidence for a strategy that leverages sentiment data. While these initial results look profitable and encouraging, it would be wise to continue testing: fine-tune the machine learning models, refine the universe filtering, and stress test against a wider variety of scenarios rather than a single crisis. Sentiment data is a unique feature set, and as new natural language processing techniques continue to be developed and employed, sentiment data will continue to grow and offer new ways of exploring investment opportunities.
236 | 
237 | 
243 | 
244 | ### Bibliography & Data Sources:
245 | 1. **French, Kenneth R.** Kenneth R. French - Home Page. Accessed June 01, 2019. 
246 |     http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/index.html. 
247 | 2. **Lee, Charles M.C.** 2003. Fusion Investing. Tech. AIMR.
248 | 3. **"Quandl."** Quandl.com. Accessed June 01, 2019. 
249 |     https://www.quandl.com/. 
250 | 4. **"The Place For Learning Quant Finance."** Quantopian. Accessed June 01, 2019. 
251 |     https://www.quantopian.com/. 
252 | 5. **"Where Developers Learn, Share, & Build Careers."** Stack Overflow. Accessed June 02, 2019. 
253 |     https://stackoverflow.com/.
254 | 6. 
**Zhang, Wenbin, and Steven Skiena.** 2010. Trading Strategies to Exploit Blog and News Sentiment. Tech. 255 |     Trading Strategies to Exploit Blog and News Sentiment. Association for the Advancement of Artificial Intelligence. 256 | -------------------------------------------------------------------------------- /HM Capital Management Strategy Write Up.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# HM Capital Management\n", 8 | "[Robert Hatem](https://www.linkedin.com/in/robert-e-hatem/ \"LinkedIn Profile: Robert Hatem\") \n", 9 | "[Benjamin Morgan](https://www.linkedin.com/in/benjaminmorgan0921/ \"LinkedIn Profile: Benjamin Morgan\") \n", 10 | "Spring 2019" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "### Motivation Behind Exploring this Strategy:\n", 18 | "\n", 19 | "The motivation behind exploring this strategy is to explore the investable opportunities in equities by analyzing their sentimental features. Quandl provided pre-computed sentiment features for equities that were easily accessible to analyze. Additionally, we wanted to explore basic machine learning algorithms for predicting next period returns and measure performance against the market as a benchmark. \n", 20 | "\n", 21 | "\n", 22 | "\n", 23 | "Skiena et. al. (2010) uses sentiment data to build simple long-short strategies. Skiena’s dataset contains similar information to ours; it uses feature based on “polarity”, how positive or negative the mention is, and volume of news mentions. However, one difference is that Skiena separates information from blogs and news, while our dataset contains all sources together. They find that patterns from opinion in blogs persists longer than from news, which makes sense since opinions are less likely to contain “true” information that will change investors’ investment decisions. This paper provides evidence that sentiment-based investing, which we are analyzing in this write-up, is widespread. \n", 24 | "\n", 25 | "Fundamental analysis and investing is a widely-known investing technique that is used in different approaches but is ultimately based on how the company is perfoming as represented by their fundamental financial data. Charles Lee, the Henrietta J. Louis Professor at Cornell University, strives to connect fundamental value and investor sentiment with what is known as \"fusion investing\" in his Fusion Investing paper (2003). Lee states that while \"researchers are finding that even though returns are difficult to predict, prediction is not an impossible task\" and that there are observable systematic patterns in price return series (Lee 2003). While Lee takes the position of observing sentiment value from an investor, our strategy takes the available news sentiment data of the company, and not an individual investor, and attempts to predict performance within those believed systematic patterns." 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Creating a Universe of Equities to Use:\n", 33 | "The idea of using sentiment as a basis for our strategy requires a few different components. First, we would need a universe of equities to use. We also need specific sentiment data for the strategy, end of day data to measure performance, and fundamental financial ratios as a complement to the sentiment data. 
The fundamental data consists of a set of Zack's files from the Quandl database and both the sentiment data and the end of day data were queried from Quandl using their Quandl API. \n", 34 | "\n", 35 | "Since the fundamental data is from fixed csv files, which can be found in [Quandl's Zacks Collection B database](https://www.quandl.com/databases/ZFB/data \"Quandl Zacks\"), that's where the universe of equities started. The fundamental dataset consists of **9107** unique tickers which established the foundation for our universe. The fundamental dataset contains the following columns as features: *Weighted Average Shares Outstanding, Current Ratio, Gross Margin, Operating Profit Margin, EBIT Margin, Pretax Profit Margin, Profit Margin, Free Cash Flow, Asset Turnover, Inventory Turover, Receivables Turnover, Days Sales in Receivables, Return on Equity, Return on Tangible Equity, Return on Assets, Return on Investments, Free Cash Flow Per Share, Book Value Per Share, and Operating Cash Flow Per Share.*\n", 36 | "\n", 37 | "With the understanding that end of day data would be a bit easier to find than sentiment, we queried [quandl's sentiment database](https://www.quandl.com/data/NS1-FinSentS-Web-News-Sentiment \"Quandl FinSent\") with the **9107** tickers. The database consists of 5 different sentiment features: *Sentiment, Sentiment High, Sentiment Low, News Volume,* and *News Buzz*. \n", 38 | " - The *Sentiment* feature is a numeric measure of the bullishness / bearishness of news coverage of the stock.\n", 39 | " - The *Sentiment High/Low* feature is the highest and lowest intraday sentiment scores.\n", 40 | " - The *News Volume* feature is the absolute number of news articles covering the stock.\n", 41 | " - The *News Buzz* feature is a numeric measure of the change in coverage volume for the stock.\n", 42 | " \n", 43 | "We found that after filtering out the tickers that were not present in the database and handling any errors and data quality issues, such as missing data, the sentiment dataset held **4753** unique tickers with the 5 different feature sets. \n", 44 | "\n", 45 | "At that point, we took the smaller of the two ticker lists and scraped [Quandl's EOD database](https://www.quandl.com/data/EOD-End-of-Day-US-Stock-Prices \"Quandl EOD\"). After handling any present data quality issues, the EOD database contained **2490** unique tickers. \n", 46 | "\n", 47 | "After examining the three unique datasets, the selection of tickers to use for the merging of the three datasets came from the database with the smallest number of tickers: the end of day data. \n", 48 | "\n", 49 | "Running through the concatenating loop and adding a few more necessary filters to handle any remaining data quality issues during the concatenation, leaves a total of **1130** tickers in our universe with the necessary fundamental data, sentiment data, and end of day data. \n", 50 | "\n", 51 | "The output of the _ObtainDataFile.py_ script is a dictionary with two initial keys: *Raw_Data* and *Factors*. The *Factors* key contains a dictionary of [monthly](https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/f-f_5_factors_2x3.html \"Fama-French 5 Factor Monthly Data\") and [daily](https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/f-f_5_factors_2x3.html \"Fama-French 5 Factor Daily Data\") Five Factor datasets from [Fama and French](http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/index.html \"Kenneth French's Website\"). 
The *Raw_Data* value contains **1130** dictionaries with the ticker as the key and a dataframe from the concatenation as the value.\n", 52 | "\n", 53 | "\n", 54 | "\n", 55 | "" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 25, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "# from ObtainDataFile import *\n", 65 | "#from PerformanceMetrics import *\n", 66 | "# from strategy import *" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "By running the *start_running()* function, you will start the process of gathering and combining the data streams together to return a single dictionary data structure with the universe of tickers." 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 10, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "Data compilation is complete. \n", 86 | "1130 tickers in universe.\n", 87 | "File name is complete_dataset.pickle and is located in the local directory.\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "# all_data = start_running()\n", 93 | "# raw_data = pd.read_pickle('complete_dataset.pickle')" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "This *.read_pickle()* statement will read, instead of compile, the single dictionary data structure with the universe of tickers. " 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "## Strategy Description: \n", 108 | "\n", 109 | "This is a long-only equity strategy with monthly frequency. The equities for the given month are selected based on their predicted probability of having positive returns in the next month. The probability is predicted using three models averaged together:\n", 110 | "1. logistic regression\n", 111 | "2. gradient boosting, \n", 112 | "3. LSTM (long short-term memory) \n", 113 | "\n", 114 | "#### Target Variable\n", 115 | "* 0: returns in following month are _negative_\n", 116 | "* 1: returns in following month are _positive_\n", 117 | "\n", 118 | "In other words, we predict the binary target variable of returns being positive or negative in the following month, using three sepearate models, then average their predictions (which are probabilities).\n", 119 | "\n", 120 | "#### Equity selection\n", 121 | "* average the three models' predictions\n", 122 | "* select top 20%, go long a random half of them (10% total)\n", 123 | "* go short the market by an amount $\\beta$ times the amount you go long the equity. This is to be market neutral.\n", 124 | "\n", 125 | "Beta is calculated as the correalation of the equity's daily returns with the market's returns (Nasdaq) over the previous month. We do _not_ go short any equities to avoid the complexities of shorting.\n", 126 | "\n", 127 | "The models are selected using grid-search for logistic and gradient boosting, and using default values for the LSTM. The LSTM uses time windows of length (i.e. predicting next-month's return sign using the prior 3 months' values). The LSTM could be futher optimized to possibly get better performance.\n", 128 | "\n", 129 | "#### Input features\n", 130 | "* Features come as daily\n", 131 | "* then aggregated to monthly using quantiles\n", 132 | "* 50%, 75%, and 90% quantile of daily values over the previous month. \n", 133 | "\n", 134 | "For example, five daily features would turn in to 15 monthly features (5 features * 3 quantiles). 
This aggregation method allows us to use information about the _higher_ values from the previous month (90% quantile) as well as the _more typical_ values (50% quantile).\n", 135 | "\n", 136 | "### Data\n", 137 | "* 1130 tickers\n", 138 | "* 76 months (January 2013 to April 2019)\n", 139 | "* 69 features\n", 140 | "  * 23 base features, each with 3 quantiles\n", 141 | "    * 5 sentiment \n", 142 | "    * 17 fundamental" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "#### Generating Portfolios from our Randomly Selected Equity Positions in the top perfoming **20%** based on Performance Metrics" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "One idea was to take a random selection of positions that the strategy lists in the top **20%** -quantile. We observed 100 generated portfolios that all produced similar results, as you can see below. The figure of 100 histograms shows the distribution of their returns with a density line for each chart. " 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "\n", 171 | "" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "Each portfolio performed similarly across the months as well. Below, you can see a single histogram with a density line describing the mean returns across all portfolios for each month. " 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "\n", 186 | "\n", 187 | "" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "By taking the standard deviation of the monthly PnL across all the portfolios, we can observe the values increasing as time continues, as the figure below shows. " 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "\n", 202 | "\n", 203 | "" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "#### Analysis on Generating Portfolios from Random Equity Selections:\n", 211 | "While the initial thought seemed like a good idea, in reality, the randomly selected equities to populate portfolios showed that there was no outstanding value in randomly selecting from our top performing equities in the top **20%**-quantile. This can easily be viewed in the **Distribution of Portfolio Returns...** figure showing that the portfolios more or less behaved the same over the given time period. 
" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "### Running the Strategy:" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "# i = 0.7 #fraction that is training data\n", 228 | "# b_cap = 10 #beta cap\n", 229 | "# s = True #includes sentiment features\n", 230 | "# df_test, results = develop_all_three_models(frac_training=i, use_sentiment=s)    \n", 231 | "# results_summary = run_strategy(df_test, ticker_dict=raw_data['Raw_Data'], sort_column='y_pred_avg', \n", 232 | "# seed=False, frac_long_tickers=0.01, mult_extra=2, beta_cap_floor=-b_cap, \n", 233 | "# beta_cap_ceil=b_cap, plot=False)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "### Strategy Results and Performance Metrics:" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "# take input of strategy. \n", 250 | "# total_dict = getStatsAll(results_summary) #using a beta cap of +/- 10. " 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "\n", 258 | "\n", 259 | "" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "*Notice: this analysis is in-sample inference only.*\n", 267 | "\n", 268 | "### The longer testing periods are more reliable\n", 269 | "* The strategy is developed using progressivley larger training sets. The orange line was trained on only 10% of months (starting January 2013), and tested on the remaining 90% of months (up to April 2019).\n", 270 | "* The strategy which was tested the shortest is the brown, and its returns are not good. In its five months, it swings widely and finishes barely positive.\n", 271 | "* However, this strategy (three models estimated on training data, then averaged) is the least reliable because it was tested on the shortest period. It doesn't get much time to reveal its performance on test data, so we do not rely on these short test period. \n", 272 | "* However, using a shorter training set risks the model getting enough data to find the patterns in the data. With shorter training sets, the model could be less predictive, but we will more likely detect that it is unpredictive. With longer training sets, the model could be more predictive, but we are less likely to detect that it is unpredictive. \n", 273 | "* The brown line is the least reliable, due to its short testing period.\n", 274 | "\n", 275 | "### The trend is up, but with a large drop at the end\n", 276 | "* By contrast, the orange line was tested on 90% of the data, giving it a long testing period to reveal its performance out-of-sample. The orange line has an upward trend and a sharp dip at the end. This suggests some optimism, but so does the market returns. Also, there is concern from the large concern for the sudden drop.\n", 277 | "\n", 278 | "### Large drop comes from large a net short position in the market moving against us\n", 279 | "* The drop is due to large net short position in the market that happends to turn against us. This large short position comes from an unusually large individual beta. The large net short position (from one stock with unusually large beta) is not unique, but the adverse price change while holding the large position is unique. 
There are other periods where we hold large short market positions, but the price doesn't move so badly in those periods. In February and March, it does hurt us.\n", 280 | "\n", 281 | "### Medium-range forecasts are similar, but with a slower start\n", 282 | "* The remaining three strategies (greed, red, purple) show similar patterns; general upward trends with a large drop near the end (February and March). However, they are begin flat for the first few months before starting their rise.\n", 283 | "\n", 284 | "### Let's address the large drop at the end\n", 285 | "* Again, the large drop is due to large shorts market positions moving against us. It's difficult to predict when the market will move against us, so instead we will cap our short positions so that markets drops don't hurt as much (_favorable_ market moves will also help us less under capping. It limits both downside and upside).\n", 286 | "* We now apply a hard cap to the short market positions that we are taking. The betas of any given stock in any prior month are generally less than 5, but sometimes larger 10 or even 30. The strategy runs above used a cap of 10, so now we tighten the cap to 5 (applied as absolute value of 5), and 1, to see if we can soften that large drop." 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "## Analysis" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "# take input of strategy. \n", 303 | "# total_dict = getStatsAll(results_summary, beta_plot = True, training_split_value_to_plot=0.3) #using a beta cap of +/- 1. " 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "\n", 311 | "\n", 312 | "\n", 313 | "\n", 314 | "" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "Regarding the Top Figure above:\n", 322 | "* With the tight cap on beta (between -1 and 1), we again see a general upward trend. You can see that the strategy using the training size of 30% (green line) in the tightest beta constraint configuration of 1.0, outperforms the other training sizes in the same beta cap grouping. \n", 323 | "\n", 324 | "Regarding the Middle Figure of 5 plots above: \n", 325 | "* You can see that when the beta cap group of training sizes is separated into individual charts, the split size of 30% and 50% outperform the market but only the 30% split results significantly outperform the market. \n", 326 | "\n", 327 | "Regarding the Bottom Figure above:\n", 328 | "* We choose training fraction of 0.3, then show the strategy using different caps for beta. There is mostly overlap, but the strictest beta cap (1.0) shows highest performance, especially by the end of the simulation.\n", 329 | "* Also, the sharp drops around November 2016 and March 2019 seem to be softened, as was expected from the capping. However, the small caps (5 & 10) didn't seem to curtail the drops; only cap of 1 seems to work.\n", 330 | "* For the first half of the strategy, capping seemed to have minimal effect. But by the end, the strict cap lead to the best performance.\n", 331 | "* While capping could plausibly improve PnL here, keep in mind that it would also reduce the market-neutrality, which was the point of investing the index in the first place. 
We do not explore whether market-neutrality was affected here.\n", 332 | "* The optimal stratey identified is when the training size is only 30% of the entire dataset with a beta cap of +/- 1.0. You can see that in the bottom figure below, the *Beta Cap: 1.0* line easily outperforms the others in the same training size group and outperforms the Market, the purple line. These results show that the strategy made smart decisions, did not take too many losses, and capitalized on an increasing Market." 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "metadata": {}, 345 | "source": [ 346 | "The image above represents the dataframe of strategy results using a beta cap of 1.0. Our best performing strategy, using the training split of 30%, had an **Annualized Return of 18.77%, Annualized Volatility of 27.73%, an Annualized Sharpe of 0.605, a Max Drawdown of 0.167, and an Alpha of 0.014.** Below, you wcan see just how the strategy performs against the market since the start of 2015. The strategy looks promising but we need to stress test to be sure. " 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "### Stress Testing Against Simulated 2008 Performance Compared to Recent Performance Without Sentiment" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "Since the sentiment data does not go past 2013, we are opting to stress test the strategy by observing the strategy's performance without the sentiment features from 2015 to 2019 and observe the strategy's performance between 2005 and 2010. Then we will compare the differential performance of the annualized market returns and annualized strategy returns between the two time periods." 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": {}, 373 | "source": [ 374 | "\n", 375 | "\n", 376 | "" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "The strategy can be observed against the market from 2015 to 2019 in the time period above. Take note that the **Annualized Strategy Return is 2.74% against the Market's Annualized Return of 12.67%.** We will be comparing these along with the other listed metrics to the strategy's performance from 2015 to 2010. " 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": {}, 389 | "source": [ 390 | "\n", 391 | "\n", 392 | "" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "### Stress Testing Analysis\n", 400 | "Now taking a look at the performance of the stressed strategy from 2007 through 2010. The strategy does produce a higher **Annualized Strategy Return of 5.09% against the Market's Annualized Return of 0.58%.** We do see a larger max drawdown and a larger portfolio beta on the stressed strategy and this can be expected. It is surprising to see that not only did the strategy perform positively during this period, it beat the market and beat the 2015-2019 period we were comparing it to. \n", 401 | "\n", 402 | "Some of the other metrics did perform better in the 2015-2019 dataset, such as a drawdown that is almost half at **0.259** and a much lower **portfolio beta at 0.332**. 
Volatility was higher for both the strategy and the market during the stress period and this is expected since this was a purposeful stress run during a known financial crisis. " 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "### Concluding Thoughts:" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "We conclude that there is evidence of proof of concept utilizing a strategy that leverages sentiment data. While these initial results seem profitable and optimistic, it would be wise to continue with our testing by continuing to fine-tune the machine learning models, refine the universe filtering, and stress test against a different variety of scenarios and not just a single crisis. Sentiment data proves to be an unique feature set and as new natural language processing techniques continue to develop and employed, sentiment data will continue to grow and offer new ways of exploring different investment opportunities. " 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": {}, 434 | "source": [ 435 | "### Bibliography & Data Sources: \n", 436 | "1. **French, Kenneth R.** Kenneth R. French - Home Page. Accessed June 01, 2019. \n", 437 | "    http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/index.html. \n", 438 | "2. **Lee, Charles M.C.** 2003. Fusion Investing. Tech. Fusion Investing. AIMR.\n", 439 | "3. **\"Quandl.\"** Quandl.com. Accessed June 01, 2019. \n", 440 | "    https://www.quandl.com/. \n", 441 | "4. **\"The Place For Learning Quant Finance.\"** Quantopian. Accessed June 01, 2019. \n", 442 | "    https://www.quantopian.com/. \n", 443 | "5. **\"Where Developers Learn, Share, & Build Careers.\"** Stack Overflow. Accessed June 02, 2019. \n", 444 | "    https://stackoverflow.com/.\n", 445 | "6. **Zhang, Wenbin, and Steven Skiena.** 2010. Trading Strategies to Exploit Blog and News Sentiment. Tech. \n", 446 | "    Trading Strategies to Exploit Blog and News Sentiment. Association for the Advancement of Artificial Intelligence." 
447 | ] 448 | } 449 | ], 450 | "metadata": { 451 | "kernelspec": { 452 | "display_name": "Python 3", 453 | "language": "python", 454 | "name": "python3" 455 | }, 456 | "language_info": { 457 | "codemirror_mode": { 458 | "name": "ipython", 459 | "version": 3 460 | }, 461 | "file_extension": ".py", 462 | "mimetype": "text/x-python", 463 | "name": "python", 464 | "nbconvert_exporter": "python", 465 | "pygments_lexer": "ipython3", 466 | "version": "3.6.8" 467 | } 468 | }, 469 | "nbformat": 4, 470 | "nbformat_minor": 2 471 | } 472 | -------------------------------------------------------------------------------- /strategy.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import matplotlib.pyplot as plt 3 | import pandas as pd 4 | import numpy as np 5 | import scipy as sp 6 | import functools 7 | import seaborn as sns 8 | import time 9 | import h5py 10 | import statsmodels.api as sm 11 | import copy 12 | import pickle 13 | import pdb 14 | from math import ceil 15 | from scipy import stats 16 | import importlib 17 | 18 | from sklearn.linear_model import LinearRegression 19 | from sklearn.preprocessing import Binarizer 20 | from sklearn.preprocessing import MinMaxScaler 21 | 22 | 23 | 24 | def aggregate_from_daily(df, ticker): 25 | """ 26 | Use in combine_all_tickers. 27 | Aggregates data from daily to monthly level. 28 | """ 29 | 30 | df_temp = df.copy() 31 | df_temp.index.names = ['Date'] 32 | 33 | ticker_price_beginning = df_temp.loc[:,['Adj_Close']].resample('M').first().rename(columns={'Adj_Close':'ticker_price_beginning'}) 34 | ticker_price_end = df_temp.loc[:,['Adj_Close']].resample('M').last().rename(columns={'Adj_Close':'ticker_price_end'}) 35 | mkt_price_beginning = df_temp.loc[:,['QQQ_Adj_Close']].resample('M').first().rename(columns={'QQQ_Adj_Close':'mkt_price_beginning'}) 36 | mkt_price_end = df_temp.loc[:,['QQQ_Adj_Close']].resample('M').last().rename(columns={'QQQ_Adj_Close':'mkt_price_end'}) 37 | df4 = ticker_price_beginning.merge(ticker_price_end, left_index=True, right_index=True, how='outer').merge(mkt_price_beginning, left_index=True, right_index=True, how='outer').merge(mkt_price_end, left_index=True, right_index=True, how='outer') 38 | 39 | custom_aggretator = lambda array_like: array_like.max() # aggregate days to month, for sentiment features 40 | 41 | df1 = df_temp.drop(columns=['Adj_Close', 'Adj_Volume','QQQ_Adj_Close']).resample('M').apply(custom_aggretator) # for sentiment features, take max 42 | df2 = df_temp.loc[:, ['Adj_Volume']].resample('M').median().merge(df1, left_index=True, right_index=True, how='outer') # for volume, take median 43 | 44 | df2 = df2.assign(Ticker=ticker).reset_index().set_index(['Date','Ticker']).rename(columns={'Adj_Volume': 'volume'}) 45 | df5 = df4.merge(df2, left_index=True, right_index=True, how='outer') 46 | return df5 47 | 48 | def combine_all_tickers(ticker_dict): 49 | """ 50 | Combines dictionary-of-dataframes into one dataframe. 
51 | """ 52 | 53 | first_key = list(ticker_dict.keys())[0] 54 | df1 = ticker_dict[first_key].copy() # df for first ticker 55 | first_df_cols = df1.columns.tolist() 56 | df2 = aggregate_from_daily(df1, first_key) # aggregate to monthly level 57 | j=0 58 | for key, value in ticker_dict.items(): # for each ticker, aggregate then concat to master df 59 | #print(key,j) 60 | #j+=1 61 | if key==first_key: continue 62 | if first_df_cols != value.columns.tolist(): print('bad columns for {}!'.format(key)) 63 | df3 = aggregate_from_daily(value, key) 64 | df2 = pd.concat([df2, df3]) 65 | 66 | df2 = df2.sort_index(level=[0,1]) 67 | return df2 68 | 69 | # used in create_weights() 70 | def sigmoid(x): 71 | p = 1 / (1 + np.exp(-x)) 72 | return p/p.sum() 73 | 74 | # used in create_weights() 75 | def position_weights(x): 76 | x = x.astype(float).values.copy() 77 | idx_nonzero = np.nonzero(x) 78 | x[idx_nonzero] = sigmoid( x[idx_nonzero] ) # change the non-zero scores to 79 | return pd.Series(x) 80 | 81 | 82 | def transform_function(x, seed=False, num_long_tickers=10, mult_extra=2.): 83 | """ 84 | Used in create_weights(). Takes in series, randomly puts one 85 | for random num_long_tickers tickers. 86 | Seed is if you don't want to choose randomly, to get reproducible 87 | strategy. 88 | """ 89 | #mult_extra=2 90 | 91 | if seed: np.random.seed(42) 92 | if num_long_tickers*mult_extra > x.shape[0]: print('mult_extra is too big!') 93 | output = np.concatenate([np.ones(num_long_tickers*mult_extra), np.zeros( x.shape[0]-mult_extra*num_long_tickers)]) 94 | indices = np.random.choice(np.arange(mult_extra*num_long_tickers), replace=False, size=num_long_tickers) 95 | output[indices] = 0. 96 | return pd.Series(output) # go long top fraction of tickers, no short positions 97 | 98 | 99 | def create_weights(df, sort_column='score', frac_long_tickers=0.01, seed=False, mult_extra=2.): 100 | """ 101 | For each month, find the tickers that we will go long. 102 | Calculate their weights accoring to the ranking column. 103 | Shift those weights a month forward so we can trade on them. 104 | """ 105 | 106 | df1 = df.copy() 107 | num_tickers = df1.index.get_level_values(1).unique().shape[0] 108 | num_long_tickers = int(round(num_tickers*frac_long_tickers)) 109 | #transform_function = lambda x: pd.Series(np.concatenate([np.ones(num_long_tickers), np.zeros(x.shape[0]-2*num_long_tickers), np.zeros(num_long_tickers)])) # go long top fraction of tickers, no short positions. DEPRECATED 110 | transform_function1 = lambda x: transform_function(x, seed=seed, num_long_tickers=num_long_tickers, mult_extra=int(mult_extra)) 111 | 112 | df2 = df1.loc[:,[sort_column]].sort_values(by=['Date',sort_column], ascending=[True, False]).groupby(level=0, as_index=False).transform(transform_function1) # turn into position (1,0,0). 
def create_weights(df, sort_column='score', frac_long_tickers=0.01, seed=False, mult_extra=2.):
    """
    For each month, find the tickers that we will go long.
    Calculate their weights according to the ranking column.
    Shift those weights a month forward so we can trade on them.
    """

    df1 = df.copy()
    num_tickers = df1.index.get_level_values(1).unique().shape[0]
    num_long_tickers = int(round(num_tickers*frac_long_tickers))
    #transform_function = lambda x: pd.Series(np.concatenate([np.ones(num_long_tickers), np.zeros(x.shape[0]-2*num_long_tickers), np.zeros(num_long_tickers)]))  # go long top fraction of tickers, no short positions. DEPRECATED
    transform_function1 = lambda x: transform_function(x, seed=seed, num_long_tickers=num_long_tickers, mult_extra=int(mult_extra))

    df2 = df1.loc[:,[sort_column]].sort_values(by=['Date',sort_column], ascending=[True, False]).groupby(level=0, as_index=False).transform(transform_function1)  # turn scores into positions (1, 0); transform() takes in a Series

    # print(num_long_tickers, df2.loc[:,sort_column].groupby(level=0).sum())
    df2 = df2.rename(columns={sort_column:'position_current'})
    df1 = df1.loc[:,[sort_column, 'ticker_price_beginning', 'ticker_price_end', 'mkt_price_beginning', 'mkt_price_end']]
    df3 = df1.merge(df2, left_index=True, right_index=True, how='outer').sort_values(by=['Date','Ticker'], ascending=[True, True])  # merge the positions (1, 0) with the other columns
    df3 = df3.assign(position_predictive=df3.position_current.shift(num_tickers))  #.sort_values(by=['Date',sort_column], ascending=[True, False])  # the predictive position takes the position from the _previous_ month

    df3 = df3.assign(score_current = df3[sort_column]*df3.position_current)
    df3 = df3.assign(weight_current = df3.loc[:,['score_current']].groupby(level=0, as_index=False).transform(position_weights).iloc[:,0])  # find weights based on current rankings
    df3 = df3.assign(weight_pred = df3.weight_current.shift(num_tickers))  # shift the weights one month forward so they can be traded on
    df3 = df3.drop(columns=['score_current'])

    df4 = df3[(df3.position_predictive!=0)& (df3.position_predictive.notna())].copy()
    return df4

def calculate_capital(df, initial_capital=1e6):
    """
    Start with the initial capital. For each month, spend it all on long positions.
    Then calculate the capital left at the end of the month, and re-invest that amount
    the next month.
    """

    i=0
    for date, new_df in df.groupby(level=0):  # iterate through dates, compute the new capital held and the new positions
        if i==0:
            new_df1 = new_df.copy()
            new_df1 = new_df1.assign(total_notional_begin = initial_capital)
            new_df1 = new_df1.assign(notional_begin = new_df1.total_notional_begin *new_df1.weight_pred)  # must provide initial capital
            new_df1 = new_df1.assign(num_shares_begin = np.floor(new_df1.notional_begin/new_df1.ticker_price_beginning))  # buy the number of shares afforded by the capital to spend
            new_df1 = new_df1.assign(notional_end = new_df1.ticker_price_end*new_df1.num_shares_begin)  # exit the position, calculate the ending capital
            new_df1 = new_df1.assign(cashflow= new_df1.notional_end - new_df1.notional_begin)  # create cashflows
            total_notional_end = new_df1.notional_end.groupby(level=0).sum().values[0]  # sum the ending notional across assets
            new_df1 = new_df1.assign(total_notional_end=total_notional_end)
            i+=1
        else:
            new_df2 = new_df.copy()
            new_df2 = new_df2.assign(total_notional_begin = total_notional_end)  # use total_notional_end from the previous iteration
            new_df2 = new_df2.assign(notional_begin = new_df2.total_notional_begin *new_df2.weight_pred)  # must provide initial capital
            new_df2 = new_df2.assign(num_shares_begin = np.floor(new_df2.notional_begin/new_df2.ticker_price_beginning))
            new_df2 = new_df2.assign(notional_end = new_df2.ticker_price_end*new_df2.num_shares_begin)
            new_df2 = new_df2.assign(cashflow= new_df2.notional_end - new_df2.notional_begin)  # create cashflows
            total_notional_end = new_df2.notional_end.groupby(level=0).sum().values[0]
            new_df2 = new_df2.assign(total_notional_end=total_notional_end)
            new_df1 = pd.concat([new_df1, new_df2])
    return new_df1
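
# Worked one-month sketch of the arithmetic inside calculate_capital() (made-up numbers):
# with $1,000,000 of capital and a predictive weight of 0.25 on a ticker priced at $50,
# the strategy buys floor(250,000 / 50) = 5,000 shares; if the month-end price is $52,
# the ending notional for that ticker is 5,000 * 52 = $260,000. The sum of these ending
# notionals across tickers is what gets re-invested the following month.
def _example_one_month_capital():
    total_notional_begin = 1e6
    weight_pred, price_begin, price_end = 0.25, 50.0, 52.0
    notional_begin = total_notional_begin * weight_pred
    num_shares = np.floor(notional_begin / price_begin)   # 5000 shares
    notional_end = num_shares * price_end                 # 260000.0
    return notional_end
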
163 | """ 164 | 165 | df1 = df.copy() 166 | df_temp = df1.assign(beta=0.) 167 | df_temp = df_temp.loc[:,['beta']] 168 | 169 | for index, row in df_temp.iterrows(): 170 | date_end = index[0] - pd.tseries.offsets.MonthEnd(1) # dates for prior month 171 | date_begin = date_end - pd.tseries.offsets.MonthBegin(1) 172 | ticker = index[1] 173 | 174 | df2 = ticker_dict[ticker][date_begin:date_end].copy() # select prior month's data 175 | df2 = df2[df2.index.dayofweek < 5] # remove weekends 176 | df2 = df2.loc[:,['Adj_Close','QQQ_Adj_Close']] # only need ticker's price and QQQ price 177 | 178 | X = df2.values[:,[0]] 179 | y = df2.values[:,[1]] 180 | reg = LinearRegression().fit(X, y) # run regression 181 | df_temp.at[index,'beta'] = reg.coef_ # put in beta 182 | 183 | df1 = df1.assign(beta=df_temp.beta.clip(beta_cap_floor, beta_cap_ceil)) # add beta column, with beta clipped 184 | return df1 185 | 186 | 187 | def calculate_mkt_positions(df): 188 | """ 189 | Use the betas to calculate the positions in the index. The market positions are short the same amount (except 190 | rounding) as the long positions times beta. 191 | """ 192 | df1 = df.copy() 193 | df1 = df1.assign(notional_begin_mkt = - df1.beta * df1.notional_begin) # if long x, then short (beta) * x 194 | df1 = df1.assign(total_notional_begin_mkt=0.) # start with no market positions 195 | df1 = df1.assign(num_shares_begin_mkt = np.floor(df1.notional_begin_mkt/df1.mkt_price_beginning)) # buy value -beta*num_shares for each ticker 196 | df1 = df1.assign(notional_end_mkt = df1.mkt_price_end*df1.num_shares_begin_mkt) # notional held at end of month, after exiting positions 197 | 198 | df1 = df1.assign(cashflow_mkt = df1.notional_end_mkt - df1.notional_begin_mkt) # cashflow over month period 199 | df1a = df1.loc[:,['cashflow_mkt']].groupby(by=['Date']).sum().rename(columns={'cashflow_mkt':'total_notional_end_mkt'}) # ending notional summed across assets 200 | df1 = df1.merge(df1a, left_index=True, right_index=True, how='inner') 201 | 202 | i =df1.index.get_level_values(0)[0] # first date 203 | num_tickers = df1.loc[i,:].index.values.shape[0] # number of tickers, for shifting 204 | df1 = df1.assign(notional_from_prior_period = df1.total_notional_end_mkt.shift(num_tickers)) # shift, to add later 205 | df1.notional_from_prior_period.fillna(0, inplace=True) 206 | 207 | df1 = df1.assign(total_notional = df1.total_notional_end + df1.total_notional_end_mkt + df1.notional_from_prior_period) # add tickers, market, and leftover from prior period's market 208 | return df1 209 | 210 | 211 | def calculate_pnl_sub_strategy(df, initial_capital=1e6): 212 | """ 213 | Aggregates monthly positions across tickers to get monthly PnL 214 | """ 215 | df1 = df.loc[:,['total_notional']].groupby(level=0, as_index=True).median() # take median of total_notional, which are all same anyway 216 | 217 | df3 = df.loc[:,['mkt_price_beginning','mkt_price_end']].groupby(by='Date').first() # take market prices 218 | df3 = df3.assign(returns_mkt = (df3.mkt_price_end - df3.mkt_price_beginning)/df3.mkt_price_beginning ) # market returns over month 219 | 220 | df1 = df1.assign(returns_mkt = df3.returns_mkt) 221 | 222 | #df1 = df1.assign(mkt_price_beginning = df3.mkt_price_beginning.values) 223 | #df1 = df1.assign(mkt_price_end = df3.mkt_price_end.values) 224 | 225 | #ind = np.array([np.datetime64('2013-01-31')]) # add Jan '13 data point 226 | #df2 = pd.DataFrame(data={'total_notional':initial_capital, 'returns_mkt':np.nan, 'mkt_price_beginning':np.nan, 'mkt_price_end':np.nan}, index=ind) 
def calculate_pnl_sub_strategy(df, initial_capital=1e6):
    """
    Aggregates monthly positions across tickers to get monthly PnL.
    """
    df1 = df.loc[:,['total_notional']].groupby(level=0, as_index=True).median()  # take the median of total_notional, which is the same for every ticker within a month anyway

    df3 = df.loc[:,['mkt_price_beginning','mkt_price_end']].groupby(by='Date').first()  # take the market prices
    df3 = df3.assign(returns_mkt = (df3.mkt_price_end - df3.mkt_price_beginning)/df3.mkt_price_beginning )  # market returns over the month

    df1 = df1.assign(returns_mkt = df3.returns_mkt)

    #df1 = df1.assign(mkt_price_beginning = df3.mkt_price_beginning.values)
    #df1 = df1.assign(mkt_price_end = df3.mkt_price_end.values)

    #ind = np.array([np.datetime64('2013-01-31')])  # add Jan '13 data point
    #df2 = pd.DataFrame(data={'total_notional':initial_capital, 'returns_mkt':np.nan, 'mkt_price_beginning':np.nan, 'mkt_price_end':np.nan}, index=ind)
    #df2.index = df2.index.rename('Date')
    #df2 = pd.concat([df1,df2]).sort_index()

    df2 = df1.copy()
    df2 = df2.assign(returns_strategy=df2.total_notional.pct_change())

    df2 = df2[['returns_mkt','returns_strategy','total_notional']]
    df2 = df2.rename(columns={'total_notional':'pnl'})
    return df2


def run_strategy(df, ticker_dict, sort_column='score', frac_long_tickers=0.01, seed=False, mult_extra=2., beta_cap_floor=-10.0, beta_cap_ceil=10.0, plot=False):
    """
    Runs the full pipeline: weights -> monthly capital -> betas -> market hedge -> monthly PnL.
    """
    df2=df.copy()
    df3 = create_weights(df2, sort_column=sort_column, frac_long_tickers=frac_long_tickers, seed=seed, mult_extra=mult_extra)
    df4 = calculate_capital(df3, initial_capital=1e6)
    #print('Calculated capital for each month')
    df5 = calculate_beta(df4, ticker_dict=ticker_dict, beta_cap_floor=beta_cap_floor, beta_cap_ceil=beta_cap_ceil)
    #print('Calculated market beta of the ticker-level returns')
    df6 = calculate_mkt_positions(df5)
    #print('Calculated market positions from the betas')
    df7 = calculate_pnl_sub_strategy(df6)
    if plot: df7.loc[:,['pnl']].plot()
    return df7


def print_hi():
    print('no more')


##### Create features for machine learning #####

def aggregate_from_daily_ml(df, ticker):
    """
    Used in create_features(). Aggregates data from the daily to the monthly level.
    Features are quantiles over the previous month.
    """

    df_temp = df.copy()
    df_temp.index.names = ['Date']
    ticker_price_beginning = df_temp.loc[:,['Adj_Close']].resample('M').first().rename(columns={'Adj_Close':'ticker_price_beginning'})
    ticker_price_end = df_temp.loc[:,['Adj_Close']].resample('M').last().rename(columns={'Adj_Close':'ticker_price_end'})
    mkt_price_beginning = df_temp.loc[:,['QQQ_Adj_Close']].resample('M').first().rename(columns={'QQQ_Adj_Close':'mkt_price_beginning'})
    mkt_price_end = df_temp.loc[:,['QQQ_Adj_Close']].resample('M').last().rename(columns={'QQQ_Adj_Close':'mkt_price_end'})
    df4 = ticker_price_beginning.merge(ticker_price_end, left_index=True, right_index=True, how='outer').merge(mkt_price_beginning, left_index=True, right_index=True, how='outer').merge(mkt_price_end, left_index=True, right_index=True, how='outer')
    df4.columns = pd.MultiIndex.from_tuples([(col, 'NA') for col in df4.columns.tolist()])

    quantiles = [.5,.75,.9]
    custom_aggretator = lambda array_like: array_like.quantile(q=quantiles)  # aggregate days to months

    df1 = df_temp.drop(columns=['Adj_Close','QQQ_Adj_Close']).resample('M').apply(custom_aggretator)  # for sentiment features, take the monthly quantiles

    #df6 = df_temp.drop(columns=['Adj_Close','QQQ_Adj_Close']).rolling(30).agg(custom_aggretator)  #.resample('M')

    df1 = df1.rename(columns={'Adj_Volume':'Volume'})
    df1 = df1.unstack()  # turn the row index into columns

    df2 = df1.iloc[:,0:3].drop([0.75,0.9], axis=1, level=1)  # keep only the median for volume
    df1 = df1.drop('Volume', axis=1, level=0)
    df1 = df1.merge(df2, left_index=True, right_index=True)

    df4 = df4.merge(df1, left_index=True, right_index=True, how='inner')
    df4 = df4.assign(Ticker=ticker).reset_index().set_index(['Date','Ticker'])

    df4[('Returns','Next_Month')] = ((df4[('ticker_price_end', 'NA')] - df4[('ticker_price_beginning', 'NA')]) / df4[('ticker_price_beginning', 'NA')]).shift(-1)

    return df4

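
# Illustrative sketch of the column convention used below: aggregate_from_daily_ml()
# returns MultiIndex columns such as ('Sentiment', 0.9) or ('ticker_price_end', 'NA'),
# and create_features() flattens them with the rule shown here. The columns and values
# in this sketch are made up.
def _example_flatten_columns():
    cols = pd.MultiIndex.from_tuples([('ticker_price_end', 'NA'), ('Sentiment', 0.9)])
    df = pd.DataFrame([[101.0, 0.8]], columns=cols)
    df.columns = [col[0] + '_' + str(col[1]) if str(col[1]) != 'NA' else col[0]
                  for col in df.columns.tolist()]
    return df.columns.tolist()   # ['ticker_price_end', 'Sentiment_0.9']
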
# one-time function
def create_features(overwrite=False):
    """
    Combines the dictionary-of-dataframes into one dataframe.
    Creates features as it goes.
    Creates data.pkl.
    """

    data_dict = pd.read_pickle('complete_dataset.pickle')  # load the dictionary of tickers
    ticker_dict = data_dict['Raw_Data']

    # initialize dataframe
    first_key = list(ticker_dict.keys())[0]  # find the first ticker
    df1 = ticker_dict[first_key].copy()  # df for first ticker
    first_df_cols = df1.columns.tolist()
    df2 = aggregate_from_daily_ml(df1, first_key)  # aggregate to monthly level
    j=0
    for key, value in ticker_dict.items():  # for each ticker, aggregate then concat to the master df
        if key==first_key: continue
        if first_df_cols != value.columns.tolist(): print('bad columns for {}!'.format(key))
        df3 = aggregate_from_daily_ml(value, key)

        df2 = pd.concat([df2, df3])
        if j%(round(len(ticker_dict)/10))==0: print('Fraction done: {}'.format(round(j/len(ticker_dict),5)))
        j+=1
    df2 = df2.sort_index(level=[0,1])

    df2.columns = [col[0] + '_' + str(col[1]) if str(col[1])!='NA' else col[0] for col in df2.columns.tolist()]

    df3 = create_target(df2, threshold=0.0)
    df3.columns = [col[0] + '_' + str(col[1]) if str(col[1])!='NA' else col[0] for col in df3.columns.tolist()]


    if overwrite:
        print('Saving to data.pkl')
        df3.to_pickle('data.pkl')
    else:
        print('File not being saved. To save, use overwrite=True')

    return df3


def create_target(data, threshold=0.0):
    '''
    Create a target variable that is binary {0,1} from next month's return.
    data: dataframe
    '''

    data1 = data.dropna().copy()
    binarizer = Binarizer(threshold=threshold)
    target = binarizer.transform(data1[('Returns','Next_Month')].values.reshape(-1,1))

    data1 = data1.join(pd.DataFrame(target,
                                    columns=pd.MultiIndex.from_product([['Returns'], ['Target']]),
                                    index=data1.index))

    return data1


def create_predictions_sklearn(data, model, name='y_pred_ml', num_cols_non_feats=1):
    """
    Runs the model on the data (dataframe) and adds a new prediction column called `name`.
    Works for sklearn models, not keras models.
    """
    X = data.iloc[:,:-num_cols_non_feats].copy().values  # pick out only the predictor variables; the model picks certain columns via its pipeline
    y_pred = model.predict_proba(X)[:,1]
    data1 = data.copy()
    data1[name] = y_pred
    return data1

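
# Illustrative sketch (synthetic data, local import): calling create_predictions_sklearn()
# with a fitted classifier. The column names are made up, but they follow the layout the
# function assumes: feature columns first, non-feature columns (target/predictions) last.
def _example_create_predictions_sklearn():
    from sklearn.linear_model import LogisticRegression

    rng = np.random.RandomState(0)
    X = rng.randn(50, 3)
    y = (X[:, 0] > 0).astype(int)
    df = pd.DataFrame(np.hstack([X, y.reshape(-1, 1)]),
                      columns=['f1', 'f2', 'f3', 'Returns_Target'])
    model = LogisticRegression(solver='lbfgs').fit(X, y)
    return create_predictions_sklearn(df, model, name='y_pred_ml', num_cols_non_feats=1)
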
374 | """ 375 | 376 | X, y = list(), list() 377 | for i in range(len(sequences)): 378 | # find the end of this pattern 379 | end_ix = i + n_steps 380 | # check if we are beyond the dataset 381 | if end_ix > len(sequences): 382 | break 383 | # gather input and output parts of the pattern 384 | seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix-1, -1] 385 | X.append(seq_x) 386 | y.append(seq_y) 387 | 388 | return np.array(X), np.array(y) 389 | 390 | 391 | def build_dataset_for_rnn(df11, n_steps=3, frac_train=0.5): 392 | """ 393 | Takes in dataframe. Splits out X and y. Normalizes X. Samples (X,y) with a rolling window of length n_steps, 394 | and saves as 3d array (n_samples, n_timesteps, n_features). 395 | 396 | This acts only on sentiment features!! 397 | """ 398 | 399 | # Split by company 400 | X_tickers = [] 401 | y_list = [] 402 | 403 | scaler = MinMaxScaler(feature_range=(-1,1)) 404 | #print('Building dataset...') 405 | for i, ticker in enumerate(df11.index.get_level_values(1).unique()): # split by ticker 406 | X = df11.reset_index().loc[df11.reset_index().Ticker==ticker].set_index(['Date','Ticker']).iloc[:,:-1].values 407 | X = scaler.fit_transform(X) # standardize the features 408 | y = df11.reset_index().loc[df11.reset_index().Ticker==ticker].set_index(['Date','Ticker']).iloc[:,-1].values 409 | dataset = np.hstack((X, y.reshape(-1,1))) 410 | # convert into input/output 411 | X, y = split_sequences(dataset, n_steps) 412 | X_tickers.append(X) 413 | y_list.append(y) 414 | #if i%(int(df11.index.get_level_values(1).unique().shape[0])/5)==0: print('Done {} percent'.format(round(100*i/int(df11.index.get_level_values(1).unique().shape[0]),2))) 415 | 416 | n_features = X_tickers[0].shape[2] 417 | 418 | try: 419 | X = np.array(X_tickers).reshape(-1, n_steps, n_features) 420 | except ValueError: 421 | print(len(X_tickers), X_tickers[0].shape, n_steps, n_features) 422 | y = np.array(y_list).reshape(-1,1) 423 | 424 | end_train = int(X.shape[0]*frac_train) 425 | x_train = X[:end_train] 426 | y_train = y[:end_train] 427 | x_test = X[end_train:] 428 | y_test = y[end_train:] 429 | 430 | #print('Done building dataset') 431 | #print('x_train shape:', x_train.shape) 432 | #print('x_test shape:', x_test.shape) 433 | #print('y_test shape:', y_train.shape) 434 | #print('y_test shape:', y_test.shape) 435 | 436 | return x_train, y_train, x_test, y_test 437 | 438 | 439 | def create_predictions_keras(df_test, model, name='y_pred_nn',n_steps=3,verbose=False): 440 | """ 441 | Takes in dataframe of features and makes predictions from the model. 442 | The number of steps (n_steps) must match the number of steps the 443 | model was trained on. 444 | """ 445 | 446 | scaler = MinMaxScaler(feature_range=(-1,1)) 447 | 448 | count_rows = 0 449 | tickers = df_test.index.get_level_values(1).unique() 450 | for i, ticker in enumerate(tickers): # split by ticker 451 | 452 | if i==0: 453 | df1 = df_test.reset_index().loc[df_test.reset_index().Ticker==ticker].set_index(['Date','Ticker']).copy() 454 | X = df1.iloc[:,:-3].values # chop off two y's from previous two models 455 | X = scaler.fit_transform(X) # standardize the features 456 | y = df1.iloc[:,-3].values 457 | dataset = np.hstack((X, y.reshape(-1,1))) 458 | X, y = split_sequences(dataset, n_steps) 459 | y_pred = model.predict(X) 460 | try: 461 | y_pred = model.predict(X) 462 | except ValueError: 463 | print('Not enough dates to make prediction on ticker {}. 
def create_predictions_keras(df_test, model, name='y_pred_nn', n_steps=3, verbose=False):
    """
    Takes in a dataframe of features and makes predictions from the model.
    The number of steps (n_steps) must match the number of steps the
    model was trained on.
    """

    scaler = MinMaxScaler(feature_range=(-1,1))

    count_rows = 0
    tickers = df_test.index.get_level_values(1).unique()
    for i, ticker in enumerate(tickers):  # split by ticker

        if i==0:
            df1 = df_test.reset_index().loc[df_test.reset_index().Ticker==ticker].set_index(['Date','Ticker']).copy()
            X = df1.iloc[:,:-3].values  # drop the target and the two previous models' prediction columns
            X = scaler.fit_transform(X)  # standardize the features
            y = df1.iloc[:,-3].values
            dataset = np.hstack((X, y.reshape(-1,1)))
            X, y = split_sequences(dataset, n_steps)
            try:
                y_pred = model.predict(X)
            except ValueError:
                print('Not enough dates to make prediction on ticker {}. Returning current dataframe.'.format(ticker))
                return df1
            a = np.array((n_steps-1)*[np.nan])
            y_pred = np.concatenate([a, y_pred.ravel()])
            df1[name] = y_pred
            count_rows += df1.shape[0]

        else:
            df2 = df_test.reset_index().loc[df_test.reset_index().Ticker==ticker].set_index(['Date','Ticker']).copy()
            X = df2.iloc[:,:-3].values  # drop the target and the two previous models' prediction columns
            X = scaler.fit_transform(X)  # standardize the features
            y = df2.iloc[:,-3].values
            dataset = np.hstack((X, y.reshape(-1,1)))
            X, y = split_sequences(dataset, n_steps=n_steps)
            try:
                y_pred = model.predict(X)
            except ValueError:
                print('Not enough dates to make prediction on ticker {}. Returning current dataframe.'.format(ticker))
                return df1
            a = np.array((n_steps-1)*[np.nan])
            y_pred = np.concatenate([a, y_pred.ravel()])
            df2[name] = y_pred
            df1 = pd.concat([df1, df2])
            count_rows += df2.shape[0]
            df1 = df1.sort_values(by=['Date','Ticker'])
        if verbose:
            if i%(tickers.shape[0]/10)==0: print('done {} percent'.format(100*i/tickers.shape[0]))

    df1 = df1.sort_values(by=['Date','Ticker'])
    return df1

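
# Illustrative sketch (synthetic data, local imports): the Pipeline + TimeSeriesSplit
# GridSearchCV pattern that develop_logistic_model() and develop_xgb_model() below both
# follow. Only the parameter grid and the final estimator differ in the real functions.
def _example_pipeline_grid_search():
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

    rng = np.random.RandomState(0)
    X = rng.randn(200, 5)
    y = (X[:, 0] + 0.1 * rng.randn(200) > 0).astype(int)

    pipe = Pipeline(steps=[('scale', StandardScaler()),
                           ('logreg', LogisticRegression(solver='lbfgs'))])
    search = GridSearchCV(pipe,
                          {'logreg__C': [0.1, 1.0]},
                          scoring='roc_auc',
                          cv=TimeSeriesSplit(n_splits=3))   # forward-chaining CV, no shuffling
    search.fit(X, y)
    return search.best_params_, search.best_score_
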
############## Three ML Models ##############
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import fbeta_score, make_scorer, f1_score, precision_score, roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression

##### Logistic Regression #####
def develop_logistic_model(df11_train):
    # dataset FOR SKLEARN
    N = df11_train.shape[0]
    end = int(N*0.75)

    # use the first 75% as training, the rest as validation
    X_train = df11_train.iloc[:end, :-1].values
    y_train = df11_train.iloc[:end, -1].values
    X_val = df11_train.iloc[end:, :-1].values
    y_val = df11_train.iloc[end:, -1].values

    clf = LogisticRegression(penalty='l2',
                             C=1.0,
                             random_state=0,
                             solver='sag',
                             max_iter=10000)
    # use later
    #pca = PCA(n_components=5)

    # https://scikit-learn.org/stable/auto_examples/preprocessing/plot_function_transformer.html#sphx-glr-auto-examples-preprocessing-plot-function-transformer-py
    def select_sentiment_colums(X):  # selects the 15 sentiment features
        return X[:,-16:-1]

    pipe = Pipeline(steps=[('sentiment_cols', FunctionTransformer(select_sentiment_colums, validate=True)),
                           ('scale', StandardScaler()),
                           ('logreg', clf)])

    # https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html
    param_grid = {
        'logreg__C': [1e-2, 5e-2, 7.5e-2, 1.0]
    }

    def my_custom_loss_func(y_true, y_pred):
        c = np.array([y_pred,y_true])
        d = np.sort(c)
        e = np.flip(d, axis=1)
        f = e[:,0:int(e.shape[1]*0.5)]
        g = np.absolute(f[0,:] - f[1,:])
        return g.mean()
    my_scorer = make_scorer(my_custom_loss_func, greater_is_better=False)

    search = GridSearchCV(pipe,
                          param_grid,
                          scoring='roc_auc',  # my_scorer
                          iid=False,
                          cv=TimeSeriesSplit(n_splits=3))

    search.fit(X_train, y_train.ravel());

    y_pred = search.predict_proba(X_train)[:,1]
    #print('\n ###### Logistic Regression ###### \n')
    #print('Best params:',search.best_params_)
    #print('Mean (CV) AUC of best estimator:',search.best_score_)
    #print('Validation AUC of best estimator:',search.best_estimator_.score(X_val, y_val))  # why is this so low?
    #print('Validation AUC of best estimator:',search.score(X_val, y_val))  # should be AUC
    #print(accuracy_score(y,y_pred))
    #print('Train AUC:',roc_auc_score(y_train, y_pred))

    return search

##### Extreme Gradient Boosting #####
import xgboost as xgb
#from xgboost import XGBClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV
import graphviz

def develop_xgb_model(df11_train):

    # dataset FOR SKLEARN
    N = df11_train.shape[0]
    end = int(N*0.75)

    # use the first 75% as training, the rest as validation
    X_train = df11_train.iloc[:end, :-1].values
    y_train = df11_train.iloc[:end, -1].values
    X_val = df11_train.iloc[end:, :-1].values
    y_val = df11_train.iloc[end:, -1].values

    xgb_model = xgb.XGBClassifier()

    def select_sentiment_colums(X):  # selects the 15 sentiment features
        return X[:,-18:-3]

    pipe = Pipeline(steps=[#('sentiment_cols', FunctionTransformer(select_sentiment_colums, validate=True)),
                           ('scale', StandardScaler()),
                           ('xgb', xgb_model)])

    test_params = {
        'xgb__eta': [0.05, 0.3, 1],
        'xgb__min_child_weight': [1],
        'xgb__max_depth': [2],  #,5],
        'xgb__gamma': [0],  #,0.1,0.2],
        'xgb__n_estimators': [20],  #30,40],
        'xgb__reg_alpha': [1e-5]  #, 1e-2, 0.1]
    }

    xgb_search = GridSearchCV(pipe,
                              test_params,
                              scoring='roc_auc',  # my_scorer
                              iid=False,
                              cv=TimeSeriesSplit(n_splits=3))

    xgb_search.fit(X_train, y_train.ravel())
    y_pred = xgb_search.predict_proba(X_train)

    #print('\n ###### XGB ###### \n')
    #print('Best params: {}'.format(xgb_search.best_params_))
    #print('Mean cv score of best estimator: {}'.format( round(xgb_search.best_score_,2)))  # should be AUC
    #print('Training AUC: {}'.format( round(roc_auc_score(y_train, y_pred[:,1]),2)))
    #print('Test AUC: {}'.format( round(roc_auc_score(y_val, xgb_search.predict_proba(X_val)[:,1])),2) )

    return xgb_search


##### LSTM #####
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb

# tutorials
# https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
# https://machinelearningmastery.com/how-to-develop-lstm-models-for-time-series-forecasting/

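# Illustrative sketch (random data): the input-shape contract used by develop_lstm_model()
# below -- samples arranged as (n_samples, n_steps, n_features), a single LSTM layer, and a
# sigmoid output for the binary up/down target. The layer sizes here are arbitrary.
def _example_lstm_shapes():
    n_samples, n_steps, n_features = 64, 3, 5
    rng = np.random.RandomState(0)
    x = rng.rand(n_samples, n_steps, n_features)
    y = rng.randint(0, 2, size=(n_samples, 1))

    model = Sequential()
    model.add(LSTM(8, input_shape=(n_steps, n_features)))     # consumes 3D (steps, features) windows
    model.add(Dense(1, activation='sigmoid'))                  # binary classification output
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(x, y, batch_size=16, epochs=1, verbose=0)
    return model.predict(x).shape                              # (n_samples, 1)
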
def develop_lstm_model(df11_train):
    print('\n Building dataset for LSTM...')
    x_train, y_train, x_val, y_val = build_dataset_for_rnn(df11_train, n_steps=3, frac_train=0.75)
    batch_size = 32
    n_steps, n_features = x_train.shape[1], x_train.shape[2]

    print('Build LSTM...')
    model = Sequential()
    #model.add(Embedding(input_dim=max_features, output_dim=128))  # only needed for integer token inputs
    #model.add(Dense(32, input_shape=(70,)))
    model.add(LSTM(50, dropout=0.2, recurrent_dropout=0.2, input_shape=(n_steps, n_features)))  # 50 memory units
    model.add(Dense(1, activation='sigmoid'))  # classification, so sigmoid output

    # try using different optimizers and different optimizer configs
    model.compile(loss='binary_crossentropy',  # since we're doing binary classification
                  optimizer='adam',
                  metrics=['accuracy'])

    print('Train LSTM...')
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=10,  #15
              validation_data=(x_val, y_val),
              verbose=0)
    score, acc = model.evaluate(x_val, y_val,
                                batch_size=batch_size)
    y_pred = model.predict(x_val)

    #print('\n ###### LSTM ###### \n')
    #print('Test score:', round(score,2))
    #print('Test accuracy:', round(acc,2))  # accuracy
    #print('Train AUC:', round(roc_auc_score(y_train, model.predict(x_train)),2))
    val_auc = roc_auc_score(y_val, y_pred)
    #print('Validation AUC:', round(val_auc,2))

    return model, val_auc


def make_predictions(df11_test, df11_all, model1, model2, model3):
    df1 = create_predictions_sklearn(df11_test, model1, name='y_pred_log_reg', num_cols_non_feats=1)  # chop off y
    #print(df1.shape)
    df2 = create_predictions_sklearn(df1, model2, name='y_pred_xgb', num_cols_non_feats=2)  # chop off 2 y's
    #print(df2.shape)
    df3 = create_predictions_keras(df2, model3, n_steps=3, verbose=False)
    #print(df3.shape)
    df4 = df3.join(df11_all.iloc[:,:4], how='left')
    cols = df4.columns.tolist()
    df5 = df4[cols[-4:]+cols[:-4]].copy()  # switch the order of the columns
    y_pred_avg = df5.iloc[:,-3:].mean(axis=1).values
    df6 = df5.assign(y_pred_avg=y_pred_avg)
    return df6

##### prepare dataset #####
def prepare_dataset(frac_training=0.5, use_sentiment=True):
    """
    Split the dataset and prepare it for the ML/NN model fitting.
    It rounds down to the month, to avoid splitting months between
    training and test.
    """

    d2 = pd.read_pickle('complete_dataset.pickle')
    #df2 = pd.read_pickle('all_tickers_combined.pickle')
    #df2a = df2.loc[:,['ticker_price_beginning', 'ticker_price_end', 'mkt_price_beginning','mkt_price_end','Sentiment']]
    df11_all = pd.read_pickle('data.pkl')
    df11 = df11_all.iloc[:, list(range(4,73))+[-1]].copy()  # select only the features and the target variable

    if not use_sentiment: df11 = df11.iloc[:,-15:].copy()  # remove the sentiment features

    # dataset FOR SKLEARN
    N = df11.shape[0]
    end = int(N*frac_training)  # worked with 0.5

    remainder = end%1130  # 1130 rows per month (one row per ticker)
    end = end-remainder   # snap the cutoff down to a month boundary

    # select the first frac_training of rows as the training set
    df11_train = df11.iloc[:end,:]
    df11_test = df11.iloc[end:,:]

    return df11_train, df11_test

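
# Worked sketch of the month-boundary rounding in prepare_dataset() above (the 1130
# divisor is assumed to be the number of rows, i.e. tickers, per month): the raw cutoff
# is snapped down so that no month is split between training and test.
def _example_month_boundary_cutoff(n_rows=27120, rows_per_month=1130, frac_training=0.6):
    end = int(n_rows * frac_training)    # 16272
    end = end - end % rows_per_month     # 15820 = 14 full months of 1130 rows
    return end
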
715 | """ 716 | 717 | # split data 718 | df11_all = pd.read_pickle('data.pkl') 719 | df11_train, df11_test = prepare_dataset(frac_training=frac_training, use_sentiment=use_sentiment) 720 | 721 | # select logistic regression 722 | search = develop_logistic_model(df11_train) 723 | print('done logistic') 724 | 725 | # select xgb 726 | xgb_search = develop_xgb_model(df11_train) 727 | print('done gradient boosting') 728 | 729 | # train (not select) LSTM 730 | model, val_auc = develop_lstm_model(df11_train) 731 | print('done LSTM') 732 | 733 | # make predictions 734 | print('Making predictions...') 735 | df17_test = make_predictions(df11_test=df11_test, df11_all=df11_all, model1=search, model2=xgb_search, model3=model) 736 | print('done predictions...') 737 | 738 | results = pd.DataFrame({'Validation AUC':[search.best_score_, xgb_search.best_score_, val_auc]}) 739 | results.index = ['log','xgb','lstm'] 740 | 741 | print('Mean CV AUC:') 742 | print(results) 743 | 744 | return df17_test, results 745 | 746 | # run strategy 747 | # print('running strategy...') 748 | # df18_test = run_strategy(df17_test, ticker_dict=d2['Raw_Data'], sort_column='y_pred_avg', seed=False, frac_long_tickers=0.01, mult_extra=2, beta_cap_floor=-10., beta_cap_ceil=10., plot=False) 749 | 750 | # results = pd.DataFrame({'Validation AUC':[search.best_score_, xgb_search.best_score_, val_auc]}) 751 | # results.index = ['log','xgb','lstm'] 752 | # return df18_test, results 753 | 754 | 755 | def get_results_by_training_cutoff(save_result=False): 756 | """ 757 | Splits training and test set, develops models, runs strategy, 758 | and returns some results for plotting and analysis. 759 | Creates results_by_training_cutoff.pkl. 760 | Takes 30+ minutes to run. 761 | """ 762 | 763 | print('Warning: this code could take longer than 30 minutes.') 764 | res = {} 765 | d2 = pd.read_pickle('complete_dataset.pickle') # complete_dataset.pkl was made by Benjamin, not Robert 766 | for i in np.arange(0.1,1,0.2): 767 | print(i) 768 | df18_test, results = develop_all_three_models(frac_training=i) 769 | df19_test = run_strategy(df18_test, ticker_dict=d2['Raw_Data'], sort_column='y_pred_avg', seed=False, frac_long_tickers=0.01, mult_extra=2, beta_cap_floor=-10., beta_cap_ceil=10., plot=False) 770 | res[i]=(df18_test, results, df19_test) 771 | if save_result: 772 | print('Saving dictionary to results_by_training_cutoff.pkl') 773 | # Create an variable to pickle and open it in write mode 774 | list_pickle_path = 'results_by_training_cutoff.pkl' 775 | list_pickle = open(list_pickle_path, 'wb') 776 | pickle.dump(res, list_pickle) 777 | list_pickle.close() 778 | return res 779 | 780 | 781 | 782 | 783 | 784 | 785 | 786 | if __name__ == '__main__': # prints when run 787 | print('You just ran this from the command line') 788 | 789 | #if __name__ == 'final_project': # prints when you import the package 790 | # print('Importing final_project') --------------------------------------------------------------------------------