├── images ├── strategy_overview.png ├── PortfolioPerformanceBetaCap_1.png ├── PortfolioPerformanceBetaCap_10.png ├── Distribution of Portfolio Returns.png ├── PortfolioPerformanceBetaCap_10_Split.png ├── PortfolioPerformanceBetaCap_1_Split.png ├── Distribution of Portfolio Mean Returns.png ├── Monthly Standard Deviations Line Chart.png ├── PorfolioPerformanceBetaCap_1_DataFrame.png ├── PortfolioPerformanceWithVaryingBetaCaps_3.png ├── PortfolioPerformanceBetaCap_1_AgainstMarket.png ├── PortfolioPerformanceBetaCap_1_withoutSentiment.png ├── PortfolioPerformanceBetaCap_1_withoutSentiment_stress.png ├── PorfolioPerformanceBetaCap_1_DataFrame_withoutSentiment.png ├── PorfolioPerformanceBetaCap_1_DataFrame_withoutSentiment_stress.png └── .ipynb_checkpoints │ └── PortfolioPerformanceBetaCap_1_withoutSentiment-checkpoint.png ├── HM Capital Management Pitchbook.pptx ├── README.md ├── HM Capital Management Strategy Write Up.ipynb └── strategy.py /images/strategy_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/strategy_overview.png -------------------------------------------------------------------------------- /HM Capital Management Pitchbook.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/HM Capital Management Pitchbook.pptx -------------------------------------------------------------------------------- /images/PortfolioPerformanceBetaCap_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/PortfolioPerformanceBetaCap_1.png -------------------------------------------------------------------------------- /images/PortfolioPerformanceBetaCap_10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/PortfolioPerformanceBetaCap_10.png -------------------------------------------------------------------------------- /images/Distribution of Portfolio Returns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/Distribution of Portfolio Returns.png -------------------------------------------------------------------------------- /images/PortfolioPerformanceBetaCap_10_Split.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/PortfolioPerformanceBetaCap_10_Split.png -------------------------------------------------------------------------------- /images/PortfolioPerformanceBetaCap_1_Split.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/PortfolioPerformanceBetaCap_1_Split.png -------------------------------------------------------------------------------- /images/Distribution of Portfolio Mean Returns.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/Distribution of Portfolio Mean Returns.png 
-------------------------------------------------------------------------------- /images/Monthly Standard Deviations Line Chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/Monthly Standard Deviations Line Chart.png -------------------------------------------------------------------------------- /images/PorfolioPerformanceBetaCap_1_DataFrame.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/PorfolioPerformanceBetaCap_1_DataFrame.png -------------------------------------------------------------------------------- /images/PortfolioPerformanceWithVaryingBetaCaps_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/PortfolioPerformanceWithVaryingBetaCaps_3.png -------------------------------------------------------------------------------- /images/PortfolioPerformanceBetaCap_1_AgainstMarket.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/PortfolioPerformanceBetaCap_1_AgainstMarket.png -------------------------------------------------------------------------------- /images/PortfolioPerformanceBetaCap_1_withoutSentiment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/PortfolioPerformanceBetaCap_1_withoutSentiment.png -------------------------------------------------------------------------------- /images/PortfolioPerformanceBetaCap_1_withoutSentiment_stress.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/PortfolioPerformanceBetaCap_1_withoutSentiment_stress.png -------------------------------------------------------------------------------- /images/PorfolioPerformanceBetaCap_1_DataFrame_withoutSentiment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/PorfolioPerformanceBetaCap_1_DataFrame_withoutSentiment.png -------------------------------------------------------------------------------- /images/PorfolioPerformanceBetaCap_1_DataFrame_withoutSentiment_stress.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/PorfolioPerformanceBetaCap_1_DataFrame_withoutSentiment_stress.png -------------------------------------------------------------------------------- /images/.ipynb_checkpoints/PortfolioPerformanceBetaCap_1_withoutSentiment-checkpoint.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hatemr/quantitative-trading-project/master/images/.ipynb_checkpoints/PortfolioPerformanceBetaCap_1_withoutSentiment-checkpoint.png -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Quantitative Investment Strategy 2 | Code can be found [`strategy.py`](strategy.py). 
3 | 
4 | ## Overview
5 | This project combines logistic regression, gradient boosting, and LSTMs to predict next-month equity returns using fundamental and sentiment features. See below for details.
6 | 
7 | 
8 | 
9 | 
10 | 
11 | 
12 | 
13 | *Notice: this analysis is in-sample inference only.*
14 | 
15 | 
16 | ### Motivation Behind Exploring this Strategy:
17 | 
18 | Our motivation is to explore investable opportunities in equities by analyzing their sentiment features. Quandl provided pre-computed sentiment features for equities that were easy to access and analyze. Additionally, we wanted to explore basic machine learning algorithms for predicting next-period returns and to measure performance against the market as a benchmark.
19 | 
20 | 
21 | 
22 | Zhang and Skiena (2010) use sentiment data to build simple long-short strategies. Their dataset contains information similar to ours: it uses features based on "polarity" (how positive or negative a mention is) and on the volume of news mentions. One difference is that they separate information from blogs and news, while our dataset combines all sources. They find that patterns from opinion in blogs persist longer than those from news, which makes sense since opinions are less likely to contain "true" information that will change investors' decisions. The paper provides evidence that sentiment-based investing, which we analyze in this write-up, is widespread.
23 | 
24 | Fundamental analysis is a widely known investing technique that takes many forms but is ultimately based on how a company is performing, as represented by its fundamental financial data. Charles Lee, the Henrietta J. Louis Professor at Cornell University, connects fundamental value and investor sentiment in what he calls "fusion investing" (2003). Lee states that "researchers are finding that even though returns are difficult to predict, prediction is not an impossible task" and that there are observable systematic patterns in price return series (Lee 2003). While Lee focuses on sentiment observed from investors, our strategy uses the available news sentiment data about the company itself and attempts to predict performance within those hypothesized systematic patterns.
25 | 
26 | ### Creating a Universe of Equities to Use:
27 | Using sentiment as the basis for our strategy requires a few components: a universe of equities, sentiment data for the strategy itself, end-of-day price data to measure performance, and fundamental financial ratios to complement the sentiment data. The fundamental data consists of a set of Zacks files from the Quandl database, and both the sentiment data and the end-of-day data were queried through the Quandl API.
28 | 
29 | Since the fundamental data comes as fixed CSV files, available in [Quandl's Zacks Collection B database](https://www.quandl.com/databases/ZFB/data "Quandl Zacks"), that is where the universe of equities started. The fundamental dataset contains **9107** unique tickers, which established the foundation for our universe. Its feature columns are: *Weighted Average Shares Outstanding, Current Ratio, Gross Margin, Operating Profit Margin, EBIT Margin, Pretax Profit Margin, Profit Margin, Free Cash Flow, Asset Turnover, Inventory Turnover, Receivables Turnover, Days Sales in Receivables, Return on Equity, Return on Tangible Equity, Return on Assets, Return on Investments, Free Cash Flow Per Share, Book Value Per Share, and Operating Cash Flow Per Share.*
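To make the seeding step concrete, the sketch below pulls a ticker universe out of a set of local Zacks CSV files. The file pattern and the `ticker` column name are illustrative assumptions, not the project's actual schema; the real logic lives in *ObtainDataFile.py*.

```python
# Illustrative sketch only: seed the universe from local Zacks fundamentals CSVs.
# The file pattern and the 'ticker' column name are assumptions, not the project's actual schema.
import glob
import pandas as pd

fundamental_files = glob.glob('ZACKS_*.csv')                 # hypothetical local file names
fundamentals = pd.concat((pd.read_csv(f) for f in fundamental_files), ignore_index=True)
universe = sorted(fundamentals['ticker'].dropna().unique())  # assumed ticker column
print(len(universe))                                         # roughly 9107 tickers in the raw universe
```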
30 | 
31 | Expecting end-of-day data to be easier to find than sentiment data, we next queried [Quandl's sentiment database](https://www.quandl.com/data/NS1-FinSentS-Web-News-Sentiment "Quandl FinSent") with the **9107** tickers. The database provides 5 different sentiment features: *Sentiment, Sentiment High, Sentiment Low, News Volume,* and *News Buzz*.
32 | - The *Sentiment* feature is a numeric measure of the bullishness / bearishness of news coverage of the stock.
33 | - The *Sentiment High/Low* features are the highest and lowest intraday sentiment scores.
34 | - The *News Volume* feature is the absolute number of news articles covering the stock.
35 | - The *News Buzz* feature is a numeric measure of the change in coverage volume for the stock.
36 | 
37 | After filtering out tickers that were not present in the database and handling errors and data quality issues, such as missing data, the sentiment dataset held **4753** unique tickers with the 5 feature sets.
38 | 
39 | We then took the smaller of the two ticker lists and queried [Quandl's EOD database](https://www.quandl.com/data/EOD-End-of-Day-US-Stock-Prices "Quandl EOD"). After handling data quality issues, the EOD dataset contained **2490** unique tickers.
40 | 
41 | To merge the three datasets, we selected tickers from the dataset with the smallest ticker count: the end-of-day data.
42 | 
43 | Running the concatenation loop, with a few additional filters to handle remaining data quality issues, leaves a total of **1130** tickers in our universe with the necessary fundamental, sentiment, and end-of-day data.
44 | 
45 | The output of the _ObtainDataFile.py_ script is a dictionary with two initial keys: *Raw_Data* and *Factors*. The *Factors* key contains a dictionary of [monthly](https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/f-f_5_factors_2x3.html "Fama-French 5 Factor Monthly Data") and [daily](https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/f-f_5_factors_2x3.html "Fama-French 5 Factor Daily Data") Five Factor datasets from [Fama and French](http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/index.html "Kenneth French's Website"). The *Raw_Data* value maps each of the **1130** tickers to the dataframe produced by the concatenation.
46 | 
47 | 
48 | 
49 | 
50 | 
51 | 
52 | ```python
53 | # from ObtainDataFile import *
54 | #from PerformanceMetrics import *
55 | # from strategy import *
56 | ```
57 | 
58 | Running the *start_running()* function starts the process of gathering and combining the data streams and returns a single dictionary data structure with the universe of tickers.
59 | 
60 | 
61 | ```python
62 | # all_data = start_running()
63 | # raw_data = pd.read_pickle('complete_dataset.pickle')
64 | ```
65 | 
66 | Data compilation is complete. 
67 | 1130 tickers in universe.
68 | File name is complete_dataset.pickle and is located in the local directory.
69 | 
70 | 
71 | The *.read_pickle()* call reads, rather than recompiles, the single dictionary data structure with the universe of tickers.
72 | 
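Reading the pickle back gives the structure described above. A minimal sketch of inspecting it follows; the key names *Raw_Data* and *Factors* come from the write-up, while everything else is illustrative.

```python
# Minimal sketch: inspect the compiled data structure described above.
import pandas as pd

raw_data = pd.read_pickle('complete_dataset.pickle')

print(list(raw_data.keys()))             # ['Raw_Data', 'Factors']
print(len(raw_data['Raw_Data']))         # 1130 tickers in the universe
print(list(raw_data['Factors'].keys()))  # the monthly and daily Fama-French five-factor tables

# Each ticker maps to a DataFrame of daily prices, sentiment, and fundamental features.
some_ticker = next(iter(raw_data['Raw_Data']))
print(raw_data['Raw_Data'][some_ticker].head())
```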
73 | ## Strategy Description:
74 | 
75 | This is a long-only equity strategy with monthly frequency. The equities for a given month are selected based on their predicted probability of having positive returns in the next month. The probability is predicted using three models averaged together:
76 | 1. logistic regression
77 | 2. gradient boosting
78 | 3. LSTM (long short-term memory)
79 | 
80 | #### Target Variable
81 | * 0: returns in the following month are _negative_
82 | * 1: returns in the following month are _positive_
83 | 
84 | In other words, we predict the binary target variable of returns being positive or negative in the following month using three separate models, then average their predictions (which are probabilities).
85 | 
86 | #### Equity selection
87 | * average the three models' predicted probabilities
88 | * select the top 20% and go long a random half of them (10% of the universe in total); a simplified sketch of this selection step appears below
89 | * go short the market by $\beta$ times the amount you go long each equity, to keep the portfolio approximately market neutral.
90 | 
91 | Beta is estimated by regressing the equity's daily returns on the market's (Nasdaq) daily returns over the previous month. We do _not_ go short any individual equities, to avoid the complexities of shorting single names.
92 | 
93 | Hyperparameters are selected using grid search for the logistic regression and gradient boosting models, and default values are used for the LSTM. The LSTM uses time windows of length three (i.e., predicting next month's return sign from the prior three months' values). The LSTM could be further tuned to possibly improve performance.
94 | 
95 | #### Input features
96 | * features arrive at daily frequency
97 | * they are aggregated to monthly frequency using quantiles
98 | * specifically, the 50%, 75%, and 90% quantiles of daily values over the previous month.
99 | 
100 | For example, five daily features turn into 15 monthly features (5 features * 3 quantiles). This aggregation method lets us use information about the _higher_ values from the previous month (90% quantile) as well as the _more typical_ values (50% quantile).
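As a concrete illustration of this daily-to-monthly aggregation, here is a generic pandas sketch. It follows the description above (50%/75%/90% quantiles over each month); column names are illustrative and the repository's own aggregation code may differ in detail.

```python
# Generic sketch of the daily-to-monthly quantile aggregation described above.
# Assumes a DataFrame of daily feature values with a DatetimeIndex; column names are illustrative.
import pandas as pd

def monthly_quantile_features(daily: pd.DataFrame,
                              quantiles=(0.50, 0.75, 0.90)) -> pd.DataFrame:
    """One row per month, one column per (feature, quantile) pair."""
    frames = []
    for q in quantiles:
        agg = daily.resample('M').quantile(q)   # e.g. the 90% quantile of each feature over the month
        agg.columns = [f'{col}_q{int(round(q * 100))}' for col in agg.columns]
        frames.append(agg)
    return pd.concat(frames, axis=1)

# Five daily sentiment columns become 15 monthly columns (5 features x 3 quantiles):
# monthly = monthly_quantile_features(daily_sentiment_df)
```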
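The prediction-averaging and selection rule described above can also be sketched in simplified form. Only the logistic regression and gradient boosting legs are shown (the LSTM leg is omitted for brevity), and the parameter grids, function name, and variable names are illustrative rather than the project's actual `develop_all_three_models` / `run_strategy` code.

```python
# Simplified sketch of the probability-averaging and top-20%/random-half selection described above.
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

def select_long_tickers(X_train, y_train, X_next, tickers, frac_top=0.20, seed=None):
    """Average two models' probabilities of a positive next-month return,
    keep the top `frac_top` of tickers, then go long a random half of them."""
    logit = GridSearchCV(LogisticRegression(max_iter=1000), {'C': [0.1, 1.0, 10.0]}, cv=3)
    gbm = GridSearchCV(GradientBoostingClassifier(), {'n_estimators': [100, 200]}, cv=3)
    logit.fit(X_train, y_train)
    gbm.fit(X_train, y_train)

    # Average the predicted probabilities of the positive class (the LSTM would be a third term here).
    p_avg = (logit.predict_proba(X_next)[:, 1] + gbm.predict_proba(X_next)[:, 1]) / 2.0

    top = np.argsort(p_avg)[::-1][: int(len(tickers) * frac_top)]  # top 20% by averaged probability
    rng = np.random.default_rng(seed)
    longs = rng.choice(top, size=len(top) // 2, replace=False)     # random half of the top bucket
    return [tickers[i] for i in longs]
```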
101 | 
102 | ### Data
103 | * 1130 tickers
104 | * 76 months (January 2013 to April 2019)
105 | * 69 features
106 |   * 23 base features, each with 3 quantiles
107 |     * 5 sentiment
108 |     * 17 fundamental
109 | 
110 | 
111 | 
112 | #### Generating Portfolios from our Randomly Selected Equity Positions in the top performing **20%** based on Performance Metrics
113 | 
114 | One idea was to take a random selection of positions from those the strategy ranks in the top **20%** quantile. We generated 100 such portfolios, and all of them produced similar results, as you can see below. The figure of 100 histograms shows the distribution of their returns, with a density line for each chart.
115 | 
116 | 
117 | 
118 | 
119 | Each portfolio also performed similarly across the months. Below, you can see a single histogram with a density line describing the mean returns across all portfolios for each month.
120 | 
121 | 
122 | 
123 | 
124 | 
125 | By taking the standard deviation of the monthly PnL across all the portfolios, we can observe the values increasing over time, as the figure below shows.
126 | 
127 | 
128 | 
129 | 
130 | 
131 | #### Analysis on Generating Portfolios from Random Equity Selections:
132 | While the initial idea seemed promising, the randomly populated portfolios showed that there was no outstanding value in randomly selecting from our top performing equities in the top **20%** quantile. This is easy to see in the **Distribution of Portfolio Returns** figure, which shows that the portfolios behaved more or less the same over the given time period.
133 | 
134 | ### Running the Strategy:
135 | 
136 | 
137 | ```python
138 | # i = 0.7 #fraction that is training data
139 | # b_cap = 10 #beta cap
140 | # s = True #includes sentiment features
141 | # df_test, results = develop_all_three_models(frac_training=i, use_sentiment=s)
142 | # results_summary = run_strategy(df_test, ticker_dict=raw_data['Raw_Data'], sort_column='y_pred_avg',
143 | # seed=False, frac_long_tickers=0.01, mult_extra=2, beta_cap_floor=-b_cap,
144 | # beta_cap_ceil=b_cap, plot=False)
145 | ```
146 | 
147 | ### Strategy Results and Performance Metrics:
148 | 
149 | 
150 | ```python
151 | # take input of strategy.
152 | # total_dict = getStatsAll(results_summary) #using a beta cap of +/- 10.
153 | ```
154 | 
155 | 
156 | 
157 | 
158 | 
159 | *Notice: this analysis is in-sample inference only.*
160 | 
161 | ### The longer testing periods are more reliable
162 | * The strategy is developed using progressively larger training sets. The orange line was trained on only 10% of months (starting January 2013) and tested on the remaining 90% of months (up to April 2019).
163 | * The strategy tested for the shortest period is the brown line, and its returns are not good: over its five test months it swings widely and finishes barely positive.
164 | * This variant (three models estimated on training data, then averaged) is also the least reliable because it was tested on the shortest period. It doesn't get much time to reveal its performance on test data, so we do not rely on this short test period.
165 | * On the other hand, using a shorter training set risks the model not getting enough data to find the patterns in the data. With shorter training sets, the model could be less predictive, but we are more likely to detect that it is unpredictive. With longer training sets, the model could be more predictive, but we are less likely to detect that it is unpredictive.
166 | * In short, the brown line is the least reliable, due to its short testing period.
167 | 
168 | ### The trend is up, but with a large drop at the end
169 | * By contrast, the orange line was tested on 90% of the data, giving it a long testing period to reveal its out-of-sample performance. The orange line has an upward trend and a sharp dip at the end. This suggests some optimism, but so do the market returns. The sudden drop, however, is a concern.
170 | 
171 | ### The large drop comes from a large net short position in the market moving against us
172 | * The drop is due to a large net short position in the market that happens to turn against us. This large short position comes from an unusually large individual beta. The large net short position (from one stock with an unusually large beta) is not unique, but the adverse price move while holding that position is. There are other periods where we hold large short market positions, but the price doesn't move as badly in those periods. In February and March, it does hurt us (the arithmetic below illustrates how a single large beta inflates the hedge).
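The size of that net short position scales one-for-one with the estimated beta, which is why a single outsized beta can dominate the book. A toy illustration follows; the numbers are made up, and this mirrors the hedging rule described earlier rather than the exact `calculate_mkt_positions` output.

```python
# Toy illustration: the market hedge is -beta times the long notional, so one
# unusually large beta estimate creates a very large net short index position.
long_notional = 100_000                # dollars long in one selected stock
for beta in (1.0, 5.0, 12.0, 30.0):    # typical vs. unusually large prior-month betas
    hedge = -beta * long_notional      # dollars short in the index against that one stock
    print(f'beta={beta:>5.1f}  index hedge = {hedge:>12,.0f}')
```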
173 | 
174 | ### Medium-range forecasts are similar, but with a slower start
175 | * The remaining three strategies (green, red, purple) show similar patterns: general upward trends with a large drop near the end (February and March). However, they begin flat for the first few months before starting their rise.
176 | 
177 | ### Let's address the large drop at the end
178 | * Again, the large drop is due to large short market positions moving against us. It is difficult to predict when the market will move against us, so instead we cap our short positions so that market drops don't hurt as much (_favorable_ market moves will also help us less under capping; it limits both downside and upside).
179 | * We now apply a hard cap to the short market positions that we take. The betas of individual stocks in any prior month are generally less than 5, but are sometimes larger than 10 or even 30. The strategy runs above used a cap of 10, so we now tighten the cap to 5 and then to 1 (applied in absolute value) to see if we can soften that large drop.
180 | 
181 | ## Analysis
182 | 
183 | 
184 | ```python
185 | # take input of strategy.
186 | # total_dict = getStatsAll(results_summary, beta_plot = True, training_split_value_to_plot=0.3) #using a beta cap of +/- 1.
187 | ```
188 | 
189 | 
190 | 
191 | 
192 | 
193 | 
194 | 
195 | Regarding the Top Figure above:
196 | * With the tight cap on beta (between -1 and 1), we again see a general upward trend. The strategy using the training size of 30% (green line) under the tightest beta constraint of 1.0 outperforms the other training sizes in the same beta cap grouping.
197 | 
198 | Regarding the Middle Figure of 5 plots above:
199 | * When the beta cap group of training sizes is separated into individual charts, the 30% and 50% splits outperform the market, but only the 30% split does so significantly.
200 | 
201 | Regarding the Bottom Figure above:
202 | * We fix the training fraction at 0.3 and show the strategy under different beta caps. The lines mostly overlap, but the strictest beta cap (1.0) shows the highest performance, especially by the end of the simulation.
203 | * Also, the sharp drops around November 2016 and March 2019 seem to be softened, as was expected from the capping. However, the looser caps (5 and 10) did not seem to curtail the drops; only the cap of 1 does.
204 | * For the first half of the strategy, capping seemed to have minimal effect, but by the end the strict cap led to the best performance.
205 | * While capping could plausibly improve PnL here, keep in mind that it would also reduce market-neutrality, which was the point of the index position in the first place. We do not explore whether market-neutrality was affected here.
206 | * The optimal strategy identified uses a training size of only 30% of the entire dataset with a beta cap of +/- 1.0. In the bottom figure above, the *Beta Cap: 1.0* line easily outperforms the others in the same training size group and outperforms the Market (the purple line). These results show that the strategy made sensible decisions, did not take too many losses, and capitalized on a rising market.
207 | 
208 | 
209 | 
210 | The image above shows the dataframe of strategy results using a beta cap of 1.0. Our best performing strategy, using the training split of 30%, had an **Annualized Return of 18.77%, Annualized Volatility of 27.73%, an Annualized Sharpe of 0.605, a Max Drawdown of 0.167, and an Alpha of 0.014.**
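For reference, the sketch below shows one standard way to derive such annualized figures from a monthly return series. It is a generic illustration, not the project's `getStatsAll` implementation; it assumes a zero risk-free rate, and alpha (which requires a market regression) is omitted.

```python
# Generic sketch: annualized performance metrics from monthly strategy returns.
# Not the project's getStatsAll; assumes a zero risk-free rate.
import numpy as np
import pandas as pd

def summarize_monthly_returns(r: pd.Series) -> dict:
    ann_return = (1 + r).prod() ** (12 / len(r)) - 1     # geometric annualized return
    ann_vol = r.std() * np.sqrt(12)                      # annualized volatility
    sharpe = ann_return / ann_vol                        # Sharpe ratio with rf = 0
    wealth = (1 + r).cumprod()                           # cumulative growth of $1
    max_drawdown = (1 - wealth / wealth.cummax()).max()  # worst peak-to-trough loss
    return {'ann_return': ann_return, 'ann_vol': ann_vol,
            'sharpe': sharpe, 'max_drawdown': max_drawdown}
```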
Below, you can see how the strategy performs against the market since the start of 2015. The strategy looks promising, but we need to stress test to be sure.
211 | 
212 | 
213 | 
214 | ### Stress Testing Against Simulated 2008 Performance Compared to Recent Performance Without Sentiment
215 | 
216 | Since the sentiment data does not extend back before 2013, we opt to stress test the strategy by observing its performance without the sentiment features from 2015 to 2019, and then observing its performance between 2005 and 2010. We then compare the differential performance of the annualized market returns and annualized strategy returns between the two time periods.
217 | 
218 | 
219 | 
220 | 
221 | 
222 | The strategy can be observed against the market over the 2015 to 2019 period above. Take note that the **Annualized Strategy Return is 2.74% against the Market's Annualized Return of 12.67%.** We will compare these, along with the other listed metrics, to the strategy's performance from 2005 to 2010.
223 | 
224 | 
225 | 
226 | 
227 | 
228 | ### Stress Testing Analysis
229 | Now we look at the performance of the stressed strategy from 2007 through 2010. The strategy produces a higher **Annualized Strategy Return of 5.09% against the Market's Annualized Return of 0.58%.** We do see a larger max drawdown and a larger portfolio beta for the stressed strategy, which is to be expected. It is surprising that the strategy not only performed positively during this period but also beat the market and beat the 2015-2019 run we compare it to.
230 | 
231 | Some of the other metrics were better in the 2015-2019 dataset, such as a max drawdown that is almost half as large at **0.259** and a much lower **portfolio beta of 0.332**. Volatility was higher for both the strategy and the market during the stress period, which is expected since this was a deliberate stress run through a known financial crisis.
232 | 
233 | ### Concluding Thoughts:
234 | 
235 | We conclude that there is proof-of-concept evidence for a strategy that leverages sentiment data. While these initial results look profitable and encouraging, it would be wise to continue testing: fine-tune the machine learning models, refine the universe filtering, and stress test against a wider variety of scenarios rather than a single crisis. Sentiment data is a unique feature set, and as new natural language processing techniques continue to be developed and employed, sentiment data will continue to grow and offer new ways of exploring investment opportunities.
236 | 
237 | 
243 | 
244 | ### Bibliography & Data Sources:
245 | 1. **French, Kenneth R.** Kenneth R. French - Home Page. Accessed June 01, 2019. 
246 |     http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/index.html. 
247 | 2. **Lee, Charles M.C.** 2003. Fusion Investing. Tech. AIMR.
248 | 3. **"Quandl."** Quandl.com. Accessed June 01, 2019. 
249 |     https://www.quandl.com/. 
250 | 4. **"The Place For Learning Quant Finance."** Quantopian. Accessed June 01, 2019. 
251 |     https://www.quantopian.com/. 
252 | 5. **"Where Developers Learn, Share, & Build Careers."** Stack Overflow. Accessed June 02, 2019. 
253 |     https://stackoverflow.com/.
254 | 6. 
**Zhang, Wenbin, and Steven Skiena.** 2010. Trading Strategies to Exploit Blog and News Sentiment. Tech. 255 |     Trading Strategies to Exploit Blog and News Sentiment. Association for the Advancement of Artificial Intelligence. 256 | -------------------------------------------------------------------------------- /HM Capital Management Strategy Write Up.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# HM Capital Management\n", 8 | "[Robert Hatem](https://www.linkedin.com/in/robert-e-hatem/ \"LinkedIn Profile: Robert Hatem\") \n", 9 | "[Benjamin Morgan](https://www.linkedin.com/in/benjaminmorgan0921/ \"LinkedIn Profile: Benjamin Morgan\") \n", 10 | "Spring 2019" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "### Motivation Behind Exploring this Strategy:\n", 18 | "\n", 19 | "The motivation behind exploring this strategy is to explore the investable opportunities in equities by analyzing their sentimental features. Quandl provided pre-computed sentiment features for equities that were easily accessible to analyze. Additionally, we wanted to explore basic machine learning algorithms for predicting next period returns and measure performance against the market as a benchmark. \n", 20 | "\n", 21 | "\n", 22 | "\n", 23 | "Skiena et. al. (2010) uses sentiment data to build simple long-short strategies. Skiena’s dataset contains similar information to ours; it uses feature based on “polarity”, how positive or negative the mention is, and volume of news mentions. However, one difference is that Skiena separates information from blogs and news, while our dataset contains all sources together. They find that patterns from opinion in blogs persists longer than from news, which makes sense since opinions are less likely to contain “true” information that will change investors’ investment decisions. This paper provides evidence that sentiment-based investing, which we are analyzing in this write-up, is widespread. \n", 24 | "\n", 25 | "Fundamental analysis and investing is a widely-known investing technique that is used in different approaches but is ultimately based on how the company is perfoming as represented by their fundamental financial data. Charles Lee, the Henrietta J. Louis Professor at Cornell University, strives to connect fundamental value and investor sentiment with what is known as \"fusion investing\" in his Fusion Investing paper (2003). Lee states that while \"researchers are finding that even though returns are difficult to predict, prediction is not an impossible task\" and that there are observable systematic patterns in price return series (Lee 2003). While Lee takes the position of observing sentiment value from an investor, our strategy takes the available news sentiment data of the company, and not an individual investor, and attempts to predict performance within those believed systematic patterns." 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Creating a Universe of Equities to Use:\n", 33 | "The idea of using sentiment as a basis for our strategy requires a few different components. First, we would need a universe of equities to use. We also need specific sentiment data for the strategy, end of day data to measure performance, and fundamental financial ratios as a complement to the sentiment data. 
The fundamental data consists of a set of Zack's files from the Quandl database and both the sentiment data and the end of day data were queried from Quandl using their Quandl API. \n", 34 | "\n", 35 | "Since the fundamental data is from fixed csv files, which can be found in [Quandl's Zacks Collection B database](https://www.quandl.com/databases/ZFB/data \"Quandl Zacks\"), that's where the universe of equities started. The fundamental dataset consists of **9107** unique tickers which established the foundation for our universe. The fundamental dataset contains the following columns as features: *Weighted Average Shares Outstanding, Current Ratio, Gross Margin, Operating Profit Margin, EBIT Margin, Pretax Profit Margin, Profit Margin, Free Cash Flow, Asset Turnover, Inventory Turover, Receivables Turnover, Days Sales in Receivables, Return on Equity, Return on Tangible Equity, Return on Assets, Return on Investments, Free Cash Flow Per Share, Book Value Per Share, and Operating Cash Flow Per Share.*\n", 36 | "\n", 37 | "With the understanding that end of day data would be a bit easier to find than sentiment, we queried [quandl's sentiment database](https://www.quandl.com/data/NS1-FinSentS-Web-News-Sentiment \"Quandl FinSent\") with the **9107** tickers. The database consists of 5 different sentiment features: *Sentiment, Sentiment High, Sentiment Low, News Volume,* and *News Buzz*. \n", 38 | " - The *Sentiment* feature is a numeric measure of the bullishness / bearishness of news coverage of the stock.\n", 39 | " - The *Sentiment High/Low* feature is the highest and lowest intraday sentiment scores.\n", 40 | " - The *News Volume* feature is the absolute number of news articles covering the stock.\n", 41 | " - The *News Buzz* feature is a numeric measure of the change in coverage volume for the stock.\n", 42 | " \n", 43 | "We found that after filtering out the tickers that were not present in the database and handling any errors and data quality issues, such as missing data, the sentiment dataset held **4753** unique tickers with the 5 different feature sets. \n", 44 | "\n", 45 | "At that point, we took the smaller of the two ticker lists and scraped [Quandl's EOD database](https://www.quandl.com/data/EOD-End-of-Day-US-Stock-Prices \"Quandl EOD\"). After handling any present data quality issues, the EOD database contained **2490** unique tickers. \n", 46 | "\n", 47 | "After examining the three unique datasets, the selection of tickers to use for the merging of the three datasets came from the database with the smallest number of tickers: the end of day data. \n", 48 | "\n", 49 | "Running through the concatenating loop and adding a few more necessary filters to handle any remaining data quality issues during the concatenation, leaves a total of **1130** tickers in our universe with the necessary fundamental data, sentiment data, and end of day data. \n", 50 | "\n", 51 | "The output of the _ObtainDataFile.py_ script is a dictionary with two initial keys: *Raw_Data* and *Factors*. The *Factors* key contains a dictionary of [monthly](https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/f-f_5_factors_2x3.html \"Fama-French 5 Factor Monthly Data\") and [daily](https://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/f-f_5_factors_2x3.html \"Fama-French 5 Factor Daily Data\") Five Factor datasets from [Fama and French](http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/index.html \"Kenneth French's Website\"). 
The *Raw_Data* value contains **1130** dictionaries with the ticker as the key and a dataframe from the concatenation as the value.\n", 52 | "\n", 53 | "\n", 54 | "\n", 55 | "" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 25, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "# from ObtainDataFile import *\n", 65 | "#from PerformanceMetrics import *\n", 66 | "# from strategy import *" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "By running the *start_running()* function, you will start the process of gathering and combining the data streams together to return a single dictionary data structure with the universe of tickers." 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 10, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "Data compilation is complete. \n", 86 | "1130 tickers in universe.\n", 87 | "File name is complete_dataset.pickle and is located in the local directory.\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "# all_data = start_running()\n", 93 | "# raw_data = pd.read_pickle('complete_dataset.pickle')" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "This *.read_pickle()* statement will read, instead of compile, the single dictionary data structure with the universe of tickers. " 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "## Strategy Description: \n", 108 | "\n", 109 | "This is a long-only equity strategy with monthly frequency. The equities for the given month are selected based on their predicted probability of having positive returns in the next month. The probability is predicted using three models averaged together:\n", 110 | "1. logistic regression\n", 111 | "2. gradient boosting, \n", 112 | "3. LSTM (long short-term memory) \n", 113 | "\n", 114 | "#### Target Variable\n", 115 | "* 0: returns in following month are _negative_\n", 116 | "* 1: returns in following month are _positive_\n", 117 | "\n", 118 | "In other words, we predict the binary target variable of returns being positive or negative in the following month, using three sepearate models, then average their predictions (which are probabilities).\n", 119 | "\n", 120 | "#### Equity selection\n", 121 | "* average the three models' predictions\n", 122 | "* select top 20%, go long a random half of them (10% total)\n", 123 | "* go short the market by an amount $\\beta$ times the amount you go long the equity. This is to be market neutral.\n", 124 | "\n", 125 | "Beta is calculated as the correalation of the equity's daily returns with the market's returns (Nasdaq) over the previous month. We do _not_ go short any equities to avoid the complexities of shorting.\n", 126 | "\n", 127 | "The models are selected using grid-search for logistic and gradient boosting, and using default values for the LSTM. The LSTM uses time windows of length (i.e. predicting next-month's return sign using the prior 3 months' values). The LSTM could be futher optimized to possibly get better performance.\n", 128 | "\n", 129 | "#### Input features\n", 130 | "* Features come as daily\n", 131 | "* then aggregated to monthly using quantiles\n", 132 | "* 50%, 75%, and 90% quantile of daily values over the previous month. \n", 133 | "\n", 134 | "For example, five daily features would turn in to 15 monthly features (5 features * 3 quantiles). 
This aggregation method allows us to use information about the _higher_ values from the previous month (90% quantile) as well as the _more typical_ values (50% quantile).\n", 135 | "\n", 136 | "### Data\n", 137 | "* 1130 tickers\n", 138 | "* 76 months (January 2013 to April 2019)\n", 139 | "* 69 features\n", 140 | "  * 23 base features, each with 3 quantiles\n", 141 | "    * 5 sentiment \n", 142 | "    * 17 fundamental" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "#### Generating Portfolios from our Randomly Selected Equity Positions in the top perfoming **20%** based on Performance Metrics" 157 | ] 158 | }, 159 | { 160 | "cell_type": "markdown", 161 | "metadata": {}, 162 | "source": [ 163 | "One idea was to take a random selection of positions that the strategy lists in the top **20%** -quantile. We observed 100 generated portfolios that all produced similar results, as you can see below. The figure of 100 histograms shows the distribution of their returns with a density line for each chart. " 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "\n", 171 | "" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "Each portfolio performed similarly across the months as well. Below, you can see a single histogram with a density line describing the mean returns across all portfolios for each month. " 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "\n", 186 | "\n", 187 | "" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "By taking the standard deviation of the monthly PnL across all the portfolios, we can observe the values increasing as time continues, as the figure below shows. " 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "\n", 202 | "\n", 203 | "" 204 | ] 205 | }, 206 | { 207 | "cell_type": "markdown", 208 | "metadata": {}, 209 | "source": [ 210 | "#### Analysis on Generating Portfolios from Random Equity Selections:\n", 211 | "While the initial thought seemed like a good idea, in reality, the randomly selected equities to populate portfolios showed that there was no outstanding value in randomly selecting from our top performing equities in the top **20%**-quantile. This can easily be viewed in the **Distribution of Portfolio Returns...** figure showing that the portfolios more or less behaved the same over the given time period. 
" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "### Running the Strategy:" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "# i = 0.7 #fraction that is training data\n", 228 | "# b_cap = 10 #beta cap\n", 229 | "# s = True #includes sentiment features\n", 230 | "# df_test, results = develop_all_three_models(frac_training=i, use_sentiment=s)    \n", 231 | "# results_summary = run_strategy(df_test, ticker_dict=raw_data['Raw_Data'], sort_column='y_pred_avg', \n", 232 | "# seed=False, frac_long_tickers=0.01, mult_extra=2, beta_cap_floor=-b_cap, \n", 233 | "# beta_cap_ceil=b_cap, plot=False)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "### Strategy Results and Performance Metrics:" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "# take input of strategy. \n", 250 | "# total_dict = getStatsAll(results_summary) #using a beta cap of +/- 10. " 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": {}, 256 | "source": [ 257 | "\n", 258 | "\n", 259 | "" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "*Notice: this analysis is in-sample inference only.*\n", 267 | "\n", 268 | "### The longer testing periods are more reliable\n", 269 | "* The strategy is developed using progressivley larger training sets. The orange line was trained on only 10% of months (starting January 2013), and tested on the remaining 90% of months (up to April 2019).\n", 270 | "* The strategy which was tested the shortest is the brown, and its returns are not good. In its five months, it swings widely and finishes barely positive.\n", 271 | "* However, this strategy (three models estimated on training data, then averaged) is the least reliable because it was tested on the shortest period. It doesn't get much time to reveal its performance on test data, so we do not rely on these short test period. \n", 272 | "* However, using a shorter training set risks the model getting enough data to find the patterns in the data. With shorter training sets, the model could be less predictive, but we will more likely detect that it is unpredictive. With longer training sets, the model could be more predictive, but we are less likely to detect that it is unpredictive. \n", 273 | "* The brown line is the least reliable, due to its short testing period.\n", 274 | "\n", 275 | "### The trend is up, but with a large drop at the end\n", 276 | "* By contrast, the orange line was tested on 90% of the data, giving it a long testing period to reveal its performance out-of-sample. The orange line has an upward trend and a sharp dip at the end. This suggests some optimism, but so does the market returns. Also, there is concern from the large concern for the sudden drop.\n", 277 | "\n", 278 | "### Large drop comes from large a net short position in the market moving against us\n", 279 | "* The drop is due to large net short position in the market that happends to turn against us. This large short position comes from an unusually large individual beta. The large net short position (from one stock with unusually large beta) is not unique, but the adverse price change while holding the large position is unique. 
There are other periods where we hold large short market positions, but the price doesn't move so badly in those periods. In February and March, it does hurt us.\n", 280 | "\n", 281 | "### Medium-range forecasts are similar, but with a slower start\n", 282 | "* The remaining three strategies (greed, red, purple) show similar patterns; general upward trends with a large drop near the end (February and March). However, they are begin flat for the first few months before starting their rise.\n", 283 | "\n", 284 | "### Let's address the large drop at the end\n", 285 | "* Again, the large drop is due to large shorts market positions moving against us. It's difficult to predict when the market will move against us, so instead we will cap our short positions so that markets drops don't hurt as much (_favorable_ market moves will also help us less under capping. It limits both downside and upside).\n", 286 | "* We now apply a hard cap to the short market positions that we are taking. The betas of any given stock in any prior month are generally less than 5, but sometimes larger 10 or even 30. The strategy runs above used a cap of 10, so now we tighten the cap to 5 (applied as absolute value of 5), and 1, to see if we can soften that large drop." 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": {}, 292 | "source": [ 293 | "## Analysis" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "# take input of strategy. \n", 303 | "# total_dict = getStatsAll(results_summary, beta_plot = True, training_split_value_to_plot=0.3) #using a beta cap of +/- 1. " 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "\n", 311 | "\n", 312 | "\n", 313 | "\n", 314 | "" 315 | ] 316 | }, 317 | { 318 | "cell_type": "markdown", 319 | "metadata": {}, 320 | "source": [ 321 | "Regarding the Top Figure above:\n", 322 | "* With the tight cap on beta (between -1 and 1), we again see a general upward trend. You can see that the strategy using the training size of 30% (green line) in the tightest beta constraint configuration of 1.0, outperforms the other training sizes in the same beta cap grouping. \n", 323 | "\n", 324 | "Regarding the Middle Figure of 5 plots above: \n", 325 | "* You can see that when the beta cap group of training sizes is separated into individual charts, the split size of 30% and 50% outperform the market but only the 30% split results significantly outperform the market. \n", 326 | "\n", 327 | "Regarding the Bottom Figure above:\n", 328 | "* We choose training fraction of 0.3, then show the strategy using different caps for beta. There is mostly overlap, but the strictest beta cap (1.0) shows highest performance, especially by the end of the simulation.\n", 329 | "* Also, the sharp drops around November 2016 and March 2019 seem to be softened, as was expected from the capping. However, the small caps (5 & 10) didn't seem to curtail the drops; only cap of 1 seems to work.\n", 330 | "* For the first half of the strategy, capping seemed to have minimal effect. But by the end, the strict cap lead to the best performance.\n", 331 | "* While capping could plausibly improve PnL here, keep in mind that it would also reduce the market-neutrality, which was the point of investing the index in the first place. 
We do not explore whether market-neutrality was affected here.\n", 332 | "* The optimal stratey identified is when the training size is only 30% of the entire dataset with a beta cap of +/- 1.0. You can see that in the bottom figure below, the *Beta Cap: 1.0* line easily outperforms the others in the same training size group and outperforms the Market, the purple line. These results show that the strategy made smart decisions, did not take too many losses, and capitalized on an increasing Market." 333 | ] 334 | }, 335 | { 336 | "cell_type": "markdown", 337 | "metadata": {}, 338 | "source": [ 339 | "" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "metadata": {}, 345 | "source": [ 346 | "The image above represents the dataframe of strategy results using a beta cap of 1.0. Our best performing strategy, using the training split of 30%, had an **Annualized Return of 18.77%, Annualized Volatility of 27.73%, an Annualized Sharpe of 0.605, a Max Drawdown of 0.167, and an Alpha of 0.014.** Below, you wcan see just how the strategy performs against the market since the start of 2015. The strategy looks promising but we need to stress test to be sure. " 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "### Stress Testing Against Simulated 2008 Performance Compared to Recent Performance Without Sentiment" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "Since the sentiment data does not go past 2013, we are opting to stress test the strategy by observing the strategy's performance without the sentiment features from 2015 to 2019 and observe the strategy's performance between 2005 and 2010. Then we will compare the differential performance of the annualized market returns and annualized strategy returns between the two time periods." 368 | ] 369 | }, 370 | { 371 | "cell_type": "markdown", 372 | "metadata": {}, 373 | "source": [ 374 | "\n", 375 | "\n", 376 | "" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": {}, 382 | "source": [ 383 | "The strategy can be observed against the market from 2015 to 2019 in the time period above. Take note that the **Annualized Strategy Return is 2.74% against the Market's Annualized Return of 12.67%.** We will be comparing these along with the other listed metrics to the strategy's performance from 2015 to 2010. " 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": {}, 389 | "source": [ 390 | "\n", 391 | "\n", 392 | "" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "metadata": {}, 398 | "source": [ 399 | "### Stress Testing Analysis\n", 400 | "Now taking a look at the performance of the stressed strategy from 2007 through 2010. The strategy does produce a higher **Annualized Strategy Return of 5.09% against the Market's Annualized Return of 0.58%.** We do see a larger max drawdown and a larger portfolio beta on the stressed strategy and this can be expected. It is surprising to see that not only did the strategy perform positively during this period, it beat the market and beat the 2015-2019 period we were comparing it to. \n", 401 | "\n", 402 | "Some of the other metrics did perform better in the 2015-2019 dataset, such as a drawdown that is almost half at **0.259** and a much lower **portfolio beta at 0.332**. 
Volatility was higher for both the strategy and the market during the stress period and this is expected since this was a purposeful stress run during a known financial crisis. " 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "### Concluding Thoughts:" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "We conclude that there is evidence of proof of concept utilizing a strategy that leverages sentiment data. While these initial results seem profitable and optimistic, it would be wise to continue with our testing by continuing to fine-tune the machine learning models, refine the universe filtering, and stress test against a different variety of scenarios and not just a single crisis. Sentiment data proves to be an unique feature set and as new natural language processing techniques continue to develop and employed, sentiment data will continue to grow and offer new ways of exploring different investment opportunities. " 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": {}, 434 | "source": [ 435 | "### Bibliography & Data Sources: \n", 436 | "1. **French, Kenneth R.** Kenneth R. French - Home Page. Accessed June 01, 2019. \n", 437 | "    http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/index.html. \n", 438 | "2. **Lee, Charles M.C.** 2003. Fusion Investing. Tech. Fusion Investing. AIMR.\n", 439 | "3. **\"Quandl.\"** Quandl.com. Accessed June 01, 2019. \n", 440 | "    https://www.quandl.com/. \n", 441 | "4. **\"The Place For Learning Quant Finance.\"** Quantopian. Accessed June 01, 2019. \n", 442 | "    https://www.quantopian.com/. \n", 443 | "5. **\"Where Developers Learn, Share, & Build Careers.\"** Stack Overflow. Accessed June 02, 2019. \n", 444 | "    https://stackoverflow.com/.\n", 445 | "6. **Zhang, Wenbin, and Steven Skiena.** 2010. Trading Strategies to Exploit Blog and News Sentiment. Tech. \n", 446 | "    Trading Strategies to Exploit Blog and News Sentiment. Association for the Advancement of Artificial Intelligence." 
447 | ] 448 | } 449 | ], 450 | "metadata": { 451 | "kernelspec": { 452 | "display_name": "Python 3", 453 | "language": "python", 454 | "name": "python3" 455 | }, 456 | "language_info": { 457 | "codemirror_mode": { 458 | "name": "ipython", 459 | "version": 3 460 | }, 461 | "file_extension": ".py", 462 | "mimetype": "text/x-python", 463 | "name": "python", 464 | "nbconvert_exporter": "python", 465 | "pygments_lexer": "ipython3", 466 | "version": "3.6.8" 467 | } 468 | }, 469 | "nbformat": 4, 470 | "nbformat_minor": 2 471 | } 472 | -------------------------------------------------------------------------------- /strategy.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import matplotlib.pyplot as plt 3 | import pandas as pd 4 | import numpy as np 5 | import scipy as sp 6 | import functools 7 | import seaborn as sns 8 | import time 9 | import h5py 10 | import statsmodels.api as sm 11 | import copy 12 | import pickle 13 | import pdb 14 | from math import ceil 15 | from scipy import stats 16 | import importlib 17 | 18 | from sklearn.linear_model import LinearRegression 19 | from sklearn.preprocessing import Binarizer 20 | from sklearn.preprocessing import MinMaxScaler 21 | 22 | 23 | 24 | def aggregate_from_daily(df, ticker): 25 | """ 26 | Use in combine_all_tickers. 27 | Aggregates data from daily to monthly level. 28 | """ 29 | 30 | df_temp = df.copy() 31 | df_temp.index.names = ['Date'] 32 | 33 | ticker_price_beginning = df_temp.loc[:,['Adj_Close']].resample('M').first().rename(columns={'Adj_Close':'ticker_price_beginning'}) 34 | ticker_price_end = df_temp.loc[:,['Adj_Close']].resample('M').last().rename(columns={'Adj_Close':'ticker_price_end'}) 35 | mkt_price_beginning = df_temp.loc[:,['QQQ_Adj_Close']].resample('M').first().rename(columns={'QQQ_Adj_Close':'mkt_price_beginning'}) 36 | mkt_price_end = df_temp.loc[:,['QQQ_Adj_Close']].resample('M').last().rename(columns={'QQQ_Adj_Close':'mkt_price_end'}) 37 | df4 = ticker_price_beginning.merge(ticker_price_end, left_index=True, right_index=True, how='outer').merge(mkt_price_beginning, left_index=True, right_index=True, how='outer').merge(mkt_price_end, left_index=True, right_index=True, how='outer') 38 | 39 | custom_aggretator = lambda array_like: array_like.max() # aggregate days to month, for sentiment features 40 | 41 | df1 = df_temp.drop(columns=['Adj_Close', 'Adj_Volume','QQQ_Adj_Close']).resample('M').apply(custom_aggretator) # for sentiment features, take max 42 | df2 = df_temp.loc[:, ['Adj_Volume']].resample('M').median().merge(df1, left_index=True, right_index=True, how='outer') # for volume, take median 43 | 44 | df2 = df2.assign(Ticker=ticker).reset_index().set_index(['Date','Ticker']).rename(columns={'Adj_Volume': 'volume'}) 45 | df5 = df4.merge(df2, left_index=True, right_index=True, how='outer') 46 | return df5 47 | 48 | def combine_all_tickers(ticker_dict): 49 | """ 50 | Combines dictionary-of-dataframes into one dataframe. 
51 | """ 52 | 53 | first_key = list(ticker_dict.keys())[0] 54 | df1 = ticker_dict[first_key].copy() # df for first ticker 55 | first_df_cols = df1.columns.tolist() 56 | df2 = aggregate_from_daily(df1, first_key) # aggregate to monthly level 57 | j=0 58 | for key, value in ticker_dict.items(): # for each ticker, aggregate then concat to master df 59 | #print(key,j) 60 | #j+=1 61 | if key==first_key: continue 62 | if first_df_cols != value.columns.tolist(): print('bad columns for {}!'.format(key)) 63 | df3 = aggregate_from_daily(value, key) 64 | df2 = pd.concat([df2, df3]) 65 | 66 | df2 = df2.sort_index(level=[0,1]) 67 | return df2 68 | 69 | # used in create_weights() 70 | def sigmoid(x): 71 | p = 1 / (1 + np.exp(-x)) 72 | return p/p.sum() 73 | 74 | # used in create_weights() 75 | def position_weights(x): 76 | x = x.astype(float).values.copy() 77 | idx_nonzero = np.nonzero(x) 78 | x[idx_nonzero] = sigmoid( x[idx_nonzero] ) # change the non-zero scores to 79 | return pd.Series(x) 80 | 81 | 82 | def transform_function(x, seed=False, num_long_tickers=10, mult_extra=2.): 83 | """ 84 | Used in create_weights(). Takes in series, randomly puts one 85 | for random num_long_tickers tickers. 86 | Seed is if you don't want to choose randomly, to get reproducible 87 | strategy. 88 | """ 89 | #mult_extra=2 90 | 91 | if seed: np.random.seed(42) 92 | if num_long_tickers*mult_extra > x.shape[0]: print('mult_extra is too big!') 93 | output = np.concatenate([np.ones(num_long_tickers*mult_extra), np.zeros( x.shape[0]-mult_extra*num_long_tickers)]) 94 | indices = np.random.choice(np.arange(mult_extra*num_long_tickers), replace=False, size=num_long_tickers) 95 | output[indices] = 0. 96 | return pd.Series(output) # go long top fraction of tickers, no short positions 97 | 98 | 99 | def create_weights(df, sort_column='score', frac_long_tickers=0.01, seed=False, mult_extra=2.): 100 | """ 101 | For each month, find the tickers that we will go long. 102 | Calculate their weights accoring to the ranking column. 103 | Shift those weights a month forward so we can trade on them. 104 | """ 105 | 106 | df1 = df.copy() 107 | num_tickers = df1.index.get_level_values(1).unique().shape[0] 108 | num_long_tickers = int(round(num_tickers*frac_long_tickers)) 109 | #transform_function = lambda x: pd.Series(np.concatenate([np.ones(num_long_tickers), np.zeros(x.shape[0]-2*num_long_tickers), np.zeros(num_long_tickers)])) # go long top fraction of tickers, no short positions. DEPRECATED 110 | transform_function1 = lambda x: transform_function(x, seed=seed, num_long_tickers=num_long_tickers, mult_extra=int(mult_extra)) 111 | 112 | df2 = df1.loc[:,[sort_column]].sort_values(by=['Date',sort_column], ascending=[True, False]).groupby(level=0, as_index=False).transform(transform_function1) # turn into position (1,0,0). 
def create_weights(df, sort_column='score', frac_long_tickers=0.01, seed=False, mult_extra=2.):
    """
    For each month, find the tickers that we will go long.
    Calculate their weights according to the ranking column.
    Shift those weights a month forward so we can trade on them.
    """

    df1 = df.copy()
    num_tickers = df1.index.get_level_values(1).unique().shape[0]
    num_long_tickers = int(round(num_tickers*frac_long_tickers))
    #transform_function = lambda x: pd.Series(np.concatenate([np.ones(num_long_tickers), np.zeros(x.shape[0]-2*num_long_tickers), np.zeros(num_long_tickers)]))  # go long top fraction of tickers, no short positions. DEPRECATED
    transform_function1 = lambda x: transform_function(x, seed=seed, num_long_tickers=num_long_tickers, mult_extra=int(mult_extra))

    df2 = df1.loc[:,[sort_column]].sort_values(by=['Date',sort_column], ascending=[True, False]).groupby(level=0, as_index=False).transform(transform_function1)  # turn scores into positions (1, 0); transform() takes in a Series

    # print(num_long_tickers, df2.loc[:,sort_column].groupby(level=0).sum())
    df2 = df2.rename(columns={sort_column:'position_current'})
    df1 = df1.loc[:,[sort_column, 'ticker_price_beginning', 'ticker_price_end', 'mkt_price_beginning', 'mkt_price_end']]
    df3 = df1.merge(df2, left_index=True, right_index=True, how='outer').sort_values(by=['Date','Ticker'], ascending=[True, True])  # merge the positions (1, 0) with the other columns
    df3 = df3.assign(position_predictive=df3.position_current.shift(num_tickers))  #.sort_values(by=['Date',sort_column], ascending=[True, False])  # the predictive position takes the position from the _previous_ month

    df3 = df3.assign(score_current = df3[sort_column]*df3.position_current)
    df3 = df3.assign(weight_current = df3.loc[:,['score_current']].groupby(level=0, as_index=False).transform(position_weights).iloc[:,0])  # find weights based on current rankings
    df3 = df3.assign(weight_pred = df3.weight_current.shift(num_tickers))  # shift the weights one month forward so they can be traded on
    df3 = df3.drop(columns=['score_current'])

    df4 = df3[(df3.position_predictive!=0)& (df3.position_predictive.notna())].copy()
    return df4

def calculate_capital(df, initial_capital=1e6):
    """
    Start with the initial capital. For each month, spend it all on long positions.
    Then calculate the capital left at the end of the month, and re-invest that amount
    the next month.
    """

    i=0
    for date, new_df in df.groupby(level=0):  # iterate through dates, compute the new capital held and the new positions
        if i==0:
            new_df1 = new_df.copy()
            new_df1 = new_df1.assign(total_notional_begin = initial_capital)
            new_df1 = new_df1.assign(notional_begin = new_df1.total_notional_begin *new_df1.weight_pred)  # must provide initial capital
            new_df1 = new_df1.assign(num_shares_begin = np.floor(new_df1.notional_begin/new_df1.ticker_price_beginning))  # buy the number of shares afforded by the capital to spend
            new_df1 = new_df1.assign(notional_end = new_df1.ticker_price_end*new_df1.num_shares_begin)  # exit the position, calculate the ending capital
            new_df1 = new_df1.assign(cashflow= new_df1.notional_end - new_df1.notional_begin)  # create cashflows
            total_notional_end = new_df1.notional_end.groupby(level=0).sum().values[0]  # sum the ending notional across assets
            new_df1 = new_df1.assign(total_notional_end=total_notional_end)
            i+=1
        else:
            new_df2 = new_df.copy()
            new_df2 = new_df2.assign(total_notional_begin = total_notional_end)  # use total_notional_end from the previous iteration
            new_df2 = new_df2.assign(notional_begin = new_df2.total_notional_begin *new_df2.weight_pred)  # must provide initial capital
            new_df2 = new_df2.assign(num_shares_begin = np.floor(new_df2.notional_begin/new_df2.ticker_price_beginning))
            new_df2 = new_df2.assign(notional_end = new_df2.ticker_price_end*new_df2.num_shares_begin)
            new_df2 = new_df2.assign(cashflow= new_df2.notional_end - new_df2.notional_begin)  # create cashflows
            total_notional_end = new_df2.notional_end.groupby(level=0).sum().values[0]
            new_df2 = new_df2.assign(total_notional_end=total_notional_end)
            new_df1 = pd.concat([new_df1, new_df2])
    return new_df1
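
# Worked one-month sketch of the arithmetic inside calculate_capital() (made-up numbers):
# with $1,000,000 of capital and a predictive weight of 0.25 on a ticker priced at $50,
# the strategy buys floor(250,000 / 50) = 5,000 shares; if the month-end price is $52,
# the ending notional for that ticker is 5,000 * 52 = $260,000. The sum of these ending
# notionals across tickers is what gets re-invested the following month.
def _example_one_month_capital():
    total_notional_begin = 1e6
    weight_pred, price_begin, price_end = 0.25, 50.0, 52.0
    notional_begin = total_notional_begin * weight_pred
    num_shares = np.floor(notional_begin / price_begin)   # 5000 shares
    notional_end = num_shares * price_end                 # 260000.0
    return notional_end
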
163 | """ 164 | 165 | df1 = df.copy() 166 | df_temp = df1.assign(beta=0.) 167 | df_temp = df_temp.loc[:,['beta']] 168 | 169 | for index, row in df_temp.iterrows(): 170 | date_end = index[0] - pd.tseries.offsets.MonthEnd(1) # dates for prior month 171 | date_begin = date_end - pd.tseries.offsets.MonthBegin(1) 172 | ticker = index[1] 173 | 174 | df2 = ticker_dict[ticker][date_begin:date_end].copy() # select prior month's data 175 | df2 = df2[df2.index.dayofweek < 5] # remove weekends 176 | df2 = df2.loc[:,['Adj_Close','QQQ_Adj_Close']] # only need ticker's price and QQQ price 177 | 178 | X = df2.values[:,[0]] 179 | y = df2.values[:,[1]] 180 | reg = LinearRegression().fit(X, y) # run regression 181 | df_temp.at[index,'beta'] = reg.coef_ # put in beta 182 | 183 | df1 = df1.assign(beta=df_temp.beta.clip(beta_cap_floor, beta_cap_ceil)) # add beta column, with beta clipped 184 | return df1 185 | 186 | 187 | def calculate_mkt_positions(df): 188 | """ 189 | Use the betas to calculate the positions in the index. The market positions are short the same amount (except 190 | rounding) as the long positions times beta. 191 | """ 192 | df1 = df.copy() 193 | df1 = df1.assign(notional_begin_mkt = - df1.beta * df1.notional_begin) # if long x, then short (beta) * x 194 | df1 = df1.assign(total_notional_begin_mkt=0.) # start with no market positions 195 | df1 = df1.assign(num_shares_begin_mkt = np.floor(df1.notional_begin_mkt/df1.mkt_price_beginning)) # buy value -beta*num_shares for each ticker 196 | df1 = df1.assign(notional_end_mkt = df1.mkt_price_end*df1.num_shares_begin_mkt) # notional held at end of month, after exiting positions 197 | 198 | df1 = df1.assign(cashflow_mkt = df1.notional_end_mkt - df1.notional_begin_mkt) # cashflow over month period 199 | df1a = df1.loc[:,['cashflow_mkt']].groupby(by=['Date']).sum().rename(columns={'cashflow_mkt':'total_notional_end_mkt'}) # ending notional summed across assets 200 | df1 = df1.merge(df1a, left_index=True, right_index=True, how='inner') 201 | 202 | i =df1.index.get_level_values(0)[0] # first date 203 | num_tickers = df1.loc[i,:].index.values.shape[0] # number of tickers, for shifting 204 | df1 = df1.assign(notional_from_prior_period = df1.total_notional_end_mkt.shift(num_tickers)) # shift, to add later 205 | df1.notional_from_prior_period.fillna(0, inplace=True) 206 | 207 | df1 = df1.assign(total_notional = df1.total_notional_end + df1.total_notional_end_mkt + df1.notional_from_prior_period) # add tickers, market, and leftover from prior period's market 208 | return df1 209 | 210 | 211 | def calculate_pnl_sub_strategy(df, initial_capital=1e6): 212 | """ 213 | Aggregates monthly positions across tickers to get monthly PnL 214 | """ 215 | df1 = df.loc[:,['total_notional']].groupby(level=0, as_index=True).median() # take median of total_notional, which are all same anyway 216 | 217 | df3 = df.loc[:,['mkt_price_beginning','mkt_price_end']].groupby(by='Date').first() # take market prices 218 | df3 = df3.assign(returns_mkt = (df3.mkt_price_end - df3.mkt_price_beginning)/df3.mkt_price_beginning ) # market returns over month 219 | 220 | df1 = df1.assign(returns_mkt = df3.returns_mkt) 221 | 222 | #df1 = df1.assign(mkt_price_beginning = df3.mkt_price_beginning.values) 223 | #df1 = df1.assign(mkt_price_end = df3.mkt_price_end.values) 224 | 225 | #ind = np.array([np.datetime64('2013-01-31')]) # add Jan '13 data point 226 | #df2 = pd.DataFrame(data={'total_notional':initial_capital, 'returns_mkt':np.nan, 'mkt_price_beginning':np.nan, 'mkt_price_end':np.nan}, index=ind) 
def calculate_pnl_sub_strategy(df, initial_capital=1e6):
    """
    Aggregates monthly positions across tickers to get monthly PnL.
    """
    df1 = df.loc[:,['total_notional']].groupby(level=0, as_index=True).median()  # take the median of total_notional, which is the same for every ticker within a month anyway

    df3 = df.loc[:,['mkt_price_beginning','mkt_price_end']].groupby(by='Date').first()  # take the market prices
    df3 = df3.assign(returns_mkt = (df3.mkt_price_end - df3.mkt_price_beginning)/df3.mkt_price_beginning )  # market returns over the month

    df1 = df1.assign(returns_mkt = df3.returns_mkt)

    #df1 = df1.assign(mkt_price_beginning = df3.mkt_price_beginning.values)
    #df1 = df1.assign(mkt_price_end = df3.mkt_price_end.values)

    #ind = np.array([np.datetime64('2013-01-31')])  # add Jan '13 data point
    #df2 = pd.DataFrame(data={'total_notional':initial_capital, 'returns_mkt':np.nan, 'mkt_price_beginning':np.nan, 'mkt_price_end':np.nan}, index=ind)
    #df2.index = df2.index.rename('Date')
    #df2 = pd.concat([df1,df2]).sort_index()

    df2 = df1.copy()
    df2 = df2.assign(returns_strategy=df2.total_notional.pct_change())

    df2 = df2[['returns_mkt','returns_strategy','total_notional']]
    df2 = df2.rename(columns={'total_notional':'pnl'})
    return df2


def run_strategy(df, ticker_dict, sort_column='score', frac_long_tickers=0.01, seed=False, mult_extra=2., beta_cap_floor=-10.0, beta_cap_ceil=10.0, plot=False):
    """
    Runs the full pipeline: weights -> monthly capital -> betas -> market hedge -> monthly PnL.
    """
    df2=df.copy()
    df3 = create_weights(df2, sort_column=sort_column, frac_long_tickers=frac_long_tickers, seed=seed, mult_extra=mult_extra)
    df4 = calculate_capital(df3, initial_capital=1e6)
    #print('Calculated capital for each month')
    df5 = calculate_beta(df4, ticker_dict=ticker_dict, beta_cap_floor=beta_cap_floor, beta_cap_ceil=beta_cap_ceil)
    #print('Calculated market beta of the ticker-level returns')
    df6 = calculate_mkt_positions(df5)
    #print('Calculated market positions from the betas')
    df7 = calculate_pnl_sub_strategy(df6)
    if plot: df7.loc[:,['pnl']].plot()
    return df7


def print_hi():
    print('no more')


##### Create features for machine learning #####

def aggregate_from_daily_ml(df, ticker):
    """
    Used in create_features(). Aggregates data from the daily to the monthly level.
    Features are quantiles over the previous month.
    """

    df_temp = df.copy()
    df_temp.index.names = ['Date']
    ticker_price_beginning = df_temp.loc[:,['Adj_Close']].resample('M').first().rename(columns={'Adj_Close':'ticker_price_beginning'})
    ticker_price_end = df_temp.loc[:,['Adj_Close']].resample('M').last().rename(columns={'Adj_Close':'ticker_price_end'})
    mkt_price_beginning = df_temp.loc[:,['QQQ_Adj_Close']].resample('M').first().rename(columns={'QQQ_Adj_Close':'mkt_price_beginning'})
    mkt_price_end = df_temp.loc[:,['QQQ_Adj_Close']].resample('M').last().rename(columns={'QQQ_Adj_Close':'mkt_price_end'})
    df4 = ticker_price_beginning.merge(ticker_price_end, left_index=True, right_index=True, how='outer').merge(mkt_price_beginning, left_index=True, right_index=True, how='outer').merge(mkt_price_end, left_index=True, right_index=True, how='outer')
    df4.columns = pd.MultiIndex.from_tuples([(col, 'NA') for col in df4.columns.tolist()])

    quantiles = [.5,.75,.9]
    custom_aggretator = lambda array_like: array_like.quantile(q=quantiles)  # aggregate days to months

    df1 = df_temp.drop(columns=['Adj_Close','QQQ_Adj_Close']).resample('M').apply(custom_aggretator)  # for sentiment features, take the monthly quantiles

    #df6 = df_temp.drop(columns=['Adj_Close','QQQ_Adj_Close']).rolling(30).agg(custom_aggretator)  #.resample('M')

    df1 = df1.rename(columns={'Adj_Volume':'Volume'})
    df1 = df1.unstack()  # turn the row index into columns

    df2 = df1.iloc[:,0:3].drop([0.75,0.9], axis=1, level=1)  # keep only the median for volume
    df1 = df1.drop('Volume', axis=1, level=0)
    df1 = df1.merge(df2, left_index=True, right_index=True)

    df4 = df4.merge(df1, left_index=True, right_index=True, how='inner')
    df4 = df4.assign(Ticker=ticker).reset_index().set_index(['Date','Ticker'])

    df4[('Returns','Next_Month')] = ((df4[('ticker_price_end', 'NA')] - df4[('ticker_price_beginning', 'NA')]) / df4[('ticker_price_beginning', 'NA')]).shift(-1)

    return df4

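
# Illustrative sketch of the column convention used below: aggregate_from_daily_ml()
# returns MultiIndex columns such as ('Sentiment', 0.9) or ('ticker_price_end', 'NA'),
# and create_features() flattens them with the rule shown here. The columns and values
# in this sketch are made up.
def _example_flatten_columns():
    cols = pd.MultiIndex.from_tuples([('ticker_price_end', 'NA'), ('Sentiment', 0.9)])
    df = pd.DataFrame([[101.0, 0.8]], columns=cols)
    df.columns = [col[0] + '_' + str(col[1]) if str(col[1]) != 'NA' else col[0]
                  for col in df.columns.tolist()]
    return df.columns.tolist()   # ['ticker_price_end', 'Sentiment_0.9']
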
# one-time function
def create_features(overwrite=False):
    """
    Combines the dictionary-of-dataframes into one dataframe.
    Creates features as it goes.
    Creates data.pkl.
    """

    data_dict = pd.read_pickle('complete_dataset.pickle')  # load the dictionary of tickers
    ticker_dict = data_dict['Raw_Data']

    # initialize dataframe
    first_key = list(ticker_dict.keys())[0]  # find the first ticker
    df1 = ticker_dict[first_key].copy()  # df for first ticker
    first_df_cols = df1.columns.tolist()
    df2 = aggregate_from_daily_ml(df1, first_key)  # aggregate to monthly level
    j=0
    for key, value in ticker_dict.items():  # for each ticker, aggregate then concat to the master df
        if key==first_key: continue
        if first_df_cols != value.columns.tolist(): print('bad columns for {}!'.format(key))
        df3 = aggregate_from_daily_ml(value, key)

        df2 = pd.concat([df2, df3])
        if j%(round(len(ticker_dict)/10))==0: print('Fraction done: {}'.format(round(j/len(ticker_dict),5)))
        j+=1
    df2 = df2.sort_index(level=[0,1])

    df2.columns = [col[0] + '_' + str(col[1]) if str(col[1])!='NA' else col[0] for col in df2.columns.tolist()]

    df3 = create_target(df2, threshold=0.0)
    df3.columns = [col[0] + '_' + str(col[1]) if str(col[1])!='NA' else col[0] for col in df3.columns.tolist()]


    if overwrite:
        print('Saving to data.pkl')
        df3.to_pickle('data.pkl')
    else:
        print('File not being saved. To save, use overwrite=True')

    return df3


def create_target(data, threshold=0.0):
    '''
    Create a target variable that is binary {0,1} from next month's return.
    data: dataframe
    '''

    data1 = data.dropna().copy()
    binarizer = Binarizer(threshold=threshold)
    target = binarizer.transform(data1[('Returns','Next_Month')].values.reshape(-1,1))

    data1 = data1.join(pd.DataFrame(target,
                                    columns=pd.MultiIndex.from_product([['Returns'], ['Target']]),
                                    index=data1.index))

    return data1


def create_predictions_sklearn(data, model, name='y_pred_ml', num_cols_non_feats=1):
    """
    Runs the model on the data (dataframe) and adds a new prediction column called `name`.
    Works for sklearn models, not keras models.
    """
    X = data.iloc[:,:-num_cols_non_feats].copy().values  # pick out only the predictor variables; the model picks certain columns via its pipeline
    y_pred = model.predict_proba(X)[:,1]
    data1 = data.copy()
    data1[name] = y_pred
    return data1

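
# Illustrative sketch (synthetic data, local import): calling create_predictions_sklearn()
# with a fitted classifier. The column names are made up, but they follow the layout the
# function assumes: feature columns first, non-feature columns (target/predictions) last.
def _example_create_predictions_sklearn():
    from sklearn.linear_model import LogisticRegression

    rng = np.random.RandomState(0)
    X = rng.randn(50, 3)
    y = (X[:, 0] > 0).astype(int)
    df = pd.DataFrame(np.hstack([X, y.reshape(-1, 1)]),
                      columns=['f1', 'f2', 'f3', 'Returns_Target'])
    model = LogisticRegression(solver='lbfgs').fit(X, y)
    return create_predictions_sklearn(df, model, name='y_pred_ml', num_cols_non_feats=1)
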
374 | """ 375 | 376 | X, y = list(), list() 377 | for i in range(len(sequences)): 378 | # find the end of this pattern 379 | end_ix = i + n_steps 380 | # check if we are beyond the dataset 381 | if end_ix > len(sequences): 382 | break 383 | # gather input and output parts of the pattern 384 | seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix-1, -1] 385 | X.append(seq_x) 386 | y.append(seq_y) 387 | 388 | return np.array(X), np.array(y) 389 | 390 | 391 | def build_dataset_for_rnn(df11, n_steps=3, frac_train=0.5): 392 | """ 393 | Takes in dataframe. Splits out X and y. Normalizes X. Samples (X,y) with a rolling window of length n_steps, 394 | and saves as 3d array (n_samples, n_timesteps, n_features). 395 | 396 | This acts only on sentiment features!! 397 | """ 398 | 399 | # Split by company 400 | X_tickers = [] 401 | y_list = [] 402 | 403 | scaler = MinMaxScaler(feature_range=(-1,1)) 404 | #print('Building dataset...') 405 | for i, ticker in enumerate(df11.index.get_level_values(1).unique()): # split by ticker 406 | X = df11.reset_index().loc[df11.reset_index().Ticker==ticker].set_index(['Date','Ticker']).iloc[:,:-1].values 407 | X = scaler.fit_transform(X) # standardize the features 408 | y = df11.reset_index().loc[df11.reset_index().Ticker==ticker].set_index(['Date','Ticker']).iloc[:,-1].values 409 | dataset = np.hstack((X, y.reshape(-1,1))) 410 | # convert into input/output 411 | X, y = split_sequences(dataset, n_steps) 412 | X_tickers.append(X) 413 | y_list.append(y) 414 | #if i%(int(df11.index.get_level_values(1).unique().shape[0])/5)==0: print('Done {} percent'.format(round(100*i/int(df11.index.get_level_values(1).unique().shape[0]),2))) 415 | 416 | n_features = X_tickers[0].shape[2] 417 | 418 | try: 419 | X = np.array(X_tickers).reshape(-1, n_steps, n_features) 420 | except ValueError: 421 | print(len(X_tickers), X_tickers[0].shape, n_steps, n_features) 422 | y = np.array(y_list).reshape(-1,1) 423 | 424 | end_train = int(X.shape[0]*frac_train) 425 | x_train = X[:end_train] 426 | y_train = y[:end_train] 427 | x_test = X[end_train:] 428 | y_test = y[end_train:] 429 | 430 | #print('Done building dataset') 431 | #print('x_train shape:', x_train.shape) 432 | #print('x_test shape:', x_test.shape) 433 | #print('y_test shape:', y_train.shape) 434 | #print('y_test shape:', y_test.shape) 435 | 436 | return x_train, y_train, x_test, y_test 437 | 438 | 439 | def create_predictions_keras(df_test, model, name='y_pred_nn',n_steps=3,verbose=False): 440 | """ 441 | Takes in dataframe of features and makes predictions from the model. 442 | The number of steps (n_steps) must match the number of steps the 443 | model was trained on. 444 | """ 445 | 446 | scaler = MinMaxScaler(feature_range=(-1,1)) 447 | 448 | count_rows = 0 449 | tickers = df_test.index.get_level_values(1).unique() 450 | for i, ticker in enumerate(tickers): # split by ticker 451 | 452 | if i==0: 453 | df1 = df_test.reset_index().loc[df_test.reset_index().Ticker==ticker].set_index(['Date','Ticker']).copy() 454 | X = df1.iloc[:,:-3].values # chop off two y's from previous two models 455 | X = scaler.fit_transform(X) # standardize the features 456 | y = df1.iloc[:,-3].values 457 | dataset = np.hstack((X, y.reshape(-1,1))) 458 | X, y = split_sequences(dataset, n_steps) 459 | y_pred = model.predict(X) 460 | try: 461 | y_pred = model.predict(X) 462 | except ValueError: 463 | print('Not enough dates to make prediction on ticker {}. 
def create_predictions_keras(df_test, model, name='y_pred_nn', n_steps=3, verbose=False):
    """
    Takes in a dataframe of features and makes predictions from the model.
    The number of steps (n_steps) must match the number of steps the
    model was trained on.
    """

    scaler = MinMaxScaler(feature_range=(-1,1))

    count_rows = 0
    tickers = df_test.index.get_level_values(1).unique()
    for i, ticker in enumerate(tickers):  # split by ticker

        if i==0:
            df1 = df_test.reset_index().loc[df_test.reset_index().Ticker==ticker].set_index(['Date','Ticker']).copy()
            X = df1.iloc[:,:-3].values  # drop the target and the two previous models' prediction columns
            X = scaler.fit_transform(X)  # standardize the features
            y = df1.iloc[:,-3].values
            dataset = np.hstack((X, y.reshape(-1,1)))
            X, y = split_sequences(dataset, n_steps)
            try:
                y_pred = model.predict(X)
            except ValueError:
                print('Not enough dates to make prediction on ticker {}. Returning current dataframe.'.format(ticker))
                return df1
            a = np.array((n_steps-1)*[np.nan])
            y_pred = np.concatenate([a, y_pred.ravel()])
            df1[name] = y_pred
            count_rows += df1.shape[0]

        else:
            df2 = df_test.reset_index().loc[df_test.reset_index().Ticker==ticker].set_index(['Date','Ticker']).copy()
            X = df2.iloc[:,:-3].values  # drop the target and the two previous models' prediction columns
            X = scaler.fit_transform(X)  # standardize the features
            y = df2.iloc[:,-3].values
            dataset = np.hstack((X, y.reshape(-1,1)))
            X, y = split_sequences(dataset, n_steps=n_steps)
            try:
                y_pred = model.predict(X)
            except ValueError:
                print('Not enough dates to make prediction on ticker {}. Returning current dataframe.'.format(ticker))
                return df1
            a = np.array((n_steps-1)*[np.nan])
            y_pred = np.concatenate([a, y_pred.ravel()])
            df2[name] = y_pred
            df1 = pd.concat([df1, df2])
            count_rows += df2.shape[0]
            df1 = df1.sort_values(by=['Date','Ticker'])
        if verbose:
            if i%(tickers.shape[0]/10)==0: print('done {} percent'.format(100*i/tickers.shape[0]))

    df1 = df1.sort_values(by=['Date','Ticker'])
    return df1

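
# Illustrative sketch (synthetic data, local imports): the Pipeline + TimeSeriesSplit
# GridSearchCV pattern that develop_logistic_model() and develop_xgb_model() below both
# follow. Only the parameter grid and the final estimator differ in the real functions.
def _example_pipeline_grid_search():
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

    rng = np.random.RandomState(0)
    X = rng.randn(200, 5)
    y = (X[:, 0] + 0.1 * rng.randn(200) > 0).astype(int)

    pipe = Pipeline(steps=[('scale', StandardScaler()),
                           ('logreg', LogisticRegression(solver='lbfgs'))])
    search = GridSearchCV(pipe,
                          {'logreg__C': [0.1, 1.0]},
                          scoring='roc_auc',
                          cv=TimeSeriesSplit(n_splits=3))   # forward-chaining CV, no shuffling
    search.fit(X, y)
    return search.best_params_, search.best_score_
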
############## Three ML Models ##############
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import fbeta_score, make_scorer, f1_score, precision_score, roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression

##### Logistic Regression #####
def develop_logistic_model(df11_train):
    # dataset FOR SKLEARN
    N = df11_train.shape[0]
    end = int(N*0.75)

    # use the first 75% as training, the rest as validation
    X_train = df11_train.iloc[:end, :-1].values
    y_train = df11_train.iloc[:end, -1].values
    X_val = df11_train.iloc[end:, :-1].values
    y_val = df11_train.iloc[end:, -1].values

    clf = LogisticRegression(penalty='l2',
                             C=1.0,
                             random_state=0,
                             solver='sag',
                             max_iter=10000)
    # use later
    #pca = PCA(n_components=5)

    # https://scikit-learn.org/stable/auto_examples/preprocessing/plot_function_transformer.html#sphx-glr-auto-examples-preprocessing-plot-function-transformer-py
    def select_sentiment_colums(X):  # selects the 15 sentiment features
        return X[:,-16:-1]

    pipe = Pipeline(steps=[('sentiment_cols', FunctionTransformer(select_sentiment_colums, validate=True)),
                           ('scale', StandardScaler()),
                           ('logreg', clf)])

    # https://scikit-learn.org/stable/tutorial/statistical_inference/putting_together.html
    param_grid = {
        'logreg__C': [1e-2, 5e-2, 7.5e-2, 1.0]
    }

    def my_custom_loss_func(y_true, y_pred):
        c = np.array([y_pred,y_true])
        d = np.sort(c)
        e = np.flip(d, axis=1)
        f = e[:,0:int(e.shape[1]*0.5)]
        g = np.absolute(f[0,:] - f[1,:])
        return g.mean()
    my_scorer = make_scorer(my_custom_loss_func, greater_is_better=False)

    search = GridSearchCV(pipe,
                          param_grid,
                          scoring='roc_auc',  # my_scorer
                          iid=False,
                          cv=TimeSeriesSplit(n_splits=3))

    search.fit(X_train, y_train.ravel());

    y_pred = search.predict_proba(X_train)[:,1]
    #print('\n ###### Logistic Regression ###### \n')
    #print('Best params:',search.best_params_)
    #print('Mean (CV) AUC of best estimator:',search.best_score_)
    #print('Validation AUC of best estimator:',search.best_estimator_.score(X_val, y_val))  # why is this so low?
    #print('Validation AUC of best estimator:',search.score(X_val, y_val))  # should be AUC
    #print(accuracy_score(y,y_pred))
    #print('Train AUC:',roc_auc_score(y_train, y_pred))

    return search

##### Extreme Gradient Boosting #####
import xgboost as xgb
#from xgboost import XGBClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV
import graphviz

def develop_xgb_model(df11_train):

    # dataset FOR SKLEARN
    N = df11_train.shape[0]
    end = int(N*0.75)

    # use the first 75% as training, the rest as validation
    X_train = df11_train.iloc[:end, :-1].values
    y_train = df11_train.iloc[:end, -1].values
    X_val = df11_train.iloc[end:, :-1].values
    y_val = df11_train.iloc[end:, -1].values

    xgb_model = xgb.XGBClassifier()

    def select_sentiment_colums(X):  # selects the 15 sentiment features
        return X[:,-18:-3]

    pipe = Pipeline(steps=[#('sentiment_cols', FunctionTransformer(select_sentiment_colums, validate=True)),
                           ('scale', StandardScaler()),
                           ('xgb', xgb_model)])

    test_params = {
        'xgb__eta': [0.05, 0.3, 1],
        'xgb__min_child_weight': [1],
        'xgb__max_depth': [2],  #,5],
        'xgb__gamma': [0],  #,0.1,0.2],
        'xgb__n_estimators': [20],  #30,40],
        'xgb__reg_alpha': [1e-5]  #, 1e-2, 0.1]
    }

    xgb_search = GridSearchCV(pipe,
                              test_params,
                              scoring='roc_auc',  # my_scorer
                              iid=False,
                              cv=TimeSeriesSplit(n_splits=3))

    xgb_search.fit(X_train, y_train.ravel())
    y_pred = xgb_search.predict_proba(X_train)

    #print('\n ###### XGB ###### \n')
    #print('Best params: {}'.format(xgb_search.best_params_))
    #print('Mean cv score of best estimator: {}'.format( round(xgb_search.best_score_,2)))  # should be AUC
    #print('Training AUC: {}'.format( round(roc_auc_score(y_train, y_pred[:,1]),2)))
    #print('Test AUC: {}'.format( round(roc_auc_score(y_val, xgb_search.predict_proba(X_val)[:,1])),2) )

    return xgb_search


##### LSTM #####
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from keras.datasets import imdb

# tutorials
# https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
# https://machinelearningmastery.com/how-to-develop-lstm-models-for-time-series-forecasting/

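# Illustrative sketch (random data): the input-shape contract used by develop_lstm_model()
# below -- samples arranged as (n_samples, n_steps, n_features), a single LSTM layer, and a
# sigmoid output for the binary up/down target. The layer sizes here are arbitrary.
def _example_lstm_shapes():
    n_samples, n_steps, n_features = 64, 3, 5
    rng = np.random.RandomState(0)
    x = rng.rand(n_samples, n_steps, n_features)
    y = rng.randint(0, 2, size=(n_samples, 1))

    model = Sequential()
    model.add(LSTM(8, input_shape=(n_steps, n_features)))     # consumes 3D (steps, features) windows
    model.add(Dense(1, activation='sigmoid'))                  # binary classification output
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(x, y, batch_size=16, epochs=1, verbose=0)
    return model.predict(x).shape                              # (n_samples, 1)
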
def develop_lstm_model(df11_train):
    print('\n Building dataset for LSTM...')
    x_train, y_train, x_val, y_val = build_dataset_for_rnn(df11_train, n_steps=3, frac_train=0.75)
    batch_size = 32
    n_steps, n_features = x_train.shape[1], x_train.shape[2]

    print('Build LSTM...')
    model = Sequential()
    #model.add(Embedding(input_dim=max_features, output_dim=128))  # only needed for integer token inputs
    #model.add(Dense(32, input_shape=(70,)))
    model.add(LSTM(50, dropout=0.2, recurrent_dropout=0.2, input_shape=(n_steps, n_features)))  # 50 memory units
    model.add(Dense(1, activation='sigmoid'))  # classification, so sigmoid output

    # try using different optimizers and different optimizer configs
    model.compile(loss='binary_crossentropy',  # since we're doing binary classification
                  optimizer='adam',
                  metrics=['accuracy'])

    print('Train LSTM...')
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=10,  #15
              validation_data=(x_val, y_val),
              verbose=0)
    score, acc = model.evaluate(x_val, y_val,
                                batch_size=batch_size)
    y_pred = model.predict(x_val)

    #print('\n ###### LSTM ###### \n')
    #print('Test score:', round(score,2))
    #print('Test accuracy:', round(acc,2))  # accuracy
    #print('Train AUC:', round(roc_auc_score(y_train, model.predict(x_train)),2))
    val_auc = roc_auc_score(y_val, y_pred)
    #print('Validation AUC:', round(val_auc,2))

    return model, val_auc


def make_predictions(df11_test, df11_all, model1, model2, model3):
    df1 = create_predictions_sklearn(df11_test, model1, name='y_pred_log_reg', num_cols_non_feats=1)  # chop off y
    #print(df1.shape)
    df2 = create_predictions_sklearn(df1, model2, name='y_pred_xgb', num_cols_non_feats=2)  # chop off 2 y's
    #print(df2.shape)
    df3 = create_predictions_keras(df2, model3, n_steps=3, verbose=False)
    #print(df3.shape)
    df4 = df3.join(df11_all.iloc[:,:4], how='left')
    cols = df4.columns.tolist()
    df5 = df4[cols[-4:]+cols[:-4]].copy()  # switch the order of the columns
    y_pred_avg = df5.iloc[:,-3:].mean(axis=1).values
    df6 = df5.assign(y_pred_avg=y_pred_avg)
    return df6

##### prepare dataset #####
def prepare_dataset(frac_training=0.5, use_sentiment=True):
    """
    Split the dataset and prepare it for the ML/NN model fitting.
    It rounds down to the month, to avoid splitting months between
    training and test.
    """

    d2 = pd.read_pickle('complete_dataset.pickle')
    #df2 = pd.read_pickle('all_tickers_combined.pickle')
    #df2a = df2.loc[:,['ticker_price_beginning', 'ticker_price_end', 'mkt_price_beginning','mkt_price_end','Sentiment']]
    df11_all = pd.read_pickle('data.pkl')
    df11 = df11_all.iloc[:, list(range(4,73))+[-1]].copy()  # select only the features and the target variable

    if not use_sentiment: df11 = df11.iloc[:,-15:].copy()  # remove the sentiment features

    # dataset FOR SKLEARN
    N = df11.shape[0]
    end = int(N*frac_training)  # worked with 0.5

    remainder = end%1130  # 1130 rows per month (one row per ticker)
    end = end-remainder   # snap the cutoff down to a month boundary

    # select the first frac_training of rows as the training set
    df11_train = df11.iloc[:end,:]
    df11_test = df11.iloc[end:,:]

    return df11_train, df11_test

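
# Worked sketch of the month-boundary rounding in prepare_dataset() above (the 1130
# divisor is assumed to be the number of rows, i.e. tickers, per month): the raw cutoff
# is snapped down so that no month is split between training and test.
def _example_month_boundary_cutoff(n_rows=27120, rows_per_month=1130, frac_training=0.6):
    end = int(n_rows * frac_training)    # 16272
    end = end - end % rows_per_month     # 15820 = 14 full months of 1130 rows
    return end
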
715 | """ 716 | 717 | # split data 718 | df11_all = pd.read_pickle('data.pkl') 719 | df11_train, df11_test = prepare_dataset(frac_training=frac_training, use_sentiment=use_sentiment) 720 | 721 | # select logistic regression 722 | search = develop_logistic_model(df11_train) 723 | print('done logistic') 724 | 725 | # select xgb 726 | xgb_search = develop_xgb_model(df11_train) 727 | print('done gradient boosting') 728 | 729 | # train (not select) LSTM 730 | model, val_auc = develop_lstm_model(df11_train) 731 | print('done LSTM') 732 | 733 | # make predictions 734 | print('Making predictions...') 735 | df17_test = make_predictions(df11_test=df11_test, df11_all=df11_all, model1=search, model2=xgb_search, model3=model) 736 | print('done predictions...') 737 | 738 | results = pd.DataFrame({'Validation AUC':[search.best_score_, xgb_search.best_score_, val_auc]}) 739 | results.index = ['log','xgb','lstm'] 740 | 741 | print('Mean CV AUC:') 742 | print(results) 743 | 744 | return df17_test, results 745 | 746 | # run strategy 747 | # print('running strategy...') 748 | # df18_test = run_strategy(df17_test, ticker_dict=d2['Raw_Data'], sort_column='y_pred_avg', seed=False, frac_long_tickers=0.01, mult_extra=2, beta_cap_floor=-10., beta_cap_ceil=10., plot=False) 749 | 750 | # results = pd.DataFrame({'Validation AUC':[search.best_score_, xgb_search.best_score_, val_auc]}) 751 | # results.index = ['log','xgb','lstm'] 752 | # return df18_test, results 753 | 754 | 755 | def get_results_by_training_cutoff(save_result=False): 756 | """ 757 | Splits training and test set, develops models, runs strategy, 758 | and returns some results for plotting and analysis. 759 | Creates results_by_training_cutoff.pkl. 760 | Takes 30+ minutes to run. 761 | """ 762 | 763 | print('Warning: this code could take longer than 30 minutes.') 764 | res = {} 765 | d2 = pd.read_pickle('complete_dataset.pickle') # complete_dataset.pkl was made by Benjamin, not Robert 766 | for i in np.arange(0.1,1,0.2): 767 | print(i) 768 | df18_test, results = develop_all_three_models(frac_training=i) 769 | df19_test = run_strategy(df18_test, ticker_dict=d2['Raw_Data'], sort_column='y_pred_avg', seed=False, frac_long_tickers=0.01, mult_extra=2, beta_cap_floor=-10., beta_cap_ceil=10., plot=False) 770 | res[i]=(df18_test, results, df19_test) 771 | if save_result: 772 | print('Saving dictionary to results_by_training_cutoff.pkl') 773 | # Create an variable to pickle and open it in write mode 774 | list_pickle_path = 'results_by_training_cutoff.pkl' 775 | list_pickle = open(list_pickle_path, 'wb') 776 | pickle.dump(res, list_pickle) 777 | list_pickle.close() 778 | return res 779 | 780 | 781 | 782 | 783 | 784 | 785 | 786 | if __name__ == '__main__': # prints when run 787 | print('You just ran this from the command line') 788 | 789 | #if __name__ == 'final_project': # prints when you import the package 790 | # print('Importing final_project') --------------------------------------------------------------------------------