10 |
11 |
--------------------------------------------------------------------------------
/Old_versions/scrape_data.py:
--------------------------------------------------------------------------------
1 | from bs4 import BeautifulSoup
2 | import requests
3 | import sys
4 |
5 | URL_part_1 = "https://finance.yahoo.com/quote/"
6 | URL_part_2 = "/history"
7 | PARSER = "html.parser"
8 |
9 |
10 | def get_page_content(name):
11 | try:
12 | page = requests.get(URL_part_1+name+URL_part_2)
13 | except requests.exceptions.RequestException as e:
14 | print(e)
15 | sys.exit(1)
16 | return BeautifulSoup(page.content, PARSER)
17 |
18 |
19 | def main():
20 | content = get_page_content("goog")
21 | print(content)
22 |
23 | if __name__ == "__main__":
24 | main()
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # markov_stock_analysis
2 | The program implements an algorithm that finds the percentage change in a security based on historical daily data from Yahoo Finance and visualizes the information as a time series plot in matplotlib. The program also creates a Markov chain model in which the states are bull market, bear market, and stagnant market. Using the probabilities associated with this Markov chain model, the program will model the estimated growth of the security price through a Gaussian random walk.
3 |
4 | The real purpose of the program is not to accurately predict the growth or fall fall of a security price. Instead, the program serves as a simple simulation to study finite markov chain model behavior in an applied context. Currently, I occasionally use the program as a research tool to study the long and short term behavior of individual securities and indices using experimental markov models.
5 |
6 | The current problem I face is that I assumed, for simplicity, that the historical percentage price changes are all normally distributed with a definite mean and standard deviation. My shifts from one state to another are heavily dependent on this not-so-normal price change distribution. Of course, this assumption is most likely not necessarily true; in my Gaussian walk, we cannot assume that the pricing changes are even approximately normal because the data is NOT identically and independently distributed.
7 |
8 | To resolve the faulty assumption, the next steps in the model would be to use a nonparametric distribution to model the steps in my random walk. A nonparametric distribution would not have any mean, standard distribution, etc. I would probably get a more realistic model if I used one, and I would not need to worry about too many assumptions. Once I am very familiar with the details and theory on nonparametric statistics will I resolve this issue.
--------------------------------------------------------------------------------
/Old_versions/markov_stock_analysis.py:
--------------------------------------------------------------------------------
1 | """
2 | @author: Nikhil Bhaip
3 | @version: 2.0
4 | @since: 6/13/16
5 |
6 | The S&P 500 program implements an algorithm that finds the percentage change in the S&P 500 Index based on historical
7 | weekly data and visualizes the information as a time series plot in matplotlib. The program also creates a Markov chain
8 | model in which the states are bull market, bear market, and stagnant market. Using the probabilities associated with
9 | this Markov chain model, the program will predict the future S&P 500 data through a random walk.
10 |
11 | The next step would be to change other variables like periodicity (daily, weekly, monthly, etc.), use stock data rather
12 | than S&P 500, and incorporate other newer variables like seasonality.
13 |
14 | """
15 | import pandas as pd
16 | import datetime as dt
17 | import matplotlib.pyplot as plt
18 | from matplotlib.dates import MonthLocator, DateFormatter
19 | from matplotlib.ticker import MultipleLocator
20 | import matplotlib.mlab as mlab
21 | import numpy as np
22 |
23 |
24 | def get_data():
25 | """
26 | This function obtains weekly S&P500 data from the last 7 years as a DataFrame from Quandl. I'm mainly interested
27 | in the date, adjusted_closing and the percentage change in adj_closing from the last week.
28 |
29 | :return: A Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
30 | """
31 | sp_500_df = pd.read_csv("https://www.quandl.com/api/v3/datasets/YAHOO/INDEX_GSPC.csv?collapse=weekly" +
32 | "&start_date=2009-05-23&end_date=2016-05-23&api_key=7NU4-sXfczxA9fsf_C8E")
33 | adjusted_df = sp_500_df.ix[:, ['Date', 'Adjusted Close']]
34 | adjusted_df["Percentage Change"] = adjusted_df['Adjusted Close'][::-1].pct_change() * 100
35 | print(adjusted_df)
36 | return adjusted_df
37 | #get_data()
38 |
39 |
40 | def percent_change_as_time_plot(adjusted_df):
41 | """
42 | This function visualizes the percentage change data as a time series plot.
43 |
44 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
45 | """
46 |
47 | pct_change_list = adjusted_df['Percentage Change'].tolist()
48 | date_list = [dt.datetime.strptime(d, '%Y-%m-%d').date() for d in adjusted_df['Date'].tolist()]
49 | fig, ax = plt.subplots()
50 | ax.plot(date_list, pct_change_list)
51 | plt.xlabel("Years")
52 | plt.ylabel("Percentage change from last week")
53 | plt.title("Percentage change in S&P 500 weekly data from 2009 to 2016")
54 | ax.xaxis.set_minor_locator(MonthLocator())
55 | ax.yaxis.set_minor_locator(MultipleLocator(1))
56 | ax.fmt_xdata = DateFormatter('%Y-%m-%d')
57 | ax.autoscale_view()
58 | fig.autofmt_xdate()
59 |
60 | plt.show()
61 |
62 |
63 | def get_params_for_norm_dist(adjusted_df):
64 | """
65 | This function returns the mean and standard deviation in the percentage change column of a DataFrame.
66 | """
67 | mean = adjusted_df["Percentage Change"].mean()
68 | std = adjusted_df["Percentage Change"].std()
69 | return mean, std
70 |
71 |
72 | def percent_change_as_hist(adjusted_df):
73 | """
74 | This function visualizes the percentage change data as a histogram. The graph is also fitted to a normal
75 | distribution curve.
76 |
77 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
78 | """
79 | pct_change_list = adjusted_df['Percentage Change']
80 |
81 | # Code below removes the NaN value and plots the histogram. Bins are left adjusted right now, so when plotting the
82 | # normal distribution function, we must adjust it to be based off the center (average) of the bins.
83 | n, bins, patches = plt.hist(pct_change_list.dropna(), bins=25, normed=True)
84 | bincenters = 0.5*(bins[1:]+bins[:-1])
85 |
86 | plt.xlabel("Percentage change")
87 | plt.ylabel("Frequency")
88 | mean, std = get_params_for_norm_dist(adjusted_df)
89 | plt.title("Distribution of percentage change in S&P 500. Mu: %.3f, Sigma: %.3f" % (mean, std), y=1.03)
90 |
91 | # adds vertical lines to the graph corresponding to the x's that represent the number of deviations from the mean
92 | for num_std_from_mean in range(-3, 4):
93 | plt.axvline(mean + std * num_std_from_mean)
94 |
95 | # plots the normal pdf of best fit
96 | y = mlab.normpdf(bincenters, mean, std)
97 | plt.plot(bincenters, y, 'r--', linewidth=1)
98 |
99 | plt.show()
100 |
101 |
102 | def percent_change_prob(adjusted_df, threshold=0):
103 | """
104 | This function finds the probabilities associated with the Markov chain of the percentage change column. There are
105 | two states: % change greater than or equal to a threshold (A) and % change less than a threshold (B). The threshold
106 | is defaulted to zero, so that the states are roughly divided into positive and negative changes. The four
107 | probabilities are: a P(A | A), b (A | B) , c (B | A) , d (B | B). The sum of the rows in the matrix must add up to
108 | 1: (a + b = 1 and c + d = 1)
109 |
110 | A B
111 | P = A a b
112 | B c d
113 |
114 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
115 | :param threshold: Represents the level dividing events A (change >= threshold) & B (change < threshold).
116 |
117 | """
118 |
119 | a_count = 0 # counts frequency of when A occurs then the next week A occurs
120 | b_count = 0 # counts frequency of when A occurs then the next week B occurs
121 | c_count = 0 # counts frequency of when B occurs then the next week A occurs
122 | d_count = 0 # counts frequency of when B occurs then the next week B occurs
123 |
124 | # returns a series of % change without missing data and reverses the order of data, so it starts from earliest date
125 | pct_change_series = adjusted_df['Percentage Change'].dropna().iloc[::-1]
126 | print(pct_change_series)
127 | for index, pct in pct_change_series.iteritems():
128 | if index == 0: # prevents program from calculating a future probability
129 | continue
130 | if pct >= threshold:
131 | if pct_change_series[index-1] >= threshold:
132 | a_count += 1
133 | else:
134 | b_count += 1
135 | else:
136 | if pct_change_series[index-1] >= threshold:
137 | c_count += 1
138 | else:
139 | d_count += 1
140 | print(index)
141 |
142 | # Given event A just happened, this is the probability that another event A occurs
143 | a_prob = a_count / (a_count + b_count)
144 |
145 | # Given event A just happened, this is the probability that event B occurs
146 | b_prob = b_count / (a_count + b_count)
147 |
148 | # Given event B just happened, this is the probability that event A occurs
149 | c_prob = c_count / (c_count + d_count)
150 |
151 | # Given event B just happened, this is the probability that another event B occurs
152 | d_prob = d_count / (c_count + d_count)
153 |
154 | prob_list = [[a_prob, b_prob], [c_prob, d_prob]]
155 | print(prob_list, "\n")
156 |
157 | print("\nIf the S&P 500 has a positive percentage change this week, there is a %.3f chance that "
158 | "next week there will be a repeat positive percentage change. If the index rises this week, then there is a "
159 | "%.3f chance that next week the index will fall. However, if the S&P drops in one week there is a %.3f that"
160 | " next week there will be a repeat negative percentage change. If the index falls this week, then there is a "
161 | "%.3f chance that the index will rise next week. \n" % (a_prob, b_prob, d_prob, c_prob))
162 | return prob_list
163 |
164 |
165 | def random_walk_norm_pdf(adjusted_df, start=2099, num_periods=12):
166 | """
167 | This function calculates and visualizes a random walk assuming that S&P 500 data are independent of current state.
168 | Based on a basic normal distribution and a starting point, the function will predict the S&P 500
169 | Index movement for a finite number of periods. This is the most fundamental random walk and has many unrealistic
170 | assumptions, such as the data are independently and identically distributed, which is likely not true for the
171 | S&P500 Index.
172 |
173 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
174 | :param start: starting value for S&P 500 random walk
175 | :param num_periods: number of steps in the random walk process
176 |
177 | """
178 | mean, std = get_params_for_norm_dist(adjusted_df)
179 | pct_change_list = []
180 | all_walks = [] # will hold all the random walk data
181 | for i in range(100):
182 | random_walk = [start]
183 | for period in range(num_periods):
184 | # sets the step as the last element in the random walk
185 | step = random_walk[-1]
186 |
187 | # picks a random percent change from a Gaussian distribution based on historical mean and standard deviation
188 | pct_change = np.random.normal(mean, std)
189 | pct_change_list.append(pct_change)
190 |
191 | # reordering of percent change formula
192 | step = ((pct_change * step / 100) + step)
193 |
194 | random_walk.append(step)
195 | all_walks.append(random_walk)
196 | show_rand_walks(all_walks)
197 |
198 |
199 | def prob_from_bins(heights, bins):
200 | """
201 | Chooses a random bin based on the prob distribution in the histogram. Then returns a random percentage change from
202 | that bin.
203 |
204 | :param heights: heights of the histogram
205 | :param bins: left-hand edges of each bin; must have at least two values in list
206 | :return: random percentage change
207 | """
208 | np_heights = np.asarray(heights)
209 | bin_length = bins[1]-bins[0]
210 | np_area = bin_length * np_heights # sum of area is equal to 1
211 | bin_num = np.random.choice(np.arange(start=1, stop=len(bins)), p=np_area)
212 | rand_pct_change = bin_length * np.random.ranf() + bins[bin_num-1]
213 | return rand_pct_change
214 |
215 |
216 | def rand_walk_2x2_markov(adjusted_df, prob_list, num_bins=10, threshold=0, start=2099, num_periods=12):
217 | """
218 | Divides the per
219 |
220 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
221 | :param prob_list: Contains a 2x2 list that holds the probabilities from a Markov chain with two states
222 | :param num_bins: Specifies number of bins in the histogram distribution. The more bins, the more realistic
223 | the probability distribution will be
224 | :param threshold: Represents the level dividing events A (change >= threshold) & B (change < threshold)
225 | :param start: starting value for S&P 500 random walk
226 | :param num_periods: number of steps in the random walk process
227 | """
228 |
229 | pct_change_array = np.array(adjusted_df["Percentage Change"].dropna())
230 | pct_above_threshold_array = pct_change_array[pct_change_array >= threshold]
231 | pct_below_threshold_array = pct_change_array[pct_change_array < threshold]
232 | n_above, bins_above, patches_above = plt.hist(pct_above_threshold_array, bins=num_bins, normed=True)
233 | n_below, bins_below, patches_below = plt.hist(pct_below_threshold_array, bins=num_bins, normed=True)
234 |
235 | # First percentage change is determined from a basic normal distribution. Every following period is based on the
236 | # percentage change of the previous period
237 | pct_change_list = []
238 | all_walks = [] # will hold all the random walk data
239 | for i in range(100):
240 | mean, std = get_params_for_norm_dist(adjusted_df)
241 | first_pct_change = np.random.normal(mean, std)
242 | pct_change_list.append(first_pct_change)
243 | first_step = ((first_pct_change * start / 100) + start)
244 | random_walk = [start, first_step]
245 |
246 | for period in range(num_periods):
247 | step = random_walk[-1]
248 | prev_pct_change = pct_change_list[-1]
249 |
250 | # random number used to test whether event A will occur or event B will occur
251 | rand_prob = np.random.random_sample()
252 | if prev_pct_change >= threshold: # If true, event A occurred
253 | # prob_list[0][0] is probability that another event A will occur, given event A has happened already
254 | if rand_prob <= prob_list[0][0]: # If true, A then A
255 | pct_change = prob_from_bins(n_above, bins_above)
256 | else: # If true, A then B
257 | pct_change = prob_from_bins(n_below, bins_below)
258 | else: # If true, event B occurred
259 | # prob_list[1][0] is probability that event A will occur, given event B has happened already
260 | if rand_prob <= prob_list[1][0]: # If true, B then A
261 | pct_change = prob_from_bins(n_above, bins_above)
262 | else: # If true, B then B
263 | pct_change = prob_from_bins(n_below, bins_below)
264 |
265 | pct_change_list.append(pct_change)
266 |
267 | step = ((pct_change * step / 100) + step)
268 |
269 | random_walk.append(step)
270 | all_walks.append(random_walk)
271 | show_rand_walks(all_walks)
272 |
273 |
274 | def show_rand_walks(all_walks):
275 | """
276 | Visualizes all random walks as a plot and distribution.
277 |
278 | :param all_walks: list of all random walks conducted
279 | """
280 | np_aw = np.array(all_walks) # converts the list of all random walks to a Numpy Array
281 | np_aw_t = np.transpose(np_aw) # must transpose the array for graph to display properly
282 | plt.clf()
283 | plt.plot(np_aw_t)
284 | plt.xlabel("Steps")
285 | plt.ylabel("S&P 500 Index Value")
286 | plt.title("All Random Walks of the S&P 500 Index")
287 | plt.show()
288 |
289 | # Select last row from np_aw_t: ends
290 | ends = np_aw_t[-1]
291 |
292 | # Plot histogram of ends, display plot
293 | n, bins, patches = plt.hist(ends, bins=25, normed=True)
294 | plt.xlabel("Final S&P 500 Index Value at end of period.")
295 | plt.ylabel("Frequency")
296 | rand_mean = ends.mean()
297 | rand_std = ends.std()
298 |
299 | plt.title("Distribution of Random Walk Final Values. Mean is %d and Standard Deviation is %d"
300 | % (rand_mean, rand_std), y=1.03)
301 | for num_std_from_mean in range(-3, 4):
302 | plt.axvline(rand_mean + rand_std * num_std_from_mean)
303 | bincenters = 0.5*(bins[1:]+bins[:-1])
304 | # plots the normal pdf of best fit
305 | y = mlab.normpdf(bincenters, rand_mean, rand_std)
306 | plt.plot(bincenters, y, 'r--', linewidth=3)
307 | plt.show()
308 |
309 |
310 | my_sp_500_df = get_data()
311 | #print(my_sp_500_df)
312 | #percent_change_as_time_plot(my_sp_500_df)
313 | #percent_change_as_hist(my_sp_500_df)
314 | prob_matrix = percent_change_prob(my_sp_500_df)
315 | #random_walk_norm_pdf(my_sp_500_df, num_periods=12)
316 | #rand_walk_2x2_markov(my_sp_500_df, prob_list=prob_matrix, threshold=0)
--------------------------------------------------------------------------------
/Old_versions/s&p_500_list:
--------------------------------------------------------------------------------
1 | Symbol,Name,Sector
2 | MMM,3M Company,Industrials
3 | ABT,Abbott Laboratories,Health Care
4 | ABBV,AbbVie,Health Care
5 | ACN,Accenture plc,Information Technology
6 | ATVI,Activision Blizzard,Information Technology
7 | AYI,Acuity Brands Inc,Industrials
8 | ADBE,Adobe Systems Inc,Information Technology
9 | AAP,Advance Auto Parts,Consumer Discretionary
10 | AES,AES Corp,Utilities
11 | AET,Aetna Inc,Health Care
12 | AMG,Affiliated Managers Group Inc,Financials
13 | AFL,AFLAC Inc,Financials
14 | A,Agilent Technologies Inc,Health Care
15 | GAS,AGL Resources Inc.,Utilities
16 | APD,Air Products & Chemicals Inc,Materials
17 | AKAM,Akamai Technologies Inc,Information Technology
18 | ALK,Alaska Air Group Inc,Industrials
19 | AA,Alcoa Inc,Materials
20 | ALXN,Alexion Pharmaceuticals,Health Care
21 | ALLE,Allegion,Industrials
22 | AGN,Allergan plc,Health Care
23 | ADS,Alliance Data Systems,Information Technology
24 | ALL,Allstate Corp,Financials
25 | GOOGL,Alphabet Inc Class A,Information Technology
26 | GOOG,Alphabet Inc Class C,Information Technology
27 | MO,Altria Group Inc,Consumer Staples
28 | AMZN,Amazon.com Inc,Consumer Discretionary
29 | AEE,Ameren Corp,Utilities
30 | AAL,American Airlines Group,Industrials
31 | AEP,American Electric Power,Utilities
32 | AXP,American Express Co,Financials
33 | AIG,"American International Group, Inc.",Financials
34 | AMT,American Tower Corp A,Financials
35 | AWK,American Water Works Company Inc,Utilities
36 | AMP,Ameriprise Financial,Financials
37 | ABC,AmerisourceBergen Corp,Health Care
38 | AME,Ametek,Industrials
39 | AMGN,Amgen Inc,Health Care
40 | APH,Amphenol Corp A,Industrials
41 | APC,Anadarko Petroleum Corp,Energy
42 | ADI,"Analog Devices, Inc.",Information Technology
43 | ANTM,Anthem Inc.,Health Care
44 | AON,Aon plc,Financials
45 | APA,Apache Corporation,Energy
46 | AIV,Apartment Investment & Mgmt,Financials
47 | AAPL,Apple Inc.,Information Technology
48 | AMAT,Applied Materials Inc,Information Technology
49 | ADM,Archer-Daniels-Midland Co,Consumer Staples
50 | AJG,Arthur J. Gallagher & Co.,Financials
51 | AIZ,Assurant Inc,Financials
52 | T,AT&T Inc,Telecommunications Services
53 | ADSK,Autodesk Inc,Information Technology
54 | ADP,Automatic Data Processing,Information Technology
55 | AN,AutoNation Inc,Consumer Discretionary
56 | AZO,AutoZone Inc,Consumer Discretionary
57 | AVGO,Avago Technologies,Information Technology
58 | AVB,"AvalonBay Communities, Inc.",Financials
59 | AVY,Avery Dennison Corp,Materials
60 | BHI,Baker Hughes Inc,Energy
61 | BLL,Ball Corp,Materials
62 | BAC,Bank of America Corp,Financials
63 | BCR,Bard (C.R.) Inc.,Health Care
64 | BAX,Baxter International Inc.,Health Care
65 | BBT,BB&T Corporation,Financials
66 | BDX,Becton Dickinson,Health Care
67 | BBBY,Bed Bath & Beyond,Consumer Discretionary
68 | BRK-B,Berkshire Hathaway,Financials
69 | BBY,Best Buy Co. Inc.,Consumer Discretionary
70 | BIIB,BIOGEN IDEC Inc.,Health Care
71 | BLK,BlackRock,Financials
72 | HRB,Block H&R,Financials
73 | BA,Boeing Company,Industrials
74 | BWA,BorgWarner,Consumer Discretionary
75 | BXP,Boston Properties,Financials
76 | BSX,Boston Scientific,Health Care
77 | BMY,Bristol-Myers Squibb,Health Care
78 | BF-B,Brown-Forman Corporation,Consumer Staples
79 | CHRW,C. H. Robinson Worldwide,Industrials
80 | CA,"CA, Inc.",Information Technology
81 | CVC,Cablevision Systems Corp.,Consumer Discretionary
82 | COG,Cabot Oil & Gas,Energy
83 | CPB,Campbell Soup,Consumer Staples
84 | COF,Capital One Financial,Financials
85 | CAH,Cardinal Health Inc.,Health Care
86 | KMX,Carmax Inc,Consumer Discretionary
87 | CCL,Carnival Corp.,Consumer Discretionary
88 | CAT,Caterpillar Inc.,Industrials
89 | CBG,CBRE Group,Financials
90 | CBS,CBS Corp.,Consumer Discretionary
91 | CELG,Celgene Corp.,Health Care
92 | CNC,Centene Corporation,Health Care
93 | CNP,CenterPoint Energy,Utilities
94 | CTL,CenturyLink Inc,Telecommunications Services
95 | CERN,Cerner,Health Care
96 | CF,CF Industries Holdings Inc,Materials
97 | SCHW,Charles Schwab Corporation,Financials
98 | CHK,Chesapeake Energy,Energy
99 | CVX,Chevron Corp.,Energy
100 | CMG,Chipotle Mexican Grill,Consumer Discretionary
101 | CB,Chubb Limited,Financials
102 | CHD,Church & Dwight,Consumer Staples
103 | CI,CIGNA Corp.,Health Care
104 | XEC,Cimarex Energy,Energy
105 | CINF,Cincinnati Financial,Financials
106 | CTAS,Cintas Corporation,Industrials
107 | CSCO,Cisco Systems,Information Technology
108 | C,Citigroup Inc.,Financials
109 | CFG,Citizens Financial Group,Financials
110 | CTXS,Citrix Systems,Information Technology
111 | CME,CME Group Inc.,Financials
112 | CMS,CMS Energy,Utilities
113 | COH,Coach Inc.,Consumer Discretionary
114 | CTSH,Cognizant Technology Solutions,Information Technology
115 | CL,Colgate-Palmolive,Consumer Staples
116 | CPGX,Columbia Pipeline Group Inc,Energy
117 | CMCSA,Comcast A Corp,Consumer Discretionary
118 | CMA,Comerica Inc.,Financials
119 | CAG,ConAgra Foods Inc.,Consumer Staples
120 | CXO,Concho Resources,Energy
121 | COP,ConocoPhillips,Energy
122 | ED,Consolidated Edison,Utilities
123 | STZ,Constellation Brands,Consumer Staples
124 | GLW,Corning Inc.,Industrials
125 | COST,Costco Co.,Consumer Staples
126 | CCI,Crown Castle International Corp.,Financials
127 | CSRA,CSRA Inc.,Information Technology
128 | CSX,CSX Corp.,Industrials
129 | CMI,Cummins Inc.,Industrials
130 | CVS,CVS Health,Consumer Staples
131 | DHI,D. R. Horton,Consumer Discretionary
132 | DHR,Danaher Corp.,Industrials
133 | DRI,Darden Restaurants,Consumer Discretionary
134 | DVA,DaVita Inc.,Health Care
135 | DE,Deere & Co.,Industrials
136 | DLPH,Delphi Automotive,Consumer Discretionary
137 | DAL,Delta Air Lines,Industrials
138 | XRAY,Dentsply Sirona,Health Care
139 | DVN,Devon Energy Corp.,Energy
140 | DO,Diamond Offshore Drilling,Energy
141 | DLR,Digital Realty Trust,Financials
142 | DFS,Discover Financial Services,Financials
143 | DISCA,Discovery Communications-A,Consumer Discretionary
144 | DISCK,Discovery Communications-C,Consumer Discretionary
145 | DG,Dollar General,Consumer Discretionary
146 | DLTR,Dollar Tree,Consumer Discretionary
147 | D,Dominion Resources,Utilities
148 | DOV,Dover Corp.,Industrials
149 | DOW,Dow Chemical,Materials
150 | DPS,Dr Pepper Snapple Group,Consumer Staples
151 | DTE,DTE Energy Co.,Utilities
152 | DD,Du Pont (E.I.),Materials
153 | DUK,Duke Energy,Utilities
154 | DNB,Dun & Bradstreet,Industrials
155 | ETFC,E*Trade,Financials
156 | EMN,Eastman Chemical,Materials
157 | ETN,Eaton Corporation,Industrials
158 | EBAY,eBay Inc.,Information Technology
159 | ECL,Ecolab Inc.,Materials
160 | EIX,Edison Int'l,Utilities
161 | EW,Edwards Lifesciences,Health Care
162 | EA,Electronic Arts,Information Technology
163 | EMC,EMC Corp.,Information Technology
164 | EMR,Emerson Electric Company,Industrials
165 | ENDP,Endo International,Health Care
166 | ETR,Entergy Corp.,Utilities
167 | EOG,EOG Resources,Energy
168 | EQT,EQT Corporation,Energy
169 | EFX,Equifax Inc.,Financials
170 | EQIX,Equinix,Information Technology
171 | EQR,Equity Residential,Financials
172 | ESS,Essex Property Trust Inc,Financials
173 | EL,Estee Lauder Cos.,Consumer Staples
174 | ES,Eversource Energy,Utilities
175 | EXC,Exelon Corp.,Utilities
176 | EXPE,Expedia Inc.,Consumer Discretionary
177 | EXPD,Expeditors Int'l,Industrials
178 | ESRX,Express Scripts,Health Care
179 | EXR,Extra Space Storage,Financials
180 | XOM,Exxon Mobil Corp.,Energy
181 | FFIV,F5 Networks,Information Technology
182 | FB,Facebook,Information Technology
183 | FAST,Fastenal Co,Industrials
184 | FRT,Federal Realty Investment Trust,Financials
185 | FDX,FedEx Corporation,Industrials
186 | FIS,Fidelity National Information Services,Information Technology
187 | FITB,Fifth Third Bancorp,Financials
188 | FSLR,First Solar Inc,Information Technology
189 | FE,FirstEnergy Corp,Utilities
190 | FISV,Fiserv Inc,Information Technology
191 | FLIR,FLIR Systems,Industrials
192 | FLS,Flowserve Corporation,Industrials
193 | FLR,Fluor Corp.,Industrials
194 | FMC,FMC Corporation,Materials
195 | FTI,FMC Technologies Inc.,Energy
196 | FL,Foot Locker Inc,Consumer Discretionary
197 | F,Ford Motor,Consumer Discretionary
198 | BEN,Franklin Resources,Financials
199 | FCX,Freeport-McMoran Cp & Gld,Materials
200 | FTR,Frontier Communications,Telecommunications Services
201 | GPS,Gap (The),Consumer Discretionary
202 | GRMN,Garmin Ltd.,Consumer Discretionary
203 | GD,General Dynamics,Industrials
204 | GE,General Electric,Industrials
205 | GGP,General Growth Properties Inc.,Financials
206 | GIS,General Mills,Consumer Staples
207 | GM,General Motors,Consumer Discretionary
208 | GPC,Genuine Parts,Consumer Discretionary
209 | GILD,Gilead Sciences,Health Care
210 | GPN,Global Payments Inc,Information Technology
211 | GS,Goldman Sachs Group,Financials
212 | GT,Goodyear Tire & Rubber,Consumer Discretionary
213 | GWW,Grainger (W.W.) Inc.,Industrials
214 | HAL,Halliburton Co.,Energy
215 | HBI,Hanesbrands Inc,Consumer Discretionary
216 | HOG,Harley-Davidson,Consumer Discretionary
217 | HAR,Harman Int'l Industries,Consumer Discretionary
218 | HRS,Harris Corporation,Information Technology
219 | HIG,Hartford Financial Svc.Gp.,Financials
220 | HAS,Hasbro Inc.,Consumer Discretionary
221 | HCA,HCA Holdings,Health Care
222 | HCP,HCP Inc.,Financials
223 | HP,Helmerich & Payne,Energy
224 | HSIC,Henry Schein,Health Care
225 | HES,Hess Corporation,Energy
226 | HPE,Hewlett Packard Enterprise,Information Technology
227 | HOLX,Hologic,Health Care
228 | HD,Home Depot,Consumer Discretionary
229 | HON,Honeywell Int'l Inc.,Industrials
230 | HRL,Hormel Foods Corp.,Consumer Staples
231 | HST,Host Hotels & Resorts,Financials
232 | HPQ,HP Inc.,Information Technology
233 | HUM,Humana Inc.,Health Care
234 | HBAN,Huntington Bancshares,Financials
235 | ITW,Illinois Tool Works,Industrials
236 | ILMN,Illumina Inc,Health Care
237 | IR,Ingersoll-Rand PLC,Industrials
238 | INTC,Intel Corp.,Information Technology
239 | ICE,Intercontinental Exchange,Financials
240 | IBM,International Bus. Machines,Information Technology
241 | IP,International Paper,Materials
242 | IPG,Interpublic Group,Consumer Discretionary
243 | IFF,Intl Flavors & Fragrances,Materials
244 | INTU,Intuit Inc.,Information Technology
245 | ISRG,Intuitive Surgical Inc.,Health Care
246 | IVZ,Invesco Ltd.,Financials
247 | IRM,Iron Mountain Incorporated,Industrials
248 | JBHT,J. B. Hunt Transport Services,Industrials
249 | JEC,Jacobs Engineering Group,Industrials
250 | JNJ,Johnson & Johnson,Health Care
251 | JCI,Johnson Controls,Consumer Discretionary
252 | JPM,JPMorgan Chase & Co.,Financials
253 | JNPR,Juniper Networks,Information Technology
254 | KSU,Kansas City Southern,Industrials
255 | K,Kellogg Co.,Consumer Staples
256 | KEY,KeyCorp,Financials
257 | KMB,Kimberly-Clark,Consumer Staples
258 | KIM,Kimco Realty,Financials
259 | KMI,Kinder Morgan,Energy
260 | KLAC,KLA-Tencor Corp.,Information Technology
261 | KSS,Kohl's Corp.,Consumer Discretionary
262 | KHC,Kraft Heinz Co,Consumer Staples
263 | KR,Kroger Co.,Consumer Staples
264 | LB,L Brands Inc.,Consumer Discretionary
265 | LLL,L-3 Communications Holdings,Industrials
266 | LH,Laboratory Corp. of America Holding,Health Care
267 | LRCX,Lam Research,Information Technology
268 | LM,Legg Mason,Financials
269 | LEG,Leggett & Platt,Industrials
270 | LEN,Lennar Corp.,Consumer Discretionary
271 | LUK,Leucadia National Corp.,Financials
272 | LVLT,Level 3 Communications,Telecommunications Services
273 | LLY,Lilly (Eli) & Co.,Health Care
274 | LNC,Lincoln National,Financials
275 | LLTC,Linear Technology Corp.,Information Technology
276 | LKQ,LKQ Corporation,Consumer Discretionary
277 | LMT,Lockheed Martin Corp.,Industrials
278 | L,Loews Corp.,Financials
279 | LOW,Lowe's Cos.,Consumer Discretionary
280 | LYB,LyondellBasell,Materials
281 | MTB,M&T Bank Corp.,Financials
282 | MAC,Macerich,Financials
283 | M,Macy's Inc.,Consumer Discretionary
284 | MNK,Mallinckrodt Plc,Health Care
285 | MRO,Marathon Oil Corp.,Energy
286 | MPC,Marathon Petroleum,Energy
287 | MAR,Marriott Int'l.,Consumer Discretionary
288 | MMC,Marsh & McLennan,Financials
289 | MLM,Martin Marietta Materials,Materials
290 | MAS,Masco Corp.,Industrials
291 | MA,Mastercard Inc.,Information Technology
292 | MAT,Mattel Inc.,Consumer Discretionary
293 | MKC,McCormick & Co.,Consumer Staples
294 | MCD,McDonald's Corp.,Consumer Discretionary
295 | MCK,McKesson Corp.,Health Care
296 | MJN,Mead Johnson,Consumer Staples
297 | MDT,Medtronic plc,Health Care
298 | MRK,Merck & Co.,Health Care
299 | MET,MetLife Inc.,Financials
300 | KORS,Michael Kors Holdings,Consumer Discretionary
301 | MCHP,Microchip Technology,Information Technology
302 | MU,Micron Technology,Information Technology
303 | MSFT,Microsoft Corp.,Information Technology
304 | MHK,Mohawk Industries,Consumer Discretionary
305 | TAP,Molson Coors Brewing Company,Consumer Staples
306 | MDLZ,Mondelez International,Consumer Staples
307 | MON,Monsanto Co.,Materials
308 | MNST,Monster Beverage,Consumer Staples
309 | MCO,Moody's Corp,Financials
310 | MS,Morgan Stanley,Financials
311 | MSI,Motorola Solutions Inc.,Information Technology
312 | MUR,Murphy Oil,Energy
313 | MYL,Mylan N.V.,Health Care
314 | NDAQ,NASDAQ OMX Group,Financials
315 | NOV,National Oilwell Varco Inc.,Energy
316 | NAVI,Navient,Financials
317 | NTAP,NetApp,Information Technology
318 | NFLX,Netflix Inc.,Information Technology
319 | NWL,Newell Rubbermaid Co.,Consumer Discretionary
320 | NFX,Newfield Exploration Co,Energy
321 | NEM,Newmont Mining Corp. (Hldg. Co.),Materials
322 | NWSA,News Corp. Class A,Consumer Discretionary
323 | NWS,News Corp. Class B,Consumer Discretionary
324 | NEE,NextEra Energy,Utilities
325 | NLSN,Nielsen Holdings,Industrials
326 | NKE,Nike,Consumer Discretionary
327 | NI,NiSource Inc.,Utilities
328 | NBL,Noble Energy Inc,Energy
329 | JWN,Nordstrom,Consumer Discretionary
330 | NSC,Norfolk Southern Corp.,Industrials
331 | NTRS,Northern Trust Corp.,Financials
332 | NOC,Northrop Grumman Corp.,Industrials
333 | NRG,NRG Energy,Utilities
334 | NUE,Nucor Corp.,Materials
335 | NVDA,Nvidia Corporation,Information Technology
336 | ORLY,O'Reilly Automotive,Consumer Discretionary
337 | OXY,Occidental Petroleum,Energy
338 | OMC,Omnicom Group,Consumer Discretionary
339 | OKE,ONEOK,Energy
340 | ORCL,Oracle Corp.,Information Technology
341 | OI,Owens-Illinois Inc,Materials
342 | PCAR,PACCAR Inc.,Industrials
343 | PH,Parker-Hannifin,Industrials
344 | PDCO,Patterson Companies,Health Care
345 | PAYX,Paychex Inc.,Information Technology
346 | PYPL,PayPal,Information Technology
347 | PNR,Pentair Ltd.,Industrials
348 | PBCT,People's United Financial,Financials
349 | PEP,PepsiCo Inc.,Consumer Staples
350 | PKI,PerkinElmer,Health Care
351 | PRGO,Perrigo,Health Care
352 | PFE,Pfizer Inc.,Health Care
353 | PCG,PG&E Corp.,Utilities
354 | PM,Philip Morris International,Consumer Staples
355 | PSX,Phillips 66,Energy
356 | PNW,Pinnacle West Capital,Utilities
357 | PXD,Pioneer Natural Resources,Energy
358 | PBI,Pitney-Bowes,Industrials
359 | PNC,PNC Financial Services,Financials
360 | RL,Polo Ralph Lauren Corp.,Consumer Discretionary
361 | PPG,PPG Industries,Materials
362 | PPL,PPL Corp.,Utilities
363 | PX,Praxair Inc.,Materials
364 | PCLN,Priceline.com Inc,Consumer Discretionary
365 | PFG,Principal Financial Group,Financials
366 | PG,Procter & Gamble,Consumer Staples
367 | PGR,Progressive Corp.,Financials
368 | PLD,Prologis,Financials
369 | PRU,Prudential Financial,Financials
370 | PEG,Public Serv. Enterprise Inc.,Utilities
371 | PSA,Public Storage,Financials
372 | PHM,Pulte Homes Inc.,Consumer Discretionary
373 | PVH,PVH Corp.,Consumer Discretionary
374 | QRVO,Qorvo,Information Technology
375 | QCOM,QUALCOMM Inc.,Information Technology
376 | PWR,Quanta Services Inc.,Industrials
377 | DGX,Quest Diagnostics,Health Care
378 | RRC,Range Resources Corp.,Energy
379 | RTN,Raytheon Co.,Industrials
380 | O,Realty Income Corporation,Financials
381 | RHT,Red Hat Inc.,Information Technology
382 | REGN,Regeneron,Health Care
383 | RF,Regions Financial Corp.,Financials
384 | RSG,Republic Services Inc,Industrials
385 | RAI,Reynolds American Inc.,Consumer Staples
386 | RHI,Robert Half International,Industrials
387 | ROK,Rockwell Automation Inc.,Industrials
388 | COL,Rockwell Collins,Industrials
389 | ROP,Roper Industries,Industrials
390 | ROST,Ross Stores,Consumer Discretionary
391 | RCL,Royal Caribbean Cruises Ltd,Consumer Discretionary
392 | R,Ryder System,Industrials
393 | SPGI,"S&P Global, Inc.",Financials
394 | CRM,Salesforce.com,Information Technology
395 | SCG,SCANA Corp,Utilities
396 | SLB,Schlumberger Ltd.,Energy
397 | SNI,Scripps Networks Interactive Inc.,Consumer Discretionary
398 | STX,Seagate Technology,Information Technology
399 | SEE,Sealed Air Corp.(New),Materials
400 | SRE,Sempra Energy,Utilities
401 | SHW,Sherwin-Williams,Materials
402 | SIG,Signet Jewelers,Consumer Discretionary
403 | SPG,Simon Property Group Inc,Financials
404 | SWKS,Skyworks Solutions,Information Technology
405 | SLG,SL Green Realty,Financials
406 | SJM,Smucker (J.M.),Consumer Staples
407 | SNA,Snap-On Inc.,Consumer Discretionary
408 | SO,Southern Co.,Utilities
409 | LUV,Southwest Airlines,Industrials
410 | SWN,Southwestern Energy,Energy
411 | SE,Spectra Energy Corp.,Energy
412 | STJ,St Jude Medical,Health Care
413 | SWK,Stanley Black & Decker,Consumer Discretionary
414 | SPLS,Staples Inc.,Consumer Discretionary
415 | SBUX,Starbucks Corp.,Consumer Discretionary
416 | HOT,Starwood Hotels & Resorts,Consumer Discretionary
417 | STT,State Street Corp.,Financials
418 | SRCL,Stericycle Inc,Industrials
419 | SYK,Stryker Corp.,Health Care
420 | STI,SunTrust Banks,Financials
421 | SYMC,Symantec Corp.,Information Technology
422 | SYF,Synchrony Financial,Financials
423 | SYY,Sysco Corp.,Consumer Staples
424 | TROW,T. Rowe Price Group,Financials
425 | TGT,Target Corp.,Consumer Discretionary
426 | TEL,TE Connectivity Ltd.,Information Technology
427 | TE,TECO Energy,Utilities
428 | TGNA,Tegna,Consumer Discretionary
429 | TDC,Teradata Corp.,Information Technology
430 | TSO,Tesoro Petroleum Co.,Energy
431 | TXN,Texas Instruments,Information Technology
432 | TXT,Textron Inc.,Industrials
433 | BK,The Bank of New York Mellon Corp.,Financials
434 | CLX,The Clorox Company,Consumer Staples
435 | KO,The Coca Cola Company,Consumer Staples
436 | HSY,The Hershey Company,Consumer Staples
437 | MOS,The Mosaic Company,Materials
438 | TRV,The Travelers Companies Inc.,Financials
439 | DIS,The Walt Disney Company,Consumer Discretionary
440 | TMO,Thermo Fisher Scientific,Health Care
441 | TIF,Tiffany & Co.,Consumer Discretionary
442 | TWX,Time Warner Inc.,Consumer Discretionary
443 | TJX,TJX Companies Inc.,Consumer Discretionary
444 | TMK,Torchmark Corp.,Financials
445 | TSS,Total System Services,Information Technology
446 | TSCO,Tractor Supply Company,Consumer Discretionary
447 | TDG,TransDigm Group,Industrials
448 | RIG,Transocean,Energy
449 | TRIP,TripAdvisor,Consumer Discretionary
450 | FOXA,Twenty-First Century Fox Class A,Consumer Discretionary
451 | FOX,Twenty-First Century Fox Class B,Consumer Discretionary
452 | TYC,Tyco International,Industrials
453 | TSN,Tyson Foods,Consumer Staples
454 | USB,U.S. Bancorp,Financials
455 | UDR,UDR Inc,Financials
456 | ULTA,Ulta Salon Cosmetics & Fragrance Inc,Consumer Discretionary
457 | UA,Under Armour,Consumer Discretionary
458 | UNP,Union Pacific,Industrials
459 | UAL,United Continental Holdings,Industrials
460 | UNH,United Health Group Inc.,Health Care
461 | UPS,United Parcel Service,Industrials
462 | URI,"United Rentals, Inc.",Industrials
463 | UTX,United Technologies,Industrials
464 | UHS,"Universal Health Services, Inc.",Health Care
465 | UNM,Unum Group,Financials
466 | URBN,Urban Outfitters,Consumer Discretionary
467 | VFC,V.F. Corp.,Consumer Discretionary
468 | VLO,Valero Energy,Energy
469 | VAR,Varian Medical Systems,Health Care
470 | VTR,Ventas Inc,Financials
471 | VRSN,Verisign Inc.,Information Technology
472 | VRSK,Verisk Analytics,Industrials
473 | VZ,Verizon Communications,Telecommunications Services
474 | VRTX,Vertex Pharmaceuticals Inc,Health Care
475 | VIAB,Viacom Inc.,Consumer Discretionary
476 | V,Visa Inc.,Information Technology
477 | VNO,Vornado Realty Trust,Financials
478 | VMC,Vulcan Materials,Materials
479 | WMT,Wal-Mart Stores,Consumer Staples
480 | WBA,Walgreens Boots Alliance,Consumer Staples
481 | WM,Waste Management Inc.,Industrials
482 | WAT,Waters Corporation,Health Care
483 | WFC,Wells Fargo,Financials
484 | HCN,Welltower Inc.,Financials
485 | WDC,Western Digital,Information Technology
486 | WU,Western Union Co,Information Technology
487 | WRK,Westrock Co,Materials
488 | WY,Weyerhaeuser Corp.,Financials
489 | WHR,Whirlpool Corp.,Consumer Discretionary
490 | WFM,Whole Foods Market,Consumer Staples
491 | WMB,Williams Cos.,Energy
492 | WLTW,Willis Towers Watson,Financials
493 | WEC,Wisconsin Energy Corporation,Utilities
494 | WYN,Wyndham Worldwide,Consumer Discretionary
495 | WYNN,Wynn Resorts Ltd,Consumer Discretionary
496 | XEL,Xcel Energy Inc,Utilities
497 | XRX,Xerox Corp.,Information Technology
498 | XLNX,Xilinx Inc,Information Technology
499 | XL,XL Capital,Financials
500 | XYL,Xylem Inc.,Industrials
501 | YHOO,Yahoo Inc.,Information Technology
502 | YUM,Yum! Brands Inc,Consumer Discretionary
503 | ZBH,Zimmer Biomet Holdings,Health Care
504 | ZION,Zions Bancorp,Financials
505 | ZTS,Zoetis,Health Care
--------------------------------------------------------------------------------
/Old_versions/markov_stock_analysis v2-2.py:
--------------------------------------------------------------------------------
1 | """
2 | @author: Nikhil Bhaip
3 | @version: 2.2
4 | @since: 6/15/16
5 |
6 | The S&P 500 program implements an algorithm that finds the percentage change in the S&P 500 Index based on
7 | historical weekly data and visualizes the information as a time series plot in matplotlib. The program also creates a
8 | Markov chain model in which the states are bull market, bear market, and stagnant market. Using the probabilities
9 | associated with this Markov chain model and external technical analysis, the program will predict the future S&P 500
10 | data through a random walk.
11 |
12 | The next step would be to include other newer variables like seasonality and clean this up in terms of organization.
13 |
14 | """
15 | # import pandas as pd
16 | # import datetime as dt
17 | import matplotlib.pyplot as plt
18 | from matplotlib.dates import MonthLocator, DateFormatter
19 | from matplotlib.ticker import MultipleLocator
20 | import matplotlib.mlab as mlab
21 | import numpy as np
22 | import quandl
23 |
24 |
25 | def get_data():
26 | """
27 | This function obtains data under certain parameters from Quandl and returns the following information as a Pandas
28 | DataFrame: date, adjusted closing, and percentage change in adjusted closing from the last week.
29 |
30 | :return: A Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
31 | """
32 | quandl.ApiConfig.api_key = "7NU4-sXfczxA9fsf_C8E"
33 | name = "AMZN" #INDEX_GSPC
34 | start = "2009-05-23"
35 | end = "2016-05-23"
36 | period = "weekly"
37 | raw_df = quandl.get("YAHOO/" + name, start_date=start, end_date=end, collapse=period)
38 | adjusted_df = raw_df.ix[:, ['Adjusted Close']]
39 | adjusted_df["Percentage Change"] = adjusted_df['Adjusted Close'].pct_change() * 100
40 | return adjusted_df
41 |
42 |
43 | def percent_change_as_time_plot(adjusted_df):
44 | """
45 | This function visualizes the percentage change data as a time series plot.
46 |
47 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
48 | """
49 |
50 | pct_change_list = adjusted_df['Percentage Change'].tolist()
51 | date_list = adjusted_df.index.values
52 | fig, ax = plt.subplots()
53 | ax.plot(date_list, pct_change_list)
54 | #ax.plot(date_list, adjusted_df["Adjusted Close"])
55 | plt.xlabel("Years")
56 | plt.ylabel("Percentage change from last week")
57 | plt.title("Percentage change in S&P 500 weekly data from 2009 to 2016")
58 | ax.xaxis.set_minor_locator(MonthLocator())
59 | ax.yaxis.set_minor_locator(MultipleLocator(1))
60 | ax.fmt_xdata = DateFormatter('%Y-%m-%d')
61 | ax.autoscale_view()
62 | fig.autofmt_xdate()
63 |
64 | plt.show()
65 |
66 |
67 | def get_params_for_norm_dist(adjusted_df):
68 | """
69 | This function returns the mean and standard deviation in the percentage change column of a DataFrame.
70 | :param adjusted_df: must have 'Percentage Change' column
71 |
72 | :returns mean and standard deviation of the percentage change column
73 | """
74 | mean = adjusted_df["Percentage Change"].mean()
75 | std = adjusted_df["Percentage Change"].std()
76 | return mean, std
77 |
78 |
79 | def percent_change_as_hist(adjusted_df):
80 | """
81 | This function visualizes the percentage change data as a histogram. The graph is also fitted to a normal
82 | distribution curve.
83 |
84 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
85 | """
86 | pct_change_list = adjusted_df['Percentage Change']
87 |
88 | # Code below removes the NaN value and plots the histogram. Bins are left adjusted right now, so when plotting the
89 | # normal distribution function, we must adjust it to be based off the center (average) of the bins.
90 | n, bins, patches = plt.hist(pct_change_list.dropna(), bins=25, normed=True)
91 | bincenters = 0.5*(bins[1:]+bins[:-1])
92 |
93 | plt.xlabel("Percentage change")
94 | plt.ylabel("Frequency")
95 | mean, std = get_params_for_norm_dist(adjusted_df)
96 | plt.title("Distribution of percentage change in S&P 500. Mu: %.3f, Sigma: %.3f" % (mean, std), y=1.03)
97 |
98 | # adds vertical lines to the graph corresponding to the x's that represent the number of deviations from the mean
99 | for num_std_from_mean in range(-3, 4):
100 | plt.axvline(mean + std * num_std_from_mean)
101 |
102 | # plots the normal pdf of best fit
103 | y = mlab.normpdf(bincenters, mean, std)
104 | plt.plot(bincenters, y, 'r--', linewidth=1)
105 |
106 | plt.show()
107 |
108 |
109 | def percent_change_prob_2x2(adjusted_df, threshold=0):
110 | """
111 | This function finds the probabilities associated with the Markov chain of the percentage change column. There are
112 | two states: % change greater than or equal to a threshold (A) and % change less than a threshold (B). The threshold
113 | is defaulted to zero, so that the states are roughly divided into positive and negative changes. The four
114 | probabilities are: a (A | A), b (B | A) , c (A | B) , d (B | B). By definition, the sum of the rows in the right
115 | stochastic transition matrix must add up to 1: (a + b = 1 and c + d = 1)
116 |
117 | A B
118 | P = A a b
119 | B c d
120 |
121 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
122 | :param threshold: Represents the level dividing events A (change >= threshold) & B (change < threshold).
123 | """
124 | a_count = 0 # counts frequency of when A occurs then the next period A occurs
125 | b_count = 0 # counts frequency of when A occurs then the next period B occurs
126 | c_count = 0 # counts frequency of when B occurs then the next period A occurs
127 | d_count = 0 # counts frequency of when B occurs then the next period B occurs
128 |
129 | # creates a n
130 | new_df = adjusted_df['Percentage Change'].dropna().to_frame()
131 | new_df = new_df.set_index(np.arange(new_df.size, 0, -1))
132 |
133 | for index, pct in new_df['Percentage Change'].iteritems():
134 | if index == 1: # prevents program from calculating a future probability
135 | break
136 | if pct >= threshold:
137 | if new_df['Percentage Change'][index-1] >= threshold:
138 | a_count += 1
139 | else:
140 | b_count += 1
141 | else:
142 | if new_df['Percentage Change'][index-1] >= threshold:
143 | c_count += 1
144 | else:
145 | d_count += 1
146 |
147 | # Given event A just happened, this is the probability that another event A occurs
148 | a_prob = a_count / (a_count + b_count)
149 |
150 | # Given event A just happened, this is the probability that event B occurs
151 | b_prob = b_count / (a_count + b_count)
152 |
153 | # Given event B just happened, this is the probability that event A occurs
154 | c_prob = c_count / (c_count + d_count)
155 |
156 | # Given event B just happened, this is the probability that another event B occurs
157 | d_prob = d_count / (c_count + d_count)
158 |
159 | prob_list = [[a_prob, b_prob], [c_prob, d_prob]]
160 | print(prob_list, "\n")
161 |
162 | print("\nIf the S&P 500 has a positive percentage change this week, there is a %.3f chance that "
163 | "next week there will be a repeat positive percentage change. If the index rises this week, then there is a "
164 | "%.3f chance that next week the index will fall. However, if the S&P drops in one week there is a %.3f that"
165 | " next week there will be a repeat negative percentage change. If the index falls this week, then there is a "
166 | "%.3f chance that the index will rise next week. \n" % (a_prob, b_prob, d_prob, c_prob))
167 | return prob_list
168 |
169 |
170 | def percent_change_prob_3x3(adjusted_df, lower_thresh=-1.0, upper_thresh=1.0):
171 | """
172 | This function finds the probabilities associated with the Markov chain of the percentage change column. There are
173 | three states: % change less than or equal to a lower threshold (A), % change between the upper and lower
174 | thresholds (B)and % change greater than or equal to an upper threshold (C). The lower threshold is defaulted to -1,
175 | and the upper threshold is defaulted to +1. Percentage changes below the lower threshold may be considered bearish,
176 | in between the two thresholds considered stagnant, and above the threshold considered bullish. The nine
177 | probabilities are: a P(A | A), b (B | A) , c (C | A) , d (A | B), e (B | B), f (C | B), g (A | C), h (B | C), and
178 | i (C | C). The sum of the rows in the matrix must add up to 1: (a + b + c = 1 and d + e + f = 1 and g + h + i = 1)
179 |
180 | A B C
181 | P = A a b c
182 | B d e f
183 | C g h i
184 |
185 | See percent_change_prob_2x2 for more details
186 |
187 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
188 | :param lower_thresh: Represents the level dividing events A & B
189 | :param upper_thresh: Represents the level dividing events B & C
190 | """
191 | # counts frequency of sub-events
192 | count_list = [[0, 0, 0], # a_count, b_count, c_count
193 | [0, 0, 0], # d_count, e_count, f_count
194 | [0, 0, 0]] # g_count, h_count, i_count
195 |
196 | new_df = adjusted_df['Percentage Change'].dropna().to_frame()
197 | new_df = new_df.set_index(np.arange(new_df.size, 0, -1))
198 |
199 | for index, pct in new_df['Percentage Change'].iteritems():
200 | if index == 1: # prevents program from calculating a future probability
201 | break
202 | if pct <= lower_thresh:
203 |
204 | if new_df['Percentage Change'][index-1] <= lower_thresh:
205 | count_list[0][0] += 1 # increment a_count
206 | elif lower_thresh < new_df['Percentage Change'][index-1] < upper_thresh:
207 | count_list[0][1] += 1 # increment b_count
208 | else:
209 | count_list[0][2] += 1 # increment c_count
210 |
211 | elif lower_thresh < pct < upper_thresh:
212 |
213 | if new_df['Percentage Change'][index-1] <= lower_thresh:
214 | count_list[1][0] += 1 # increment d_count
215 | elif lower_thresh < new_df['Percentage Change'][index-1] < upper_thresh:
216 | count_list[1][1] += 1 # increment e_count
217 | else:
218 | count_list[1][2] += 1 # increment f_count
219 |
220 | else:
221 |
222 | if new_df['Percentage Change'][index-1] <= lower_thresh:
223 | count_list[2][0] += 1 # increment g_count
224 | elif lower_thresh < new_df['Percentage Change'][index-1] < upper_thresh:
225 | count_list[2][1] += 1 # increment h_count
226 | else:
227 | count_list[2][2] += 1 # increment i_count
228 |
229 | prob_list = [[count / sum(group) for count in group] for group in count_list]
230 | for group in prob_list:
231 | print(group)
232 | lower_thresh_str = "{:.2f}%".format(lower_thresh)
233 | upper_thresh_str = "{:.2f}%".format(upper_thresh)
234 | for i in range(3):
235 | if i == 0:
236 | part_1_summary = "\nIf the security falls below {lower_thresh} in one period (bearish),"
237 | elif i == 1:
238 | part_1_summary = "\nIf the security changes between {lower_thresh} and {upper_thresh} in one period " \
239 | "(stagnant),"
240 | else:
241 | part_1_summary = "\nIf the security rises above {upper_thresh} in one period (bullish),"
242 |
243 | part_2_summary = "in the next period, there is a {:.3f} chance that the security will fall by more than " \
244 | "{lower_thresh} (bearish), a {:.3f} chance that the security will change somewhere between " \
245 | "{lower_thresh} and {upper_thresh} (stagnant), and a {:.3f} chance that the security will " \
246 | "rise by more than {upper_thresh} (bullish)."
247 | print((part_1_summary + part_2_summary).format(prob_list[i][0], prob_list[i][1], prob_list[i][2],
248 | lower_thresh=lower_thresh_str, upper_thresh=upper_thresh_str))
249 |
250 | return prob_list
251 |
252 |
253 | def random_walk_norm_pdf(adjusted_df, start=2099, num_periods=12):
254 | """
255 | This function calculates and visualizes a random walk assuming that S&P 500 data are independent of current state.
256 | Based on a basic normal distribution and a starting point, the function will predict the S&P 500
257 | Index movement for a finite number of periods. This is the most fundamental random walk and has many unrealistic
258 | assumptions, such as the data are independently and identically distributed, which is likely not true for the
259 | S&P500 Index.
260 |
261 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
262 | :param start: starting value for S&P 500 random walk
263 | :param num_periods: number of steps in the random walk process
264 |
265 | """
266 | mean, std = get_params_for_norm_dist(adjusted_df)
267 | pct_change_list = []
268 | all_walks = [] # will hold all the random walk data
269 | for i in range(100):
270 | random_walk = [start]
271 | for period in range(num_periods):
272 | # sets the step as the last element in the random walk
273 | step = random_walk[-1]
274 |
275 | # picks a random percent change from a Gaussian distribution based on historical mean and standard deviation
276 | pct_change = np.random.normal(mean, std)
277 | pct_change_list.append(pct_change)
278 |
279 | # reordering of percent change formula
280 | step = ((pct_change * step / 100) + step)
281 |
282 | random_walk.append(step)
283 | all_walks.append(random_walk)
284 | show_rand_walks(all_walks)
285 |
286 |
287 | def prob_from_bins(heights, bins):
288 | """
289 | Chooses a random bin based on the prob distribution in the histogram. Then returns a random percentage change from
290 | that bin.
291 |
292 | :param heights: heights of the histogram
293 | :param bins: left-hand edges of each bin; must have at least two values in list
294 | :return: random percentage change
295 | """
296 | np_heights = np.asarray(heights)
297 | bin_length = bins[1]-bins[0]
298 | np_area = bin_length * np_heights # sum of area is equal to 1
299 | bin_num = np.random.choice(np.arange(start=1, stop=len(bins)), p=np_area)
300 | rand_pct_change = bin_length * np.random.ranf() + bins[bin_num-1]
301 | return rand_pct_change
302 |
303 |
304 | def rand_walk_2x2_markov(adjusted_df, prob_list, num_bins=10, threshold=0, start=2099, num_periods=12):
305 | """
306 |
307 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
308 | :param prob_list: Contains a 2x2 list that holds the probabilities from a Markov chain with two states
309 | :param num_bins: Specifies number of bins in the histogram distribution. The more bins, the more realistic
310 | the probability distribution will be
311 | :param threshold: Represents the level dividing events A (change >= threshold) & B (change < threshold)
312 | :param start: starting value for S&P 500 random walk
313 | :param num_periods: number of steps in the random walk process
314 | """
315 |
316 | pct_change_array = np.array(adjusted_df["Percentage Change"].dropna())
317 | pct_above_threshold_array = pct_change_array[pct_change_array >= threshold]
318 | pct_below_threshold_array = pct_change_array[pct_change_array < threshold]
319 | n_above, bins_above, patches_above = plt.hist(pct_above_threshold_array, bins=num_bins, normed=True)
320 | n_below, bins_below, patches_below = plt.hist(pct_below_threshold_array, bins=num_bins, normed=True)
321 |
322 | # First percentage change is determined from a basic normal distribution. Every following period is based on the
323 | # percentage change of the previous period
324 | pct_change_list = []
325 | all_walks = [] # will hold all the random walk data
326 | for i in range(1000):
327 | mean, std = get_params_for_norm_dist(adjusted_df)
328 | first_pct_change = np.random.normal(mean, std)
329 | pct_change_list.append(first_pct_change)
330 | first_step = ((first_pct_change * start / 100) + start)
331 | random_walk = [start, first_step]
332 |
333 | for period in range(num_periods):
334 | step = random_walk[-1]
335 | prev_pct_change = pct_change_list[-1]
336 |
337 | # random number used to test whether event A will occur or event B will occur
338 | rand_prob = np.random.random_sample()
339 | if prev_pct_change >= threshold: # If true, event A occurred
340 | # prob_list[0][0] is probability that another event A will occur, given event A has happened already
341 | if rand_prob <= prob_list[0][0]: # If true, A then A
342 | pct_change = prob_from_bins(n_above, bins_above)
343 | else: # If true, A then B
344 | pct_change = prob_from_bins(n_below, bins_below)
345 | else: # If true, event B occurred
346 | # prob_list[1][0] is probability that event A will occur, given event B has happened already
347 | if rand_prob <= prob_list[1][0]: # If true, B then A
348 | pct_change = prob_from_bins(n_above, bins_above)
349 | else: # If true, B then B
350 | pct_change = prob_from_bins(n_below, bins_below)
351 |
352 | pct_change_list.append(pct_change)
353 |
354 | step = ((pct_change * step / 100) + step)
355 |
356 | random_walk.append(step)
357 | all_walks.append(random_walk)
358 | show_rand_walks(all_walks)
359 |
360 |
361 | def rand_walk_3x3_markov(adjusted_df, prob_list, num_bins=10, lower_thresh=-1, upper_thresh=1, start=2099,
362 | num_periods=12):
363 | """
364 |
365 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
366 | :param prob_list: Contains a 2x2 list that holds the probabilities from a Markov chain with two states
367 | :param num_bins: Specifies number of bins in the histogram distribution. The more bins, the more realistic
368 | the probability distribution will be
369 | :param lower_thresh: Represents the level dividing events A (pct change < lower thresh) & B(lower thresh <=
370 | pct change < upper thresh)
371 | :param upper_thresh: Represents the level dividing events B (lower thresh < pct change < upper thresh) &
372 | C(upper thresh < pct change)
373 | :param start: starting value for S&P 500 random walk
374 | :param num_periods: number of steps in the random walk process
375 | """
376 |
377 | pct_change_array = np.array(adjusted_df["Percentage Change"].dropna())
378 | pct_above_array = pct_change_array[pct_change_array >= upper_thresh]
379 | pct_between_array = pct_change_array[np.logical_and(pct_change_array > lower_thresh,
380 | pct_change_array < upper_thresh)]
381 | pct_below_array = pct_change_array[pct_change_array <= lower_thresh]
382 | n_above, bins_above, patches_above = plt.hist(pct_above_array, bins=num_bins, normed=True)
383 | n_between, bins_between, patches_between = plt.hist(pct_between_array, bins=num_bins, normed=True)
384 | n_below, bins_below, patches_below = plt.hist(pct_below_array, bins=num_bins, normed=True)
385 |
386 | # First percentage change is determined from a basic normal distribution. Every following period is based on the
387 | # percentage change of the previous period
388 | pct_change_list = []
389 | all_walks = [] # will hold all the random walk data
390 | for i in range(1000):
391 | mean, std = get_params_for_norm_dist(adjusted_df)
392 | first_pct_change = np.random.normal(mean, std)
393 | pct_change_list.append(first_pct_change)
394 | first_step = ((first_pct_change * start / 100) + start)
395 | random_walk = [start, first_step]
396 |
397 | for period in range(num_periods):
398 | step = random_walk[-1]
399 | prev_pct_change = pct_change_list[-1]
400 |
401 | # random number used to test whether event A will occur or event B will occur
402 | rand_prob = np.random.random_sample()
403 | if prev_pct_change <= lower_thresh: # If true, event A occurred
404 | # prob_list[0][0] is probability that another event A will occur, given event A has happened already
405 | if 0 < rand_prob <= prob_list[0][0]: # If true, A then A
406 | pct_change = prob_from_bins(n_below, bins_below)
407 | elif prob_list[0][0] < rand_prob < (prob_list[0][0] + prob_list[0][1]): # If true, A then B
408 | pct_change = prob_from_bins(n_between, bins_between)
409 | else: # If true, A then C
410 | pct_change = prob_from_bins(n_above, bins_above)
411 |
412 | elif lower_thresh < prev_pct_change < upper_thresh: # If true, event B occurred
413 | # prob_list[1][0] is probability that event A will occur, given event B has happened already
414 | if 0 < rand_prob <= prob_list[1][0]: # If true, B then A
415 | pct_change = prob_from_bins(n_below, bins_below)
416 | elif prob_list[1][0] < rand_prob < (prob_list[1][0] + prob_list[1][1]): # If true, B then B
417 | pct_change = prob_from_bins(n_between, bins_between)
418 | else: # If true, B then C
419 | pct_change = prob_from_bins(n_above, bins_above)
420 |
421 | else: # If true, event C occurred
422 | # prob_list[2][0] is probability that event A will occur, given event C has happened already
423 | if 0 < rand_prob <= prob_list[2][0]: # If true, C then A
424 | pct_change = prob_from_bins(n_below, bins_below)
425 | elif prob_list[2][0] < rand_prob < (prob_list[2][0] + prob_list[2][1]): # If true, C then B
426 | pct_change = prob_from_bins(n_between, bins_between)
427 | else: # If true, C then C
428 | pct_change = prob_from_bins(n_above, bins_above)
429 |
430 | pct_change_list.append(pct_change)
431 |
432 | step = ((pct_change * step / 100) + step)
433 |
434 | random_walk.append(step)
435 | all_walks.append(random_walk)
436 | show_rand_walks(all_walks)
437 |
438 |
439 | def show_rand_walks(all_walks):
440 | """
441 | Visualizes all random walks as a plot and distribution.
442 |
443 | :param all_walks: list of all random walks conducted
444 | """
445 | np_aw = np.array(all_walks) # converts the list of all random walks to a Numpy Array
446 | np_aw_t = np.transpose(np_aw) # must transpose the array for graph to display properly
447 | plt.clf()
448 | plt.plot(np_aw_t)
449 | plt.xlabel("Steps")
450 | plt.ylabel("S&P 500 Index Value")
451 | plt.title("All Random Walks of the S&P 500 Index")
452 | plt.show()
453 |
454 | # Select last row from np_aw_t: ends
455 | ends = np_aw_t[-1]
456 |
457 | # Plot histogram of ends, display plot
458 | n, bins, patches = plt.hist(ends, bins=25, normed=True)
459 | plt.xlabel("Final S&P 500 Index Value at end of period.")
460 | plt.ylabel("Frequency")
461 | rand_mean = ends.mean()
462 | rand_std = ends.std()
463 |
464 | plt.title("Distribution of Random Walk Final Values. Mean is %d and Standard Deviation is %d"
465 | % (rand_mean, rand_std), y=1.03)
466 | for num_std_from_mean in range(-3, 4):
467 | plt.axvline(rand_mean + rand_std * num_std_from_mean)
468 | bincenters = 0.5*(bins[1:]+bins[:-1])
469 | # plots the normal pdf of best fit
470 | y = mlab.normpdf(bincenters, rand_mean, rand_std)
471 | plt.plot(bincenters, y, 'r--', linewidth=3)
472 | plt.show()
473 |
474 |
475 | markov_df = get_data()
476 | #print(type(markov_df))
477 | #print(markov_df)
478 | percent_change_as_time_plot(markov_df)
479 | percent_change_as_hist(markov_df)
480 | prob_matrix = percent_change_prob_2x2(markov_df)
481 | prob_matrix2 = percent_change_prob_3x3(markov_df, lower_thresh= -1, upper_thresh= 1)
482 | rand_walk_2x2_markov(markov_df, prob_list=prob_matrix)
483 | rand_walk_3x3_markov(markov_df, prob_matrix2, lower_thresh= -1, upper_thresh= 1)
484 | #random_walk_norm_pdf(markov_df, num_periods=12)
485 |
--------------------------------------------------------------------------------
/markov_stock_forecasting_model_v3.py:
--------------------------------------------------------------------------------
1 | """
2 | @author: Nikhil Bhaip
3 | @version: 3.0
4 | @since: 8/09/17
5 |
6 | The markov_stock analysis program implements an algorithm that finds the percentage change in a security based on
7 | historical weekly data from Yahoo Finance and visualizes the information as a time series plot in matplotlib. The
8 | program also creates a Markov chain model in which the states are bull market, bear market, and stagnant market.
9 | Using the probabilities associated with this Markov chain model, the program will model the estimated growth of the
10 | security price through a random walk. This program can be used as a tool to analyze securities, like stocks and
11 | indexes, as well as study the state of the market for a wide number of applications including options and technical
12 | analysis.
13 |
14 |
15 | """
16 | import datetime as dt
17 | import matplotlib.pyplot as plt
18 | from matplotlib.dates import MonthLocator, DateFormatter
19 | from matplotlib.ticker import MultipleLocator
20 | import matplotlib.mlab as mlab
21 | import numpy as np
22 | import pandas_datareader.data as web
23 | import sys
24 |
25 |
26 | class SecurityInfo:
27 | """
28 | Holds information about a security (stock, index) to be used when retrieving data from Quandl and accessing
29 | information for other functions within the program.
30 |
31 | """
32 | def __init__(self, name, start, end):
33 | self.name = name
34 | try:
35 | dt.datetime.strptime(start, '%Y-%m-%d')
36 | dt.datetime.strptime(end, '%Y-%m-%d')
37 | self.start = start
38 | self.end = end
39 | except ValueError:
40 | raise ValueError("Incorrect data format, should be YYYY-MM-DD")
41 |
42 | def summary(self):
43 | print("Name: " + self.name)
44 | print("Starting Date: " + self.start)
45 | print("Ending Date: " + self.end)
46 |
47 | def valid_date(self, new_date):
48 | try:
49 | dt.datetime.strptime(new_date, '%Y-%m-%d')
50 | return True
51 | except ValueError:
52 | raise ValueError("Incorrect data format, should be YYYY-MM-DD")
53 |
54 | def set_name(self, new_name):
55 | self.name = new_name
56 |
57 | def get_name(self):
58 | return self.name
59 |
60 | def set_start(self, new_start):
61 | if self.valid_date(new_start):
62 | self.start = new_start
63 |
64 | def get_start(self):
65 | return self.start
66 |
67 | def set_end(self, new_end):
68 | if self.valid_date(new_end):
69 | self.end = new_end
70 |
71 | def get_end(self):
72 | return self.end
73 |
74 |
75 | def get_data(security):
76 | """
77 | This function obtains data through an API called pandas_datareader and returns the following information as a Pandas
78 | DataFrame: date, adjusted closing, and percentage change in adjusted closing from the last week.
79 |
80 | :param security: Holds information about the requested security
81 | :return: A Pandas DataFrame with columns: Date, Adj Close, and Percentage Change.
82 | """
83 | name = security.get_name()
84 | start = security.get_start()
85 | end = security.get_end()
86 | try:
87 | raw_df = web.DataReader(name, 'yahoo', start, end)
88 | except Exception as e:
89 | print(e)
90 | sys.exit(1)
91 | adjusted_df = raw_df.ix[:, ['Adj Close']]
92 | adjusted_df["Percentage Change"] = adjusted_df['Adj Close'].pct_change() * 100
93 | return adjusted_df
94 |
95 |
96 | def percent_change_as_time_plot(adjusted_df, security):
97 | """
98 | This function visualizes the percentage change data as a time series plot.
99 |
100 | :param adjusted_df: Pandas DataFrame with columns: Date, Adj Close, and Percentage Change.
101 | :param security: Holds information about the requested security
102 | """
103 |
104 | pct_change_list = adjusted_df['Percentage Change'].tolist()
105 | date_list = adjusted_df.index.values
106 | fig, ax = plt.subplots()
107 | ax.plot(date_list, pct_change_list)
108 | plt.xlabel("Dates")
109 | plt.ylabel("Percentage change from last period")
110 | plt.title("Percentage change in " + security.get_name() + " data", y=1.03)
111 | ax.xaxis.set_minor_locator(MonthLocator())
112 | ax.yaxis.set_minor_locator(MultipleLocator(1))
113 | ax.fmt_xdata = DateFormatter('%Y-%m-%d')
114 | ax.autoscale_view()
115 | fig.autofmt_xdate()
116 |
117 | plt.show()
118 |
119 |
120 | def get_params_for_norm_dist(adjusted_df):
121 | """
122 | This function returns the mean and standard deviation in the percentage change column of a DataFrame.
123 | :param adjusted_df: must have 'Percentage Change' column
124 |
125 | :returns mean and standard deviation of the percentage change column
126 | """
127 | mean = adjusted_df["Percentage Change"].mean()
128 | std = adjusted_df["Percentage Change"].std()
129 | return mean, std
130 |
131 |
132 | def percent_change_as_hist(adjusted_df, security):
133 | """
134 | This function visualizes the percentage change data as a histogram. The graph is also fitted to a normal
135 | distribution curve.
136 |
137 | :param security: Holds information about the requested security
138 | :param adjusted_df: Pandas DataFrame with columns: Date, Adj Close, and Percentage Change.
139 | """
140 | pct_change_list = adjusted_df['Percentage Change']
141 |
142 | # Code below removes the NaN value and plots the histogram. Bins are left adjusted right now, so when plotting the
143 | # normal distribution function, we must adjust it to be based off the center (average) of the bins.
144 | n, bins, patches = plt.hist(pct_change_list.dropna(), bins=25, normed=True)
145 | bincenters = 0.5*(bins[1:]+bins[:-1])
146 |
147 | plt.xlabel("Percentage change")
148 | plt.ylabel("Frequency")
149 | mean, std = get_params_for_norm_dist(adjusted_df)
150 | plt.title("Distribution of percentage change in " + security.get_name() + " Mu: %.3f, Sigma: %.3f"
151 | % (mean, std), y=1.03)
152 |
153 | # adds vertical lines to the graph corresponding to the x's that represent the number of deviations from the mean
154 | for num_std_from_mean in range(-3, 4):
155 | plt.axvline(mean + std * num_std_from_mean)
156 |
157 | # plots the normal pdf of best fit
158 | y = mlab.normpdf(bincenters, mean, std)
159 | plt.plot(bincenters, y, 'r--', linewidth=1)
160 |
161 | plt.show()
162 |
163 | def getLastClosingDate(adjusted_df):
164 | return adjusted_df['Adj Close'].iloc[-1]
165 |
166 | def percent_change_prob_2x2(adjusted_df, security, threshold=0.0):
167 | """
168 | This function finds the probabilities associated with the Markov chain of the percentage change column. There are
169 | two states: % change greater than or equal to a threshold (A) and % change less than a threshold (B). The threshold
170 | is defaulted to zero, so that the states are roughly divided into positive and negative changes. The four
171 | probabilities are: a (A | A), b (B | A) , c (A | B) , d (B | B). By definition, the sum of the rows in the right
172 | stochastic transition matrix must add up to 1: (a + b = 1 and c + d = 1)
173 |
174 | A B
175 | P = A a b
176 | B c d
177 |
178 | :param adjusted_df: Pandas DataFrame with columns: Date, Adj Close, and Percentage Change.
179 | :param security: Holds information about the requested security
180 | :param threshold: Represents the level dividing events A (change >= threshold) & B (change < threshold).
181 | """
182 | count_list = [[0, 0], # a_count, b_count,
183 | [0, 0]] # c_count, d_count
184 |
185 | new_df = adjusted_df['Percentage Change'].dropna().to_frame()
186 | new_df = new_df.set_index(np.arange(new_df.size, 0, -1))
187 |
188 | for index, pct in new_df['Percentage Change'].iteritems():
189 | if index == 1: # prevents program from calculating a future probability
190 | break
191 | if pct >= threshold:
192 | if new_df['Percentage Change'][index-1] >= threshold:
193 | count_list[0][0] += 1 # event A occurred, then event A occurred
194 | else:
195 | count_list[0][1] += 1 # event A occurred, then event B occurred
196 | else:
197 | if new_df['Percentage Change'][index-1] >= threshold:
198 | count_list[1][0] += 1 # event B occurred, then event A occurred
199 | else:
200 | count_list[1][1] += 1 # event B occurred, then event B occurred
201 |
202 | prob_list = [[count / sum(group) for count in group] for group in count_list]
203 |
204 | print(prob_list, "\n")
205 | thresh_str = "{:.2f}%".format(threshold)
206 | for i in range(2):
207 | if i == 0:
208 | part_1_summary = "\nIf " + security.get_name() + " rises above {thresh} in one period, "
209 | else:
210 | part_1_summary = "\nIf " + security.get_name() + " falls below {thresh} in one period, "
211 | part_2_summary = "in the next period, there is a {:.2f} chance that the security will rise above {thresh} " \
212 | "and a {:.2f} chance that it will fall below this threshold."
213 | print((part_1_summary + part_2_summary).format(prob_list[i][0], prob_list[i][1],
214 | thresh=thresh_str))
215 | return prob_list
216 |
217 |
218 | def percent_change_prob_3x3(adjusted_df, security, lower_thresh=-1.0, upper_thresh=1.0):
219 | """
220 | This function finds the probabilities associated with the Markov chain of the percentage change column. There are
221 | three states: % change less than or equal to a lower threshold (A), % change between the upper and lower
222 | thresholds (B)and % change greater than or equal to an upper threshold (C). The lower threshold is defaulted to -1,
223 | and the upper threshold is defaulted to +1. Percentage changes below the lower threshold may be considered bearish,
224 | in between the two thresholds considered stagnant, and above the threshold considered bullish. The nine
225 | probabilities are: a P(A | A), b (B | A) , c (C | A) , d (A | B), e (B | B), f (C | B), g (A | C), h (B | C), and
226 | i (C | C). The sum of the rows in the matrix must add up to 1: (a + b + c = 1 and d + e + f = 1 and g + h + i = 1)
227 |
228 | A B C
229 | P = A a b c
230 | B d e f
231 | C g h i
232 |
233 | See percent_change_prob_2x2 for more details
234 |
235 | :param adjusted_df: Pandas DataFrame with columns: Date, Adj Close, and Percentage Change.
236 | :param security: Holds information about the requested security
237 | :param lower_thresh: Represents the level dividing events A & B
238 | :param upper_thresh: Represents the level dividing events A & B
239 | """
240 | # counts frequency of sub-events
241 | count_list = [[0, 0, 0], # a_count, b_count, c_count
242 | [0, 0, 0], # d_count, e_count, f_count
243 | [0, 0, 0]] # g_count, h_count, i_count
244 |
245 | new_df = adjusted_df['Percentage Change'].dropna().to_frame()
246 | new_df = new_df.set_index(np.arange(new_df.size, 0, -1))
247 |
248 | for index, pct in new_df['Percentage Change'].iteritems():
249 | if index == 1: # prevents program from calculating a future probability
250 | break
251 | if pct <= lower_thresh:
252 |
253 | if new_df['Percentage Change'][index-1] <= lower_thresh:
254 | count_list[0][0] += 1 # increment a_count
255 | elif lower_thresh < new_df['Percentage Change'][index-1] < upper_thresh:
256 | count_list[0][1] += 1 # increment b_count
257 | else:
258 | count_list[0][2] += 1 # increment c_count
259 |
260 | elif lower_thresh < pct < upper_thresh:
261 |
262 | if new_df['Percentage Change'][index-1] <= lower_thresh:
263 | count_list[1][0] += 1 # increment d_count
264 | elif lower_thresh < new_df['Percentage Change'][index-1] < upper_thresh:
265 | count_list[1][1] += 1 # increment e_count
266 | else:
267 | count_list[1][2] += 1 # increment f_count
268 |
269 | else:
270 |
271 | if new_df['Percentage Change'][index-1] <= lower_thresh:
272 | count_list[2][0] += 1 # increment g_count
273 | elif lower_thresh < new_df['Percentage Change'][index-1] < upper_thresh:
274 | count_list[2][1] += 1 # increment h_count
275 | else:
276 | count_list[2][2] += 1 # increment i_count
277 |
278 | prob_list = [[count / sum(group) for count in group] for group in count_list]
279 | for group in prob_list:
280 | print(group)
281 | lower_thresh_str = "{:.2f}%".format(lower_thresh)
282 | upper_thresh_str = "{:.2f}%".format(upper_thresh)
283 | for i in range(3):
284 | part_1_summary = ""
285 | if i == 0:
286 | part_1_summary = "\nIf " + security.get_name() + " falls below {lower_thresh} in one period (bearish),"
287 | elif i ==1:
288 | part_1_summary = "\nIf " + security.get_name() + " changes between {lower_thresh} and {upper_thresh} in " \
289 | "one period (stagnant),"
290 | else:
291 | part_1_summary = "\nIf " + security.get_name() + " rises above {upper_thresh} in one period (bullish),"
292 |
293 | part_2_summary = "in the next period, there is a {:.3f} chance that the security will fall by more than " \
294 | "{lower_thresh} (bearish), a {:.3f} chance that the security will change somewhere between " \
295 | "{lower_thresh} and {upper_thresh} (stagnant), and a {:.3f} chance that the security will " \
296 | "rise by more than {upper_thresh} (bullish)."
297 | print((part_1_summary + part_2_summary).format(prob_list[i][0], prob_list[i][1], prob_list[i][2],
298 | lower_thresh=lower_thresh_str, upper_thresh=upper_thresh_str))
299 |
300 | return prob_list
301 |
302 |
303 | def random_walk_norm_pdf(adjusted_df, num_periods=12):
304 | """
305 | This function calculates and visualizes a random walk assuming that security price data are independent of current state.
306 | Based on a basic normal distribution and a starting point, the function will predict the security price
307 | Index movement for a finite number of periods. This is the most fundamental random walk and has many unrealistic
308 | assumptions, such as the data are independently and identically distributed, which is very likely not true for the
309 | S&P500 Index.
310 |
311 | :param adjusted_df: Pandas DataFrame with columns: Date, Adj Close, and Percentage Change.
312 | :param start: starting value for security price random walk
313 | :param num_periods: number of steps in the random walk process
314 |
315 | """
316 | mean, std = get_params_for_norm_dist(adjusted_df)
317 | pct_change_list = []
318 | all_walks = [] # will hold all the random walk data
319 | for i in range(100):
320 | random_walk = [getLastClosingDate(adjusted_df)] # walk starts with the last closing date
321 | for period in range(num_periods):
322 | # sets the step as the last element in the random walk
323 | step = random_walk[-1]
324 |
325 | # picks a random percent change from a Gaussian distribution based on historical mean and standard deviation
326 | pct_change = np.random.normal(mean, std)
327 | pct_change_list.append(pct_change)
328 |
329 | # reordering of percent change formula
330 | step = ((pct_change * step / 100) + step)
331 |
332 | random_walk.append(step)
333 | all_walks.append(random_walk)
334 | show_rand_walks(all_walks)
335 |
336 |
337 | def prob_from_bins(heights, bins):
338 | """
339 | Chooses a random bin based on the prob distribution in the histogram. Then returns a random percentage change from
340 | that bin.
341 |
342 | :param heights: heights of the histogram
343 | :param bins: left-hand edges of each bin; must have at least two values in list
344 | :return: random percentage change
345 | """
346 | np_heights = np.asarray(heights)
347 | bin_length = bins[1]-bins[0]
348 | np_area = bin_length * np_heights # sum of area is equal to 1
349 | bin_num = np.random.choice(np.arange(start=1, stop=len(bins)), p=np_area)
350 | rand_pct_change = bin_length * np.random.ranf() + bins[bin_num-1]
351 | return rand_pct_change
352 |
353 |
354 | def rand_walk_2x2_markov(adjusted_df, prob_list, security, num_bins=10, threshold=0.0, num_periods=12):
355 | """
356 |
357 |
358 | :param adjusted_df: Pandas DataFrame with columns: Date, Adj Close, and Percentage Change.
359 | :param prob_list: Contains a 2x2 list that holds the probabilities from a Markov chain with two states
360 | :param security: Holds information about the requested security
361 | :param num_bins: Specifies number of bins in the histogram distribution. The more bins, the more realistic
362 | the probability distribution will be
363 | :param threshold: Represents the level dividing events A (change >= threshold) & B (change < threshold)
364 | :param start: starting value for security price random walk
365 | :param num_periods: number of steps in the random walk process
366 | """
367 |
368 | pct_change_array = np.array(adjusted_df["Percentage Change"].dropna())
369 | pct_above_threshold_array = pct_change_array[pct_change_array >= threshold]
370 | pct_below_threshold_array = pct_change_array[pct_change_array < threshold]
371 | n_above, bins_above, patches_above = plt.hist(pct_above_threshold_array, bins=num_bins, normed=True)
372 | n_below, bins_below, patches_below = plt.hist(pct_below_threshold_array, bins=num_bins, normed=True)
373 |
374 | # First percentage change is determined from a basic normal distribution. Every following period is based on the
375 | # percentage change of the previous period
376 | pct_change_list = []
377 | all_walks = [] # will hold all the random walk data
378 | start=getLastClosingDate(adjusted_df)
379 | for i in range(100):
380 | mean, std = get_params_for_norm_dist(adjusted_df)
381 | first_pct_change = np.random.normal(mean, std)
382 | pct_change_list.append(first_pct_change)
383 | first_step = ((first_pct_change * start / 100) + start)
384 | random_walk = [start, first_step]
385 |
386 | for period in range(num_periods):
387 | step = random_walk[-1]
388 | prev_pct_change = pct_change_list[-1]
389 |
390 | # random number used to test whether event A will occur or event B will occur
391 | rand_prob = np.random.random_sample()
392 | if prev_pct_change >= threshold: # If true, event A occurred
393 | # prob_list[0][0] is probability that another event A will occur, given event A has happened already
394 | if rand_prob <= prob_list[0][0]: # If true, A then A
395 | pct_change = prob_from_bins(n_above, bins_above)
396 | else: # If true, A then B
397 | pct_change = prob_from_bins(n_below, bins_below)
398 | else: # If true, event B occurred
399 | # prob_list[1][0] is probability that event A will occur, given event B has happened already
400 | if rand_prob <= prob_list[1][0]: # If true, B then A
401 | pct_change = prob_from_bins(n_above, bins_above)
402 | else: # If true, B then B
403 | pct_change = prob_from_bins(n_below, bins_below)
404 |
405 | pct_change_list.append(pct_change)
406 |
407 | step = ((pct_change * step / 100) + step)
408 |
409 | random_walk.append(step)
410 | all_walks.append(random_walk)
411 | show_rand_walks(all_walks, security)
412 |
413 |
414 | def rand_walk_3x3_markov(adjusted_df, prob_list, security, num_bins=10, lower_thresh=-1.0, upper_thresh=1.0,
415 | num_periods=12):
416 | """
417 |
418 | :param adjusted_df: Pandas DataFrame with columns: Date, Adj Close, and Percentage Change.
419 | :param prob_list: Contains a 2x2 list that holds the probabilities from a Markov chain with two states
420 | :param security: Holds information about the requested security
421 | :param num_bins: Specifies number of bins in the histogram distribution. The more bins, the more realistic
422 | the probability distribution will be
423 | :param lower_thresh: Represents the level dividing events A (pct change < lower thresh) & B(lower thresh <=
424 | pct change < upper thresh)
425 | :param upper_thresh: Represents the level dividing events B (lower thresh < pct change < upper thresh) &
426 | C(upper thresh < pct change)
427 | :param start: starting value for security price random walk
428 | :param num_periods: number of steps in the random walk process
429 | """
430 |
431 | pct_change_array = np.array(adjusted_df["Percentage Change"].dropna())
432 | pct_above_array = pct_change_array[pct_change_array >= upper_thresh]
433 | pct_between_array = pct_change_array[np.logical_and(pct_change_array > lower_thresh,
434 | pct_change_array < upper_thresh)]
435 | pct_below_array = pct_change_array[pct_change_array <= lower_thresh]
436 | n_above, bins_above, patches_above = plt.hist(pct_above_array, bins=num_bins, normed=True)
437 | n_between, bins_between, patches_between = plt.hist(pct_between_array, bins=num_bins, normed=True)
438 | n_below, bins_below, patches_below = plt.hist(pct_below_array, bins=num_bins, normed=True)
439 |
440 | # First percentage change is determined from a basic normal distribution. Every following period is based on the
441 | # percentage change of the previous period
442 | pct_change_list = []
443 | all_walks = [] # will hold all the random walk data
444 | start = getLastClosingDate(adjusted_df)# walk starts with the last closing date
445 | for i in range(1000):
446 | mean, std = get_params_for_norm_dist(adjusted_df)
447 | first_pct_change = np.random.normal(mean, std)
448 | pct_change_list.append(first_pct_change)
449 | first_step = ((first_pct_change * start / 100) + start)
450 | random_walk = [start, first_step]
451 |
452 | for period in range(num_periods):
453 | step = random_walk[-1]
454 | prev_pct_change = pct_change_list[-1]
455 |
456 | # random number used to test whether event A will occur or event B will occur
457 | rand_prob = np.random.random_sample()
458 | if prev_pct_change <= lower_thresh: # If true, event A occurred
459 | # prob_list[0][0] is probability that another event A will occur, given event A has happened already
460 | if 0 < rand_prob <= prob_list[0][0]: # If true, A then A
461 | pct_change = prob_from_bins(n_below, bins_below)
462 | elif prob_list[0][0] < rand_prob < (prob_list[0][0] + prob_list[0][1]): # If true, A then B
463 | pct_change = prob_from_bins(n_between, bins_between)
464 | else: # If true, A then C
465 | pct_change = prob_from_bins(n_above, bins_above)
466 |
467 | elif lower_thresh < prev_pct_change < upper_thresh: # If true, event B occurred
468 | # prob_list[1][0] is probability that event A will occur, given event B has happened already
469 | if 0 < rand_prob <= prob_list[1][0]: # If true, B then A
470 | pct_change = prob_from_bins(n_below, bins_below)
471 | elif prob_list[1][0] < rand_prob < (prob_list[1][0] + prob_list[1][1]): # If true, B then B
472 | pct_change = prob_from_bins(n_between, bins_between)
473 | else: # If true, B then C
474 | pct_change = prob_from_bins(n_above, bins_above)
475 |
476 | else: # If true, event C occurred
477 | # prob_list[2][0] is probability that event A will occur, given event C has happened already
478 | if 0 < rand_prob <= prob_list[2][0]: # If true, C then A
479 | pct_change = prob_from_bins(n_below, bins_below)
480 | elif prob_list[2][0] < rand_prob < (prob_list[2][0] + prob_list[2][1]): # If true, C then B
481 | pct_change = prob_from_bins(n_between, bins_between)
482 | else: # If true, C then C
483 | pct_change = prob_from_bins(n_above, bins_above)
484 |
485 | pct_change_list.append(pct_change)
486 |
487 | step = ((pct_change * step / 100) + step)
488 |
489 | random_walk.append(step)
490 | all_walks.append(random_walk)
491 | show_rand_walks(all_walks, security)
492 |
493 |
494 | def show_rand_walks(all_walks, security):
495 | """
496 | Visualizes all random walks as a plot and distribution.
497 |
498 | :param all_walks: list of all random walks conducted
499 | """
500 | np_aw = np.array(all_walks) # converts the list of all random walks to a Numpy Array
501 | np_aw_t = np.transpose(np_aw) # must transpose the array for graph to display properly
502 | plt.clf()
503 | plt.plot(np_aw_t)
504 | plt.xlabel("Steps")
505 | plt.ylabel("Value of " + security.get_name())
506 | plt.title("All Random Walks of " + security.get_name())
507 | plt.show()
508 |
509 | # Select last row from np_aw_t: ends
510 | ends = np_aw_t[-1]
511 |
512 | # Plot histogram of ends, display plot
513 | n, bins, patches = plt.hist(ends, bins=25, normed=True)
514 | plt.xlabel("Final Value of " + security.get_name() + " at end of period.")
515 | plt.ylabel("Frequency")
516 | rand_mean = ends.mean()
517 | rand_std = ends.std()
518 |
519 | plt.title("Distribution of Random Walk Final Values. Mean is %d and Standard Deviation is %d"
520 | % (rand_mean, rand_std), y=1.03)
521 | for num_std_from_mean in range(-3, 4):
522 | plt.axvline(rand_mean + rand_std * num_std_from_mean)
523 | bincenters = 0.5*(bins[1:]+bins[:-1])
524 | # plots the normal pdf of best fit
525 | y = mlab.normpdf(bincenters, rand_mean, rand_std)
526 | plt.plot(bincenters, y, 'r--', linewidth=3)
527 | plt.show()
528 |
529 |
530 | def main():
531 | security = SecurityInfo(name="ORBK", start="2000-03-10", end="2003-03-10")
532 | markov_df = get_data(security)
533 |
534 | matrix = percent_change_prob_3x3(markov_df, security, lower_thresh=-1, upper_thresh=1)
535 |
536 | rand_walk_3x3_markov(markov_df, matrix, security, lower_thresh=-1, upper_thresh=1)
537 |
538 | if __name__ == '__main__':
539 | main()
540 |
--------------------------------------------------------------------------------
/Old_versions/markov_stock_analysis v2-3.py:
--------------------------------------------------------------------------------
1 | """
2 | @author: Nikhil Bhaip
3 | @version: 2.3
4 | @since: 6/15/16
5 |
6 | The markov_stock analysis program implements an algorithm that finds the percentage change in a security based on
7 | historical weekly data from Yahoo Finance and visualizes the information as a time series plot in matplotlib. The
8 | program also creates a Markov chain model in which the states are bull market, bear market, and stagnant market.
9 | Using the probabilities associated with this Markov chain model, the program will predict the future S&P 500 data
10 | through a random walk. This program can be used as a tool to analyze securities, like stocks and indexes, as well as
11 | study the state of the market for a wide number of applications including options and technical analysis.
12 |
13 | The next step would be to include other newer variables like seasonality and organize/clean the code using OOP concepts.
14 |
15 | """
16 | import datetime as dt
17 | import matplotlib.pyplot as plt
18 | from matplotlib.dates import MonthLocator, DateFormatter
19 | from matplotlib.ticker import MultipleLocator
20 | import matplotlib.mlab as mlab
21 | import numpy as np
22 | import quandl
23 |
24 |
25 | class SecurityInfo:
26 | """
27 | Holds information about a security (stock, index) to be used when retrieving data from Quandl and accessing
28 | information for other functions within the program.
29 |
30 | """
31 | def __init__(self, name, start, end, period="weekly"):
32 | self.name = name
33 | try:
34 | dt.datetime.strptime(start, '%Y-%m-%d')
35 | dt.datetime.strptime(end, '%Y-%m-%d')
36 | self.start = start
37 | self.end = end
38 | except ValueError:
39 | raise ValueError("Incorrect data format, should be YYYY-MM-DD")
40 | period_list = ["none", "daily", "weekly", "monthly", "quarterly", "annual"]
41 | if period in period_list:
42 | self.period = period
43 | else:
44 | print("Invalid period format. Using default period as 'weekly'")
45 | self.period = "weekly"
46 |
47 | def summary(self):
48 | print("Name: " + self.name)
49 | print("Starting Date: " + self.start)
50 | print("Ending Date: " + self.end)
51 | print("Period: " + self.period)
52 |
53 | def valid_date(self, new_date):
54 | try:
55 | dt.datetime.strptime(new_date, '%Y-%m-%d')
56 | return True
57 | except ValueError:
58 | raise ValueError("Incorrect data format, should be YYYY-MM-DD")
59 |
60 | def set_name(self, new_name):
61 | self.name = new_name
62 |
63 | def get_name(self):
64 | return self.name
65 |
66 | def set_start(self, new_start):
67 | if self.valid_date(new_start):
68 | self.start = new_start
69 |
70 | def get_start(self):
71 | return self.start
72 |
73 | def set_end(self, new_end):
74 | if self.valid_date(new_end):
75 | self.end = new_end
76 |
77 | def get_end(self):
78 | return self.end
79 |
80 | def set_period(self, new_period):
81 | self.period = new_period
82 |
83 | def get_period(self):
84 | return self.period
85 |
86 |
87 | def get_data(security):
88 | """
89 | This function obtains data under certain parameters from Quandl and returns the following information as a Pandas
90 | DataFrame: date, adjusted closing, and percentage change in adjusted closing from the last week.
91 |
92 | :param security: Holds information about the requested security
93 | :return: A Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
94 | """
95 | quandl.ApiConfig.api_key = "7NU4-sXfczxA9fsf_C8E"
96 | name = security.get_name()
97 | start = security.get_start()
98 | end = security.get_end()
99 | period = security.get_period()
100 | raw_df = quandl.get("YAHOO/" + name, start_date=start, end_date=end, collapse=period)
101 | adjusted_df = raw_df.ix[:, ['Adjusted Close']]
102 | adjusted_df["Percentage Change"] = adjusted_df['Adjusted Close'].pct_change() * 100
103 | return adjusted_df
104 |
105 |
106 | def percent_change_as_time_plot(adjusted_df, security):
107 | """
108 | This function visualizes the percentage change data as a time series plot.
109 |
110 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
111 | :param security: Holds information about the requested security
112 | """
113 |
114 | pct_change_list = adjusted_df['Percentage Change'].tolist()
115 | date_list = adjusted_df.index.values
116 | fig, ax = plt.subplots()
117 | ax.plot(date_list, pct_change_list)
118 | plt.xlabel("Dates")
119 | plt.ylabel("Percentage change from last period")
120 | if security.get_period() == "none":
121 | plt.title("Percentage change in " + security.get_name(), y=1.03)
122 | else:
123 | plt.title("Percentage change in " + security.get_name() + " " + security.get_period() + " data", y=1.03)
124 | ax.xaxis.set_minor_locator(MonthLocator())
125 | ax.yaxis.set_minor_locator(MultipleLocator(1))
126 | ax.fmt_xdata = DateFormatter('%Y-%m-%d')
127 | ax.autoscale_view()
128 | fig.autofmt_xdate()
129 |
130 | plt.show()
131 |
132 |
133 | def get_params_for_norm_dist(adjusted_df):
134 | """
135 | This function returns the mean and standard deviation in the percentage change column of a DataFrame.
136 | :param adjusted_df: must have 'Percentage Change' column
137 |
138 | :returns mean and standard deviation of the percentage change column
139 | """
140 | mean = adjusted_df["Percentage Change"].mean()
141 | std = adjusted_df["Percentage Change"].std()
142 | return mean, std
143 |
144 |
145 | def percent_change_as_hist(adjusted_df, security):
146 | """
147 | This function visualizes the percentage change data as a histogram. The graph is also fitted to a normal
148 | distribution curve.
149 |
150 | :param security: Holds information about the requested security
151 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
152 | """
153 | pct_change_list = adjusted_df['Percentage Change']
154 |
155 | # Code below removes the NaN value and plots the histogram. Bins are left adjusted right now, so when plotting the
156 | # normal distribution function, we must adjust it to be based off the center (average) of the bins.
157 | n, bins, patches = plt.hist(pct_change_list.dropna(), bins=25, normed=True)
158 | bincenters = 0.5*(bins[1:]+bins[:-1])
159 |
160 | plt.xlabel("Percentage change")
161 | plt.ylabel("Frequency")
162 | mean, std = get_params_for_norm_dist(adjusted_df)
163 | plt.title("Distribution of percentage change in " + security.get_name() + " Mu: %.3f, Sigma: %.3f"
164 | % (mean, std), y=1.03)
165 |
166 | # adds vertical lines to the graph corresponding to the x's that represent the number of deviations from the mean
167 | for num_std_from_mean in range(-3, 4):
168 | plt.axvline(mean + std * num_std_from_mean)
169 |
170 | # plots the normal pdf of best fit
171 | y = mlab.normpdf(bincenters, mean, std)
172 | plt.plot(bincenters, y, 'r--', linewidth=1)
173 |
174 | plt.show()
175 |
176 |
177 | def percent_change_prob_2x2(adjusted_df, security, threshold=0.0):
178 | """
179 | This function finds the probabilities associated with the Markov chain of the percentage change column. There are
180 | two states: % change greater than or equal to a threshold (A) and % change less than a threshold (B). The threshold
181 | is defaulted to zero, so that the states are roughly divided into positive and negative changes. The four
182 | probabilities are: a (A | A), b (B | A) , c (A | B) , d (B | B). By definition, the sum of the rows in the right
183 | stochastic transition matrix must add up to 1: (a + b = 1 and c + d = 1)
184 |
185 | A B
186 | P = A a b
187 | B c d
188 |
189 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
190 | :param security: Holds information about the requested security
191 | :param threshold: Represents the level dividing events A (change >= threshold) & B (change < threshold).
192 | """
193 | count_list = [[0, 0], # a_count, b_count,
194 | [0, 0]] # c_count, d_count
195 |
196 | new_df = adjusted_df['Percentage Change'].dropna().to_frame()
197 | new_df = new_df.set_index(np.arange(new_df.size, 0, -1))
198 |
199 | for index, pct in new_df['Percentage Change'].iteritems():
200 | if index == 1: # prevents program from calculating a future probability
201 | break
202 | if pct >= threshold:
203 | if new_df['Percentage Change'][index-1] >= threshold:
204 | count_list[0][0] += 1 # event A occurred, then event A occurred
205 | else:
206 | count_list[0][1] += 1 # event A occurred, then event B occurred
207 | else:
208 | if new_df['Percentage Change'][index-1] >= threshold:
209 | count_list[1][0] += 1 # event B occurred, then event A occurred
210 | else:
211 | count_list[1][1] += 1 # event B occurred, then event B occurred
212 |
213 | prob_list = [[count / sum(group) for count in group] for group in count_list]
214 |
215 | print(prob_list, "\n")
216 | thresh_str = "{:.2f}%".format(threshold)
217 | for i in range(2):
218 | if i == 0:
219 | part_1_summary = "\nIf " + security.get_name() + " rises above {thresh} in one period, "
220 | else:
221 | part_1_summary = "\nIf " + security.get_name() + " falls below {thresh} in one period, "
222 | part_2_summary = "in the next period, there is a {:.2f} chance that the security will rise above {thresh} " \
223 | "and a {:.2f} chance that it will fall below this threshold."
224 | print((part_1_summary + part_2_summary).format(prob_list[i][0], prob_list[i][1],
225 | thresh=thresh_str))
226 | return prob_list
227 |
228 |
229 | def percent_change_prob_3x3(adjusted_df, security, lower_thresh=-1.0, upper_thresh=1.0):
230 | """
231 | This function finds the probabilities associated with the Markov chain of the percentage change column. There are
232 | three states: % change less than or equal to a lower threshold (A), % change between the upper and lower
233 | thresholds (B)and % change greater than or equal to an upper threshold (C). The lower threshold is defaulted to -1,
234 | and the upper threshold is defaulted to +1. Percentage changes below the lower threshold may be considered bearish,
235 | in between the two thresholds considered stagnant, and above the threshold considered bullish. The nine
236 | probabilities are: a P(A | A), b (B | A) , c (C | A) , d (A | B), e (B | B), f (C | B), g (A | C), h (B | C), and
237 | i (C | C). The sum of the rows in the matrix must add up to 1: (a + b + c = 1 and d + e + f = 1 and g + h + i = 1)
238 |
239 | A B C
240 | P = A a b c
241 | B d e f
242 | C g h i
243 |
244 | See percent_change_prob_2x2 for more details
245 |
246 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
247 | :param security: Holds information about the requested security
248 | :param lower_thresh: Represents the level dividing events A & B
249 | :param upper_thresh: Represents the level dividing events A & B
250 | """
251 | # counts frequency of sub-events
252 | count_list = [[0, 0, 0], # a_count, b_count, c_count
253 | [0, 0, 0], # d_count, e_count, f_count
254 | [0, 0, 0]] # g_count, h_count, i_count
255 |
256 | new_df = adjusted_df['Percentage Change'].dropna().to_frame()
257 | new_df = new_df.set_index(np.arange(new_df.size, 0, -1))
258 |
259 | for index, pct in new_df['Percentage Change'].iteritems():
260 | if index == 1: # prevents program from calculating a future probability
261 | break
262 | if pct <= lower_thresh:
263 |
264 | if new_df['Percentage Change'][index-1] <= lower_thresh:
265 | count_list[0][0] += 1 # increment a_count
266 | elif lower_thresh < new_df['Percentage Change'][index-1] < upper_thresh:
267 | count_list[0][1] += 1 # increment b_count
268 | else:
269 | count_list[0][2] += 1 # increment c_count
270 |
271 | elif lower_thresh < pct < upper_thresh:
272 |
273 | if new_df['Percentage Change'][index-1] <= lower_thresh:
274 | count_list[1][0] += 1 # increment d_count
275 | elif lower_thresh < new_df['Percentage Change'][index-1] < upper_thresh:
276 | count_list[1][1] += 1 # increment e_count
277 | else:
278 | count_list[1][2] += 1 # increment f_count
279 |
280 | else:
281 |
282 | if new_df['Percentage Change'][index-1] <= lower_thresh:
283 | count_list[2][0] += 1 # increment g_count
284 | elif lower_thresh < new_df['Percentage Change'][index-1] < upper_thresh:
285 | count_list[2][1] += 1 # increment h_count
286 | else:
287 | count_list[2][2] += 1 # increment i_count
288 |
289 | prob_list = [[count / sum(group) for count in group] for group in count_list]
290 | for group in prob_list:
291 | print(group)
292 | lower_thresh_str = "{:.2f}%".format(lower_thresh)
293 | upper_thresh_str = "{:.2f}%".format(upper_thresh)
294 | for i in range(3):
295 | part_1_summary = ""
296 | if i == 0:
297 | part_1_summary = "\nIf " + security.get_name() + " falls below {lower_thresh} in one period (bearish),"
298 | elif i ==1:
299 | part_1_summary = "\nIf " + security.get_name() + " changes between {lower_thresh} and {upper_thresh} in " \
300 | "one period (stagnant),"
301 | else:
302 | part_1_summary = "\nIf " + security.get_name() + " rises above {upper_thresh} in one period (bullish),"
303 |
304 | part_2_summary = "in the next period, there is a {:.3f} chance that the security will fall by more than " \
305 | "{lower_thresh} (bearish), a {:.3f} chance that the security will change somewhere between " \
306 | "{lower_thresh} and {upper_thresh} (stagnant), and a {:.3f} chance that the security will " \
307 | "rise by more than {upper_thresh} (bullish)."
308 | print((part_1_summary + part_2_summary).format(prob_list[i][0], prob_list[i][1], prob_list[i][2],
309 | lower_thresh=lower_thresh_str, upper_thresh=upper_thresh_str))
310 |
311 | return prob_list
312 |
313 |
314 | def random_walk_norm_pdf(adjusted_df, start=2099, num_periods=12):
315 | """
316 | This function calculates and visualizes a random walk assuming that S&P 500 data are independent of current state.
317 | Based on a basic normal distribution and a starting point, the function will predict the S&P 500
318 | Index movement for a finite number of periods. This is the most fundamental random walk and has many unrealistic
319 | assumptions, such as the data are independently and identically distributed, which is likely not true for the
320 | S&P500 Index.
321 |
322 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
323 | :param start: starting value for S&P 500 random walk
324 | :param num_periods: number of steps in the random walk process
325 |
326 | """
327 | mean, std = get_params_for_norm_dist(adjusted_df)
328 | pct_change_list = []
329 | all_walks = [] # will hold all the random walk data
330 | for i in range(100):
331 | random_walk = [start]
332 | for period in range(num_periods):
333 | # sets the step as the last element in the random walk
334 | step = random_walk[-1]
335 |
336 | # picks a random percent change from a Gaussian distribution based on historical mean and standard deviation
337 | pct_change = np.random.normal(mean, std)
338 | pct_change_list.append(pct_change)
339 |
340 | # reordering of percent change formula
341 | step = ((pct_change * step / 100) + step)
342 |
343 | random_walk.append(step)
344 | all_walks.append(random_walk)
345 | show_rand_walks(all_walks)
346 |
347 |
348 | def prob_from_bins(heights, bins):
349 | """
350 | Chooses a random bin based on the prob distribution in the histogram. Then returns a random percentage change from
351 | that bin.
352 |
353 | :param heights: heights of the histogram
354 | :param bins: left-hand edges of each bin; must have at least two values in list
355 | :return: random percentage change
356 | """
357 | np_heights = np.asarray(heights)
358 | bin_length = bins[1]-bins[0]
359 | np_area = bin_length * np_heights # sum of area is equal to 1
360 | bin_num = np.random.choice(np.arange(start=1, stop=len(bins)), p=np_area)
361 | rand_pct_change = bin_length * np.random.ranf() + bins[bin_num-1]
362 | return rand_pct_change
363 |
364 |
365 | def rand_walk_2x2_markov(adjusted_df, prob_list, security, num_bins=10, threshold=0.0, start=2099.0, num_periods=12):
366 | """
367 | Divides the per
368 |
369 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
370 | :param prob_list: Contains a 2x2 list that holds the probabilities from a Markov chain with two states
371 | :param security: Holds information about the requested security
372 | :param num_bins: Specifies number of bins in the histogram distribution. The more bins, the more realistic
373 | the probability distribution will be
374 | :param threshold: Represents the level dividing events A (change >= threshold) & B (change < threshold)
375 | :param start: starting value for S&P 500 random walk
376 | :param num_periods: number of steps in the random walk process
377 | """
378 |
379 | pct_change_array = np.array(adjusted_df["Percentage Change"].dropna())
380 | pct_above_threshold_array = pct_change_array[pct_change_array >= threshold]
381 | pct_below_threshold_array = pct_change_array[pct_change_array < threshold]
382 | n_above, bins_above, patches_above = plt.hist(pct_above_threshold_array, bins=num_bins, normed=True)
383 | n_below, bins_below, patches_below = plt.hist(pct_below_threshold_array, bins=num_bins, normed=True)
384 |
385 | # First percentage change is determined from a basic normal distribution. Every following period is based on the
386 | # percentage change of the previous period
387 | pct_change_list = []
388 | all_walks = [] # will hold all the random walk data
389 | for i in range(100):
390 | mean, std = get_params_for_norm_dist(adjusted_df)
391 | first_pct_change = np.random.normal(mean, std)
392 | pct_change_list.append(first_pct_change)
393 | first_step = ((first_pct_change * start / 100) + start)
394 | random_walk = [start, first_step]
395 |
396 | for period in range(num_periods):
397 | step = random_walk[-1]
398 | prev_pct_change = pct_change_list[-1]
399 |
400 | # random number used to test whether event A will occur or event B will occur
401 | rand_prob = np.random.random_sample()
402 | if prev_pct_change >= threshold: # If true, event A occurred
403 | # prob_list[0][0] is probability that another event A will occur, given event A has happened already
404 | if rand_prob <= prob_list[0][0]: # If true, A then A
405 | pct_change = prob_from_bins(n_above, bins_above)
406 | else: # If true, A then B
407 | pct_change = prob_from_bins(n_below, bins_below)
408 | else: # If true, event B occurred
409 | # prob_list[1][0] is probability that event A will occur, given event B has happened already
410 | if rand_prob <= prob_list[1][0]: # If true, B then A
411 | pct_change = prob_from_bins(n_above, bins_above)
412 | else: # If true, B then B
413 | pct_change = prob_from_bins(n_below, bins_below)
414 |
415 | pct_change_list.append(pct_change)
416 |
417 | step = ((pct_change * step / 100) + step)
418 |
419 | random_walk.append(step)
420 | all_walks.append(random_walk)
421 | show_rand_walks(all_walks, security)
422 |
423 |
424 | def rand_walk_3x3_markov(adjusted_df, prob_list, security, num_bins=10, lower_thresh=-1.0, upper_thresh=1.0,
425 | start=2099.0, num_periods=12):
426 | """
427 |
428 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
429 | :param prob_list: Contains a 2x2 list that holds the probabilities from a Markov chain with two states
430 | :param security: Holds information about the requested security
431 | :param num_bins: Specifies number of bins in the histogram distribution. The more bins, the more realistic
432 | the probability distribution will be
433 | :param lower_thresh: Represents the level dividing events A (pct change < lower thresh) & B(lower thresh <=
434 | pct change < upper thresh)
435 | :param upper_thresh: Represents the level dividing events B (lower thresh < pct change < upper thresh) &
436 | C(upper thresh < pct change)
437 | :param start: starting value for S&P 500 random walk
438 | :param num_periods: number of steps in the random walk process
439 | """
440 |
441 | pct_change_array = np.array(adjusted_df["Percentage Change"].dropna())
442 | pct_above_array = pct_change_array[pct_change_array >= upper_thresh]
443 | pct_between_array = pct_change_array[np.logical_and(pct_change_array > lower_thresh,
444 | pct_change_array < upper_thresh)]
445 | pct_below_array = pct_change_array[pct_change_array <= lower_thresh]
446 | n_above, bins_above, patches_above = plt.hist(pct_above_array, bins=num_bins, normed=True)
447 | n_between, bins_between, patches_between = plt.hist(pct_between_array, bins=num_bins, normed=True)
448 | n_below, bins_below, patches_below = plt.hist(pct_below_array, bins=num_bins, normed=True)
449 |
450 | # First percentage change is determined from a basic normal distribution. Every following period is based on the
451 | # percentage change of the previous period
452 | pct_change_list = []
453 | all_walks = [] # will hold all the random walk data
454 | for i in range(1000):
455 | mean, std = get_params_for_norm_dist(adjusted_df)
456 | first_pct_change = np.random.normal(mean, std)
457 | pct_change_list.append(first_pct_change)
458 | first_step = ((first_pct_change * start / 100) + start)
459 | random_walk = [start, first_step]
460 |
461 | for period in range(num_periods):
462 | step = random_walk[-1]
463 | prev_pct_change = pct_change_list[-1]
464 |
465 | # random number used to test whether event A will occur or event B will occur
466 | rand_prob = np.random.random_sample()
467 | if prev_pct_change <= lower_thresh: # If true, event A occurred
468 | # prob_list[0][0] is probability that another event A will occur, given event A has happened already
469 | if 0 < rand_prob <= prob_list[0][0]: # If true, A then A
470 | pct_change = prob_from_bins(n_below, bins_below)
471 | elif prob_list[0][0] < rand_prob < (prob_list[0][0] + prob_list[0][1]): # If true, A then B
472 | pct_change = prob_from_bins(n_between, bins_between)
473 | else: # If true, A then C
474 | pct_change = prob_from_bins(n_above, bins_above)
475 |
476 | elif lower_thresh < prev_pct_change < upper_thresh: # If true, event B occurred
477 | # prob_list[1][0] is probability that event A will occur, given event B has happened already
478 | if 0 < rand_prob <= prob_list[1][0]: # If true, B then A
479 | pct_change = prob_from_bins(n_below, bins_below)
480 | elif prob_list[1][0] < rand_prob < (prob_list[1][0] + prob_list[1][1]): # If true, B then B
481 | pct_change = prob_from_bins(n_between, bins_between)
482 | else: # If true, B then C
483 | pct_change = prob_from_bins(n_above, bins_above)
484 |
485 | else: # If true, event C occurred
486 | # prob_list[2][0] is probability that event A will occur, given event C has happened already
487 | if 0 < rand_prob <= prob_list[2][0]: # If true, C then A
488 | pct_change = prob_from_bins(n_below, bins_below)
489 | elif prob_list[2][0] < rand_prob < (prob_list[2][0] + prob_list[2][1]): # If true, C then B
490 | pct_change = prob_from_bins(n_between, bins_between)
491 | else: # If true, C then C
492 | pct_change = prob_from_bins(n_above, bins_above)
493 |
494 | pct_change_list.append(pct_change)
495 |
496 | step = ((pct_change * step / 100) + step)
497 |
498 | random_walk.append(step)
499 | all_walks.append(random_walk)
500 | show_rand_walks(all_walks, security)
501 |
502 |
503 | def show_rand_walks(all_walks, security):
504 | """
505 | Visualizes all random walks as a plot and distribution.
506 |
507 | :param all_walks: list of all random walks conducted
508 | """
509 | np_aw = np.array(all_walks) # converts the list of all random walks to a Numpy Array
510 | np_aw_t = np.transpose(np_aw) # must transpose the array for graph to display properly
511 | plt.clf()
512 | plt.plot(np_aw_t)
513 | plt.xlabel("Steps")
514 | plt.ylabel("Value of " + security.get_name())
515 | plt.title("All Random Walks of " + security.get_name())
516 | plt.show()
517 |
518 | # Select last row from np_aw_t: ends
519 | ends = np_aw_t[-1]
520 |
521 | # Plot histogram of ends, display plot
522 | n, bins, patches = plt.hist(ends, bins=25, normed=True)
523 | plt.xlabel("Final Value of " + security.get_name() + " at end of period.")
524 | plt.ylabel("Frequency")
525 | rand_mean = ends.mean()
526 | rand_std = ends.std()
527 |
528 | plt.title("Distribution of Random Walk Final Values. Mean is %d and Standard Deviation is %d"
529 | % (rand_mean, rand_std), y=1.03)
530 | for num_std_from_mean in range(-3, 4):
531 | plt.axvline(rand_mean + rand_std * num_std_from_mean)
532 | bincenters = 0.5*(bins[1:]+bins[:-1])
533 | # plots the normal pdf of best fit
534 | y = mlab.normpdf(bincenters, rand_mean, rand_std)
535 | plt.plot(bincenters, y, 'r--', linewidth=3)
536 | plt.show()
537 |
538 |
539 | x = SecurityInfo(name="YELP", start="2009-06-06", end="2016-06-06", period="weekly")
540 | markov_df = get_data(x)
541 | matrix = percent_change_prob_3x3(markov_df, x, lower_thresh=-3, upper_thresh=3)
542 | rand_walk_3x3_markov(markov_df, matrix, x, lower_thresh=-3, upper_thresh=3)
543 |
--------------------------------------------------------------------------------
/Old_versions/markov_stock_forecasting_model_v2-5.py:
--------------------------------------------------------------------------------
1 | """
2 | @author: Nikhil Bhaip
3 | @version: 2.5
4 | @since: 6/15/16
5 |
6 | The markov_stock analysis program implements an algorithm that finds the percentage change in a security based on
7 | historical weekly data from Yahoo Finance and visualizes the information as a time series plot in matplotlib. The
8 | program also creates a Markov chain model in which the states are bull market, bear market, and stagnant market.
9 | Using the probabilities associated with this Markov chain model, the program will model the estimated growth of the
10 | security price through a random walk. This program can be used as a tool to analyze securities, like stocks and
11 | indexes, as well as study the state of the market for a wide number of applications including options and technical
12 | analysis.
13 |
14 | The next step would be to include other newer variables like seasonality and organize/clean the code using OOP concepts.
15 |
16 | """
17 | import datetime as dt
18 | import matplotlib.pyplot as plt
19 | from matplotlib.dates import MonthLocator, DateFormatter
20 | from matplotlib.ticker import MultipleLocator
21 | import matplotlib.mlab as mlab
22 | import numpy as np
23 | import quandl
24 |
25 |
26 |
27 | class SecurityInfo:
28 | """
29 | Holds information about a security (stock, index) to be used when retrieving data from Quandl and accessing
30 | information for other functions within the program.
31 |
32 | """
33 | def __init__(self, name, start, end, period="weekly"):
34 | self.name = name
35 | try:
36 | dt.datetime.strptime(start, '%Y-%m-%d')
37 | dt.datetime.strptime(end, '%Y-%m-%d')
38 | self.start = start
39 | self.end = end
40 | except ValueError:
41 | raise ValueError("Incorrect data format, should be YYYY-MM-DD")
42 | period_list = ["none", "daily", "weekly", "monthly", "quarterly", "annual"]
43 | if period in period_list:
44 | self.period = period
45 | else:
46 | print("Invalid period format. Using default period as 'weekly'")
47 | self.period = "weekly"
48 |
49 | def summary(self):
50 | print("Name: " + self.name)
51 | print("Starting Date: " + self.start)
52 | print("Ending Date: " + self.end)
53 | print("Period: " + self.period)
54 |
55 | def valid_date(self, new_date):
56 | try:
57 | dt.datetime.strptime(new_date, '%Y-%m-%d')
58 | return True
59 | except ValueError:
60 | raise ValueError("Incorrect data format, should be YYYY-MM-DD")
61 |
62 | def set_name(self, new_name):
63 | self.name = new_name
64 |
65 | def get_name(self):
66 | return self.name
67 |
68 | def set_start(self, new_start):
69 | if self.valid_date(new_start):
70 | self.start = new_start
71 |
72 | def get_start(self):
73 | return self.start
74 |
75 | def set_end(self, new_end):
76 | if self.valid_date(new_end):
77 | self.end = new_end
78 |
79 | def get_end(self):
80 | return self.end
81 |
82 | def set_period(self, new_period):
83 | self.period = new_period
84 |
85 | def get_period(self):
86 | return self.period
87 |
88 |
89 | def get_data(security):
90 | """
91 | This function obtains data under certain parameters from Quandl and returns the following information as a Pandas
92 | DataFrame: date, adjusted closing, and percentage change in adjusted closing from the last week.
93 |
94 | :param security: Holds information about the requested security
95 | :return: A Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
96 | """
97 | quandl.ApiConfig.api_key = "7NU4-sXfczxA9fsf_C8E"
98 | name = security.get_name()
99 | start = security.get_start()
100 | end = security.get_end()
101 | period = security.get_period()
102 | raw_df = quandl.get("YAHOO/" + name, start_date=start, end_date=end, collapse=period)
103 | adjusted_df = raw_df.ix[:, ['Adjusted Close']]
104 | adjusted_df["Percentage Change"] = adjusted_df['Adjusted Close'].pct_change() * 100
105 | return adjusted_df
106 |
107 |
108 | def percent_change_as_time_plot(adjusted_df, security):
109 | """
110 | This function visualizes the percentage change data as a time series plot.
111 |
112 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
113 | :param security: Holds information about the requested security
114 | """
115 |
116 | pct_change_list = adjusted_df['Percentage Change'].tolist()
117 | date_list = adjusted_df.index.values
118 | fig, ax = plt.subplots()
119 | ax.plot(date_list, pct_change_list)
120 | plt.xlabel("Dates")
121 | plt.ylabel("Percentage change from last period")
122 | if security.get_period() == "none":
123 | plt.title("Percentage change in " + security.get_name(), y=1.03)
124 | else:
125 | plt.title("Percentage change in " + security.get_name() + " " + security.get_period() + " data", y=1.03)
126 | ax.xaxis.set_minor_locator(MonthLocator())
127 | ax.yaxis.set_minor_locator(MultipleLocator(1))
128 | ax.fmt_xdata = DateFormatter('%Y-%m-%d')
129 | ax.autoscale_view()
130 | fig.autofmt_xdate()
131 |
132 | plt.show()
133 |
134 |
135 | def get_params_for_norm_dist(adjusted_df):
136 | """
137 | This function returns the mean and standard deviation in the percentage change column of a DataFrame.
138 | :param adjusted_df: must have 'Percentage Change' column
139 |
140 | :returns mean and standard deviation of the percentage change column
141 | """
142 | mean = adjusted_df["Percentage Change"].mean()
143 | std = adjusted_df["Percentage Change"].std()
144 | return mean, std
145 |
146 |
147 | def percent_change_as_hist(adjusted_df, security):
148 | """
149 | This function visualizes the percentage change data as a histogram. The graph is also fitted to a normal
150 | distribution curve.
151 |
152 | :param security: Holds information about the requested security
153 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
154 | """
155 | pct_change_list = adjusted_df['Percentage Change']
156 |
157 | # Code below removes the NaN value and plots the histogram. Bins are left adjusted right now, so when plotting the
158 | # normal distribution function, we must adjust it to be based off the center (average) of the bins.
159 | n, bins, patches = plt.hist(pct_change_list.dropna(), bins=25, normed=True)
160 | bincenters = 0.5*(bins[1:]+bins[:-1])
161 |
162 | plt.xlabel("Percentage change")
163 | plt.ylabel("Frequency")
164 | mean, std = get_params_for_norm_dist(adjusted_df)
165 | plt.title("Distribution of percentage change in " + security.get_name() + " Mu: %.3f, Sigma: %.3f"
166 | % (mean, std), y=1.03)
167 |
168 | # adds vertical lines to the graph corresponding to the x's that represent the number of deviations from the mean
169 | for num_std_from_mean in range(-3, 4):
170 | plt.axvline(mean + std * num_std_from_mean)
171 |
172 | # plots the normal pdf of best fit
173 | y = mlab.normpdf(bincenters, mean, std)
174 | plt.plot(bincenters, y, 'r--', linewidth=1)
175 |
176 | plt.show()
177 |
178 | def getLastClosingDate(adjusted_df):
179 | return adjusted_df['Adjusted Close'].iloc[-1]
180 |
181 | def percent_change_prob_2x2(adjusted_df, security, threshold=0.0):
182 | """
183 | This function finds the probabilities associated with the Markov chain of the percentage change column. There are
184 | two states: % change greater than or equal to a threshold (A) and % change less than a threshold (B). The threshold
185 | is defaulted to zero, so that the states are roughly divided into positive and negative changes. The four
186 | probabilities are: a (A | A), b (B | A) , c (A | B) , d (B | B). By definition, the sum of the rows in the right
187 | stochastic transition matrix must add up to 1: (a + b = 1 and c + d = 1)
188 |
189 | A B
190 | P = A a b
191 | B c d
192 |
193 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
194 | :param security: Holds information about the requested security
195 | :param threshold: Represents the level dividing events A (change >= threshold) & B (change < threshold).
196 | """
197 | count_list = [[0, 0], # a_count, b_count,
198 | [0, 0]] # c_count, d_count
199 |
200 | new_df = adjusted_df['Percentage Change'].dropna().to_frame()
201 | new_df = new_df.set_index(np.arange(new_df.size, 0, -1))
202 |
203 | for index, pct in new_df['Percentage Change'].iteritems():
204 | if index == 1: # prevents program from calculating a future probability
205 | break
206 | if pct >= threshold:
207 | if new_df['Percentage Change'][index-1] >= threshold:
208 | count_list[0][0] += 1 # event A occurred, then event A occurred
209 | else:
210 | count_list[0][1] += 1 # event A occurred, then event B occurred
211 | else:
212 | if new_df['Percentage Change'][index-1] >= threshold:
213 | count_list[1][0] += 1 # event B occurred, then event A occurred
214 | else:
215 | count_list[1][1] += 1 # event B occurred, then event B occurred
216 |
217 | prob_list = [[count / sum(group) for count in group] for group in count_list]
218 |
219 | print(prob_list, "\n")
220 | thresh_str = "{:.2f}%".format(threshold)
221 | for i in range(2):
222 | if i == 0:
223 | part_1_summary = "\nIf " + security.get_name() + " rises above {thresh} in one period, "
224 | else:
225 | part_1_summary = "\nIf " + security.get_name() + " falls below {thresh} in one period, "
226 | part_2_summary = "in the next period, there is a {:.2f} chance that the security will rise above {thresh} " \
227 | "and a {:.2f} chance that it will fall below this threshold."
228 | print((part_1_summary + part_2_summary).format(prob_list[i][0], prob_list[i][1],
229 | thresh=thresh_str))
230 | return prob_list
231 |
232 |
233 | def percent_change_prob_3x3(adjusted_df, security, lower_thresh=-1.0, upper_thresh=1.0):
234 | """
235 | This function finds the probabilities associated with the Markov chain of the percentage change column. There are
236 | three states: % change less than or equal to a lower threshold (A), % change between the upper and lower
237 | thresholds (B)and % change greater than or equal to an upper threshold (C). The lower threshold is defaulted to -1,
238 | and the upper threshold is defaulted to +1. Percentage changes below the lower threshold may be considered bearish,
239 | in between the two thresholds considered stagnant, and above the threshold considered bullish. The nine
240 | probabilities are: a P(A | A), b (B | A) , c (C | A) , d (A | B), e (B | B), f (C | B), g (A | C), h (B | C), and
241 | i (C | C). The sum of the rows in the matrix must add up to 1: (a + b + c = 1 and d + e + f = 1 and g + h + i = 1)
242 |
243 | A B C
244 | P = A a b c
245 | B d e f
246 | C g h i
247 |
248 | See percent_change_prob_2x2 for more details
249 |
250 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
251 | :param security: Holds information about the requested security
252 | :param lower_thresh: Represents the level dividing events A & B
253 | :param upper_thresh: Represents the level dividing events A & B
254 | """
255 | # counts frequency of sub-events
256 | count_list = [[0, 0, 0], # a_count, b_count, c_count
257 | [0, 0, 0], # d_count, e_count, f_count
258 | [0, 0, 0]] # g_count, h_count, i_count
259 |
260 | new_df = adjusted_df['Percentage Change'].dropna().to_frame()
261 | new_df = new_df.set_index(np.arange(new_df.size, 0, -1))
262 |
263 | for index, pct in new_df['Percentage Change'].iteritems():
264 | if index == 1: # prevents program from calculating a future probability
265 | break
266 | if pct <= lower_thresh:
267 |
268 | if new_df['Percentage Change'][index-1] <= lower_thresh:
269 | count_list[0][0] += 1 # increment a_count
270 | elif lower_thresh < new_df['Percentage Change'][index-1] < upper_thresh:
271 | count_list[0][1] += 1 # increment b_count
272 | else:
273 | count_list[0][2] += 1 # increment c_count
274 |
275 | elif lower_thresh < pct < upper_thresh:
276 |
277 | if new_df['Percentage Change'][index-1] <= lower_thresh:
278 | count_list[1][0] += 1 # increment d_count
279 | elif lower_thresh < new_df['Percentage Change'][index-1] < upper_thresh:
280 | count_list[1][1] += 1 # increment e_count
281 | else:
282 | count_list[1][2] += 1 # increment f_count
283 |
284 | else:
285 |
286 | if new_df['Percentage Change'][index-1] <= lower_thresh:
287 | count_list[2][0] += 1 # increment g_count
288 | elif lower_thresh < new_df['Percentage Change'][index-1] < upper_thresh:
289 | count_list[2][1] += 1 # increment h_count
290 | else:
291 | count_list[2][2] += 1 # increment i_count
292 |
293 | prob_list = [[count / sum(group) for count in group] for group in count_list]
294 | for group in prob_list:
295 | print(group)
296 | lower_thresh_str = "{:.2f}%".format(lower_thresh)
297 | upper_thresh_str = "{:.2f}%".format(upper_thresh)
298 | for i in range(3):
299 | part_1_summary = ""
300 | if i == 0:
301 | part_1_summary = "\nIf " + security.get_name() + " falls below {lower_thresh} in one period (bearish),"
302 | elif i ==1:
303 | part_1_summary = "\nIf " + security.get_name() + " changes between {lower_thresh} and {upper_thresh} in " \
304 | "one period (stagnant),"
305 | else:
306 | part_1_summary = "\nIf " + security.get_name() + " rises above {upper_thresh} in one period (bullish),"
307 |
308 | part_2_summary = "in the next period, there is a {:.3f} chance that the security will fall by more than " \
309 | "{lower_thresh} (bearish), a {:.3f} chance that the security will change somewhere between " \
310 | "{lower_thresh} and {upper_thresh} (stagnant), and a {:.3f} chance that the security will " \
311 | "rise by more than {upper_thresh} (bullish)."
312 | print((part_1_summary + part_2_summary).format(prob_list[i][0], prob_list[i][1], prob_list[i][2],
313 | lower_thresh=lower_thresh_str, upper_thresh=upper_thresh_str))
314 |
315 | return prob_list
316 |
317 |
318 | def random_walk_norm_pdf(adjusted_df, num_periods=12):
319 | """
320 | This function calculates and visualizes a random walk assuming that S&P 500 data are independent of current state.
321 | Based on a basic normal distribution and a starting point, the function will predict the S&P 500
322 | Index movement for a finite number of periods. This is the most fundamental random walk and has many unrealistic
323 | assumptions, such as the data are independently and identically distributed, which is likely not true for the
324 | S&P500 Index.
325 |
326 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
327 | :param start: starting value for S&P 500 random walk
328 | :param num_periods: number of steps in the random walk process
329 |
330 | """
331 | mean, std = get_params_for_norm_dist(adjusted_df)
332 | pct_change_list = []
333 | all_walks = [] # will hold all the random walk data
334 | for i in range(100):
335 | random_walk = [getLastClosingDate(adjusted_df)] # walk starts with the last closing date
336 | for period in range(num_periods):
337 | # sets the step as the last element in the random walk
338 | step = random_walk[-1]
339 |
340 | # picks a random percent change from a Gaussian distribution based on historical mean and standard deviation
341 | pct_change = np.random.normal(mean, std)
342 | pct_change_list.append(pct_change)
343 |
344 | # reordering of percent change formula
345 | step = ((pct_change * step / 100) + step)
346 |
347 | random_walk.append(step)
348 | all_walks.append(random_walk)
349 | show_rand_walks(all_walks)
350 |
351 |
352 | def prob_from_bins(heights, bins):
353 | """
354 | Chooses a random bin based on the prob distribution in the histogram. Then returns a random percentage change from
355 | that bin.
356 |
357 | :param heights: heights of the histogram
358 | :param bins: left-hand edges of each bin; must have at least two values in list
359 | :return: random percentage change
360 | """
361 | np_heights = np.asarray(heights)
362 | bin_length = bins[1]-bins[0]
363 | np_area = bin_length * np_heights # sum of area is equal to 1
364 | bin_num = np.random.choice(np.arange(start=1, stop=len(bins)), p=np_area)
365 | rand_pct_change = bin_length * np.random.ranf() + bins[bin_num-1]
366 | return rand_pct_change
367 |
368 |
369 | def rand_walk_2x2_markov(adjusted_df, prob_list, security, num_bins=10, threshold=0.0, num_periods=12):
370 | """
371 | Divides the per
372 |
373 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
374 | :param prob_list: Contains a 2x2 list that holds the probabilities from a Markov chain with two states
375 | :param security: Holds information about the requested security
376 | :param num_bins: Specifies number of bins in the histogram distribution. The more bins, the more realistic
377 | the probability distribution will be
378 | :param threshold: Represents the level dividing events A (change >= threshold) & B (change < threshold)
379 | :param start: starting value for S&P 500 random walk
380 | :param num_periods: number of steps in the random walk process
381 | """
382 |
383 | pct_change_array = np.array(adjusted_df["Percentage Change"].dropna())
384 | pct_above_threshold_array = pct_change_array[pct_change_array >= threshold]
385 | pct_below_threshold_array = pct_change_array[pct_change_array < threshold]
386 | n_above, bins_above, patches_above = plt.hist(pct_above_threshold_array, bins=num_bins, normed=True)
387 | n_below, bins_below, patches_below = plt.hist(pct_below_threshold_array, bins=num_bins, normed=True)
388 |
389 | # First percentage change is determined from a basic normal distribution. Every following period is based on the
390 | # percentage change of the previous period
391 | pct_change_list = []
392 | all_walks = [] # will hold all the random walk data
393 | start=getLastClosingDate(adjusted_df)
394 | for i in range(100):
395 | mean, std = get_params_for_norm_dist(adjusted_df)
396 | first_pct_change = np.random.normal(mean, std)
397 | pct_change_list.append(first_pct_change)
398 | first_step = ((first_pct_change * start / 100) + start)
399 | random_walk = [start, first_step]
400 |
401 | for period in range(num_periods):
402 | step = random_walk[-1]
403 | prev_pct_change = pct_change_list[-1]
404 |
405 | # random number used to test whether event A will occur or event B will occur
406 | rand_prob = np.random.random_sample()
407 | if prev_pct_change >= threshold: # If true, event A occurred
408 | # prob_list[0][0] is probability that another event A will occur, given event A has happened already
409 | if rand_prob <= prob_list[0][0]: # If true, A then A
410 | pct_change = prob_from_bins(n_above, bins_above)
411 | else: # If true, A then B
412 | pct_change = prob_from_bins(n_below, bins_below)
413 | else: # If true, event B occurred
414 | # prob_list[1][0] is probability that event A will occur, given event B has happened already
415 | if rand_prob <= prob_list[1][0]: # If true, B then A
416 | pct_change = prob_from_bins(n_above, bins_above)
417 | else: # If true, B then B
418 | pct_change = prob_from_bins(n_below, bins_below)
419 |
420 | pct_change_list.append(pct_change)
421 |
422 | step = ((pct_change * step / 100) + step)
423 |
424 | random_walk.append(step)
425 | all_walks.append(random_walk)
426 | show_rand_walks(all_walks, security)
427 |
428 |
429 | def rand_walk_3x3_markov(adjusted_df, prob_list, security, num_bins=10, lower_thresh=-1.0, upper_thresh=1.0,
430 | num_periods=12):
431 | """
432 |
433 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
434 | :param prob_list: Contains a 2x2 list that holds the probabilities from a Markov chain with two states
435 | :param security: Holds information about the requested security
436 | :param num_bins: Specifies number of bins in the histogram distribution. The more bins, the more realistic
437 | the probability distribution will be
438 | :param lower_thresh: Represents the level dividing events A (pct change < lower thresh) & B(lower thresh <=
439 | pct change < upper thresh)
440 | :param upper_thresh: Represents the level dividing events B (lower thresh < pct change < upper thresh) &
441 | C(upper thresh < pct change)
442 | :param start: starting value for S&P 500 random walk
443 | :param num_periods: number of steps in the random walk process
444 | """
445 |
446 | pct_change_array = np.array(adjusted_df["Percentage Change"].dropna())
447 | pct_above_array = pct_change_array[pct_change_array >= upper_thresh]
448 | pct_between_array = pct_change_array[np.logical_and(pct_change_array > lower_thresh,
449 | pct_change_array < upper_thresh)]
450 | pct_below_array = pct_change_array[pct_change_array <= lower_thresh]
451 | n_above, bins_above, patches_above = plt.hist(pct_above_array, bins=num_bins, normed=True)
452 | n_between, bins_between, patches_between = plt.hist(pct_between_array, bins=num_bins, normed=True)
453 | n_below, bins_below, patches_below = plt.hist(pct_below_array, bins=num_bins, normed=True)
454 |
455 | # First percentage change is determined from a basic normal distribution. Every following period is based on the
456 | # percentage change of the previous period
457 | pct_change_list = []
458 | all_walks = [] # will hold all the random walk data
459 | start = getLastClosingDate(adjusted_df)# walk starts with the last closing date
460 | for i in range(1000):
461 | mean, std = get_params_for_norm_dist(adjusted_df)
462 | first_pct_change = np.random.normal(mean, std)
463 | pct_change_list.append(first_pct_change)
464 | first_step = ((first_pct_change * start / 100) + start)
465 | random_walk = [start, first_step]
466 |
467 | for period in range(num_periods):
468 | step = random_walk[-1]
469 | prev_pct_change = pct_change_list[-1]
470 |
471 | # random number used to test whether event A will occur or event B will occur
472 | rand_prob = np.random.random_sample()
473 | if prev_pct_change <= lower_thresh: # If true, event A occurred
474 | # prob_list[0][0] is probability that another event A will occur, given event A has happened already
475 | if 0 < rand_prob <= prob_list[0][0]: # If true, A then A
476 | pct_change = prob_from_bins(n_below, bins_below)
477 | elif prob_list[0][0] < rand_prob < (prob_list[0][0] + prob_list[0][1]): # If true, A then B
478 | pct_change = prob_from_bins(n_between, bins_between)
479 | else: # If true, A then C
480 | pct_change = prob_from_bins(n_above, bins_above)
481 |
482 | elif lower_thresh < prev_pct_change < upper_thresh: # If true, event B occurred
483 | # prob_list[1][0] is probability that event A will occur, given event B has happened already
484 | if 0 < rand_prob <= prob_list[1][0]: # If true, B then A
485 | pct_change = prob_from_bins(n_below, bins_below)
486 | elif prob_list[1][0] < rand_prob < (prob_list[1][0] + prob_list[1][1]): # If true, B then B
487 | pct_change = prob_from_bins(n_between, bins_between)
488 | else: # If true, B then C
489 | pct_change = prob_from_bins(n_above, bins_above)
490 |
491 | else: # If true, event C occurred
492 | # prob_list[2][0] is probability that event A will occur, given event C has happened already
493 | if 0 < rand_prob <= prob_list[2][0]: # If true, C then A
494 | pct_change = prob_from_bins(n_below, bins_below)
495 | elif prob_list[2][0] < rand_prob < (prob_list[2][0] + prob_list[2][1]): # If true, C then B
496 | pct_change = prob_from_bins(n_between, bins_between)
497 | else: # If true, C then C
498 | pct_change = prob_from_bins(n_above, bins_above)
499 |
500 | pct_change_list.append(pct_change)
501 |
502 | step = ((pct_change * step / 100) + step)
503 |
504 | random_walk.append(step)
505 | all_walks.append(random_walk)
506 | show_rand_walks(all_walks, security)
507 |
508 |
509 | def show_rand_walks(all_walks, security):
510 | """
511 | Visualizes all random walks as a plot and distribution.
512 |
513 | :param all_walks: list of all random walks conducted
514 | """
515 | np_aw = np.array(all_walks) # converts the list of all random walks to a Numpy Array
516 | np_aw_t = np.transpose(np_aw) # must transpose the array for graph to display properly
517 | plt.clf()
518 | plt.plot(np_aw_t)
519 | plt.xlabel("Steps")
520 | plt.ylabel("Value of " + security.get_name())
521 | plt.title("All Random Walks of " + security.get_name())
522 | plt.show()
523 |
524 | # Select last row from np_aw_t: ends
525 | ends = np_aw_t[-1]
526 |
527 | # Plot histogram of ends, display plot
528 | n, bins, patches = plt.hist(ends, bins=25, normed=True)
529 | plt.xlabel("Final Value of " + security.get_name() + " at end of period.")
530 | plt.ylabel("Frequency")
531 | rand_mean = ends.mean()
532 | rand_std = ends.std()
533 |
534 | plt.title("Distribution of Random Walk Final Values. Mean is %d and Standard Deviation is %d"
535 | % (rand_mean, rand_std), y=1.03)
536 | for num_std_from_mean in range(-3, 4):
537 | plt.axvline(rand_mean + rand_std * num_std_from_mean)
538 | bincenters = 0.5*(bins[1:]+bins[:-1])
539 | # plots the normal pdf of best fit
540 | y = mlab.normpdf(bincenters, rand_mean, rand_std)
541 | plt.plot(bincenters, y, 'r--', linewidth=3)
542 | plt.show()
543 |
544 |
545 | x = SecurityInfo(name="YELP", start="2009-06-06", end="2016-06-06", period="weekly")
546 | markov_df = get_data(x)
547 |
548 | print(markov_df)
549 |
550 | matrix = percent_change_prob_3x3(markov_df, x, lower_thresh=-3, upper_thresh=3)
551 | rand_walk_3x3_markov(markov_df, matrix, x, lower_thresh=-3, upper_thresh=3)
552 |
--------------------------------------------------------------------------------
/Old_versions/markov_stock_analysis v2-4.py:
--------------------------------------------------------------------------------
1 | """
2 | @author: Nikhil Bhaip
3 | @version: 2.4
4 | @since: 6/20/16
5 |
6 | The markov_stock analysis program implements an algorithm that finds the percentage change in a security based on
7 | historical weekly data from Yahoo Finance and visualizes the information as a time series plot in matplotlib. The
8 | program also creates a Markov chain model in which the states are bull market, bear market, and stagnant market.
9 | Using the probabilities associated with this Markov chain model, the program will predict the future S&P 500 data
10 | through a random walk. This program can be used as a tool to analyze securities, like stocks and indexes, as well as
11 | study the state of the market for a wide number of applications including options and technical analysis.
12 |
13 | The next step would be to include other newer variables like seasonality.
14 |
15 | """
16 | import datetime as dt
17 | import matplotlib.pyplot as plt
18 | from matplotlib.dates import MonthLocator, DateFormatter
19 | from matplotlib.ticker import MultipleLocator
20 | import matplotlib.mlab as mlab
21 | import pandas as pd
22 | import numpy as np
23 | import quandl
24 | quandl.ApiConfig.api_key = "7NU4-sXfczxA9fsf_C8E"
25 |
26 |
27 | class SecurityInfo:
28 | """
29 | Holds information about a security (stock, index) to be used when retrieving data from Quandl and accessing
30 | information for other functions within the program.
31 |
32 | """
33 | def __init__(self, name, start, end, period="weekly"):
34 | self.name = name
35 | try:
36 | dt.datetime.strptime(start, '%Y-%m-%d')
37 | dt.datetime.strptime(end, '%Y-%m-%d')
38 | self.start = start
39 | self.end = end
40 | except ValueError:
41 | raise ValueError("Incorrect data format, should be YYYY-MM-DD")
42 | period_list = ["none", "daily", "weekly", "monthly", "quarterly", "annual"]
43 | if period in period_list:
44 | self.period = period
45 | else:
46 | print("Invalid period format. Using default period as 'weekly'")
47 | self.period = "weekly"
48 |
49 | def summary(self):
50 | print("Name: " + self.name)
51 | print("Starting Date: " + self.start)
52 | print("Ending Date: " + self.end)
53 | print("Period: " + self.period)
54 |
55 | def valid_date(self, new_date):
56 | try:
57 | dt.datetime.strptime(new_date, '%Y-%m-%d')
58 | return True
59 | except ValueError:
60 | raise ValueError("Incorrect data format, should be YYYY-MM-DD")
61 |
62 | def set_name(self, new_name):
63 | self.name = new_name
64 |
65 | def get_name(self):
66 | return self.name
67 |
68 | def set_start(self, new_start):
69 | if self.valid_date(new_start):
70 | self.start = new_start
71 |
72 | def get_start(self):
73 | return self.start
74 |
75 | def set_end(self, new_end):
76 | if self.valid_date(new_end):
77 | self.end = new_end
78 |
79 | def get_end(self):
80 | return self.end
81 |
82 | def set_period(self, new_period):
83 | self.period = new_period
84 |
85 | def get_period(self):
86 | return self.period
87 |
88 |
89 | def get_data(security):
90 | """
91 | This function obtains data under certain parameters from Quandl and returns the following information as a Pandas
92 | DataFrame: date, adjusted closing, and percentage change in adjusted closing from the last week.
93 |
94 | :param security: Holds information about the requested security
95 | :return: A Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
96 | """
97 | name = security.get_name()
98 | start = security.get_start()
99 | end = security.get_end()
100 | period = security.get_period()
101 | raw_df = quandl.get("YAHOO/" + name, start_date=start, collapse=period)
102 | adjusted_df = raw_df.ix[:, ['Adjusted Close']]
103 | adjusted_df["Percentage Change"] = adjusted_df['Adjusted Close'].pct_change() * 100
104 | return adjusted_df
105 |
106 |
107 | def percent_change_as_time_plot(adjusted_df, security):
108 | """
109 | This function visualizes the percentage change data as a time series plot.
110 |
111 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
112 | :param security: Holds information about the requested security
113 | """
114 |
115 | pct_change_list = adjusted_df['Percentage Change'].tolist()
116 | date_list = adjusted_df.index.values
117 | fig, ax = plt.subplots()
118 | ax.plot(date_list, pct_change_list)
119 | plt.xlabel("Dates")
120 | plt.ylabel("Percentage change from last period")
121 | if security.get_period() == "none":
122 | plt.title("Percentage change in " + security.get_name(), y=1.03)
123 | else:
124 | plt.title("Percentage change in " + security.get_name() + " " + security.get_period() + " data", y=1.03)
125 | ax.xaxis.set_minor_locator(MonthLocator())
126 | ax.yaxis.set_minor_locator(MultipleLocator(1))
127 | ax.fmt_xdata = DateFormatter('%Y-%m-%d')
128 | ax.autoscale_view()
129 | fig.autofmt_xdate()
130 |
131 | plt.show()
132 |
133 |
134 | def get_params_for_norm_dist(adjusted_df):
135 | """
136 | This function returns the mean and standard deviation in the percentage change column of a DataFrame.
137 | :param adjusted_df: must have 'Percentage Change' column
138 |
139 | :returns mean and standard deviation of the percentage change column
140 | """
141 | mean = adjusted_df["Percentage Change"].mean()
142 | std = adjusted_df["Percentage Change"].std()
143 | return mean, std
144 |
145 |
146 | def percent_change_as_hist(adjusted_df, security):
147 | """
148 | This function visualizes the percentage change data as a histogram. The graph is also fitted to a normal
149 | distribution curve.
150 |
151 | :param security: Holds information about the requested security
152 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
153 | """
154 | pct_change_list = adjusted_df['Percentage Change']
155 |
156 | # Code below removes the NaN value and plots the histogram. Bins are left adjusted right now, so when plotting the
157 | # normal distribution function, we must adjust it to be based off the center (average) of the bins.
158 | n, bins, patches = plt.hist(pct_change_list.dropna(), bins=25, normed=True)
159 | bincenters = 0.5*(bins[1:]+bins[:-1])
160 |
161 | plt.xlabel("Percentage change")
162 | plt.ylabel("Frequency")
163 | mean, std = get_params_for_norm_dist(adjusted_df)
164 | plt.title("Distribution of percentage change in " + security.get_name() + " Mu: %.3f, Sigma: %.3f"
165 | % (mean, std), y=1.03)
166 |
167 | # adds vertical lines to the graph corresponding to the x's that represent the number of deviations from the mean
168 | for num_std_from_mean in range(-3, 4):
169 | plt.axvline(mean + std * num_std_from_mean)
170 |
171 | # plots the normal pdf of best fit
172 | y = mlab.normpdf(bincenters, mean, std)
173 | plt.plot(bincenters, y, 'r--', linewidth=1)
174 |
175 | plt.show()
176 |
177 |
178 | def print_prob_list_2x2(security, threshold, prob_list):
179 | """
180 | Outputs summary of a 2x2 probability matrix
181 |
182 | :param security: Holds information about the requested security
183 | :param threshold: Represents the level dividing events A (change >= threshold) & B (change < threshold).
184 | :param prob_list: 2x2 markov transition matrix describing security
185 | """
186 | print(prob_list, "\n")
187 | thresh_str = "{:.2f}%".format(threshold)
188 | for i in range(2):
189 | if i == 0:
190 | part_1_summary = "\nIf " + security.get_name() + " rises above {thresh} in one period, "
191 | else:
192 | part_1_summary = "\nIf " + security.get_name() + " falls below {thresh} in one period, "
193 | part_2_summary = "in the next period, there is a {:.2f} chance that the security will rise above {thresh} " \
194 | "and a {:.2f} chance that it will fall below this threshold."
195 | print((part_1_summary + part_2_summary).format(prob_list[i][0], prob_list[i][1],
196 | thresh=thresh_str))
197 |
198 |
199 | def percent_change_prob_2x2(adjusted_df, security, threshold=0.0, summary=False):
200 | """
201 | This function finds the probabilities associated with the Markov chain of the percentage change column. There are
202 | two states: % change greater than or equal to a threshold (A) and % change less than a threshold (B). The threshold
203 | is defaulted to zero, so that the states are roughly divided into positive and negative changes. The four
204 | probabilities are: a (A | A), b (B | A) , c (A | B) , d (B | B). By definition, the sum of the rows in the right
205 | stochastic transition matrix must add up to 1: (a + b = 1 and c + d = 1)
206 |
207 | A B
208 | P = A a b
209 | B c d
210 |
211 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
212 | :param security: Holds information about the requested security
213 | :param threshold: Represents the level dividing events A (change >= threshold) & B (change < threshold).
214 | :param summary: If true, outputs a summary of the probability matrix
215 | """
216 | count_list = [[0, 0], # a_count, b_count,
217 | [0, 0]] # c_count, d_count
218 |
219 | new_df = adjusted_df['Percentage Change'].dropna().to_frame()
220 | new_df = new_df.set_index(np.arange(new_df.size, 0, -1))
221 |
222 | for index, pct in new_df['Percentage Change'].iteritems():
223 | if index == 1: # prevents program from calculating a future probability
224 | break
225 | if pct >= threshold:
226 | if new_df['Percentage Change'][index-1] >= threshold:
227 | count_list[0][0] += 1 # event A occurred, then event A occurred
228 | else:
229 | count_list[0][1] += 1 # event A occurred, then event B occurred
230 | else:
231 | if new_df['Percentage Change'][index-1] >= threshold:
232 | count_list[1][0] += 1 # event B occurred, then event A occurred
233 | else:
234 | count_list[1][1] += 1 # event B occurred, then event B occurred
235 |
236 | prob_list = [[count / sum(group) for count in group] for group in count_list]
237 |
238 | if summary:
239 | print_prob_list_2x2(security, threshold, prob_list)
240 |
241 | return prob_list
242 |
243 |
244 | def percent_change_prob_3x3(adjusted_df, security, lower_thresh=-1.0, upper_thresh=1.0):
245 | """
246 | This function finds the probabilities associated with the Markov chain of the percentage change column. There are
247 | three states: % change less than or equal to a lower threshold (A), % change between the upper and lower
248 | thresholds (B)and % change greater than or equal to an upper threshold (C). The lower threshold is defaulted to -1,
249 | and the upper threshold is defaulted to +1. Percentage changes below the lower threshold may be considered bearish,
250 | in between the two thresholds considered stagnant, and above the threshold considered bullish. The nine
251 | probabilities are: a P(A | A), b (B | A) , c (C | A) , d (A | B), e (B | B), f (C | B), g (A | C), h (B | C), and
252 | i (C | C). The sum of the rows in the matrix must add up to 1: (a + b + c = 1 and d + e + f = 1 and g + h + i = 1)
253 |
254 | A B C
255 | P = A a b c
256 | B d e f
257 | C g h i
258 |
259 | See percent_change_prob_2x2 for more details
260 |
261 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
262 | :param security: Holds information about the requested security
263 | :param lower_thresh: Represents the level dividing events A & B
264 | :param upper_thresh: Represents the level dividing events A & B
265 | """
266 | # counts frequency of sub-events
267 | count_list = [[0, 0, 0], # a_count, b_count, c_count
268 | [0, 0, 0], # d_count, e_count, f_count
269 | [0, 0, 0]] # g_count, h_count, i_count
270 |
271 | new_df = adjusted_df['Percentage Change'].dropna().to_frame()
272 | new_df = new_df.set_index(np.arange(new_df.size, 0, -1))
273 |
274 | for index, pct in new_df['Percentage Change'].iteritems():
275 | if index == 1: # prevents program from calculating a future probability
276 | break
277 | if pct <= lower_thresh:
278 |
279 | if new_df['Percentage Change'][index-1] <= lower_thresh:
280 | count_list[0][0] += 1 # increment a_count
281 | elif lower_thresh < new_df['Percentage Change'][index-1] < upper_thresh:
282 | count_list[0][1] += 1 # increment b_count
283 | else:
284 | count_list[0][2] += 1 # increment c_count
285 |
286 | elif lower_thresh < pct < upper_thresh:
287 |
288 | if new_df['Percentage Change'][index-1] <= lower_thresh:
289 | count_list[1][0] += 1 # increment d_count
290 | elif lower_thresh < new_df['Percentage Change'][index-1] < upper_thresh:
291 | count_list[1][1] += 1 # increment e_count
292 | else:
293 | count_list[1][2] += 1 # increment f_count
294 |
295 | else:
296 |
297 | if new_df['Percentage Change'][index-1] <= lower_thresh:
298 | count_list[2][0] += 1 # increment g_count
299 | elif lower_thresh < new_df['Percentage Change'][index-1] < upper_thresh:
300 | count_list[2][1] += 1 # increment h_count
301 | else:
302 | count_list[2][2] += 1 # increment i_count
303 |
304 | prob_list = [[count / sum(group) for count in group] for group in count_list]
305 | for group in prob_list:
306 | print(group)
307 | lower_thresh_str = "{:.2f}%".format(lower_thresh)
308 | upper_thresh_str = "{:.2f}%".format(upper_thresh)
309 | for i in range(3):
310 | if i == 0:
311 | part_1_summary = "\nIf " + security.get_name() + " falls below {lower_thresh} in one period (bearish),"
312 | elif i == 1:
313 | part_1_summary = "\nIf " + security.get_name() + " changes between {lower_thresh} and {upper_thresh} in " \
314 | "one period (stagnant),"
315 | else:
316 | part_1_summary = "\nIf " + security.get_name() + " rises above {upper_thresh} in one period (bullish),"
317 |
318 | part_2_summary = "in the next period, there is a {:.3f} chance that the security will fall by more than " \
319 | "{lower_thresh} (bearish), a {:.3f} chance that the security will change somewhere between " \
320 | "{lower_thresh} and {upper_thresh} (stagnant), and a {:.3f} chance that the security will " \
321 | "rise by more than {upper_thresh} (bullish)."
322 | print((part_1_summary + part_2_summary).format(prob_list[i][0], prob_list[i][1], prob_list[i][2],
323 | lower_thresh=lower_thresh_str, upper_thresh=upper_thresh_str))
324 |
325 | return prob_list
326 |
327 |
328 | def random_walk_norm_pdf(adjusted_df, security, start, num_periods=12):
329 | """
330 | This function calculates and visualizes a random walk assuming that security data are independent of current state.
331 | Based on a basic normal distribution and a starting point, the function will predict the security's movement for a
332 | finite number of periods. This is the most fundamental random walk and has many unrealistic
333 | assumptions, such as the data are independently and identically distributed, which is likely not true for the
334 | most securities.
335 |
336 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
337 | :param security: Holds information about a selected security
338 | :param start: starting value for S&P 500 random walk
339 | :param num_periods: number of steps in the random walk process
340 |
341 | """
342 | mean, std = get_params_for_norm_dist(adjusted_df)
343 | pct_change_list = []
344 | all_walks = [] # will hold all the random walk data
345 | for i in range(100):
346 | random_walk = [start]
347 | for period in range(num_periods):
348 | # sets the step as the last element in the random walk
349 | step = random_walk[-1]
350 |
351 | # picks a random percent change from a Gaussian distribution based on historical mean and standard deviation
352 | pct_change = np.random.normal(mean, std)
353 | pct_change_list.append(pct_change)
354 |
355 | # reordering of percent change formula
356 | step = ((pct_change * step / 100) + step)
357 |
358 | random_walk.append(step)
359 | all_walks.append(random_walk)
360 | show_rand_walks(all_walks, security)
361 |
362 |
363 | def prob_from_bins(heights, bins):
364 | """
365 | Chooses a random bin based on the prob distribution in the histogram. Then returns a random percentage change from
366 | that bin.
367 |
368 | :param heights: heights of the histogram
369 | :param bins: left-hand edges of each bin; must have at least two values in list
370 | :return: random percentage change
371 | """
372 | np_heights = np.asarray(heights)
373 | bin_length = bins[1]-bins[0]
374 | np_area = bin_length * np_heights # sum of area is equal to 1
375 | bin_num = np.random.choice(np.arange(start=1, stop=len(bins)), p=np_area)
376 | rand_pct_change = bin_length * np.random.ranf() + bins[bin_num-1]
377 | return rand_pct_change
378 |
379 |
380 | def rand_walk_2x2_markov(adjusted_df, prob_list, security, num_bins=10, threshold=0.0, start=2099.0, num_periods=12):
381 | """
382 | Divides the per
383 |
384 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
385 | :param prob_list: Contains a 2x2 list that holds the probabilities from a Markov chain with two states
386 | :param security: Holds information about the requested security
387 | :param num_bins: Specifies number of bins in the histogram distribution. The more bins, the more realistic
388 | the probability distribution will be
389 | :param threshold: Represents the level dividing events A (change >= threshold) & B (change < threshold)
390 | :param start: starting value for S&P 500 random walk
391 | :param num_periods: number of steps in the random walk process
392 | """
393 |
394 | pct_change_array = np.array(adjusted_df["Percentage Change"].dropna())
395 | pct_above_threshold_array = pct_change_array[pct_change_array >= threshold]
396 | pct_below_threshold_array = pct_change_array[pct_change_array < threshold]
397 | n_above, bins_above, patches_above = plt.hist(pct_above_threshold_array, bins=num_bins, normed=True)
398 | n_below, bins_below, patches_below = plt.hist(pct_below_threshold_array, bins=num_bins, normed=True)
399 |
400 | # First percentage change is determined from a basic normal distribution. Every following period is based on the
401 | # percentage change of the previous period
402 | pct_change_list = []
403 | all_walks = [] # will hold all the random walk data
404 | for i in range(100):
405 | mean, std = get_params_for_norm_dist(adjusted_df)
406 | first_pct_change = np.random.normal(mean, std)
407 | pct_change_list.append(first_pct_change)
408 | first_step = ((first_pct_change * start / 100) + start)
409 | random_walk = [start, first_step]
410 |
411 | for period in range(num_periods):
412 | step = random_walk[-1]
413 | prev_pct_change = pct_change_list[-1]
414 |
415 | # random number used to test whether event A will occur or event B will occur
416 | rand_prob = np.random.random_sample()
417 | if prev_pct_change >= threshold: # If true, event A occurred
418 | # prob_list[0][0] is probability that another event A will occur, given event A has happened already
419 | if rand_prob <= prob_list[0][0]: # If true, A then A
420 | pct_change = prob_from_bins(n_above, bins_above)
421 | else: # If true, A then B
422 | pct_change = prob_from_bins(n_below, bins_below)
423 | else: # If true, event B occurred
424 | # prob_list[1][0] is probability that event A will occur, given event B has happened already
425 | if rand_prob <= prob_list[1][0]: # If true, B then A
426 | pct_change = prob_from_bins(n_above, bins_above)
427 | else: # If true, B then B
428 | pct_change = prob_from_bins(n_below, bins_below)
429 |
430 | pct_change_list.append(pct_change)
431 |
432 | step = ((pct_change * step / 100) + step)
433 |
434 | random_walk.append(step)
435 | all_walks.append(random_walk)
436 | show_rand_walks(all_walks, security)
437 |
438 |
439 | def rand_walk_3x3_markov(adjusted_df, prob_list, security, num_bins=10, lower_thresh=-1.0, upper_thresh=1.0,
440 | start=2099.0, num_periods=12):
441 | """
442 |
443 | :param adjusted_df: Pandas DataFrame with columns: Date, Adjusted Close, and Percentage Change.
444 | :param prob_list: Contains a 2x2 list that holds the probabilities from a Markov chain with two states
445 | :param security: Holds information about the requested security
446 | :param num_bins: Specifies number of bins in the histogram distribution. The more bins, the more realistic
447 | the probability distribution will be
448 | :param lower_thresh: Represents the level dividing events A (pct change < lower thresh) & B(lower thresh <=
449 | pct change < upper thresh)
450 | :param upper_thresh: Represents the level dividing events B (lower thresh < pct change < upper thresh) &
451 | C(upper thresh < pct change)
452 | :param start: starting value for S&P 500 random walk
453 | :param num_periods: number of steps in the random walk process
454 | """
455 |
456 | pct_change_array = np.array(adjusted_df["Percentage Change"].dropna())
457 | pct_above_array = pct_change_array[pct_change_array >= upper_thresh]
458 | pct_between_array = pct_change_array[np.logical_and(pct_change_array > lower_thresh,
459 | pct_change_array < upper_thresh)]
460 | pct_below_array = pct_change_array[pct_change_array <= lower_thresh]
461 | n_above, bins_above, patches_above = plt.hist(pct_above_array, bins=num_bins, normed=True)
462 | n_between, bins_between, patches_between = plt.hist(pct_between_array, bins=num_bins, normed=True)
463 | n_below, bins_below, patches_below = plt.hist(pct_below_array, bins=num_bins, normed=True)
464 |
465 | # First percentage change is determined from a basic normal distribution. Every following period is based on the
466 | # percentage change of the previous period
467 | pct_change_list = []
468 | all_walks = [] # will hold all the random walk data
469 | for i in range(1000):
470 | mean, std = get_params_for_norm_dist(adjusted_df)
471 | first_pct_change = np.random.normal(mean, std)
472 | pct_change_list.append(first_pct_change)
473 | first_step = ((first_pct_change * start / 100) + start)
474 | random_walk = [start, first_step]
475 |
476 | for period in range(num_periods):
477 | step = random_walk[-1]
478 | prev_pct_change = pct_change_list[-1]
479 |
480 | # random number used to test whether event A will occur or event B will occur
481 | rand_prob = np.random.random_sample()
482 | if prev_pct_change <= lower_thresh: # If true, event A occurred
483 | # prob_list[0][0] is probability that another event A will occur, given event A has happened already
484 | if 0 < rand_prob <= prob_list[0][0]: # If true, A then A
485 | pct_change = prob_from_bins(n_below, bins_below)
486 | elif prob_list[0][0] < rand_prob < (prob_list[0][0] + prob_list[0][1]): # If true, A then B
487 | pct_change = prob_from_bins(n_between, bins_between)
488 | else: # If true, A then C
489 | pct_change = prob_from_bins(n_above, bins_above)
490 |
491 | elif lower_thresh < prev_pct_change < upper_thresh: # If true, event B occurred
492 | # prob_list[1][0] is probability that event A will occur, given event B has happened already
493 | if 0 < rand_prob <= prob_list[1][0]: # If true, B then A
494 | pct_change = prob_from_bins(n_below, bins_below)
495 | elif prob_list[1][0] < rand_prob < (prob_list[1][0] + prob_list[1][1]): # If true, B then B
496 | pct_change = prob_from_bins(n_between, bins_between)
497 | else: # If true, B then C
498 | pct_change = prob_from_bins(n_above, bins_above)
499 |
500 | else: # If true, event C occurred
501 | # prob_list[2][0] is probability that event A will occur, given event C has happened already
502 | if 0 < rand_prob <= prob_list[2][0]: # If true, C then A
503 | pct_change = prob_from_bins(n_below, bins_below)
504 | elif prob_list[2][0] < rand_prob < (prob_list[2][0] + prob_list[2][1]): # If true, C then B
505 | pct_change = prob_from_bins(n_between, bins_between)
506 | else: # If true, C then C
507 | pct_change = prob_from_bins(n_above, bins_above)
508 |
509 | pct_change_list.append(pct_change)
510 |
511 | step = ((pct_change * step / 100) + step)
512 |
513 | random_walk.append(step)
514 | all_walks.append(random_walk)
515 | show_rand_walks(all_walks, security)
516 |
517 |
518 | def show_rand_walks(all_walks, security):
519 | """
520 | Visualizes all random walks as a plot and distribution.
521 |
522 | :param all_walks: list of all random walks conducted
523 | :param security: holds information about the security
524 | """
525 | np_aw = np.array(all_walks) # converts the list of all random walks to a Numpy Array
526 | np_aw_t = np.transpose(np_aw) # must transpose the array for graph to display properly
527 | plt.clf()
528 | plt.plot(np_aw_t)
529 | plt.xlabel("Steps")
530 | plt.ylabel("Value of " + security.get_name())
531 | plt.title("All Random Walks of " + security.get_name())
532 | plt.show()
533 |
534 | # Select last row from np_aw_t: ends
535 | ends = np_aw_t[-1]
536 |
537 | # Plot histogram of ends, display plot
538 | n, bins, patches = plt.hist(ends, bins=25, normed=True)
539 | plt.xlabel("Final Value of " + security.get_name() + " at end of period.")
540 | plt.ylabel("Frequency")
541 | rand_mean = ends.mean()
542 | rand_std = ends.std()
543 |
544 | plt.title("Distribution of Random Walk Final Values. Mean is %d and Standard Deviation is %d"
545 | % (rand_mean, rand_std), y=1.03)
546 | for num_std_from_mean in range(-3, 4):
547 | plt.axvline(rand_mean + rand_std * num_std_from_mean)
548 | bincenters = 0.5*(bins[1:]+bins[:-1])
549 | # plots the normal pdf of best fit
550 | y = mlab.normpdf(bincenters, rand_mean, rand_std)
551 | plt.plot(bincenters, y, 'r--', linewidth=3)
552 | plt.show()
553 |
554 |
555 | def predict_percentage_change(name, weeks=52, period="weekly", threshold=0.0, num_bins=10):
556 | """
557 | Predicts the percentage change of a security in the next week based on last week's percentage change
558 |
559 | :param name: Ticker name of security (e.g. "AAPL")
560 | :param weeks: Weeks since the most recent recorded date (cannot use years/months because months and years
561 | have varying quantities of days; Numpy requires constancy in datetime arithmetic)
562 | :param period: Frequency of percentage change data
563 | :param threshold: Represents the level dividing events A (change >= threshold) & B (change < threshold).
564 | :param num_bins: Specifies number of bins in the histogram distribution. The more bins, the more realistic
565 | the probability distribution will be
566 |
567 | """
568 | last_row = quandl.get("YAHOO/"+name, rows=1, period=period, order="desc")
569 | end_date = last_row.index.values[0] # numpy datetime object
570 | start_date = end_date - np.timedelta64(weeks, 'W')
571 | end_date_str = pd.to_datetime(str(end_date))
572 | end_date_str = end_date_str.strftime('%Y-%m-%d')
573 | start_date_str = pd.to_datetime(str(start_date))
574 | start_date_str = start_date_str.strftime('%Y-%m-%d')
575 |
576 | sec = SecurityInfo(name=name, start=start_date_str, end=end_date_str, period=period)
577 | markov_df = get_data(sec)
578 | prob_list = percent_change_prob_2x2(markov_df, sec, threshold=threshold)
579 |
580 | pct_change_array = np.array(markov_df["Percentage Change"].dropna())
581 | pct_above_threshold_array = pct_change_array[pct_change_array >= threshold]
582 | pct_below_threshold_array = pct_change_array[pct_change_array < threshold]
583 | n_above, bins_above, patches_above = plt.hist(pct_above_threshold_array, bins=num_bins, normed=True)
584 | n_below, bins_below, patches_below = plt.hist(pct_below_threshold_array, bins=num_bins, normed=True)
585 |
586 | #print(end_date_str)
587 | last_percent_change = markov_df.max()[1]
588 | #print("\nThis week's percentage change was {:.3f}%.".format(last_percent_change))
589 | # random number used to test whether event A will occur or event B will occur
590 |
591 | next_pct_change_list = []
592 | for _ in range(10000):
593 | rand_prob = np.random.random_sample()
594 | if last_percent_change >= threshold: # If true, event A occurred
595 | # prob_list[0][0] is probability that another event A will occur, given event A has happened already
596 | if rand_prob <= prob_list[0][0]: # If true, A then A
597 | next_pct_change = prob_from_bins(n_above, bins_above)
598 | else: # If true, A then B
599 | next_pct_change = prob_from_bins(n_below, bins_below)
600 | else: # If true, event B occurred
601 | # prob_list[1][0] is probability that event A will occur, given event B has happened already
602 | if rand_prob <= prob_list[1][0]: # If true, B then A
603 | next_pct_change = prob_from_bins(n_above, bins_above)
604 | else: # If true, B then B
605 | next_pct_change = prob_from_bins(n_below, bins_below)
606 |
607 | next_pct_change_list.append(next_pct_change)
608 | next_pct_change_np = np.array(next_pct_change_list)
609 | plt.clf()
610 | n, bins, patches = plt.hist(next_pct_change_np, bins=25, normed=True)
611 | bincenters = 0.5*(bins[1:]+bins[:-1])
612 |
613 | plt.xlabel("Percentage change")
614 | plt.ylabel("Frequency")
615 |
616 | mean = next_pct_change_np.mean()
617 | std = next_pct_change_np.std()
618 |
619 | plt.title("Distribution of percentage change in " + name + " Mu: %.3f, Sigma: %.3f"
620 | % (mean, std), y=1.03)
621 |
622 | # adds vertical lines to the graph corresponding to the x's that represent the number of deviations from the mean
623 | for num_std_from_mean in range(-3, 4):
624 | plt.axvline(mean + std * num_std_from_mean)
625 |
626 | # plots the normal pdf of best fit
627 | y = mlab.normpdf(bincenters, mean, std)
628 | plt.plot(bincenters, y, 'r--', linewidth=1)
629 |
630 | plt.show()
631 |
632 |
633 | return [last_percent_change, mean, std]
634 |
635 |
636 | def compare_securities_2x2(sec_list, weeks, thresh=0.0):
637 | """
638 | Returns an excel sheet with stock name, this week's percentage change, mean of next week's predicted
639 | percentage change, and standard deviation of next week's predicted percentage change
640 |
641 | :param sec_list: with all the security names
642 | :param weeks: Number of weeks since the most recent recorded date (cannot use years/months because months and
643 | years have varying quantities of days; Numpy requires constancy in datetime arithmetic)
644 | :param thresh: divides percentage changes into two categories (>= and <); applies to each security
645 | """
646 | sec_dict = {}
647 | for name in sec_list:
648 | sec_info = predict_percentage_change(name, weeks=weeks, threshold=thresh)
649 | sec_dict[name] = sec_info
650 | sec_df = pd.DataFrame(sec_dict).transpose()
651 | sec_df.columns = ['Last % Change', "Mean Predicted % Change", "Standard Deviation " +
652 | "Predicted % Change"]
653 | sec_df= sec_df.sort_values(by=["Mean Predicted % Change"], ascending=True)
654 | writer = pd.ExcelWriter('output.xlsx')
655 | sec_df.to_excel(writer, 'Sheet1')
656 | writer.save()
657 |
658 |
659 | #compare_securities_2x2(["BAC", "AAPL", "GOOG", "T"], weeks=26, thresh=2.0)
660 |
661 | predict_percentage_change("YELP", 104, threshold=5)
662 | #x = SecurityInfo(name="AMZN", start="2009-05-23", end="2016-05-23", period="weekly")
663 | #markov_df = get_data(x)
664 | #print(markov_df)
665 | #percent_change_as_time_plot(markov_df, x)
666 | #percent_change_as_hist(markov_df, x)
667 | #matrix = percent_change_prob_3x3(markov_df, x, lower_thresh=-5, upper_thresh=5)
668 | #rand_walk_3x3_markov(markov_df, matrix, x, lower_thresh=-5, upper_thresh=5)
669 |
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |