├── ARM └── ch5 │ ├── arsenic_wells_switching.ipynb │ └── data │ └── wells.dat ├── MLFH ├── CH1 │ ├── ch1.py │ ├── chapter1.ipynb │ ├── post90_count_ts.png │ ├── quick_hist_all_years.png │ ├── quick_hist_post90.png │ └── ufo_ts_bystate.png ├── CH2 │ ├── ch2.ipynb │ ├── ch2.py │ ├── ch2_with_formulas.ipynb │ ├── height_density_bysex.png │ ├── height_hist_bins001.png │ ├── height_hist_bins1.png │ ├── height_hist_bins5.png │ ├── height_weight_class.png │ ├── height_weight_lowess.png │ ├── height_weight_scatter.png │ ├── heights_density.png │ ├── lowess work │ │ ├── README │ │ ├── cylowess.c │ │ ├── cylowess.pyx │ │ ├── cylowess.so │ │ ├── cylowess_testing.ipynb │ │ ├── mcycle.csv │ │ ├── r_lowess_d0_it3_f0-01.csv │ │ ├── results │ │ │ ├── test_lowess_delta.csv │ │ │ ├── test_lowess_frac.csv │ │ │ ├── test_lowess_iter.csv │ │ │ ├── test_lowess_r_outputs.R │ │ │ └── test_lowess_simple.csv │ │ ├── setup.py │ │ ├── test_lowess.py │ │ └── test_lowess_r_output.R │ ├── weight_density_bysex.png │ └── weight_density_bysex_subplot.png ├── CH3 │ ├── ch3.ipynb │ ├── ch3.py │ ├── ch3_nltk.ipynb │ └── r_stopwords.csv ├── ch4 │ ├── ch4.ipynb │ ├── tdm_df.py │ └── tdm_df.pyc ├── ch5 │ └── ch5.ipynb ├── ch6 │ ├── ch6.ipynb │ ├── tdm_df.py │ └── tdm_df.pyc ├── ch7 │ └── ch7.ipynb ├── ch8 │ └── ch8.ipynb └── ch9 │ ├── ch9.ipynb │ └── convert_dta_to_csv.r └── README.md /MLFH/CH1/ch1.py: -------------------------------------------------------------------------------- 1 | '''' 2 | ------------------------------------------------------------------------------- 3 | Filename : ch1.py 4 | Date : 2012-04-16 5 | Author : C. Vogel 6 | Purpose : Replicate analysis of UFO data in Chapter 1 of _Machine Learning 7 | : for Hackers_. 8 | Input Data : ufo_awesome.csv is available at the book's github repository at 9 | : https://github.com/johnmyleswhite/ML_for_Hackers.git 10 | Libraries : Numpy 1.6.1, Matplotlib 1.1.0, Pandas 0.7.3 11 | ------------------------------------------------------------------------------- 12 | 13 | This script is a Python port of the R code in Chapter 1 of _Machine Learning 14 | for Hackers_ by D. Conway and J.M. White. It is mainly intended to be run via 15 | the interactive shell, though that's not necessary. 16 | 17 | The script will produce (1) a cleaned tab-separated file of the UFO data; 18 | (2) a series of 4 PNG figures. 19 | 20 | The UFO dataset (approx. 75MB tab-separated file) should be located in a 21 | /data/ufo subfolder of the working directory. Otherwise, change the `inpath` 22 | and `outpath` variables at the start of the file. 23 | 24 | For a detailed description of the analysis and the process of porting it 25 | to Python, see: slendrmeans.wordpress.com/will-it-python. 26 | ''' 27 | 28 | 29 | import numpy as np 30 | from pandas import * 31 | import matplotlib.pyplot as plt 32 | import datetime as dt 33 | import time 34 | import re 35 | 36 | # The location of the UFO raw data 37 | inpath = 'data/ufo/ufo_awesome.tsv' 38 | 39 | ######################################### 40 | # Fixing extra columns in the raw data. # 41 | ######################################### 42 | # Pandas' read_table function gives an error reading the raw data file 43 | # `ufo_awesome.tsv`. It turns out there are extra tabs in some of the fields, 44 | # generating extra (>6) columns. 45 | 46 | # A test: read lines from the file until we reach a line with more than 6 47 | # tab-separated columns. Then print the line. I use enumerate() to identify 48 | # the bad line and its columns. 
49 | # This 7th column of this bad line corresponds to the first bad date 50 | # column in the text. (Indicating R pushes the extra columns to new lines 51 | # to a new row). 52 | 53 | inf = open(inpath, 'r') 54 | for i, line in enumerate(inf): 55 | splitline = line.split('\t') 56 | if len(splitline) != 6: 57 | first_bad_line = splitline 58 | print "First bad row:", i 59 | for j, col in enumerate(first_bad_line): 60 | print j, col 61 | break 62 | inf.close() 63 | 64 | # The location of a cleaned version of the data, where the extra 65 | # columns are eliminated. Output of the function `ufo_tab_to_sixcols` below. 66 | outpath = 'data/ufo/ufo_awesome_6col.tsv' 67 | 68 | 69 | def ufotab_to_sixcols(inpath, outpath): 70 | ''' 71 | Keep only the first 6 columns of data from messy UFO TSV file. 72 | 73 | The UFO data set is only supposed to have six columns. But... 74 | 75 | The sixth column is a long written description of the UFO sighting, and 76 | sometimes is broken by tab characters which create extra columns. 77 | 78 | For these records, we only keep the first six columns. This typically cuts 79 | off some of the long description. 80 | 81 | Sometimes a line has less than six columns. These are not written to 82 | the output file (i.e., they're dropped from the data). These records are 83 | usually so comprimised as to be uncleanable anyway. 84 | 85 | This function has (is) a side effect on the `outpath` file, to which it 86 | writes output. 87 | ''' 88 | 89 | inf = open(inpath, 'r') 90 | outf = open(outpath, 'w') 91 | 92 | for line in inf: 93 | splitline = line.split('\t') 94 | # Skip short lines, which are dirty beyond repair, anyway. 95 | if len(splitline) < 6: 96 | continue 97 | 98 | newline = ('\t').join(splitline[ :6]) 99 | # Records that have been truncated won't end in a newline character 100 | # so add one. 101 | if newline[-1: ] != '\n': 102 | newline += '\n' 103 | 104 | outf.write(newline) 105 | 106 | inf.close() 107 | outf.close() 108 | 109 | # Run the data cleaning function to create the cleaned file. No need to do 110 | # this more than once. 111 | ufotab_to_sixcols(inpath, outpath) 112 | 113 | # With the new clean file, we can use Pandas' to import the data. 114 | ufo = read_table('data/ufo/ufo_awesome_6col.tsv', sep = '\t', na_values = '', 115 | header = None, names = ['date_occurred', 116 | 'date_reported', 117 | 'location', 118 | 'short_desc', 119 | 'duration', 120 | 'long_desc']) 121 | 122 | # Print the beginning of the data; compare to table on p. 14. 123 | print ufo.head(6).to_string(formatters = {'long_desc' : lambda x : x[ :21]}) 124 | 125 | ######################################### 126 | # Converting and cleaning up date data. # 127 | ######################################### 128 | # Unlike the R import, Pandas' read_table pulled the dates in as integers 129 | # in YYYYMMDD format. We'll use the function below and map() it to the 130 | # date columns in the data. 131 | 132 | def ymd_convert(x): 133 | ''' 134 | Convert dates in the imported UFO data. 135 | Clean entries will look like YYYMMDD. If they're not clean, return NA. 136 | ''' 137 | try: 138 | cnv_dt = dt.datetime.strptime(str(x), '%Y%m%d') 139 | except ValueError: 140 | cnv_dt = np.nan 141 | 142 | return cnv_dt 143 | 144 | ufo['date_occurred'] = ufo['date_occurred'].map(ymd_convert) 145 | ufo['date_reported'] = ufo['date_reported'].map(ymd_convert) 146 | 147 | # Get rid of the rows that couldn't be conformed to datetime. 
148 | ufo = ufo[(notnull(ufo['date_reported'])) & (notnull(ufo['date_occurred']))] 149 | 150 | ############################# 151 | # Organizing location data. # 152 | ############################# 153 | # Note on p. 16 the authors claim strsplit() throws an error if there is no 154 | # comma in the entry. This doesn't appear to be true. 155 | 156 | def get_location(l): 157 | ''' 158 | Divide the `location` variable in the data into two new variables. 159 | The first is the city, the second the state (or province). The function 160 | returns a two-element list of the form [city, state]. 161 | 162 | This function is a fairly direct translation of the one in the text. 163 | But, by assuming legitimate U.S. locations have only one comma in them 164 | (e.g. `Baltimore, MD`), the authors miss a number of data points where 165 | the `city` entry has a detailed description with several commas: e.g., 166 | `Baltimore, near U.S. Rte 59, MD`. 167 | ''' 168 | split_location = l.split(',') 169 | clean_location = [x.strip() for x in split_location] 170 | if len(split_location) != 2: 171 | clean_location = ['', ''] 172 | 173 | return clean_location 174 | 175 | # As an alternative to the one-comma method for finding U.S. locations, 176 | # we try using a regular expression that looks for entries that end in a 177 | # comma and two letters (e.g., `, MD`) after stripping extra white space. 178 | 179 | # Since the regexp is going to be mapped along a Series of data, we'll 180 | # compile it first. 181 | us_state_pattern = re.compile(', [A-Z][A-Z]$', re.IGNORECASE) 182 | 183 | def get_location2(l): 184 | ''' 185 | Divide the `location` variable in the data into two new variables. 186 | The first is the city, the second the state (or province). The function 187 | returns a two-element list of the form [city, state]. 188 | 189 | This function assumes legitimate U.S. locations have location data 190 | that end in a comma plus the two-letter state abbreviation. It will 191 | miss any rows where, for instance, the state is spelled out. 192 | 193 | Note that the regexp pattern `us_state_pattern` is defined outside 194 | the function, and not called as an extra argument. (Since this 195 | function will be used with Pandas' map(), it's more convenient to 196 | define it with a single argument. 197 | ''' 198 | strip_location = l.strip() 199 | us_state_search = us_state_pattern.search(strip_location) 200 | if us_state_search == None: 201 | clean_location = ['', ''] 202 | else: 203 | us_city = strip_location[ :us_state_search.start()] 204 | us_state = strip_location[us_state_search.start() + 2: ] 205 | clean_location = [us_city, us_state] 206 | return clean_location 207 | 208 | # Get a series of [city, state] lists, then unpack them into new 209 | # variables in the data frame. 210 | location_lists = ufo['location'].map(get_location2) 211 | ufo['us_city'] = [city for city, st in location_lists] 212 | ufo['us_state'] = [st.lower() for city, st in location_lists] 213 | 214 | # State list from p. 18. Note they forget DC. There seem to be 12 DC entries. 
215 | us_states = ['ak', 'al', 'ar', 'az', 'ca', 'co', 'ct', 'dc', 'de', 'fl', 216 | 'ga', 'hi', 'ia', 'id', 'il', 'in', 'ks', 'ky', 'la', 'ma', 217 | 'md', 'me', 'mi', 'mn', 'mo', 'ms', 'mt', 'nc', 'nd', 'ne', 218 | 'nh', 'nj', 'nm', 'nv', 'ny', 'oh', 'ok', 'or', 'pa', 'ri', 219 | 'sc', 'sd', 'tn', 'tx', 'ut', 'va', 'vt', 'wa', 'wi', 'wv', 220 | 'wy'] 221 | 222 | 223 | 224 | # If the `us_state` variable doesn't match the states in the list, set it to 225 | # a missing string. Then nix the rows without valid U.S. states. 226 | ufo['us_state'][-np.in1d(ufo['us_state'].tolist(), us_states)] = '' 227 | ufo['us_city'][-np.in1d(ufo['us_state'].tolist(), us_states)] = '' 228 | 229 | ufo_us = ufo[ufo['us_state'] != ''] 230 | 231 | # Get a data Series of years in the data. Note that Pandas' describe() 232 | # won't give quantiles of the datetime objects in the date variables. 233 | # (it requires interpolating between dates, which is tricky with 234 | # datetimes. We can call describe() on the years to get quantiles, though 235 | # since they're just integers. 236 | years = ufo_us['date_occurred'].map(lambda x: x.year) 237 | years.describe() 238 | 239 | 240 | ########################################################################### 241 | # Plot distribution of sigthings over time and subset to recent sigthings # 242 | ########################################################################### 243 | # Figure 1-5 of the text. Note it's over years, and not the original 244 | # `date_occured` variable. Matplotlib apparently can't draw histograms 245 | # of datetimes. 246 | plt.figure() 247 | years.hist(bins = (years.max() - years.min())/30., fc = 'steelblue') 248 | plt.title('Histogram of years with U.S. UFO sightings\nAll years in data') 249 | plt.savefig('quick_hist_all_years.png') 250 | 251 | # Restrict the dates in the data to 1990 and after. 252 | ufo_us = ufo_us[ufo_us['date_occurred'] >= dt.datetime(1990, 1, 1)] 253 | 254 | years_post90 = ufo_us['date_occurred'].map(lambda x: x.year) 255 | 256 | # How much data do we have now, compare to p. 22 of the text. 257 | ufo_us.shape 258 | 259 | # Check how many sightings we saved with the regex version of the 260 | # location-cleaning function. 261 | city_commas = ufo['us_city'].map(lambda x: x.count(',')) 262 | print 'Cities with commas = ', sum(city_commas > 0) 263 | 264 | # Figure 1-6 in the text. 265 | plt.figure() 266 | years_post90.hist(bins = 20, fc = 'steelblue') 267 | plt.title('Histogram of years with U.S. UFO sightings\n1990 through 2010') 268 | plt.savefig('quick_hist_post90.png') 269 | 270 | # It's a little strange to histogram over dates. Let's just make a line 271 | # plot with the time series of no. of sigthings by date. Aggregated at the 272 | # national level, it looks like there's some seasonality in the data, 273 | # and a clear `millenium` effect. 274 | post90_count = ufo_us.groupby('date_occurred')['date_occurred'].count() 275 | plt.figure() 276 | post90_count.plot() 277 | plt.title('Number of U.S. UFO sightings\nJanuary 1990 through August 2010') 278 | plt.savefig('post90_count_ts.png') 279 | 280 | ################################## 281 | # Get monthly sightings by state # 282 | ################################## 283 | # Aggregate data to the state/month level with Pandas' groupby() method. 
284 | ufo_us['year_month'] = ufo_us['date_occurred'].map(lambda x: 285 | dt.date(x.year, x.month, 1)) 286 | 287 | sightings_counts = ufo_us.groupby(['us_state', 288 | 'year_month'])['year_month'].count() 289 | 290 | # Check out Alaska to compare with p. 22. Note we get an extra row, which 291 | # results from the improved location cleaning. 292 | print 'First few AK sightings in data:' 293 | print sightings_counts.ix['ak'].head(6) 294 | 295 | print 'Extra AK sighting, no on p. 22:' 296 | print ufo_us[(ufo_us['us_state'] == 'ak') & 297 | (ufo_us['year_month'] == dt.date(1994, 2, 1))] \ 298 | [['year_month','location']] 299 | 300 | # Since groupby drops state-month levels for which there are no sightings, 301 | # we'll create a 2-level MultiIndex with the full range of state-month pairs. 302 | # Then, we'll re-index the data, filling in 0's where data is missing. 303 | ym_list = [dt.date(y, m, 1) for y in range(1990, 2011) 304 | for m in range(1, 13) 305 | if dt.date(y, m, 1) <= dt.date(2010, 8, 1)] 306 | 307 | full_index = zip(np.sort(us_states * len(ym_list)), ym_list * len(us_states)) 308 | full_index = MultiIndex.from_tuples(full_index, names = 309 | ['states', 'year_month']) 310 | 311 | sightings_counts = sightings_counts.reindex(full_index, fill_value = 0) 312 | 313 | ############################################################## 314 | # Plot monthly sightings by state in lattice/facet-wrap plot # 315 | ############################################################## 316 | # Subplot parameters. We set up a figures with MxN subplots, where MxN >= 51 317 | # (no. of states to plot). When MxN > 51, the `hangover` variable counts how 318 | # many extra subplot remain in the last row of figure. We'll need this to 319 | # to put tick labels in nice places. 320 | nrow = 13; ncol = 4; hangover = len(us_states) % ncol 321 | 322 | fig, axes = plt.subplots(nrow, ncol, sharey = True, figsize = (9, 11)) 323 | 324 | fig.suptitle('Monthly UFO Sightings by U.S. State\nJanuary 1990 through August 2010', 325 | size = 12) 326 | plt.subplots_adjust(wspace = .05, hspace = .05) 327 | 328 | num_state = 0 329 | for i in range(nrow): 330 | for j in range(ncol): 331 | xs = axes[i, j] 332 | 333 | xs.grid(linestyle = '-', linewidth = .25, color = 'gray') 334 | 335 | if num_state < 51: 336 | st = us_states[num_state] 337 | sightings_counts.ix[st, ].plot(ax = xs, linewidth = .75) 338 | xs.text(0.05, .95, st.upper(), transform = axes[i, j].transAxes, 339 | verticalalignment = 'top') 340 | num_state += 1 341 | else: 342 | # Make extra subplots invisible 343 | plt.setp(xs, visible = False) 344 | 345 | xtl = xs.get_xticklabels() 346 | ytl = xs.get_yticklabels() 347 | 348 | # X-axis tick labels: 349 | # Turn off tick labels for all the the bottom-most 350 | # subplots. This includes the plots on the last row, and 351 | # if the last row doesn't have a subplot in every column 352 | # put tick labels on the next row up for those last 353 | # columns. 354 | # 355 | # Y-axis tick labels: 356 | # Put left-axis labels on the first column of subplots, 357 | # odd rows. Put right-axis labels on the last column 358 | # of subplots, even rows. 359 | if i < nrow - 2 or (i < nrow - 1 and (hangover == 0 or 360 | j <= hangover - 1)): 361 | plt.setp(xtl, visible = False) 362 | if j > 0 or i % 2 == 1: 363 | plt.setp(ytl, visible = False) 364 | if j == ncol - 1 and i % 2 == 1: 365 | xs.yaxis.tick_right() 366 | 367 | plt.setp(xtl, rotation=90.) 
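        # Grid bookkeeping: nrow * ncol = 52 subplot slots for the 51 entries
        # in us_states (50 states plus DC), so hangover = 51 % 4 = 3 axes are
        # filled in the last row and exactly one axis is hidden by the
        # `else` branch above.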
368 | 369 | plt.savefig('ufo_ts_bystate.png', dpi = 300) 370 | 371 | -------------------------------------------------------------------------------- /MLFH/CH1/post90_count_ts.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carljv/Will_it_Python/5feb80e4023d73bff12040c5aa5f14177794d459/MLFH/CH1/post90_count_ts.png -------------------------------------------------------------------------------- /MLFH/CH1/quick_hist_all_years.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carljv/Will_it_Python/5feb80e4023d73bff12040c5aa5f14177794d459/MLFH/CH1/quick_hist_all_years.png -------------------------------------------------------------------------------- /MLFH/CH1/quick_hist_post90.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carljv/Will_it_Python/5feb80e4023d73bff12040c5aa5f14177794d459/MLFH/CH1/quick_hist_post90.png -------------------------------------------------------------------------------- /MLFH/CH1/ufo_ts_bystate.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carljv/Will_it_Python/5feb80e4023d73bff12040c5aa5f14177794d459/MLFH/CH1/ufo_ts_bystate.png -------------------------------------------------------------------------------- /MLFH/CH2/ch2.py: -------------------------------------------------------------------------------- 1 | '''' 2 | ------------------------------------------------------------------------------- 3 | Filename : ch2.ipynb 4 | Date : 2012-04-30 5 | Author : C. Vogel 6 | Purpose : Replicate analysis of height and weight data in Chapter 2 of 7 | : _Machine Learning for Hackers_. 8 | Input Data : 01_heights_weights_genders.tsv is available at the book's github 9 | : repository at https://github.com/johnmyleswhite/ML_for_Hackers.git 10 | Libraries : Numpy 1.6.1, Matplotlib 1.1.0, Pandas 0.7.3, scipy 0.10.0, 11 | : statsmodels 0.4.0 12 | ------------------------------------------------------------------------------- 13 | 14 | This notebook is a Python port of the R code in Chapter 2 of _Machine Learning 15 | for Hackers_ by D. Conway and J.M. White. 16 | 17 | Running the notebook will produce 9 PNG figures and save them to the working 18 | directory. 19 | 20 | The height/weight dataset CSV file should be located in a /data/ subfolder of 21 | the working directory. 22 | 23 | For a detailed description of the analysis and the process of porting it 24 | to Python, see: slendrmeans.wordpress.com/will-it-python. 25 | ''' 26 | 27 | 28 | 29 | import numpy as np 30 | from pandas import * 31 | import matplotlib.pyplot as plt 32 | import os 33 | import statsmodels.api as sm 34 | from statsmodels.nonparametric.kde import KDE 35 | from statsmodels.nonparametric import lowess 36 | from statsmodels.api import GLM, Logit 37 | 38 | 39 | 40 | # Numeric Summaries 41 | # p. 37 42 | 43 | # Import the height and weights data 44 | heights_weights = read_table('data/01_heights_weights_genders.csv', sep = ',', header = 0) 45 | 46 | # Assign the heights column to its own series, and describe it. 47 | heights = heights_weights['Height'] 48 | heights.describe() 49 | 50 | # Means, medians, and modes (p. 38) 51 | def my_mean(x): 52 | return float(np.sum(x)) / len(x) 53 | 54 | def my_median(x): 55 | ''' 56 | Compute the median of a series x. 
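    For an even-length series this is the mean of the two middle values of
    the sorted data; for an odd-length series it is the single middle value.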
57 |     '''
58 | 
59 |     # Get a sorted copy of the values in the series (need to call values
60 |     # otherwise integer indexing messes things up.)
61 |     sorted_x = np.sort(x.values)
62 |     if len(x) % 2 == 0:
63 |         indices = [len(x) // 2 - 1, len(x) // 2]
64 |         return np.mean(sorted_x[indices])
65 |     else:
66 |         # Ceil(x) - 1 = Floor(x), but this is to make clear that the -1 is to
67 |         # account for 0-based counting.
68 |         index = int(np.ceil(0.5 * len(x))) - 1
69 |         return sorted_x[index]
70 | 
71 | 
72 | 
73 | # Check my_mean and my_median against built-ins
74 | my_mean(heights) - heights.mean()
75 | 
76 | my_median(heights) - heights.median()
77 | 
78 | 
79 | 
80 | # Quantiles (p. 40)
81 | heights.min(), heights.max()
82 | 
83 | 
84 | # Range = max - min. Note: np.ptp(heights.values) will do the same thing.
85 | # HT Nathaniel Smith
86 | def my_range(s):
87 |     '''
88 |     Difference between the max and min of an array or Series
89 |     '''
90 |     return s.max() - s.min()
91 | 
92 | my_range(heights)
93 | 
94 | 
95 | # Similarly, pandas doesn't seem to provide multiple quantiles.
96 | # But (1) the standard ones are available via .describe() and
97 | # (2) creating one is simple.
98 | 
99 | # To get a single quantile
100 | heights.quantile(.5)
101 | 
102 | # Function to get arbitrary quantiles of a series.
103 | def my_quantiles(s, prob = (0.0, 0.25, 0.5, 1.0)):
104 |     '''
105 |     Calculate quantiles of a series.
106 | 
107 |     Parameters:
108 |     -----------
109 |     s : a pandas Series
110 |     prob : a tuple (or other iterable) of probabilities at
111 |            which to compute quantiles. Must be an iterable,
112 |            even for a single probability (e.g. prob = (0.50,)
113 |            not prob = 0.50).
114 | 
115 |     Returns:
116 |     --------
117 |     A pandas series with the probabilities as an index.
118 |     '''
119 |     q = [s.quantile(p) for p in prob]
120 |     return Series(q, index = prob)
121 | 
122 | # With the default prob argument
123 | my_quantiles(heights)
124 | 
125 | # With a specific prob argument - here deciles
126 | my_quantiles(heights, prob = np.arange(0, 1.1, 0.1))
127 | 
128 | # Standard deviation and variances
129 | def my_var(x):
130 |     return np.sum((x - x.mean())**2) / (len(x) - 1)
131 | 
132 | my_var(heights) - heights.var()
133 | 
134 | def my_sd(x):
135 |     return np.sqrt(my_var(x))
136 | 
137 | my_sd(heights) - heights.std()
138 | 
139 | # Exploratory Data Visualization (p. 44)
140 | 
141 | # Histograms
142 | # 1-inch bins
143 | bins1 = np.arange(heights.min(), heights.max(), 1.)
144 | heights.hist(bins = bins1, fc = 'steelblue')
145 | plt.savefig('height_hist_bins1.png')
146 | 
147 | # 5-inch bins
148 | bins5 = np.arange(heights.min(), heights.max(), 5.)
149 | heights.hist(bins = bins5, fc = 'steelblue')
150 | plt.savefig('height_hist_bins5.png')
151 | 
152 | # 0.001-inch bins
153 | bins001 = np.arange(heights.min(), heights.max(), .001)
154 | heights.hist(bins = bins001, fc = 'steelblue')
155 | plt.savefig('height_hist_bins001.png')
156 | 
157 | # Kernel density estimators, from statsmodels (imported above as KDE).
158 | # Create a KDE object
159 | heights_kde = KDE(heights)
160 | # Use fit() to estimate the densities. Default is gaussian kernel
161 | # using fft. This will provide a "density" attribute.
162 | heights_kde.fit()
163 | 
164 | # Plot the density of the heights
165 | # Sort inside the plotting so the lines connect nicely.
166 | fig = plt.figure()
167 | plt.plot(heights_kde.support, heights_kde.density)
168 | plt.savefig('heights_density.png')
169 | 
170 | # Pull out male and female heights as arrays over which to compute densities
171 | heights_m = heights[heights_weights['Gender'] == 'Male'].values
172 | heights_f = heights[heights_weights['Gender'] == 'Female'].values
173 | heights_m_kde = KDE(heights_m)
174 | heights_f_kde = KDE(heights_f)
175 | heights_m_kde.fit()
176 | heights_f_kde.fit()
177 | 
178 | fig = plt.figure()
179 | plt.plot(heights_m_kde.support, heights_m_kde.density, label = 'Male')
180 | plt.plot(heights_f_kde.support, heights_f_kde.density, label = 'Female')
181 | plt.legend()
182 | plt.savefig('height_density_bysex.png')
183 | 
184 | 
185 | # Do the same thing with weights.
186 | weights_m = heights_weights[heights_weights['Gender'] == 'Male']['Weight'].values
187 | weights_f = heights_weights[heights_weights['Gender'] == 'Female']['Weight'].values
188 | weights_m_kde = KDE(weights_m)
189 | weights_f_kde = KDE(weights_f)
190 | weights_m_kde.fit()
191 | weights_f_kde.fit()
192 | 
193 | fig = plt.figure()
194 | plt.plot(weights_m_kde.support, weights_m_kde.density, label = 'Male')
195 | plt.plot(weights_f_kde.support, weights_f_kde.density, label = 'Female')
196 | plt.legend()
197 | plt.savefig('weight_density_bysex.png')
198 | 
199 | 
200 | # Subplot weight density by sex.
201 | fig, axes = plt.subplots(nrows = 2, ncols = 1, sharex = True, figsize = (9, 6))
202 | plt.subplots_adjust(hspace = 0.1)
203 | axes[0].plot(weights_f_kde.support, weights_f_kde.density, label = 'Female')
204 | axes[0].xaxis.tick_top()
205 | axes[0].legend()
206 | axes[1].plot(weights_m_kde.support, weights_m_kde.density, label = 'Male')
207 | axes[1].legend()
208 | plt.savefig('weight_density_bysex_subplot.png')
209 | 
210 | # Scatter plot. Pull weight (both sexes) out as a separate array first, like
211 | # we did with height above.
212 | weights = heights_weights['Weight']
213 | plt.plot(heights, weights, '.k', mew = 0, alpha = .1)
214 | plt.savefig('height_weight_scatter.png')
215 | 
216 | 
217 | # Lowess smoothing - this seems to be new functionality not yet in docs (as of statsmodels 0.4.0, April 2012).
218 | lowess_line = lowess.lowess(weights, heights)
219 | 
220 | plt.figure(figsize = (13, 9))
221 | plt.plot(heights, weights, '.', mfc = 'steelblue', mew=0, alpha = .25)
222 | plt.plot(lowess_line[:,0], lowess_line[:, 1], '-', color = '#461B7E', label = "Lowess fit")
223 | plt.legend(loc = "upper left")
224 | plt.savefig('height_weight_lowess.png')
225 | 
226 | # Logistic regression of sex on height and weight
227 | # Sex is coded in the binary variable `male`.
228 | 
229 | # LHS binary variable
230 | male = (heights_weights['Gender'] == 'Male') * 1
231 | 
232 | # Matrix of predictor variables: height and weight from the data frame
233 | # into an Nx2 array.
234 | hw_exog = heights_weights[['Height', 'Weight']].values
235 | 
236 | # Logit model 1: Using GLM and the Binomial Family w/ the Logit Link
237 | # Note I have to add constants to the `exog` matrix. The prepend = True
238 | # argument prevents a warning about a future change to the default argument.
239 | logit_model = GLM(male, sm.add_constant(hw_exog, prepend = True), family = sm.families.Binomial(sm.families.links.logit))
240 | logit_model.fit().summary()
241 | 
242 | # Get the coefficient parameters.
243 | logit_pars = logit_model.fit().params
244 | 
245 | 
246 | # Logit model 2: Using the Logit function.
247 | logit_model2 = Logit(male, sm.add_constant(hw_exog, prepend = True)) 248 | logit_model2.fit().summary() 249 | 250 | # Get the coefficient parameters 251 | logit_pars2 = logit_model2.fit().params 252 | 253 | # Compare the two methods again. They give the same parameters. 254 | DataFrame({'GLM' : logit_pars, 'Logit' : logit_pars2}) 255 | 256 | # Draw a separating line in the [height, weight]-space. 257 | # The line will separate the space into predicted-male 258 | # and predicted-female regions. 259 | 260 | # Get the intercept and slope of the line based on the logit coefficients 261 | intercept = -logit_pars['const'] / logit_pars['x2'] 262 | slope = -logit_pars['x1'] / logit_pars['x2'] 263 | 264 | # Plot the data and the separating line 265 | # Color code male and female points. 266 | fig = plt.figure(figsize = (10, 8)) 267 | plt.plot(heights_f, weights_f, '.', label = 'Female', mew = 0, mfc='coral', alpha = .1) 268 | plt.plot(heights_m, weights_m, '.', label = 'Male', mew = 0, mfc='steelblue', alpha = .1) 269 | plt.plot(array([50, 80]), intercept + slope * array([50, 80]), '-', color = '#461B7E') 270 | plt.legend(loc='upper left') 271 | plt.savefig('height_weight_class.png') 272 | 273 | -------------------------------------------------------------------------------- /MLFH/CH2/height_density_bysex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carljv/Will_it_Python/5feb80e4023d73bff12040c5aa5f14177794d459/MLFH/CH2/height_density_bysex.png -------------------------------------------------------------------------------- /MLFH/CH2/height_hist_bins001.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carljv/Will_it_Python/5feb80e4023d73bff12040c5aa5f14177794d459/MLFH/CH2/height_hist_bins001.png -------------------------------------------------------------------------------- /MLFH/CH2/height_hist_bins1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carljv/Will_it_Python/5feb80e4023d73bff12040c5aa5f14177794d459/MLFH/CH2/height_hist_bins1.png -------------------------------------------------------------------------------- /MLFH/CH2/height_hist_bins5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carljv/Will_it_Python/5feb80e4023d73bff12040c5aa5f14177794d459/MLFH/CH2/height_hist_bins5.png -------------------------------------------------------------------------------- /MLFH/CH2/height_weight_class.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carljv/Will_it_Python/5feb80e4023d73bff12040c5aa5f14177794d459/MLFH/CH2/height_weight_class.png -------------------------------------------------------------------------------- /MLFH/CH2/height_weight_lowess.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carljv/Will_it_Python/5feb80e4023d73bff12040c5aa5f14177794d459/MLFH/CH2/height_weight_lowess.png -------------------------------------------------------------------------------- /MLFH/CH2/height_weight_scatter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carljv/Will_it_Python/5feb80e4023d73bff12040c5aa5f14177794d459/MLFH/CH2/height_weight_scatter.png 
-------------------------------------------------------------------------------- /MLFH/CH2/heights_density.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carljv/Will_it_Python/5feb80e4023d73bff12040c5aa5f14177794d459/MLFH/CH2/heights_density.png -------------------------------------------------------------------------------- /MLFH/CH2/lowess work/README: -------------------------------------------------------------------------------- 1 | Files for an improved version of statsmodel's lowess. 2 | 3 | cylowess.pyx is Cython code for a faster version of the lowess function 4 | in statsmodels.nonparametric.lowess. 5 | 6 | This code is more or less a from-scratch re-write, borrowing from both 7 | statsmodel's lowess, and from W. Cleveland's original lowess.f Fortran code. 8 | 9 | The main speed improvements come from: 10 | 11 | 1. Replacing expensive lstsq() calls in the statsmodel versions with direct 12 | calculations of the fitted y-values. 13 | 2. General Cython-based speedups for simple procedures in tight loops (like 14 | updating k-nearest neighbors). 15 | 3. But mostly the implementation of local linear interpolation via the new 16 | delta parameter. This vastly reduces the amount of times weighted regressions 17 | are run with minimal effect on the results. The idea, is to only run regression 18 | at points spaced at most `delta` apart, then linearly interpolate between those 19 | two results. 20 | For moderate to large data (N > 5000) this cuts timings about 50-100x--from 21 | seconds to milliseconds Currently, the default delta is zero, so the feature 22 | is not implemented unless explicitly set by the user. This should probably 23 | change. 24 | 25 | See the IPython notebook for comparisons with statsmodels and R lowess. CSV files 26 | are exported R data that are loaded into the notebook. 27 | 28 | -C. Vogel 29 | May 2012 -------------------------------------------------------------------------------- /MLFH/CH2/lowess work/cylowess.pyx: -------------------------------------------------------------------------------- 1 | #cython: boundscheck = False 2 | #cython: wraparound = False 3 | #cython: cdivision = True 4 | 5 | ''' 6 | Univariate lowess function, like in R. 7 | 8 | References 9 | ---------- 10 | Hastie, Tibshirani, Friedman. (2009) The Elements of Statistical Learning: Data 11 | Mining, Inference, and Prediction, Second Edition: Chapter 6. 12 | 13 | Cleveland, W.S. (1979) "Robust Locally Weighted Regression and Smoothing 14 | Scatterplots". Journal of the American Statistical Association 74 (368): 829-836. 15 | ''' 16 | 17 | cimport numpy as np 18 | import numpy as np 19 | from cpython cimport bool 20 | cimport cython 21 | from libc.math cimport fabs, fmax 22 | 23 | 24 | DTYPE = np.double 25 | ctypedef np.double_t DTYPE_t 26 | 27 | def lowess(np.ndarray[DTYPE_t, ndim = 1] endog, 28 | np.ndarray[DTYPE_t, ndim = 1] exog, 29 | double frac = 2.0 / 3.0, 30 | Py_ssize_t it = 3, 31 | double delta = 0.0): 32 | ''' 33 | LOWESS (Locally Weighted Scatterplot Smoothing) 34 | 35 | A lowess function that outs smoothed estimates of endog 36 | at the given exog values from points (exog, endog) 37 | 38 | Parameters 39 | ---------- 40 | endog: 1-D numpy array 41 | The y-values of the observed points 42 | exog: 1-D numpy array 43 | The x-values of the observed points 44 | frac: float 45 | Between 0 and 1. The fraction of the data used 46 | when estimating each y-value. 
47 | it: int 48 | The number of residual-based reweightings 49 | to perform. 50 | delta: float 51 | Distance within which to use linear-interpolation 52 | instead of weighted regression. 53 | 54 | Returns 55 | ------- 56 | out: numpy array 57 | A numpy array with two columns. The first column 58 | is the sorted x values and the second column the 59 | associated estimated y-values. 60 | 61 | Notes 62 | ----- 63 | This lowess function implements the algorithm given in the 64 | reference below using local linear estimates. 65 | 66 | Suppose the input data has N points. The algorithm works by 67 | estimating the `smooth` y_i by taking the frac*N closest points 68 | to (x_i,y_i) based on their x values and estimating y_i 69 | using a weighted linear regression. The weight for (x_j,y_j) 70 | is tricube function applied to |x_i-x_j|. 71 | 72 | If it > 1, then further weighted local linear regressions 73 | are performed, where the weights are the same as above 74 | times the _lowess_bisquare function of the residuals. Each iteration 75 | takes approximately the same amount of time as the original fit, 76 | so these iterations are expensive. They are most useful when 77 | the noise has extremely heavy tails, such as Cauchy noise. 78 | Noise with less heavy-tails, such as t-distributions with df>2, 79 | are less problematic. The weights downgrade the influence of 80 | points with large residuals. In the extreme case, points whose 81 | residuals are larger than 6 times the median absolute residual 82 | are given weight 0. 83 | 84 | delta can be used to save computations. For each x_i, regressions 85 | are skipped for points closer than delta. The next regression is 86 | fit for the farthest point within delta of x_i and all points in 87 | between are estimated by linearly interpolating between the two 88 | regression fits. 89 | 90 | Judicious choice of delta can cut computation time considerably 91 | for large data (N > 5000). A good choice is delta = 0.01 * 92 | range(exog). 93 | 94 | Some experimentation is likely required to find a good 95 | choice of frac and iter for a particular dataset. 96 | 97 | References 98 | ---------- 99 | Cleveland, W.S. (1979) "Robust Locally Weighted Regression 100 | and Smoothing Scatterplots". Journal of the American Statistical 101 | Association 74 (368): 829-836. 102 | 103 | Examples 104 | -------- 105 | The below allows a comparison between how different the fits from 106 | lowess for different values of frac can be. 107 | 108 | >>> import numpy as np 109 | >>> import statsmodels.api as sm 110 | >>> import cylowess 111 | >>> lowess = cylowess.lowess 112 | >>> x = np.random.uniform(low = -2*np.pi, high = 2*np.pi, size=500) 113 | >>> y = np.sin(x) + np.random.normal(size=len(x)) 114 | >>> z = lowess(y,x) 115 | >>> w = lowess(y,x, frac=1./3) 116 | 117 | This gives a similar comparison for when it is 0 vs not. 
118 | 119 | >>> import numpy as np 120 | >>> import scipy.stats as stats 121 | >>> import statsmodels.api as sm 122 | >>> import cylowess 123 | >>> lowess = cylowess.lowess 124 | >>> x = np.random.uniform(low = -2*np.pi, high = 2*np.pi, size=500) 125 | >>> y = np.sin(x) + stats.cauchy.rvs(size=len(x)) 126 | >>> z = lowess(y,x, frac= 1./3, it=0) 127 | >>> w = lowess(y,x, frac=1./3) 128 | 129 | ''' 130 | cdef: 131 | Py_ssize_t n 132 | int k 133 | Py_ssize_t robiter, i, left_end, right_end 134 | int last_fit_i, 135 | np.ndarray[np.int_t, ndim = 1] sort_index 136 | np.ndarray[DTYPE_t, ndim = 1] x, y 137 | np.ndarray[DTYPE_t, ndim = 1] y_fit 138 | np.ndarray[DTYPE_t, ndim = 1] weights 139 | np.ndarray[DTYPE_t, ndim = 1] resid_weights 140 | 141 | 142 | # Inputs should be vectors (1-D arrays) of the 143 | # same length. 144 | if exog.ndim != 1: 145 | raise ValueError('exog must be a vector') 146 | if endog.ndim != 1: 147 | raise ValueError('endog must be a vector') 148 | if endog.shape[0] != exog.shape[0] : 149 | raise ValueError('exog and endog must have same length') 150 | 151 | # Cut out missing values 152 | x = exog[(np.isfinite(exog) & np.isfinite(endog))] 153 | y = endog[(np.isfinite(exog) & np.isfinite(endog))] 154 | 155 | # Sort both inputs according to the ascending order of x values 156 | sort_index = np.argsort(exog) 157 | x = np.array(x[sort_index]) 158 | y = np.array(y[sort_index]) 159 | n = x.shape[0] 160 | 161 | # The number of neighbors in each regression. 162 | k = int(frac * n) 163 | 164 | # frac should be set, so that 2 <= k <= n. 165 | # Conform them instead of throwing error. 166 | if k < 2: 167 | k = 2 168 | if k > n: 169 | k = n 170 | 171 | y_fit = np.zeros(n, dtype = DTYPE) 172 | resid_weights = np.zeros(n, dtype = DTYPE) 173 | 174 | it += 1 # Add one to it for initial run. 175 | for robiter in xrange(it): 176 | i = 0 177 | last_fit_i = -1 178 | left_end = 0 179 | right_end = k 180 | y_fit = np.zeros(n, dtype = DTYPE) 181 | 182 | # 'do' Fit y[i]'s 'until' the end of the regression 183 | while True: 184 | # Re-initialize the weights for each point x[i]. 185 | weights = np.zeros(n, dtype = DTYPE) 186 | 187 | # Describe the neighborhood around the current x[i]. 188 | left_end, right_end, radius = update_neighborhood(x, i, n, 189 | left_end, 190 | right_end) 191 | 192 | # Calculate the weights for the regression in this neighborhood. 193 | # Determine if at least some weights are positive, so a regression 194 | # is ok. 195 | reg_ok = calculate_weights(x, weights, resid_weights, i, left_end, 196 | right_end, radius, robiter > 0) 197 | 198 | # If ok, run the regression 199 | calculate_y_fit(x, y, i, y_fit, weights, left_end, right_end, 200 | reg_ok) 201 | 202 | # If we skipped some points (because of how delta was set), go back 203 | # and fit them by linear interpolation. 204 | if last_fit_i < (i - 1): 205 | interpolate_skipped_fits(x, y_fit, i, last_fit_i) 206 | 207 | # Update the last fit counter to indicate we've now fit this point. 208 | # Find the next i for which we'll run a regression. 209 | i, last_fit_i = update_indices(x, y_fit, delta, i, n, last_fit_i) 210 | 211 | if last_fit_i >= n-1: 212 | break 213 | 214 | # Calculate residual weights, but don't bother on the last iteration. 
215 | if robiter < it - 1: 216 | resid_weights = calculate_residual_weights(y, y_fit) 217 | 218 | 219 | return np.array([x, y_fit]).T 220 | 221 | 222 | def update_neighborhood(np.ndarray[DTYPE_t, ndim = 1] x, 223 | Py_ssize_t i, 224 | Py_ssize_t n, 225 | Py_ssize_t left_end, 226 | Py_ssize_t right_end): 227 | ''' 228 | Find the indices bounding the k-nearest-neighbors of the current 229 | point. 230 | 231 | Parameters 232 | ---------- 233 | x: 1-D numpy array 234 | The input x-values 235 | i: indexing integer 236 | The index of the point currently being fit. 237 | n: indexing integer 238 | The length of the input vectors, x and y. 239 | left_end: indexing integer 240 | The index of the left-most point in the neighborhood 241 | of x[i-1] (the previously-fit point). 242 | right_end: indexing integer 243 | The index of the right-most point in the neighborhood 244 | of x[i-1]. Non-inclusive, s.t. the neighborhood is 245 | x[left_end] <= x < x[right_end]. 246 | radius: float 247 | The radius of the current neighborhood. The larger of 248 | distances between x[i] and its left-most or right-most 249 | neighbor. 250 | 251 | Returns 252 | ------- 253 | left_end: indexing integer 254 | The index of the left-most point in the neighborhood 255 | of x[i] (the current point). 256 | right_end: indexing integer 257 | The index of the right-most point in the neighborhood 258 | of x[i]. Non-inclusive, s.t. the neighborhood is 259 | x[left_end] <= x < x[right_end]. 260 | radius: float 261 | The radius of the current neighborhood. The larger of 262 | distances between x[i] and its left-most or right-most 263 | neighbor. 264 | ''' 265 | 266 | cdef double radius 267 | # A subtle loop. Start from the current neighborhood range: 268 | # [left_end, right_end). Shift both ends rightwards by one 269 | # (so that the neighborhood still contains k points), until 270 | # the current point is in the center (or just to the left of 271 | # the center) of the neighborhood. This neighborhood will 272 | # contain the k-nearest neighbors of x[i]. 273 | # 274 | # Once the right end hits the end of the data, hold the 275 | # neighborhood the same for the remaining x[i]s. 276 | while True: 277 | if right_end < n: 278 | 279 | if (x[i] > (x[left_end] + x[right_end]) / 2.0): 280 | left_end += 1 281 | right_end += 1 282 | else: 283 | break 284 | else: 285 | break 286 | 287 | radius = fmax(x[i] - x[left_end], x[right_end-1] - x[i]) 288 | 289 | return left_end, right_end, radius 290 | 291 | cdef bool calculate_weights(np.ndarray[DTYPE_t, ndim = 1] x, 292 | np.ndarray[DTYPE_t, ndim = 1] weights, 293 | np.ndarray[DTYPE_t, ndim = 1] resid_weights, 294 | Py_ssize_t i, 295 | Py_ssize_t left_end, 296 | Py_ssize_t right_end, 297 | double radius, 298 | bool use_resid_weights): 299 | ''' 300 | 301 | Parameters 302 | ---------- 303 | x: 1-D vector 304 | The input x-values. 305 | weights: 1-D numpy array 306 | The vector of regression weights. 307 | resid_weights: 1-D numpy array 308 | The vector of residual weights from the last iteration. 309 | i: indexing integer 310 | The index of the point currently being fit. 311 | left_end: indexing integer 312 | The index of the left-most point in the neighborhood of 313 | x[i]. 314 | right_end: indexing integer 315 | The index of the right-most point in the neighborhood 316 | of x[i]. Non-inclusive, s.t. the neighborhood is 317 | x[left_end] <= x < x[right_end]. 318 | radius: float 319 | The radius of the current neighborhood. 
The larger of 320 | distances between x[i] and its left-most or right-most 321 | neighbor. 322 | use_resid_weights: boolean 323 | If True, multiply the x-distance weights by the residual 324 | weights from the last iteration of regressions. Set to 325 | False on the first iteration (since there are no residuals 326 | yet) and True on the subsequent ``robustifying`` iterations. 327 | 328 | 329 | Returns 330 | ------- 331 | reg_ok: boolean 332 | If True, at least some points have positive weight, and the 333 | regression will be run. If False, the regression is skipped 334 | and y_fit[i] is set to equal y[i]. 335 | Also, changes elements of weights in-place. 336 | ''' 337 | 338 | cdef: 339 | np.ndarray[DTYPE_t, ndim = 1] x_j = x[left_end:right_end] 340 | np.ndarray[DTYPE_t, ndim = 1] dist_i_j = np.abs(x_j - x[i]) / radius 341 | bool reg_ok = True 342 | double sum_weights 343 | 344 | # Assign the distance measure to the weights, then apply the tricube 345 | # function to change in-place. 346 | # use_resid_weights will be False on the first iteration, then True 347 | # on the subsequent ones, after some residuals have been calculated. 348 | weights[left_end:right_end] = dist_i_j 349 | if use_resid_weights == False: 350 | tricube(weights[left_end:right_end]) 351 | if use_resid_weights == True: 352 | tricube(weights[left_end:right_end]) 353 | weights[left_end:right_end] = (weights[left_end:right_end] * 354 | resid_weights[left_end:right_end]) 355 | 356 | sum_weights = np.sum(weights[left_end:right_end]) 357 | 358 | if sum_weights <= 0.0: 359 | reg_ok = False 360 | else: 361 | weights[left_end:right_end] = weights[left_end:right_end] / sum_weights 362 | 363 | return reg_ok 364 | 365 | 366 | cdef void calculate_y_fit(np.ndarray[DTYPE_t, ndim = 1] x, 367 | np.ndarray[DTYPE_t, ndim = 1] y, 368 | Py_ssize_t i, 369 | np.ndarray[DTYPE_t, ndim = 1] y_fit, 370 | np.ndarray[DTYPE_t, ndim = 1] weights, 371 | Py_ssize_t left_end, 372 | Py_ssize_t right_end, 373 | bool reg_ok): 374 | ''' 375 | Calculate smoothed/fitted y-value by weighted regression. 376 | 377 | Parameters 378 | ---------- 379 | x: 1-D numpy array 380 | The vector of input x-values. 381 | y: 1-D numpy array 382 | The vector of input y-values. 383 | i: indexing integer 384 | The index of the point currently being fit. 385 | y_fit: 1-D numpy array 386 | The vector of fitted y-values. 387 | weights: 1-D numpy array 388 | The vector of regression weights. 389 | left_end: indexing integer 390 | The index of the left-most point in the neighborhood of 391 | x[i]. 392 | right_end: indexing integers 393 | The index of the right-most point in the neighborhood 394 | of x[i]. Non-inclusive, s.t. the neighborhood is 395 | x[left_end] <= x < x[right_end]. 396 | reg_ok: boolean 397 | If True, at least some points have positive weight, and the 398 | regression will be run. If False, the regression is skipped 399 | and y_fit[i] is set to equal y[i]. 400 | 401 | Returns 402 | ------- 403 | Nothing. Changes y_fit[i] in-place. 404 | 405 | Notes 406 | ----- 407 | No regression function (e.g. lstsq) is called. Instead "projection 408 | vector" p_i_j is calculated, and y_fit[i] = sum(p_i_j * y[j]) = y_fit[i] 409 | for j s.t. x[j] is in the neighborhood of x[i]. p_i_j is a function of 410 | the weights, x[i], and its neighbors. 
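    Explicitly, with the weights normalized to sum to one over the
    neighborhood (as done in calculate_weights):

        xbar_w   = sum_j weights[j] * x[j]
        var_w    = sum_j weights[j] * (x[j] - xbar_w)**2
        p_i_j    = weights[j] * (1 + (x[i] - xbar_w) * (x[j] - xbar_w) / var_w)
        y_fit[i] = sum_j p_i_j * y[j]

    This is the closed-form fitted value at x[i] of the weighted
    least-squares line through the neighborhood points.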
411 | ''' 412 | 413 | cdef: 414 | double sum_weighted_x = 0, weighted_sqdev_x = 0, p_i_j 415 | 416 | if reg_ok == False: 417 | y_fit[i] = y[i] 418 | else: 419 | for j in xrange(left_end, right_end): 420 | sum_weighted_x += weights[j] * x[j] 421 | for j in xrange(left_end, right_end): 422 | weighted_sqdev_x += weights[j] * (x[j] - sum_weighted_x) ** 2 423 | for j in xrange(left_end, right_end): 424 | p_i_j = weights[j] * (1.0 + (x[i] - sum_weighted_x) * 425 | (x[j] - sum_weighted_x) / weighted_sqdev_x) 426 | y_fit[i] += p_i_j * y[j] 427 | 428 | cdef void interpolate_skipped_fits(np.ndarray[DTYPE_t, ndim = 1] x, 429 | np.ndarray[DTYPE_t, ndim = 1] y_fit, 430 | Py_ssize_t i, 431 | Py_ssize_t last_fit_i): 432 | ''' 433 | Calculate smoothed/fitted y by linear interpolation between the current 434 | and previous y fitted by weighted regression. 435 | Called only if delta > 0. 436 | 437 | Parameters 438 | ---------- 439 | x: 1-D numpy array 440 | The vector of input x-values. 441 | y_fit: 1-D numpy array 442 | The vector of fitted y-values 443 | i: indexing integer 444 | The index of the point currently being fit by weighted 445 | regression. 446 | last_fit_i: indexing integer 447 | The index of the last point fit by weighted regression. 448 | 449 | Returns 450 | ------- 451 | Nothing: changes elements of y_fit in-place. 452 | ''' 453 | 454 | cdef np.ndarray[DTYPE_t, ndim = 1] a 455 | 456 | a = x[(last_fit_i + 1): i] - x[last_fit_i] 457 | a = a / (x[i] - x[last_fit_i]) 458 | y_fit[(last_fit_i + 1): i] = a * y_fit[i] + (1.0 - a) * y_fit[last_fit_i] 459 | 460 | 461 | def update_indices(np.ndarray[DTYPE_t, ndim = 1] x, 462 | np.ndarray[DTYPE_t, ndim = 1] y_fit, 463 | double delta, 464 | Py_ssize_t i, 465 | Py_ssize_t n, 466 | Py_ssize_t last_fit_i): 467 | ''' 468 | Update the counters of the local regression. 469 | 470 | Parameters 471 | ---------- 472 | x: 1-D numpy array 473 | The vector of input x-values. 474 | y_fit: 1-D numpy array 475 | The vector of fitted y-values 476 | delta: float 477 | Indicates the range of x values within which linear 478 | interpolation should be used to estimate y_fit instead 479 | of weighted regression. 480 | i: indexing integer 481 | The index of the current point being fit. 482 | n: indexing integer 483 | The length of the input vectors, x and y. 484 | last_fit_i: indexing integer 485 | The last point at which y_fit was calculated. 486 | 487 | Returns 488 | ------- 489 | i: indexing integer 490 | The next point at which to run a weighted regression. 491 | last_fit_i: indexing integer 492 | The updated last point at which y_fit was calculated 493 | 494 | Notes 495 | ----- 496 | The relationship between the outputs is s.t. x[i+1] > 497 | x[last_fit_i] + delta. 498 | 499 | ''' 500 | cdef: 501 | Py_ssize_t k 502 | double cutpoint 503 | 504 | last_fit_i = i 505 | # For most points within delta of the current point, we skip the 506 | # weighted linear regression (which save much computation of 507 | # weights and fitted points). Instead, we'll jump to the last 508 | # point within delta, fit the weighted regression at that point, 509 | # and linearly interpolate in between. 510 | 511 | # This loop increments until we fall just outside of delta distance, 512 | # copying the results for any repeated x's along the way. 
513 | cutpoint = x[last_fit_i] + delta 514 | for k in range(last_fit_i + 1, n): 515 | if x[k] > cutpoint: 516 | break 517 | if x[k] == x[last_fit_i]: 518 | # if tied with previous x-value, just use the already 519 | # fitted y, and update the last-fit counter. 520 | y_fit[k] = y_fit[last_fit_i] 521 | last_fit_i = k 522 | 523 | # i, which indicates the next point to fit the regression at, is 524 | # either one prior to k (since k should be the first point outside 525 | # of delta) or is just incremented + 1 if k = i+1. This insures we 526 | # always step forward. 527 | i = max(k-1, last_fit_i + 1) 528 | 529 | return i, last_fit_i 530 | 531 | 532 | def calculate_residual_weights(np.ndarray[DTYPE_t, ndim = 1] y, 533 | np.ndarray[DTYPE_t, ndim = 1] y_fit): 534 | ''' 535 | Calculate residual weights for the next `robustifying` iteration. 536 | 537 | Parameters 538 | ---------- 539 | y: 1-D numpy array 540 | The vector of actual input y-values. 541 | y_fit: 1-D numpy array 542 | The vector of fitted y-values from the current 543 | iteration. 544 | 545 | Returns 546 | ------- 547 | resid_weights: 1-D numpy array 548 | The vector of residual weights, to be used in the 549 | next iteration of regressions. 550 | ''' 551 | 552 | std_resid = np.abs(y - y_fit) 553 | std_resid /= 6.0 * np.median(std_resid) 554 | 555 | # Some trimming of outlier residuals. 556 | std_resid[std_resid >= 1.0] = 1.0 557 | #std_resid[std_resid >= 0.999] = 1.0 558 | #std_resid[std_resid <= 0.001] = 0.0 559 | 560 | resid_weights = bisquare(std_resid) 561 | 562 | return resid_weights 563 | 564 | 565 | cdef void tricube(np.ndarray[DTYPE_t, ndim = 1] x): 566 | ''' 567 | The tri-cubic function (1 - x**3)**3. Used to weight neighboring 568 | points along the x-axis based on their distance to the current point. 569 | 570 | Parameters 571 | ---------- 572 | x: 1-D numpy array 573 | A vector of neighbors` distances from the current point, 574 | in units of the neighborhood radius. 575 | 576 | Returns 577 | ------- 578 | Nothing. Changes array elements in-place 579 | ''' 580 | 581 | # fast_array_cube is an elementwise, in-place cubed-power 582 | # operator. 583 | fast_array_cube(x) 584 | x[:] = np.negative(x) 585 | x += 1 586 | fast_array_cube(x) 587 | 588 | 589 | cdef void fast_array_cube(np.ndarray[DTYPE_t, ndim = 1] x): 590 | ''' 591 | A fast, elementwise, in-place cube operator. Called by the 592 | tricube function. 593 | 594 | Parameters 595 | ---------- 596 | x: 1-D numpy array 597 | 598 | Returns 599 | ------- 600 | Nothing. Changes array elements in-place. 601 | ''' 602 | 603 | x2 = x*x 604 | x *= x2 605 | 606 | 607 | def bisquare(np.ndarray[DTYPE_t, ndim = 1] x): 608 | ''' 609 | The bi-square function (1 - x**2)**2. 610 | 611 | Used to weight the residuals in the `robustifying` 612 | iterations. Called by the calculate_residual_weights function. 613 | 614 | Parameters 615 | ---------- 616 | x: 1-D numpy array 617 | A vector of absolute regression residuals, in units of 618 | 6 times the median absolute residual. 619 | 620 | Returns 621 | ------- 622 | A 1-D numpy array of residual weights. 
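
    For example, bisquare(np.array([0.0, 0.5, 1.0])) evaluates elementwise
    to [1.0, 0.5625, 0.0].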
623 | ''' 624 | 625 | return (1.0 - x**2)**2 626 | 627 | 628 | 629 | -------------------------------------------------------------------------------- /MLFH/CH2/lowess work/cylowess.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carljv/Will_it_Python/5feb80e4023d73bff12040c5aa5f14177794d459/MLFH/CH2/lowess work/cylowess.so -------------------------------------------------------------------------------- /MLFH/CH2/lowess work/mcycle.csv: -------------------------------------------------------------------------------- 1 | "times","accel" 2 | 2.4,0 3 | 2.6,-1.3 4 | 3.2,-2.7 5 | 3.6,0 6 | 4,-2.7 7 | 6.2,-2.7 8 | 6.6,-2.7 9 | 6.8,-1.3 10 | 7.8,-2.7 11 | 8.2,-2.7 12 | 8.8,-1.3 13 | 8.8,-2.7 14 | 9.6,-2.7 15 | 10,-2.7 16 | 10.2,-5.4 17 | 10.6,-2.7 18 | 11,-5.4 19 | 11.4,0 20 | 13.2,-2.7 21 | 13.6,-2.7 22 | 13.8,0 23 | 14.6,-13.3 24 | 14.6,-5.4 25 | 14.6,-5.4 26 | 14.6,-9.3 27 | 14.6,-16 28 | 14.6,-22.8 29 | 14.8,-2.7 30 | 15.4,-22.8 31 | 15.4,-32.1 32 | 15.4,-53.5 33 | 15.4,-54.9 34 | 15.6,-40.2 35 | 15.6,-21.5 36 | 15.8,-21.5 37 | 15.8,-50.8 38 | 16,-42.9 39 | 16,-26.8 40 | 16.2,-21.5 41 | 16.2,-50.8 42 | 16.2,-61.7 43 | 16.4,-5.4 44 | 16.4,-80.4 45 | 16.6,-59 46 | 16.8,-71 47 | 16.8,-91.1 48 | 16.8,-77.7 49 | 17.6,-37.5 50 | 17.6,-85.6 51 | 17.6,-123.1 52 | 17.6,-101.9 53 | 17.8,-99.1 54 | 17.8,-104.4 55 | 18.6,-112.5 56 | 18.6,-50.8 57 | 19.2,-123.1 58 | 19.4,-85.6 59 | 19.4,-72.3 60 | 19.6,-127.2 61 | 20.2,-123.1 62 | 20.4,-117.9 63 | 21.2,-134 64 | 21.4,-101.9 65 | 21.8,-108.4 66 | 22,-123.1 67 | 23.2,-123.1 68 | 23.4,-128.5 69 | 24,-112.5 70 | 24.2,-95.1 71 | 24.2,-81.8 72 | 24.6,-53.5 73 | 25,-64.4 74 | 25,-57.6 75 | 25.4,-72.3 76 | 25.4,-44.3 77 | 25.6,-26.8 78 | 26,-5.4 79 | 26.2,-107.1 80 | 26.2,-21.5 81 | 26.4,-65.6 82 | 27,-16 83 | 27.2,-45.6 84 | 27.2,-24.2 85 | 27.2,9.5 86 | 27.6,4 87 | 28.2,12 88 | 28.4,-21.5 89 | 28.4,37.5 90 | 28.6,46.9 91 | 29.4,-17.4 92 | 30.2,36.2 93 | 31,75 94 | 31.2,8.1 95 | 32,54.9 96 | 32,48.2 97 | 32.8,46.9 98 | 33.4,16 99 | 33.8,45.6 100 | 34.4,1.3 101 | 34.8,75 102 | 35.2,-16 103 | 35.2,-54.9 104 | 35.4,69.6 105 | 35.6,34.8 106 | 35.6,32.1 107 | 36.2,-37.5 108 | 36.2,22.8 109 | 38,46.9 110 | 38,10.7 111 | 39.2,5.4 112 | 39.4,-1.3 113 | 40,-21.5 114 | 40.4,-13.3 115 | 41.6,30.8 116 | 41.6,-10.7 117 | 42.4,29.4 118 | 42.8,0 119 | 42.8,-10.7 120 | 43,14.7 121 | 44,-1.3 122 | 44.4,0 123 | 45,10.7 124 | 46.6,10.7 125 | 47.8,-26.8 126 | 47.8,-14.7 127 | 48.8,-13.3 128 | 50.6,0 129 | 52,10.7 130 | 53.2,-14.7 131 | 55,-2.7 132 | 55,10.7 133 | 55.4,-2.7 134 | 57.6,10.7 135 | -------------------------------------------------------------------------------- /MLFH/CH2/lowess work/r_lowess_d0_it3_f0-01.csv: -------------------------------------------------------------------------------- 1 | "x","y" 2 | 2.4,-1.05527911306819 3 | 2.6,-1.12191546213901 4 | 3.2,-1.3160810589629 5 | 3.6,-1.44228622638434 6 | 4,-1.56622833055018 7 | 6.2,-2.20535256156392 8 | 6.6,-2.30960353192232 9 | 6.8,-2.34085489228434 10 | 7.8,-2.42829907254918 11 | 8.2,-2.50088600503397 12 | 8.8,-2.62763282647147 13 | 8.8,-2.62763282647147 14 | 9.6,-2.89106318406413 15 | 10,-2.94940884219566 16 | 10.2,-2.96474357854881 17 | 10.6,-2.98800310967419 18 | 11,-2.91489861298946 19 | 11.4,-2.7869325850868 20 | 13.2,-1.44815598334248 21 | 13.6,-3.08732715163511 22 | 13.8,-4.47915905205275 23 | 14.6,-10.8068835803928 24 | 14.6,-10.8068835803928 25 | 14.6,-10.8068835803928 26 | 14.6,-10.8068835803928 27 | 14.6,-10.8068835803928 28 | 
14.6,-10.8068835803928 29 | 14.8,-16.4802488804481 30 | 15.4,-35.044804079601 31 | 15.4,-35.044804079601 32 | 15.4,-35.044804079601 33 | 15.4,-35.044804079601 34 | 15.6,-36.2024699857126 35 | 15.6,-36.2024699857126 36 | 15.8,-34.2946110366992 37 | 15.8,-34.2946110366992 38 | 16,-40.9623781610697 39 | 16,-40.9623781610697 40 | 16.2,-49.8894820126662 41 | 16.2,-49.8894820126662 42 | 16.2,-49.8894820126662 43 | 16.4,-59.1575903657072 44 | 16.4,-59.1575903657072 45 | 16.6,-69.2618040410694 46 | 16.8,-78.3672548349392 47 | 16.8,-78.3672548349392 48 | 16.8,-78.3672548349392 49 | 17.6,-99.0626269349997 50 | 17.6,-99.0626269349997 51 | 17.6,-99.0626269349997 52 | 17.6,-99.0626269349997 53 | 17.8,-102.23913076535 54 | 17.8,-102.23913076535 55 | 18.6,-105.652890355726 56 | 18.6,-105.652890355726 57 | 19.2,-109.287141504408 58 | 19.4,-110.86031629994 59 | 19.4,-110.86031629994 60 | 19.6,-112.453088494228 61 | 20.2,-114.301112171188 62 | 20.4,-114.649557476194 63 | 21.2,-117.076859204605 64 | 21.4,-117.783792010231 65 | 21.8,-118.57266795491 66 | 22,-118.123275062322 67 | 23.2,-110.462590434712 68 | 23.4,-112.245220421091 69 | 24,-98.6622216657911 70 | 24.2,-91.0193902327628 71 | 24.2,-91.0193902327628 72 | 24.6,-75.5437345924428 73 | 25,-59.3264242523324 74 | 25,-59.3264242523324 75 | 25.4,-45.3652057885971 76 | 25.4,-45.3652057885971 77 | 25.6,-39.9193022836345 78 | 26,-29.4898561808369 79 | 26.2,-26.7335833651572 80 | 26.2,-26.7335833651572 81 | 26.4,-24.7308823089427 82 | 27,-17.3461923374273 83 | 27.2,-13.9877368356593 84 | 27.2,-13.9877368356593 85 | 27.2,-13.9877368356593 86 | 27.6,-3.81544929300075 87 | 28.2,10.0630237568212 88 | 28.4,11.8459033194807 89 | 28.4,11.8459033194807 90 | 28.6,13.7410371926982 91 | 29.4,23.9840770224533 92 | 30.2,30.6043362580197 93 | 31,37.192489862103 94 | 31.2,38.4974329391848 95 | 32,40.2184164284705 96 | 32,40.2184164284705 97 | 32.8,38.4881342223577 98 | 33.4,32.0477723999532 99 | 33.8,30.279266730721 100 | 34.4,28.1763302530371 101 | 34.8,27.8686331549754 102 | 35.2,27.2079004830731 103 | 35.2,27.2079004830731 104 | 35.4,27.1338969773874 105 | 35.6,27.3449931413974 106 | 35.6,27.3449931413974 107 | 36.2,26.7215917837038 108 | 36.2,26.7215917837038 109 | 38,16.0933068920218 110 | 38,16.0933068920218 111 | 39.2,4.44817897699259 112 | 39.4,2.73128847555066 113 | 40,2.0724405389235 114 | 40.4,0.78752966654204 115 | 41.6,0.313882972235793 116 | 41.6,0.313882972235793 117 | 42.4,2.80166660884346 118 | 42.8,3.41308814919302 119 | 42.8,3.41308814919302 120 | 43,3.91731033265016 121 | 44,4.82547410488896 122 | 44.4,4.3557567860508 123 | 45,2.35094708701136 124 | 46.6,-5.35459332063613 125 | 47.8,-6.18598516585209 126 | 47.8,-6.18598516585209 127 | 48.8,-5.60403115908023 128 | 50.6,-5.426800456096 129 | 52,-3.43429857282505 130 | 53.2,-1.06348431706449 131 | 55,2.40078257807028 132 | 55,2.40078257807028 133 | 55.4,3.1498821640819 134 | 57.6,7.23208018602103 135 | -------------------------------------------------------------------------------- /MLFH/CH2/lowess work/results/test_lowess_delta.csv: -------------------------------------------------------------------------------- 1 | "x","y","out_0","out_Rdef","out_1" 2 | 2.4,0,-1.05527911306819,-1.0552790539899,-1.05527936004381 3 | 2.6,-1.3,-1.12191546213901,-1.1219153861107,-1.12048006786341 4 | 3.2,-2.7,-1.3160810589629,-1.31608092863007,-1.3160821913222 5 | 3.6,0,-1.44228622638434,-1.44228605698195,-1.44115634878145 6 | 4,-2.7,-1.56622833055018,-1.56622811612462,-1.5662305062407 7 | 
6.2,-2.7,-2.20535256156392,-2.2053517538187,-2.20536232244474 8 | 6.6,-2.7,-2.30960353192232,-2.30960281009849,-2.2956961032938 9 | 6.8,-1.3,-2.34085489228434,-2.34085423588152,-2.34086299371832 10 | 7.8,-2.7,-2.42829907254918,-2.42829915868062,-2.42829392886562 11 | 8.2,-2.7,-2.50088600503397,-2.50088668179744,-2.50802002039905 12 | 8.8,-1.3,-2.62763282647147,-2.62763400103173,-2.6276091576992 13 | 8.8,-2.7,-2.62763282647147,-2.62763400103173,-2.6276091576992 14 | 9.6,-2.7,-2.89106318406413,-2.89106462018415,-2.89101322886983 15 | 10,-2.7,-2.94940884219566,-2.94940971405603,-2.92978739771462 16 | 10.2,-5.4,-2.96474357854881,-2.96474407294923,-2.94917448213701 17 | 10.6,-2.7,-2.98800310967419,-2.98800285418974,-2.98794865098179 18 | 11,-5.4,-2.91489861298946,-2.91489711566446,-2.88744453034241 19 | 11.4,0,-2.7869325850868,-2.78692822156943,-2.78694040970302 20 | 13.2,-2.7,-1.44815598334248,-1.44816310225492,-1.44385084369714 21 | 13.6,-2.7,-3.08732715163511,-3.08737540800837,-3.49090558955723 22 | 13.8,0,-4.47915905205275,-4.47922582871376,-4.51443296248728 23 | 14.6,-13.3,-10.8068835803928,-10.8066936421153,-13.6350258781699 24 | 14.6,-5.4,-10.8068835803928,-10.8066936421153,-13.6350258781699 25 | 14.6,-5.4,-10.8068835803928,-10.8066936421153,-13.6350258781699 26 | 14.6,-9.3,-10.8068835803928,-10.8066936421153,-13.6350258781699 27 | 14.6,-16,-10.8068835803928,-10.8066936421153,-13.6350258781699 28 | 14.6,-22.8,-10.8068835803928,-10.8066936421153,-13.6350258781699 29 | 14.8,-2.7,-16.4802488804481,-16.4780229300172,-15.9151741070906 30 | 15.4,-22.8,-35.044804079601,-34.9874445170015,-26.8581148667903 31 | 15.4,-32.1,-35.044804079601,-34.9874445170015,-26.8581148667903 32 | 15.4,-53.5,-35.044804079601,-34.9874445170015,-26.8581148667903 33 | 15.4,-54.9,-35.044804079601,-34.9874445170015,-26.8581148667903 34 | 15.6,-40.2,-36.2024699857126,-34.6266687507356,-30.5057617866902 35 | 15.6,-21.5,-36.2024699857126,-34.6266687507356,-30.5057617866902 36 | 15.8,-21.5,-34.2946110366992,-34.2658929844696,-34.1534087065902 37 | 15.8,-50.8,-34.2946110366992,-34.2658929844696,-34.1534087065902 38 | 16,-42.9,-40.9623781610697,-42.2168368598067,-42.9852599021164 39 | 16,-26.8,-40.9623781610697,-42.2168368598067,-42.9852599021164 40 | 16.2,-21.5,-49.8894820126662,-50.1677807351438,-51.8171110976426 41 | 16.2,-50.8,-49.8894820126662,-50.1677807351438,-51.8171110976426 42 | 16.2,-61.7,-49.8894820126662,-50.1677807351438,-51.8171110976426 43 | 16.4,-5.4,-59.1575903657072,-59.7887394693938,-60.6489622931688 44 | 16.4,-80.4,-59.1575903657072,-59.7887394693938,-60.6489622931688 45 | 16.6,-59,-69.2618040410694,-69.409698203644,-69.4808134886951 46 | 16.8,-71,-78.3672548349392,-78.3786544205479,-78.3126646842213 47 | 16.8,-91.1,-78.3672548349392,-78.3786544205479,-78.3126646842213 48 | 16.8,-77.7,-78.3672548349392,-78.3786544205479,-78.3126646842213 49 | 17.6,-37.5,-99.0626269349997,-99.0685638620475,-97.3394791491682 50 | 17.6,-85.6,-99.0626269349997,-99.0685638620475,-97.3394791491682 51 | 17.6,-123.1,-99.0626269349997,-99.0685638620475,-97.3394791491682 52 | 17.6,-101.9,-99.0626269349997,-99.0685638620475,-97.3394791491682 53 | 17.8,-99.1,-102.23913076535,-102.236800187791,-102.096182765405 54 | 17.8,-104.4,-102.23913076535,-102.236800187791,-102.096182765405 55 | 18.6,-112.5,-105.652890355726,-105.636714940334,-107.06377585755 56 | 18.6,-50.8,-105.652890355726,-105.636714940334,-107.06377585755 57 | 19.2,-123.1,-109.287141504408,-109.260051015558,-111.331289827006 58 | 
19.4,-85.6,-110.86031629994,-110.847084622306,-112.753794483491 59 | 19.4,-72.3,-110.86031629994,-110.847084622306,-112.753794483491 60 | 19.6,-127.2,-112.453088494228,-112.434118229053,-114.176299139977 61 | 20.2,-123.1,-114.301112171188,-114.288741776628,-115.294787242616 62 | 20.4,-117.9,-114.649557476194,-114.639002102959,-115.667616610162 63 | 21.2,-134,-117.076859204605,-117.074078653111,-117.439853956824 64 | 21.4,-101.9,-117.783792010231,-117.782236984255,-117.88291329349 65 | 21.8,-108.4,-118.57266795491,-118.572741489722,-117.917368868909 66 | 22,-123.1,-118.123275062322,-118.123617157085,-117.934596656619 67 | 23.2,-123.1,-110.462590434712,-110.462791016035,-109.655283790486 68 | 23.4,-128.5,-112.245220421091,-112.24649450208,-105.882365747401 69 | 24,-112.5,-98.6622216657911,-98.6598772073171,-94.5636116181439 70 | 24.2,-95.1,-91.0193902327628,-91.0153647609239,-90.7906935750583 71 | 24.2,-81.8,-91.0193902327628,-91.0153647609239,-90.7906935750583 72 | 24.6,-53.5,-75.5437345924428,-75.5363085811589,-75.2533262502218 73 | 25,-64.4,-59.3264242523324,-59.3082711941104,-59.7159589253854 74 | 25,-57.6,-59.3264242523324,-59.3082711941104,-59.7159589253854 75 | 25.4,-72.3,-45.3652057885971,-45.321493380937,-47.9266649780034 76 | 25.4,-44.3,-45.3652057885971,-45.321493380937,-47.9266649780034 77 | 25.6,-26.8,-39.9193022836345,-39.8534675789927,-42.0320180043123 78 | 26,-5.4,-29.4898561808369,-29.3822440526094,-30.2427240569304 79 | 26.2,-107.1,-26.7335833651572,-27.0200668442082,-27.6230906229388 80 | 26.2,-21.5,-26.7335833651572,-27.0200668442082,-27.6230906229388 81 | 26.4,-65.6,-24.7308823089427,-24.657889635807,-25.0034571889472 82 | 27,-16,-17.3461923374273,-17.2960223519001,-17.1445568869724 83 | 27.2,-45.6,-13.9877368356593,-13.9480365878035,-12.6879000955758 84 | 27.2,-24.2,-13.9877368356593,-13.9480365878035,-12.6879000955758 85 | 27.2,9.5,-13.9877368356593,-13.9480365878035,-12.6879000955758 86 | 27.6,4,-3.81544929300075,-3.67654074668076,-3.77458651278248 87 | 28.2,12,10.0630237568212,10.4292037083406,6.63623368639569 88 | 28.4,-21.5,11.8459033194807,12.2458133097196,10.1065070861218 89 | 28.4,37.5,11.8459033194807,12.2458133097196,10.1065070861218 90 | 28.6,46.9,13.7410371926982,14.0624229110986,13.5767804858479 91 | 29.4,-17.4,23.9840770224533,24.2444925515626,23.8402789020684 92 | 30.2,36.2,30.6043362580197,30.7927655869127,30.5082837506196 93 | 31,75,37.192489862103,37.2590800936589,37.0155606615369 94 | 31.2,8.1,38.4974329391848,38.5529243858765,38.6423798892662 95 | 32,54.9,40.2184164284705,40.232311787479,40.281652114669 96 | 32,48.2,40.2184164284705,40.232311787479,40.281652114669 97 | 32.8,46.9,38.4881342223577,38.4812312203676,38.5320434495723 98 | 33.4,16,32.0477723999532,32.0422229920712,33.4707146803741 99 | 33.8,45.6,30.279266730721,30.2882445697795,30.0964955009087 100 | 34.4,1.3,28.1763302530371,28.1869576448825,28.6794222266994 101 | 34.8,75,27.8686331549754,27.8855044158556,27.7347067105599 102 | 35.2,-16,27.2079004830731,27.2331604231248,27.4330593860529 103 | 35.2,-54.9,27.2079004830731,27.2331604231248,27.4330593860529 104 | 35.4,69.6,27.1338969773874,27.3081574180076,27.2822357237994 105 | 35.6,34.8,27.3449931413974,27.3831544128905,27.1314120615459 106 | 35.6,32.1,27.3449931413974,27.3831544128905,27.1314120615459 107 | 36.2,-37.5,26.7215917837038,26.7559570747085,26.4861424298851 108 | 36.2,22.8,26.7215917837038,26.7559570747085,26.4861424298851 109 | 38,46.9,16.0933068920218,16.1088030693815,15.7779538866494 110 | 
38,10.7,16.0933068920218,16.1088030693815,15.7779538866494 111 | 39.2,5.4,4.44817897699259,4.45785328546281,4.1703815590451 112 | 39.4,-1.3,2.73128847555066,2.74017845367171,3.56657548153721 113 | 40,-21.5,2.0724405389235,2.08082249298943,1.75515724901349 114 | 40.4,-13.3,0.78752966654204,0.793816963606541,0.497712583774448 115 | 41.6,30.8,0.313882972235793,0.319118527216493,0.0449134512399733 116 | 41.6,-10.7,0.313882972235793,0.319118527216493,0.0449134512399733 117 | 42.4,29.4,2.80166660884346,2.80941754119285,2.49038076772333 118 | 42.8,0,3.41308814919302,3.41955522868013,3.28652493927628 119 | 42.8,-10.7,3.41308814919302,3.41955522868013,3.28652493927628 120 | 43,14.7,3.91731033265016,3.92318149881277,3.68459702505276 121 | 44,-1.3,4.82547410488896,4.82758500069282,4.73372662415776 122 | 44.4,0,4.3557567860508,4.35704640194155,3.77318910001817 123 | 45,10.7,2.35094708701136,2.35103961376254,2.33238281380878 124 | 46.6,10.7,-5.35459332063613,-5.35567701965494,-5.33767485769193 125 | 47.8,-26.8,-6.18598516585209,-6.18756429261028,-6.15238275133211 126 | 47.8,-14.7,-6.18598516585209,-6.18756429261028,-6.15238275133211 127 | 48.8,-13.3,-5.60403115908023,-5.60516373426295,-5.5753369618273 128 | 50.6,0,-5.426800456096,-5.42746556080406,-5.40869771635579 129 | 52,10.7,-3.43429857282505,-3.43472280019316,-3.42261780701641 130 | 53.2,-14.7,-1.06348431706449,-1.06373551506761,-1.0562068163286 131 | 55,-2.7,2.40078257807028,2.40073516891229,2.40254352926014 132 | 55,10.7,2.40078257807028,2.40073516891229,2.40254352926014 133 | 55.4,-2.7,3.1498821640819,3.14987339944585,3.15053668341895 134 | 57.6,10.7,7.23208018602103,7.23224479871991,7.22733167377529 135 | -------------------------------------------------------------------------------- /MLFH/CH2/lowess work/results/test_lowess_frac.csv: -------------------------------------------------------------------------------- 1 | "x","y","out_2_3","out_1_5" 2 | -6.28318530717959,1.62379338,1.80466114483605,1.74422043564428 3 | -5.84986218254651,-0.698604608439735,1.61311618057005,1.78085424101695 4 | -5.41653905791344,2.36301878512764,1.4261750242551,1.82442301835773 5 | -4.98321593328036,1.38351347251922,1.24562324374046,1.77486834294242 6 | -4.54989280864729,1.69579406254153,1.07265257431088,1.39031437952162 7 | -4.11656968401421,1.02040307815689,0.90788868621548,1.00001423202428 8 | -3.68324655938114,0.565281617177021,0.750838284637633,0.548573857702114 9 | -3.24992343474806,-0.115994541576058,0.599502542547986,0.0610231514603004 10 | -2.81660031011499,-0.13775271013598,0.449824578142944,-0.317172771478911 11 | -2.38327718548191,-1.32421916885342,0.296704884287402,-0.4862311730223 12 | -1.94995406084884,-0.279552579816791,0.208794666058654,-0.451464938932888 13 | -1.51663093621576,-3.26363167385112,0.140876587028018,-0.248131840307566 14 | -1.08330781158269,-0.0833224044460227,0.0962675820781971,-0.0237961837298486 15 | -0.649984686949612,0.293094354806235,0.0786932125300421,0.104998065423058 16 | -0.216661562316538,-0.306331490211024,0.0909461637774299,0.397335441132786 17 | 0.216661562316538,1.01979942021102,0.128317937081397,0.363728456246864 18 | 0.649984686949613,2.15022107519377,0.179230771330068,0.233437948119581 19 | 1.08330781158269,-0.353834385553977,0.222979906140634,0.0891119257588428 20 | 1.51663093621576,-0.167194126148876,0.24201292117444,-0.038080427447538 21 | 1.94995406084884,1.20925362981679,0.227104044749631,0.114430117144529 22 | 2.38327718548191,-0.164216371146577,0.182597181372592,0.301514517320983 23 | 
2.81660031011499,0.52347598013598,0.0922474352764859,0.33827506687755 24 | 3.24992343474806,0.502229041576058,0.00966894001873829,0.370870495189447 25 | 3.68324655938114,0.167419892822978,-0.0694426066734743,0.297525445305924 26 | 4.11656968401421,0.629382671843109,-0.148737586456007,-0.0025753083081115 27 | 4.54989280864729,-0.535255802541526,-0.230416513848043,-0.331549145238025 28 | 4.98321593328036,-2.10024621251922,-0.316002091657721,-0.604601953026263 29 | 5.41653905791344,-0.847684595127637,-0.406022134137506,-0.747685790122756 30 | 5.84986218254651,-0.703574241560265,-0.500035363273341,-0.558951416131742 31 | 6.28318530717959,-0.17326155,-0.59701081650098,-0.308131362093249 32 | -------------------------------------------------------------------------------- /MLFH/CH2/lowess work/results/test_lowess_iter.csv: -------------------------------------------------------------------------------- 1 | "x","y","out_0","out_3" 2 | 0,1.86299605,0.626447948304564,1.10919399651889 3 | 1,0.89183134,1.50083963634256,1.96623384153416 4 | 2,3.87761229,2.38617619262433,2.82234369576482 5 | 3,-0.63442237,3.2716390241964,3.67416606752392 6 | 4,4.30249022,4.13972663751248,4.51531636960336 7 | 5,6.03560416,4.99266140022312,5.34832051645261 8 | 6,6.21163349,5.90622249996935,6.2127611583589 9 | 7,8.14167809,6.85414647844424,7.0371035908847 10 | 8,7.99631825,7.81633581357042,7.88238440682891 11 | 9,6.91191013,8.66846618267192,8.70367831271047 12 | 10,10.13065417,9.53212152727725,9.56987287315289 13 | 11,9.1947793,10.4655376106265,10.5011237562793 14 | 12,12.60404596,11.4696917739989,11.4924301925592 15 | 13,10.69091796,12.612670577977,12.6180333553907 16 | 14,15.7081412,13.8080457514041,13.8056705212656 17 | 15,14.45366757,14.9355218408992,14.928079110753 18 | 16,15.06892052,16.0491183613157,16.0363681324567 19 | 17,18.79023999,17.1604998952365,17.1426206340736 20 | 18,19.05822445,18.2739171975973,18.2516511312687 21 | 19,17.95469436,19.3834268539226,19.3581200947664 22 | -------------------------------------------------------------------------------- /MLFH/CH2/lowess work/results/test_lowess_r_outputs.R: -------------------------------------------------------------------------------- 1 | # test_lowess_r_output.R 2 | # 3 | # Generate outputs for unit tests 4 | # for lowess function in cylowess.pyx 5 | # 6 | # May 2012 7 | # 8 | 9 | # test_simple 10 | x_simple = 0:19 11 | # Standard Normal noise 12 | noise_simple = c(-0.76741118, -0.30754369, 13 | 0.39950921, -0.46352422, -1.67081778, 14 | 0.6595567 , 0.66367639, -2.04388585, 15 | 0.8123281 , 1.45977518, 16 | 1.21428038, 1.29296866, 0.78028477, 17 | -0.2402853 , -0.21721302, 18 | 0.24549405, 0.25987014, -0.90709034, 19 | -1.45688216, -0.31780505) 20 | 21 | y_simple = x_simple + noise_simple 22 | 23 | out_simple = lowess(x_simple, y_simple, delta = 0, iter = 3) 24 | 25 | 26 | # test_iter 27 | x_iter = 0:19 28 | # Cauchy noise 29 | noise_iter = c(1.86299605, -0.10816866, 1.87761229, 30 | -3.63442237, 0.30249022, 31 | 1.03560416, 0.21163349, 1.14167809, 32 | -0.00368175, -2.08808987, 33 | 0.13065417, -1.8052207 , 0.60404596, 34 | -2.30908204, 1.7081412 , 35 | -0.54633243, -0.93107948, 1.79023999, 36 | 1.05822445, -1.04530564) 37 | 38 | y_iter = x_iter + noise_iter 39 | 40 | out_iter_0 = lowess(x_iter, y_iter, delta = 0, iter = 0) 41 | out_iter_3 = lowess(x_iter, y_iter, delta = 0, iter = 3) 42 | 43 | 44 | # test_frac 45 | x_frac = seq(-2*pi, 2*pi, length = 30) 46 | 47 | # normal noise 48 | noise_frac = c(1.62379338, -1.11849371, 1.60085673, 49 | 0.41996348, 
0.70896754, 50 | 0.19271408, 0.04972776, -0.22411356, 51 | 0.18154882, -0.63651971, 52 | 0.64942414, -2.26509826, 0.80018964, 53 | 0.89826857, -0.09136105, 54 | 0.80482898, 1.54504686, -1.23734643, 55 | -1.16572754, 0.28027691, 56 | -0.85191583, 0.20417445, 0.61034806, 57 | 0.68297375, 1.45707167, 58 | 0.45157072, -1.13669622, -0.08552254, 59 | -0.28368514, -0.17326155) 60 | 61 | y_frac = sin(x_frac) + noise_frac 62 | 63 | out_frac_2_3 = lowess(x_frac, y_frac, f = 2/3, delta = 0, iter = 3) 64 | out_frac_1_5 = lowess(x_frac, y_frac, f = 1/5, delta = 0, iter = 3) 65 | 66 | 67 | # test_delta 68 | # Load mcycle motorcycle collision data 69 | library(MASS) 70 | data(mcycle) 71 | 72 | out_delta_0 = lowess(mcycle, f = 0.1, delta = 0.0) 73 | out_delta_Rdef = lowess(mcycle, f = 0.1) 74 | out_delta_1 = lowess(mcycle, f = 0.1, delta = 1.0) 75 | 76 | 77 | # Create data frames of inputs and outputs, write them to CSV to be imported 78 | # by test_lowess.py 79 | 80 | df_test_simple = data.frame(x = x_simple, y = y_simple, out = out_simple$y) 81 | df_test_frac = data.frame(x = x_frac, y = y_frac, 82 | out_2_3 = out_frac_2_3$y, out_1_5 = out_frac_1_5$y) 83 | df_test_iter = data.frame(x = x_iter, y = y_iter, out_0 = out_iter_0$y, 84 | out_3 = out_iter_3$y) 85 | df_test_delta = data.frame(x = mcycle$times, y = mcycle$accel, 86 | out_0 = out_delta_0$y, out_Rdef = out_delta_Rdef$y, 87 | out_1 = out_delta_1$y) 88 | 89 | 90 | write.csv(df_test_simple, "test_lowess_simple.csv", row.names = FALSE) 91 | write.csv(df_test_frac, "test_lowess_frac.csv", row.names = FALSE) 92 | write.csv(df_test_iter, "test_lowess_iter.csv", row.names = FALSE) 93 | write.csv(df_test_delta, "test_lowess_delta.csv", row.names = FALSE) 94 | -------------------------------------------------------------------------------- /MLFH/CH2/lowess work/results/test_lowess_simple.csv: -------------------------------------------------------------------------------- 1 | "x","y","out" 2 | 0,-0.76741118,-0.626034455349546 3 | 1,0.69245631,0.56507171201094 4 | 2,2.39950921,1.75962718897954 5 | 3,2.53647578,2.95796332584499 6 | 4,2.32918222,4.15606361537761 7 | 5,5.6595567,5.34733969366442 8 | 6,6.66367639,6.52229821799894 9 | 7,4.95611415,7.70815938803622 10 | 8,8.8123281,8.87590555190343 11 | 9,10.45977518,9.940975860297 12 | 10,11.21428038,10.8981138457948 13 | 11,12.29296866,11.7851424727769 14 | 12,12.78028477,12.6188717296918 15 | 13,12.7597147,13.409849737403 16 | 14,13.78278698,14.1516996584552 17 | 15,15.24549405,14.9180658146586 18 | 16,16.25987014,15.695660019874 19 | 17,16.09290966,16.4783034134255 20 | 18,16.54311784,17.2617441530539 21 | 19,18.68219495,18.0459201716397 22 | -------------------------------------------------------------------------------- /MLFH/CH2/lowess work/setup.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Setup file for cylowess.pyx, a faster lowess smoother in Cython. 3 | ''' 4 | 5 | from distutils.core import setup 6 | from distutils.extension import Extension 7 | from Cython.Distutils import build_ext 8 | import numpy as np 9 | 10 | 11 | 12 | setup( 13 | cmdclass = {'build_ext' : build_ext}, 14 | include_dirs = [np.get_include()], 15 | ext_modules = [Extension('cylowess', ['cylowess.pyx'])] 16 | ) 17 | 18 | -------------------------------------------------------------------------------- /MLFH/CH2/lowess work/test_lowess.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Lowess testing suite. 
3 | 4 | Expected outcomes are generate by R's lowess function given the same 5 | arguments. The R script test_lowess_r_outputs.R can be used to 6 | generate the expected outcomes. 7 | 8 | The delta tests utilize Silverman's motorcycle collision data, 9 | available in R's MASS package. 10 | ''' 11 | 12 | import os 13 | import numpy as np 14 | from numpy.testing import assert_almost_equal 15 | 16 | import cylowess 17 | lowess = cylowess.lowess 18 | 19 | # Number of decimals to test equality with. 20 | # The default is 7. 21 | testdec = 7 22 | curdir = os.path.dirname(os.path.abspath(__file__)) 23 | rpath = os.path.join(curdir, 'results') 24 | 25 | class TestLowess(object): 26 | 27 | def test_simple(self): 28 | rfile = os.path.join(rpath, 'test_lowess_simple.csv') 29 | test_data = np.genfromtxt(open(rfile, 'r'), 30 | delimiter = ',', names = True) 31 | expected_lowess = np.array([test_data['x'], test_data['out']]).T 32 | 33 | actual_lowess = lowess(test_data['y'], test_data['x']) 34 | 35 | assert_almost_equal(expected_lowess, actual_lowess, decimal = testdec) 36 | 37 | 38 | def test_iter(self): 39 | rfile = os.path.join(rpath, 'test_lowess_iter.csv') 40 | test_data = np.genfromtxt(open(rfile, 'r'), 41 | delimiter = ',', names = True) 42 | 43 | expected_lowess_no_iter = np.array([test_data['x'], test_data['out_0']]).T 44 | expected_lowess_3_iter = np.array([test_data['x'], test_data['out_3']]).T 45 | 46 | actual_lowess_no_iter = lowess(test_data['y'], test_data['x'], it = 0) 47 | actual_lowess_3_iter = lowess(test_data['y'], test_data['x'], it = 3) 48 | 49 | assert_almost_equal(expected_lowess_no_iter, actual_lowess_no_iter, decimal = testdec) 50 | assert_almost_equal(expected_lowess_3_iter, actual_lowess_3_iter, decimal = testdec) 51 | 52 | 53 | def test_frac(self): 54 | rfile = os.path.join(rpath, 'test_lowess_frac.csv') 55 | test_data = np.genfromtxt(open(rfile, 'r'), 56 | delimiter = ',', names = True) 57 | 58 | expected_lowess_23 = np.array([test_data['x'], test_data['out_2_3']]).T 59 | expected_lowess_15 = np.array([test_data['x'], test_data['out_1_5']]).T 60 | 61 | actual_lowess_23 = lowess(test_data['y'], test_data['x'] ,frac = 2./3) 62 | actual_lowess_15 = lowess(test_data['y'], test_data['x'] ,frac = 1./5) 63 | 64 | assert_almost_equal(expected_lowess_23, actual_lowess_23, decimal = testdec) 65 | assert_almost_equal(expected_lowess_15, actual_lowess_15, decimal = testdec) 66 | 67 | 68 | def test_delta(self): 69 | rfile = os.path.join(rpath, 'test_lowess_delta.csv') 70 | test_data = np.genfromtxt(open(rfile, 'r'), 71 | delimiter = ',', names = True) 72 | 73 | expected_lowess_del0 = np.array([test_data['x'], test_data['out_0']]).T 74 | expected_lowess_delRdef = np.array([test_data['x'], test_data['out_Rdef']]).T 75 | expected_lowess_del1 = np.array([test_data['x'], test_data['out_1']]).T 76 | 77 | actual_lowess_del0 = lowess(test_data['y'], test_data['x'], frac = 0.1) 78 | actual_lowess_delRdef = lowess(test_data['y'], test_data['x'], frac = 0.1, 79 | delta = 0.01 * np.ptp(test_data['x'])) 80 | actual_lowess_del1 = lowess(test_data['y'], test_data['x'], frac = 0.1, delta = 1.0) 81 | 82 | assert_almost_equal(expected_lowess_del0, actual_lowess_del0, decimal = testdec) 83 | assert_almost_equal(expected_lowess_delRdef, actual_lowess_delRdef, decimal = testdec) 84 | assert_almost_equal(expected_lowess_del1, actual_lowess_del1, decimal = testdec) 85 | 86 | 87 | if __name__ == "__main__": 88 | import nose 89 | nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--noexe'], exit=False) 90 | 
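
# ---------------------------------------------------------------------------
# Illustrative usage sketch (added commentary; not part of the original test
# suite). It assumes the cylowess extension in this directory has been built
# with setup.py. As the tests above show, lowess() takes the y values first,
# then x, and returns an (n, 2) array whose columns are the sorted x values
# and the fitted y values. The `out_Rdef` case in test_delta reproduces R's
# default delta, 0.01 * diff(range(x)), via 0.01 * np.ptp(x).
def _example_usage():
    x = np.linspace(0.0, 10.0, 200)
    y = x + np.random.normal(scale=1.0, size=200)
    # frac: fraction of points used in each local regression.
    # it: number of robustifying iterations that downweight outliers.
    return lowess(y, x, frac=2./3, it=3)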
91 | -------------------------------------------------------------------------------- /MLFH/CH2/lowess work/test_lowess_r_output.R: -------------------------------------------------------------------------------- 1 | # test_lowess_r_output.R 2 | # 3 | # Generate outputs for unit tests 4 | # for lowess function in cylowess.pyx 5 | # 6 | # May 2012 7 | # 8 | 9 | # test_simple 10 | x = 0:19 11 | # Standard Normal noise 12 | noise = c(-0.76741118, -0.30754369, 13 | 0.39950921, -0.46352422, -1.67081778, 14 | 0.6595567 , 0.66367639, -2.04388585, 15 | 0.8123281 , 1.45977518, 16 | 1.21428038, 1.29296866, 0.78028477, 17 | -0.2402853 , -0.21721302, 18 | 0.24549405, 0.25987014, -0.90709034, 19 | -1.45688216, -0.31780505) 20 | 21 | y = x + noise 22 | 23 | test.simple.out = lowess(x, y, delta = 0, iter = 3) 24 | 25 | # Print comma separated results (to paste into test file) 26 | print("Simple test outputs") 27 | paste(round(test.simple.out$y, 10), collapse = ", ") 28 | 29 | 30 | # test_iter 31 | x = 0:19 32 | # Cauchy noise 33 | noise = c(1.86299605, -0.10816866, 1.87761229, 34 | -3.63442237, 0.30249022, 35 | 1.03560416, 0.21163349, 1.14167809, 36 | -0.00368175, -2.08808987, 37 | 0.13065417, -1.8052207 , 0.60404596, 38 | -2.30908204, 1.7081412 , 39 | -0.54633243, -0.93107948, 1.79023999, 40 | 1.05822445, -1.04530564) 41 | 42 | y = x + noise 43 | 44 | test.no.iter.out = lowess(x, y, delta = 0, iter = 0) 45 | test.3.iter.out = lowess(x, y, delta = 0, iter = 3) 46 | 47 | print("Iter test outputs") 48 | paste(round(test.no.iter.out$y, 10), collapse = ", ") 49 | paste(round(test.3.iter.out$y, 10), collapse = ", ") 50 | 51 | 52 | # test_frac 53 | x = seq(-2*pi, 2*pi, length = 30) 54 | 55 | # normal noise 56 | noise = c( 1.62379338, -1.11849371, 1.60085673, 57 | 0.41996348, 0.70896754, 58 | 0.19271408, 0.04972776, -0.22411356, 59 | 0.18154882, -0.63651971, 60 | 0.64942414, -2.26509826, 0.80018964, 61 | 0.89826857, -0.09136105, 62 | 0.80482898, 1.54504686, -1.23734643, 63 | -1.16572754, 0.28027691, 64 | -0.85191583, 0.20417445, 0.61034806, 65 | 0.68297375, 1.45707167, 66 | 0.45157072, -1.13669622, -0.08552254, 67 | -0.28368514, -0.17326155) 68 | 69 | y = sin(x) + noise 70 | 71 | frac.2_3.out = lowess(x, y, f = 2/3, delta = 0, iter = 3) 72 | frac.1_5.out = lowess(x, y, f = 1/5, delta = 0, iter = 3) 73 | 74 | print("Frac test outputs") 75 | paste(round(frac.2_3.out$y, 10), collapse=", ") 76 | paste(round(frac.1_5.out$y, 10), collapse=", ") 77 | 78 | 79 | # test_delta 80 | # Load mcycle motorcycle collision data 81 | library(MASS) 82 | data(mcycle) 83 | ##### 84 | ##### 85 | delta.0.out = lowess(mcycle, f = 0.1, delta = 0.0) 86 | delta.default.out = lowess(mcycle, f = 0.1) 87 | delta.1.out = lowess(mcycle, f = 0.1, delta = 1.0) 88 | 89 | print("mcycle x values") 90 | paste(mcycle$times, collapse = ", ") 91 | 92 | print("mcycle y values") 93 | paste(mcycle$accel, collapse = ", ") 94 | 95 | print("Delta test outputs") 96 | paste(round(delta.0.out$y, 10), collapse = ", ") 97 | 98 | paste(round(delta.default.out$y, 10), collapse = ", ") 99 | 100 | paste(round(delta.1.out$y, 10), collapse = ", ") 101 | -------------------------------------------------------------------------------- /MLFH/CH2/weight_density_bysex.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carljv/Will_it_Python/5feb80e4023d73bff12040c5aa5f14177794d459/MLFH/CH2/weight_density_bysex.png -------------------------------------------------------------------------------- 
/MLFH/CH2/weight_density_bysex_subplot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carljv/Will_it_Python/5feb80e4023d73bff12040c5aa5f14177794d459/MLFH/CH2/weight_density_bysex_subplot.png -------------------------------------------------------------------------------- /MLFH/CH3/ch3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "ch3" 4 | }, 5 | "nbformat": 2, 6 | "worksheets": [ 7 | { 8 | "cells": [ 9 | { 10 | "cell_type": "code", 11 | "collapsed": true, 12 | "input": [ 13 | "''''", 14 | "-------------------------------------------------------------------------------", 15 | "Filename : ch3.ipynb", 16 | "Date : 2012-06-17", 17 | "Author : C. Vogel", 18 | "Purpose : Replicate the naive Bayes e-mail classifier in Chapter 3 of ", 19 | " : _Machine Learning for Hackers_.", 20 | "Input Data : e-mail files, split into spam and ham (non-spam) folders are available ", 21 | " : at the book's github repository at https://github.com/johnmyleswhite/", 22 | " : ML_for_Hackers.git. This also uses r_stopwords.csv, a text file ", 23 | " : containing a list of stopwords used by R's tm package. This is used", 24 | " : to facilitate comparability with the results of the R analysis.", 25 | "Libraries : Numpy 1.6.1, Pandas 0.7.3, NLTK 2.0.1, textmining", 26 | "-------------------------------------------------------------------------------", 27 | "", 28 | "This notebook is a Python port of the R code in Chapter 3 of _Machine Learning", 29 | "for Hackers_ by D. Conway and J.M. White.", 30 | "", 31 | "E-mail files, split into folders classified as spam or ham (non-spam) should be located ", 32 | "in a /data/ subfolder of the working directory. See the paths defined just after the import", 33 | "statements below to see what directory structure this script requires. Copying complete", 34 | "data folder from the book's github repository should be sufficient.", 35 | "", 36 | "For a detailed description of the analysis and the process of porting it", 37 | "to Python, see: slendrmeans.wordpress.com/will-it-python.", 38 | "'''" 39 | ], 40 | "language": "python", 41 | "outputs": [] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "collapsed": true, 46 | "input": [ 47 | "import os", 48 | "import math", 49 | "import string", 50 | "import nltk", 51 | "from nltk.corpus import stopwords", 52 | "import numpy as np", 53 | "import textmining as txtm", 54 | "from pandas import *" 55 | ], 56 | "language": "python", 57 | "outputs": [], 58 | "prompt_number": 1 59 | }, 60 | { 61 | "cell_type": "code", 62 | "collapsed": true, 63 | "input": [ 64 | "# Directories with e-mail data", 65 | "# The spam and ham files are broken into multiple ", 66 | "# directories so as to separate training and evaluation data", 67 | "data_path = os.path.abspath(os.path.join('.', 'data'))", 68 | "spam_path = os.path.join(data_path, 'spam')", 69 | "spam2_path = os.path.join(data_path, 'spam_2') ", 70 | "easyham_path = os.path.join(data_path, 'easy_ham')", 71 | "easyham2_path = os.path.join(data_path, 'easy_ham_2')", 72 | "hardham_path = os.path.join(data_path, 'hard_ham')", 73 | "hardham2_path = os.path.join(data_path, 'hard_ham_2')" 74 | ], 75 | "language": "python", 76 | "outputs": [], 77 | "prompt_number": 2 78 | }, 79 | { 80 | "cell_type": "code", 81 | "collapsed": true, 82 | "input": [ 83 | "def get_msg(path):", 84 | " '''", 85 | " Read in the `message` portion of an e-mail, given", 86 | " its file path. 
The `message` text begins after the first", 87 | " blank line; above is header information.", 88 | "", 89 | " Returns a string.", 90 | " '''", 91 | " with open(path, 'rU') as con:", 92 | " msg = con.readlines()", 93 | " first_blank_index = msg.index('\\n')", 94 | " msg = msg[(first_blank_index + 1): ]", 95 | " return ''.join(msg) " 96 | ], 97 | "language": "python", 98 | "outputs": [], 99 | "prompt_number": 3 100 | }, 101 | { 102 | "cell_type": "code", 103 | "collapsed": true, 104 | "input": [ 105 | "def get_msgdir(path):", 106 | " '''", 107 | " Read all messages from files in a directory into", 108 | " a list where each item is the text of a message. ", 109 | " ", 110 | " Simply gets a list of e-mail files in a directory,", 111 | " and iterates `get_msg()` over them.", 112 | "", 113 | " Returns a list of strings.", 114 | " '''", 115 | " filelist = os.listdir(path)", 116 | " filelist = filter(lambda x: x != 'cmds', filelist)", 117 | " all_msgs =[get_msg(os.path.join(path, f)) for f in filelist]", 118 | " return all_msgs" 119 | ], 120 | "language": "python", 121 | "outputs": [], 122 | "prompt_number": 4 123 | }, 124 | { 125 | "cell_type": "code", 126 | "collapsed": true, 127 | "input": [ 128 | "# Get lists containing messages of each type.", 129 | "all_spam = get_msgdir(spam_path)", 130 | "all_easyham = get_msgdir(easyham_path)", 131 | "all_easyham = all_easyham[:500]", 132 | "all_hardham = get_msgdir(hardham_path)" 133 | ], 134 | "language": "python", 135 | "outputs": [], 136 | "prompt_number": 5 137 | }, 138 | { 139 | "cell_type": "code", 140 | "collapsed": true, 141 | "input": [ 142 | "# Get stopwords.", 143 | "# NLTK stopwords", 144 | "sw = stopwords.words('english')", 145 | "# Stopwords exported from the 'tm' library in R.", 146 | "rsw = read_csv('r_stopwords.csv')['x'].values.tolist() " 147 | ], 148 | "language": "python", 149 | "outputs": [], 150 | "prompt_number": 6 151 | }, 152 | { 153 | "cell_type": "code", 154 | "collapsed": true, 155 | "input": [ 156 | "def tdm_df(doclist, stopwords = [], remove_punctuation = True, ", 157 | " remove_digits = True, sparse_df = False):", 158 | " '''", 159 | " Create a term-document matrix from a list of e-mails.", 160 | " ", 161 | " Uses the TermDocumentMatrix function in the `textmining` module.", 162 | " But, pre-processes the documents to remove digits and punctuation,", 163 | " and post-processes to remove stopwords, to match the functionality ", 164 | " of R's `tm` package.", 165 | "", 166 | " NB: This is not particularly memory efficient and you can get memory ", 167 | " errors with an especially long list of documents.", 168 | "", 169 | " Returns a (by default, sparse) DataFrame. 
Each column is a term,", 170 | " each row is a document.", 171 | " '''", 172 | " ", 173 | " # Create the TDM from the list of documents.", 174 | " tdm = txtm.TermDocumentMatrix()", 175 | " ", 176 | " for doc in doclist:", 177 | " if remove_punctuation == True:", 178 | " doc = doc.translate(None, string.punctuation.translate(None, '\"'))", 179 | " if remove_digits == True:", 180 | " doc = doc.translate(None, string.digits)", 181 | " ", 182 | " tdm.add_doc(doc)", 183 | " ", 184 | " # Push the TDM data to a list of lists,", 185 | " # then make that an ndarray, which then", 186 | " # becomes a DataFrame.", 187 | " tdm_rows = []", 188 | " for row in tdm.rows(cutoff = 1):", 189 | " tdm_rows.append(row)", 190 | " ", 191 | " tdm_array = np.array(tdm_rows[1:])", 192 | " tdm_terms = tdm_rows[0]", 193 | " df = DataFrame(tdm_array, columns = tdm_terms)", 194 | " ", 195 | " # Remove stopwords from the dataset, manually.", 196 | " # TermDocumentMatrix does not do this for us.", 197 | " if len(stopwords) > 0:", 198 | " for col in df:", 199 | " if col in stopwords:", 200 | " del df[col]", 201 | " ", 202 | " if sparse_df == True:", 203 | " df.to_sparse(fill_value = 0)", 204 | " ", 205 | " return df" 206 | ], 207 | "language": "python", 208 | "outputs": [], 209 | "prompt_number": 7 210 | }, 211 | { 212 | "cell_type": "code", 213 | "collapsed": true, 214 | "input": [ 215 | "spam_tdm = tdm_df(all_spam, stopwords = rsw, sparse_df = True)" 216 | ], 217 | "language": "python", 218 | "outputs": [], 219 | "prompt_number": 8 220 | }, 221 | { 222 | "cell_type": "code", 223 | "collapsed": true, 224 | "input": [ 225 | "def make_term_df(tdm):", 226 | " '''", 227 | " Create a DataFrame that gives statistics for each term in a ", 228 | " Term Document Matrix.", 229 | "", 230 | " `frequency` is how often the term occurs across all documents.", 231 | " `density` is frequency normalized by the sum of all terms' frequencies.", 232 | " `occurrence` is the percent of documents that a term appears in.", 233 | "", 234 | " Returns a DataFrame, with an index of terms from the input TDM.", 235 | " '''", 236 | " term_df = DataFrame(tdm.sum(), columns = ['frequency'])", 237 | " term_df['density'] = term_df.frequency / float(term_df.frequency.sum())", 238 | " term_df['occurrence'] = tdm.apply(lambda x: np.sum((x > 0))) / float(tdm.shape[0])", 239 | " ", 240 | " return term_df.sort_index(by = 'occurrence', ascending = False)" 241 | ], 242 | "language": "python", 243 | "outputs": [], 244 | "prompt_number": 9 245 | }, 246 | { 247 | "cell_type": "code", 248 | "collapsed": true, 249 | "input": [ 250 | "spam_term_df = make_term_df(spam_tdm)" 251 | ], 252 | "language": "python", 253 | "outputs": [], 254 | "prompt_number": 10 255 | }, 256 | { 257 | "cell_type": "code", 258 | "collapsed": false, 259 | "input": [ 260 | "spam_term_df.head()" 261 | ], 262 | "language": "python", 263 | "outputs": [ 264 | { 265 | "html": [ 266 | "
", 267 | "", 268 | " ", 269 | " ", 270 | " ", 271 | " ", 272 | " ", 273 | " ", 274 | " ", 275 | " ", 276 | " ", 277 | " ", 278 | " ", 279 | " ", 280 | " ", 281 | " ", 282 | " ", 283 | " ", 284 | " ", 285 | " ", 286 | " ", 287 | " ", 288 | " ", 289 | " ", 290 | " ", 291 | " ", 292 | " ", 293 | " ", 294 | " ", 295 | " ", 296 | " ", 297 | " ", 298 | " ", 299 | " ", 300 | " ", 301 | " ", 302 | " ", 303 | " ", 304 | " ", 305 | " ", 306 | " ", 307 | " ", 308 | "
 frequency  density  occurrence
email 868 0.005907 0.576
please 453 0.003083 0.520
click 370 0.002518 0.450
list 422 0.002872 0.444
body 420 0.002858 0.410
", 309 | "
" 310 | ], 311 | "output_type": "pyout", 312 | "prompt_number": 11, 313 | "text": [ 314 | " frequency density occurrence", 315 | "email 868 0.005907 0.576", 316 | "please 453 0.003083 0.520", 317 | "click 370 0.002518 0.450", 318 | "list 422 0.002872 0.444", 319 | "body 420 0.002858 0.410" 320 | ] 321 | } 322 | ], 323 | "prompt_number": 11 324 | }, 325 | { 326 | "cell_type": "code", 327 | "collapsed": false, 328 | "input": [ 329 | "easyham_tdm = tdm_df(all_easyham, stopwords = rsw, sparse_df = True)" 330 | ], 331 | "language": "python", 332 | "outputs": [], 333 | "prompt_number": 12 334 | }, 335 | { 336 | "cell_type": "code", 337 | "collapsed": true, 338 | "input": [ 339 | "easyham_term_df = make_term_df(easyham_tdm)" 340 | ], 341 | "language": "python", 342 | "outputs": [], 343 | "prompt_number": 13 344 | }, 345 | { 346 | "cell_type": "code", 347 | "collapsed": false, 348 | "input": [ 349 | "easyham_term_df.head(6)" 350 | ], 351 | "language": "python", 352 | "outputs": [ 353 | { 354 | "html": [ 355 | "
", 356 | "", 357 | " ", 358 | " ", 359 | " ", 360 | " ", 361 | " ", 362 | " ", 363 | " ", 364 | " ", 365 | " ", 366 | " ", 367 | " ", 368 | " ", 369 | " ", 370 | " ", 371 | " ", 372 | " ", 373 | " ", 374 | " ", 375 | " ", 376 | " ", 377 | " ", 378 | " ", 379 | " ", 380 | " ", 381 | " ", 382 | " ", 383 | " ", 384 | " ", 385 | " ", 386 | " ", 387 | " ", 388 | " ", 389 | " ", 390 | " ", 391 | " ", 392 | " ", 393 | " ", 394 | " ", 395 | " ", 396 | " ", 397 | " ", 398 | " ", 399 | " ", 400 | " ", 401 | " ", 402 | " ", 403 | "
 frequency  density  occurrence
wrote 237 0.004052 0.378
list 248 0.004240 0.368
dont 241 0.004120 0.290
email 188 0.003214 0.276
subject 162 0.002770 0.270
time 188 0.003214 0.258
", 404 | "
" 405 | ], 406 | "output_type": "pyout", 407 | "prompt_number": 14, 408 | "text": [ 409 | " frequency density occurrence", 410 | "wrote 237 0.004052 0.378", 411 | "list 248 0.004240 0.368", 412 | "dont 241 0.004120 0.290", 413 | "email 188 0.003214 0.276", 414 | "subject 162 0.002770 0.270", 415 | "time 188 0.003214 0.258" 416 | ] 417 | } 418 | ], 419 | "prompt_number": 14 420 | }, 421 | { 422 | "cell_type": "code", 423 | "collapsed": true, 424 | "input": [ 425 | "def classify_email(msg, training_df, prior = 0.5, c = 1e-6):", 426 | " '''", 427 | " A conditional probability calculator for a naive Bayes e-mail", 428 | " classifier.", 429 | " Given an e-mail message and a training dataset, the classifier", 430 | " returns the log probability of observing the terms in the message if", 431 | " it were of the same class as the e-mails in the training set (spam/ham).", 432 | "", 433 | " NB: Log probabilities are used for this function, because the raw probabilities", 434 | " will be so small that underflow is a real risk. Calculating probability", 435 | " would require multiplying many occurrence probabilities -- p1 * p2 * ... * pN,", 436 | " where pi is often ~= 0. For log probability we can compute ln(p1) + ln(p2) +", 437 | " ... + ln(pN), where ln(pi) < 0 by a far. This will not affect the ordering ", 438 | " of probabilities (which is what we care about ultimately), but solves the ", 439 | " underflow risk. Cf. p. 89 of MLFH to see how small raw probability calculations", 440 | " can get, and an apparent underflow in row 4.", 441 | "", 442 | " Returns a log probability (float) between -Infty and +Infty.", 443 | " '''", 444 | " msg_tdm = tdm_df([msg])", 445 | " msg_freq = msg_tdm.sum()", 446 | " msg_match = list(set(msg_freq.index).intersection(set(training_df.index)))", 447 | " if len(msg_match) < 1:", 448 | " return math.log(prior) + math.log(c) * len(msg_freq)", 449 | " else:", 450 | " match_probs = training_df.occurrence[msg_match]", 451 | " return (math.log(prior) + np.log(match_probs).sum() ", 452 | " + math.log(c) * (len(msg_freq) - len(msg_match)))" 453 | ], 454 | "language": "python", 455 | "outputs": [], 456 | "prompt_number": 15 457 | }, 458 | { 459 | "cell_type": "code", 460 | "collapsed": true, 461 | "input": [ 462 | "hardham_spamtest = [classify_email(m, spam_term_df) for m in all_hardham]", 463 | "hardham_hamtest = [classify_email(m, easyham_term_df) for m in all_hardham]", 464 | "s_spam = np.array(hardham_spamtest) > np.array(hardham_hamtest)" 465 | ], 466 | "language": "python", 467 | "outputs": [], 468 | "prompt_number": 16 469 | }, 470 | { 471 | "cell_type": "code", 472 | "collapsed": true, 473 | "input": [ 474 | "def spam_classifier(msglist):", 475 | " '''", 476 | " The naive Bayes classifier. ", 477 | " Using spam and ham training datasets, use `classify_email()` to", 478 | " compute the conditional log probability of each e-mail in a list. 
", 479 | " Assign each e-mail to whichever class's training data returns the ", 480 | " highest probability.", 481 | "", 482 | " Returns a DataFrame with the conditional log probabilities and the", 483 | " class.", 484 | " '''", 485 | " spamprob = [classify_email(m, spam_term_df) for m in msglist]", 486 | " hamprob = [classify_email(m, easyham_term_df) for m in msglist]", 487 | " classify = np.where(np.array(spamprob) > np.array(hamprob), 'Spam', 'Ham')", 488 | " out_df = DataFrame({'pr_spam' : spamprob,", 489 | " 'pr_ham' : hamprob, ", 490 | " 'classify' : classify}, ", 491 | " columns = ['pr_spam', 'pr_ham', 'classify'])", 492 | " return out_df" 493 | ], 494 | "language": "python", 495 | "outputs": [], 496 | "prompt_number": 17 497 | }, 498 | { 499 | "cell_type": "code", 500 | "collapsed": true, 501 | "input": [ 502 | "def class_stats(df):", 503 | " return df.classify.value_counts() / float(len(df.classify))" 504 | ], 505 | "language": "python", 506 | "outputs": [], 507 | "prompt_number": 18 508 | }, 509 | { 510 | "cell_type": "code", 511 | "collapsed": false, 512 | "input": [ 513 | "hardham_classify = spam_classifier(all_hardham)", 514 | "class_stats(hardham_classify)" 515 | ], 516 | "language": "python", 517 | "outputs": [ 518 | { 519 | "output_type": "pyout", 520 | "prompt_number": 19, 521 | "text": [ 522 | "Spam 0.702811", 523 | "Ham 0.297189" 524 | ] 525 | } 526 | ], 527 | "prompt_number": 19 528 | }, 529 | { 530 | "cell_type": "code", 531 | "collapsed": false, 532 | "input": [ 533 | "hardham_classify.head()" 534 | ], 535 | "language": "python", 536 | "outputs": [ 537 | { 538 | "html": [ 539 | "
", 540 | "", 541 | " ", 542 | " ", 543 | " ", 544 | " ", 545 | " ", 546 | " ", 547 | " ", 548 | " ", 549 | " ", 550 | " ", 551 | " ", 552 | " ", 553 | " ", 554 | " ", 555 | " ", 556 | " ", 557 | " ", 558 | " ", 559 | " ", 560 | " ", 561 | " ", 562 | " ", 563 | " ", 564 | " ", 565 | " ", 566 | " ", 567 | " ", 568 | " ", 569 | " ", 570 | " ", 571 | " ", 572 | " ", 573 | " ", 574 | " ", 575 | " ", 576 | " ", 577 | " ", 578 | " ", 579 | " ", 580 | " ", 581 | "
 pr_spam  pr_ham  classify
 0  -3620.438622  -3628.335856  Spam
 1  -4961.487090  -5215.082391  Spam
 2   -374.143783   -393.897628  Spam
 3  -3190.000772  -3192.233073  Spam
 4  -9315.804062  -9404.738565  Spam
", 582 | "
" 583 | ], 584 | "output_type": "pyout", 585 | "prompt_number": 20, 586 | "text": [ 587 | " pr_spam pr_ham classify", 588 | "0 -3620.438622 -3628.335856 Spam", 589 | "1 -4961.487090 -5215.082391 Spam", 590 | "2 -374.143783 -393.897628 Spam", 591 | "3 -3190.000772 -3192.233073 Spam", 592 | "4 -9315.804062 -9404.738565 Spam" 593 | ] 594 | } 595 | ], 596 | "prompt_number": 20 597 | }, 598 | { 599 | "cell_type": "code", 600 | "collapsed": true, 601 | "input": [ 602 | "# Run the classifier on the evaluation e-mails in the ham2/spam2", 603 | "# directories.", 604 | "all_easyham2 = get_msgdir(easyham2_path)", 605 | "all_hardham2 = get_msgdir(hardham2_path)", 606 | "all_spam2 = get_msgdir(spam2_path)" 607 | ], 608 | "language": "python", 609 | "outputs": [], 610 | "prompt_number": 21 611 | }, 612 | { 613 | "cell_type": "code", 614 | "collapsed": false, 615 | "input": [ 616 | "# The classifier does a great job on easy ham.", 617 | "easyham2_classify = spam_classifier(all_easyham2)", 618 | "class_stats(easyham2_classify)" 619 | ], 620 | "language": "python", 621 | "outputs": [ 622 | { 623 | "output_type": "pyout", 624 | "prompt_number": 22, 625 | "text": [ 626 | "Ham 0.979286", 627 | "Spam 0.020714" 628 | ] 629 | } 630 | ], 631 | "prompt_number": 22 632 | }, 633 | { 634 | "cell_type": "code", 635 | "collapsed": false, 636 | "input": [ 637 | "# But it does a pretty bad job on hardham,", 638 | "# not surprisingly.", 639 | "hardham2_classify = spam_classifier(all_hardham2)", 640 | "class_stats(hardham2_classify)" 641 | ], 642 | "language": "python", 643 | "outputs": [ 644 | { 645 | "output_type": "pyout", 646 | "prompt_number": 23, 647 | "text": [ 648 | "Spam 0.693548", 649 | "Ham 0.306452" 650 | ] 651 | } 652 | ], 653 | "prompt_number": 23 654 | }, 655 | { 656 | "cell_type": "code", 657 | "collapsed": false, 658 | "input": [ 659 | "# It's also very accurate for spam.", 660 | "spam2_classify = spam_classifier(all_spam2)", 661 | "class_stats(spam2_classify)" 662 | ], 663 | "language": "python", 664 | "outputs": [ 665 | { 666 | "output_type": "pyout", 667 | "prompt_number": 24, 668 | "text": [ 669 | "Spam 0.969936", 670 | "Ham 0.030064" 671 | ] 672 | } 673 | ], 674 | "prompt_number": 24 675 | }, 676 | { 677 | "cell_type": "code", 678 | "collapsed": true, 679 | "input": [ 680 | "# These are are almost identical to results using the authors' R", 681 | "# script after modifying the classify.email() function to use log", 682 | "# probabilities.", 683 | "#", 684 | "# NOT SPAM SPAM", 685 | "# easyham2.col 0.97928571 0.02071429", 686 | "# hardham2.col 0.30241935 0.69758065", 687 | "# spam2.col 0.03006442 0.96993558" 688 | ], 689 | "language": "python", 690 | "outputs": [] 691 | } 692 | ] 693 | } 694 | ] 695 | } -------------------------------------------------------------------------------- /MLFH/CH3/ch3.py: -------------------------------------------------------------------------------- 1 | '''' 2 | ------------------------------------------------------------------------------- 3 | Filename : ch3.ipynb 4 | Date : 2012-06-17 5 | Author : C. Vogel 6 | Purpose : Replicate the naive Bayes e-mail classifier in Chapter 3 of 7 | : _Machine Learning for Hackers_. 8 | Input Data : e-mail files, split into spam and ham (non-spam) folders are available 9 | : at the book's github repository at https://github.com/johnmyleswhite/ 10 | : ML_for_Hackers.git. This also uses r_stopwords.csv, a text file 11 | : containing a list of stopwords used by R's tm package. 
This is used 12 | : to facilitate comparability with the results of the R analysis. 13 | Libraries : Numpy 1.6.1, Pandas 0.7.3, NLTK 2.0.1, textmining 14 | ------------------------------------------------------------------------------- 15 | 16 | This notebook is a Python port of the R code in Chapter 3 of _Machine Learning 17 | for Hackers_ by D. Conway and J.M. White. 18 | 19 | E-mail files, split into folders classified as spam or ham (non-spam) should be located 20 | in a /data/ subfolder of the working directory. See the paths defined just after the import 21 | statements below to see what directory structure this script requires. Copying complete 22 | data folder from the book's github repository should be sufficient. 23 | 24 | For a detailed description of the analysis and the process of porting it 25 | to Python, see: slendrmeans.wordpress.com/will-it-python. 26 | ''' 27 | 28 | import os 29 | import math 30 | import string 31 | import nltk 32 | from nltk.corpus import stopwords 33 | import numpy as np 34 | import textmining as txtm 35 | from pandas import * 36 | 37 | # Directories with e-mail data 38 | # The spam and ham files are broken into multiple 39 | # directories so as to separate training and evaluation data 40 | data_path = os.path.abspath(os.path.join('.', 'data')) 41 | spam_path = os.path.join(data_path, 'spam') 42 | spam2_path = os.path.join(data_path, 'spam_2') 43 | easyham_path = os.path.join(data_path, 'easy_ham') 44 | easyham2_path = os.path.join(data_path, 'easy_ham_2') 45 | hardham_path = os.path.join(data_path, 'hard_ham') 46 | hardham2_path = os.path.join(data_path, 'hard_ham_2') 47 | 48 | def get_msg(path): 49 | ''' 50 | Read in the `message` portion of an e-mail, given 51 | its file path. The `message` text begins after the first 52 | blank line; above is header information. 53 | 54 | Returns a string. 55 | ''' 56 | with open(path, 'rU') as con: 57 | msg = con.readlines() 58 | first_blank_index = msg.index('\n') 59 | msg = msg[(first_blank_index + 1): ] 60 | return ''.join(msg) 61 | 62 | def get_msgdir(path): 63 | ''' 64 | Read all messages from files in a directory into 65 | a list where each item is the text of a message. 66 | 67 | Simply gets a list of e-mail files in a directory, 68 | and iterates `get_msg()` over them. 69 | 70 | Returns a list of strings. 71 | ''' 72 | filelist = os.listdir(path) 73 | filelist = filter(lambda x: x != 'cmds', filelist) 74 | all_msgs =[get_msg(os.path.join(path, f)) for f in filelist] 75 | return all_msgs 76 | 77 | # Get lists containing messages of each type. 78 | all_spam = get_msgdir(spam_path) 79 | all_easyham = get_msgdir(easyham_path) 80 | all_easyham = all_easyham[:500] 81 | all_hardham = get_msgdir(hardham_path) 82 | 83 | # Get stopwords. 84 | # NLTK stopwords 85 | sw = stopwords.words('english') 86 | # Stopwords exported from the 'tm' library in R. 87 | rsw = read_csv('r_stopwords.csv')['x'].values.tolist() 88 | 89 | def tdm_df(doclist, stopwords = [], remove_punctuation = True, 90 | remove_digits = True, sparse_df = False): 91 | ''' 92 | Create a term-document matrix from a list of e-mails. 93 | 94 | Uses the TermDocumentMatrix function in the `textmining` module. 95 | But, pre-processes the documents to remove digits and punctuation, 96 | and post-processes to remove stopwords, to match the functionality 97 | of R's `tm` package. 98 | 99 | NB: This is not particularly memory efficient and you can get memory 100 | errors with an especially long list of documents. 
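
    Example (illustrative, added note):
        tdm = tdm_df(['the cat sat', 'the dog sat'], stopwords=['the'])
        # -> a 2-row DataFrame (one row per document) with a count
        #    column for each remaining term ('cat', 'dog', 'sat').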
101 | 102 | Returns a (by default, sparse) DataFrame. Each column is a term, 103 | each row is a document. 104 | ''' 105 | 106 | # Create the TDM from the list of documents. 107 | tdm = txtm.TermDocumentMatrix() 108 | 109 | for doc in doclist: 110 | if remove_punctuation == True: 111 | doc = doc.translate(None, string.punctuation.translate(None, '"')) 112 | if remove_digits == True: 113 | doc = doc.translate(None, string.digits) 114 | 115 | tdm.add_doc(doc) 116 | 117 | # Push the TDM data to a list of lists, 118 | # then make that an ndarray, which then 119 | # becomes a DataFrame. 120 | tdm_rows = [] 121 | for row in tdm.rows(cutoff = 1): 122 | tdm_rows.append(row) 123 | 124 | tdm_array = np.array(tdm_rows[1:]) 125 | tdm_terms = tdm_rows[0] 126 | df = DataFrame(tdm_array, columns = tdm_terms) 127 | 128 | # Remove stopwords from the dataset, manually. 129 | # TermDocumentMatrix does not do this for us. 130 | if len(stopwords) > 0: 131 | for col in df: 132 | if col in stopwords: 133 | del df[col] 134 | 135 | if sparse_df == True: 136 | df.to_sparse(fill_value = 0) 137 | 138 | return df 139 | 140 | spam_tdm = tdm_df(all_spam, stopwords = rsw, sparse_df = True) 141 | 142 | def make_term_df(tdm): 143 | ''' 144 | Create a DataFrame that gives statistics for each term in a 145 | Term Document Matrix. 146 | 147 | `frequency` is how often the term occurs across all documents. 148 | `density` is frequency normalized by the sum of all terms' frequencies. 149 | `occurrence` is the percent of documents that a term appears in. 150 | 151 | Returns a DataFrame, with an index of terms from the input TDM. 152 | ''' 153 | term_df = DataFrame(tdm.sum(), columns = ['frequency']) 154 | term_df['density'] = term_df.frequency / float(term_df.frequency.sum()) 155 | term_df['occurrence'] = tdm.apply(lambda x: np.sum((x > 0))) / float(tdm.shape[0]) 156 | 157 | return term_df.sort_index(by = 'occurrence', ascending = False) 158 | 159 | spam_term_df = make_term_df(spam_tdm) 160 | 161 | print 'Spam Training Set Term Statistics' 162 | print spam_term_df.head() 163 | 164 | easyham_tdm = tdm_df(all_easyham, stopwords = rsw, sparse_df = True) 165 | 166 | easyham_term_df = make_term_df(easyham_tdm) 167 | 168 | print 'Ham Training Set Term Statistics' 169 | print easyham_term_df.head(6) 170 | 171 | def classify_email(msg, training_df, prior = 0.5, c = 1e-6): 172 | ''' 173 | A conditional probability calculator for a naive Bayes e-mail 174 | classifier. 175 | Given an e-mail message and a training dataset, the classifier 176 | returns the log probability of observing the terms in the message if 177 | it were of the same class as the e-mails in the training set (spam/ham). 178 | 179 | NB: Log probabilities are used for this function, because the raw probabilities 180 | will be so small that underflow is a real risk. Calculating probability 181 | would require multiplying many occurrence probabilities -- p1 * p2 * ... * pN, 182 | where pi is often ~= 0. For log probability we can compute ln(p1) + ln(p2) + 183 | ... + ln(pN), where ln(pi) < 0 by a far. This will not affect the ordering 184 | of probabilities (which is what we care about ultimately), but solves the 185 | underflow risk. Cf. p. 89 of MLFH to see how small raw probability calculations 186 | can get, and an apparent underflow in row 4. 187 | 188 | Returns a log probability (float) between -Infty and +Infty. 
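
    Example (illustrative, added note): with the defaults prior = 0.5 and
    c = 1e-6, a message none of whose terms appear in the training set
    scores log(0.5) + N * log(1e-6), i.e. roughly -13.8 per unseen term
    for N unique terms -- a very poor match for that training class.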
189 | ''' 190 | msg_tdm = tdm_df([msg]) 191 | msg_freq = msg_tdm.sum() 192 | msg_match = list(set(msg_freq.index).intersection(set(training_df.index))) 193 | if len(msg_match) < 1: 194 | return math.log(prior) + math.log(c) * len(msg_freq) 195 | else: 196 | match_probs = training_df.occurrence[msg_match] 197 | return (math.log(prior) + np.log(match_probs).sum() 198 | + math.log(c) * (len(msg_freq) - len(msg_match))) 199 | 200 | hardham_spamtest = [classify_email(m, spam_term_df) for m in all_hardham] 201 | hardham_hamtest = [classify_email(m, easyham_term_df) for m in all_hardham] 202 | s_spam = np.array(hardham_spamtest) > np.array(hardham_hamtest) 203 | 204 | def spam_classifier(msglist): 205 | ''' 206 | The naive Bayes classifier. 207 | Using spam and ham training datasets, use `classify_email()` to 208 | compute the conditional log probability of each e-mail in a list. 209 | Assign each e-mail to whichever class's training data returns the 210 | highest probability. 211 | 212 | Returns a DataFrame with the conditional log probabilities and the 213 | class. 214 | ''' 215 | spamprob = [classify_email(m, spam_term_df) for m in msglist] 216 | hamprob = [classify_email(m, easyham_term_df) for m in msglist] 217 | classify = np.where(np.array(spamprob) > np.array(hamprob), 'Spam', 'Ham') 218 | out_df = DataFrame({'pr_spam' : spamprob, 219 | 'pr_ham' : hamprob, 220 | 'classify' : classify}, 221 | columns = ['pr_spam', 'pr_ham', 'classify']) 222 | return out_df 223 | 224 | def class_stats(df): 225 | return df.classify.value_counts() / float(len(df.classify)) 226 | 227 | hardham_classify = spam_classifier(all_hardham) 228 | print 'Hard Ham Classification Statistics (first set)' 229 | print class_stats(hardham_classify) 230 | 231 | print 'Hard Ham (first set) classification data' 232 | print hardham_classify.head() 233 | 234 | # Run the classifier on the evaluation e-mails in the ham2/spam2 235 | # directories. 236 | all_easyham2 = get_msgdir(easyham2_path) 237 | all_hardham2 = get_msgdir(hardham2_path) 238 | all_spam2 = get_msgdir(spam2_path) 239 | 240 | # The classifier does a great job on easy ham. 241 | easyham2_classify = spam_classifier(all_easyham2) 242 | print 'Easy Ham Classification Statistics' 243 | print class_stats(easyham2_classify) 244 | 245 | # But it does a pretty bad job on hardham, 246 | # not surprisingly. 247 | hardham2_classify = spam_classifier(all_hardham2) 248 | print 'Hard Ham Classification Statistics' 249 | print class_stats(hardham2_classify) 250 | 251 | # It's also very accurate for spam. 252 | spam2_classify = spam_classifier(all_spam2) 253 | print 'Spam Classification Statistics' 254 | print class_stats(spam2_classify) 255 | 256 | # These are are almost identical to results using the authors' R 257 | # script after modifying the classify.email() function to use log 258 | # probabilities. 
259 | # 260 | # NOT SPAM SPAM 261 | # easyham2.col 0.97928571 0.02071429 262 | # hardham2.col 0.30241935 0.69758065 263 | # spam2.col 0.03006442 0.96993558 264 | 265 | -------------------------------------------------------------------------------- /MLFH/CH3/ch3_nltk.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "ch3_nltk" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Building a Naive Bayes spam classifier with NLTK\n", 15 | "\n", 16 | "We'll follow the same logic as the program from chapter 3 of *Machine Learning for Hackers*, but we'll do so with a workflow more suited to NLTK's functions. So instead of creating a term-document matrix, and building our own Naive Bayes classifier, we'll build a `features` $\\rightarrow$ `label` association for each training e-mail, and feed a list of these to NLTK's `NaiveBayesClassifier` function.\n", 17 | "\n", 18 | "Some good references for this are:\n", 19 | "\n", 20 | "Bird, Steven and et. al., *Natural Language Processing with Python*\n", 21 | "\n", 22 | "Perkins, Jacob, *Python Text Processing with NLTK 2.0 Cookbook*\n", 23 | "\n", 24 | "\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "collapsed": false, 30 | "input": [ 31 | "from pandas import *\n", 32 | "import numpy as np\n", 33 | "import os\n", 34 | "import re\n", 35 | "from nltk import NaiveBayesClassifier\n", 36 | "import nltk.classify\n", 37 | "from nltk.tokenize import wordpunct_tokenize\n", 38 | "from nltk.corpus import stopwords\n", 39 | "from collections import defaultdict" 40 | ], 41 | "language": "python", 42 | "metadata": {}, 43 | "outputs": [], 44 | "prompt_number": 1 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## Loading the e-mail messages into lists\n", 51 | "\n", 52 | "E-mails of each type --spam, \"easy\" ham, and \"hard\" ham-- are split across two directories per type. We'll use the first directories of spam and \"easy\" ham to train the classifier. Then we'll test the classifier on the e-mails in the second directories." 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "collapsed": false, 58 | "input": [ 59 | "data_path = os.path.abspath(os.path.join('.', 'data'))\n", 60 | "spam_path = os.path.join(data_path, 'spam')\n", 61 | "spam2_path = os.path.join(data_path, 'spam_2') \n", 62 | "easyham_path = os.path.join(data_path, 'easy_ham')\n", 63 | "easyham2_path = os.path.join(data_path, 'easy_ham_2')\n", 64 | "hardham_path = os.path.join(data_path, 'hard_ham')\n", 65 | "hardham2_path = os.path.join(data_path, 'hard_ham_2')" 66 | ], 67 | "language": "python", 68 | "metadata": {}, 69 | "outputs": [], 70 | "prompt_number": 2 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "The following function loads all the e-mail files in a directory, extracts their message bodies and returns them in a list." 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "collapsed": false, 82 | "input": [ 83 | "def get_msgdir(path):\n", 84 | " '''\n", 85 | " Read all messages from files in a directory into\n", 86 | " a list where each item is the text of a message. 
\n", 87 | " \n", 88 | " Simply gets a list of e-mail files in a directory,\n", 89 | " and iterates get_msg() over them.\n", 90 | "\n", 91 | " Returns a list of strings.\n", 92 | " '''\n", 93 | " filelist = os.listdir(path)\n", 94 | " filelist = filter(lambda x: x != 'cmds', filelist)\n", 95 | " all_msgs =[get_msg(os.path.join(path, f)) for f in filelist]\n", 96 | " return all_msgs\n", 97 | "\n", 98 | "def get_msg(path):\n", 99 | " '''\n", 100 | " Read in the 'message' portion of an e-mail, given\n", 101 | " its file path. The 'message' text begins after the first\n", 102 | " blank line; above is header information.\n", 103 | "\n", 104 | " Returns a string.\n", 105 | " '''\n", 106 | " with open(path, 'rU') as con:\n", 107 | " msg = con.readlines()\n", 108 | " first_blank_index = msg.index('\\n')\n", 109 | " msg = msg[(first_blank_index + 1): ]\n", 110 | " return ''.join(msg) " 111 | ], 112 | "language": "python", 113 | "metadata": {}, 114 | "outputs": [], 115 | "prompt_number": 3 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "We'll use the functions to make training and testing message lists for each type of e-mail." 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "collapsed": false, 127 | "input": [ 128 | "train_spam_messages = get_msgdir(spam_path)\n", 129 | "train_easyham_messages = get_msgdir(easyham_path)\n", 130 | "# Only keep the first 500 to balance w/ number of spam messages.\n", 131 | "train_easyham_messages = train_easyham_messages[:500]\n", 132 | "train_hardham_messages = get_msgdir(hardham_path)\n", 133 | "\n", 134 | "test_spam_messages = get_msgdir(spam2_path)\n", 135 | "test_easyham_messages = get_msgdir(easyham2_path)\n", 136 | "test_hardham_messages = get_msgdir(hardham2_path)\n" 137 | ], 138 | "language": "python", 139 | "metadata": {}, 140 | "outputs": [], 141 | "prompt_number": 4 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "## Extracting word features from the e-mail messages\n", 148 | "\n", 149 | "Each e-mail in our classifier's training data will have a label (\"spam\" or \"ham\") and a feature set. For this application, we're just going to use a feature set that is just a set of the unique words in the e-mail. Below, we'll turn this into a dictionary to feed into the `NaiveBayesClassifier`, but first, let's get the set." 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "### Parsing and tokenizing the e-mails\n", 157 | "\n", 158 | "We're going to use NLTK's `wordpunct_tokenize` function to break the message into tokens. This splits tokens at white space and (most) punctuation marks, and returns the punctuation along with the tokens on each side. So `\"I don't know. Do you?\"` becomes `[\"I\", \"don\", \"'\", \"t\", \"know\", \".\", \"Do\", \"you\", \"?\"]`.\n", 159 | "\n", 160 | "If you look through some of the training e-mails in `train_spam_messages` and `train_ham_messages`, you'll notice a few features that make extracting words tricky.\n", 161 | "\n", 162 | "First, there are a couple of odd text artefacts. The string '3D' shows up in strange places in HTML attributes and other places, and we'll remove these. Furthermore there seem to be some mid-word line wraps flagged with an '=' where the word is broken across lines. For example, the work 'apple' might be split across lines like 'app=\\nle'. We want to strip these out so we can recover 'apple'. 
We'll want to deal with all these first, before we apply the tokenizer.\n",
163 | "\n",
164 | "Second, there's a lot of HTML in the messages. We'll have to decide first whether we want to keep HTML info in our set of words. If we do, we'll apply `wordpunct_tokenize` to some HTML, for example:\n",
165 | "\n",
166 | "`\"<HEAD></HEAD><BODY><!-- Comment -->\"`\n",
167 | "\n",
168 | "and it will tokenize to:\n",
169 | "\n",
170 | "`[\"<\", \"HEAD\", \"></\", \"HEAD\", \"><\", \"BODY\", \"><!--\", \"Comment\", \"-->\"]`\n",
171 | "\n",
172 | "So if we drop the punctuation tokens, and get the unique set of what remains, we'd have `{\"HEAD\", \"BODY\", \"Comment\"}`, which seems like what we'd want. For example, it's nice that this method doesn't make `<HEAD>` and `</HEAD>` separate words in our set, but just captures the existence of this tag with the term `\"HEAD\"`. It might be a problem that we won't distinguish between the HTML tag `<head>` and \"head\" used as an English word in the message. But for the moment I'm willing to bet that sort of conflation won't have a big effect on the classifier.\n",
173 | "\n",
174 | "If we don't want to count HTML information in our set of words, we can set the `strip_html` argument to `True`, and we'll take all the HTML tags out before tokenizing.\n",
175 | "\n",
176 | "Lastly, we'll strip out any \"stopwords\" from the set. Stopwords are highly common, low-information words, like \"a\", \"the\", \"he\", etc. Below I'll use `stopwords`, downloaded from NLTK's corpus library, with a couple of minor modifications to deal with this. (In other programs I've used the stopwords exported from R's `tm` package.)\n",
177 | "\n",
178 | "Note that because our tokenizer splits contractions (\"she'll\" $\\rightarrow$ \"she\", \"ll\"), we'd like to drop the ends (\"ll\"). Some of these may be picked up in NLTK's `stopwords` list; others we'll manually add. It's an imperfect but easy solution. There are more sophisticated ways of dealing with this, but they're overkill for our purposes.\n",
179 | "\n",
180 | "Tokenizing, as perhaps you can tell, is a non-trivial operation. NLTK has a host of other tokenizing functions of varying sophistication, and even lets you define your own tokenizing rule using a regex."
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "collapsed": false,
186 | "input": [
187 | "def get_msg_words(msg, stopwords = [], strip_html = False):\n",
188 | "    '''\n",
189 | "    Returns the set of unique words contained in an e-mail message. Excludes \n",
190 | "    any that are in an optionally-provided list. \n",
191 | "\n",
192 | "    NLTK's 'wordpunct' tokenizer is used, and this will break contractions.\n",
193 | "    For example, don't -> (don, ', t). Therefore, it's advisable to supply\n",
194 | "    a stopwords list that includes contraction parts, like 'don' and 't'.\n",
195 | "    '''\n",
196 | "    \n",
197 | "    # Strip out weird '3D' artefacts.\n",
198 | "    msg = re.sub('3D', '', msg)\n",
199 | "    \n",
200 | "    # Strip out html tags and attributes, and html character codes,\n",
201 | "    # like &nbsp; and &lt;.\n",
202 | "    if strip_html:\n",
203 | "        msg = re.sub('<(.|\\n)*?>', ' ', msg)\n",
204 | "        msg = re.sub('&\\w+;', ' ', msg)\n",
205 | "    \n",
206 | "    # wordpunct_tokenize doesn't split on underscores. We don't\n",
207 | "    # want to strip them, since the token first_name may be more informative\n",
208 | "    # than 'first' and 'name' apart. But there are tokens with long\n",
209 | "    # underscore strings (e.g. 'name_________'). 
We'll just replace the\n",
210 | "    # multiple underscores with a single one, since 'name_____' is probably\n",
211 | "    # not distinct from 'name___' or 'name_' in identifying spam.\n",
212 | "    msg = re.sub('_+', '_', msg)\n",
213 | "\n",
214 | "    # Note: remove '=' symbols before tokenizing, since these\n",
215 | "    # sometimes occur within words to indicate, e.g., line-wrapping.\n",
216 | "    msg_words = set(wordpunct_tokenize(msg.replace('=\\n', '').lower()))\n",
217 | "    \n",
218 | "    # Get rid of stopwords\n",
219 | "    msg_words = msg_words.difference(stopwords)\n",
220 | "    \n",
221 | "    # Get rid of punctuation tokens, numbers, and single letters.\n",
222 | "    msg_words = [w for w in msg_words if re.search('[a-zA-Z]', w) and len(w) > 1]\n",
223 | "    \n",
224 | "    return msg_words"
225 | ],
226 | "language": "python",
227 | "metadata": {},
228 | "outputs": [],
229 | "prompt_number": 5
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "metadata": {},
234 | "source": [
235 | "The stopwords list. While it contains some terms to account for contractions, we'll add a couple more."
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "collapsed": false,
241 | "input": [
242 | "sw = stopwords.words('english')\n",
243 | "sw.extend(['ll', 've'])"
244 | ],
245 | "language": "python",
246 | "metadata": {},
247 | "outputs": [],
248 | "prompt_number": 6
249 | },
250 | {
251 | "cell_type": "markdown",
252 | "metadata": {},
253 | "source": [
254 | "### Making a `(features, label)` list\n",
255 | "\n",
256 | "The `NaiveBayesClassifier` function trains on data that's of the form `[(features1, label1), (features2, label2), ..., (featuresN, labelN)]` where `featuresi` is a dictionary of features for e-mail `i` and `labeli` is the label for e-mail `i` (\"spam\" or \"ham\"). \n",
257 | "\n",
258 | "The function `features_from_messages` iterates through the messages creating this list, but calls an outside function to create the `features` for each e-mail. This makes the function modular in case we decide to try out some other method of extracting features from the e-mails besides the set of words. It then combines the features with the e-mail's label in a tuple and adds the tuple to the list.\n",
259 | "\n",
260 | "The `word_indicator` function calls `get_msg_words()` to get an e-mail's words as a set, then creates a dictionary with entries `{word: True}` for each word in the set. This is a little counter-intuitive (since we don't have `{word: False}` entries for words not in the set), but `NaiveBayesClassifier` knows how to handle it."
261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "collapsed": false, 266 | "input": [ 267 | "def features_from_messages(messages, label, feature_extractor, **kwargs):\n", 268 | " '''\n", 269 | " Make a (features, label) tuple for each message in a list of a certain,\n", 270 | " label of e-mails ('spam', 'ham') and return a list of these tuples.\n", 271 | "\n", 272 | " Note every e-mail in 'messages' should have the same label.\n", 273 | " '''\n", 274 | " features_labels = []\n", 275 | " for msg in messages:\n", 276 | " features = feature_extractor(msg, **kwargs)\n", 277 | " features_labels.append((features, label))\n", 278 | " return features_labels\n", 279 | "\n", 280 | "def word_indicator(msg, **kwargs):\n", 281 | " '''\n", 282 | " Create a dictionary of entries {word: True} for every unique\n", 283 | " word in a message.\n", 284 | "\n", 285 | " Note **kwargs are options to the word-set creator,\n", 286 | " get_msg_words().\n", 287 | " '''\n", 288 | " features = defaultdict(list)\n", 289 | " msg_words = get_msg_words(msg, **kwargs)\n", 290 | " for w in msg_words:\n", 291 | " features[w] = True\n", 292 | " return features" 293 | ], 294 | "language": "python", 295 | "metadata": {}, 296 | "outputs": [], 297 | "prompt_number": 7 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "## Training and evaluating the classifier" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "The following is just a helper function to make training and testing data from the messages. Notice we combine the training spam and training ham into a single set, since we need to train our classifier on data with both spam and ham in it." 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "collapsed": false, 316 | "input": [ 317 | "def make_train_test_sets(feature_extractor, **kwargs):\n", 318 | " '''\n", 319 | " Make (feature, label) lists for each of the training \n", 320 | " and testing lists.\n", 321 | " '''\n", 322 | " train_spam = features_from_messages(train_spam_messages, 'spam', \n", 323 | " feature_extractor, **kwargs)\n", 324 | " train_ham = features_from_messages(train_easyham_messages, 'ham', \n", 325 | " feature_extractor, **kwargs)\n", 326 | " train_set = train_spam + train_ham\n", 327 | "\n", 328 | " test_spam = features_from_messages(test_spam_messages, 'spam',\n", 329 | " feature_extractor, **kwargs)\n", 330 | "\n", 331 | " test_ham = features_from_messages(test_easyham_messages, 'ham',\n", 332 | " feature_extractor, **kwargs)\n", 333 | "\n", 334 | " test_hardham = features_from_messages(test_hardham_messages, 'ham',\n", 335 | " feature_extractor, **kwargs)\n", 336 | " \n", 337 | " return train_set, test_spam, test_ham, test_hardham" 338 | ], 339 | "language": "python", 340 | "metadata": {}, 341 | "outputs": [], 342 | "prompt_number": 8 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": {}, 347 | "source": [ 348 | "Finally we make a function to run the classifier and check its accuracy on test data. After training the classifier, we check how accurately it classifies data in new spam, \"easy\" ham, and \"hard\" ham datasets. \n", 349 | "\n", 350 | "The function then prints out the results of `NaiveBayesClassifiers`'s handy `show_most_informative_features` method. This shows which features are most unique to one label or another. 
For example, if \"viagra\" shows up in 500 of the spam e-mails, but only 2 of the \"ham\" e-mails in the training set, then the method will show that \"viagra\" is one of the most informative features with a `spam:ham` ratio of 250:1." 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "collapsed": false, 356 | "input": [ 357 | "def check_classifier(feature_extractor, **kwargs):\n", 358 | " '''\n", 359 | " Train the classifier on the training spam and ham, then check its accuracy\n", 360 | " on the test data, and show the classifier's most informative features.\n", 361 | " '''\n", 362 | " \n", 363 | " # Make training and testing sets of (features, label) data\n", 364 | " train_set, test_spam, test_ham, test_hardham = \\\n", 365 | " make_train_test_sets(feature_extractor, **kwargs)\n", 366 | " \n", 367 | " # Train the classifier on the training set\n", 368 | " classifier = NaiveBayesClassifier.train(train_set)\n", 369 | " \n", 370 | " # How accurate is the classifier on the test sets?\n", 371 | " print ('Test Spam accuracy: {0:.2f}%'\n", 372 | " .format(100 * nltk.classify.accuracy(classifier, test_spam)))\n", 373 | " print ('Test Ham accuracy: {0:.2f}%'\n", 374 | " .format(100 * nltk.classify.accuracy(classifier, test_ham)))\n", 375 | " print ('Test Hard Ham accuracy: {0:.2f}%'\n", 376 | " .format(100 * nltk.classify.accuracy(classifier, test_hardham)))\n", 377 | "\n", 378 | " # Show the top 20 informative features\n", 379 | " print classifier.show_most_informative_features(20)" 380 | ], 381 | "language": "python", 382 | "metadata": {}, 383 | "outputs": [], 384 | "prompt_number": 9 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": {}, 389 | "source": [ 390 | "First, we run the classifier keeping all the HTML information in the feature set. The accuracy at identifying spam and ham is very high. Unsurprisingly, we do a lousy job at identifying hard ham. \n", 391 | "\n", 392 | "This may be because our training set is relying too much on HTML tags to identify spam. As we can see, HTML info comprises all the `most_informative_features`." 
393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "collapsed": false, 398 | "input": [ 399 | "check_classifier(word_indicator, stopwords = sw)" 400 | ], 401 | "language": "python", 402 | "metadata": {}, 403 | "outputs": [ 404 | { 405 | "output_type": "stream", 406 | "stream": "stdout", 407 | "text": [ 408 | "Test Spam accuracy: 98.71%\n", 409 | "Test Ham accuracy: 97.07%" 410 | ] 411 | }, 412 | { 413 | "output_type": "stream", 414 | "stream": "stdout", 415 | "text": [ 416 | "\n", 417 | "Test Hard Ham accuracy: 13.71%" 418 | ] 419 | }, 420 | { 421 | "output_type": "stream", 422 | "stream": "stdout", 423 | "text": [ 424 | "\n", 425 | "Most Informative Features\n", 426 | " align = True spam : ham = 119.7 : 1.0" 427 | ] 428 | }, 429 | { 430 | "output_type": "stream", 431 | "stream": "stdout", 432 | "text": [ 433 | "\n", 434 | " tr = True spam : ham = 115.7 : 1.0\n", 435 | " td = True spam : ham = 111.7 : 1.0\n", 436 | " arial = True spam : ham = 107.7 : 1.0\n", 437 | " cellpadding = True spam : ham = 97.0 : 1.0\n", 438 | " cellspacing = True spam : ham = 94.3 : 1.0\n", 439 | " img = True spam : ham = 80.3 : 1.0\n", 440 | " bgcolor = True spam : ham = 67.4 : 1.0\n", 441 | " href = True spam : ham = 67.0 : 1.0\n", 442 | " sans = True spam : ham = 62.3 : 1.0\n", 443 | " colspan = True spam : ham = 61.0 : 1.0\n", 444 | " font = True spam : ham = 61.0 : 1.0\n", 445 | " valign = True spam : ham = 60.3 : 1.0\n", 446 | " br = True spam : ham = 59.6 : 1.0\n", 447 | " verdana = True spam : ham = 57.7 : 1.0\n", 448 | " nbsp = True spam : ham = 57.4 : 1.0\n", 449 | " color = True spam : ham = 54.4 : 1.0\n", 450 | " ff0000 = True spam : ham = 53.0 : 1.0\n", 451 | " ffffff = True spam : ham = 50.6 : 1.0\n", 452 | " border = True spam : ham = 49.6 : 1.0\n", 453 | "None\n" 454 | ] 455 | } 456 | ], 457 | "prompt_number": 10 458 | }, 459 | { 460 | "cell_type": "markdown", 461 | "metadata": {}, 462 | "source": [ 463 | "If we try just using the text of the messages, without the HTML tags and information, we lose a tiny bit of accuracy in identifying spam but do much better with the hard ham." 
464 | ] 465 | }, 466 | { 467 | "cell_type": "code", 468 | "collapsed": false, 469 | "input": [ 470 | "check_classifier(word_indicator, stopwords = sw, strip_html = True)" 471 | ], 472 | "language": "python", 473 | "metadata": {}, 474 | "outputs": [ 475 | { 476 | "output_type": "stream", 477 | "stream": "stdout", 478 | "text": [ 479 | "Test Spam accuracy: 96.64%\n", 480 | "Test Ham accuracy: 98.64%" 481 | ] 482 | }, 483 | { 484 | "output_type": "stream", 485 | "stream": "stdout", 486 | "text": [ 487 | "\n", 488 | "Test Hard Ham accuracy: 56.05%" 489 | ] 490 | }, 491 | { 492 | "output_type": "stream", 493 | "stream": "stdout", 494 | "text": [ 495 | "\n", 496 | "Most Informative Features\n", 497 | " dear = True spam : ham = 41.7 : 1.0" 498 | ] 499 | }, 500 | { 501 | "output_type": "stream", 502 | "stream": "stdout", 503 | "text": [ 504 | "\n", 505 | " aug = True ham : spam = 38.3 : 1.0\n", 506 | " guaranteed = True spam : ham = 35.0 : 1.0\n", 507 | " assistance = True spam : ham = 29.7 : 1.0\n", 508 | " groups = True ham : spam = 27.9 : 1.0\n", 509 | " mailings = True spam : ham = 25.0 : 1.0\n", 510 | " sincerely = True spam : ham = 23.0 : 1.0\n", 511 | " fill = True spam : ham = 23.0 : 1.0\n", 512 | " mortgage = True spam : ham = 21.7 : 1.0\n", 513 | " sir = True spam : ham = 21.0 : 1.0\n", 514 | " sponsor = True ham : spam = 20.3 : 1.0\n", 515 | " article = True ham : spam = 20.3 : 1.0\n", 516 | " assist = True spam : ham = 19.0 : 1.0\n", 517 | " income = True spam : ham = 18.6 : 1.0\n", 518 | " tue = True ham : spam = 18.3 : 1.0\n", 519 | " mails = True spam : ham = 18.3 : 1.0\n", 520 | " iso = True spam : ham = 17.7 : 1.0\n", 521 | " admin = True ham : spam = 17.7 : 1.0\n", 522 | " monday = True ham : spam = 17.7 : 1.0\n", 523 | " earn = True spam : ham = 17.0 : 1.0\n", 524 | "None\n" 525 | ] 526 | } 527 | ], 528 | "prompt_number": 11 529 | }, 530 | { 531 | "cell_type": "code", 532 | "collapsed": false, 533 | "input": [], 534 | "language": "python", 535 | "metadata": {}, 536 | "outputs": [] 537 | } 538 | ], 539 | "metadata": {} 540 | } 541 | ] 542 | } -------------------------------------------------------------------------------- /MLFH/CH3/r_stopwords.csv: -------------------------------------------------------------------------------- 1 | "x" 2 | "a" 3 | "about" 4 | "above" 5 | "across" 6 | "after" 7 | "again" 8 | "against" 9 | "all" 10 | "almost" 11 | "alone" 12 | "along" 13 | "already" 14 | "also" 15 | "although" 16 | "always" 17 | "am" 18 | "among" 19 | "an" 20 | "and" 21 | "another" 22 | "any" 23 | "anybody" 24 | "anyone" 25 | "anything" 26 | "anywhere" 27 | "are" 28 | "area" 29 | "areas" 30 | "aren't" 31 | "around" 32 | "as" 33 | "ask" 34 | "asked" 35 | "asking" 36 | "asks" 37 | "at" 38 | "away" 39 | "b" 40 | "back" 41 | "backed" 42 | "backing" 43 | "backs" 44 | "be" 45 | "became" 46 | "because" 47 | "become" 48 | "becomes" 49 | "been" 50 | "before" 51 | "began" 52 | "behind" 53 | "being" 54 | "beings" 55 | "below" 56 | "best" 57 | "better" 58 | "between" 59 | "big" 60 | "both" 61 | "but" 62 | "by" 63 | "c" 64 | "came" 65 | "can" 66 | "cannot" 67 | "can't" 68 | "case" 69 | "cases" 70 | "certain" 71 | "certainly" 72 | "clear" 73 | "clearly" 74 | "come" 75 | "could" 76 | "couldn't" 77 | "d" 78 | "did" 79 | "didn't" 80 | "differ" 81 | "different" 82 | "differently" 83 | "do" 84 | "does" 85 | "doesn't" 86 | "doing" 87 | "done" 88 | "don't" 89 | "down" 90 | "downed" 91 | "downing" 92 | "downs" 93 | "during" 94 | "e" 95 | "each" 96 | "early" 97 | "either" 98 | "end" 99 | "ended" 100 | 
"ending" 101 | "ends" 102 | "enough" 103 | "even" 104 | "evenly" 105 | "ever" 106 | "every" 107 | "everybody" 108 | "everyone" 109 | "everything" 110 | "everywhere" 111 | "f" 112 | "face" 113 | "faces" 114 | "fact" 115 | "facts" 116 | "far" 117 | "felt" 118 | "few" 119 | "find" 120 | "finds" 121 | "first" 122 | "for" 123 | "four" 124 | "from" 125 | "full" 126 | "fully" 127 | "further" 128 | "furthered" 129 | "furthering" 130 | "furthers" 131 | "g" 132 | "gave" 133 | "general" 134 | "generally" 135 | "get" 136 | "gets" 137 | "give" 138 | "given" 139 | "gives" 140 | "go" 141 | "going" 142 | "good" 143 | "goods" 144 | "got" 145 | "great" 146 | "greater" 147 | "greatest" 148 | "group" 149 | "grouped" 150 | "grouping" 151 | "groups" 152 | "h" 153 | "had" 154 | "hadn't" 155 | "has" 156 | "hasn't" 157 | "have" 158 | "haven't" 159 | "having" 160 | "he" 161 | "he'd" 162 | "he'll" 163 | "her" 164 | "here" 165 | "here's" 166 | "hers" 167 | "herself" 168 | "he's" 169 | "high" 170 | "higher" 171 | "highest" 172 | "him" 173 | "himself" 174 | "his" 175 | "how" 176 | "however" 177 | "how's" 178 | "i" 179 | "i'd" 180 | "if" 181 | "i'll" 182 | "i'm" 183 | "important" 184 | "in" 185 | "interest" 186 | "interested" 187 | "interesting" 188 | "interests" 189 | "into" 190 | "is" 191 | "isn't" 192 | "it" 193 | "its" 194 | "it's" 195 | "itself" 196 | "i've" 197 | "j" 198 | "just" 199 | "k" 200 | "keep" 201 | "keeps" 202 | "kind" 203 | "knew" 204 | "know" 205 | "known" 206 | "knows" 207 | "l" 208 | "large" 209 | "largely" 210 | "last" 211 | "later" 212 | "latest" 213 | "least" 214 | "less" 215 | "let" 216 | "lets" 217 | "let's" 218 | "like" 219 | "likely" 220 | "long" 221 | "longer" 222 | "longest" 223 | "m" 224 | "made" 225 | "make" 226 | "making" 227 | "man" 228 | "many" 229 | "may" 230 | "me" 231 | "member" 232 | "members" 233 | "men" 234 | "might" 235 | "more" 236 | "most" 237 | "mostly" 238 | "mr" 239 | "mrs" 240 | "much" 241 | "must" 242 | "mustn't" 243 | "my" 244 | "myself" 245 | "n" 246 | "necessary" 247 | "need" 248 | "needed" 249 | "needing" 250 | "needs" 251 | "never" 252 | "new" 253 | "newer" 254 | "newest" 255 | "next" 256 | "no" 257 | "nobody" 258 | "non" 259 | "noone" 260 | "nor" 261 | "not" 262 | "nothing" 263 | "now" 264 | "nowhere" 265 | "number" 266 | "numbers" 267 | "o" 268 | "of" 269 | "off" 270 | "often" 271 | "old" 272 | "older" 273 | "oldest" 274 | "on" 275 | "once" 276 | "one" 277 | "only" 278 | "open" 279 | "opened" 280 | "opening" 281 | "opens" 282 | "or" 283 | "order" 284 | "ordered" 285 | "ordering" 286 | "orders" 287 | "other" 288 | "others" 289 | "ought" 290 | "our" 291 | "ours" 292 | "ourselves" 293 | "out" 294 | "over" 295 | "own" 296 | "p" 297 | "part" 298 | "parted" 299 | "parting" 300 | "parts" 301 | "per" 302 | "perhaps" 303 | "place" 304 | "places" 305 | "point" 306 | "pointed" 307 | "pointing" 308 | "points" 309 | "possible" 310 | "present" 311 | "presented" 312 | "presenting" 313 | "presents" 314 | "problem" 315 | "problems" 316 | "put" 317 | "puts" 318 | "q" 319 | "quite" 320 | "r" 321 | "rather" 322 | "really" 323 | "right" 324 | "room" 325 | "rooms" 326 | "s" 327 | "said" 328 | "same" 329 | "saw" 330 | "say" 331 | "says" 332 | "second" 333 | "seconds" 334 | "see" 335 | "seem" 336 | "seemed" 337 | "seeming" 338 | "seems" 339 | "sees" 340 | "several" 341 | "shall" 342 | "shan't" 343 | "she" 344 | "she'd" 345 | "she'll" 346 | "she's" 347 | "should" 348 | "shouldn't" 349 | "show" 350 | "showed" 351 | "showing" 352 | "shows" 353 | "side" 354 | "sides" 355 | "since" 356 | 
"small" 357 | "smaller" 358 | "smallest" 359 | "so" 360 | "some" 361 | "somebody" 362 | "someone" 363 | "something" 364 | "somewhere" 365 | "state" 366 | "states" 367 | "still" 368 | "such" 369 | "sure" 370 | "t" 371 | "take" 372 | "taken" 373 | "than" 374 | "that" 375 | "that's" 376 | "the" 377 | "their" 378 | "theirs" 379 | "them" 380 | "themselves" 381 | "then" 382 | "there" 383 | "therefore" 384 | "there's" 385 | "these" 386 | "they" 387 | "they'd" 388 | "they'll" 389 | "they're" 390 | "they've" 391 | "thing" 392 | "things" 393 | "think" 394 | "thinks" 395 | "this" 396 | "those" 397 | "though" 398 | "thought" 399 | "thoughts" 400 | "three" 401 | "through" 402 | "thus" 403 | "to" 404 | "today" 405 | "together" 406 | "too" 407 | "took" 408 | "toward" 409 | "turn" 410 | "turned" 411 | "turning" 412 | "turns" 413 | "two" 414 | "u" 415 | "under" 416 | "until" 417 | "up" 418 | "upon" 419 | "us" 420 | "use" 421 | "used" 422 | "uses" 423 | "v" 424 | "very" 425 | "w" 426 | "want" 427 | "wanted" 428 | "wanting" 429 | "wants" 430 | "was" 431 | "wasn't" 432 | "way" 433 | "ways" 434 | "we" 435 | "we'd" 436 | "well" 437 | "we'll" 438 | "wells" 439 | "went" 440 | "were" 441 | "we're" 442 | "weren't" 443 | "we've" 444 | "what" 445 | "what's" 446 | "when" 447 | "when's" 448 | "where" 449 | "where's" 450 | "whether" 451 | "which" 452 | "while" 453 | "who" 454 | "whole" 455 | "whom" 456 | "who's" 457 | "whose" 458 | "why" 459 | "why's" 460 | "will" 461 | "with" 462 | "within" 463 | "without" 464 | "won't" 465 | "work" 466 | "worked" 467 | "working" 468 | "works" 469 | "would" 470 | "wouldn't" 471 | "x" 472 | "y" 473 | "year" 474 | "years" 475 | "yes" 476 | "yet" 477 | "you" 478 | "you'd" 479 | "you'll" 480 | "young" 481 | "younger" 482 | "youngest" 483 | "your" 484 | "you're" 485 | "yours" 486 | "yourself" 487 | "yourselves" 488 | "you've" 489 | "z" 490 | -------------------------------------------------------------------------------- /MLFH/ch4/tdm_df.py: -------------------------------------------------------------------------------- 1 | def tdm_df(doclist, stopwords = [], remove_punctuation = True, 2 | remove_digits = True, sparse_df = True): 3 | ''' 4 | 5 | Create a term-document matrix from a list of e-mails. 6 | 7 | Uses the TermDocumentMatrix function in the `textmining` module. 8 | But, pre-processes the documents to remove digits and punctuation, 9 | and post-processes to remove stopwords, to match the functionality 10 | of R's `tm` package. 11 | 12 | NB: This is not particularly memory efficient and you can get memory 13 | errors with an especially long list of documents. 14 | 15 | Returns a (by default, sparse) DataFrame. Each column is a term, 16 | each row is a document. 17 | ''' 18 | import numpy as np 19 | import textmining as txtm 20 | import pandas as pd 21 | import string 22 | 23 | # Some (at least to me) unavoidable type-checking. 24 | # If you only pass one document (string) to the doclist parameter, 25 | # the for-loop below will iterate over the letters in the string 26 | # instead of strings in a list. This coerces the doclist parameter 27 | # to be a list, even if it's only one document. 28 | if isinstance(doclist, basestring): 29 | doclist = [doclist] 30 | 31 | # Create the TDM from the list of documents. 
32 | tdm = txtm.TermDocumentMatrix() 33 | 34 | for doc in doclist: 35 | if remove_punctuation == True: 36 | doc = doc.translate(None, string.punctuation.translate(None, '"')) 37 | if remove_digits == True: 38 | doc = doc.translate(None, string.digits) 39 | 40 | tdm.add_doc(doc) 41 | 42 | # Push the TDM data to a list of lists, 43 | # then make that an ndarray, which then 44 | # becomes a DataFrame. 45 | tdm_rows = [] 46 | for row in tdm.rows(cutoff = 1): 47 | tdm_rows.append(row) 48 | 49 | tdm_array = np.array(tdm_rows[1:]) 50 | tdm_terms = tdm_rows[0] 51 | df = pd.DataFrame(tdm_array, columns = tdm_terms) 52 | 53 | # Remove stopwords from the dataset, manually. 54 | # TermDocumentMatrix does not do this for us. 55 | if remove_punctuation: 56 | stopwords = [w.translate(None, string.punctuation.translate(None, '"')) 57 | for w in stopwords] 58 | if len(stopwords) > 0: 59 | for col in df: 60 | if col in stopwords: 61 | del df[col] 62 | 63 | if sparse_df == True: 64 | df.to_sparse(fill_value = 0) 65 | 66 | return df 67 | -------------------------------------------------------------------------------- /MLFH/ch4/tdm_df.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carljv/Will_it_Python/5feb80e4023d73bff12040c5aa5f14177794d459/MLFH/ch4/tdm_df.pyc -------------------------------------------------------------------------------- /MLFH/ch6/tdm_df.py: -------------------------------------------------------------------------------- 1 | def tdm_df(doclist, stopwords = [], remove_punctuation = True, 2 | remove_digits = True, sparse_df = True): 3 | ''' 4 | 5 | Create a term-document matrix from a list of e-mails. 6 | 7 | Uses the TermDocumentMatrix function in the `textmining` module. 8 | But, pre-processes the documents to remove digits and punctuation, 9 | and post-processes to remove stopwords, to match the functionality 10 | of R's `tm` package. 11 | 12 | NB: This is not particularly memory efficient and you can get memory 13 | errors with an especially long list of documents. 14 | 15 | Returns a (by default, sparse) DataFrame. Each column is a term, 16 | each row is a document. 17 | ''' 18 | import numpy as np 19 | import textmining as txtm 20 | import pandas as pd 21 | import string 22 | 23 | # Some (at least to me) unavoidable type-checking. 24 | # If you only pass one document (string) to the doclist parameter, 25 | # the for-loop below will iterate over the letters in the string 26 | # instead of strings in a list. This coerces the doclist parameter 27 | # to be a list, even if it's only one document. 28 | if isinstance(doclist, basestring): 29 | doclist = [doclist] 30 | 31 | # Create the TDM from the list of documents. 32 | tdm = txtm.TermDocumentMatrix() 33 | 34 | for doc in doclist: 35 | if remove_punctuation == True: 36 | doc = doc.translate(None, string.punctuation.translate(None, '"')) 37 | if remove_digits == True: 38 | doc = doc.translate(None, string.digits) 39 | 40 | tdm.add_doc(doc) 41 | 42 | # Push the TDM data to a list of lists, 43 | # then make that an ndarray, which then 44 | # becomes a DataFrame. 45 | tdm_rows = [] 46 | for row in tdm.rows(cutoff = 1): 47 | tdm_rows.append(row) 48 | 49 | tdm_array = np.array(tdm_rows[1:]) 50 | tdm_terms = tdm_rows[0] 51 | df = pd.DataFrame(tdm_array, columns = tdm_terms) 52 | 53 | # Remove stopwords from the dataset, manually. 54 | # TermDocumentMatrix does not do this for us. 
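    # If punctuation was stripped from the documents, strip it from the
    # stopwords too, so that contractions in the stopword list (e.g. "don't")
    # match the corresponding punctuation-free terms in the TDM (e.g. "dont").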
55 | if remove_punctuation: 56 | stopwords = [w.translate(None, string.punctuation.translate(None, '"')) 57 | for w in stopwords] 58 | if len(stopwords) > 0: 59 | for col in df: 60 | if col in stopwords: 61 | del df[col] 62 | 63 | if sparse_df == True: 64 | df.to_sparse(fill_value = 0) 65 | 66 | return df 67 | -------------------------------------------------------------------------------- /MLFH/ch6/tdm_df.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/carljv/Will_it_Python/5feb80e4023d73bff12040c5aa5f14177794d459/MLFH/ch6/tdm_df.pyc -------------------------------------------------------------------------------- /MLFH/ch9/convert_dta_to_csv.r: -------------------------------------------------------------------------------- 1 | # Convert Senate data Stata files to CSV 2 | # For Machine Learning for Hackers Chapter 9 3 | 4 | wkdir = "data/roll_call" 5 | setwd(wkdir) 6 | library(foreign) 7 | 8 | # dta files are in 'data/roll_call' 9 | # csv files will be written to the same directory. 10 | flist = list.files() 11 | 12 | for (f in flist) { 13 | 14 | # Create filename xyz123.csv from xyz123.dta 15 | csv_name = paste(strsplit(f, "\\.")[[1]][1], "csv", sep = ".") 16 | 17 | df = read.dta(f) 18 | write.csv(df, csv_name) 19 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Will it Python? 2 | 3 | - Updated Dec 1, 2012 4 | 5 | ## Introduction 6 | *Will it Python?* programs are ports of data analyses originally done in R into Python. The projects are accompanied by a series of blog posts documenting the porting-process. See http://slendrmeans.wordpress.com/will-it-python for more information. 7 | 8 | ## Projects: 9 | The code is organized into subfolders by translation project. The projects so far are: 10 | 11 | 1. MLFH: _Machine Learning for Hackers_ by Drew Conway and John Myles White. 12 | 13 | ## Python and library versions: 14 | The code is written for Python 2.7. Third party libraries used in projects are: 15 | - Numpy 1.7.0h2 16 | - Scipy 0.11.0 17 | - matplotlib 1.1.2 18 | - pandas 0.9.1 19 | - statsmodels 0.5.0 (dev) 20 | - NLTK 2.0.4 21 | - scikit-learn 0.12.1 22 | 23 | I'm also using IPython 0.13 to create the IPython notebooks. 24 | 25 | These packages will be updated over time. Scripts/notebooks will usually indicate what version of a library they were first coded in. If unspecified, it's usually safe to assume the latest stable version will work. 26 | 27 | ## IPython Notebooks: 28 | The earlier chapters contain both IPython notebooks and python scripts of the code. Since I started the project, the IPython notebooks have gained more widespread use, so I'm typically providing only those. From the notebook, it's not difficult to export to a python script, but some code (especially plotting) may not be designed for scripting. 29 | 30 | 31 | 32 | 33 | --------------------------------------------------------------------------------