├── .gitignore
├── DamerauLevenshteinDistance
│   ├── badwords.txt
│   ├── dameraulevenshtein.py
│   └── example.py
├── DiscreteOptionPricing
│   ├── price_bounds.py
│   └── shout_option.py
├── DiscreteSDE
│   └── discreteSDE.py
├── Estimators
│   └── theil_sen.py
├── KalmanFilter
│   └── simple_kalman.py
├── MachineLearningScikitLearn
│   ├── BayesianBandit.py
│   ├── blender.py
│   ├── ensembleSelector.py
│   ├── maxCorrelationTransformer.py
│   ├── outlier.py
│   ├── pretty_pca.py
│   ├── supervised_pca.py
│   └── weighted_least_squares.py
├── MonteCarlo
│   ├── Copulas
│   │   └── README.txt
│   ├── Integration
│   │   ├── Assignment.pdf
│   │   ├── MonteCarloIntegrator.py
│   │   ├── Q6.py
│   │   └── examples.py
│   ├── MCMC
│   │   ├── copulas.py
│   │   ├── mcmc.py
│   │   └── mcmc_example.py
│   ├── grammschmidt.py
│   ├── sample_normal_given_projection.py
│   ├── sample_psd.py
│   └── sampling_methods.py
├── MultinomialMarkovAndEncoding
│   ├── encoding.py
│   └── multinomialMM.py
├── NumericalDerivatives
│   ├── diff.py
│   └── diff.pyc
├── README.md
├── TimeSeries
│   ├── MASE.py
│   ├── risk_measures.py
│   └── utils.py
├── pyMC
│   ├── LinearRegressionWithLoss.py
│   ├── SmallSample.py
│   ├── TableGame.py
│   ├── blowflies.py
│   └── mixtureNormals.py
└── utils
    ├── contour_irregular_data.py
    ├── cov2corr.py
    ├── dataframe_pairwise_feature_gen.py
    ├── jarquebera_test.py
    ├── kaggleDataSet.py
    ├── linked_list.py
    ├── lyungbox_test.py
    ├── mean_average_precision.py
    ├── memorize.py
    ├── power_set.py
    ├── primes.py
    ├── qq_plot.py
    └── sample.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled source #
2 | ###################
3 | *.com
4 | *.class
5 | *.dll
6 | *.exe
7 | *.o
8 | *.so
9 | *.pyc
10 | 
11 | 
12 | #images#
13 | *.png
14 | 
15 | # Packages #
16 | ############
17 | # it's better to unpack these files and commit the raw source
18 | # git has its own built in compression methods
19 | *.7z
20 | *.dmg
21 | *.gz
22 | *.iso
23 | *.jar
24 | *.rar
25 | *.tar
26 | *.zip
27 | 
28 | # Logs and databases #
29 | ######################
30 | *.log
31 | *.sql
32 | *.sqlite
33 | 
34 | # OS generated files #
35 | ######################
36 | .DS_Store
37 | .DS_Store?
38 | ._*
39 | .Spotlight-V100
40 | .Trashes
41 | Icon?
42 | ehthumbs.db 43 | Thumbs.db 44 | -------------------------------------------------------------------------------- /DamerauLevenshteinDistance/badwords.txt: -------------------------------------------------------------------------------- 1 | ahole 2 | anus 3 | ash0le 4 | ash0les 5 | asholes 6 | ass 7 | ass monkey 8 | assface 9 | assh0le 10 | assh0lez 11 | asshole 12 | assholes 13 | assholz 14 | asswipe 15 | azzhole 16 | bassterds 17 | bastard 18 | bastards 19 | bastardz 20 | basterds 21 | basterdz 22 | biatch 23 | bitch 24 | bitches 25 | blow job 26 | boffing 27 | butthole 28 | buttwipe 29 | c0ck 30 | c0cks 31 | c0k 32 | carpet muncher 33 | cawk 34 | cawks 35 | clit 36 | cnts 37 | cntz 38 | cock 39 | cockhead 40 | cock-head 41 | cocks 42 | cocksucker 43 | cock-sucker 44 | crap 45 | cum 46 | cunt 47 | cunts 48 | cuntz 49 | dick 50 | dild0 51 | dild0s 52 | dildo 53 | dildos 54 | dilld0 55 | dilld0s 56 | dominatricks 57 | dominatrics 58 | dominatrix 59 | dyke 60 | enema 61 | f u c k 62 | f u c k e r 63 | fag 64 | fag1t 65 | faget 66 | fagg1t 67 | faggit 68 | faggot 69 | fagit 70 | fags 71 | fagz 72 | faig 73 | faigs 74 | fart 75 | flipping the bird 76 | fuck 77 | fucker 78 | fuckin 79 | fucking 80 | fucks 81 | fudge packer 82 | fuk 83 | fukah 84 | fuken 85 | fuker 86 | fukin 87 | fukk 88 | fukkah 89 | fukken 90 | fukker 91 | fukkin 92 | g00k 93 | gay 94 | gayboy 95 | gaygirl 96 | gays 97 | gayz 98 | god-damned 99 | h00r 100 | h0ar 101 | h0re 102 | hells 103 | hoar 104 | hoor 105 | hoore 106 | jackoff 107 | jap 108 | japs 109 | jerk-off 110 | jisim 111 | jiss 112 | jizm 113 | jizz 114 | knob 115 | knobs 116 | knobz 117 | kunt 118 | kunts 119 | kuntz 120 | lesbian 121 | lezzian 122 | lipshits 123 | lipshitz 124 | masochist 125 | masokist 126 | massterbait 127 | masstrbait 128 | masstrbate 129 | masterbaiter 130 | masterbate 131 | masterbates 132 | motha fucker 133 | motha fuker 134 | motha fukkah 135 | motha fukker 136 | mother fucker 137 | mother fukah 138 | mother fuker 139 | mother fukkah 140 | mother fukker 141 | mother-fucker 142 | mutha fucker 143 | mutha fukah 144 | mutha fuker 145 | mutha fukkah 146 | mutha fukker 147 | n1gr 148 | nastt 149 | nigger; 150 | nigur; 151 | niiger; 152 | niigr; 153 | orafis 154 | orgasim; 155 | orgasm 156 | orgasum 157 | oriface 158 | orifice 159 | orifiss 160 | packi 161 | packie 162 | packy 163 | paki 164 | pakie 165 | paky 166 | pecker 167 | peeenus 168 | peeenusss 169 | peenus 170 | peinus 171 | pen1s 172 | penas 173 | penis 174 | penis-breath 175 | penus 176 | penuus 177 | phuc 178 | phuck 179 | phuk 180 | phuker 181 | phukker 182 | polac 183 | polack 184 | polak 185 | poonani 186 | pr1c 187 | pr1ck 188 | pr1k 189 | pusse 190 | pussee 191 | pussy 192 | puuke 193 | puuker 194 | queer 195 | queers 196 | queerz 197 | qweers 198 | qweerz 199 | qweir 200 | recktum 201 | rectum 202 | retard 203 | sadist 204 | scank 205 | schlong 206 | screwing 207 | semen 208 | sex 209 | sexy 210 | sh!t 211 | sh1t 212 | sh1ter 213 | sh1ts 214 | sh1tter 215 | sh1tz 216 | shit 217 | shits 218 | shitter 219 | shitty 220 | shity 221 | shitz 222 | shyt 223 | shyte 224 | shytty 225 | shyty 226 | skanck 227 | skank 228 | skankee 229 | skankey 230 | skanks 231 | skanky 232 | slut 233 | sluts 234 | slutty 235 | slutz 236 | son-of-a-bitch 237 | tit 238 | turd 239 | va1jina 240 | vag1na 241 | vagiina 242 | vagina 243 | vaj1na 244 | vajina 245 | vullva 246 | vulva 247 | w0p 248 | wh00r 249 | wh0re 250 | whore 251 | xrated 252 | xxx 253 | b!+ch 254 | bitch 255 | blowjob 256 | clit 257 
| arschloch 258 | fuck 259 | shit 260 | ass 261 | asshole 262 | b!tch 263 | b17ch 264 | b1tch 265 | bastard 266 | bi+ch 267 | boiolas 268 | buceta 269 | c0ck 270 | cawk 271 | chink 272 | cipa 273 | clits 274 | cock 275 | cum 276 | cunt 277 | dildo 278 | dirsa 279 | ejakulate 280 | fatass 281 | fcuk 282 | fuk 283 | fux0r 284 | hoer 285 | hore 286 | jism 287 | kawk 288 | l3itch 289 | l3i+ch 290 | lesbian 291 | masturbate 292 | masterbat 293 | masterbat3 294 | motherfucker 295 | s.o.b. 296 | mofo 297 | nazi 298 | nigga 299 | nigger 300 | nutsack 301 | phuck 302 | pimpis 303 | pusse 304 | pussy 305 | scrotum 306 | sh!t 307 | shemale 308 | shi+ 309 | sh!+ 310 | slut 311 | smut 312 | teets 313 | tits 314 | boobs 315 | b00bs 316 | teez 317 | testical 318 | testicle 319 | titt 320 | bitching 321 | idiot 322 | w00se 323 | jackoff 324 | wank 325 | whoar 326 | whore 327 | damn 328 | dyke 329 | fuck 330 | shit 331 | @$$ 332 | amcik 333 | andskota 334 | arse 335 | assrammer 336 | ayir 337 | bi7ch 338 | bitch 339 | bollock 340 | breasts 341 | butt-pirate 342 | cabron 343 | cazzo 344 | chraa 345 | chuj 346 | cock 347 | cunt 348 | d4mn 349 | daygo 350 | dego 351 | dick 352 | dike 353 | dupa 354 | dziwka 355 | ejackulate 356 | ekrem 357 | ekto 358 | enculer 359 | faen 360 | fag 361 | fanculo 362 | fanny 363 | feces 364 | feg 365 | felcher 366 | ficken 367 | fitt 368 | flikker 369 | foreskin 370 | fotze 371 | fu( 372 | fuk 373 | futkretzn 374 | gay 375 | gook 376 | guiena 377 | h0r 378 | h4x0r 379 | hell 380 | helvete 381 | hoer 382 | honkey 383 | huevon 384 | hui 385 | injun 386 | jizz 387 | kanker 388 | kike 389 | klootzak 390 | kraut 391 | knulle 392 | kuk 393 | kuksuger 394 | kurac 395 | kurwa 396 | kusi 397 | kyrpa 398 | lesbo 399 | mamhoon 400 | masturbat 401 | merd 402 | mibun 403 | monkleigh 404 | mouliewop 405 | muie 406 | mulkku 407 | muschi 408 | nazis 409 | nepesaurio 410 | nigger 411 | orospu 412 | paska 413 | perse 414 | picka 415 | pierdol 416 | pillu 417 | pimmel 418 | piss 419 | pizda 420 | poontsee 421 | poop 422 | porn 423 | p0rn 424 | pr0n 425 | preteen 426 | pula 427 | pule 428 | puta 429 | puto 430 | qahbeh 431 | queef 432 | rautenberg 433 | schaffer 434 | scheiss 435 | schlampe 436 | schmuck 437 | screw 438 | sh!t 439 | sharmuta 440 | sharmute 441 | shipal 442 | shiz 443 | skribz 444 | skurwysyn 445 | sphencter 446 | spic 447 | spierdalaj 448 | splooge 449 | suka 450 | b00b 451 | testicle 452 | titt 453 | twat 454 | vittu 455 | wank 456 | wetback 457 | wichser 458 | wop 459 | yed 460 | zabourah -------------------------------------------------------------------------------- /DamerauLevenshteinDistance/dameraulevenshtein.py: -------------------------------------------------------------------------------- 1 | # 2 | # Author: Michael Homer 3 | # Date: Sunday, April 26th, 2009 4 | # License: MIT 5 | # 6 | 7 | def dameraulevenshtein(seq1, seq2): 8 | """Calculate the Damerau-Levenshtein distance between sequences. 9 | 10 | This distance is the number of additions, deletions, substitutions, 11 | and transpositions needed to transform the first sequence into the 12 | second. Although generally used with strings, any sequences of 13 | comparable objects will work. 14 | 15 | Transpositions are exchanges of *consecutive* characters; all other 16 | operations are self-explanatory. 17 | 18 | This implementation is O(N*M) time and O(M) space, for N and M the 19 | lengths of the two sequences. 
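    (Added example, checked against this implementation rather than taken from the
    original docstring: a single swap of two adjacent characters costs 1, whereas
    plain Levenshtein distance would charge 2 for the same pair.)

    >>> dameraulevenshtein('ab', 'ba')
    1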
20 | 21 | >>> dameraulevenshtein('ba', 'abc') 22 | 2 23 | >>> dameraulevenshtein('fee', 'deed') 24 | 2 25 | 26 | It works with arbitrary sequences too: 27 | >>> dameraulevenshtein('abcd', ['b', 'a', 'c', 'd', 'e']) 28 | 2 29 | """ 30 | # codesnippet:D0DE4716-B6E6-4161-9219-2903BF8F547F 31 | # Conceptually, this is based on a len(seq1) + 1 * len(seq2) + 1 matrix. 32 | # However, only the current and two previous rows are needed at once, 33 | # so we only store those. 34 | oneago = None 35 | thisrow = range(1, len(seq2) + 1) + [0] 36 | for x in xrange(len(seq1)): 37 | # Python lists wrap around for negative indices, so put the 38 | # leftmost column at the *end* of the list. This matches with 39 | # the zero-indexed strings and saves extra calculation. 40 | twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1] 41 | for y in xrange(len(seq2)): 42 | delcost = oneago[y] + 1 43 | addcost = thisrow[y - 1] + 1 44 | subcost = oneago[y - 1] + (seq1[x] != seq2[y]) 45 | thisrow[y] = min(delcost, addcost, subcost) 46 | # This block deals with transpositions 47 | if (x > 0 and y > 0 and seq1[x] == seq2[y - 1] 48 | and seq1[x-1] == seq2[y] and seq1[x] != seq2[y]): 49 | thisrow[y] = min(thisrow[y], twoago[y - 2] + 1) 50 | return thisrow[len(seq2) - 1] -------------------------------------------------------------------------------- /DamerauLevenshteinDistance/example.py: -------------------------------------------------------------------------------- 1 | # example usage using badwords.txt (not for the easily offended, but seriously, you're from the internet sooo...) 2 | 3 | 4 | from dameraulevenshtein import dameraulevenshtein as dl_distance 5 | import string 6 | 7 | #open the badwords.txt 8 | file = open("badwords.txt", "r") 9 | swear_list = map( string.strip, file.readlines() ) #strips that annoying \n 10 | 11 | def isswear( word, max_distance = 1): 12 | """ 13 | checks if word is a swear word, or a missing spelling of swear word. 14 | """ 15 | word = word.lower() 16 | dl = lambda x: dl_distance(x, word) <= max_distance 17 | return any( map(dl, swear_list) ) 18 | 19 | 20 | 21 | 22 | if __name__=="__main__": 23 | words_to_test = ["boo", "cameron", "pissy", "ashole", "azzhole", "btiching"] 24 | 25 | print "max distance = 1" 26 | for w in words_to_test: 27 | print w, isswear(w) 28 | print 29 | print "max distance = 2" 30 | for w in words_to_test: 31 | print w, isswear(w,2) 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /DiscreteOptionPricing/price_bounds.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a simple, recursive, implementation of pricing 3 | a option with uncertain volatility (known sigma_max, sigma_min) in a 4 | recombining trinomial tree model. 5 | 6 | It is surprisingly fast, thanks to cacheing the calls. 7 | 8 | Example of use below. 9 | 10 | """ 11 | 12 | import numpy as np 13 | 14 | class memorize( object ): 15 | 16 | def __init__(self, func): 17 | self.func = func 18 | self.cache = {} 19 | 20 | def __call__(self, *args): 21 | try: 22 | return self.cache[args] 23 | except: 24 | self.cache[args] = self.func(*args) 25 | return self.cache[args] 26 | 27 | def __repr__(self): 28 | return self.func.__doc__ 29 | 30 | def Snj( S_0, n ,j, sigma_max, r, t_delta): 31 | return S_0*np.exp( j*sigma_max*np.sqrt(delta_t) + n*r*delta_t ) 32 | 33 | @memorize 34 | def price( style, F, sigma_max, sigma_min, delta_t, r, S_0, n, j, N): 35 | """ 36 | This is the main function. 
37 | style: either "min" or "max", get the min or maximum price respectively. 38 | F: the final payoff function 39 | sigma_max, sigma_min: the max and min volatility 40 | delta_t: the length of time step 41 | r: the risk-free rate 42 | S_0: the initial price of the underlying 43 | n: the time step 44 | j: position in tree 45 | N: the number of time steps. I'd keep this not too large, else you stack overflow lol. 46 | """ 47 | if n == N: 48 | return F( Snj(S_0, n, j, sigma_max, r, delta_t ) ) 49 | 50 | t = sigma_max*np.sqrt(delta_t)/2 51 | l = (1-t)*price(style,F, sigma_max, sigma_min, delta_t, r, S_0, n+1, j+1, N) + \ 52 | (1+t)*price(style,F, sigma_max, sigma_min, delta_t, r, S_0, n+1, j-1, N) - \ 53 | 2*price(style,F, sigma_max, sigma_min, delta_t, r, S_0, n+1, j, N) 54 | 55 | c = 0.5 if (1-2*(style=="min"))*l >= 0 else sigma_min**2/(2*sigma_max**2) 56 | 57 | return np.exp( -r*delta_t)*( price(style, F, sigma_max, sigma_min, delta_t, r, S_0, n+1, j, N) + c*l ) 58 | 59 | 60 | 61 | if __name__=="__main__": 62 | 63 | def F(x): 64 | # a collared option. 65 | return max(0, x - 100) - max( 0, x - 120) 66 | 67 | sigma_max = 0.4 68 | sigma_min = 0.1 69 | r= 0.1 70 | S0 = 100. 71 | N = 100. 72 | delta_t = 1.0/N 73 | 74 | print price("min", F, sigma_max, sigma_min, delta_t, r, S0, 0,0, N ) 75 | print price("max", F, sigma_max, sigma_min, delta_t, r, S0, 0,0, N ) 76 | 77 | """ 78 | 4.54345306389 79 | 12.358008422 80 | """ 81 | -------------------------------------------------------------------------------- /DiscreteOptionPricing/shout_option.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2012-01-17 3 | 4 | @author: Cameron Davidson-Pilon 5 | 6 | 7 | ''' 8 | import math 9 | 10 | def binomial_shoutout(r, u, q, n, x_0, k): 11 | """ 12 | r: risk-free rate 13 | u: the return of an up jump 14 | q: the Risk Neutral probability of an 'up' jump 15 | n: the number of periods 16 | x_0: the start price 17 | k: the strike price 18 | 19 | Explanation of model: 20 | 21 | The central idea of my model is based on the recursive formula for a binomial option pricing: 22 | 23 | V(t,x) = D*( q*V(t+1, u*x) + (1-q)*V(t+1, x/u) ) (1) 24 | 25 | The value of the option at the next time step, V_{t+1}, is dependent on whether the investor chooses to shout or not 26 | to shout at the current node. The investor tries to maximize his/her profit, thus, I modified the formula above to: 27 | 28 | V_t = D*( max ( shout now, shout later) ) (2) 29 | 30 | The task is now the calculate the two quantities, 'shout now' and 'shout later'. The value if the investor 31 | shouts now is given by (1) with the new payoff max( K-S_T, 0, K-S_t*), and we can calculate the 'shout now' value. 32 | The 'shout later' requires us to look at the the next value of the nodes in the tree, and calculate the value of these 33 | nodes given we have NOT shouted. 34 | 35 | We can formulize this as: 36 | 37 | V(t,x) = D*( max( q*V(t+1, u*x | shout position = x) + (1-q)*V(t+1, x/u | shout position = x) , (3) 38 | q*V(t+1, u*x | haven't shouted ) + (1-q)*V(t+1, x/u | haven't shouted ) ) ) 39 | 40 | So when should one shout? Heuristically, one should shout when the expected value of shouting now is greater 41 | then the expected value of waiting to shout later, i.e. when 42 | 43 | E[ V(t+1 | shout position = x ] > E[ V(t+1 | haven't shouted yet) ] (4) 44 | 45 | Both expectations are under the risk-neutral measure. 
I tried this heuristic and found that it is optimal 46 | to immediately shout if the option is in the money. This makes sense, as if the stock drops, you gain the large 47 | K-S_T payoff, but if the stock rises you are protected and still receive, albeit small, K-S_t* > 0. Obviously, 48 | if the stock is not in the money it is pointless to shout. 49 | 50 | 51 | """ 52 | 53 | R = float(r); U = float(u); Q = float(q); X_0 = float(x_0); K = float(k); N = float(n) 54 | D = math.exp(-R/N) 55 | dictionary={} 56 | shout_times = [] 57 | 58 | def payoff(K,x,m): 59 | #This is a put-style payoff 60 | return max(K-x, K-m, 0) 61 | 62 | 63 | def value(n, x, m): 64 | """ find the value of a shout put""" 65 | try: 66 | return dictionary["%s,%s,%s"%(n,x,m)] 67 | except: 68 | if n==N: 69 | return payoff(K,x,m) 70 | else: 71 | shout_now = Q*value(n+1, U*x, m) + (1-Q)*value(n+1,x/U, m) 72 | if m==x: 73 | shout_later = Q*value(n+1, U*x, U*x) + (1-Q)*value(n+1, x/U, x/U) 74 | else: 75 | shout_later = 0 76 | 77 | if shout_now>shout_later: 78 | #This is the condition when to shout. If true, add it shout_times 79 | if f(m,n) not in shout_times: 80 | shout_times.append( f(m,n) ) 81 | 82 | y = D*max( shout_now, shout_later ) 83 | dictionary[ "%s,%s,%s"%(n,x,m) ] = y 84 | 85 | return y 86 | 87 | def f(x,n): 88 | """ This is to find the number of up jumps given a price x and time period n.""" 89 | return (n,int(0.5*(math.log(x/X_0,U)+n))) 90 | 91 | def delta(n,k): 92 | """This function computes the delta at each node (n,k), where n is the number 93 | of up jumps and n is the time period""" 94 | up = X_0*u**(2*k-n+1) 95 | down = X_0*u**(2*k-n-1) 96 | return ( value(n,up,up)-value(n,down,down) )/(up-down) 97 | 98 | print value(0,X_0,X_0) 99 | for s in shout_times: 100 | print s 101 | 102 | """ 103 | Example: 104 | 105 | Assume the following: 106 | (i) S(0) = K = 1, and the volatility of the underlying security is sigma = 40%. 107 | (ii) The continuously compounded interest rate is constant and equal to r = 1%. 108 | (iii) The maturity of the contract is 12 months, and the owner can "shout" only at the end of each 109 | month. 110 | (iv) The underlying security pays no dividends. 111 | Using a binomial tree model with 12 time periods, find the value of this option at time zero. Identify 112 | all nodes at which it is optimal for the owner to "shout" and find the replicating portfolio at time 0. 113 | 114 | From this, we need the size of an "up" stock movement, and the risk-neutral probability of an "up" movement. 
115 | r = 0.01 116 | sigma = .40 117 | periods = 12 118 | S_0 = K = 1 119 | 120 | 121 | u = exp(sigma/sqrt(periods) ) 122 | q = (exp(r/periods) - 1/u)/(u - 1/u) 123 | 124 | binomial_shoutout(r, u, q, periods, S_0, K) 125 | 126 | 127 | -------------------------------------------------------------------------------- /DiscreteSDE/discreteSDE.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simulate 1-d stochastic differential equations numerically using different schemes 3 | 4 | 5 | Example: 6 | 7 | kappa = 0.3 8 | b = 0.07 9 | sigma = 0.06 10 | gamma = 0.5 11 | delta =0.004 12 | N = 1e6 13 | 14 | def drift(x): 15 | return kappa*( b - x) 16 | 17 | def diffusion(x): 18 | return sigma*x**(gamma) 19 | 20 | #this is a CIR process 21 | 22 | sdeEuler = DiscreteSDE( drift, diffusion, "euler", startPosition = b, delta=delta ) 23 | sde.sample( 10, N ) 24 | 25 | 26 | """ 27 | 28 | import scipy.stats as stats 29 | import numpy as np 30 | from time import clock 31 | 32 | class DiscreteSDE( object ): 33 | """ 34 | initialize: 35 | drift: the drift function, univariate, must accept and return array of same size. 36 | diffusion the diffusion function, univariate, must accept and return array of same size. 37 | method: a string in ["euler", "milstein", "second-order" ] 38 | delta: the time step 39 | startTime: the starting time of the process 40 | startPosition: the starting position of the process 41 | 42 | methods: 43 | sample( t, n): sample the sde n times until time t. Returns a 2d numpy array with time along the columns 44 | """ 45 | 46 | 47 | def __init__(self, drift, diffusion, method, delta = 0.001, startTime = 0, startPosition =0 ): 48 | self.drift = drift 49 | self.diffusion = diffusion 50 | if method.lower() not in ["euler", "milstein", "second_order" ]: 51 | raise 52 | else: 53 | self.method = method 54 | self.delta = delta 55 | self.startTime = startTime 56 | self.startPosition = startPosition 57 | 58 | def sample(self,t=1, n=1): 59 | return getattr( self, self.method )(t, n) 60 | 61 | 62 | def euler(self, t, n): 63 | #initalize 64 | P,N = self._init(t,n) 65 | 66 | for i in xrange(1,int(N)): 67 | x = P[:, i-1] 68 | P[:,i] = x + self.drift(x)*self.delta + self.diffusion(x)*np.sqrt(self.delta)*np.random.randn( n ) 69 | 70 | return P 71 | 72 | 73 | def milstein(self,t,n, h = 0.001): 74 | 75 | def diff_prime( u ): 76 | return (self.diffusion( u + h/2 ) - self.diffusion( u - h/2))/h 77 | 78 | 79 | P, N = self._init(t,n) 80 | for i in xrange(1,int(N)): 81 | x = P[:, i-1] 82 | R = np.random.randn( n ) 83 | P[:,i] = x + self.drift(x)*self.delta + self.diffusion(x)*np.sqrt(self.delta)*R + \ 84 | 0.5*diff_prime( x)*self.diffusion(x)*( self.delta*R**2 - self.delta ) 85 | 86 | return P 87 | 88 | def second_order( self, t, n ): 89 | P, N = self._init(t,n) 90 | 91 | 92 | cov = np.array( [[self.delta, 0.5*self.delta**2],[ 0.5*self.delta**2, self.delta**3/3 ]] ) 93 | mu = np.array( [0,0] ) 94 | for i in xrange(1,int(N)): 95 | x = P[:, i-1] 96 | RI = np.random.multivariate_normal( mu, cov, n ) 97 | R = RI[:,0] 98 | I = RI[:,1] 99 | 100 | P[:,i] = x + self.drift(x)*self.delta + self.diffusion(x)*np.sqrt(self.delta)*R + \ 101 | (first_derivative( self.drift, x)*self.drift(x) - 0.5*self.diffusion(x)**2*second_derivative( self.drift, x) )*0.5*self.delta**2 + \ 102 | (first_derivative( self.diffusion, x)*self.drift(x) - 0.5*self.diffusion(x)**2*second_derivative( self.diffusion, x) )*(self.delta*R - I) + \ 103 | ( self.diffusion(x)*first_derivative(self.drift,x) )*I + \ 
104 | ( self.diffusion(x)*first_derivative(self.diffusion, x) )*(R**2 - self.delta) 105 | return P 106 | 107 | 108 | 109 | def _init(self,t, n ): 110 | if t < self.startTime: 111 | raise 112 | N = np.floor( t / self.delta ) 113 | M = np.zeros( (n, N) ) 114 | M[:,0] = self.startPosition 115 | return M,N 116 | 117 | 118 | def first_derivative( f, x, h = 0.001): 119 | return ( f(x + h) - f(x-h) )/(2*h) 120 | 121 | def second_derivative( f, x, h = 0.001): 122 | return (f(x + h) - 2*f(x) + f(x-h) )/(h**2) 123 | 124 | 125 | 126 | 127 | 128 | if __name__=="__main__": 129 | 130 | kappa = 0.3 131 | b = 0.07 132 | sigma = 0.06 133 | gamma = 0.5 134 | print "Parameters:" 135 | print "kappa: %.2f, b: %0.2f, sigma;: %0.2f, gamma: %0.2f"%( kappa, b, sigma, gamma ) 136 | 137 | def drift(x): 138 | return kappa*( b - x) 139 | 140 | def diffusion(x): 141 | return sigma*x**(gamma) 142 | 143 | delta =0.004 144 | sdeEuler = DiscreteSDE( drift, diffusion, "euler", startPosition = b, delta=delta ) 145 | sdeMilstein = DiscreteSDE( drift, diffusion, "milstein", startPosition = b, delta = delta) 146 | sdeSecondOrder = DiscreteSDE( drift, diffusion, "second_order", startPosition = b, delta = delta) 147 | 148 | 149 | N = 5000 150 | print "delta = 0.004" 151 | 152 | start = clock() 153 | eulerAt3 = sdeEuler.sample( 3, N )[:, -1] 154 | eulerAt3.sort() 155 | print "Euler: q = %.3f s.t. P( X_3 > q ) <= 0.1. Time: %.3f"%(eulerAt3[ np.floor(0.9*N) ], clock() -start) 156 | 157 | start = clock() 158 | eulerAt3 = sdeMilstein.sample( 3, N )[:, -1] 159 | eulerAt3.sort() 160 | print "Milstein: q = %.3f s.t. P( X_3 > q ) <= 0.1.Time: %.3f"%(eulerAt3[ np.floor(0.9*N) ], clock() -start) 161 | 162 | start = clock() 163 | eulerAt3 = sdeSecondOrder.sample( 3, N )[:, -1] 164 | eulerAt3.sort() 165 | print "SecondOrder: q = %.3f s.t. P( X_3 > q ) <= 0.1. Time: %.3f"%(eulerAt3[ np.floor(0.9*N) ], clock() -start) 166 | 167 | 168 | print 169 | delta = 0.1 170 | print "delta = 0.1" 171 | start = clock() 172 | sdeEuler.delta = sdeMilstein.delta = sdeSecondOrder.delta = delta 173 | 174 | start = clock() 175 | eulerAt3 = sdeEuler.sample( 3, N )[:, -1] 176 | eulerAt3.sort() 177 | print "Euler: q = %.3f s.t. P( X_3 > q ) <= 0.1. Time: %.3f"%(eulerAt3[ np.floor(0.9*N) ], clock() -start) 178 | 179 | start = clock() 180 | eulerAt3 = sdeMilstein.sample( 3, N )[:, -1] 181 | eulerAt3.sort() 182 | print "Milstein: q = %.3f s.t. P( X_3 > q ) <= 0.1. Time: %.3f"%(eulerAt3[ np.floor(0.9*N) ], clock() -start) 183 | 184 | eulerAt3 = sdeSecondOrder.sample( 3, N )[:, -1] 185 | eulerAt3.sort() 186 | print "Second Order: q = %.3f s.t. P( X_3 > q ) <= 0.1. Time: %.3f"%(eulerAt3[ np.floor(0.9*N) ], clock() -start) 187 | 188 | 189 | """ 190 | A bond price is given by: 191 | P(0,T) = E[ exp( -\int_0^T r_t dt ) ] 192 | Is the question asking use to compute this integral, which includes the integration? Sure I'll do it. 193 | 194 | P(0,T) ~= 1/N * exp( -delta*( \sum r_i ) ) 195 | 196 | 197 | """ 198 | 199 | def bond_price( r_t, delta): 200 | return np.exp( -delta*(r_t.sum()) ) 201 | 202 | 203 | 204 | def print_partB( discreteSDE, end_time, delta, name ): 205 | start = clock() 206 | discreteSDE.delta = delta 207 | value = stats.nanmean(np.apply_along_axis( lambda u: bond_price(u, delta), 1, discreteSDE.sample( end_time, N )) ) 208 | print "%s: estimates %.4f on %d year bond. 
Delta: %.4f, Time: %.2f"%(name, value, end_time, delta, clock() - start) 209 | return 210 | 211 | print_partB( sdeEuler, 3, 0.004, "Euler" ) 212 | print_partB( sdeEuler, 3, 0.1, "Euler" ) 213 | print_partB( sdeEuler, 10, 0.004, "Euler" ) 214 | print_partB( sdeEuler, 10, 0.1, "Euler" ) 215 | print 216 | 217 | print_partB( sdeMilstein, 3, 0.004, "Milstein" ) 218 | print_partB( sdeMilstein, 3, 0.4, "Milstein" ) 219 | print_partB( sdeMilstein, 10, 0.004, "Milstein" ) 220 | print_partB( sdeMilstein, 10, 0.1, "Milstein" ) 221 | print 222 | 223 | print_partB( sdeSecondOrder, 3, 0.004, "Second-Order" ) 224 | print_partB( sdeSecondOrder, 3, 0.1, "Second-Order" ) 225 | print_partB( sdeSecondOrder, 10, 0.004, "Second-Order" ) 226 | print_partB( sdeSecondOrder, 10, 0.1, "Second-Order" ) 227 | 228 | 229 | 230 | 231 | -------------------------------------------------------------------------------- /Estimators/theil_sen.py: -------------------------------------------------------------------------------- 1 | """ 2 | This implements the Theil-Sen linear regression estimator for 2d data points. 3 | The jist of it is: 4 | It returns the median all computed slope value between pairs (x_i, y_i), (x_j, y_j), (x_i > x_j) 5 | where slope = (y_i - y_j)/(x_i - x_j) 6 | 7 | 8 | Very robust to outliers. 9 | 10 | """ 11 | import numpy as np 12 | import bottleneck #very fast searching and sorting written in Cython. 13 | import itertools 14 | 15 | def theil_sen(x,y, sample= "auto", n_samples = 1e7): 16 | """ 17 | Computes the Theil-Sen estimator for 2d data. 18 | parameters: 19 | x: 1-d np array, the control variate 20 | y: 1-d np.array, the ind variate. 21 | sample: if n>100, the performance can be worse, so we sample n_samples. 22 | Set to False to not sample. 23 | n_samples: how many points to sample. 24 | 25 | This complexity is O(n**2), which can be poor for large n. We will perform a sampling 26 | of data points to get an unbiased, but larger variance estimator. 27 | The sampling will be done by picking two points at random, and computing the slope, 28 | up to n_samples times. 29 | 30 | """ 31 | assert x.shape[0] == y.shape[0], "x and y must be the same shape." 
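    # (Added comments, not in the original source.) Below, if n is small every one of
    # the n*(n-1)/2 pairs is enumerated and the median pairwise slope is kept; for
    # larger n, n_samples random pairs are drawn instead, which, as the module
    # docstring notes, trades some extra variance for a large speed-up.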
32 | n = x.shape[0] 33 | 34 | if n < 100 or not sample: 35 | ix = np.argsort( x ) 36 | slopes = np.empty( n*(n-1)*0.5 ) 37 | for c, pair in enumerate(itertools.combinations( range(n),2 ) ): #it creates range(n) =( 38 | i,j = ix[pair[0]], ix[pair[1]] 39 | slopes[c] = slope( x[i], x[j], y[i],y[j] ) 40 | else: 41 | i1 = np.random.randint(0, n, n_samples) 42 | i2 = np.random.randint(0, n, n_samples) 43 | slopes = slope( x[i1], x[i2], y[i1], y[i2] ) 44 | #pdb.set_trace() 45 | 46 | slope_ = bottleneck.nanmedian( slopes ) 47 | #find the optimal b as the median of y_i - slope*x_i 48 | intercepts = np.empty( n ) 49 | for c in xrange(n): 50 | intercepts[c] = y[c] - slope_*x[c] 51 | intercept_ = bottleneck.median( intercepts ) 52 | 53 | return np.array( [slope_, intercept_] ) 54 | 55 | 56 | 57 | def slope( x_1, x_2, y_1, y_2): 58 | return (1 - 2*(x_1>x_2) )*( (y_2 - y_1)/np.abs((x_2-x_1)) ) 59 | 60 | 61 | 62 | 63 | if __name__=="__main__": 64 | x = np.asarray( [ 0.0000, 0.2987, 0.4648, 0.5762, 0.8386 ] ) 65 | y = np.asarray( [ 56751, 57037, 56979, 57074, 57422 ] ) 66 | print theil_sen( x, y ) 67 | -------------------------------------------------------------------------------- /KalmanFilter/simple_kalman.py: -------------------------------------------------------------------------------- 1 | #kalman filter, simple example from http://en.wikipedia.org/wiki/Kalman_filter 2 | 3 | import numpy as np 4 | from numpy.linalg import inv 5 | from numpy import dot 6 | from matplotlib import pyplot as plt 7 | 8 | 9 | def predict(x, F, B, u, P, Q ): 10 | assert x.shape[1] == 1 11 | assert F.shape[0] == x.shape[0] 12 | assert u.shape == x.shape 13 | assert B.shape[0] == u.shape[0] 14 | assert F.shape[1] == P.shape[0] 15 | assert Q.shape == P.shape 16 | 17 | x_p = dot(F, x) + dot(B,u) 18 | P_p = dot(F,P).dot(F.T) + Q 19 | 20 | assert x.shape == x_p.shape 21 | assert P_p.shape == P.shape 22 | 23 | return x_p, P_p 24 | 25 | def update(z, H, x_p, P_p, R ): 26 | assert H.shape[1] == x_p.shape[0] 27 | assert H.shape[1] == P_p.shape[0] 28 | assert R.shape[0] == H.shape[0] 29 | assert z.shape[1] == 1 30 | assert z.shape[0] == H.shape[0] 31 | 32 | y = z - dot(H,x_p) 33 | S = dot(H,P_p).dot(H.T) + R 34 | K = dot(P_p, H.T).dot(inv(S)) 35 | x_u = x_p + dot(K,y) 36 | P_u = (np.eye(K.shape[0]) - dot(K,H)).dot(P_p) 37 | 38 | return x_u, P_u 39 | 40 | 41 | def run(acc_variance=1., obs_variance=1., delta_t = 0.5): 42 | steps = 100 43 | X_guesses = np.zeros((2,steps)) 44 | X_actual = np.zeros((2,steps)) 45 | 46 | F = np.array([[1, delta_t],[0,1]]) 47 | G = np.array([[delta_t**2/2, delta_t]]) 48 | B = np.zeros((2,2)) 49 | u = np.zeros((2,1)) 50 | Q = np.array([ [delta_t**4/4, delta_t**3/2], [delta_t**3/2, delta_t**2]])*acc_variance 51 | H = np.array([[1,0]]) 52 | R = np.array( [[obs_variance]]) 53 | 54 | #initial values 55 | x = x_g = np.zeros((2,1)) 56 | P_g = np.zeros((2,2)) 57 | 58 | for i in range(steps): 59 | x = dot(F,x) + np.random.multivariate_normal( [0,0], Q ).reshape( 2, 1) 60 | 61 | x_p, P_p = predict(x_g, F, B, u, P_g, Q ) 62 | z = dot(H,x) + np.random.normal(0,obs_variance) 63 | 64 | x_g, P_g = update(z, H, x_p, P_p, R ) 65 | 66 | #print x_g.shape, P_g.shape 67 | X_guesses[:,i] = x_g[:,0] 68 | X_actual[:,i] = x[:,0] 69 | 70 | 71 | return X_guesses[0,:], X_actual[0,:] 72 | 73 | 74 | delta_t = 0.5 75 | actual, guesses = run(2.,10., delta_t) 76 | 77 | plt.plot(actual, label='actual') 78 | plt.plot(guesses, label='guesses') 79 | 80 | plt.legend() 81 | plt.show() 82 | 83 | 84 | 85 | 86 | 87 | 
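# --- Added sketch (not part of the original script): a minimal 1-d sanity check that
# reuses the predict/update helpers defined above. With F = H = [[1]], no control
# input and almost no process noise, the filter is just estimating a constant from
# noisy measurements, so the posterior variance P should shrink and the state
# estimate should settle near the true value.
def constant_state_demo(true_value=5.0, obs_variance=4.0, steps=50):
    F = np.array([[1.]])
    B = np.zeros((1, 1))
    u = np.zeros((1, 1))
    Q = np.array([[1e-6]])                 # essentially no process noise
    H = np.array([[1.]])
    R = np.array([[obs_variance]])
    x_g = np.zeros((1, 1))                 # vague initial guess
    P_g = np.array([[100.]])
    for _ in range(steps):
        x_p, P_p = predict(x_g, F, B, u, P_g, Q)
        z = np.array([[true_value + np.random.normal(0, np.sqrt(obs_variance))]])
        x_g, P_g = update(z, H, x_p, P_p, R)
    return x_g[0, 0], P_g[0, 0]

# constant_state_demo()   # returns (estimate near 5.0, small posterior variance)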
-------------------------------------------------------------------------------- /MachineLearningScikitLearn/BayesianBandit.py: -------------------------------------------------------------------------------- 1 | ##Bayesian Bandit in python 2 | 3 | import scipy.stats as stats 4 | import numpy as np 5 | 6 | 7 | 8 | class BayesianBandit( object ): 9 | 10 | 11 | def __init__( self, prior_alpha = 1, prior_beta = 1 ): 12 | self.prior_alpha = 1 13 | self.prior_beta = 1 14 | self.betad = stats.beta 15 | 16 | 17 | 18 | def fit(self, bandits, trials = 10 ): 19 | """ 20 | Bandits is an object that can be called like bandits.pull(choice) and returns a 0 or 1. 21 | 22 | 23 | """ 24 | n_bandits = len( bandits ) 25 | self.n_pulls = np.zeros( n_bandits ) 26 | self.n_successes = np.zeros( n_bandits ) 27 | self.prior_distibutions = np.array( [self.prior_alpha, self.prior_beta])*np.ones( (n_bandits, 2 ) ) 28 | 29 | for i in xrange(trials): 30 | 31 | choice = np.argmax( self.betad.rvs( self.prior_distibutions[:,0] + self.n_successes, 32 | self.prior_distibutions[:,1] + self.n_pulls - self.n_successes ) ) 33 | outcome = bandits.pull(choice) 34 | self.n_pulls[choice] += 1 35 | self.n_successes[choice] += outcome 36 | 37 | self.posterior_alpha = self.prior_distibutions[:,0] + self.n_successes 38 | self.posterior_beta = self.prior_distibutions[:,1] + self.n_pulls - self.n_successes 39 | return 40 | 41 | def predict(self, n=1): 42 | choices = np.zeros( n ) 43 | for i in range(n): 44 | 45 | 46 | choice = np.argmax( self.betad.rvs( self.prior_distibutions[:,0] + self.n_successes, 47 | self.prior_distibutions[:,1] + self.n_pulls - self.n_successes ) ) 48 | choices[i] = choice 49 | 50 | return choices 51 | 52 | 53 | class Bandits(object): 54 | 55 | def __init__(self, probabilities ): 56 | self.probabilities = probabilities 57 | 58 | 59 | def pull( self, choice): 60 | return 1 if np.random.random() < self.probabilities[choice] else 0 61 | 62 | 63 | 64 | def __len__(self): 65 | return len( self.probabilities ) 66 | -------------------------------------------------------------------------------- /MachineLearningScikitLearn/blender.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.linear_model import LinearRegression 3 | from sklearn.cross_validation import ShuffleSplit 4 | from time import clock 5 | import pp 6 | 7 | class Blender( object): 8 | """ 9 | This class implements a linear blend of different models. 10 | 11 | 12 | methods: 13 | fit( data, response, dict_of_additional_variables ) 14 | add_model( model, name) 15 | predict( new_data, dict_of_additional_variables ) 16 | 17 | 18 | attributes: 19 | coefs_ 20 | 21 | """ 22 | 23 | 24 | def __init__( self, blender = LinearRegression(), training_fraction = 0.8, verbose = False): 25 | self.blender = blender 26 | self.training_fraction = training_fraction 27 | self.verbose = verbose 28 | self.models = dict() 29 | self._n_models = 0 30 | 31 | 32 | def add_model(self, model, name=None): 33 | """ 34 | model: a sklearn model that exposes the methods fit & predict. 
35 | name: a name to specify the model, eg "ElNet500alpha" 36 | 37 | """ 38 | self._n_models +=1 39 | if not name: 40 | name = "%d"%( self._n_models ) 41 | self.models[name] = model 42 | return 43 | 44 | def del_model(self, name ): 45 | try: 46 | del self.models[name] 47 | except KeyError: 48 | print "Model %s not in blender."%name 49 | 50 | return 51 | 52 | 53 | def split_arrays(self, n, test_fraction = 0.1 ): 54 | 55 | 56 | shfSplt = ShuffleSplit( n=n, n_iterations=1, test_size = test_fraction) 57 | train_ix, test_ix = shfSplt.__iter__().next() 58 | return train_ix, test_ix 59 | 60 | 61 | 62 | def fit(self, data, response, dict_of_additional_variables={}): 63 | """ 64 | data: the data matrix, shape (n,d) 65 | response: the response vector (n,) 66 | dict_of_additional_variables: 67 | a dictionary with the keys the model names (optional to include), and the items are of the form: 68 | {"train":[ items to be included in training], "test":[items to be included in testing] } 69 | """ 70 | 71 | #split the data to held-in and held-out. 72 | train_ix, blend_ix = self.split_arrays( data.shape[0], test_fraction = 1- self.training_fraction ) 73 | training_data, blend_data, training_response, blend_response = data[train_ix], data[blend_ix], response[train_ix], response[blend_ix] 74 | 75 | 76 | X = np.zeros( (blend_response.shape[0], len( self.models ) ) ) 77 | 78 | if self.verbose: 79 | print "Shape of training data vs blending data: ", training_data.shape, blend_data.shape 80 | #train the models 81 | 82 | 83 | #try some parrallel 84 | ncpus = max( len( self.models ), 32 ) 85 | job_server = pp.Server( ncpus, ppservers = () ) 86 | jobs = dict() 87 | to_import = ("import numpy as np", "sklearn", "time", "from localRegression import *", "from sklearn.linear_model import sparse", "from sklearn.utils import atleast2d_or_csc") 88 | for name, model in sorted( self.models.iteritems() ): 89 | 90 | try: 91 | fitargs = [ training_data, training_response] + [ array[train_ix] for array in dict_of_additional_variables[name ]] 92 | predictargs = [ blend_data ] + [ array[blend_ix] for array in dict_of_additional_variables[name] ] 93 | except KeyError: 94 | fitargs = [ training_data , training_response] 95 | predictargs = [ blend_data ] 96 | 97 | jobs[name] = job_server.submit( pp_run,(model, name, self.verbose, fitargs, predictargs), (), to_import ) 98 | 99 | if self.verbose: 100 | print "Model %s sent to cpu."%name 101 | 102 | i = 0 103 | for name, model in sorted( self.models.iteritems() ): 104 | self.models[name], X[:,i] = jobs[name]() 105 | i+=1 106 | 107 | if self.verbose: 108 | print "Fitting finished, starting blending." 
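        # (Added comments, not in the original source.) At this point each column of X
        # holds one base model's predictions on the held-out "blend" split, so the fit
        # below learns the stacking weights: with the default blender this is a plain
        # LinearRegression of blend_response on the base models' predictions.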
109 | 110 | self.blender.fit( X, blend_response ) 111 | self.coef_ = self.blender.coef_ 112 | 113 | self._fit_training_data = training_data 114 | self._fit_blend_data = blend_data 115 | self._fit_training_response = training_response 116 | self._fit_blend_response = blend_response 117 | 118 | if self.verbose: 119 | print "Done fitting" 120 | job_server.destroy() 121 | return self 122 | 123 | def predict( self, data, dict_of_additional_variables={}): 124 | 125 | ncpus = max( len( self.models ), 32 ) 126 | job_server = pp.Server( ncpus, ppservers = () ) 127 | jobs = dict() 128 | to_import = ("import numpy as np", "sklearn", "time", "from localRegression import *", "from sklearn.linear_model import sparse", "from sklearn.utils import atleast2d_or_csc") 129 | for name, model in sorted( self.models.iteritems() ): 130 | try: 131 | predictargs = [data] + dict_of_additional_variables[name] 132 | except KeyError: 133 | predictargs = [ data ] 134 | 135 | jobs[name] = job_server.submit( pp_predict, (model, name, self.verbose, predictargs), (), to_import) 136 | 137 | X = np.zeros( (data.shape[0], len( self.models ) ) ) 138 | i = 0 139 | for name, model in sorted( self.models.iteritems() ): 140 | X[:,i] = jobs[name]() 141 | i+=1 142 | job_server.destroy() 143 | return self.blender.predict( X ) 144 | 145 | 146 | 147 | def pp_predict( model, name, verbose, predictargs): 148 | start = time.clock() 149 | p = model.predict( *predictargs ) 150 | if verbose: 151 | print "Model %s fitted, took %.2f seconds"%(name, time.clock() - start ) 152 | return p 153 | 154 | def pp_run( model, name, verbose, fitargs, predictargs): 155 | 156 | start = time.clock() 157 | model.fit(*fitargs) 158 | if verbose: 159 | print "Model %s fitted, took %.2f seconds."%(name, time.clock() - start ) 160 | prediction = model.predict( *predictargs ) 161 | return model, prediction 162 | -------------------------------------------------------------------------------- /MachineLearningScikitLearn/ensembleSelector.py: -------------------------------------------------------------------------------- 1 | #ensemble selection 2 | 3 | import numpy as np 4 | 5 | 6 | def RMSE( Z, W): 7 | return np.sqrt( ((Z - W[:,None])**2).mean(axis=0) ) 8 | 9 | def basis(i, N): 10 | z = np.zeros(N) 11 | z[i] = 1 12 | return z 13 | 14 | 15 | class EnsembleSelection( object ): 16 | """ 17 | This class implements a greedy ensemble selection algorithm outlined in Ensemble Selection from Libraries of Models. 18 | The algorthim starts with an initial ensemble of models (if fraction_sorted_initialization > 0), and addeds models 19 | sequentially until improve falls below some threshold or the max number of models are selected. 20 | 21 | verbose: 0,1 or 2. Report the current score, number of models at each iterations. 22 | with_replacement: all the algorithm to select models already selected. 23 | fraction_sorted_initialization: The fraction of the best models to initialilly include in the ensemble 24 | bag_selection: Perform the following bagged_selection_times. select bagged_fraction and perform the greedy algo of them. 25 | bagged_fraction: see above. 26 | max_models: the maximum number of models to include in an ensemble 27 | score_function: the function to minimize. 28 | tol: the fractional decrease in the score_function to continue selection. RMSE_{i+1}/RMSE_{i} > 1 + tol. 29 | fit_models: instead of giving already fitted models, this object will fit the models too. 
30 | training_fraction: the fraction to use for training, 1-training_fraction is used as ensemble selection. 31 | 32 | methods: 33 | add_model( iterable_of_models ): add a collection of models to the algorithm. Must be performed before fit() is called. 34 | Models must be aready fitted and have a .predict() method exposed. 35 | fit( X, Y): perform the ensemble selection on data X and target Y 36 | predict( X ): return the prediction of the ensemble. 37 | 38 | """ 39 | 40 | 41 | def __init__(self, verbose = 1, 42 | with_replacement = True, 43 | fraction_sorted_initialization = 1.0, #bayesian prior of 1/N. 44 | bag_selection = 0, 45 | bagged_fraction = 0.5, 46 | max_models = None, 47 | score_function = RMSE, 48 | tol = 1e-4, 49 | fit_models = False, 50 | training_fraction =0.8, 51 | models = []): 52 | self.verbose = verbose 53 | self.with_replacement = with_replacement 54 | self.fraction_sorted_initialization = fraction_sorted_initialization 55 | self.bag_selection = bag_selection 56 | self.bagged_fraction = bagged_fraction 57 | self.max_models = max_models 58 | self.fit_models = fit_models 59 | self.training_fraction = training_fraction 60 | 61 | self.score_function = score_function 62 | self.tol = tol 63 | 64 | if self.max_models == None: 65 | self.max_models = np.inf 66 | 67 | self.models= models 68 | 69 | 70 | def add_model(self, model ): 71 | """model should be an iterable""" 72 | self.models += [ m for m in model ] 73 | 74 | return 75 | 76 | 77 | def _predict( self, predictions, models_included_ ): 78 | return (np.dot( predictions, models_included_ )/models_included_.sum())[:,None] 79 | 80 | def _fit( self, predictions, Y, ix): 81 | 82 | n,N = predictions.shape 83 | #train and store the prediction results 84 | models_included_ = np.zeros( N ) 85 | 86 | init_n_to_include = max( int( self.fraction_sorted_initialization*N), 1) 87 | models_included_[ np.argsort( self.individual_scores[ix] )[:init_n_to_include] ] = 1 88 | 89 | total_scores_ = np.array( [np.inf, self.score_function( self._predict( predictions, models_included_ ) , Y) ] ) 90 | 91 | while (models_included_.sum() < self.max_models) and ( total_scores_[-2]/total_scores_[-1] > 1 + self.tol ) : 92 | 93 | #find the best addition. 94 | _scores = [ self.score_function(self._predict(predictions, models_included_ + basis(i, N)), Y) \ 95 | for i in range(N) if (models_included_[i] == 0 or self.with_replacement)] 96 | m = np.argmin( _scores ) 97 | if _scores[m] < total_scores_[-1]: 98 | total_scores_ = np.append( total_scores_, _scores[m] ) 99 | models_included_[m] += 1 100 | if self.verbose > 1: 101 | print "Added model %d."%m 102 | print "Current score: %.3f."%total_scores_[-1] 103 | print "Current models included: ", models_included_ 104 | print 105 | else: 106 | flag = True 107 | break 108 | 109 | if self.verbose > 0: 110 | if (models_included_.sum() >= self.max_models): 111 | print "Exited after %d iterations because number of models exceeded. 
%d >= self.max_models"%(models_included_.sum(), models_included_.sum() ) 112 | elif ( total_scores_[-2]/total_scores_[-1] <= 1 + self.tol ): 113 | print "Exited after %d iterations because tolerence exceeded: %.8f < 1 + tol"%(models_included_.sum(), total_scores_[-2]/total_scores_[-1]) 114 | elif flag: 115 | print "The (local) minimum was found after %d iterations."%(models_included_.sum()) 116 | print "Score: %.4f"%total_scores_[-1] 117 | return models_included_/models_included_.sum() 118 | 119 | 120 | def fit(self, X, Y): 121 | N = len( self.models ) 122 | n,d = X.shape 123 | 124 | if self.fit_models: 125 | cutoff = int(n*self.training_fraction) 126 | a = np.arange(n) 127 | np.random.shuffle(a) 128 | training_data, training_target = X[ a[:cutoff] ,:], Y[ a[:cutoff] ] 129 | [ m.fit( training_data, training_target) for m in self.models ] 130 | 131 | if self.verbose > 0: 132 | print "models trained." 133 | X, Y = X[ a[cutoff:], :], Y[ a[cutoff:] ] 134 | n,d = X.shape 135 | 136 | #train and store the prediction results 137 | predictions = np.zeros( (n, N) ) 138 | for i in range(N): 139 | predictions[ :, i] = self.models[i].predict( X ) 140 | 141 | self.individual_scores = self.score_function( predictions, Y ) 142 | self.models_included_ = np.zeros( N ) 143 | p = self.bagged_fraction if self.bag_selection > 0 else 1 144 | 145 | for i in range( max(1, self.bag_selection ) ): 146 | a = np.arange( N) 147 | np.random.shuffle( a) 148 | ix = a[:int(p*N) ] 149 | models_included_ = self._fit( predictions[:, ix], Y, ix ) 150 | self.models_included_[ix] += models_included_ 151 | 152 | self.models_included_ /= self.models_included_.sum() 153 | self.score_ = self.score_function( self._predict( predictions, self.models_included_), Y ) 154 | return self 155 | 156 | 157 | def get_params(self, deep=False): 158 | return self.__dict__ 159 | 160 | def predict( self, X ): 161 | 162 | N = len( self.models ) 163 | n,d = X.shape 164 | #train and store the prediction results 165 | predictions = np.zeros( (n, N) ) 166 | for i in range(N): 167 | predictions[ :, i] = self.models[i].predict( X ) 168 | 169 | return self._predict( predictions, self.models_included_ ) 170 | 171 | -------------------------------------------------------------------------------- /MachineLearningScikitLearn/maxCorrelationTransformer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.covariance import EllipticEnvelope 3 | from sklearn.linear_model import LinearRegression as LR 4 | 5 | """ 6 | 7 | Note: This shows less than benchmark (all identity) performace. The issue is I am maximizing the wrong thing. I 8 | should be trying to maximize the partial-correlation. TODO 9 | 10 | To do this, we will use a greedy algorithm. 
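
Added illustration (not from the original author): the partial correlation referred to
in the TODO can be obtained by regressing the covariates Z out of both x and y and
correlating the residuals, which is essentially what the partial_correlation() helper
further down is meant to compute. A minimal sketch, assuming scikit-learn is available:

    from sklearn.linear_model import LinearRegression
    import numpy as np
    rng = np.random.RandomState(0)
    Z = rng.randn(200, 2)
    x = Z[:, 0] + 0.1 * rng.randn(200)
    y = Z[:, 0] + 0.1 * rng.randn(200)
    rx = x - LinearRegression().fit(Z, x).predict(Z)   # residual of x after removing Z
    ry = y - LinearRegression().fit(Z, y).predict(Z)   # residual of y after removing Z
    print np.corrcoef(rx, ry)[0, 1]   # near 0 once Z is controlled for, although corr(x, y) is ~1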
11 | 12 | """ 13 | 14 | np.seterr( all="raise") 15 | 16 | EPSILON = 1e-2 17 | 18 | dict_of_transforms = dict([ 19 | ("identity", lambda x: x), 20 | ("logPlus1",lambda x: np.log(x+1)), 21 | ("sqrtPlus1", lambda x: np.sqrt(x+1) ), 22 | ("sqrt",lambda x: np.sqrt(x) ), 23 | ("cuberoot", lambda x: x**(1.0/3.0) ), 24 | ("squared", lambda x: x**2 ), 25 | ("squaredPlus1", lambda x: (x+1)**2 ), 26 | ("cubed",lambda x: x**3 ), 27 | ("inverse",lambda x: 1./(x+EPSILON) ), 28 | ("exp",lambda x: np.exp(x) ), 29 | ("negexp",lambda x: np.exp(-x) ), 30 | ("inversePlus1", lambda x: 1./(x+1) ), 31 | ("arctan", lambda x: np.arctan(x) ), 32 | ("tan", lambda x: np.tan(x) ), 33 | ("arcsinsqrt", lambda x: np.arcsin(np.sqrt(x)) ), 34 | ("inversesqrt", lambda x: 1.0/(np.sqrt(x)+EPSILON) ), 35 | ("inversesqrtPlus1", lambda x: 1.0/(np.sqrt(x+1)) ), 36 | ("x/(1-x)", lambda x: x/(1-x+EPSILON) ), 37 | ("sqrtlog",lambda x: np.sqrt( -np.log( x + EPSILON) ) ), 38 | ("rank", lambda x: np.argsort( x ) ), 39 | 40 | ]) 41 | 42 | 43 | 44 | class MaxCorrelationTransformer(object): 45 | """ 46 | transforms the features of a data matrix to increase the correlation with a response vector, y. 47 | attributes: 48 | transforms: a dictionary of functions to try as a transform (defaults to dict_of_transforms) 49 | normalize01: True is the data will be normalized to between [0,1] 50 | additional_transforms: a dictionary of transforms in addition to the default. 51 | 52 | 53 | methods: 54 | fit: 55 | transform: 56 | fit_transform: 57 | 58 | 59 | 60 | """ 61 | def __init__(self, transforms = dict_of_transforms, 62 | normalize01 = False, 63 | additional_transforms = {}, 64 | verbose=False, 65 | remove_outliers=False 66 | tol = 1e-2): 67 | self.transforms = transforms 68 | self.verbose = verbose 69 | self.transforms.update( additional_transforms ) 70 | #map( _wrapper, transforms + additional_transforms ) 71 | for fname, func in self.transforms.iteritems(): 72 | self.transforms[ fname ] = _wrapper(func, verbose) 73 | 74 | 75 | self.tol = tol 76 | 77 | 78 | def fit(self, X, Y): 79 | "to do" 80 | 81 | n,d = X.shape 82 | 83 | self.transforms_ = ["identity"]*d 84 | abs_partial_correlations_ = abs(partial_correlation_via_inverse(X,Y)[-1, 0:-1]) 85 | temp_abs_partial_correlations_ = -1e2*np.ones_like( abs_partial_correlations_ ) 86 | ix = np.arange( d) 87 | while abs( temp_abs_partial_correlations_.sum() - abs_partial_correlations_.sum() ) > self.tol: 88 | for i in xrange(d): 89 | _X = X[:,i] 90 | Z = X[:, ix != i] 91 | for transform_name, transform in self.transforms: 92 | no_error, f_X = transform( _X ) 93 | if no_error: 94 | pc = abs( partial_correlation( f_X, Y, Z ) ) 95 | if pc > abs_partial_correlations_[i]: 96 | temp_abs_partial_correlations_[i] = pc 97 | self.transforms_[i] = transform_name 98 | 99 | 100 | 101 | return self 102 | 103 | 104 | def transform(self, X): 105 | if self.normalize01: 106 | X = _normalize01( X ) 107 | 108 | 109 | newX = X.copy() 110 | n,d = X.shape 111 | for i in range(d): 112 | newX[:,i] = self.transforms[ self.transforms_[i] ]( X[:,i] ) 113 | 114 | return newX 115 | 116 | def fit_transform( self, X, y): 117 | 118 | self.fit( X, y) 119 | return self.transformedX 120 | 121 | 122 | 123 | def _corr(x,y, remove_outliers=False): 124 | #check if x,y are same shape 125 | n = x.shape[0] 126 | if x.var()==0 or y.var()==0: 127 | return 0 128 | else: 129 | if remove_outliers: 130 | ee = EllipticEnvelope(store_precision = False, contamination=0.05) 131 | ee.fit( np.concatenate( [x[:,None],y[:,None] ], axis=1) ) 132 | c = 
ee.covariance_ 133 | return c[0,1]/np.sqrt( c[0,0]*c[1,1] ) 134 | return np.dot( x - x.mean(), y - y.mean() ) / np.sqrt(( x.var()*y.var() ))/ n 135 | 136 | def _wrapper(f, verbose = False): 137 | def g(x): 138 | try: 139 | u = f(x) 140 | except FloatingPointError as e: 141 | if verbose: 142 | print "Error.", e 143 | return False, np.zeros_like(x) 144 | if ( ~ np.isfinite( u ) ).sum() > 0: 145 | if verbose: 146 | print "Infinite." 147 | return False, np.zeros_like(x) 148 | else: 149 | return True, u 150 | return g 151 | 152 | def partial_correlation(X, Y, Z): 153 | """ 154 | This computes the partial-correlation between X and Y, with covariates Z. 155 | """ 156 | lr1 = LR() 157 | lr2 = LR() 158 | lr1.fit(Z,X) 159 | lr2.fit(Z,Y) 160 | 161 | return np.corrcoef( Y - lr1.predict(Z), X - lr2.predict(Z) )[0,1] 162 | 163 | def partial_correlation_via_inverse(X, Y=None): 164 | try: 165 | X = np.concatenate([ X,Y], axis=1 ) 166 | except: 167 | pass 168 | return -cov2corr( np.linalg.inv(np.dot(X.T, X) ) ) 169 | 170 | def cov2corr( A ): 171 | """ 172 | covariance matrix to correlation matrix. 173 | """ 174 | d = np.sqrt(A.diagonal()) 175 | A = ((A.T/d).T)/d 176 | #A[ np.diag_indices(A.shape[0]) ] = np.ones( A.shape[0] ) 177 | return A -------------------------------------------------------------------------------- /MachineLearningScikitLearn/outlier.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | #imports and definitions 3 | import numpy as np 4 | import scipy.stats as stats 5 | import scipy.spatial.distance as distance 6 | import matplotlib.pyplot as plt 7 | from sklearn.covariance import MinCovDet as MCD 8 | 9 | 10 | 11 | class Outlier_detection(object): 12 | 13 | def __init__(self, support_fraction = 0.95, verbose = True, chi2_percentile = 0.995): 14 | self.verbose = verbose 15 | self.support_fraction = support_fraction 16 | self.chi2 = stats.chi2 17 | self.mcd = MCD(store_precision = True, support_fraction = support_fraction) 18 | self.chi2_percentile = chi2_percentile 19 | 20 | def fit(self, X): 21 | """Prints some summary stats (if verbose is one) and returns the indices of what it consider to be extreme""" 22 | self.mcd.fit(X) 23 | mahalanobis = lambda p: distance.mahalanobis(p, self.mcd.location_, self.mcd.precision_ ) 24 | d = np.array(map(mahalanobis, X)) #Mahalanobis distance values 25 | self.d2 = d ** 2 #MD squared 26 | n, self.degrees_of_freedom_ = X.shape 27 | self.iextreme_values = (self.d2 > self.chi2.ppf(0.995, self.degrees_of_freedom_) ) 28 | if self.verbose: 29 | print "%.3f proportion of outliers at %.3f%% chi2 percentile, "%(self.iextreme_values.sum()/float(n), self.chi2_percentile) 30 | print "with support fraction %.2f."%self.support_fraction 31 | return self 32 | 33 | def plot(self,log=False, sort = False ): 34 | """ 35 | Cause plotting is always fun. 36 | 37 | log: transform the distance-sq to a log ( distance-sq ) 38 | sort: sort the data according to distnace before plotting 39 | ifollow: a set if indices to mark with yellow, useful for seeing where data lies across views. 
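
        Added usage example (illustrative only, not from the original docstring):

            import numpy as np
            X = np.random.randn(500, 3)
            X[:10] += 8                                  # plant a few gross outliers
            od = Outlier_detection(chi2_percentile=0.995).fit(X)
            od.plot(sort=True)                           # planted points should sit above the chi2 line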
40 | 41 | """ 42 | n = self.d2.shape[0] 43 | fig = plt.figure() 44 | 45 | x = np.arange( n ) 46 | ax = fig.add_subplot(111) 47 | 48 | 49 | transform = (lambda x: x ) if not log else (lambda x: np.log(x)) 50 | chi_line = self.chi2.ppf(self.chi2_percentile, self.degrees_of_freedom_) 51 | 52 | chi_line = transform( chi_line ) 53 | d2 = transform( self.d2 ) 54 | if sort: 55 | isort = np.argsort( d2 ) 56 | ax.scatter(x, d2[isort], alpha = 0.7, facecolors='none' ) 57 | plt.plot( x, transform(self.chi2.ppf( np.linspace(0,1,n),self.degrees_of_freedom_ )), c="r", label="distribution assuming normal" ) 58 | 59 | 60 | else: 61 | ax.scatter(x, d2 ) 62 | extreme_values = d2[ self.iextreme_values ] 63 | ax.scatter( x[self.iextreme_values], extreme_values, color="r" ) 64 | 65 | ax.hlines( chi_line, 0, n, 66 | label ="%.1f%% $\chi^2$ quantile"%(100*self.chi2_percentile), linestyles = "dotted" ) 67 | 68 | ax.legend() 69 | ax.set_ylabel("distance squared") 70 | ax.set_xlabel("observation") 71 | ax.set_xlim(0, self.d2.shape[0]) 72 | 73 | 74 | plt.show() 75 | 76 | 77 | -------------------------------------------------------------------------------- /MachineLearningScikitLearn/pretty_pca.py: -------------------------------------------------------------------------------- 1 | #prettyPCA 2 | 3 | 4 | """ 5 | This functions plots more interesting plot of PCA reduced data in 2d. 6 | """ 7 | 8 | import matplotlib.pyplot as plt 9 | import matplotlib.gridspec as gridspec 10 | 11 | def pretty_pca( skPCA, transformed_data, var_names = None, fraction_data = 1., scale = 3, scatter_color = None): 12 | """ 13 | skPCA: a sklearn-fitted PCA instance. 14 | transformed_data: the pca-reduced data. 15 | var_names: the variable names; defaults to numbers starting at 0. 16 | fraction_data: fraction of data points to plot. 17 | scale: how much to scale the lines by, default 3. 
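    scatter_color: optional array passed to plt.scatter's `c` argument to colour the points.

    Example (added for illustration; assumes scikit-learn and its iris dataset are available):

        from sklearn.decomposition import PCA
        from sklearn.datasets import load_iris
        iris = load_iris()
        pca = PCA().fit(iris.data)
        pretty_pca(pca, pca.transform(iris.data),
                   var_names=list(iris.feature_names),
                   scatter_color=iris.target)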
18 | 19 | """ 20 | line_color = "k" 21 | transformed_data = transformed_data[::int(1.0/fraction_data),:2] 22 | components = skPCA.components_[:2,:].T 23 | n_features = components.shape[0] 24 | if var_names == None: 25 | var_names = [ "%d"%i for i in range(n_features) ] 26 | else: 27 | var_names = [ "%s, %d"%(name, i) for i,name in enumerate(var_names) ] 28 | 29 | 30 | fig = plt.figure(1,figsize=(8,5)) 31 | gs = gridspec.GridSpec( 2, 1, height_ratios=[3,1] ) 32 | 33 | ax = plt.subplot( gs[0] ) 34 | if scatter_color is not None: 35 | ax.scatter( transformed_data[:,0], transformed_data[:,1], edgecolors='none', alpha = 0.6, c = scatter_color ) 36 | else: 37 | ax.scatter( transformed_data[:,0], transformed_data[:,1], edgecolors='none', alpha = 0.5 ) 38 | 39 | ax.scatter( [0], [0], s = 5, c = "k" ) 40 | for i in range( n_features ): 41 | #ax.plot( *zip([0,0], scale*components[i,:]) , c = line_color, lw = 2, alpha = 0.8 ) 42 | ax.annotate( "", scale*components[i, :], (0,0), arrowprops = dict( arrowstyle="->")) 43 | ax.annotate(var_names[i], xy=scale*components[i,:], xycoords='data', 44 | #xytext=(-50, 30), 45 | textcoords='offset points', 46 | size = 12, 47 | ) 48 | ax.set_title("2 Dimensional PCA data") 49 | 50 | ax = plt.subplot( gs[1] ) 51 | 52 | ax.bar( range(skPCA.explained_variance_ratio_.shape[0]), skPCA.explained_variance_ratio_ ) 53 | ax.bar( range(2), skPCA.explained_variance_ratio_[:2], color = "r" ) 54 | ax.set_title( "Explained variance ratio" ) 55 | plt.show() 56 | return 57 | -------------------------------------------------------------------------------- /MachineLearningScikitLearn/supervised_pca.py: -------------------------------------------------------------------------------- 1 | #supervised PCA according to Supervised Principal Compontent Anaysis by Ghodsi et al. 2010 2 | 3 | import numpy as np 4 | from scipy import linalg 5 | 6 | from ..utils.arpack import eigsh 7 | from ..base import BaseEstimator, TransformerMixin 8 | from ..preprocessing import KernelCenterer, scale 9 | from ..metrics.pairwise import pairwise_kernels 10 | 11 | 12 | from time import clock 13 | 14 | 15 | 16 | class SupervisedPCA(BaseEstimator, TransformerMixin): 17 | """Supervised Principal component analysis (SPCA) 18 | 19 | Non-linear dimensionality reduction through the use of kernels. 20 | 21 | Parameters 22 | ---------- 23 | n_components: int or None 24 | Number of components. If None, all non-zero components are kept. 25 | 26 | kernel: "linear" | "poly" | "rbf" | "sigmoid" | "precomputed" 27 | Kernel. 28 | Default: "linear" 29 | 30 | degree : int, optional 31 | Degree for poly, rbf and sigmoid kernels. 32 | Default: 3. 33 | 34 | gamma : float, optional 35 | Kernel coefficient for rbf and poly kernels. 36 | Default: 1/n_features. 37 | 38 | coef0 : float, optional 39 | Independent term in poly and sigmoid kernels. 40 | 41 | 42 | eigen_solver: string ['auto'|'dense'|'arpack'] 43 | Select eigensolver to use. If n_components is much less than 44 | the number of training samples, arpack may be more efficient 45 | than the dense eigensolver. 46 | 47 | tol: float 48 | convergence tolerance for arpack. 
49 | Default: 0 (optimal value will be chosen by arpack) 50 | 51 | max_iter : int 52 | maximum number of iterations for arpack 53 | Default: None (optimal value will be chosen by arpack) 54 | 55 | Attributes 56 | ---------- 57 | 58 | `lambdas_`, `alphas_`: 59 | Eigenvalues and eigenvectors of the centered kernel matrix 60 | 61 | 62 | """ 63 | 64 | def __init__(self, n_components=None, kernel="linear", gamma=0, degree=3, 65 | coef0=1, alpha=1.0, fit_inverse_transform=False, 66 | eigen_solver='auto', tol=0, max_iter=None): 67 | 68 | 69 | self.n_components = n_components 70 | self.kernel = kernel.lower() 71 | self.gamma = gamma 72 | self.degree = degree 73 | self.coef0 = coef0 74 | self.alpha = alpha 75 | self.fit_inverse_transform = fit_inverse_transform 76 | self.eigen_solver = eigen_solver 77 | self.tol = tol 78 | self.max_iter = max_iter 79 | self.centerer = KernelCenterer() 80 | 81 | 82 | def transform(self, X): 83 | """ 84 | Returns a new X, X_trans, based on previous self.fit() estimates 85 | """ 86 | return X.dot( self.alphas_ ) 87 | 88 | 89 | def fit(self,X,Y): 90 | self._fit(X,Y) 91 | return 92 | 93 | def fit_transform( self, X, Y): 94 | 95 | 96 | self.fit( X,Y) 97 | return self._transform() 98 | 99 | def _transform(self): 100 | 101 | return self.X_fit.dot(self.alphas_) 102 | 103 | 104 | def _fit(self, X, Y): 105 | #find kenerl matrix of Y 106 | K = self.centerer.fit_transform(self._get_kernel(Y)) 107 | #scale X 108 | X_scale = scale(X) 109 | 110 | 111 | if self.n_components is None: 112 | n_components = K.shape[0] 113 | else: 114 | n_components = min(K.shape[0], self.n_components) 115 | 116 | #compute eigenvalues of X^TKX 117 | 118 | M = (X.T).dot(K).dot(X) 119 | print "here" 120 | if self.eigen_solver == 'auto': 121 | if M.shape[0] > 200 and n_components < 10: 122 | eigen_solver = 'arpack' 123 | else: 124 | eigen_solver = 'dense' 125 | else: 126 | eigen_solver = self.eigen_solver 127 | 128 | if eigen_solver == 'dense': 129 | self.lambdas_, self.alphas_ = linalg.eigh( 130 | M, eigvals=(M.shape[0] - n_components, M.shape[0] - 1)) 131 | elif eigen_solver == 'arpack': 132 | self.lambdas_, self.alphas_ = eigsh(M, n_components, 133 | which="LA", 134 | tol=self.tol) 135 | indices = self.lambdas_.argsort()[::-1] 136 | self.lambdas_ = self.lambdas_[indices] 137 | self.alphas_ = self.alphas_[:, indices] 138 | 139 | #remove the zero/negative eigenvalues 140 | self.alphas_ = self.alphas_[:, self.lambdas_ > 0 ] 141 | self.lambdas_ = self.lambdas_[ self.lambdas_ > 0 ] 142 | print self.alphas_.shape 143 | 144 | self.X_fit = X; 145 | 146 | 147 | def _get_kernel(self, X, Y=None): 148 | params = {"gamma": self.gamma, 149 | "degree": self.degree, 150 | "coef0": self.coef0} 151 | try: 152 | return pairwise_kernels(X, Y, metric=self.kernel, 153 | filter_params=True, n_jobs = -1, **params) 154 | except AttributeError: 155 | raise ValueError("%s is not a valid kernel. Valid kernels are: " 156 | "rbf, poly, sigmoid, linear and precomputed." 157 | % self.kernel) 158 | 159 | 160 | 161 | 162 | class KernelSupervisedPCA( BaseEstimator, TransformerMixin): 163 | 164 | """Kernel Supervised Principal component analysis (SPCA) 165 | 166 | Non-linear dimensionality reduction through the use of kernels. 167 | 168 | Parameters 169 | ---------- 170 | n_components: int or None 171 | Number of components. If None, all non-zero components are kept. 172 | 173 | x||ykernel: "linear" | "poly" | "rbf" | "sigmoid" | "precomputed" 174 | Kernel. 
175 | Default: "linear" 176 | 177 | degree : int, optional 178 | Degree for poly, rbf and sigmoid kernels. 179 | Default: 3. 180 | 181 | gamma : float, optional 182 | Kernel coefficient for rbf and poly kernels. 183 | Default: 1/n_features. 184 | 185 | coef0 : float, optional 186 | Independent term in poly and sigmoid kernels. 187 | 188 | 189 | eigen_solver: string ['auto'|'dense'|'arpack'] 190 | Select eigensolver to use. If n_components is much less than 191 | the number of training samples, arpack may be more efficient 192 | than the dense eigensolver. 193 | 194 | tol: float 195 | convergence tolerance for arpack. 196 | Default: 0 (optimal value will be chosen by arpack) 197 | 198 | max_iter : int 199 | maximum number of iterations for arpack 200 | Default: None (optimal value will be chosen by arpack) 201 | 202 | Attributes 203 | ---------- 204 | 205 | `lambdas_`, `alphas_`: 206 | Eigenvalues and eigenvectors of the centered kernel matrix 207 | 208 | 209 | """ 210 | 211 | def __init__(self, n_components=None, xkernel={'kernel': "linear", 'gamma':0, 'degree':3, 212 | 'coef0':1}, ykernel = {'kernel': "linear", 'gamma':0, 'degree':3, 213 | 'coef0':1}, fit_inverse_transform=False, 214 | eigen_solver='auto', tol=0, max_iter=None): 215 | 216 | 217 | self.n_components = n_components 218 | self.xkernel = xkernel 219 | self.ykernel = ykernel 220 | self.fit_inverse_transform = fit_inverse_transform 221 | self.eigen_solver = eigen_solver 222 | self.tol = tol 223 | self.max_iter = max_iter 224 | self.centerer = KernelCenterer() 225 | 226 | 227 | def transform(self, X): 228 | """ 229 | Returns a new X, X_trans, based on previous self.fit() estimates 230 | """ 231 | K = self._get_kernel(self.X_fit, self.xkernel, X ) 232 | return K.T.dot( self.alphas_ ) 233 | 234 | 235 | def fit(self,X,Y): 236 | self._fit(X,Y) 237 | return 238 | 239 | def fit_transform( self, X, Y): 240 | 241 | 242 | self.fit( X,Y) 243 | return self._transform() 244 | 245 | def _transform(self): 246 | 247 | return self.Kx_fit.dot(self.alphas_) 248 | 249 | 250 | def _fit(self, X, Y): 251 | #find kenerl matrix of Y 252 | Ky = self.centerer.fit_transform(self._get_kernel(Y), self.ykernel) 253 | Kx = self.centerer.fit_transform( self._get_kernel(X), self.xkernel) 254 | 255 | 256 | 257 | if self.n_components is None: 258 | n_components = Ky.shape[0] 259 | else: 260 | n_components = min(Ky.shape[0], self.n_components) 261 | 262 | #compute eigenvalues of X^TKX 263 | 264 | M = (Kx).dot(Ky).dot(Kx) 265 | if self.eigen_solver == 'auto': 266 | if M.shape[0] > 200 and n_components < 10: 267 | eigen_solver = 'arpack' 268 | else: 269 | eigen_solver = 'dense' 270 | else: 271 | eigen_solver = self.eigen_solver 272 | 273 | if eigen_solver == 'dense': 274 | self.lambdas_, self.alphas_ = linalg.eigh( 275 | M, Kx, eigvals=(M.shape[0] - n_components, M.shape[0] - 1)) 276 | elif eigen_solver == 'arpack': 277 | self.lambdas_, self.alphas_ = eigsh(M, Kx, n_components, 278 | which="LA", 279 | tol=self.tol) 280 | indices = self.lambdas_.argsort()[::-1] 281 | self.lambdas_ = self.lambdas_[indices] 282 | self.alphas_ = self.alphas_[:, indices] 283 | 284 | #remove the zero/negative eigenvalues 285 | self.alphas_ = self.alphas_[:, self.lambdas_ > 0 ] 286 | self.lambdas_ = self.lambdas_[ self.lambdas_ > 0 ] 287 | 288 | self.X_fit = X; 289 | self.Kx_fit = Kx; 290 | 291 | def _get_kernel(self, X, params, Y=None): 292 | try: 293 | return pairwise_kernels(X, Y, metric=params['kernel'], 294 | n_jobs = -1, **params) 295 | except AttributeError: 296 | raise 
ValueError("%s is not a valid kernel. Valid kernels are: " 297 | "rbf, poly, sigmoid, linear and precomputed." 298 | % params['kernel']) 299 | 300 | 301 | -------------------------------------------------------------------------------- /MachineLearningScikitLearn/weighted_least_squares.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sklearn.linear_model.LinearRegression as LR 3 | 4 | 5 | class WeightedLinearRegression(LR): 6 | """ 7 | Implements a weighted least squares class. 8 | weights: a nx1 vector of non-zero weights. 9 | 10 | """ 11 | def __init__(weights, **kwargs): 12 | print "warning: untested" 13 | super(LR, self).__init__(**kwargs) 14 | self.weights= weights 15 | 16 | 17 | def fit( X, Y): 18 | assert X.shape[0] == Y.shape[0] == self.weights.shape[0], "Objects must be same size" 19 | sqw = np.sqrt( self.weights ) 20 | self.fit( X*sqw, Y*sqw ) 21 | return self 22 | 23 | def predict( X ): 24 | return self.predict( X*np.sqrt(self.weights) ) -------------------------------------------------------------------------------- /MonteCarlo/Copulas/README.txt: -------------------------------------------------------------------------------- 1 | See MCMC (specificaly mcmc_examples.py) folder for python implementation of copulas -------------------------------------------------------------------------------- /MonteCarlo/Integration/Assignment.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CamDavidsonPilon/Python-Numerics/043ab4ad9003325c6270486b24d163933e0c7e8a/MonteCarlo/Integration/Assignment.pdf -------------------------------------------------------------------------------- /MonteCarlo/Integration/MonteCarloIntegrator.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import scipy.stats as stats 4 | import time 5 | 6 | class MCIntegrator( object ): 7 | """ 8 | target_function: a function that accepts a n-D array, and returns an n-D array. 9 | interval: the interval of the integration 10 | b_antithetic: whether to use antithesis variables. Much quicker, but only useful on monotonic target_functions 11 | sampling_dist: a scipy frozen distribution with support equal to the interval 12 | N: number of variables to use in the initial estimate. 
13 | control_variates = a list of function that accepts a nD array, and return an nD array 14 | """ 15 | def __init__(self, target_function, 16 | interval = (0,1), 17 | N = 10000, 18 | b_antithetic = False, 19 | sampling_dist = stats.uniform(), 20 | verbose=False, 21 | control_variates = []): 22 | self.target_function = target_function 23 | self.min_interval, self.max_interval = interval 24 | self.N_ = N 25 | self.N = 0 26 | self.sampling_dist = sampling_dist 27 | self.value =0 28 | self.b_antithetic = b_antithetic 29 | self.verbose = verbose 30 | self.control_variates = control_variates 31 | 32 | def estimate_N(self, N ): 33 | self.N += N 34 | return self._estimate(N) 35 | 36 | 37 | 38 | def _estimate(self, N): 39 | 40 | #generate N values from sampling_dist 41 | if not self.b_antithetic: 42 | U = self.sampling_dist.rvs(N) 43 | Y = self.target_function( U ) 44 | for func in self.control_variates: 45 | X = func(U) 46 | Y += X 47 | 48 | if self.verbose: 49 | print Y.var() 50 | self.value += Y.sum() 51 | else: 52 | U_ = self.sampling_dist.rvs(N/2) 53 | antiU_ = self.min_interval + (self.max_interval - U_ ) 54 | Y = (self.target_function( U_ ) + self.target_function( antiU_ ) ) 55 | if self.verbose: 56 | print Y.var() 57 | self.value +=Y.sum() 58 | return self.value / self.N 59 | 60 | def estimate(self): 61 | self.N += self.N_ 62 | return self._estimate(self.N_) 63 | 64 | 65 | 66 | if __name__ == "__main__": 67 | #Some examples: 68 | 69 | 70 | def target(u): 71 | return np.exp(-u**2)*2 72 | 73 | mci = MCIntegrator( target, interval =(0,2), b_antithetic = False, sampling_dist = stats.uniform(0,2), verbose= True ) 74 | N = 1e6 75 | 76 | start = time.clock() 77 | print "Using %d samples,"%N 78 | print "Non-antithetic: %.5f."%mci.estimate_N(N ) 79 | print "Duration: %.3f s."%(time.clock() - start) 80 | print 81 | mci = MCIntegrator( target, interval =(0,2), b_antithetic = True, sampling_dist = stats.uniform(0,2), verbose= True ) 82 | start = time.clock() 83 | print "Antithetic: %.5f."%mci.estimate_N(N ) 84 | print "Duration: %.3f s."%(time.clock() - start) 85 | print 86 | 87 | """ 88 | Using 1000000 samples, 89 | 0.474815598284 90 | Non-antithetic: 0.88140. 91 | Duration: 0.382 s. 92 | 93 | 0.0417625416316 94 | Antithetic: 0.88216. 95 | Duration: 0.303 s. 96 | """ 97 | 98 | 99 | #Using importance sampling 100 | 101 | def importance_function(u): 102 | return (-.5*u + 1)*2 103 | 104 | 105 | class Importance(object): 106 | def __init__(self): 107 | pass 108 | 109 | def rvs(self,n): 110 | u = stats.uniform(0,1).rvs( n) 111 | return 2*( 1 - np.sqrt(u) ) 112 | 113 | sampling_dist = Importance() 114 | mci = MCIntegrator( target, interval = (0,2), b_antithetic = False, sampling_dist = sampling_dist, N=100000, verbose= True ) 115 | print mci.estimate() 116 | 117 | 118 | #using control variates 119 | 120 | def polynomial_control( u ): 121 | return -.26*( (1-u**2) - -1.0/3) 122 | 123 | mci = MCIntegrator( target, interval =(0,2), sampling_dist = stats.uniform(0,2), verbose= True, control_variates=[polynomial_control] ) 124 | start = time.clock() 125 | print "Control Variates: %.5f."%mci.estimate_N(N ) 126 | print "Duration: %.3f s."%(time.clock() - start) 127 | 128 | 129 | -------------------------------------------------------------------------------- /MonteCarlo/Integration/Q6.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | """ 4 | Q6. 5 | 6 | The best estimate of c* is about 3.05. 
To find this I used a gamma distribution to estimate the 7 | integral for values of c between 0 and q and plotted the results. With this optimal 8 | value of c*, the expected value is approximatly equal to 9 | 10 | E[ 1_{x > q} ] = 1.139e-06 11 | 12 | 13 | 14 | """ 15 | 16 | 17 | import q2 18 | import numpy as np 19 | import scipy.stats as stats 20 | Q = 3.7 21 | interval = (0, np.infty) 22 | 23 | def target(u, c): 24 | return (2*u**2)/(u-c)*np.exp( - ( u**2 + 2*c*u - c**2 ) ) 25 | 26 | 27 | potentialC = np.array( [0.1, 0.5, 1, 2, 3, 3.5] ) 28 | potentialCprime = np.linspace( 2.5, 3.55, 20) 29 | estimates = np.zeros_like( potentialCprime) 30 | for i,c in enumerate(potentialCprime): 31 | #sampling_dist = stats.norm( loc = c, scale= 5 ) 32 | sampling_dist = stats.gamma(1, loc=Q ) 33 | target_c = lambda x: target(x,c) 34 | mci = q2.MCIntegrator( target_c, interval =interval, b_antithetic = False, sampling_dist = sampling_dist, N=100000, verbose= False ) 35 | estimates[i] = mci.estimate() 36 | 37 | 38 | 39 | 40 | #3.0526315789473681 is about best 41 | c_opt = 3.0526 42 | 43 | def rayleigh(u): 44 | return 2*u*np.exp(-u**2) 45 | 46 | 47 | def target(u): 48 | return ( u > Q)*rayleigh(u)/rayleigh(u-c_opt) 49 | sampling_dist = stats.rayleigh( loc = c_opt, scale=1./np.sqrt(2)) 50 | mci = q2.MCIntegrator( target, interval =interval, b_antithetic = False, sampling_dist = sampling_dist, N=100000, verbose= False ) 51 | print mci.estimate() 52 | # estimate: 1.13859704116e-06 53 | 54 | 55 | -------------------------------------------------------------------------------- /MonteCarlo/Integration/examples.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import scipy.stats as stats 4 | import time 5 | 6 | from MonteCarloIntegrator import * 7 | 8 | """ 9 | Lets estimate the integral 10 | 11 | I = \int_0^2 exp(-x**2) dx 12 | = E_u[ exp(-x**2)/2 ] where u ~ Uni(0,2) 13 | 14 | 15 | """ 16 | 17 | 18 | def target(u): 19 | return np.exp(-u**2)*2 20 | 21 | mci = MCIntegrator( target, interval =(0,2), b_antithetic = False, sampling_dist = stats.uniform(0,2), verbose= True ) 22 | N = 1e6 23 | 24 | start = time.clock() 25 | print "Using %d samples,"%N 26 | print "Non-antithetic: %.5f."%mci.estimate_N(N ) 27 | print "Duration: %.3f s."%(time.clock() - start) 28 | 29 | #using anti-thetic 30 | 31 | mci = MCIntegrator( target, interval =(0,2), b_antithetic = True, sampling_dist = stats.uniform(0,2), verbose= True ) 32 | start = time.clock() 33 | print "Antithetic: %.5f."%mci.estimate_N(N ) 34 | print "Duration: %.3f s."%(time.clock() - start) 35 | 36 | 37 | 38 | #Using importance sampling 39 | 40 | def importance_function(u): 41 | return (-.5*u + 1)*2 42 | 43 | 44 | class Importance(object): 45 | def __init__(self): 46 | pass 47 | 48 | def rvs(self,n): 49 | u = stats.uniform(0,1).rvs( n) 50 | return 2*( 1 - np.sqrt(u) ) 51 | 52 | sampling_dist = Importance() 53 | mci = MCIntegrator( target, interval = (0,2), b_antithetic = False, sampling_dist = sampling_dist, N=N, verbose= True ) 54 | print mci.estimate() 55 | 56 | 57 | #using control variates 58 | 59 | def polynomial_control( u ): 60 | return -.26*( (1-u**2) - -1.0/3) 61 | 62 | mci = MCIntegrator( target, interval =(0,2), sampling_dist = stats.uniform(0,2), verbose= True, N=N, control_variates=[polynomial_control] ) 63 | start = time.clock() 64 | print "Control Variates: %.5f."%mci.estimate_N(N ) 65 | print "Duration: %.3f s."%(time.clock() - start) 
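#A further illustrative check, using only the definitions above: the integral
#    I = \int_0^1 sin(x) dx = 1 - cos(1) ~ 0.45970
#has a known closed form, and sin is monotone on (0,1), so the antithetic
#estimator is valid here as well.

def target_sin(u):
    return np.sin(u)

mci = MCIntegrator( target_sin, interval =(0,1), b_antithetic = False, sampling_dist = stats.uniform(0,1), verbose= False )
print "Plain MC estimate of 1-cos(1): %.5f."%mci.estimate_N(N)

mci = MCIntegrator( target_sin, interval =(0,1), b_antithetic = True, sampling_dist = stats.uniform(0,1), verbose= False )
print "Antithetic estimate of 1-cos(1): %.5f."%mci.estimate_N(N)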
-------------------------------------------------------------------------------- /MonteCarlo/MCMC/copulas.py: -------------------------------------------------------------------------------- 1 | """Some copulas and helpers for copulas""" 2 | 3 | from __future__ import division 4 | import numpy as np 5 | import scipy.stats as stats 6 | import scipy as sp 7 | 8 | 9 | def gumbel(t, theta = 1): 10 | #theta in (0, \infty) 11 | return np.exp( -t**(1./theta) ) 12 | 13 | def inv_gumbel( t, theta=1): 14 | return (-np.log(t) )**theta 15 | 16 | 17 | def clayton(t, theta=1): 18 | return (1+theta*t)**(-1./theta) 19 | 20 | def inv_clayton( t, theta =1): 21 | return 1.0/theta*( t**(-theta) - 1) 22 | 23 | 24 | 25 | def arch_copula(u, f= gumbel, f_inv = inv_gumbel, theta = 1 ): 26 | """ 27 | #u is a numpy array 28 | """ 29 | 30 | if ( (u > 1).sum() + (u <0).sum() )>0: 31 | return 0 32 | 33 | return f( f_inv( u, theta ).sum(), theta ) 34 | 35 | 36 | def _pdf(f, u, delta = 0.001 ): 37 | n = u.shape[0] 38 | if n==1: 39 | t= f(u[0]+delta/2) - f(u[0]-delta/2) 40 | return t 41 | else: 42 | f_plus = lambda *x: f( u[0] + delta/2, *x) 43 | f_minus = lambda *x: f( u[0] - delta/2, *x) 44 | return _pdf(f_plus, u[1:], delta ) - _pdf(f_minus, u[1:], delta ) 45 | 46 | def cdf2pdf( f, u, delta=0.001, kwargs={} ): 47 | """numerically unstable for large dimensions""" 48 | def _wrapper(*args): 49 | u = np.array(args) 50 | return f(u, **kwargs) 51 | n = u.shape[0] 52 | return _pdf( _wrapper, u, delta)/delta**n 53 | 54 | 55 | 56 | class Copula_Proposal( object ): 57 | def __init__(self): 58 | self.norm = stats.norm 59 | 60 | def rvs(self, loc, scale, size=1): 61 | return self.norm.rvs( loc = loc, scale= scale, size = size) 62 | 63 | def pdf( self, x, given, scale = 1.0): 64 | """ 65 | http://darrenjw.wordpress.com/2012/06/04/metropolis-hastings-mcmc-when-the-proposal-and-target-have-differing-support/ 66 | """ 67 | return self.norm.cdf( x/scale ).prod() 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /MonteCarlo/MCMC/mcmc.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import division 3 | 4 | """ 5 | 6 | I'll begin with the MCMC object. 7 | It is a very general instance of a MCMC. It uses a Gaussian random walk to propose the next step. 8 | The first issue is whether to accept or reject instances that fall outside the unit cube (as copulas 9 | are only defined here), or more generally, fall out of the support of the target distribution. We bias 10 | the results if we use the acceptance ratio target(x')/target(x_n). This is because by immediatly rejecting 11 | results that are outside the support, we are using a truncated proposal distribution, and this is not 12 | symmetric. Thus in the below code, I use the ratio target(x')/target(x_n) * norm_cdf( x_n)/norm_cdf(x'). See 13 | http://darrenjw.wordpress.com/2012/06/04/metropolis-hastings-mcmc-when-the-proposal-and-target-have-differing-support/ 14 | for a full, and great, explaination. 15 | I have dynamic step size that targets a certain acceptance rate (if too many acceptances, likely 16 | not exploring the space very well, vs. too few acceptances mean likely stepping too far. See documentation in the code). 
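A typical use (mirroring mcmc_example.py) is to wrap any non-negative, possibly
unnormalised, density and then draw from it:

    mcmc = MCMC( target_pdf, dim = 2, x_0 = np.array([0.5, 0.5]) )
    samples = mcmc.rvs(1000)

where target_pdf is a placeholder for a function taking a length-2 numpy array
and returning a non-negative number.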
17 | 18 | 19 | 20 | 21 | """ 22 | 23 | import pdb 24 | import numpy as np 25 | import scipy.stats as stats 26 | import matplotlib.pyplot as plt 27 | import scipy as sp 28 | 29 | 30 | # Need a way to sample from copula 31 | # Do this using MCMC 32 | 33 | 34 | class Normal_proposal( object ): 35 | 36 | def __init__(self, ): 37 | self.norm = stats.norm 38 | 39 | def rvs(self, loc = 0, scale = 1, size =1): 40 | return self.norm.rvs( loc = loc, scale = scale, size = size ) 41 | 42 | def pdf( self, x, given, scale= 1 ): 43 | return self.norm.pdf( x-given, scale = 1).prod() #assumes independent 44 | 45 | 46 | 47 | 48 | class MCMC(object): 49 | """ 50 | Implementation of the Metropolis-Hasting algo. 51 | params: 52 | target_dist: the target_distribution, what accept a d-dim vector. 53 | proposal_dist: the proposal dist, an object with the following methods: 54 | .pdf(x, y, scale): the pdf of scale*X | y, should accept a vector 55 | .rvs(loc, scale, size) #todo 56 | 57 | x_0: a starting location 58 | burn_in: the number of burn in steps 59 | dim: the dimension of the densities. 60 | init_scale: the initial scale to start at. The algorithm uses a simple 61 | dynamic scale to target a certain acceptance ratio. 62 | 63 | methods: 64 | next() : generates and returns a random variate from the target_dist 65 | 66 | 67 | """ 68 | def __init__(self, target_dist, 69 | dim = 1, 70 | x_0 = None, 71 | burn_in = 300, 72 | init_scale = 1, 73 | proposal_dist = Normal_proposal(), 74 | verbose = True): 75 | self.target_dist = target_dist 76 | self.x = x_0 77 | self.burn_in = burn_in 78 | self.dim = dim 79 | self.uniform = stats.uniform() 80 | #self.std = 1 81 | self.proposals = 0 82 | self.accepted = 0 83 | self.proposal_dist = proposal_dist 84 | self.verbose = verbose 85 | self.std = init_scale 86 | self.array_std = self.std*np.ones(1) 87 | if x_0 == None: 88 | #initialize array 89 | self.x = np.zeros(dim) 90 | self._burn() 91 | 92 | def _normcdf(self, x_array): 93 | return proposal_dist.cdf( x_array).prod() 94 | 95 | def _modify_step(self): 96 | #lets check our acceptance rate, and aim for .234, see http://www.maths.lancs.ac.uk/~sherlocc/Publications/rwm.final.pdf 97 | opt_rate = .234 98 | epsilon = 0.05 99 | rate = self.accepted/self.proposals 100 | if rate > opt_rate + epsilon: #too many acceptance, spread out more 101 | self.std *= 1.001 102 | elif rate < opt_rate - epsilon: 103 | self.std /= 1.001 104 | 105 | self.array_std = np.append( self.array_std, self.std) 106 | return 107 | 108 | def rvs(self, n=1): 109 | #generate a new sample 110 | #An interesting bug: http://darrenjw.wordpress.com/2012/06/04/metropolis-hastings-mcmc-when-the-proposal-and-target-have-differing-support/ 111 | 112 | observations = np.empty( (n,self.dim) ) 113 | for i in range(n): 114 | accept = False 115 | tally = 0 116 | #lets keep a running tally of our acceptance rate. 
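            # The acceptance ratio computed below is the full Metropolis-Hastings ratio
            #     a = target(x') * q(x | x') / ( target(x) * q(x' | x) ),
            # not the plain target(x')/target(x), so that proposals with truncated or
            # asymmetric support do not bias the chain (see the module docstring).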
117 | while not accept: 118 | self.proposals += 1 119 | #x_new = self.x + self.std*np.random.multivariate_normal(np.zeros(self.dim), np.eye(self.dim)) 120 | x_new = self.proposal_dist.rvs(self.x, scale = self.std, size = self.dim) #this is 121 | #a = self.target_dist( x_new )/ self.target_dist( self.x) #we use the correct acceptance ratio: 122 | #a = self.target_dist( x_new)*self._normcdf( self.x)/ ( self.target_dist( self.x )*self._normcdf( x_new ) ) 123 | a = self.target_dist(x_new)*self.proposal_dist.pdf(self.x, x_new)/( self.target_dist(self.x)*self.proposal_dist.pdf( x_new, self.x) ) 124 | #print a 125 | #pdb.set_trace() 126 | if (a>=1) or ( self.uniform.rvs() < a ): 127 | accept = True 128 | self.x = x_new 129 | self.accepted +=1 130 | tally+=1 131 | if tally%150==0: 132 | print "hmm...I'm not mixing well. I've rejected 150+ samples. Try a restart? Currently at ", self.x 133 | observations[i] = self.x 134 | return observations 135 | 136 | def _burn(self): 137 | if self.verbose: 138 | print "Burn, Baby, burn. %d times."%self.burn_in 139 | for i in xrange(self.burn_in): 140 | self.rvs() 141 | self._modify_step() 142 | 143 | if self.verbose: 144 | print "Burn-in complete. Use next() to call new observations." 145 | 146 | -------------------------------------------------------------------------------- /MonteCarlo/MCMC/mcmc_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Use MCMC to sample from some copulas 3 | 4 | 5 | Given a copula, we need to find its pdf. I chose, to establish arbitrary dimensional copulas, to do 6 | this numerically. I needed to compute the copula differentiated with respect to all of its arguemnts. This 7 | was quite the algorithmic challenge, but I reduced it to a recursive problem that works blazingly fast. This 8 | felxibility allows us to never have to explicitly find the pdf, which can be difficult even for dimension > 2. 9 | The differentiation algorithm uses a central difference scheme. Unfortunatly the scheme is unstable for dimensions 10 | greater than 6. 
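In two dimensions the recursion reduces to the usual mixed central difference,
    c(u, v) ~ [ C(u+d/2, v+d/2) - C(u-d/2, v+d/2) - C(u+d/2, v-d/2) + C(u-d/2, v-d/2) ] / d**2,
which is exactly what cdf2pdf computes, and the same pattern generalises to
higher dimensions.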
11 | 12 | """ 13 | import numpy as np 14 | import scipy.stats as stats 15 | import matplotlib.pyplot as plt 16 | import scipy as sp 17 | 18 | from mcmc import * 19 | from copulas import * 20 | 21 | 22 | 23 | 24 | mcmc1 = MCMC( lambda u: cdf2pdf( arch_copula, u) , dim = 2, x_0 = np.array( [0.5, 0.5] ) ) 25 | mcmc3 = MCMC( lambda u: cdf2pdf( arch_copula, u, kwargs={"theta":3}) , dim = 2, x_0 = np.array( [0.5, 0.5] ) ) 26 | 27 | N = 1000 28 | sampleTheta1 = mcmc1.rvs( N ) 29 | sampleTheta3 = mcmc3.rvs( N ) 30 | 31 | plt.figure() 32 | 33 | plt.subplot(221) 34 | plt.scatter( sampleTheta1[:,0], sampleTheta1[:,1], alpha = 0.5) 35 | plt.title("1000 values from a Gumbel \n copula with %s=1"%r"$\theta$") 36 | 37 | plt.subplot(222) 38 | plt.scatter( sampleTheta3[:,0], sampleTheta3[:,1], alpha = 0.5 ) 39 | plt.title("1000 values from a Gumbel \n copula with %s=3"%r"$\theta$") 40 | 41 | 42 | 43 | #lets make the exponential 44 | def make_exp( u ): 45 | return -np.log(u/3)*3 46 | 47 | plt.subplot(223) 48 | plt.scatter( make_exp( sampleTheta1[:,0]) , make_exp( sampleTheta1[:,1] ), alpha = 0.5 ) 49 | plt.title("1000 EXP(3) values from a Gumbel \n copula with %s=1"%r"$\theta$") 50 | 51 | 52 | plt.subplot(224) 53 | plt.scatter( make_exp( sampleTheta3[:,0]) , make_exp( sampleTheta3[:,1] ), alpha = 0.5 ) 54 | plt.title("1000 EXP(3) values from a Gumbel \n copula with %s=3"%r"$\theta$") 55 | 56 | plt.show() 57 | 58 | 59 | mcmc1 = MCMC( lambda u: cdf2pdf( arch_copula, u, kwargs={"f":clayton, "f_inv":inv_clayton} ) , dim = 2, x_0 = np.array( [0.5, 0.5] ) ) 60 | mcmc3 = MCMC( lambda u: cdf2pdf( arch_copula, u, kwargs={"theta":5, "f":clayton, "f_inv":inv_clayton}) , dim = 2, x_0 = np.array( [0.5, 0.5] ) ) 61 | 62 | 63 | dataTheta1 = mcmc1.rvs( N ) 64 | 65 | dataTheta3 = mcmc3.rvs( N ) 66 | 67 | plt.figure() 68 | 69 | plt.subplot(221) 70 | plt.scatter( dataTheta1[:,0], dataTheta1[:,1], alpha = 0.5 ) 71 | plt.title("1000 values from a Clayton \n copula with %s=1"%r"$\theta$") 72 | 73 | plt.subplot(222) 74 | plt.scatter( dataTheta3[:,0], dataTheta3[:,1], alpha = 0.5 ) 75 | plt.title("1000 values from a Clayton \n copula with %s=5"%r"$\theta$") 76 | 77 | 78 | 79 | #lets make the exponential 80 | def make_exp( u ): 81 | return -np.log(u) 82 | 83 | plt.subplot(223) 84 | plt.scatter( make_exp( dataTheta1[:,0]) , make_exp( dataTheta1[:,1] ), alpha = 0.5 ) 85 | plt.title("1000 EXP(1) values from a Clayton\n copula with %s=1"%r"$\theta$") 86 | 87 | 88 | plt.subplot(224) 89 | plt.scatter( make_exp( dataTheta3[:,0]) , make_exp( dataTheta3[:,1] ), alpha = 0.5 ) 90 | plt.title("1000 EXP(1) values from a Clayton\n copula with %s=5"%r"$\theta$") 91 | 92 | plt.show() -------------------------------------------------------------------------------- /MonteCarlo/grammschmidt.py: -------------------------------------------------------------------------------- 1 | # author iizukak, 2011 2 | # author cam davidson-pilon, 2012 3 | 4 | import numpy as np 5 | import pdb 6 | def gs_cofficient(v1, v2): 7 | return np.dot(v2, v1) / np.dot(v1, v1) 8 | 9 | def multiply(cofficient, v): 10 | return map((lambda x : x * cofficient), v) 11 | 12 | def proj(v1, v2): 13 | return multiply(gs_cofficient(v1, v2) , v1) 14 | 15 | def gs(X): 16 | """ 17 | performs the Gramm-Shmidt process to orthonormalize a a matrix of vectors. 18 | X: vectors to orthonormalize are rows. 19 | Returns Y, same shape as X, and with orthonormal rows. 
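    Example (illustrative):
        X = np.random.randn(5, 5)
        Y = gs(X)
        # for a full-rank X, the rows of Y are orthonormal, so Y.dot(Y.T) is
        # numerically the identity matrix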
20 | """ 21 | Y = np.zeros_like(X) 22 | for i in range(len(X)): 23 | temp_vec = X[i] 24 | for j in range(i) : 25 | proj_vec = proj(Y[j,:], X[i]) 26 | temp_vec = temp_vec - proj_vec 27 | Y[i,:] = temp_vec/np.sqrt( np.dot( temp_vec,temp_vec ) ) 28 | return Y 29 | 30 | 31 | -------------------------------------------------------------------------------- /MonteCarlo/sample_normal_given_projection.py: -------------------------------------------------------------------------------- 1 | """ 2 | This function generates samples N from N( mu, Simga ) such that N'*nu = x ie. samples 3 | N | N*nu = x (which is still normal btw). 4 | 5 | Note that actually mu is useless. 6 | 7 | """ 8 | import numpy as np 9 | def sample_normal_given_projection( covariance, x, lin_proj, n_samples=1): 10 | """ 11 | parameters: 12 | x: the value s.t. lin_proj*N = x; scalar 13 | lin_proj: the vector to project the sample unto (n,) 14 | covariance: the covariance matrix of the unconditional samples (nxn) 15 | n_samples: the number of samples to return 16 | 17 | returns: 18 | ( n x n_samples ) numpy array 19 | 20 | """ 21 | variance = np.dot( np.dot( lin_proj.T, covariance), lin_proj ) 22 | 23 | #normalize our variables s.t. lin_proj*N is N(0,1) 24 | 25 | sigma_lin = np.dot(covariance, lin_proj[:,None]) 26 | cond_mu = ( sigma_lin.T*x/variance ).flatten() 27 | cond_covar = covariance - np.dot( sigma_lin, sigma_lin.T )/ variance 28 | 29 | _samples = np.random.multivariate_normal( cond_mu, cond_covar, size = (n_samples) ) 30 | return ( _samples ) 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /MonteCarlo/sample_psd.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from grammschmidt import gs 3 | import warnings 4 | import scipy.stats as stats 5 | 6 | def sample_pd_matrix( dim, avg_variance = 1, diag = np.array([]) ): 7 | """ 8 | avg_variance = the average variance, scalar. 9 | dim: the dimension of the sampled covariance matrix 10 | diag: enter a dim-dimensional vector to use as the diagonal eigenvalues elements. 11 | """ 12 | 13 | #create an orthonormal basis 14 | Ob = gs(np.random.randn( dim,dim ) ) 15 | if not diag.any(): 16 | """ 17 | This uses the fact that the sum of varinaces/n == Trace(A)/n == sum of eigenvalues/n ~= E[ Gamma(1, 1/avg_variance) ] = avg_variance 18 | """ 19 | diag = stats.gamma.rvs( 1, scale = avg_variance, size = ( (dim,1) ) ) 20 | else: 21 | diag = diag.reshape( (dim,1) ) 22 | return np.dot( Ob.T*diag.T, Ob ) 23 | 24 | 25 | def return_lower_elements(A): 26 | n = A.shape[0] 27 | t = [ (i,j) for j in range(0,n) for i in range(j+1,n) ] 28 | return np.array( [A[x] for x in t] ) 29 | 30 | 31 | def deprecated(func): 32 | '''This is a decorator which can be used to mark functions 33 | as deprecated. It will result in a warning being emitted 34 | when the function is used.''' 35 | def new_func(*args, **kwargs): 36 | warnings.warn("Call to deprecated function {}.".format(func.__name__), 37 | category=DeprecationWarning) 38 | return func(*args, **kwargs) 39 | new_func.__name__ = func.__name__ 40 | new_func.__doc__ = func.__doc__ 41 | new_func.__dict__.update(func.__dict__) 42 | return new_func 43 | 44 | @deprecated 45 | def generate_pd_matrix( dim, avg_covariance=0, avg_variance = 0, diag=np.array([]) ): 46 | """ 47 | Currently unstable for dim > 25. I would not use. 48 | 49 | 50 | This uses Sylvester's criterion to create n-dim covariance (PSD) matrices. 
51 | To make correlation matrices, specify the diag parameters to be an array of all ones. 52 | parameters: 53 | avg_covariance: is added to a Normal(0,1) observation for each covariance. 54 | So, the sample mean of all covariances should be avg_covariance. 55 | dim: the dimension of the sampled covariance matrix 56 | diag: enter a dim-dimensional vector to use as the diagonal elements. 57 | 58 | """ 59 | invA = None 60 | M = np.zeros( (dim,dim) ) 61 | for i in xrange( 0, dim ): 62 | A = M[:i,:i] 63 | b_flag = False 64 | while not b_flag: 65 | #generate a variance and covariance array 66 | variance = diag[i] if diag.any() else avg_variance + np.abs( np.random.randn(1) ) 67 | covariance = (avg_covariance + np.random.randn(i)) #for stability 68 | #pdb.set_trace() 69 | #Using Danny's algorithm 70 | if i > 0: 71 | c = variance*np.random.rand(1) # > 0, < variance 72 | _lambda = np.dot( np.dot( covariance[:,None].T, invA), covariance[:,None] )[0] +1 73 | print _lambda 74 | covariance = (np.sqrt(c)/np.sqrt(_lambda))*covariance.T 75 | 76 | 77 | #check if det > 0 of matrix | A cov | 78 | # | cov var | 79 | 80 | 81 | if i==0 or _lambda > 0: 82 | b_flag = True 83 | M[i, :i] = covariance 84 | M[:i, i] = covariance 85 | M[i,i] = variance 86 | 87 | if i > 0: 88 | invA = invert_block_matrix_CASE1( A , covariance, variance, invA) 89 | #invA = np.linalg.inv( M[:i+1,:i+1]) 90 | else: 91 | invA = 1.0/M[i,i] 92 | 93 | return M 94 | 95 | 96 | 97 | def invert_block_matrix_CASE1( A, b, c, invA = None): 98 | """ 99 | Inverts the matrix | A b | 100 | | b' c | 101 | 102 | where A is (n,n), b is (n,) and c is a scalar 103 | P,lus if you know A inverse already, add it to make computations easier. 104 | This is quicker for larger matrices. How large? 105 | 106 | """ 107 | 108 | n = A.shape[0] 109 | if n == 1 and A[0] != 0: 110 | invA = 1.0/A 111 | if b.shape[0] == 0: 112 | return 1.0/A 113 | 114 | if invA == None: 115 | invA = np.linalg.inv(A) 116 | 117 | inverse = np.zeros( (n+1, n+1) ) 118 | k = c - np.dot( np.dot( b, invA), b ) 119 | 120 | inverse[ :n, :n] = invA + np.dot( np.dot( invA, b[:,None]), np.dot( b[:,None].T, invA) )/k 121 | inverse[n, n] = 1/k 122 | inverse[:n, n] = -np.dot(invA,b)/k 123 | inverse[n, :n] = -np.dot(invA,b)/k 124 | return inverse 125 | 126 | 127 | -------------------------------------------------------------------------------- /MonteCarlo/sampling_methods.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | """ 4 | I simulated 10000 variables with CDF F(y) = 1/3(y**5 + y**2 + y) using a acceptance rejection scheme and the inversion method. While both 5 | where fast, AR was much faster than the inversion method, even using a poor sampling scheme. The sampler I used for the AR method 6 | a M*Uniform distribution, where M = max_y 1/3( 5*y**4 + 2*y + 1). This bounded the pdf of Y. My output is below: 7 | 8 | >> Testing AR Method. 9 | >> Generate 10000 variables: 10 | >> Mean: 0.669, time taken: 0.13 seconds 11 | 12 | >> Testing Inverse Method. 13 | >> Generate 10000 variables: 14 | >> Mean: 0.669, time taken: 32.76 seconds 15 | 16 | """ 17 | 18 | 19 | 20 | 21 | import numpy as np 22 | import time 23 | import scipy.stats as stats 24 | from scipy.optimize import fsolve 25 | 26 | class AR_method(object): 27 | def __init__(self, target_f, sample_g, M): 28 | """ 29 | M: the constant s.t. sample_g*M >= target_f for all x 30 | sample_g: a scipy.stats frozen random variable. 
31 | target_f: a 1-d integrable, positive function 32 | """ 33 | self.target_f = target_f 34 | self.sample_g = sample_g 35 | self.uniform = stats.uniform 36 | self.M = M 37 | 38 | def generate(self,n=1): 39 | 40 | rv = np.zeros( n) 41 | i=0 42 | #recursivly call this. 43 | while i0 92 | # TODO 93 | sample = np.empty( (1, self.len_trials) ) 94 | for i,k in enumerate(K): 95 | substr = self._sample_conditional( X[i] ) 96 | pass 97 | 98 | def sample_conditional(self, k, x, negate=False): 99 | #Sample the process, but at position k, put x (or put NOT x). 100 | sample = np.empty( (1, self.len_trials) ) 101 | negate = int(negate) #0 or 1 102 | sample[0,0] = np.argmax( np.random.multinomial( 1, self.init_probs_estimate ) ) 103 | for i in range(1, k + negate): 104 | A = np.linalg.matrix_power( self.trans_probs_estimate, k-i ) 105 | if not negate: 106 | p = self.trans_probs_estimate[ sample[0,i-1], :]*A[:, x ] 107 | else: 108 | p = self.trans_probs_estimate[ sample[0,i-1], :]*(1-A[:, x ]) 109 | 110 | p = self._normalize(p) 111 | sample[0, i] = np.argmax( np.random.multinomial( 1, p ) ) 112 | 113 | if not negate: 114 | sample[0, k] = x 115 | 116 | 117 | for i in range(k+ 1, self.len_trials): 118 | sample[0, i] = np.argmax(np.random.multinomial( 1, self.trans_probs_estimate[ sample[0,i-1],: ] ) ) 119 | return sample 120 | 121 | 122 | 123 | def _fit_init(self,data, encoded): 124 | 125 | if not encoded: 126 | if not self.encoding: 127 | self.encoding = encoding.EncodingScheme() 128 | data = self.encoding.encode(data) 129 | 130 | 131 | self.number_of_series = 0 132 | self.data = data 133 | self.unique_elements = np.arange( len( self.encoding.unique_bins) )[None, :] 134 | self.len_trials = self.encoding.series_length 135 | 136 | #self.n_trials, self.len_trials = data #iterators do not have a defined shape. This might have to be done on the fly. 
137 | self.init_probs_estimate = np.zeros( self.unique_elements.shape[1], dtype="int" ) 138 | self.trans_probs_estimate = np.zeros( (self.unique_elements.shape[1], self.unique_elements.shape[1]), dtype="int" ) 139 | 140 | 141 | 142 | 143 | 144 | -------------------------------------------------------------------------------- /NumericalDerivatives/diff.py: -------------------------------------------------------------------------------- 1 | #numerical high-dim derivatives 2 | import numpy as np 3 | from decimal import Decimal 4 | import decimal 5 | 6 | 7 | class memorize(object): 8 | def __init__(self, func): 9 | self.func = func 10 | self.cache = {} 11 | 12 | def __call__(self, *args): 13 | u = args[1] 14 | print u 15 | ustr = u.tostring() 16 | try: 17 | return self.cache[ustr] 18 | except: 19 | self.cache[ustr] = self.func(*args) 20 | return self.cache[ustr] 21 | 22 | 23 | def __repr__(self): 24 | return self.func.__doc__ 25 | 26 | def _pdf(f, u, delta = 0.001 ): 27 | n = u.shape[0] 28 | if n==1: 29 | t= f(u[0]+delta/2) - f(u[0]-delta/2) 30 | return t 31 | else: 32 | f_plus = lambda *x: f( u[0] + delta/2, *x) 33 | f_minus = lambda *x: f( u[0] - delta/2, *x) 34 | return _pdf(f_plus, u[1:], delta ) - _pdf(f_minus, u[1:], delta ) 35 | 36 | 37 | def _pdfOrder4(f, u, delta = 0.001 ): 38 | n = u.shape[0] 39 | if n==1: 40 | t= ( f(u[0]+delta/2) )- ( f(u[0]-delta/2) ) 41 | return t 42 | else: 43 | f_plus1 = lambda *x: f( u[0] + delta/2, *x) 44 | f_plus2 = lambda *x: f( u[0] + delta, *x) 45 | f_minus1 = lambda *x: f( u[0] - delta/2, *x) 46 | f_minus2 = lambda *x: f( u[0] - delta, *x) 47 | p = -_pdfOrder4(f_plus2, u[1:], delta ) + 8*_pdfOrder4(f_plus1, u[1:], delta) \ 48 | - 8*n(f_minus1, u[1:], delta ) + _pdfOrder4(f_minus2, u[1:], delta )/6 49 | return p 50 | 51 | def cdf2pdf( f, u, delta=0.001, kwargs={} ): 52 | """numerically unstable for large dimensions""" 53 | def _wrapper(*args): 54 | u = np.array(args) 55 | return f(u, **kwargs) 56 | n = u.shape[0] 57 | p= _pdf( _wrapper, u, delta) 58 | return np.exp( np.log(p) - n*np.log( delta ) ) 59 | #return p / delta**n 60 | 61 | -------------------------------------------------------------------------------- /NumericalDerivatives/diff.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CamDavidsonPilon/Python-Numerics/043ab4ad9003325c6270486b24d163933e0c7e8a/NumericalDerivatives/diff.pyc -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | A collection of numerical python recipes 2 | ======================================== 3 | 4 | ###Damerau-Lenenshtein Distance 5 | An implementation of the DL distance, used to measure the "distance" (as defined by number of deletions, substitutions, transpositions and insertions) between two strings. I use it to detect swear words and their misspellings. 6 | 7 | 8 | ###Discrete Option Pricing 9 | Contains functions and classes to compute financial derivatives using discrete pricing theory. Mostly recursion. 10 | 11 | ###Discrete SDE 12 | Robust classes/methods to simulate stochastic differential equations using a discretization scheme. Includes Euler, Milstein and Second-Order scheme. To be implemented into PyProcess. 13 | 14 | ###Estimators 15 | Some useful estimators of regression and others. 16 | 17 | ###Machine Learning Scikit Learn 18 | Some scikit-learn-friendly machine learning classes. 
19 | 20 | ###Monte Carlo 21 | A collection of tools to sample from a variety of distributions and evaluating integrals. Include bivariate copula sampling, markov chain monte carlo, and numerical integration (with variance reduction support). 22 | 23 | ###Multinomial Markov And Encoding 24 | Create a multinomial markov chain (plus some awesome sampling and conditional sampling algos) from encoded data. See my [password analysis](http://www.camdp.com/blogs/modeling-password-creation) for a use and creation of it. 25 | 26 | ###Numerical Derivatives 27 | Compute the derivative of functions a points using discrete schemes. Has a great recursive solution to solving problem: 28 | >> Given a multivariate CDF, how can I computationally, and efficiently, find its pdf? 29 | 30 | This problem occurs in copula sampling often. 31 | 32 | 33 | ###Time Series 34 | Some time series helpers and utilities 35 | 36 | ###utils 37 | Some nice utils to have around. 38 | 39 | 40 | 41 | 42 | Author: 43 | Cameron Davidson-Pilon 44 | camdp.com 45 | 46 | Contact me at: 47 | cam.davidson.pilon@gmail.com 48 | @cmrndp 49 | -------------------------------------------------------------------------------- /TimeSeries/MASE.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | 5 | def MASE(training_series, testing_series, prediction_series): 6 | """ 7 | Computes the MEAN-ABSOLUTE SCALED ERROR forcast error for univariate time series prediction. 8 | 9 | See "Another look at measures of forecast accuracy", Rob J Hyndman 10 | 11 | parameters: 12 | training_series: the series used to train the model, 1d numpy array 13 | testing_series: the test series to predict, 1d numpy array or float 14 | prediction_series: the prediction of testing_series, 1d numpy array (same size as testing_series) or float 15 | absolute: "squares" to use sum of squares and root the result, "absolute" to use absolute values. 16 | 17 | """ 18 | print "Needs to be tested." 
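    # d below is the in-sample mean absolute error of the naive one-step forecast
    # on the training series; the test errors are scaled by it, so MASE < 1 means
    # the forecast beats the naive method on average.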
19 | n = training_series.shape[0] 20 | d = np.abs( np.diff( training_series) ).sum()/(n-1) 21 | 22 | errors = np.abs(testing_series - prediction_series ) 23 | return errors.mean()/d -------------------------------------------------------------------------------- /TimeSeries/risk_measures.py: -------------------------------------------------------------------------------- 1 | #risk measures 2 | 3 | import scipy.stats as stats 4 | from scipy.optimize import fsolve 5 | import numpy as np 6 | 7 | 8 | 9 | def VaR(ts, alpha, flavour): 10 | if flavour == "historical": 11 | temp_ts = ts.copy() 12 | temp_ts.sort() 13 | n = len( temp_ts) 14 | try: 15 | return -temp_ts.values[ np.floor( (1-alpha)*n ) ] 16 | except: 17 | return -temp_ts[ np.floor( (1-alpha)*n ) ] 18 | 19 | elif flavour == "t": 20 | t = stats.t 21 | t = stats.t( *t.fit( ts ) ) 22 | return -t.ppf( 1-alpha ) 23 | 24 | elif flavour == "normal": 25 | mean = ts.mean() 26 | std = ts.std() 27 | return -stats.norm.ppf( 1-alpha, mean, std ) 28 | elif flavour == "Cornish-Fischer": 29 | z_c = -stats.norm.ppf( 1-alpha, 0 ,1) 30 | S = stats.skew(ts) 31 | K = stats.kurtosis(ts) 32 | z_cf = z_c + (z_c**2-1)*S/6 + (z_c**3- 3*z_c)*K/24 + (2*z_c**3-5*z_c)*S**2/36 33 | return ts.mean() - z_cf*np.sqrt( ts.std() ) 34 | 35 | elif flavour == "kernel": 36 | kde = stats.gaussian_kde( ts ) 37 | print kde.factor 38 | 39 | f = lambda x: kde.integrate_box_1d(-1, x) - (1-alpha) 40 | return -fsolve( f, -0.05)[0] 41 | 42 | 43 | 44 | def ES( ts ,alpha, flavour="historical"): 45 | var = VaR( ts, alpha, flavour) 46 | n_simulations = 200000 47 | if flavour=="historical": 48 | return -ts[( ts < -var )].mean() 49 | 50 | elif flavour == "normal": 51 | mean = ts.mean() 52 | std = ts.std() 53 | norm = stats.norm( mean, std ) 54 | samples = -norm.rvs( n_simulations ) 55 | 56 | return samples[ var <= samples ].mean() 57 | 58 | elif flavour == "t": 59 | t = stats.t 60 | t = stats.t( *t.fit( ts ) ) 61 | samples = -t.rvs( n_simulations ) 62 | return samples[var <=samples ].mean() 63 | 64 | elif flavour == "kernel": 65 | kde = stats.gaussian_kde(ts) 66 | samples = -kde.resample(n_simulations) 67 | return samples[ var<= samples].mean() -------------------------------------------------------------------------------- /TimeSeries/utils.py: -------------------------------------------------------------------------------- 1 | #time series utils 2 | 3 | 4 | 5 | def MASE(training_series, testing_series, prediction_series): 6 | """ 7 | Computes the MEAN-ABSOLUTE SCALED ERROR forcast error for univariate time series prediction. 8 | 9 | See "Another look at measures of forecast accuracy", Rob J Hyndman 10 | 11 | parameters: 12 | training_series: the series used to train the model, 1d numpy array 13 | testing_series: the test series to predict, 1d numpy array or float 14 | prediction_series: the prediction of testing_series, 1d numpy array (same size as testing_series) or float 15 | absolute: "squares" to use sum of squares and root the result, "absolute" to use absolute values. 16 | 17 | """ 18 | print "Needs to be tested." 19 | n = training_series.shape[0] 20 | d = np.abs( training_series.diff() ).sum()/(n-1) 21 | 22 | errors = np.abs(testing_series - prediction_series ) 23 | return errors.mean()/d -------------------------------------------------------------------------------- /pyMC/LinearRegressionWithLoss.py: -------------------------------------------------------------------------------- 1 | #Least squares with penalty on wrong sign. 
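#The script fits a Bayesian linear regression with pymc and, rather than reporting
#the posterior-mean (least-squares) prediction, chooses each prediction to minimize
#the expected value of the asymmetric loss defined below over posterior samples:
#the loss charges a quadratic penalty (scaled by coef) whenever the prediction and
#the true value have opposite signs, and plain absolute error otherwise.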
2 | 3 | import matplotlib 4 | matplotlib.use("Agg") 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import pymc as mc 8 | import scipy.optimize as sop 9 | 10 | def sign(x): 11 | return -1 if x<0 else 1 12 | 13 | def loss( y, yhat, coef = 100): 14 | """vectorized""" 15 | sol = np.zeros_like(y) 16 | ix = y*yhat < 0 17 | sol[ix] = coef*yhat**2 - sign(y[ix])*yhat + abs(y[ix]) 18 | sol[ ~ix ] = abs( y[~ix] - yhat ) 19 | return sol 20 | 21 | 22 | #generate some artifical data 23 | size = 250 24 | beta = 0.4 25 | alpha = 0.0 26 | 27 | X = np.random.randn( size ) 28 | Y = beta*X + alpha + np.random.randn( size ) 29 | 30 | 31 | 32 | # Form the bayesian analysis. 33 | prec = mc.Uniform( "prec", 0, 100 ) 34 | beta_0 = mc.Normal( "beta", 0, 0.0001 ) 35 | alpha_0 = mc.Normal( "alpha", 0, 0.0001 ) 36 | 37 | 38 | @mc.deterministic 39 | def mean( X = X, alpha_0 = alpha_0, beta_0 = beta_0 ): 40 | return alpha_0 + beta_0*X 41 | 42 | to_predict_x = np.linspace( -10, 10, 100) 43 | 44 | 45 | obs = mc.Normal( "obs", mean, prec, value = Y, observed = True) 46 | 47 | model = mc.Model( {"obs":obs, "beta_0":beta_0, "alpha_0":alpha_0, "prec":prec} ) 48 | mcmc = mc.MCMC( model ) 49 | 50 | n_samples = 100000 51 | burnin = 50000 52 | mcmc.sample( burnin + n_samples, burnin) 53 | mean_alpha_0 = mcmc.alpha_0.stats()["mean"] #correspondes to the least squares estimate 54 | mean_beta_0 = mcmc.beta_0.stats()["mean"] #correspondes to the least squares estimate 55 | ls_prediction = mean_alpha_0 + mean_beta_0*to_predict_x 56 | 57 | 58 | alpha_trace = mcmc.alpha_0.trace.gettrace() 59 | beta_trace = mcmc.beta_0.trace.gettrace() 60 | rprec = [1.0/np.sqrt(prec.random()) for i in range(n_samples ) ] 61 | norm_samples = rprec*np.random.randn(n_samples) 62 | 63 | 64 | v = np.zeros_like( to_predict_x) 65 | for i,x in enumerate(to_predict_x): 66 | post_samples = norm_samples + (alpha_trace + beta_trace*x) 67 | tomin = lambda yhat: loss( post_samples, yhat).mean() 68 | v[i] = sop.fmin( tomin, ls_prediction[i] ) 69 | 70 | print v 71 | 72 | #nice plots 73 | plt.figure() 74 | plt.plot( to_predict_x, ls_prediction, lw =2, label = "Least squares prediction", c="k" ) 75 | plt.plot( to_predict_x, v, lw = 2, label = "Bayesian Loss-optimized prediction", c= "r") 76 | plt.scatter( X, Y, alpha = 0.4 ) 77 | plt.legend() 78 | plt.title("Least squares predictions vs \n Bayesian Loss-optimized predictions") 79 | plt.xlim(-7, 7) 80 | plt.ylim(-5, 5) 81 | plt.savefig( "LossOptII.png" ) 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /pyMC/SmallSample.py: -------------------------------------------------------------------------------- 1 | import pymc as mc 2 | import numpy as np 3 | 4 | #data 5 | X = 5 6 | Y = 10 7 | 8 | #rate = mc.Exponential("rate", 1 ) #priors on N 9 | N = mc.Poisson( "N", 20, value = max(X,Y) ) 10 | #N = mc.Uninformative("N", value = max(X,Y) ) 11 | 12 | 13 | pX = mc.Beta("pX", 1,1) #uniform priors 14 | pY = mc.Beta("pY", 1,1 ) 15 | 16 | 17 | observed = mc.Binomial("obs", p = np.array( [pX, pY] ), n = N, value = np.array( [X,Y] ), observed = True ) 18 | 19 | 20 | -------------------------------------------------------------------------------- /pyMC/TableGame.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | #Alice vs Bob in table game 3 | 4 | import random 5 | max_simulations = 1e6 6 | 7 | simulation = 0 8 | wins_alice = 0 9 | wins_bob = 0 10 | 11 | 12 | while simulation < max_simulations: 13 
| #draw random p 14 | p = random.random() 15 | #draw eight trials 16 | Alice_wins = sum( [ random.random() < p for i in range(8) ] ) 17 | if Alice_wins == 5: 18 | simulation += 1 19 | #This is case of 5vs3 in 8th round, lets check who wins by drawing three more 20 | if any( [ random.random() < p for i in range(3) ] ): 21 | wins_alice +=1 22 | else: 23 | wins_bob +=1 24 | 25 | 26 | print "Proportion of Alice wins: %.3f."%( wins_alice/max_simulations ) -------------------------------------------------------------------------------- /pyMC/blowflies.py: -------------------------------------------------------------------------------- 1 | """ 2 | See 3 | 4 | BAYESIAN INFERENCE AND MARKOV CHAIN MONTE CARLO BY EXAMPLE 5 | GEOFFK. NICHOLLS 6 | 7 | """ 8 | import numpy as np 9 | import pymc as mc 10 | import pandas as pd 11 | 12 | #observations 13 | data = pd.read_csv("blowfly97I.csv") 14 | yt = data["total"].value 15 | 16 | N = t.shape[0] 17 | 18 | r = mc.Exponential( "r", beta = 1.0 ) 19 | b = mc.Exponential( "b", beta = 1000.0 ) 20 | lambduh = mc.Exponential( "lambdu", beta = 1.0/1000 ) 21 | n_0 = mc.Poisson( "n_0", mu=lambduh) 22 | 23 | 24 | @mc.deterministic 25 | def n_t( n_0=n_0, r=r, b=b, N=N): 26 | n = np.empty( N, dtype=object) 27 | n[0] = n_0 28 | for i in range( 1, N): 29 | n[i] = (r*n[i-1])/( 1.0 + b**4*n[i-1]**4 ) 30 | return n 31 | 32 | y = np.empty( N, dtype=object) 33 | for i in range(0, N): 34 | y[i] = mc.Poisson( "y_%i"%i, mu = n_t[i], observed= True, value = yt[i] ) 35 | 36 | model = mc.Model( {"yt":yt, "nt":n_t, "b":b, "r":r, "n_0":n_0}) 37 | mcmc= mc.MCMC(model) 38 | 39 | mcmc.sample( 30000, 15000) 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /pyMC/mixtureNormals.py: -------------------------------------------------------------------------------- 1 | from pymc import * 2 | 3 | size = 20 4 | p = Uniform( "p", 0 , 1) 5 | 6 | ber = Bernoulli( "ber", p = p, size = size) 7 | 8 | precision = Gamma('precision', alpha=0.1, beta=0.1) 9 | 10 | mean1 = Normal( "mean1", 0, 0.001 ) 11 | mean2 = Normal( "mean2", 0, 0.001 ) 12 | 13 | @deterministic 14 | def mean( ber = ber, mean1 = mean1, mean2 = mean2): 15 | return ber*mean1 + (1-ber)*mean2 16 | 17 | 18 | #generate some artifical data 19 | v = np.random.randint( 0, 2, size) 20 | data = v*(10+ np.random.randn(size) ) + (1-v)*(-10 + np.random.randn(size ) ) 21 | 22 | 23 | obs = Normal( "obs", mean, precision, value = data, observed = True) 24 | 25 | model = Model( {"p":p, "precision": precision, "mean1": mean1, "mean2":mean2, "obs":obs} ) -------------------------------------------------------------------------------- /utils/contour_irregular_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | contour_irregular_data.py 3 | 4 | This module/function lets you plot irregularlly spaced data by using an interpolation scheme 5 | 6 | Code taken/hacked modified from http://www.scipy.org/Cookbook/Matplotlib/Gridding_irregularly_spaced_data 7 | """ 8 | 9 | 10 | import numpy as np 11 | from scipy.interpolate import griddata 12 | import matplotlib.pyplot as plt 13 | 14 | 15 | def contour(x,y,z, linewidth = 2, labels = None): 16 | """ 17 | Plots contours for non-evenly spaced data. 18 | x,y,z must be 1d arrays. 
19 | lines = # of contour lines (default 18 ) 20 | linewidth = line width of lines (default 2 ) 21 | """ 22 | 23 | assert x.shape[0] == y.shape[0] == z.shape[0], "arrays x,y,z must be the same size" 24 | 25 | #make a grid that surrounds x,y support 26 | xi = np.linspace(x.min(),x.max(),100) 27 | yi = np.linspace(y.min(),y.max(),100) 28 | # grid the data. 29 | zi = griddata((x, y), z, (xi[None,:], yi[:,None]), method='cubic') 30 | # contour the gridded data, plotting dots at the randomly spaced data points. 31 | plt.figure() 32 | CS = plt.contour(xi,yi,zi,linewidth=2) 33 | plt.clabel(CS, inline=1, fontsize=10) 34 | 35 | if labels: 36 | plt.xlabel(labels[0]) 37 | plt.ylabel(labels[1]) 38 | # plot data points. 39 | plt.scatter(x,y,c=z,s=60, alpha = 0.7, edgecolors = "none") 40 | plt.xlim(x.min(),x.max()) 41 | plt.ylim(y.min(),y.max()) 42 | plt.show() 43 | -------------------------------------------------------------------------------- /utils/cov2corr.py: -------------------------------------------------------------------------------- 1 | """ 2 | covariance matrix to correlation matrix. 3 | """ 4 | 5 | 6 | 7 | def cov2corr( A ): 8 | """ 9 | covariance matrix to correlation matrix. 10 | """ 11 | d = np.sqrt(A.diagonal()) 12 | A = ((A.T/d).T)/d 13 | #A[ np.diag_indices(A.shape[0]) ] = np.ones( A.shape[0] ) 14 | return A -------------------------------------------------------------------------------- /utils/dataframe_pairwise_feature_gen.py: -------------------------------------------------------------------------------- 1 | 2 | from itertools import combinations_with_replacement, combinations 3 | 4 | def create_pairwise_data( df, ignore = [], squares = True): 5 | """ 6 | df: a dataframe 7 | ignore: an iterable of columns to not make quad features out of. 8 | 9 | returns: 10 | a copied dataframe with quadratic features, including squares of variables if squares == True. 11 | 12 | """ 13 | n,d = df.shape 14 | columns = df.columns.diff( ignore ) 15 | 16 | df = df.copy() 17 | 18 | iterator = combinations_with_replacement if squares else combinations 19 | 20 | for x,y in iterator( columns, 2): 21 | df[ x + "__times__" + y ] = df[x]*df[y] 22 | 23 | 24 | return df 25 | -------------------------------------------------------------------------------- /utils/jarquebera_test.py: -------------------------------------------------------------------------------- 1 | #jaque-berra test 2 | 3 | import scipy.stats as stats 4 | 5 | def JarqueBeraTest(data,significance = 0.95): 6 | """ 7 | If the data come from a normal distribution, the JB statistic asymptotically has a chi-squared distribution with two degrees of freedom, 8 | so the statistic can be used to test the hypothesis that the data are from a normal distribution. 9 | 10 | """ 11 | n = data.shape[0] 12 | if n < 2000: 13 | print "Warning: JarqueBera tests works best with large sample sizes (> ~2000 )." 14 | 15 | S = float(n)/6*( stats.skew(data)**2 + 0.25*(stats.kurtosis( data, fisher=True) )**2) 16 | t = stats.chi2(2).ppf( significance ) 17 | if S < t: 18 | print "Not enough evidence to reject as non-Normal according to the Jarque-Bera test. 
S = %.4f < %.4f"%(S,t) 19 | else: 20 | print "Reject that is Normal according to the Jarque-Bera test; S = %.4f > %.4f"%(S,t) 21 | -------------------------------------------------------------------------------- /utils/kaggleDataSet.py: -------------------------------------------------------------------------------- 1 | #Author: John Ramney 2 | 3 | import requests 4 | 5 | # The direct link to the Kaggle data set 6 | data_url = 'http://www.kaggle.com/c/digit-recognizer/download/train.csv' 7 | 8 | # The local path where the data set is saved. 9 | local_filename = "train.csv" 10 | 11 | # Kaggle Username and Password 12 | kaggle_info = {'UserName': "my_username", 'Password': "my_password"} 13 | 14 | # Attempts to download the CSV file. Gets rejected because we are not logged in. 15 | r = requests.get(data_url) 16 | 17 | # Login to Kaggle and retrieve the data. 18 | r = requests.post(r.url, data = kaggle_info, prefetch = False) 19 | 20 | # Writes the data to a local file one chunk at a time. 21 | f = open(local_filename, 'w') 22 | for chunk in r.iter_content(chunk_size = 512 * 1024): # Reads 512KB at a time into memory 23 | if chunk: # filter out keep-alive new chunks 24 | f.write(chunk) 25 | f.close() 26 | -------------------------------------------------------------------------------- /utils/linked_list.py: -------------------------------------------------------------------------------- 1 | 2 | def reverse( list_head, previous = None): 3 | """ 4 | assume .next is present 5 | """ 6 | if not list_head.next: 7 | list_head.next = previous 8 | return 9 | else: 10 | reverse( list_head.next, list_head ) 11 | list_head.next = previous if previous else None 12 | 13 | 14 | class Linked( object ): 15 | 16 | def __init__(self, next, value ): 17 | self.next = next 18 | self.value = value 19 | 20 | 21 | C = Linked( None, "c") 22 | B = Linked( C, "b") 23 | A = Linked( B, "a") 24 | -------------------------------------------------------------------------------- /utils/lyungbox_test.py: -------------------------------------------------------------------------------- 1 | #lyungBoxTest 2 | 3 | 4 | import numpy.ma as ma 5 | import numpy as np 6 | import scipy.stats as stats 7 | import scipy.stats.mstats as mstats 8 | 9 | 10 | def LyungBoxTest(ts, tested_lag, significance = 0.95 ): 11 | """ 12 | ts: a time series. 13 | tested_lag: is the lag being tested, but must be an int. 14 | """ 15 | tested_lag = int(tested_lag) 16 | f_ts = ts 17 | f_ts = f_ts - f_ts.mean() 18 | n = f_ts.shape[0] 19 | Q = 0 20 | for i in range(1, tested_lag+1 ): 21 | lagged_f_ts = f_ts.shift(i) 22 | m_f_ts = ma.masked_array( lagged_f_ts, mask = np.isnan( lagged_f_ts ) ) 23 | Q += mstats.pearsonr( f_ts, m_f_ts)[0]**2/(n-i) 24 | 25 | Q = Q*n*(n+2) 26 | t = stats.chi2(tested_lag).ppf( significance ) 27 | if Q < t: 28 | print "%d | Not enough evidence to reject Null: Q = %.4f < %.4f"%(tested_lag, Q,t) 29 | #print "Not enough evidence to reject "+func.__name__ + " " + series_name+" as not %d autocorrelated according to the Lyung Box test. 
--------------------------------------------------------------------------------
/utils/mean_average_precision.py:
--------------------------------------------------------------------------------
1 | # mean average precision
2 | """
3 | % This function computes the average precision of predicted values. The
4 | % average precision is hella-confusing at first glance. Here's what Kaggle
5 | % has to say:
6 | %
7 | % The true scores are sorted (descending) according to the order of the
8 | % submission (only the order of the submission matters). In each row,
9 | % we then compute the cumulative (from the top up to that row)
10 | % "True Scores Ordered by Submission" divided by the cumulative "True
11 | % Scores Ordered By True Scores", where that quotient is called the
12 | % precision at row n. The final score is the average of the precision
13 | % at row n (over all n).
14 | %
15 | %
16 | % Ok, so say the true scores, sorted, are 3, 2.3, 1.6. And I predicted
17 | % the order 3, 1.6, 2.3. Then the average prec. is mean( 3/3,
18 | % (3+1.6)/(3+2.3), (3+1.6+2.3)/(3+2.3+1.6) ), which is about 0.956.
19 | %
20 | %
21 | """
22 | import numpy as np
23 | 
24 | def MAP(true_scores, predictive_scores):
25 |     true_values_sorted = true_scores.copy()
26 |     true_values_sorted = true_values_sorted[np.argsort(-true_values_sorted)]
27 | 
28 |     ix = np.argsort(-predictive_scores)
29 | 
30 |     true_values_sorted_by_prediction = true_scores[ix]
31 | 
32 |     # float division, in case the scores are integer-valued
33 |     score = np.mean(true_values_sorted_by_prediction.cumsum() / true_values_sorted.cumsum().astype(float))
34 |     return score
35 | 
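To make the docstring's worked example concrete, here is a small check. The arrays and the import line are hypothetical, chosen only to match the numbers above:

    import numpy as np
    from mean_average_precision import MAP  # assumes utils/ is on the Python path

    true_scores = np.array([3.0, 2.3, 1.6])
    predicted   = np.array([3.0, 1.6, 2.3])  # this submission orders the items so their true scores come out 3, 1.6, 2.3

    print MAP(true_scores, predicted)
    # mean( 3/3, (3 + 1.6)/(3 + 2.3), (3 + 1.6 + 2.3)/(3 + 2.3 + 1.6) ) = 0.9560...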
--------------------------------------------------------------------------------
/utils/memorize.py:
--------------------------------------------------------------------------------
1 | """
2 | A memoizing decorator: use it on recursive functions to cache previous calls.
3 | """
4 | 
5 | 
6 | class memorize(object):
7 | 
8 |     def __init__(self, func):
9 |         self.func = func
10 |         self.cache = {}
11 | 
12 |     def __call__(self, *args):
13 |         try:
14 |             return self.cache[args]
15 |         except KeyError:
16 |             self.cache[args] = self.func(*args)
17 |             return self.cache[args]
18 | 
19 |     def __repr__(self):
20 |         return self.func.__doc__ or repr(self.func)
21 | 
--------------------------------------------------------------------------------
/utils/power_set.py:
--------------------------------------------------------------------------------
1 | # Cameron Davidson-Pilon, 2012
2 | 
3 | # iterative solution: the bits of the counter k decide which elements are in the subset
4 | 
5 | def power_set(lst):
6 |     n = len(lst)
7 |     for k in range(2**n):
8 | 
9 |         subset = []
10 |         i = k
11 | 
12 |         for j in range(n):
13 |             if i % 2:
14 |                 subset.append(lst[j])
15 |             i = i >> 1
16 | 
17 |         print subset
18 | 
19 | 
20 | # recursive solution: each element is either left out of, or added to, the growing prefix
21 | 
22 | def power_set_recursive(lst, prefix=[]):
23 |     if not lst:
24 |         print prefix
25 |         return
26 | 
27 |     # subsets that exclude lst[0]
28 |     power_set_recursive(lst[1:], prefix)
29 |     # subsets that include lst[0]
30 |     power_set_recursive(lst[1:], prefix + [lst[0]])
31 | 
--------------------------------------------------------------------------------
/utils/primes.py:
--------------------------------------------------------------------------------
1 | import math
2 | 
3 | def primes_up_to(max_num):
4 |     current_primes = []
5 |     for num in range(2, max_num):
6 |         if prime(num, current_primes):
7 |             current_primes.append(num)
8 |     return current_primes
9 | 
10 | def prime(n, current_primes):
11 |     # only need to check prime divisors up to sqrt(n)
12 |     for i in current_primes:
13 |         if i > math.sqrt(n):
14 |             break
15 |         if n % i == 0:
16 |             return False
17 |     return True
18 | 
19 | 
20 | print primes_up_to(25)
21 | 
22 | 
23 | print primes_up_to(100)
--------------------------------------------------------------------------------
/utils/qq_plot.py:
--------------------------------------------------------------------------------
1 | 
2 | import matplotlib.pyplot as plt
3 | import scipy.stats as stats
4 | import numpy as np
5 | 
6 | def qq_plot(data):
7 |     plt.figure()
8 |     # probplot returns the theoretical quantiles, the ordered data, and a least-squares fit
9 |     (osm, osr), (slope, inter, r) = stats.probplot(data, sparams=[data.mean(), data.std()], dist='norm', fit=True)
10 |     x_ = np.array([osm.min(), osm.max()])
11 | 
12 |     plt.plot(x_, x_, label="Line y=x")
13 |     plt.plot(x_, slope * x_ + inter, label="Least-squares fit")
14 |     plt.scatter(osm, osr)
15 |     plt.xlabel("Theoretical quantiles")
16 |     plt.ylabel("Observed values")
17 |     plt.legend(loc="best")
18 |     plt.show()
--------------------------------------------------------------------------------
/utils/sample.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | # sample a file by taking every nth line
4 | # usage:
5 | #   $ python sample.py n myfile.txt mysampledfile.txt
6 | #
7 | #
8 | 
9 | import sys
10 | 
11 | 
12 | def sample(n, infile, outfile):
13 | 
14 |     n = int(n)
15 |     try:
16 |         ifile = open(infile, 'r')
17 |     except IOError:
18 |         print "Could not open file %s" % infile
19 |         raise
20 | 
21 |     ofile = open(outfile, 'w')
22 | 
23 |     i = 0
24 |     for line in ifile:
25 |         if i % n == 0:
26 |             ofile.write(line)
27 |         i += 1
28 | 
29 |     ifile.close()
30 |     ofile.close()
31 | 
32 | 
33 | if __name__ == "__main__":
34 |     sample(*sys.argv[1:])
35 |     print "Completed"
--------------------------------------------------------------------------------
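As a small sanity check of sample() above, the following sketch writes a ten-line file and keeps every third line. The file names and the import are illustrative assumptions:

    from sample import sample  # assumes utils/ is on the Python path

    with open("numbers.txt", "w") as f:
        for i in range(10):
            f.write("%d\n" % i)

    sample(3, "numbers.txt", "every_third.txt")  # every_third.txt now holds 0, 3, 6, 9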