├── .gitignore
├── DamerauLevenshteinDistance
│   ├── badwords.txt
│   ├── dameraulevenshtein.py
│   └── example.py
├── DiscreteOptionPricing
│   ├── price_bounds.py
│   └── shout_option.py
├── DiscreteSDE
│   └── discreteSDE.py
├── Estimators
│   └── theil_sen.py
├── KalmanFilter
│   └── simple_kalman.py
├── MachineLearningScikitLearn
│   ├── BayesianBandit.py
│   ├── blender.py
│   ├── ensembleSelector.py
│   ├── maxCorrelationTransformer.py
│   ├── outlier.py
│   ├── pretty_pca.py
│   ├── supervised_pca.py
│   └── weighted_least_squares.py
├── MonteCarlo
│   ├── Copulas
│   │   └── README.txt
│   ├── Integration
│   │   ├── Assignment.pdf
│   │   ├── MonteCarloIntegrator.py
│   │   ├── Q6.py
│   │   └── examples.py
│   ├── MCMC
│   │   ├── copulas.py
│   │   ├── mcmc.py
│   │   └── mcmc_example.py
│   ├── grammschmidt.py
│   ├── sample_normal_given_projection.py
│   ├── sample_psd.py
│   └── sampling_methods.py
├── MultinomialMarkovAndEncoding
│   ├── encoding.py
│   └── multinomialMM.py
├── NumericalDerivatives
│   ├── diff.py
│   └── diff.pyc
├── README.md
├── TimeSeries
│   ├── MASE.py
│   ├── risk_measures.py
│   └── utils.py
├── pyMC
│   ├── LinearRegressionWithLoss.py
│   ├── SmallSample.py
│   ├── TableGame.py
│   ├── blowflies.py
│   └── mixtureNormals.py
└── utils
    ├── contour_irregular_data.py
    ├── cov2corr.py
    ├── dataframe_pairwise_feature_gen.py
    ├── jarquebera_test.py
    ├── kaggleDataSet.py
    ├── linked_list.py
    ├── lyungbox_test.py
    ├── mean_average_precision.py
    ├── memorize.py
    ├── power_set.py
    ├── primes.py
    ├── qq_plot.py
    └── sample.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Compiled source #
2 | ###################
3 | *.com
4 | *.class
5 | *.dll
6 | *.exe
7 | *.o
8 | *.so
9 | *.pyc
10 | 
11 | 
12 | #images#
13 | *.png
14 | 
15 | # Packages #
16 | ############
17 | # it's better to unpack these files and commit the raw source
18 | # git has its own built in compression methods
19 | *.7z
20 | *.dmg
21 | *.gz
22 | *.iso
23 | *.jar
24 | *.rar
25 | *.tar
26 | *.zip
27 | 
28 | # Logs and databases #
29 | ######################
30 | *.log
31 | *.sql
32 | *.sqlite
33 | 
34 | # OS generated files #
35 | ######################
36 | .DS_Store
37 | .DS_Store?
38 | ._*
39 | .Spotlight-V100
40 | .Trashes
41 | Icon?
42 | ehthumbs.db 43 | Thumbs.db 44 | -------------------------------------------------------------------------------- /DamerauLevenshteinDistance/badwords.txt: -------------------------------------------------------------------------------- 1 | ahole 2 | anus 3 | ash0le 4 | ash0les 5 | asholes 6 | ass 7 | ass monkey 8 | assface 9 | assh0le 10 | assh0lez 11 | asshole 12 | assholes 13 | assholz 14 | asswipe 15 | azzhole 16 | bassterds 17 | bastard 18 | bastards 19 | bastardz 20 | basterds 21 | basterdz 22 | biatch 23 | bitch 24 | bitches 25 | blow job 26 | boffing 27 | butthole 28 | buttwipe 29 | c0ck 30 | c0cks 31 | c0k 32 | carpet muncher 33 | cawk 34 | cawks 35 | clit 36 | cnts 37 | cntz 38 | cock 39 | cockhead 40 | cock-head 41 | cocks 42 | cocksucker 43 | cock-sucker 44 | crap 45 | cum 46 | cunt 47 | cunts 48 | cuntz 49 | dick 50 | dild0 51 | dild0s 52 | dildo 53 | dildos 54 | dilld0 55 | dilld0s 56 | dominatricks 57 | dominatrics 58 | dominatrix 59 | dyke 60 | enema 61 | f u c k 62 | f u c k e r 63 | fag 64 | fag1t 65 | faget 66 | fagg1t 67 | faggit 68 | faggot 69 | fagit 70 | fags 71 | fagz 72 | faig 73 | faigs 74 | fart 75 | flipping the bird 76 | fuck 77 | fucker 78 | fuckin 79 | fucking 80 | fucks 81 | fudge packer 82 | fuk 83 | fukah 84 | fuken 85 | fuker 86 | fukin 87 | fukk 88 | fukkah 89 | fukken 90 | fukker 91 | fukkin 92 | g00k 93 | gay 94 | gayboy 95 | gaygirl 96 | gays 97 | gayz 98 | god-damned 99 | h00r 100 | h0ar 101 | h0re 102 | hells 103 | hoar 104 | hoor 105 | hoore 106 | jackoff 107 | jap 108 | japs 109 | jerk-off 110 | jisim 111 | jiss 112 | jizm 113 | jizz 114 | knob 115 | knobs 116 | knobz 117 | kunt 118 | kunts 119 | kuntz 120 | lesbian 121 | lezzian 122 | lipshits 123 | lipshitz 124 | masochist 125 | masokist 126 | massterbait 127 | masstrbait 128 | masstrbate 129 | masterbaiter 130 | masterbate 131 | masterbates 132 | motha fucker 133 | motha fuker 134 | motha fukkah 135 | motha fukker 136 | mother fucker 137 | mother fukah 138 | mother fuker 139 | mother fukkah 140 | mother fukker 141 | mother-fucker 142 | mutha fucker 143 | mutha fukah 144 | mutha fuker 145 | mutha fukkah 146 | mutha fukker 147 | n1gr 148 | nastt 149 | nigger; 150 | nigur; 151 | niiger; 152 | niigr; 153 | orafis 154 | orgasim; 155 | orgasm 156 | orgasum 157 | oriface 158 | orifice 159 | orifiss 160 | packi 161 | packie 162 | packy 163 | paki 164 | pakie 165 | paky 166 | pecker 167 | peeenus 168 | peeenusss 169 | peenus 170 | peinus 171 | pen1s 172 | penas 173 | penis 174 | penis-breath 175 | penus 176 | penuus 177 | phuc 178 | phuck 179 | phuk 180 | phuker 181 | phukker 182 | polac 183 | polack 184 | polak 185 | poonani 186 | pr1c 187 | pr1ck 188 | pr1k 189 | pusse 190 | pussee 191 | pussy 192 | puuke 193 | puuker 194 | queer 195 | queers 196 | queerz 197 | qweers 198 | qweerz 199 | qweir 200 | recktum 201 | rectum 202 | retard 203 | sadist 204 | scank 205 | schlong 206 | screwing 207 | semen 208 | sex 209 | sexy 210 | sh!t 211 | sh1t 212 | sh1ter 213 | sh1ts 214 | sh1tter 215 | sh1tz 216 | shit 217 | shits 218 | shitter 219 | shitty 220 | shity 221 | shitz 222 | shyt 223 | shyte 224 | shytty 225 | shyty 226 | skanck 227 | skank 228 | skankee 229 | skankey 230 | skanks 231 | skanky 232 | slut 233 | sluts 234 | slutty 235 | slutz 236 | son-of-a-bitch 237 | tit 238 | turd 239 | va1jina 240 | vag1na 241 | vagiina 242 | vagina 243 | vaj1na 244 | vajina 245 | vullva 246 | vulva 247 | w0p 248 | wh00r 249 | wh0re 250 | whore 251 | xrated 252 | xxx 253 | b!+ch 254 | bitch 255 | blowjob 256 | clit 257 
| arschloch 258 | fuck 259 | shit 260 | ass 261 | asshole 262 | b!tch 263 | b17ch 264 | b1tch 265 | bastard 266 | bi+ch 267 | boiolas 268 | buceta 269 | c0ck 270 | cawk 271 | chink 272 | cipa 273 | clits 274 | cock 275 | cum 276 | cunt 277 | dildo 278 | dirsa 279 | ejakulate 280 | fatass 281 | fcuk 282 | fuk 283 | fux0r 284 | hoer 285 | hore 286 | jism 287 | kawk 288 | l3itch 289 | l3i+ch 290 | lesbian 291 | masturbate 292 | masterbat 293 | masterbat3 294 | motherfucker 295 | s.o.b. 296 | mofo 297 | nazi 298 | nigga 299 | nigger 300 | nutsack 301 | phuck 302 | pimpis 303 | pusse 304 | pussy 305 | scrotum 306 | sh!t 307 | shemale 308 | shi+ 309 | sh!+ 310 | slut 311 | smut 312 | teets 313 | tits 314 | boobs 315 | b00bs 316 | teez 317 | testical 318 | testicle 319 | titt 320 | bitching 321 | idiot 322 | w00se 323 | jackoff 324 | wank 325 | whoar 326 | whore 327 | damn 328 | dyke 329 | fuck 330 | shit 331 | @$$ 332 | amcik 333 | andskota 334 | arse 335 | assrammer 336 | ayir 337 | bi7ch 338 | bitch 339 | bollock 340 | breasts 341 | butt-pirate 342 | cabron 343 | cazzo 344 | chraa 345 | chuj 346 | cock 347 | cunt 348 | d4mn 349 | daygo 350 | dego 351 | dick 352 | dike 353 | dupa 354 | dziwka 355 | ejackulate 356 | ekrem 357 | ekto 358 | enculer 359 | faen 360 | fag 361 | fanculo 362 | fanny 363 | feces 364 | feg 365 | felcher 366 | ficken 367 | fitt 368 | flikker 369 | foreskin 370 | fotze 371 | fu( 372 | fuk 373 | futkretzn 374 | gay 375 | gook 376 | guiena 377 | h0r 378 | h4x0r 379 | hell 380 | helvete 381 | hoer 382 | honkey 383 | huevon 384 | hui 385 | injun 386 | jizz 387 | kanker 388 | kike 389 | klootzak 390 | kraut 391 | knulle 392 | kuk 393 | kuksuger 394 | kurac 395 | kurwa 396 | kusi 397 | kyrpa 398 | lesbo 399 | mamhoon 400 | masturbat 401 | merd 402 | mibun 403 | monkleigh 404 | mouliewop 405 | muie 406 | mulkku 407 | muschi 408 | nazis 409 | nepesaurio 410 | nigger 411 | orospu 412 | paska 413 | perse 414 | picka 415 | pierdol 416 | pillu 417 | pimmel 418 | piss 419 | pizda 420 | poontsee 421 | poop 422 | porn 423 | p0rn 424 | pr0n 425 | preteen 426 | pula 427 | pule 428 | puta 429 | puto 430 | qahbeh 431 | queef 432 | rautenberg 433 | schaffer 434 | scheiss 435 | schlampe 436 | schmuck 437 | screw 438 | sh!t 439 | sharmuta 440 | sharmute 441 | shipal 442 | shiz 443 | skribz 444 | skurwysyn 445 | sphencter 446 | spic 447 | spierdalaj 448 | splooge 449 | suka 450 | b00b 451 | testicle 452 | titt 453 | twat 454 | vittu 455 | wank 456 | wetback 457 | wichser 458 | wop 459 | yed 460 | zabourah -------------------------------------------------------------------------------- /DamerauLevenshteinDistance/dameraulevenshtein.py: -------------------------------------------------------------------------------- 1 | # 2 | # Author: Michael Homer 3 | # Date: Sunday, April 26th, 2009 4 | # License: MIT 5 | # 6 | 7 | def dameraulevenshtein(seq1, seq2): 8 | """Calculate the Damerau-Levenshtein distance between sequences. 9 | 10 | This distance is the number of additions, deletions, substitutions, 11 | and transpositions needed to transform the first sequence into the 12 | second. Although generally used with strings, any sequences of 13 | comparable objects will work. 14 | 15 | Transpositions are exchanges of *consecutive* characters; all other 16 | operations are self-explanatory. 17 | 18 | This implementation is O(N*M) time and O(M) space, for N and M the 19 | lengths of the two sequences. 
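    (Added example, checked against this implementation rather than taken from the
    original docstring: a single swap of two adjacent characters costs 1, whereas
    plain Levenshtein distance would charge 2 for the same pair.)

    >>> dameraulevenshtein('ab', 'ba')
    1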
20 | 21 | >>> dameraulevenshtein('ba', 'abc') 22 | 2 23 | >>> dameraulevenshtein('fee', 'deed') 24 | 2 25 | 26 | It works with arbitrary sequences too: 27 | >>> dameraulevenshtein('abcd', ['b', 'a', 'c', 'd', 'e']) 28 | 2 29 | """ 30 | # codesnippet:D0DE4716-B6E6-4161-9219-2903BF8F547F 31 | # Conceptually, this is based on a len(seq1) + 1 * len(seq2) + 1 matrix. 32 | # However, only the current and two previous rows are needed at once, 33 | # so we only store those. 34 | oneago = None 35 | thisrow = range(1, len(seq2) + 1) + [0] 36 | for x in xrange(len(seq1)): 37 | # Python lists wrap around for negative indices, so put the 38 | # leftmost column at the *end* of the list. This matches with 39 | # the zero-indexed strings and saves extra calculation. 40 | twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1] 41 | for y in xrange(len(seq2)): 42 | delcost = oneago[y] + 1 43 | addcost = thisrow[y - 1] + 1 44 | subcost = oneago[y - 1] + (seq1[x] != seq2[y]) 45 | thisrow[y] = min(delcost, addcost, subcost) 46 | # This block deals with transpositions 47 | if (x > 0 and y > 0 and seq1[x] == seq2[y - 1] 48 | and seq1[x-1] == seq2[y] and seq1[x] != seq2[y]): 49 | thisrow[y] = min(thisrow[y], twoago[y - 2] + 1) 50 | return thisrow[len(seq2) - 1] -------------------------------------------------------------------------------- /DamerauLevenshteinDistance/example.py: -------------------------------------------------------------------------------- 1 | # example usage using badwords.txt (not for the easily offended, but seriously, you're from the internet sooo...) 2 | 3 | 4 | from dameraulevenshtein import dameraulevenshtein as dl_distance 5 | import string 6 | 7 | #open the badwords.txt 8 | file = open("badwords.txt", "r") 9 | swear_list = map( string.strip, file.readlines() ) #strips that annoying \n 10 | 11 | def isswear( word, max_distance = 1): 12 | """ 13 | checks if word is a swear word, or a missing spelling of swear word. 14 | """ 15 | word = word.lower() 16 | dl = lambda x: dl_distance(x, word) <= max_distance 17 | return any( map(dl, swear_list) ) 18 | 19 | 20 | 21 | 22 | if __name__=="__main__": 23 | words_to_test = ["boo", "cameron", "pissy", "ashole", "azzhole", "btiching"] 24 | 25 | print "max distance = 1" 26 | for w in words_to_test: 27 | print w, isswear(w) 28 | print 29 | print "max distance = 2" 30 | for w in words_to_test: 31 | print w, isswear(w,2) 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /DiscreteOptionPricing/price_bounds.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a simple, recursive, implementation of pricing 3 | a option with uncertain volatility (known sigma_max, sigma_min) in a 4 | recombining trinomial tree model. 5 | 6 | It is surprisingly fast, thanks to cacheing the calls. 7 | 8 | Example of use below. 9 | 10 | """ 11 | 12 | import numpy as np 13 | 14 | class memorize( object ): 15 | 16 | def __init__(self, func): 17 | self.func = func 18 | self.cache = {} 19 | 20 | def __call__(self, *args): 21 | try: 22 | return self.cache[args] 23 | except: 24 | self.cache[args] = self.func(*args) 25 | return self.cache[args] 26 | 27 | def __repr__(self): 28 | return self.func.__doc__ 29 | 30 | def Snj( S_0, n ,j, sigma_max, r, t_delta): 31 | return S_0*np.exp( j*sigma_max*np.sqrt(delta_t) + n*r*delta_t ) 32 | 33 | @memorize 34 | def price( style, F, sigma_max, sigma_min, delta_t, r, S_0, n, j, N): 35 | """ 36 | This is the main function. 
37 | style: either "min" or "max", get the min or maximum price respectively. 38 | F: the final payoff function 39 | sigma_max, sigma_min: the max and min volatility 40 | delta_t: the length of time step 41 | r: the risk-free rate 42 | S_0: the initial price of the underlying 43 | n: the time step 44 | j: position in tree 45 | N: the number of time steps. I'd keep this not too large, else you stack overflow lol. 46 | """ 47 | if n == N: 48 | return F( Snj(S_0, n, j, sigma_max, r, delta_t ) ) 49 | 50 | t = sigma_max*np.sqrt(delta_t)/2 51 | l = (1-t)*price(style,F, sigma_max, sigma_min, delta_t, r, S_0, n+1, j+1, N) + \ 52 | (1+t)*price(style,F, sigma_max, sigma_min, delta_t, r, S_0, n+1, j-1, N) - \ 53 | 2*price(style,F, sigma_max, sigma_min, delta_t, r, S_0, n+1, j, N) 54 | 55 | c = 0.5 if (1-2*(style=="min"))*l >= 0 else sigma_min**2/(2*sigma_max**2) 56 | 57 | return np.exp( -r*delta_t)*( price(style, F, sigma_max, sigma_min, delta_t, r, S_0, n+1, j, N) + c*l ) 58 | 59 | 60 | 61 | if __name__=="__main__": 62 | 63 | def F(x): 64 | # a collared option. 65 | return max(0, x - 100) - max( 0, x - 120) 66 | 67 | sigma_max = 0.4 68 | sigma_min = 0.1 69 | r= 0.1 70 | S0 = 100. 71 | N = 100. 72 | delta_t = 1.0/N 73 | 74 | print price("min", F, sigma_max, sigma_min, delta_t, r, S0, 0,0, N ) 75 | print price("max", F, sigma_max, sigma_min, delta_t, r, S0, 0,0, N ) 76 | 77 | """ 78 | 4.54345306389 79 | 12.358008422 80 | """ 81 | -------------------------------------------------------------------------------- /DiscreteOptionPricing/shout_option.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 2012-01-17 3 | 4 | @author: Cameron Davidson-Pilon 5 | 6 | 7 | ''' 8 | import math 9 | 10 | def binomial_shoutout(r, u, q, n, x_0, k): 11 | """ 12 | r: risk-free rate 13 | u: the return of an up jump 14 | q: the Risk Neutral probability of an 'up' jump 15 | n: the number of periods 16 | x_0: the start price 17 | k: the strike price 18 | 19 | Explanation of model: 20 | 21 | The central idea of my model is based on the recursive formula for a binomial option pricing: 22 | 23 | V(t,x) = D*( q*V(t+1, u*x) + (1-q)*V(t+1, x/u) ) (1) 24 | 25 | The value of the option at the next time step, V_{t+1}, is dependent on whether the investor chooses to shout or not 26 | to shout at the current node. The investor tries to maximize his/her profit, thus, I modified the formula above to: 27 | 28 | V_t = D*( max ( shout now, shout later) ) (2) 29 | 30 | The task is now the calculate the two quantities, 'shout now' and 'shout later'. The value if the investor 31 | shouts now is given by (1) with the new payoff max( K-S_T, 0, K-S_t*), and we can calculate the 'shout now' value. 32 | The 'shout later' requires us to look at the the next value of the nodes in the tree, and calculate the value of these 33 | nodes given we have NOT shouted. 34 | 35 | We can formulize this as: 36 | 37 | V(t,x) = D*( max( q*V(t+1, u*x | shout position = x) + (1-q)*V(t+1, x/u | shout position = x) , (3) 38 | q*V(t+1, u*x | haven't shouted ) + (1-q)*V(t+1, x/u | haven't shouted ) ) ) 39 | 40 | So when should one shout? Heuristically, one should shout when the expected value of shouting now is greater 41 | then the expected value of waiting to shout later, i.e. when 42 | 43 | E[ V(t+1 | shout position = x ] > E[ V(t+1 | haven't shouted yet) ] (4) 44 | 45 | Both expectations are under the risk-neutral measure. 
I tried this heuristic and found that it is optimal 46 | to immediately shout if the option is in the money. This makes sense, as if the stock drops, you gain the large 47 | K-S_T payoff, but if the stock rises you are protected and still receive, albeit small, K-S_t* > 0. Obviously, 48 | if the stock is not in the money it is pointless to shout. 49 | 50 | 51 | """ 52 | 53 | R = float(r); U = float(u); Q = float(q); X_0 = float(x_0); K = float(k); N = float(n) 54 | D = math.exp(-R/N) 55 | dictionary={} 56 | shout_times = [] 57 | 58 | def payoff(K,x,m): 59 | #This is a put-style payoff 60 | return max(K-x, K-m, 0) 61 | 62 | 63 | def value(n, x, m): 64 | """ find the value of a shout put""" 65 | try: 66 | return dictionary["%s,%s,%s"%(n,x,m)] 67 | except: 68 | if n==N: 69 | return payoff(K,x,m) 70 | else: 71 | shout_now = Q*value(n+1, U*x, m) + (1-Q)*value(n+1,x/U, m) 72 | if m==x: 73 | shout_later = Q*value(n+1, U*x, U*x) + (1-Q)*value(n+1, x/U, x/U) 74 | else: 75 | shout_later = 0 76 | 77 | if shout_now>shout_later: 78 | #This is the condition when to shout. If true, add it shout_times 79 | if f(m,n) not in shout_times: 80 | shout_times.append( f(m,n) ) 81 | 82 | y = D*max( shout_now, shout_later ) 83 | dictionary[ "%s,%s,%s"%(n,x,m) ] = y 84 | 85 | return y 86 | 87 | def f(x,n): 88 | """ This is to find the number of up jumps given a price x and time period n.""" 89 | return (n,int(0.5*(math.log(x/X_0,U)+n))) 90 | 91 | def delta(n,k): 92 | """This function computes the delta at each node (n,k), where n is the number 93 | of up jumps and n is the time period""" 94 | up = X_0*u**(2*k-n+1) 95 | down = X_0*u**(2*k-n-1) 96 | return ( value(n,up,up)-value(n,down,down) )/(up-down) 97 | 98 | print value(0,X_0,X_0) 99 | for s in shout_times: 100 | print s 101 | 102 | """ 103 | Example: 104 | 105 | Assume the following: 106 | (i) S(0) = K = 1, and the volatility of the underlying security is sigma = 40%. 107 | (ii) The continuously compounded interest rate is constant and equal to r = 1%. 108 | (iii) The maturity of the contract is 12 months, and the owner can "shout" only at the end of each 109 | month. 110 | (iv) The underlying security pays no dividends. 111 | Using a binomial tree model with 12 time periods, find the value of this option at time zero. Identify 112 | all nodes at which it is optimal for the owner to "shout" and find the replicating portfolio at time 0. 113 | 114 | From this, we need the size of an "up" stock movement, and the risk-neutral probability of an "up" movement. 
115 | r = 0.01 116 | sigma = .40 117 | periods = 12 118 | S_0 = K = 1 119 | 120 | 121 | u = exp(sigma/sqrt(periods) ) 122 | q = (exp(r/periods) - 1/u)/(u - 1/u) 123 | 124 | binomial_shoutout(r, u, q, periods, S_0, K) 125 | 126 | 127 | -------------------------------------------------------------------------------- /DiscreteSDE/discreteSDE.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simulate 1-d stochastic differential equations numerically using different schemes 3 | 4 | 5 | Example: 6 | 7 | kappa = 0.3 8 | b = 0.07 9 | sigma = 0.06 10 | gamma = 0.5 11 | delta =0.004 12 | N = 1e6 13 | 14 | def drift(x): 15 | return kappa*( b - x) 16 | 17 | def diffusion(x): 18 | return sigma*x**(gamma) 19 | 20 | #this is a CIR process 21 | 22 | sdeEuler = DiscreteSDE( drift, diffusion, "euler", startPosition = b, delta=delta ) 23 | sde.sample( 10, N ) 24 | 25 | 26 | """ 27 | 28 | import scipy.stats as stats 29 | import numpy as np 30 | from time import clock 31 | 32 | class DiscreteSDE( object ): 33 | """ 34 | initialize: 35 | drift: the drift function, univariate, must accept and return array of same size. 36 | diffusion the diffusion function, univariate, must accept and return array of same size. 37 | method: a string in ["euler", "milstein", "second-order" ] 38 | delta: the time step 39 | startTime: the starting time of the process 40 | startPosition: the starting position of the process 41 | 42 | methods: 43 | sample( t, n): sample the sde n times until time t. Returns a 2d numpy array with time along the columns 44 | """ 45 | 46 | 47 | def __init__(self, drift, diffusion, method, delta = 0.001, startTime = 0, startPosition =0 ): 48 | self.drift = drift 49 | self.diffusion = diffusion 50 | if method.lower() not in ["euler", "milstein", "second_order" ]: 51 | raise 52 | else: 53 | self.method = method 54 | self.delta = delta 55 | self.startTime = startTime 56 | self.startPosition = startPosition 57 | 58 | def sample(self,t=1, n=1): 59 | return getattr( self, self.method )(t, n) 60 | 61 | 62 | def euler(self, t, n): 63 | #initalize 64 | P,N = self._init(t,n) 65 | 66 | for i in xrange(1,int(N)): 67 | x = P[:, i-1] 68 | P[:,i] = x + self.drift(x)*self.delta + self.diffusion(x)*np.sqrt(self.delta)*np.random.randn( n ) 69 | 70 | return P 71 | 72 | 73 | def milstein(self,t,n, h = 0.001): 74 | 75 | def diff_prime( u ): 76 | return (self.diffusion( u + h/2 ) - self.diffusion( u - h/2))/h 77 | 78 | 79 | P, N = self._init(t,n) 80 | for i in xrange(1,int(N)): 81 | x = P[:, i-1] 82 | R = np.random.randn( n ) 83 | P[:,i] = x + self.drift(x)*self.delta + self.diffusion(x)*np.sqrt(self.delta)*R + \ 84 | 0.5*diff_prime( x)*self.diffusion(x)*( self.delta*R**2 - self.delta ) 85 | 86 | return P 87 | 88 | def second_order( self, t, n ): 89 | P, N = self._init(t,n) 90 | 91 | 92 | cov = np.array( [[self.delta, 0.5*self.delta**2],[ 0.5*self.delta**2, self.delta**3/3 ]] ) 93 | mu = np.array( [0,0] ) 94 | for i in xrange(1,int(N)): 95 | x = P[:, i-1] 96 | RI = np.random.multivariate_normal( mu, cov, n ) 97 | R = RI[:,0] 98 | I = RI[:,1] 99 | 100 | P[:,i] = x + self.drift(x)*self.delta + self.diffusion(x)*np.sqrt(self.delta)*R + \ 101 | (first_derivative( self.drift, x)*self.drift(x) - 0.5*self.diffusion(x)**2*second_derivative( self.drift, x) )*0.5*self.delta**2 + \ 102 | (first_derivative( self.diffusion, x)*self.drift(x) - 0.5*self.diffusion(x)**2*second_derivative( self.diffusion, x) )*(self.delta*R - I) + \ 103 | ( self.diffusion(x)*first_derivative(self.drift,x) )*I + \ 
104 | ( self.diffusion(x)*first_derivative(self.diffusion, x) )*(R**2 - self.delta) 105 | return P 106 | 107 | 108 | 109 | def _init(self,t, n ): 110 | if t < self.startTime: 111 | raise 112 | N = np.floor( t / self.delta ) 113 | M = np.zeros( (n, N) ) 114 | M[:,0] = self.startPosition 115 | return M,N 116 | 117 | 118 | def first_derivative( f, x, h = 0.001): 119 | return ( f(x + h) - f(x-h) )/(2*h) 120 | 121 | def second_derivative( f, x, h = 0.001): 122 | return (f(x + h) - 2*f(x) + f(x-h) )/(h**2) 123 | 124 | 125 | 126 | 127 | 128 | if __name__=="__main__": 129 | 130 | kappa = 0.3 131 | b = 0.07 132 | sigma = 0.06 133 | gamma = 0.5 134 | print "Parameters:" 135 | print "kappa: %.2f, b: %0.2f, sigma;: %0.2f, gamma: %0.2f"%( kappa, b, sigma, gamma ) 136 | 137 | def drift(x): 138 | return kappa*( b - x) 139 | 140 | def diffusion(x): 141 | return sigma*x**(gamma) 142 | 143 | delta =0.004 144 | sdeEuler = DiscreteSDE( drift, diffusion, "euler", startPosition = b, delta=delta ) 145 | sdeMilstein = DiscreteSDE( drift, diffusion, "milstein", startPosition = b, delta = delta) 146 | sdeSecondOrder = DiscreteSDE( drift, diffusion, "second_order", startPosition = b, delta = delta) 147 | 148 | 149 | N = 5000 150 | print "delta = 0.004" 151 | 152 | start = clock() 153 | eulerAt3 = sdeEuler.sample( 3, N )[:, -1] 154 | eulerAt3.sort() 155 | print "Euler: q = %.3f s.t. P( X_3 > q ) <= 0.1. Time: %.3f"%(eulerAt3[ np.floor(0.9*N) ], clock() -start) 156 | 157 | start = clock() 158 | eulerAt3 = sdeMilstein.sample( 3, N )[:, -1] 159 | eulerAt3.sort() 160 | print "Milstein: q = %.3f s.t. P( X_3 > q ) <= 0.1.Time: %.3f"%(eulerAt3[ np.floor(0.9*N) ], clock() -start) 161 | 162 | start = clock() 163 | eulerAt3 = sdeSecondOrder.sample( 3, N )[:, -1] 164 | eulerAt3.sort() 165 | print "SecondOrder: q = %.3f s.t. P( X_3 > q ) <= 0.1. Time: %.3f"%(eulerAt3[ np.floor(0.9*N) ], clock() -start) 166 | 167 | 168 | print 169 | delta = 0.1 170 | print "delta = 0.1" 171 | start = clock() 172 | sdeEuler.delta = sdeMilstein.delta = sdeSecondOrder.delta = delta 173 | 174 | start = clock() 175 | eulerAt3 = sdeEuler.sample( 3, N )[:, -1] 176 | eulerAt3.sort() 177 | print "Euler: q = %.3f s.t. P( X_3 > q ) <= 0.1. Time: %.3f"%(eulerAt3[ np.floor(0.9*N) ], clock() -start) 178 | 179 | start = clock() 180 | eulerAt3 = sdeMilstein.sample( 3, N )[:, -1] 181 | eulerAt3.sort() 182 | print "Milstein: q = %.3f s.t. P( X_3 > q ) <= 0.1. Time: %.3f"%(eulerAt3[ np.floor(0.9*N) ], clock() -start) 183 | 184 | eulerAt3 = sdeSecondOrder.sample( 3, N )[:, -1] 185 | eulerAt3.sort() 186 | print "Second Order: q = %.3f s.t. P( X_3 > q ) <= 0.1. Time: %.3f"%(eulerAt3[ np.floor(0.9*N) ], clock() -start) 187 | 188 | 189 | """ 190 | A bond price is given by: 191 | P(0,T) = E[ exp( -\int_0^T r_t dt ) ] 192 | Is the question asking use to compute this integral, which includes the integration? Sure I'll do it. 193 | 194 | P(0,T) ~= 1/N * exp( -delta*( \sum r_i ) ) 195 | 196 | 197 | """ 198 | 199 | def bond_price( r_t, delta): 200 | return np.exp( -delta*(r_t.sum()) ) 201 | 202 | 203 | 204 | def print_partB( discreteSDE, end_time, delta, name ): 205 | start = clock() 206 | discreteSDE.delta = delta 207 | value = stats.nanmean(np.apply_along_axis( lambda u: bond_price(u, delta), 1, discreteSDE.sample( end_time, N )) ) 208 | print "%s: estimates %.4f on %d year bond. 
Delta: %.4f, Time: %.2f"%(name, value, end_time, delta, clock() - start) 209 | return 210 | 211 | print_partB( sdeEuler, 3, 0.004, "Euler" ) 212 | print_partB( sdeEuler, 3, 0.1, "Euler" ) 213 | print_partB( sdeEuler, 10, 0.004, "Euler" ) 214 | print_partB( sdeEuler, 10, 0.1, "Euler" ) 215 | print 216 | 217 | print_partB( sdeMilstein, 3, 0.004, "Milstein" ) 218 | print_partB( sdeMilstein, 3, 0.4, "Milstein" ) 219 | print_partB( sdeMilstein, 10, 0.004, "Milstein" ) 220 | print_partB( sdeMilstein, 10, 0.1, "Milstein" ) 221 | print 222 | 223 | print_partB( sdeSecondOrder, 3, 0.004, "Second-Order" ) 224 | print_partB( sdeSecondOrder, 3, 0.1, "Second-Order" ) 225 | print_partB( sdeSecondOrder, 10, 0.004, "Second-Order" ) 226 | print_partB( sdeSecondOrder, 10, 0.1, "Second-Order" ) 227 | 228 | 229 | 230 | 231 | -------------------------------------------------------------------------------- /Estimators/theil_sen.py: -------------------------------------------------------------------------------- 1 | """ 2 | This implements the Theil-Sen linear regression estimator for 2d data points. 3 | The jist of it is: 4 | It returns the median all computed slope value between pairs (x_i, y_i), (x_j, y_j), (x_i > x_j) 5 | where slope = (y_i - y_j)/(x_i - x_j) 6 | 7 | 8 | Very robust to outliers. 9 | 10 | """ 11 | import numpy as np 12 | import bottleneck #very fast searching and sorting written in Cython. 13 | import itertools 14 | 15 | def theil_sen(x,y, sample= "auto", n_samples = 1e7): 16 | """ 17 | Computes the Theil-Sen estimator for 2d data. 18 | parameters: 19 | x: 1-d np array, the control variate 20 | y: 1-d np.array, the ind variate. 21 | sample: if n>100, the performance can be worse, so we sample n_samples. 22 | Set to False to not sample. 23 | n_samples: how many points to sample. 24 | 25 | This complexity is O(n**2), which can be poor for large n. We will perform a sampling 26 | of data points to get an unbiased, but larger variance estimator. 27 | The sampling will be done by picking two points at random, and computing the slope, 28 | up to n_samples times. 29 | 30 | """ 31 | assert x.shape[0] == y.shape[0], "x and y must be the same shape." 
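    # (Added comments, not in the original source.) Below, if n is small every one of
    # the n*(n-1)/2 pairs is enumerated and the median pairwise slope is kept; for
    # larger n, n_samples random pairs are drawn instead, which, as the module
    # docstring notes, trades some extra variance for a large speed-up.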
32 | n = x.shape[0] 33 | 34 | if n < 100 or not sample: 35 | ix = np.argsort( x ) 36 | slopes = np.empty( n*(n-1)*0.5 ) 37 | for c, pair in enumerate(itertools.combinations( range(n),2 ) ): #it creates range(n) =( 38 | i,j = ix[pair[0]], ix[pair[1]] 39 | slopes[c] = slope( x[i], x[j], y[i],y[j] ) 40 | else: 41 | i1 = np.random.randint(0, n, n_samples) 42 | i2 = np.random.randint(0, n, n_samples) 43 | slopes = slope( x[i1], x[i2], y[i1], y[i2] ) 44 | #pdb.set_trace() 45 | 46 | slope_ = bottleneck.nanmedian( slopes ) 47 | #find the optimal b as the median of y_i - slope*x_i 48 | intercepts = np.empty( n ) 49 | for c in xrange(n): 50 | intercepts[c] = y[c] - slope_*x[c] 51 | intercept_ = bottleneck.median( intercepts ) 52 | 53 | return np.array( [slope_, intercept_] ) 54 | 55 | 56 | 57 | def slope( x_1, x_2, y_1, y_2): 58 | return (1 - 2*(x_1>x_2) )*( (y_2 - y_1)/np.abs((x_2-x_1)) ) 59 | 60 | 61 | 62 | 63 | if __name__=="__main__": 64 | x = np.asarray( [ 0.0000, 0.2987, 0.4648, 0.5762, 0.8386 ] ) 65 | y = np.asarray( [ 56751, 57037, 56979, 57074, 57422 ] ) 66 | print theil_sen( x, y ) 67 | -------------------------------------------------------------------------------- /KalmanFilter/simple_kalman.py: -------------------------------------------------------------------------------- 1 | #kalman filter, simple example from http://en.wikipedia.org/wiki/Kalman_filter 2 | 3 | import numpy as np 4 | from numpy.linalg import inv 5 | from numpy import dot 6 | from matplotlib import pyplot as plt 7 | 8 | 9 | def predict(x, F, B, u, P, Q ): 10 | assert x.shape[1] == 1 11 | assert F.shape[0] == x.shape[0] 12 | assert u.shape == x.shape 13 | assert B.shape[0] == u.shape[0] 14 | assert F.shape[1] == P.shape[0] 15 | assert Q.shape == P.shape 16 | 17 | x_p = dot(F, x) + dot(B,u) 18 | P_p = dot(F,P).dot(F.T) + Q 19 | 20 | assert x.shape == x_p.shape 21 | assert P_p.shape == P.shape 22 | 23 | return x_p, P_p 24 | 25 | def update(z, H, x_p, P_p, R ): 26 | assert H.shape[1] == x_p.shape[0] 27 | assert H.shape[1] == P_p.shape[0] 28 | assert R.shape[0] == H.shape[0] 29 | assert z.shape[1] == 1 30 | assert z.shape[0] == H.shape[0] 31 | 32 | y = z - dot(H,x_p) 33 | S = dot(H,P_p).dot(H.T) + R 34 | K = dot(P_p, H.T).dot(inv(S)) 35 | x_u = x_p + dot(K,y) 36 | P_u = (np.eye(K.shape[0]) - dot(K,H)).dot(P_p) 37 | 38 | return x_u, P_u 39 | 40 | 41 | def run(acc_variance=1., obs_variance=1., delta_t = 0.5): 42 | steps = 100 43 | X_guesses = np.zeros((2,steps)) 44 | X_actual = np.zeros((2,steps)) 45 | 46 | F = np.array([[1, delta_t],[0,1]]) 47 | G = np.array([[delta_t**2/2, delta_t]]) 48 | B = np.zeros((2,2)) 49 | u = np.zeros((2,1)) 50 | Q = np.array([ [delta_t**4/4, delta_t**3/2], [delta_t**3/2, delta_t**2]])*acc_variance 51 | H = np.array([[1,0]]) 52 | R = np.array( [[obs_variance]]) 53 | 54 | #initial values 55 | x = x_g = np.zeros((2,1)) 56 | P_g = np.zeros((2,2)) 57 | 58 | for i in range(steps): 59 | x = dot(F,x) + np.random.multivariate_normal( [0,0], Q ).reshape( 2, 1) 60 | 61 | x_p, P_p = predict(x_g, F, B, u, P_g, Q ) 62 | z = dot(H,x) + np.random.normal(0,obs_variance) 63 | 64 | x_g, P_g = update(z, H, x_p, P_p, R ) 65 | 66 | #print x_g.shape, P_g.shape 67 | X_guesses[:,i] = x_g[:,0] 68 | X_actual[:,i] = x[:,0] 69 | 70 | 71 | return X_guesses[0,:], X_actual[0,:] 72 | 73 | 74 | delta_t = 0.5 75 | actual, guesses = run(2.,10., delta_t) 76 | 77 | plt.plot(actual, label='actual') 78 | plt.plot(guesses, label='guesses') 79 | 80 | plt.legend() 81 | plt.show() 82 | 83 | 84 | 85 | 86 | 87 | 
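# --- Added sketch (not part of the original script): a minimal 1-d sanity check that
# reuses the predict/update helpers defined above. With F = H = [[1]], no control
# input and almost no process noise, the filter is just estimating a constant from
# noisy measurements, so the posterior variance P should shrink and the state
# estimate should settle near the true value.
def constant_state_demo(true_value=5.0, obs_variance=4.0, steps=50):
    F = np.array([[1.]])
    B = np.zeros((1, 1))
    u = np.zeros((1, 1))
    Q = np.array([[1e-6]])                 # essentially no process noise
    H = np.array([[1.]])
    R = np.array([[obs_variance]])
    x_g = np.zeros((1, 1))                 # vague initial guess
    P_g = np.array([[100.]])
    for _ in range(steps):
        x_p, P_p = predict(x_g, F, B, u, P_g, Q)
        z = np.array([[true_value + np.random.normal(0, np.sqrt(obs_variance))]])
        x_g, P_g = update(z, H, x_p, P_p, R)
    return x_g[0, 0], P_g[0, 0]

# constant_state_demo()   # returns (estimate near 5.0, small posterior variance)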
-------------------------------------------------------------------------------- /MachineLearningScikitLearn/BayesianBandit.py: -------------------------------------------------------------------------------- 1 | ##Bayesian Bandit in python 2 | 3 | import scipy.stats as stats 4 | import numpy as np 5 | 6 | 7 | 8 | class BayesianBandit( object ): 9 | 10 | 11 | def __init__( self, prior_alpha = 1, prior_beta = 1 ): 12 | self.prior_alpha = 1 13 | self.prior_beta = 1 14 | self.betad = stats.beta 15 | 16 | 17 | 18 | def fit(self, bandits, trials = 10 ): 19 | """ 20 | Bandits is an object that can be called like bandits.pull(choice) and returns a 0 or 1. 21 | 22 | 23 | """ 24 | n_bandits = len( bandits ) 25 | self.n_pulls = np.zeros( n_bandits ) 26 | self.n_successes = np.zeros( n_bandits ) 27 | self.prior_distibutions = np.array( [self.prior_alpha, self.prior_beta])*np.ones( (n_bandits, 2 ) ) 28 | 29 | for i in xrange(trials): 30 | 31 | choice = np.argmax( self.betad.rvs( self.prior_distibutions[:,0] + self.n_successes, 32 | self.prior_distibutions[:,1] + self.n_pulls - self.n_successes ) ) 33 | outcome = bandits.pull(choice) 34 | self.n_pulls[choice] += 1 35 | self.n_successes[choice] += outcome 36 | 37 | self.posterior_alpha = self.prior_distibutions[:,0] + self.n_successes 38 | self.posterior_beta = self.prior_distibutions[:,1] + self.n_pulls - self.n_successes 39 | return 40 | 41 | def predict(self, n=1): 42 | choices = np.zeros( n ) 43 | for i in range(n): 44 | 45 | 46 | choice = np.argmax( self.betad.rvs( self.prior_distibutions[:,0] + self.n_successes, 47 | self.prior_distibutions[:,1] + self.n_pulls - self.n_successes ) ) 48 | choices[i] = choice 49 | 50 | return choices 51 | 52 | 53 | class Bandits(object): 54 | 55 | def __init__(self, probabilities ): 56 | self.probabilities = probabilities 57 | 58 | 59 | def pull( self, choice): 60 | return 1 if np.random.random() < self.probabilities[choice] else 0 61 | 62 | 63 | 64 | def __len__(self): 65 | return len( self.probabilities ) 66 | -------------------------------------------------------------------------------- /MachineLearningScikitLearn/blender.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.linear_model import LinearRegression 3 | from sklearn.cross_validation import ShuffleSplit 4 | from time import clock 5 | import pp 6 | 7 | class Blender( object): 8 | """ 9 | This class implements a linear blend of different models. 10 | 11 | 12 | methods: 13 | fit( data, response, dict_of_additional_variables ) 14 | add_model( model, name) 15 | predict( new_data, dict_of_additional_variables ) 16 | 17 | 18 | attributes: 19 | coefs_ 20 | 21 | """ 22 | 23 | 24 | def __init__( self, blender = LinearRegression(), training_fraction = 0.8, verbose = False): 25 | self.blender = blender 26 | self.training_fraction = training_fraction 27 | self.verbose = verbose 28 | self.models = dict() 29 | self._n_models = 0 30 | 31 | 32 | def add_model(self, model, name=None): 33 | """ 34 | model: a sklearn model that exposes the methods fit & predict. 
35 | name: a name to specify the model, eg "ElNet500alpha" 36 | 37 | """ 38 | self._n_models +=1 39 | if not name: 40 | name = "%d"%( self._n_models ) 41 | self.models[name] = model 42 | return 43 | 44 | def del_model(self, name ): 45 | try: 46 | del self.models[name] 47 | except KeyError: 48 | print "Model %s not in blender."%name 49 | 50 | return 51 | 52 | 53 | def split_arrays(self, n, test_fraction = 0.1 ): 54 | 55 | 56 | shfSplt = ShuffleSplit( n=n, n_iterations=1, test_size = test_fraction) 57 | train_ix, test_ix = shfSplt.__iter__().next() 58 | return train_ix, test_ix 59 | 60 | 61 | 62 | def fit(self, data, response, dict_of_additional_variables={}): 63 | """ 64 | data: the data matrix, shape (n,d) 65 | response: the response vector (n,) 66 | dict_of_additional_variables: 67 | a dictionary with the keys the model names (optional to include), and the items are of the form: 68 | {"train":[ items to be included in training], "test":[items to be included in testing] } 69 | """ 70 | 71 | #split the data to held-in and held-out. 72 | train_ix, blend_ix = self.split_arrays( data.shape[0], test_fraction = 1- self.training_fraction ) 73 | training_data, blend_data, training_response, blend_response = data[train_ix], data[blend_ix], response[train_ix], response[blend_ix] 74 | 75 | 76 | X = np.zeros( (blend_response.shape[0], len( self.models ) ) ) 77 | 78 | if self.verbose: 79 | print "Shape of training data vs blending data: ", training_data.shape, blend_data.shape 80 | #train the models 81 | 82 | 83 | #try some parrallel 84 | ncpus = max( len( self.models ), 32 ) 85 | job_server = pp.Server( ncpus, ppservers = () ) 86 | jobs = dict() 87 | to_import = ("import numpy as np", "sklearn", "time", "from localRegression import *", "from sklearn.linear_model import sparse", "from sklearn.utils import atleast2d_or_csc") 88 | for name, model in sorted( self.models.iteritems() ): 89 | 90 | try: 91 | fitargs = [ training_data, training_response] + [ array[train_ix] for array in dict_of_additional_variables[name ]] 92 | predictargs = [ blend_data ] + [ array[blend_ix] for array in dict_of_additional_variables[name] ] 93 | except KeyError: 94 | fitargs = [ training_data , training_response] 95 | predictargs = [ blend_data ] 96 | 97 | jobs[name] = job_server.submit( pp_run,(model, name, self.verbose, fitargs, predictargs), (), to_import ) 98 | 99 | if self.verbose: 100 | print "Model %s sent to cpu."%name 101 | 102 | i = 0 103 | for name, model in sorted( self.models.iteritems() ): 104 | self.models[name], X[:,i] = jobs[name]() 105 | i+=1 106 | 107 | if self.verbose: 108 | print "Fitting finished, starting blending." 
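        # (Added comments, not in the original source.) At this point each column of X
        # holds one base model's predictions on the held-out "blend" split, so the fit
        # below learns the stacking weights: with the default blender this is a plain
        # LinearRegression of blend_response on the base models' predictions.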
109 | 110 | self.blender.fit( X, blend_response ) 111 | self.coef_ = self.blender.coef_ 112 | 113 | self._fit_training_data = training_data 114 | self._fit_blend_data = blend_data 115 | self._fit_training_response = training_response 116 | self._fit_blend_response = blend_response 117 | 118 | if self.verbose: 119 | print "Done fitting" 120 | job_server.destroy() 121 | return self 122 | 123 | def predict( self, data, dict_of_additional_variables={}): 124 | 125 | ncpus = max( len( self.models ), 32 ) 126 | job_server = pp.Server( ncpus, ppservers = () ) 127 | jobs = dict() 128 | to_import = ("import numpy as np", "sklearn", "time", "from localRegression import *", "from sklearn.linear_model import sparse", "from sklearn.utils import atleast2d_or_csc") 129 | for name, model in sorted( self.models.iteritems() ): 130 | try: 131 | predictargs = [data] + dict_of_additional_variables[name] 132 | except KeyError: 133 | predictargs = [ data ] 134 | 135 | jobs[name] = job_server.submit( pp_predict, (model, name, self.verbose, predictargs), (), to_import) 136 | 137 | X = np.zeros( (data.shape[0], len( self.models ) ) ) 138 | i = 0 139 | for name, model in sorted( self.models.iteritems() ): 140 | X[:,i] = jobs[name]() 141 | i+=1 142 | job_server.destroy() 143 | return self.blender.predict( X ) 144 | 145 | 146 | 147 | def pp_predict( model, name, verbose, predictargs): 148 | start = time.clock() 149 | p = model.predict( *predictargs ) 150 | if verbose: 151 | print "Model %s fitted, took %.2f seconds"%(name, time.clock() - start ) 152 | return p 153 | 154 | def pp_run( model, name, verbose, fitargs, predictargs): 155 | 156 | start = time.clock() 157 | model.fit(*fitargs) 158 | if verbose: 159 | print "Model %s fitted, took %.2f seconds."%(name, time.clock() - start ) 160 | prediction = model.predict( *predictargs ) 161 | return model, prediction 162 | -------------------------------------------------------------------------------- /MachineLearningScikitLearn/ensembleSelector.py: -------------------------------------------------------------------------------- 1 | #ensemble selection 2 | 3 | import numpy as np 4 | 5 | 6 | def RMSE( Z, W): 7 | return np.sqrt( ((Z - W[:,None])**2).mean(axis=0) ) 8 | 9 | def basis(i, N): 10 | z = np.zeros(N) 11 | z[i] = 1 12 | return z 13 | 14 | 15 | class EnsembleSelection( object ): 16 | """ 17 | This class implements a greedy ensemble selection algorithm outlined in Ensemble Selection from Libraries of Models. 18 | The algorthim starts with an initial ensemble of models (if fraction_sorted_initialization > 0), and addeds models 19 | sequentially until improve falls below some threshold or the max number of models are selected. 20 | 21 | verbose: 0,1 or 2. Report the current score, number of models at each iterations. 22 | with_replacement: all the algorithm to select models already selected. 23 | fraction_sorted_initialization: The fraction of the best models to initialilly include in the ensemble 24 | bag_selection: Perform the following bagged_selection_times. select bagged_fraction and perform the greedy algo of them. 25 | bagged_fraction: see above. 26 | max_models: the maximum number of models to include in an ensemble 27 | score_function: the function to minimize. 28 | tol: the fractional decrease in the score_function to continue selection. RMSE_{i+1}/RMSE_{i} > 1 + tol. 29 | fit_models: instead of giving already fitted models, this object will fit the models too. 
30 | training_fraction: the fraction to use for training, 1-training_fraction is used as ensemble selection. 31 | 32 | methods: 33 | add_model( iterable_of_models ): add a collection of models to the algorithm. Must be performed before fit() is called. 34 | Models must be aready fitted and have a .predict() method exposed. 35 | fit( X, Y): perform the ensemble selection on data X and target Y 36 | predict( X ): return the prediction of the ensemble. 37 | 38 | """ 39 | 40 | 41 | def __init__(self, verbose = 1, 42 | with_replacement = True, 43 | fraction_sorted_initialization = 1.0, #bayesian prior of 1/N. 44 | bag_selection = 0, 45 | bagged_fraction = 0.5, 46 | max_models = None, 47 | score_function = RMSE, 48 | tol = 1e-4, 49 | fit_models = False, 50 | training_fraction =0.8, 51 | models = []): 52 | self.verbose = verbose 53 | self.with_replacement = with_replacement 54 | self.fraction_sorted_initialization = fraction_sorted_initialization 55 | self.bag_selection = bag_selection 56 | self.bagged_fraction = bagged_fraction 57 | self.max_models = max_models 58 | self.fit_models = fit_models 59 | self.training_fraction = training_fraction 60 | 61 | self.score_function = score_function 62 | self.tol = tol 63 | 64 | if self.max_models == None: 65 | self.max_models = np.inf 66 | 67 | self.models= models 68 | 69 | 70 | def add_model(self, model ): 71 | """model should be an iterable""" 72 | self.models += [ m for m in model ] 73 | 74 | return 75 | 76 | 77 | def _predict( self, predictions, models_included_ ): 78 | return (np.dot( predictions, models_included_ )/models_included_.sum())[:,None] 79 | 80 | def _fit( self, predictions, Y, ix): 81 | 82 | n,N = predictions.shape 83 | #train and store the prediction results 84 | models_included_ = np.zeros( N ) 85 | 86 | init_n_to_include = max( int( self.fraction_sorted_initialization*N), 1) 87 | models_included_[ np.argsort( self.individual_scores[ix] )[:init_n_to_include] ] = 1 88 | 89 | total_scores_ = np.array( [np.inf, self.score_function( self._predict( predictions, models_included_ ) , Y) ] ) 90 | 91 | while (models_included_.sum() < self.max_models) and ( total_scores_[-2]/total_scores_[-1] > 1 + self.tol ) : 92 | 93 | #find the best addition. 94 | _scores = [ self.score_function(self._predict(predictions, models_included_ + basis(i, N)), Y) \ 95 | for i in range(N) if (models_included_[i] == 0 or self.with_replacement)] 96 | m = np.argmin( _scores ) 97 | if _scores[m] < total_scores_[-1]: 98 | total_scores_ = np.append( total_scores_, _scores[m] ) 99 | models_included_[m] += 1 100 | if self.verbose > 1: 101 | print "Added model %d."%m 102 | print "Current score: %.3f."%total_scores_[-1] 103 | print "Current models included: ", models_included_ 104 | print 105 | else: 106 | flag = True 107 | break 108 | 109 | if self.verbose > 0: 110 | if (models_included_.sum() >= self.max_models): 111 | print "Exited after %d iterations because number of models exceeded. 
%d >= self.max_models"%(models_included_.sum(), models_included_.sum() ) 112 | elif ( total_scores_[-2]/total_scores_[-1] <= 1 + self.tol ): 113 | print "Exited after %d iterations because tolerence exceeded: %.8f < 1 + tol"%(models_included_.sum(), total_scores_[-2]/total_scores_[-1]) 114 | elif flag: 115 | print "The (local) minimum was found after %d iterations."%(models_included_.sum()) 116 | print "Score: %.4f"%total_scores_[-1] 117 | return models_included_/models_included_.sum() 118 | 119 | 120 | def fit(self, X, Y): 121 | N = len( self.models ) 122 | n,d = X.shape 123 | 124 | if self.fit_models: 125 | cutoff = int(n*self.training_fraction) 126 | a = np.arange(n) 127 | np.random.shuffle(a) 128 | training_data, training_target = X[ a[:cutoff] ,:], Y[ a[:cutoff] ] 129 | [ m.fit( training_data, training_target) for m in self.models ] 130 | 131 | if self.verbose > 0: 132 | print "models trained." 133 | X, Y = X[ a[cutoff:], :], Y[ a[cutoff:] ] 134 | n,d = X.shape 135 | 136 | #train and store the prediction results 137 | predictions = np.zeros( (n, N) ) 138 | for i in range(N): 139 | predictions[ :, i] = self.models[i].predict( X ) 140 | 141 | self.individual_scores = self.score_function( predictions, Y ) 142 | self.models_included_ = np.zeros( N ) 143 | p = self.bagged_fraction if self.bag_selection > 0 else 1 144 | 145 | for i in range( max(1, self.bag_selection ) ): 146 | a = np.arange( N) 147 | np.random.shuffle( a) 148 | ix = a[:int(p*N) ] 149 | models_included_ = self._fit( predictions[:, ix], Y, ix ) 150 | self.models_included_[ix] += models_included_ 151 | 152 | self.models_included_ /= self.models_included_.sum() 153 | self.score_ = self.score_function( self._predict( predictions, self.models_included_), Y ) 154 | return self 155 | 156 | 157 | def get_params(self, deep=False): 158 | return self.__dict__ 159 | 160 | def predict( self, X ): 161 | 162 | N = len( self.models ) 163 | n,d = X.shape 164 | #train and store the prediction results 165 | predictions = np.zeros( (n, N) ) 166 | for i in range(N): 167 | predictions[ :, i] = self.models[i].predict( X ) 168 | 169 | return self._predict( predictions, self.models_included_ ) 170 | 171 | -------------------------------------------------------------------------------- /MachineLearningScikitLearn/maxCorrelationTransformer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.covariance import EllipticEnvelope 3 | from sklearn.linear_model import LinearRegression as LR 4 | 5 | """ 6 | 7 | Note: This shows less than benchmark (all identity) performace. The issue is I am maximizing the wrong thing. I 8 | should be trying to maximize the partial-correlation. TODO 9 | 10 | To do this, we will use a greedy algorithm. 
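
Added illustration (not from the original author): the partial correlation referred to
in the TODO can be obtained by regressing the covariates Z out of both x and y and
correlating the residuals, which is essentially what the partial_correlation() helper
further down is meant to compute. A minimal sketch, assuming scikit-learn is available:

    from sklearn.linear_model import LinearRegression
    import numpy as np
    rng = np.random.RandomState(0)
    Z = rng.randn(200, 2)
    x = Z[:, 0] + 0.1 * rng.randn(200)
    y = Z[:, 0] + 0.1 * rng.randn(200)
    rx = x - LinearRegression().fit(Z, x).predict(Z)   # residual of x after removing Z
    ry = y - LinearRegression().fit(Z, y).predict(Z)   # residual of y after removing Z
    print np.corrcoef(rx, ry)[0, 1]   # near 0 once Z is controlled for, although corr(x, y) is ~1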
11 | 12 | """ 13 | 14 | np.seterr( all="raise") 15 | 16 | EPSILON = 1e-2 17 | 18 | dict_of_transforms = dict([ 19 | ("identity", lambda x: x), 20 | ("logPlus1",lambda x: np.log(x+1)), 21 | ("sqrtPlus1", lambda x: np.sqrt(x+1) ), 22 | ("sqrt",lambda x: np.sqrt(x) ), 23 | ("cuberoot", lambda x: x**(1.0/3.0) ), 24 | ("squared", lambda x: x**2 ), 25 | ("squaredPlus1", lambda x: (x+1)**2 ), 26 | ("cubed",lambda x: x**3 ), 27 | ("inverse",lambda x: 1./(x+EPSILON) ), 28 | ("exp",lambda x: np.exp(x) ), 29 | ("negexp",lambda x: np.exp(-x) ), 30 | ("inversePlus1", lambda x: 1./(x+1) ), 31 | ("arctan", lambda x: np.arctan(x) ), 32 | ("tan", lambda x: np.tan(x) ), 33 | ("arcsinsqrt", lambda x: np.arcsin(np.sqrt(x)) ), 34 | ("inversesqrt", lambda x: 1.0/(np.sqrt(x)+EPSILON) ), 35 | ("inversesqrtPlus1", lambda x: 1.0/(np.sqrt(x+1)) ), 36 | ("x/(1-x)", lambda x: x/(1-x+EPSILON) ), 37 | ("sqrtlog",lambda x: np.sqrt( -np.log( x + EPSILON) ) ), 38 | ("rank", lambda x: np.argsort( x ) ), 39 | 40 | ]) 41 | 42 | 43 | 44 | class MaxCorrelationTransformer(object): 45 | """ 46 | transforms the features of a data matrix to increase the correlation with a response vector, y. 47 | attributes: 48 | transforms: a dictionary of functions to try as a transform (defaults to dict_of_transforms) 49 | normalize01: True is the data will be normalized to between [0,1] 50 | additional_transforms: a dictionary of transforms in addition to the default. 51 | 52 | 53 | methods: 54 | fit: 55 | transform: 56 | fit_transform: 57 | 58 | 59 | 60 | """ 61 | def __init__(self, transforms = dict_of_transforms, 62 | normalize01 = False, 63 | additional_transforms = {}, 64 | verbose=False, 65 | remove_outliers=False 66 | tol = 1e-2): 67 | self.transforms = transforms 68 | self.verbose = verbose 69 | self.transforms.update( additional_transforms ) 70 | #map( _wrapper, transforms + additional_transforms ) 71 | for fname, func in self.transforms.iteritems(): 72 | self.transforms[ fname ] = _wrapper(func, verbose) 73 | 74 | 75 | self.tol = tol 76 | 77 | 78 | def fit(self, X, Y): 79 | "to do" 80 | 81 | n,d = X.shape 82 | 83 | self.transforms_ = ["identity"]*d 84 | abs_partial_correlations_ = abs(partial_correlation_via_inverse(X,Y)[-1, 0:-1]) 85 | temp_abs_partial_correlations_ = -1e2*np.ones_like( abs_partial_correlations_ ) 86 | ix = np.arange( d) 87 | while abs( temp_abs_partial_correlations_.sum() - abs_partial_correlations_.sum() ) > self.tol: 88 | for i in xrange(d): 89 | _X = X[:,i] 90 | Z = X[:, ix != i] 91 | for transform_name, transform in self.transforms: 92 | no_error, f_X = transform( _X ) 93 | if no_error: 94 | pc = abs( partial_correlation( f_X, Y, Z ) ) 95 | if pc > abs_partial_correlations_[i]: 96 | temp_abs_partial_correlations_[i] = pc 97 | self.transforms_[i] = transform_name 98 | 99 | 100 | 101 | return self 102 | 103 | 104 | def transform(self, X): 105 | if self.normalize01: 106 | X = _normalize01( X ) 107 | 108 | 109 | newX = X.copy() 110 | n,d = X.shape 111 | for i in range(d): 112 | newX[:,i] = self.transforms[ self.transforms_[i] ]( X[:,i] ) 113 | 114 | return newX 115 | 116 | def fit_transform( self, X, y): 117 | 118 | self.fit( X, y) 119 | return self.transformedX 120 | 121 | 122 | 123 | def _corr(x,y, remove_outliers=False): 124 | #check if x,y are same shape 125 | n = x.shape[0] 126 | if x.var()==0 or y.var()==0: 127 | return 0 128 | else: 129 | if remove_outliers: 130 | ee = EllipticEnvelope(store_precision = False, contamination=0.05) 131 | ee.fit( np.concatenate( [x[:,None],y[:,None] ], axis=1) ) 132 | c = 
ee.covariance_ 133 | return c[0,1]/np.sqrt( c[0,0]*c[1,1] ) 134 | return np.dot( x - x.mean(), y - y.mean() ) / np.sqrt(( x.var()*y.var() ))/ n 135 | 136 | def _wrapper(f, verbose = False): 137 | def g(x): 138 | try: 139 | u = f(x) 140 | except FloatingPointError as e: 141 | if verbose: 142 | print "Error.", e 143 | return False, np.zeros_like(x) 144 | if ( ~ np.isfinite( u ) ).sum() > 0: 145 | if verbose: 146 | print "Infinite." 147 | return False, np.zeros_like(x) 148 | else: 149 | return True, u 150 | return g 151 | 152 | def partial_correlation(X, Y, Z): 153 | """ 154 | This computes the partial-correlation between X and Y, with covariates Z. 155 | """ 156 | lr1 = LR() 157 | lr2 = LR() 158 | lr1.fit(Z,X) 159 | lr2.fit(Z,Y) 160 | 161 | return np.corrcoef( Y - lr1.predict(Z), X - lr2.predict(Z) )[0,1] 162 | 163 | def partial_correlation_via_inverse(X, Y=None): 164 | try: 165 | X = np.concatenate([ X,Y], axis=1 ) 166 | except: 167 | pass 168 | return -cov2corr( np.linalg.inv(np.dot(X.T, X) ) ) 169 | 170 | def cov2corr( A ): 171 | """ 172 | covariance matrix to correlation matrix. 173 | """ 174 | d = np.sqrt(A.diagonal()) 175 | A = ((A.T/d).T)/d 176 | #A[ np.diag_indices(A.shape[0]) ] = np.ones( A.shape[0] ) 177 | return A -------------------------------------------------------------------------------- /MachineLearningScikitLearn/outlier.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | #imports and definitions 3 | import numpy as np 4 | import scipy.stats as stats 5 | import scipy.spatial.distance as distance 6 | import matplotlib.pyplot as plt 7 | from sklearn.covariance import MinCovDet as MCD 8 | 9 | 10 | 11 | class Outlier_detection(object): 12 | 13 | def __init__(self, support_fraction = 0.95, verbose = True, chi2_percentile = 0.995): 14 | self.verbose = verbose 15 | self.support_fraction = support_fraction 16 | self.chi2 = stats.chi2 17 | self.mcd = MCD(store_precision = True, support_fraction = support_fraction) 18 | self.chi2_percentile = chi2_percentile 19 | 20 | def fit(self, X): 21 | """Prints some summary stats (if verbose is one) and returns the indices of what it consider to be extreme""" 22 | self.mcd.fit(X) 23 | mahalanobis = lambda p: distance.mahalanobis(p, self.mcd.location_, self.mcd.precision_ ) 24 | d = np.array(map(mahalanobis, X)) #Mahalanobis distance values 25 | self.d2 = d ** 2 #MD squared 26 | n, self.degrees_of_freedom_ = X.shape 27 | self.iextreme_values = (self.d2 > self.chi2.ppf(0.995, self.degrees_of_freedom_) ) 28 | if self.verbose: 29 | print "%.3f proportion of outliers at %.3f%% chi2 percentile, "%(self.iextreme_values.sum()/float(n), self.chi2_percentile) 30 | print "with support fraction %.2f."%self.support_fraction 31 | return self 32 | 33 | def plot(self,log=False, sort = False ): 34 | """ 35 | Cause plotting is always fun. 36 | 37 | log: transform the distance-sq to a log ( distance-sq ) 38 | sort: sort the data according to distnace before plotting 39 | ifollow: a set if indices to mark with yellow, useful for seeing where data lies across views. 
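
        Added usage example (illustrative only, not from the original docstring):

            import numpy as np
            X = np.random.randn(500, 3)
            X[:10] += 8                                  # plant a few gross outliers
            od = Outlier_detection(chi2_percentile=0.995).fit(X)
            od.plot(sort=True)                           # planted points should sit above the chi2 line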
40 | 41 | """ 42 | n = self.d2.shape[0] 43 | fig = plt.figure() 44 | 45 | x = np.arange( n ) 46 | ax = fig.add_subplot(111) 47 | 48 | 49 | transform = (lambda x: x ) if not log else (lambda x: np.log(x)) 50 | chi_line = self.chi2.ppf(self.chi2_percentile, self.degrees_of_freedom_) 51 | 52 | chi_line = transform( chi_line ) 53 | d2 = transform( self.d2 ) 54 | if sort: 55 | isort = np.argsort( d2 ) 56 | ax.scatter(x, d2[isort], alpha = 0.7, facecolors='none' ) 57 | plt.plot( x, transform(self.chi2.ppf( np.linspace(0,1,n),self.degrees_of_freedom_ )), c="r", label="distribution assuming normal" ) 58 | 59 | 60 | else: 61 | ax.scatter(x, d2 ) 62 | extreme_values = d2[ self.iextreme_values ] 63 | ax.scatter( x[self.iextreme_values], extreme_values, color="r" ) 64 | 65 | ax.hlines( chi_line, 0, n, 66 | label ="%.1f%% $\chi^2$ quantile"%(100*self.chi2_percentile), linestyles = "dotted" ) 67 | 68 | ax.legend() 69 | ax.set_ylabel("distance squared") 70 | ax.set_xlabel("observation") 71 | ax.set_xlim(0, self.d2.shape[0]) 72 | 73 | 74 | plt.show() 75 | 76 | 77 | -------------------------------------------------------------------------------- /MachineLearningScikitLearn/pretty_pca.py: -------------------------------------------------------------------------------- 1 | #prettyPCA 2 | 3 | 4 | """ 5 | This functions plots more interesting plot of PCA reduced data in 2d. 6 | """ 7 | 8 | import matplotlib.pyplot as plt 9 | import matplotlib.gridspec as gridspec 10 | 11 | def pretty_pca( skPCA, transformed_data, var_names = None, fraction_data = 1., scale = 3, scatter_color = None): 12 | """ 13 | skPCA: a sklearn-fitted PCA instance. 14 | transformed_data: the pca-reduced data. 15 | var_names: the variable names; defaults to numbers starting at 0. 16 | fraction_data: fraction of data points to plot. 17 | scale: how much to scale the lines by, default 3. 
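    scatter_color: optional array passed to plt.scatter's `c` argument to colour the points.

    Example (added for illustration; assumes scikit-learn and its iris dataset are available):

        from sklearn.decomposition import PCA
        from sklearn.datasets import load_iris
        iris = load_iris()
        pca = PCA().fit(iris.data)
        pretty_pca(pca, pca.transform(iris.data),
                   var_names=list(iris.feature_names),
                   scatter_color=iris.target)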
18 | 19 | """ 20 | line_color = "k" 21 | transformed_data = transformed_data[::int(1.0/fraction_data),:2] 22 | components = skPCA.components_[:2,:].T 23 | n_features = components.shape[0] 24 | if var_names == None: 25 | var_names = [ "%d"%i for i in range(n_features) ] 26 | else: 27 | var_names = [ "%s, %d"%(name, i) for i,name in enumerate(var_names) ] 28 | 29 | 30 | fig = plt.figure(1,figsize=(8,5)) 31 | gs = gridspec.GridSpec( 2, 1, height_ratios=[3,1] ) 32 | 33 | ax = plt.subplot( gs[0] ) 34 | if scatter_color is not None: 35 | ax.scatter( transformed_data[:,0], transformed_data[:,1], edgecolors='none', alpha = 0.6, c = scatter_color ) 36 | else: 37 | ax.scatter( transformed_data[:,0], transformed_data[:,1], edgecolors='none', alpha = 0.5 ) 38 | 39 | ax.scatter( [0], [0], s = 5, c = "k" ) 40 | for i in range( n_features ): 41 | #ax.plot( *zip([0,0], scale*components[i,:]) , c = line_color, lw = 2, alpha = 0.8 ) 42 | ax.annotate( "", scale*components[i, :], (0,0), arrowprops = dict( arrowstyle="->")) 43 | ax.annotate(var_names[i], xy=scale*components[i,:], xycoords='data', 44 | #xytext=(-50, 30), 45 | textcoords='offset points', 46 | size = 12, 47 | ) 48 | ax.set_title("2 Dimensional PCA data") 49 | 50 | ax = plt.subplot( gs[1] ) 51 | 52 | ax.bar( range(skPCA.explained_variance_ratio_.shape[0]), skPCA.explained_variance_ratio_ ) 53 | ax.bar( range(2), skPCA.explained_variance_ratio_[:2], color = "r" ) 54 | ax.set_title( "Explained variance ratio" ) 55 | plt.show() 56 | return 57 | -------------------------------------------------------------------------------- /MachineLearningScikitLearn/supervised_pca.py: -------------------------------------------------------------------------------- 1 | #supervised PCA according to Supervised Principal Compontent Anaysis by Ghodsi et al. 2010 2 | 3 | import numpy as np 4 | from scipy import linalg 5 | 6 | from ..utils.arpack import eigsh 7 | from ..base import BaseEstimator, TransformerMixin 8 | from ..preprocessing import KernelCenterer, scale 9 | from ..metrics.pairwise import pairwise_kernels 10 | 11 | 12 | from time import clock 13 | 14 | 15 | 16 | class SupervisedPCA(BaseEstimator, TransformerMixin): 17 | """Supervised Principal component analysis (SPCA) 18 | 19 | Non-linear dimensionality reduction through the use of kernels. 20 | 21 | Parameters 22 | ---------- 23 | n_components: int or None 24 | Number of components. If None, all non-zero components are kept. 25 | 26 | kernel: "linear" | "poly" | "rbf" | "sigmoid" | "precomputed" 27 | Kernel. 28 | Default: "linear" 29 | 30 | degree : int, optional 31 | Degree for poly, rbf and sigmoid kernels. 32 | Default: 3. 33 | 34 | gamma : float, optional 35 | Kernel coefficient for rbf and poly kernels. 36 | Default: 1/n_features. 37 | 38 | coef0 : float, optional 39 | Independent term in poly and sigmoid kernels. 40 | 41 | 42 | eigen_solver: string ['auto'|'dense'|'arpack'] 43 | Select eigensolver to use. If n_components is much less than 44 | the number of training samples, arpack may be more efficient 45 | than the dense eigensolver. 46 | 47 | tol: float 48 | convergence tolerance for arpack. 
49 | Default: 0 (optimal value will be chosen by arpack) 50 | 51 | max_iter : int 52 | maximum number of iterations for arpack 53 | Default: None (optimal value will be chosen by arpack) 54 | 55 | Attributes 56 | ---------- 57 | 58 | `lambdas_`, `alphas_`: 59 | Eigenvalues and eigenvectors of the centered kernel matrix 60 | 61 | 62 | """ 63 | 64 | def __init__(self, n_components=None, kernel="linear", gamma=0, degree=3, 65 | coef0=1, alpha=1.0, fit_inverse_transform=False, 66 | eigen_solver='auto', tol=0, max_iter=None): 67 | 68 | 69 | self.n_components = n_components 70 | self.kernel = kernel.lower() 71 | self.gamma = gamma 72 | self.degree = degree 73 | self.coef0 = coef0 74 | self.alpha = alpha 75 | self.fit_inverse_transform = fit_inverse_transform 76 | self.eigen_solver = eigen_solver 77 | self.tol = tol 78 | self.max_iter = max_iter 79 | self.centerer = KernelCenterer() 80 | 81 | 82 | def transform(self, X): 83 | """ 84 | Returns a new X, X_trans, based on previous self.fit() estimates 85 | """ 86 | return X.dot( self.alphas_ ) 87 | 88 | 89 | def fit(self,X,Y): 90 | self._fit(X,Y) 91 | return 92 | 93 | def fit_transform( self, X, Y): 94 | 95 | 96 | self.fit( X,Y) 97 | return self._transform() 98 | 99 | def _transform(self): 100 | 101 | return self.X_fit.dot(self.alphas_) 102 | 103 | 104 | def _fit(self, X, Y): 105 | #find kenerl matrix of Y 106 | K = self.centerer.fit_transform(self._get_kernel(Y)) 107 | #scale X 108 | X_scale = scale(X) 109 | 110 | 111 | if self.n_components is None: 112 | n_components = K.shape[0] 113 | else: 114 | n_components = min(K.shape[0], self.n_components) 115 | 116 | #compute eigenvalues of X^TKX 117 | 118 | M = (X.T).dot(K).dot(X) 119 | print "here" 120 | if self.eigen_solver == 'auto': 121 | if M.shape[0] > 200 and n_components < 10: 122 | eigen_solver = 'arpack' 123 | else: 124 | eigen_solver = 'dense' 125 | else: 126 | eigen_solver = self.eigen_solver 127 | 128 | if eigen_solver == 'dense': 129 | self.lambdas_, self.alphas_ = linalg.eigh( 130 | M, eigvals=(M.shape[0] - n_components, M.shape[0] - 1)) 131 | elif eigen_solver == 'arpack': 132 | self.lambdas_, self.alphas_ = eigsh(M, n_components, 133 | which="LA", 134 | tol=self.tol) 135 | indices = self.lambdas_.argsort()[::-1] 136 | self.lambdas_ = self.lambdas_[indices] 137 | self.alphas_ = self.alphas_[:, indices] 138 | 139 | #remove the zero/negative eigenvalues 140 | self.alphas_ = self.alphas_[:, self.lambdas_ > 0 ] 141 | self.lambdas_ = self.lambdas_[ self.lambdas_ > 0 ] 142 | print self.alphas_.shape 143 | 144 | self.X_fit = X; 145 | 146 | 147 | def _get_kernel(self, X, Y=None): 148 | params = {"gamma": self.gamma, 149 | "degree": self.degree, 150 | "coef0": self.coef0} 151 | try: 152 | return pairwise_kernels(X, Y, metric=self.kernel, 153 | filter_params=True, n_jobs = -1, **params) 154 | except AttributeError: 155 | raise ValueError("%s is not a valid kernel. Valid kernels are: " 156 | "rbf, poly, sigmoid, linear and precomputed." 157 | % self.kernel) 158 | 159 | 160 | 161 | 162 | class KernelSupervisedPCA( BaseEstimator, TransformerMixin): 163 | 164 | """Kernel Supervised Principal component analysis (SPCA) 165 | 166 | Non-linear dimensionality reduction through the use of kernels. 167 | 168 | Parameters 169 | ---------- 170 | n_components: int or None 171 | Number of components. If None, all non-zero components are kept. 172 | 173 | x||ykernel: "linear" | "poly" | "rbf" | "sigmoid" | "precomputed" 174 | Kernel. 
175 | Default: "linear" 176 | 177 | degree : int, optional 178 | Degree for poly, rbf and sigmoid kernels. 179 | Default: 3. 180 | 181 | gamma : float, optional 182 | Kernel coefficient for rbf and poly kernels. 183 | Default: 1/n_features. 184 | 185 | coef0 : float, optional 186 | Independent term in poly and sigmoid kernels. 187 | 188 | 189 | eigen_solver: string ['auto'|'dense'|'arpack'] 190 | Select eigensolver to use. If n_components is much less than 191 | the number of training samples, arpack may be more efficient 192 | than the dense eigensolver. 193 | 194 | tol: float 195 | convergence tolerance for arpack. 196 | Default: 0 (optimal value will be chosen by arpack) 197 | 198 | max_iter : int 199 | maximum number of iterations for arpack 200 | Default: None (optimal value will be chosen by arpack) 201 | 202 | Attributes 203 | ---------- 204 | 205 | `lambdas_`, `alphas_`: 206 | Eigenvalues and eigenvectors of the centered kernel matrix 207 | 208 | 209 | """ 210 | 211 | def __init__(self, n_components=None, xkernel={'kernel': "linear", 'gamma':0, 'degree':3, 212 | 'coef0':1}, ykernel = {'kernel': "linear", 'gamma':0, 'degree':3, 213 | 'coef0':1}, fit_inverse_transform=False, 214 | eigen_solver='auto', tol=0, max_iter=None): 215 | 216 | 217 | self.n_components = n_components 218 | self.xkernel = xkernel 219 | self.ykernel = ykernel 220 | self.fit_inverse_transform = fit_inverse_transform 221 | self.eigen_solver = eigen_solver 222 | self.tol = tol 223 | self.max_iter = max_iter 224 | self.centerer = KernelCenterer() 225 | 226 | 227 | def transform(self, X): 228 | """ 229 | Returns a new X, X_trans, based on previous self.fit() estimates 230 | """ 231 | K = self._get_kernel(self.X_fit, self.xkernel, X ) 232 | return K.T.dot( self.alphas_ ) 233 | 234 | 235 | def fit(self,X,Y): 236 | self._fit(X,Y) 237 | return 238 | 239 | def fit_transform( self, X, Y): 240 | 241 | 242 | self.fit( X,Y) 243 | return self._transform() 244 | 245 | def _transform(self): 246 | 247 | return self.Kx_fit.dot(self.alphas_) 248 | 249 | 250 | def _fit(self, X, Y): 251 | #find kenerl matrix of Y 252 | Ky = self.centerer.fit_transform(self._get_kernel(Y), self.ykernel) 253 | Kx = self.centerer.fit_transform( self._get_kernel(X), self.xkernel) 254 | 255 | 256 | 257 | if self.n_components is None: 258 | n_components = Ky.shape[0] 259 | else: 260 | n_components = min(Ky.shape[0], self.n_components) 261 | 262 | #compute eigenvalues of X^TKX 263 | 264 | M = (Kx).dot(Ky).dot(Kx) 265 | if self.eigen_solver == 'auto': 266 | if M.shape[0] > 200 and n_components < 10: 267 | eigen_solver = 'arpack' 268 | else: 269 | eigen_solver = 'dense' 270 | else: 271 | eigen_solver = self.eigen_solver 272 | 273 | if eigen_solver == 'dense': 274 | self.lambdas_, self.alphas_ = linalg.eigh( 275 | M, Kx, eigvals=(M.shape[0] - n_components, M.shape[0] - 1)) 276 | elif eigen_solver == 'arpack': 277 | self.lambdas_, self.alphas_ = eigsh(M, Kx, n_components, 278 | which="LA", 279 | tol=self.tol) 280 | indices = self.lambdas_.argsort()[::-1] 281 | self.lambdas_ = self.lambdas_[indices] 282 | self.alphas_ = self.alphas_[:, indices] 283 | 284 | #remove the zero/negative eigenvalues 285 | self.alphas_ = self.alphas_[:, self.lambdas_ > 0 ] 286 | self.lambdas_ = self.lambdas_[ self.lambdas_ > 0 ] 287 | 288 | self.X_fit = X; 289 | self.Kx_fit = Kx; 290 | 291 | def _get_kernel(self, X, params, Y=None): 292 | try: 293 | return pairwise_kernels(X, Y, metric=params['kernel'], 294 | n_jobs = -1, **params) 295 | except AttributeError: 296 | raise 
ValueError("%s is not a valid kernel. Valid kernels are: " 297 | "rbf, poly, sigmoid, linear and precomputed." 298 | % params['kernel']) 299 | 300 | 301 | -------------------------------------------------------------------------------- /MachineLearningScikitLearn/weighted_least_squares.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import sklearn.linear_model.LinearRegression as LR 3 | 4 | 5 | class WeightedLinearRegression(LR): 6 | """ 7 | Implements a weighted least squares class. 8 | weights: a nx1 vector of non-zero weights. 9 | 10 | """ 11 | def __init__(weights, **kwargs): 12 | print "warning: untested" 13 | super(LR, self).__init__(**kwargs) 14 | self.weights= weights 15 | 16 | 17 | def fit( X, Y): 18 | assert X.shape[0] == Y.shape[0] == self.weights.shape[0], "Objects must be same size" 19 | sqw = np.sqrt( self.weights ) 20 | self.fit( X*sqw, Y*sqw ) 21 | return self 22 | 23 | def predict( X ): 24 | return self.predict( X*np.sqrt(self.weights) ) -------------------------------------------------------------------------------- /MonteCarlo/Copulas/README.txt: -------------------------------------------------------------------------------- 1 | See MCMC (specificaly mcmc_examples.py) folder for python implementation of copulas -------------------------------------------------------------------------------- /MonteCarlo/Integration/Assignment.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CamDavidsonPilon/Python-Numerics/043ab4ad9003325c6270486b24d163933e0c7e8a/MonteCarlo/Integration/Assignment.pdf -------------------------------------------------------------------------------- /MonteCarlo/Integration/MonteCarloIntegrator.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import scipy.stats as stats 4 | import time 5 | 6 | class MCIntegrator( object ): 7 | """ 8 | target_function: a function that accepts a n-D array, and returns an n-D array. 9 | interval: the interval of the integration 10 | b_antithetic: whether to use antithesis variables. Much quicker, but only useful on monotonic target_functions 11 | sampling_dist: a scipy frozen distribution with support equal to the interval 12 | N: number of variables to use in the initial estimate. 
13 | control_variates = a list of function that accepts a nD array, and return an nD array 14 | """ 15 | def __init__(self, target_function, 16 | interval = (0,1), 17 | N = 10000, 18 | b_antithetic = False, 19 | sampling_dist = stats.uniform(), 20 | verbose=False, 21 | control_variates = []): 22 | self.target_function = target_function 23 | self.min_interval, self.max_interval = interval 24 | self.N_ = N 25 | self.N = 0 26 | self.sampling_dist = sampling_dist 27 | self.value =0 28 | self.b_antithetic = b_antithetic 29 | self.verbose = verbose 30 | self.control_variates = control_variates 31 | 32 | def estimate_N(self, N ): 33 | self.N += N 34 | return self._estimate(N) 35 | 36 | 37 | 38 | def _estimate(self, N): 39 | 40 | #generate N values from sampling_dist 41 | if not self.b_antithetic: 42 | U = self.sampling_dist.rvs(N) 43 | Y = self.target_function( U ) 44 | for func in self.control_variates: 45 | X = func(U) 46 | Y += X 47 | 48 | if self.verbose: 49 | print Y.var() 50 | self.value += Y.sum() 51 | else: 52 | U_ = self.sampling_dist.rvs(N/2) 53 | antiU_ = self.min_interval + (self.max_interval - U_ ) 54 | Y = (self.target_function( U_ ) + self.target_function( antiU_ ) ) 55 | if self.verbose: 56 | print Y.var() 57 | self.value +=Y.sum() 58 | return self.value / self.N 59 | 60 | def estimate(self): 61 | self.N += self.N_ 62 | return self._estimate(self.N_) 63 | 64 | 65 | 66 | if __name__ == "__main__": 67 | #Some examples: 68 | 69 | 70 | def target(u): 71 | return np.exp(-u**2)*2 72 | 73 | mci = MCIntegrator( target, interval =(0,2), b_antithetic = False, sampling_dist = stats.uniform(0,2), verbose= True ) 74 | N = 1e6 75 | 76 | start = time.clock() 77 | print "Using %d samples,"%N 78 | print "Non-antithetic: %.5f."%mci.estimate_N(N ) 79 | print "Duration: %.3f s."%(time.clock() - start) 80 | print 81 | mci = MCIntegrator( target, interval =(0,2), b_antithetic = True, sampling_dist = stats.uniform(0,2), verbose= True ) 82 | start = time.clock() 83 | print "Antithetic: %.5f."%mci.estimate_N(N ) 84 | print "Duration: %.3f s."%(time.clock() - start) 85 | print 86 | 87 | """ 88 | Using 1000000 samples, 89 | 0.474815598284 90 | Non-antithetic: 0.88140. 91 | Duration: 0.382 s. 92 | 93 | 0.0417625416316 94 | Antithetic: 0.88216. 95 | Duration: 0.303 s. 96 | """ 97 | 98 | 99 | #Using importance sampling 100 | 101 | def importance_function(u): 102 | return (-.5*u + 1)*2 103 | 104 | 105 | class Importance(object): 106 | def __init__(self): 107 | pass 108 | 109 | def rvs(self,n): 110 | u = stats.uniform(0,1).rvs( n) 111 | return 2*( 1 - np.sqrt(u) ) 112 | 113 | sampling_dist = Importance() 114 | mci = MCIntegrator( target, interval = (0,2), b_antithetic = False, sampling_dist = sampling_dist, N=100000, verbose= True ) 115 | print mci.estimate() 116 | 117 | 118 | #using control variates 119 | 120 | def polynomial_control( u ): 121 | return -.26*( (1-u**2) - -1.0/3) 122 | 123 | mci = MCIntegrator( target, interval =(0,2), sampling_dist = stats.uniform(0,2), verbose= True, control_variates=[polynomial_control] ) 124 | start = time.clock() 125 | print "Control Variates: %.5f."%mci.estimate_N(N ) 126 | print "Duration: %.3f s."%(time.clock() - start) 127 | 128 | 129 | -------------------------------------------------------------------------------- /MonteCarlo/Integration/Q6.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | """ 4 | Q6. 5 | 6 | The best estimate of c* is about 3.05. 
To find this I used a gamma distribution to estimate the 7 | integral for values of c between 0 and q and plotted the results. With this optimal 8 | value of c*, the expected value is approximatly equal to 9 | 10 | E[ 1_{x > q} ] = 1.139e-06 11 | 12 | 13 | 14 | """ 15 | 16 | 17 | import q2 18 | import numpy as np 19 | import scipy.stats as stats 20 | Q = 3.7 21 | interval = (0, np.infty) 22 | 23 | def target(u, c): 24 | return (2*u**2)/(u-c)*np.exp( - ( u**2 + 2*c*u - c**2 ) ) 25 | 26 | 27 | potentialC = np.array( [0.1, 0.5, 1, 2, 3, 3.5] ) 28 | potentialCprime = np.linspace( 2.5, 3.55, 20) 29 | estimates = np.zeros_like( potentialCprime) 30 | for i,c in enumerate(potentialCprime): 31 | #sampling_dist = stats.norm( loc = c, scale= 5 ) 32 | sampling_dist = stats.gamma(1, loc=Q ) 33 | target_c = lambda x: target(x,c) 34 | mci = q2.MCIntegrator( target_c, interval =interval, b_antithetic = False, sampling_dist = sampling_dist, N=100000, verbose= False ) 35 | estimates[i] = mci.estimate() 36 | 37 | 38 | 39 | 40 | #3.0526315789473681 is about best 41 | c_opt = 3.0526 42 | 43 | def rayleigh(u): 44 | return 2*u*np.exp(-u**2) 45 | 46 | 47 | def target(u): 48 | return ( u > Q)*rayleigh(u)/rayleigh(u-c_opt) 49 | sampling_dist = stats.rayleigh( loc = c_opt, scale=1./np.sqrt(2)) 50 | mci = q2.MCIntegrator( target, interval =interval, b_antithetic = False, sampling_dist = sampling_dist, N=100000, verbose= False ) 51 | print mci.estimate() 52 | # estimate: 1.13859704116e-06 53 | 54 | 55 | -------------------------------------------------------------------------------- /MonteCarlo/Integration/examples.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import scipy.stats as stats 4 | import time 5 | 6 | from MonteCarloIntegrator import * 7 | 8 | """ 9 | Lets estimate the integral 10 | 11 | I = \int_0^2 exp(-x**2) dx 12 | = E_u[ exp(-x**2)/2 ] where u ~ Uni(0,2) 13 | 14 | 15 | """ 16 | 17 | 18 | def target(u): 19 | return np.exp(-u**2)*2 20 | 21 | mci = MCIntegrator( target, interval =(0,2), b_antithetic = False, sampling_dist = stats.uniform(0,2), verbose= True ) 22 | N = 1e6 23 | 24 | start = time.clock() 25 | print "Using %d samples,"%N 26 | print "Non-antithetic: %.5f."%mci.estimate_N(N ) 27 | print "Duration: %.3f s."%(time.clock() - start) 28 | 29 | #using anti-thetic 30 | 31 | mci = MCIntegrator( target, interval =(0,2), b_antithetic = True, sampling_dist = stats.uniform(0,2), verbose= True ) 32 | start = time.clock() 33 | print "Antithetic: %.5f."%mci.estimate_N(N ) 34 | print "Duration: %.3f s."%(time.clock() - start) 35 | 36 | 37 | 38 | #Using importance sampling 39 | 40 | def importance_function(u): 41 | return (-.5*u + 1)*2 42 | 43 | 44 | class Importance(object): 45 | def __init__(self): 46 | pass 47 | 48 | def rvs(self,n): 49 | u = stats.uniform(0,1).rvs( n) 50 | return 2*( 1 - np.sqrt(u) ) 51 | 52 | sampling_dist = Importance() 53 | mci = MCIntegrator( target, interval = (0,2), b_antithetic = False, sampling_dist = sampling_dist, N=N, verbose= True ) 54 | print mci.estimate() 55 | 56 | 57 | #using control variates 58 | 59 | def polynomial_control( u ): 60 | return -.26*( (1-u**2) - -1.0/3) 61 | 62 | mci = MCIntegrator( target, interval =(0,2), sampling_dist = stats.uniform(0,2), verbose= True, N=N, control_variates=[polynomial_control] ) 63 | start = time.clock() 64 | print "Control Variates: %.5f."%mci.estimate_N(N ) 65 | print "Duration: %.3f s."%(time.clock() - start) 
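#A further illustrative check, using only the definitions above: the integral
#    I = \int_0^1 sin(x) dx = 1 - cos(1) ~ 0.45970
#has a known closed form, and sin is monotone on (0,1), so the antithetic
#estimator is valid here as well.

def target_sin(u):
    return np.sin(u)

mci = MCIntegrator( target_sin, interval =(0,1), b_antithetic = False, sampling_dist = stats.uniform(0,1), verbose= False )
print "Plain MC estimate of 1-cos(1): %.5f."%mci.estimate_N(N)

mci = MCIntegrator( target_sin, interval =(0,1), b_antithetic = True, sampling_dist = stats.uniform(0,1), verbose= False )
print "Antithetic estimate of 1-cos(1): %.5f."%mci.estimate_N(N)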
-------------------------------------------------------------------------------- /MonteCarlo/MCMC/copulas.py: -------------------------------------------------------------------------------- 1 | """Some copulas and helpers for copulas""" 2 | 3 | from __future__ import division 4 | import numpy as np 5 | import scipy.stats as stats 6 | import scipy as sp 7 | 8 | 9 | def gumbel(t, theta = 1): 10 | #theta in (0, \infty) 11 | return np.exp( -t**(1./theta) ) 12 | 13 | def inv_gumbel( t, theta=1): 14 | return (-np.log(t) )**theta 15 | 16 | 17 | def clayton(t, theta=1): 18 | return (1+theta*t)**(-1./theta) 19 | 20 | def inv_clayton( t, theta =1): 21 | return 1.0/theta*( t**(-theta) - 1) 22 | 23 | 24 | 25 | def arch_copula(u, f= gumbel, f_inv = inv_gumbel, theta = 1 ): 26 | """ 27 | #u is a numpy array 28 | """ 29 | 30 | if ( (u > 1).sum() + (u <0).sum() )>0: 31 | return 0 32 | 33 | return f( f_inv( u, theta ).sum(), theta ) 34 | 35 | 36 | def _pdf(f, u, delta = 0.001 ): 37 | n = u.shape[0] 38 | if n==1: 39 | t= f(u[0]+delta/2) - f(u[0]-delta/2) 40 | return t 41 | else: 42 | f_plus = lambda *x: f( u[0] + delta/2, *x) 43 | f_minus = lambda *x: f( u[0] - delta/2, *x) 44 | return _pdf(f_plus, u[1:], delta ) - _pdf(f_minus, u[1:], delta ) 45 | 46 | def cdf2pdf( f, u, delta=0.001, kwargs={} ): 47 | """numerically unstable for large dimensions""" 48 | def _wrapper(*args): 49 | u = np.array(args) 50 | return f(u, **kwargs) 51 | n = u.shape[0] 52 | return _pdf( _wrapper, u, delta)/delta**n 53 | 54 | 55 | 56 | class Copula_Proposal( object ): 57 | def __init__(self): 58 | self.norm = stats.norm 59 | 60 | def rvs(self, loc, scale, size=1): 61 | return self.norm.rvs( loc = loc, scale= scale, size = size) 62 | 63 | def pdf( self, x, given, scale = 1.0): 64 | """ 65 | http://darrenjw.wordpress.com/2012/06/04/metropolis-hastings-mcmc-when-the-proposal-and-target-have-differing-support/ 66 | """ 67 | return self.norm.cdf( x/scale ).prod() 68 | 69 | 70 | 71 | 72 | -------------------------------------------------------------------------------- /MonteCarlo/MCMC/mcmc.py: -------------------------------------------------------------------------------- 1 | 2 | from __future__ import division 3 | 4 | """ 5 | 6 | I'll begin with the MCMC object. 7 | It is a very general instance of a MCMC. It uses a Gaussian random walk to propose the next step. 8 | The first issue is whether to accept or reject instances that fall outside the unit cube (as copulas 9 | are only defined here), or more generally, fall out of the support of the target distribution. We bias 10 | the results if we use the acceptance ratio target(x')/target(x_n). This is because by immediatly rejecting 11 | results that are outside the support, we are using a truncated proposal distribution, and this is not 12 | symmetric. Thus in the below code, I use the ratio target(x')/target(x_n) * norm_cdf( x_n)/norm_cdf(x'). See 13 | http://darrenjw.wordpress.com/2012/06/04/metropolis-hastings-mcmc-when-the-proposal-and-target-have-differing-support/ 14 | for a full, and great, explaination. 15 | I have dynamic step size that targets a certain acceptance rate (if too many acceptances, likely 16 | not exploring the space very well, vs. too few acceptances mean likely stepping too far. See documentation in the code). 
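A typical use (mirroring mcmc_example.py) is to wrap any non-negative, possibly
unnormalised, density and then draw from it:

    mcmc = MCMC( target_pdf, dim = 2, x_0 = np.array([0.5, 0.5]) )
    samples = mcmc.rvs(1000)

where target_pdf is a placeholder for a function taking a length-2 numpy array
and returning a non-negative number.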
17 | 18 | 19 | 20 | 21 | """ 22 | 23 | import pdb 24 | import numpy as np 25 | import scipy.stats as stats 26 | import matplotlib.pyplot as plt 27 | import scipy as sp 28 | 29 | 30 | # Need a way to sample from copula 31 | # Do this using MCMC 32 | 33 | 34 | class Normal_proposal( object ): 35 | 36 | def __init__(self, ): 37 | self.norm = stats.norm 38 | 39 | def rvs(self, loc = 0, scale = 1, size =1): 40 | return self.norm.rvs( loc = loc, scale = scale, size = size ) 41 | 42 | def pdf( self, x, given, scale= 1 ): 43 | return self.norm.pdf( x-given, scale = 1).prod() #assumes independent 44 | 45 | 46 | 47 | 48 | class MCMC(object): 49 | """ 50 | Implementation of the Metropolis-Hasting algo. 51 | params: 52 | target_dist: the target_distribution, what accept a d-dim vector. 53 | proposal_dist: the proposal dist, an object with the following methods: 54 | .pdf(x, y, scale): the pdf of scale*X | y, should accept a vector 55 | .rvs(loc, scale, size) #todo 56 | 57 | x_0: a starting location 58 | burn_in: the number of burn in steps 59 | dim: the dimension of the densities. 60 | init_scale: the initial scale to start at. The algorithm uses a simple 61 | dynamic scale to target a certain acceptance ratio. 62 | 63 | methods: 64 | next() : generates and returns a random variate from the target_dist 65 | 66 | 67 | """ 68 | def __init__(self, target_dist, 69 | dim = 1, 70 | x_0 = None, 71 | burn_in = 300, 72 | init_scale = 1, 73 | proposal_dist = Normal_proposal(), 74 | verbose = True): 75 | self.target_dist = target_dist 76 | self.x = x_0 77 | self.burn_in = burn_in 78 | self.dim = dim 79 | self.uniform = stats.uniform() 80 | #self.std = 1 81 | self.proposals = 0 82 | self.accepted = 0 83 | self.proposal_dist = proposal_dist 84 | self.verbose = verbose 85 | self.std = init_scale 86 | self.array_std = self.std*np.ones(1) 87 | if x_0 == None: 88 | #initialize array 89 | self.x = np.zeros(dim) 90 | self._burn() 91 | 92 | def _normcdf(self, x_array): 93 | return proposal_dist.cdf( x_array).prod() 94 | 95 | def _modify_step(self): 96 | #lets check our acceptance rate, and aim for .234, see http://www.maths.lancs.ac.uk/~sherlocc/Publications/rwm.final.pdf 97 | opt_rate = .234 98 | epsilon = 0.05 99 | rate = self.accepted/self.proposals 100 | if rate > opt_rate + epsilon: #too many acceptance, spread out more 101 | self.std *= 1.001 102 | elif rate < opt_rate - epsilon: 103 | self.std /= 1.001 104 | 105 | self.array_std = np.append( self.array_std, self.std) 106 | return 107 | 108 | def rvs(self, n=1): 109 | #generate a new sample 110 | #An interesting bug: http://darrenjw.wordpress.com/2012/06/04/metropolis-hastings-mcmc-when-the-proposal-and-target-have-differing-support/ 111 | 112 | observations = np.empty( (n,self.dim) ) 113 | for i in range(n): 114 | accept = False 115 | tally = 0 116 | #lets keep a running tally of our acceptance rate. 
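            # The acceptance ratio computed below is the full Metropolis-Hastings ratio
            #     a = target(x') * q(x | x') / ( target(x) * q(x' | x) ),
            # not the plain target(x')/target(x), so that proposals with truncated or
            # asymmetric support do not bias the chain (see the module docstring).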
117 | while not accept: 118 | self.proposals += 1 119 | #x_new = self.x + self.std*np.random.multivariate_normal(np.zeros(self.dim), np.eye(self.dim)) 120 | x_new = self.proposal_dist.rvs(self.x, scale = self.std, size = self.dim) #this is 121 | #a = self.target_dist( x_new )/ self.target_dist( self.x) #we use the correct acceptance ratio: 122 | #a = self.target_dist( x_new)*self._normcdf( self.x)/ ( self.target_dist( self.x )*self._normcdf( x_new ) ) 123 | a = self.target_dist(x_new)*self.proposal_dist.pdf(self.x, x_new)/( self.target_dist(self.x)*self.proposal_dist.pdf( x_new, self.x) ) 124 | #print a 125 | #pdb.set_trace() 126 | if (a>=1) or ( self.uniform.rvs() < a ): 127 | accept = True 128 | self.x = x_new 129 | self.accepted +=1 130 | tally+=1 131 | if tally%150==0: 132 | print "hmm...I'm not mixing well. I've rejected 150+ samples. Try a restart? Currently at ", self.x 133 | observations[i] = self.x 134 | return observations 135 | 136 | def _burn(self): 137 | if self.verbose: 138 | print "Burn, Baby, burn. %d times."%self.burn_in 139 | for i in xrange(self.burn_in): 140 | self.rvs() 141 | self._modify_step() 142 | 143 | if self.verbose: 144 | print "Burn-in complete. Use next() to call new observations." 145 | 146 | -------------------------------------------------------------------------------- /MonteCarlo/MCMC/mcmc_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Use MCMC to sample from some copulas 3 | 4 | 5 | Given a copula, we need to find its pdf. I chose, to establish arbitrary dimensional copulas, to do 6 | this numerically. I needed to compute the copula differentiated with respect to all of its arguemnts. This 7 | was quite the algorithmic challenge, but I reduced it to a recursive problem that works blazingly fast. This 8 | felxibility allows us to never have to explicitly find the pdf, which can be difficult even for dimension > 2. 9 | The differentiation algorithm uses a central difference scheme. Unfortunatly the scheme is unstable for dimensions 10 | greater than 6. 
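In two dimensions the recursion reduces to the usual mixed central difference,
    c(u, v) ~ [ C(u+d/2, v+d/2) - C(u-d/2, v+d/2) - C(u+d/2, v-d/2) + C(u-d/2, v-d/2) ] / d**2,
which is exactly what cdf2pdf computes, and the same pattern generalises to
higher dimensions.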
11 | 12 | """ 13 | import numpy as np 14 | import scipy.stats as stats 15 | import matplotlib.pyplot as plt 16 | import scipy as sp 17 | 18 | from mcmc import * 19 | from copulas import * 20 | 21 | 22 | 23 | 24 | mcmc1 = MCMC( lambda u: cdf2pdf( arch_copula, u) , dim = 2, x_0 = np.array( [0.5, 0.5] ) ) 25 | mcmc3 = MCMC( lambda u: cdf2pdf( arch_copula, u, kwargs={"theta":3}) , dim = 2, x_0 = np.array( [0.5, 0.5] ) ) 26 | 27 | N = 1000 28 | sampleTheta1 = mcmc1.rvs( N ) 29 | sampleTheta3 = mcmc3.rvs( N ) 30 | 31 | plt.figure() 32 | 33 | plt.subplot(221) 34 | plt.scatter( sampleTheta1[:,0], sampleTheta1[:,1], alpha = 0.5) 35 | plt.title("1000 values from a Gumbel \n copula with %s=1"%r"$\theta$") 36 | 37 | plt.subplot(222) 38 | plt.scatter( sampleTheta3[:,0], sampleTheta3[:,1], alpha = 0.5 ) 39 | plt.title("1000 values from a Gumbel \n copula with %s=3"%r"$\theta$") 40 | 41 | 42 | 43 | #lets make the exponential 44 | def make_exp( u ): 45 | return -np.log(u/3)*3 46 | 47 | plt.subplot(223) 48 | plt.scatter( make_exp( sampleTheta1[:,0]) , make_exp( sampleTheta1[:,1] ), alpha = 0.5 ) 49 | plt.title("1000 EXP(3) values from a Gumbel \n copula with %s=1"%r"$\theta$") 50 | 51 | 52 | plt.subplot(224) 53 | plt.scatter( make_exp( sampleTheta3[:,0]) , make_exp( sampleTheta3[:,1] ), alpha = 0.5 ) 54 | plt.title("1000 EXP(3) values from a Gumbel \n copula with %s=3"%r"$\theta$") 55 | 56 | plt.show() 57 | 58 | 59 | mcmc1 = MCMC( lambda u: cdf2pdf( arch_copula, u, kwargs={"f":clayton, "f_inv":inv_clayton} ) , dim = 2, x_0 = np.array( [0.5, 0.5] ) ) 60 | mcmc3 = MCMC( lambda u: cdf2pdf( arch_copula, u, kwargs={"theta":5, "f":clayton, "f_inv":inv_clayton}) , dim = 2, x_0 = np.array( [0.5, 0.5] ) ) 61 | 62 | 63 | dataTheta1 = mcmc1.rvs( N ) 64 | 65 | dataTheta3 = mcmc3.rvs( N ) 66 | 67 | plt.figure() 68 | 69 | plt.subplot(221) 70 | plt.scatter( dataTheta1[:,0], dataTheta1[:,1], alpha = 0.5 ) 71 | plt.title("1000 values from a Clayton \n copula with %s=1"%r"$\theta$") 72 | 73 | plt.subplot(222) 74 | plt.scatter( dataTheta3[:,0], dataTheta3[:,1], alpha = 0.5 ) 75 | plt.title("1000 values from a Clayton \n copula with %s=5"%r"$\theta$") 76 | 77 | 78 | 79 | #lets make the exponential 80 | def make_exp( u ): 81 | return -np.log(u) 82 | 83 | plt.subplot(223) 84 | plt.scatter( make_exp( dataTheta1[:,0]) , make_exp( dataTheta1[:,1] ), alpha = 0.5 ) 85 | plt.title("1000 EXP(1) values from a Clayton\n copula with %s=1"%r"$\theta$") 86 | 87 | 88 | plt.subplot(224) 89 | plt.scatter( make_exp( dataTheta3[:,0]) , make_exp( dataTheta3[:,1] ), alpha = 0.5 ) 90 | plt.title("1000 EXP(1) values from a Clayton\n copula with %s=5"%r"$\theta$") 91 | 92 | plt.show() -------------------------------------------------------------------------------- /MonteCarlo/grammschmidt.py: -------------------------------------------------------------------------------- 1 | # author iizukak, 2011 2 | # author cam davidson-pilon, 2012 3 | 4 | import numpy as np 5 | import pdb 6 | def gs_cofficient(v1, v2): 7 | return np.dot(v2, v1) / np.dot(v1, v1) 8 | 9 | def multiply(cofficient, v): 10 | return map((lambda x : x * cofficient), v) 11 | 12 | def proj(v1, v2): 13 | return multiply(gs_cofficient(v1, v2) , v1) 14 | 15 | def gs(X): 16 | """ 17 | performs the Gramm-Shmidt process to orthonormalize a a matrix of vectors. 18 | X: vectors to orthonormalize are rows. 19 | Returns Y, same shape as X, and with orthonormal rows. 
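    Example (illustrative):
        X = np.random.randn(5, 5)
        Y = gs(X)
        # for a full-rank X, the rows of Y are orthonormal, so Y.dot(Y.T) is
        # numerically the identity matrix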
20 | """ 21 | Y = np.zeros_like(X) 22 | for i in range(len(X)): 23 | temp_vec = X[i] 24 | for j in range(i) : 25 | proj_vec = proj(Y[j,:], X[i]) 26 | temp_vec = temp_vec - proj_vec 27 | Y[i,:] = temp_vec/np.sqrt( np.dot( temp_vec,temp_vec ) ) 28 | return Y 29 | 30 | 31 | -------------------------------------------------------------------------------- /MonteCarlo/sample_normal_given_projection.py: -------------------------------------------------------------------------------- 1 | """ 2 | This function generates samples N from N( mu, Simga ) such that N'*nu = x ie. samples 3 | N | N*nu = x (which is still normal btw). 4 | 5 | Note that actually mu is useless. 6 | 7 | """ 8 | import numpy as np 9 | def sample_normal_given_projection( covariance, x, lin_proj, n_samples=1): 10 | """ 11 | parameters: 12 | x: the value s.t. lin_proj*N = x; scalar 13 | lin_proj: the vector to project the sample unto (n,) 14 | covariance: the covariance matrix of the unconditional samples (nxn) 15 | n_samples: the number of samples to return 16 | 17 | returns: 18 | ( n x n_samples ) numpy array 19 | 20 | """ 21 | variance = np.dot( np.dot( lin_proj.T, covariance), lin_proj ) 22 | 23 | #normalize our variables s.t. lin_proj*N is N(0,1) 24 | 25 | sigma_lin = np.dot(covariance, lin_proj[:,None]) 26 | cond_mu = ( sigma_lin.T*x/variance ).flatten() 27 | cond_covar = covariance - np.dot( sigma_lin, sigma_lin.T )/ variance 28 | 29 | _samples = np.random.multivariate_normal( cond_mu, cond_covar, size = (n_samples) ) 30 | return ( _samples ) 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /MonteCarlo/sample_psd.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from grammschmidt import gs 3 | import warnings 4 | import scipy.stats as stats 5 | 6 | def sample_pd_matrix( dim, avg_variance = 1, diag = np.array([]) ): 7 | """ 8 | avg_variance = the average variance, scalar. 9 | dim: the dimension of the sampled covariance matrix 10 | diag: enter a dim-dimensional vector to use as the diagonal eigenvalues elements. 11 | """ 12 | 13 | #create an orthonormal basis 14 | Ob = gs(np.random.randn( dim,dim ) ) 15 | if not diag.any(): 16 | """ 17 | This uses the fact that the sum of varinaces/n == Trace(A)/n == sum of eigenvalues/n ~= E[ Gamma(1, 1/avg_variance) ] = avg_variance 18 | """ 19 | diag = stats.gamma.rvs( 1, scale = avg_variance, size = ( (dim,1) ) ) 20 | else: 21 | diag = diag.reshape( (dim,1) ) 22 | return np.dot( Ob.T*diag.T, Ob ) 23 | 24 | 25 | def return_lower_elements(A): 26 | n = A.shape[0] 27 | t = [ (i,j) for j in range(0,n) for i in range(j+1,n) ] 28 | return np.array( [A[x] for x in t] ) 29 | 30 | 31 | def deprecated(func): 32 | '''This is a decorator which can be used to mark functions 33 | as deprecated. It will result in a warning being emitted 34 | when the function is used.''' 35 | def new_func(*args, **kwargs): 36 | warnings.warn("Call to deprecated function {}.".format(func.__name__), 37 | category=DeprecationWarning) 38 | return func(*args, **kwargs) 39 | new_func.__name__ = func.__name__ 40 | new_func.__doc__ = func.__doc__ 41 | new_func.__dict__.update(func.__dict__) 42 | return new_func 43 | 44 | @deprecated 45 | def generate_pd_matrix( dim, avg_covariance=0, avg_variance = 0, diag=np.array([]) ): 46 | """ 47 | Currently unstable for dim > 25. I would not use. 48 | 49 | 50 | This uses Sylvester's criterion to create n-dim covariance (PSD) matrices. 
51 | To make correlation matrices, specify the diag parameters to be an array of all ones. 52 | parameters: 53 | avg_covariance: is added to a Normal(0,1) observation for each covariance. 54 | So, the sample mean of all covariances should be avg_covariance. 55 | dim: the dimension of the sampled covariance matrix 56 | diag: enter a dim-dimensional vector to use as the diagonal elements. 57 | 58 | """ 59 | invA = None 60 | M = np.zeros( (dim,dim) ) 61 | for i in xrange( 0, dim ): 62 | A = M[:i,:i] 63 | b_flag = False 64 | while not b_flag: 65 | #generate a variance and covariance array 66 | variance = diag[i] if diag.any() else avg_variance + np.abs( np.random.randn(1) ) 67 | covariance = (avg_covariance + np.random.randn(i)) #for stability 68 | #pdb.set_trace() 69 | #Using Danny's algorithm 70 | if i > 0: 71 | c = variance*np.random.rand(1) # > 0, < variance 72 | _lambda = np.dot( np.dot( covariance[:,None].T, invA), covariance[:,None] )[0] +1 73 | print _lambda 74 | covariance = (np.sqrt(c)/np.sqrt(_lambda))*covariance.T 75 | 76 | 77 | #check if det > 0 of matrix | A cov | 78 | # | cov var | 79 | 80 | 81 | if i==0 or _lambda > 0: 82 | b_flag = True 83 | M[i, :i] = covariance 84 | M[:i, i] = covariance 85 | M[i,i] = variance 86 | 87 | if i > 0: 88 | invA = invert_block_matrix_CASE1( A , covariance, variance, invA) 89 | #invA = np.linalg.inv( M[:i+1,:i+1]) 90 | else: 91 | invA = 1.0/M[i,i] 92 | 93 | return M 94 | 95 | 96 | 97 | def invert_block_matrix_CASE1( A, b, c, invA = None): 98 | """ 99 | Inverts the matrix | A b | 100 | | b' c | 101 | 102 | where A is (n,n), b is (n,) and c is a scalar 103 | P,lus if you know A inverse already, add it to make computations easier. 104 | This is quicker for larger matrices. How large? 105 | 106 | """ 107 | 108 | n = A.shape[0] 109 | if n == 1 and A[0] != 0: 110 | invA = 1.0/A 111 | if b.shape[0] == 0: 112 | return 1.0/A 113 | 114 | if invA == None: 115 | invA = np.linalg.inv(A) 116 | 117 | inverse = np.zeros( (n+1, n+1) ) 118 | k = c - np.dot( np.dot( b, invA), b ) 119 | 120 | inverse[ :n, :n] = invA + np.dot( np.dot( invA, b[:,None]), np.dot( b[:,None].T, invA) )/k 121 | inverse[n, n] = 1/k 122 | inverse[:n, n] = -np.dot(invA,b)/k 123 | inverse[n, :n] = -np.dot(invA,b)/k 124 | return inverse 125 | 126 | 127 | -------------------------------------------------------------------------------- /MonteCarlo/sampling_methods.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | """ 4 | I simulated 10000 variables with CDF F(y) = 1/3(y**5 + y**2 + y) using a acceptance rejection scheme and the inversion method. While both 5 | where fast, AR was much faster than the inversion method, even using a poor sampling scheme. The sampler I used for the AR method 6 | a M*Uniform distribution, where M = max_y 1/3( 5*y**4 + 2*y + 1). This bounded the pdf of Y. My output is below: 7 | 8 | >> Testing AR Method. 9 | >> Generate 10000 variables: 10 | >> Mean: 0.669, time taken: 0.13 seconds 11 | 12 | >> Testing Inverse Method. 13 | >> Generate 10000 variables: 14 | >> Mean: 0.669, time taken: 32.76 seconds 15 | 16 | """ 17 | 18 | 19 | 20 | 21 | import numpy as np 22 | import time 23 | import scipy.stats as stats 24 | from scipy.optimize import fsolve 25 | 26 | class AR_method(object): 27 | def __init__(self, target_f, sample_g, M): 28 | """ 29 | M: the constant s.t. sample_g*M >= target_f for all x 30 | sample_g: a scipy.stats frozen random variable. 
31 | target_f: a 1-d integrable, positive function 32 | """ 33 | self.target_f = target_f 34 | self.sample_g = sample_g 35 | self.uniform = stats.uniform 36 | self.M = M 37 | 38 | def generate(self,n=1): 39 | 40 | rv = np.zeros( n) 41 | i=0 42 | #recursivly call this. 43 | while i0 92 | # TODO 93 | sample = np.empty( (1, self.len_trials) ) 94 | for i,k in enumerate(K): 95 | substr = self._sample_conditional( X[i] ) 96 | pass 97 | 98 | def sample_conditional(self, k, x, negate=False): 99 | #Sample the process, but at position k, put x (or put NOT x). 100 | sample = np.empty( (1, self.len_trials) ) 101 | negate = int(negate) #0 or 1 102 | sample[0,0] = np.argmax( np.random.multinomial( 1, self.init_probs_estimate ) ) 103 | for i in range(1, k + negate): 104 | A = np.linalg.matrix_power( self.trans_probs_estimate, k-i ) 105 | if not negate: 106 | p = self.trans_probs_estimate[ sample[0,i-1], :]*A[:, x ] 107 | else: 108 | p = self.trans_probs_estimate[ sample[0,i-1], :]*(1-A[:, x ]) 109 | 110 | p = self._normalize(p) 111 | sample[0, i] = np.argmax( np.random.multinomial( 1, p ) ) 112 | 113 | if not negate: 114 | sample[0, k] = x 115 | 116 | 117 | for i in range(k+ 1, self.len_trials): 118 | sample[0, i] = np.argmax(np.random.multinomial( 1, self.trans_probs_estimate[ sample[0,i-1],: ] ) ) 119 | return sample 120 | 121 | 122 | 123 | def _fit_init(self,data, encoded): 124 | 125 | if not encoded: 126 | if not self.encoding: 127 | self.encoding = encoding.EncodingScheme() 128 | data = self.encoding.encode(data) 129 | 130 | 131 | self.number_of_series = 0 132 | self.data = data 133 | self.unique_elements = np.arange( len( self.encoding.unique_bins) )[None, :] 134 | self.len_trials = self.encoding.series_length 135 | 136 | #self.n_trials, self.len_trials = data #iterators do not have a defined shape. This might have to be done on the fly. 
137 | self.init_probs_estimate = np.zeros( self.unique_elements.shape[1], dtype="int" ) 138 | self.trans_probs_estimate = np.zeros( (self.unique_elements.shape[1], self.unique_elements.shape[1]), dtype="int" ) 139 | 140 | 141 | 142 | 143 | 144 | -------------------------------------------------------------------------------- /NumericalDerivatives/diff.py: -------------------------------------------------------------------------------- 1 | #numerical high-dim derivatives 2 | import numpy as np 3 | from decimal import Decimal 4 | import decimal 5 | 6 | 7 | class memorize(object): 8 | def __init__(self, func): 9 | self.func = func 10 | self.cache = {} 11 | 12 | def __call__(self, *args): 13 | u = args[1] 14 | print u 15 | ustr = u.tostring() 16 | try: 17 | return self.cache[ustr] 18 | except: 19 | self.cache[ustr] = self.func(*args) 20 | return self.cache[ustr] 21 | 22 | 23 | def __repr__(self): 24 | return self.func.__doc__ 25 | 26 | def _pdf(f, u, delta = 0.001 ): 27 | n = u.shape[0] 28 | if n==1: 29 | t= f(u[0]+delta/2) - f(u[0]-delta/2) 30 | return t 31 | else: 32 | f_plus = lambda *x: f( u[0] + delta/2, *x) 33 | f_minus = lambda *x: f( u[0] - delta/2, *x) 34 | return _pdf(f_plus, u[1:], delta ) - _pdf(f_minus, u[1:], delta ) 35 | 36 | 37 | def _pdfOrder4(f, u, delta = 0.001 ): 38 | n = u.shape[0] 39 | if n==1: 40 | t= ( f(u[0]+delta/2) )- ( f(u[0]-delta/2) ) 41 | return t 42 | else: 43 | f_plus1 = lambda *x: f( u[0] + delta/2, *x) 44 | f_plus2 = lambda *x: f( u[0] + delta, *x) 45 | f_minus1 = lambda *x: f( u[0] - delta/2, *x) 46 | f_minus2 = lambda *x: f( u[0] - delta, *x) 47 | p = -_pdfOrder4(f_plus2, u[1:], delta ) + 8*_pdfOrder4(f_plus1, u[1:], delta) \ 48 | - 8*n(f_minus1, u[1:], delta ) + _pdfOrder4(f_minus2, u[1:], delta )/6 49 | return p 50 | 51 | def cdf2pdf( f, u, delta=0.001, kwargs={} ): 52 | """numerically unstable for large dimensions""" 53 | def _wrapper(*args): 54 | u = np.array(args) 55 | return f(u, **kwargs) 56 | n = u.shape[0] 57 | p= _pdf( _wrapper, u, delta) 58 | return np.exp( np.log(p) - n*np.log( delta ) ) 59 | #return p / delta**n 60 | 61 | -------------------------------------------------------------------------------- /NumericalDerivatives/diff.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/CamDavidsonPilon/Python-Numerics/043ab4ad9003325c6270486b24d163933e0c7e8a/NumericalDerivatives/diff.pyc -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | A collection of numerical python recipes 2 | ======================================== 3 | 4 | ###Damerau-Lenenshtein Distance 5 | An implementation of the DL distance, used to measure the "distance" (as defined by number of deletions, substitutions, transpositions and insertions) between two strings. I use it to detect swear words and their misspellings. 6 | 7 | 8 | ###Discrete Option Pricing 9 | Contains functions and classes to compute financial derivatives using discrete pricing theory. Mostly recursion. 10 | 11 | ###Discrete SDE 12 | Robust classes/methods to simulate stochastic differential equations using a discretization scheme. Includes Euler, Milstein and Second-Order scheme. To be implemented into PyProcess. 13 | 14 | ###Estimators 15 | Some useful estimators of regression and others. 16 | 17 | ###Machine Learning Scikit Learn 18 | Some scikit-learn-friendly machine learning classes. 
19 | 20 | ###Monte Carlo 21 | A collection of tools to sample from a variety of distributions and evaluating integrals. Include bivariate copula sampling, markov chain monte carlo, and numerical integration (with variance reduction support). 22 | 23 | ###Multinomial Markov And Encoding 24 | Create a multinomial markov chain (plus some awesome sampling and conditional sampling algos) from encoded data. See my [password analysis](http://www.camdp.com/blogs/modeling-password-creation) for a use and creation of it. 25 | 26 | ###Numerical Derivatives 27 | Compute the derivative of functions a points using discrete schemes. Has a great recursive solution to solving problem: 28 | >> Given a multivariate CDF, how can I computationally, and efficiently, find its pdf? 29 | 30 | This problem occurs in copula sampling often. 31 | 32 | 33 | ###Time Series 34 | Some time series helpers and utilities 35 | 36 | ###utils 37 | Some nice utils to have around. 38 | 39 | 40 | 41 | 42 | Author: 43 | Cameron Davidson-Pilon 44 | camdp.com 45 | 46 | Contact me at: 47 | cam.davidson.pilon@gmail.com 48 | @cmrndp 49 | -------------------------------------------------------------------------------- /TimeSeries/MASE.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | 4 | 5 | def MASE(training_series, testing_series, prediction_series): 6 | """ 7 | Computes the MEAN-ABSOLUTE SCALED ERROR forcast error for univariate time series prediction. 8 | 9 | See "Another look at measures of forecast accuracy", Rob J Hyndman 10 | 11 | parameters: 12 | training_series: the series used to train the model, 1d numpy array 13 | testing_series: the test series to predict, 1d numpy array or float 14 | prediction_series: the prediction of testing_series, 1d numpy array (same size as testing_series) or float 15 | absolute: "squares" to use sum of squares and root the result, "absolute" to use absolute values. 16 | 17 | """ 18 | print "Needs to be tested." 
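    # d below is the in-sample mean absolute error of the naive one-step forecast
    # on the training series; the test errors are scaled by it, so MASE < 1 means
    # the forecast beats the naive method on average.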
19 | n = training_series.shape[0] 20 | d = np.abs( np.diff( training_series) ).sum()/(n-1) 21 | 22 | errors = np.abs(testing_series - prediction_series ) 23 | return errors.mean()/d -------------------------------------------------------------------------------- /TimeSeries/risk_measures.py: -------------------------------------------------------------------------------- 1 | #risk measures 2 | 3 | import scipy.stats as stats 4 | from scipy.optimize import fsolve 5 | import numpy as np 6 | 7 | 8 | 9 | def VaR(ts, alpha, flavour): 10 | if flavour == "historical": 11 | temp_ts = ts.copy() 12 | temp_ts.sort() 13 | n = len( temp_ts) 14 | try: 15 | return -temp_ts.values[ np.floor( (1-alpha)*n ) ] 16 | except: 17 | return -temp_ts[ np.floor( (1-alpha)*n ) ] 18 | 19 | elif flavour == "t": 20 | t = stats.t 21 | t = stats.t( *t.fit( ts ) ) 22 | return -t.ppf( 1-alpha ) 23 | 24 | elif flavour == "normal": 25 | mean = ts.mean() 26 | std = ts.std() 27 | return -stats.norm.ppf( 1-alpha, mean, std ) 28 | elif flavour == "Cornish-Fischer": 29 | z_c = -stats.norm.ppf( 1-alpha, 0 ,1) 30 | S = stats.skew(ts) 31 | K = stats.kurtosis(ts) 32 | z_cf = z_c + (z_c**2-1)*S/6 + (z_c**3- 3*z_c)*K/24 + (2*z_c**3-5*z_c)*S**2/36 33 | return ts.mean() - z_cf*np.sqrt( ts.std() ) 34 | 35 | elif flavour == "kernel": 36 | kde = stats.gaussian_kde( ts ) 37 | print kde.factor 38 | 39 | f = lambda x: kde.integrate_box_1d(-1, x) - (1-alpha) 40 | return -fsolve( f, -0.05)[0] 41 | 42 | 43 | 44 | def ES( ts ,alpha, flavour="historical"): 45 | var = VaR( ts, alpha, flavour) 46 | n_simulations = 200000 47 | if flavour=="historical": 48 | return -ts[( ts < -var )].mean() 49 | 50 | elif flavour == "normal": 51 | mean = ts.mean() 52 | std = ts.std() 53 | norm = stats.norm( mean, std ) 54 | samples = -norm.rvs( n_simulations ) 55 | 56 | return samples[ var <= samples ].mean() 57 | 58 | elif flavour == "t": 59 | t = stats.t 60 | t = stats.t( *t.fit( ts ) ) 61 | samples = -t.rvs( n_simulations ) 62 | return samples[var <=samples ].mean() 63 | 64 | elif flavour == "kernel": 65 | kde = stats.gaussian_kde(ts) 66 | samples = -kde.resample(n_simulations) 67 | return samples[ var<= samples].mean() -------------------------------------------------------------------------------- /TimeSeries/utils.py: -------------------------------------------------------------------------------- 1 | #time series utils 2 | 3 | 4 | 5 | def MASE(training_series, testing_series, prediction_series): 6 | """ 7 | Computes the MEAN-ABSOLUTE SCALED ERROR forcast error for univariate time series prediction. 8 | 9 | See "Another look at measures of forecast accuracy", Rob J Hyndman 10 | 11 | parameters: 12 | training_series: the series used to train the model, 1d numpy array 13 | testing_series: the test series to predict, 1d numpy array or float 14 | prediction_series: the prediction of testing_series, 1d numpy array (same size as testing_series) or float 15 | absolute: "squares" to use sum of squares and root the result, "absolute" to use absolute values. 16 | 17 | """ 18 | print "Needs to be tested." 19 | n = training_series.shape[0] 20 | d = np.abs( training_series.diff() ).sum()/(n-1) 21 | 22 | errors = np.abs(testing_series - prediction_series ) 23 | return errors.mean()/d -------------------------------------------------------------------------------- /pyMC/LinearRegressionWithLoss.py: -------------------------------------------------------------------------------- 1 | #Least squares with penalty on wrong sign. 
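#The script fits a Bayesian linear regression with pymc and, rather than reporting
#the posterior-mean (least-squares) prediction, chooses each prediction to minimize
#the expected value of the asymmetric loss defined below over posterior samples:
#the loss charges a quadratic penalty (scaled by coef) whenever the prediction and
#the true value have opposite signs, and plain absolute error otherwise.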
2 | 3 | import matplotlib 4 | matplotlib.use("Agg") 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import pymc as mc 8 | import scipy.optimize as sop 9 | 10 | def sign(x): 11 | return -1 if x<0 else 1 12 | 13 | def loss( y, yhat, coef = 100): 14 | """vectorized""" 15 | sol = np.zeros_like(y) 16 | ix = y*yhat < 0 17 | sol[ix] = coef*yhat**2 - sign(y[ix])*yhat + abs(y[ix]) 18 | sol[ ~ix ] = abs( y[~ix] - yhat ) 19 | return sol 20 | 21 | 22 | #generate some artifical data 23 | size = 250 24 | beta = 0.4 25 | alpha = 0.0 26 | 27 | X = np.random.randn( size ) 28 | Y = beta*X + alpha + np.random.randn( size ) 29 | 30 | 31 | 32 | # Form the bayesian analysis. 33 | prec = mc.Uniform( "prec", 0, 100 ) 34 | beta_0 = mc.Normal( "beta", 0, 0.0001 ) 35 | alpha_0 = mc.Normal( "alpha", 0, 0.0001 ) 36 | 37 | 38 | @mc.deterministic 39 | def mean( X = X, alpha_0 = alpha_0, beta_0 = beta_0 ): 40 | return alpha_0 + beta_0*X 41 | 42 | to_predict_x = np.linspace( -10, 10, 100) 43 | 44 | 45 | obs = mc.Normal( "obs", mean, prec, value = Y, observed = True) 46 | 47 | model = mc.Model( {"obs":obs, "beta_0":beta_0, "alpha_0":alpha_0, "prec":prec} ) 48 | mcmc = mc.MCMC( model ) 49 | 50 | n_samples = 100000 51 | burnin = 50000 52 | mcmc.sample( burnin + n_samples, burnin) 53 | mean_alpha_0 = mcmc.alpha_0.stats()["mean"] #correspondes to the least squares estimate 54 | mean_beta_0 = mcmc.beta_0.stats()["mean"] #correspondes to the least squares estimate 55 | ls_prediction = mean_alpha_0 + mean_beta_0*to_predict_x 56 | 57 | 58 | alpha_trace = mcmc.alpha_0.trace.gettrace() 59 | beta_trace = mcmc.beta_0.trace.gettrace() 60 | rprec = [1.0/np.sqrt(prec.random()) for i in range(n_samples ) ] 61 | norm_samples = rprec*np.random.randn(n_samples) 62 | 63 | 64 | v = np.zeros_like( to_predict_x) 65 | for i,x in enumerate(to_predict_x): 66 | post_samples = norm_samples + (alpha_trace + beta_trace*x) 67 | tomin = lambda yhat: loss( post_samples, yhat).mean() 68 | v[i] = sop.fmin( tomin, ls_prediction[i] ) 69 | 70 | print v 71 | 72 | #nice plots 73 | plt.figure() 74 | plt.plot( to_predict_x, ls_prediction, lw =2, label = "Least squares prediction", c="k" ) 75 | plt.plot( to_predict_x, v, lw = 2, label = "Bayesian Loss-optimized prediction", c= "r") 76 | plt.scatter( X, Y, alpha = 0.4 ) 77 | plt.legend() 78 | plt.title("Least squares predictions vs \n Bayesian Loss-optimized predictions") 79 | plt.xlim(-7, 7) 80 | plt.ylim(-5, 5) 81 | plt.savefig( "LossOptII.png" ) 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /pyMC/SmallSample.py: -------------------------------------------------------------------------------- 1 | import pymc as mc 2 | import numpy as np 3 | 4 | #data 5 | X = 5 6 | Y = 10 7 | 8 | #rate = mc.Exponential("rate", 1 ) #priors on N 9 | N = mc.Poisson( "N", 20, value = max(X,Y) ) 10 | #N = mc.Uninformative("N", value = max(X,Y) ) 11 | 12 | 13 | pX = mc.Beta("pX", 1,1) #uniform priors 14 | pY = mc.Beta("pY", 1,1 ) 15 | 16 | 17 | observed = mc.Binomial("obs", p = np.array( [pX, pY] ), n = N, value = np.array( [X,Y] ), observed = True ) 18 | 19 | 20 | -------------------------------------------------------------------------------- /pyMC/TableGame.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | #Alice vs Bob in table game 3 | 4 | import random 5 | max_simulations = 1e6 6 | 7 | simulation = 0 8 | wins_alice = 0 9 | wins_bob = 0 10 | 11 | 12 | while simulation < max_simulations: 13 
| #draw random p 14 | p = random.random() 15 | #draw eight trials 16 | Alice_wins = sum( [ random.random() < p for i in range(8) ] ) 17 | if Alice_wins == 5: 18 | simulation += 1 19 | #This is case of 5vs3 in 8th round, lets check who wins by drawing three more 20 | if any( [ random.random() < p for i in range(3) ] ): 21 | wins_alice +=1 22 | else: 23 | wins_bob +=1 24 | 25 | 26 | print "Proportion of Alice wins: %.3f."%( wins_alice/max_simulations ) -------------------------------------------------------------------------------- /pyMC/blowflies.py: -------------------------------------------------------------------------------- 1 | """ 2 | See 3 | 4 | BAYESIAN INFERENCE AND MARKOV CHAIN MONTE CARLO BY EXAMPLE 5 | GEOFFK. NICHOLLS 6 | 7 | """ 8 | import numpy as np 9 | import pymc as mc 10 | import pandas as pd 11 | 12 | #observations 13 | data = pd.read_csv("blowfly97I.csv") 14 | yt = data["total"].value 15 | 16 | N = t.shape[0] 17 | 18 | r = mc.Exponential( "r", beta = 1.0 ) 19 | b = mc.Exponential( "b", beta = 1000.0 ) 20 | lambduh = mc.Exponential( "lambdu", beta = 1.0/1000 ) 21 | n_0 = mc.Poisson( "n_0", mu=lambduh) 22 | 23 | 24 | @mc.deterministic 25 | def n_t( n_0=n_0, r=r, b=b, N=N): 26 | n = np.empty( N, dtype=object) 27 | n[0] = n_0 28 | for i in range( 1, N): 29 | n[i] = (r*n[i-1])/( 1.0 + b**4*n[i-1]**4 ) 30 | return n 31 | 32 | y = np.empty( N, dtype=object) 33 | for i in range(0, N): 34 | y[i] = mc.Poisson( "y_%i"%i, mu = n_t[i], observed= True, value = yt[i] ) 35 | 36 | model = mc.Model( {"yt":yt, "nt":n_t, "b":b, "r":r, "n_0":n_0}) 37 | mcmc= mc.MCMC(model) 38 | 39 | mcmc.sample( 30000, 15000) 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | -------------------------------------------------------------------------------- /pyMC/mixtureNormals.py: -------------------------------------------------------------------------------- 1 | from pymc import * 2 | 3 | size = 20 4 | p = Uniform( "p", 0 , 1) 5 | 6 | ber = Bernoulli( "ber", p = p, size = size) 7 | 8 | precision = Gamma('precision', alpha=0.1, beta=0.1) 9 | 10 | mean1 = Normal( "mean1", 0, 0.001 ) 11 | mean2 = Normal( "mean2", 0, 0.001 ) 12 | 13 | @deterministic 14 | def mean( ber = ber, mean1 = mean1, mean2 = mean2): 15 | return ber*mean1 + (1-ber)*mean2 16 | 17 | 18 | #generate some artifical data 19 | v = np.random.randint( 0, 2, size) 20 | data = v*(10+ np.random.randn(size) ) + (1-v)*(-10 + np.random.randn(size ) ) 21 | 22 | 23 | obs = Normal( "obs", mean, precision, value = data, observed = True) 24 | 25 | model = Model( {"p":p, "precision": precision, "mean1": mean1, "mean2":mean2, "obs":obs} ) -------------------------------------------------------------------------------- /utils/contour_irregular_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | contour_irregular_data.py 3 | 4 | This module/function lets you plot irregularlly spaced data by using an interpolation scheme 5 | 6 | Code taken/hacked modified from http://www.scipy.org/Cookbook/Matplotlib/Gridding_irregularly_spaced_data 7 | """ 8 | 9 | 10 | import numpy as np 11 | from scipy.interpolate import griddata 12 | import matplotlib.pyplot as plt 13 | 14 | 15 | def contour(x,y,z, linewidth = 2, labels = None): 16 | """ 17 | Plots contours for non-evenly spaced data. 18 | x,y,z must be 1d arrays. 
19 | lines = # of contour lines (default 18 ) 20 | linewidth = line width of lines (default 2 ) 21 | """ 22 | 23 | assert x.shape[0] == y.shape[0] == z.shape[0], "arrays x,y,z must be the same size" 24 | 25 | #make a grid that surrounds x,y support 26 | xi = np.linspace(x.min(),x.max(),100) 27 | yi = np.linspace(y.min(),y.max(),100) 28 | # grid the data. 29 | zi = griddata((x, y), z, (xi[None,:], yi[:,None]), method='cubic') 30 | # contour the gridded data, plotting dots at the randomly spaced data points. 31 | plt.figure() 32 | CS = plt.contour(xi,yi,zi,linewidth=2) 33 | plt.clabel(CS, inline=1, fontsize=10) 34 | 35 | if labels: 36 | plt.xlabel(labels[0]) 37 | plt.ylabel(labels[1]) 38 | # plot data points. 39 | plt.scatter(x,y,c=z,s=60, alpha = 0.7, edgecolors = "none") 40 | plt.xlim(x.min(),x.max()) 41 | plt.ylim(y.min(),y.max()) 42 | plt.show() 43 | -------------------------------------------------------------------------------- /utils/cov2corr.py: -------------------------------------------------------------------------------- 1 | """ 2 | covariance matrix to correlation matrix. 3 | """ 4 | 5 | 6 | 7 | def cov2corr( A ): 8 | """ 9 | covariance matrix to correlation matrix. 10 | """ 11 | d = np.sqrt(A.diagonal()) 12 | A = ((A.T/d).T)/d 13 | #A[ np.diag_indices(A.shape[0]) ] = np.ones( A.shape[0] ) 14 | return A -------------------------------------------------------------------------------- /utils/dataframe_pairwise_feature_gen.py: -------------------------------------------------------------------------------- 1 | 2 | from itertools import combinations_with_replacement, combinations 3 | 4 | def create_pairwise_data( df, ignore = [], squares = True): 5 | """ 6 | df: a dataframe 7 | ignore: an iterable of columns to not make quad features out of. 8 | 9 | returns: 10 | a copied dataframe with quadratic features, including squares of variables if squares == True. 11 | 12 | """ 13 | n,d = df.shape 14 | columns = df.columns.diff( ignore ) 15 | 16 | df = df.copy() 17 | 18 | iterator = combinations_with_replacement if squares else combinations 19 | 20 | for x,y in iterator( columns, 2): 21 | df[ x + "__times__" + y ] = df[x]*df[y] 22 | 23 | 24 | return df 25 | -------------------------------------------------------------------------------- /utils/jarquebera_test.py: -------------------------------------------------------------------------------- 1 | #jaque-berra test 2 | 3 | import scipy.stats as stats 4 | 5 | def JarqueBeraTest(data,significance = 0.95): 6 | """ 7 | If the data come from a normal distribution, the JB statistic asymptotically has a chi-squared distribution with two degrees of freedom, 8 | so the statistic can be used to test the hypothesis that the data are from a normal distribution. 9 | 10 | """ 11 | n = data.shape[0] 12 | if n < 2000: 13 | print "Warning: JarqueBera tests works best with large sample sizes (> ~2000 )." 14 | 15 | S = float(n)/6*( stats.skew(data)**2 + 0.25*(stats.kurtosis( data, fisher=True) )**2) 16 | t = stats.chi2(2).ppf( significance ) 17 | if S < t: 18 | print "Not enough evidence to reject as non-Normal according to the Jarque-Bera test. 
S = %.4f < %.4f"%(S,t) 19 | else: 20 | print "Reject that is Normal according to the Jarque-Bera test; S = %.4f > %.4f"%(S,t) 21 | -------------------------------------------------------------------------------- /utils/kaggleDataSet.py: -------------------------------------------------------------------------------- 1 | #Author: John Ramney 2 | 3 | import requests 4 | 5 | # The direct link to the Kaggle data set 6 | data_url = 'http://www.kaggle.com/c/digit-recognizer/download/train.csv' 7 | 8 | # The local path where the data set is saved. 9 | local_filename = "train.csv" 10 | 11 | # Kaggle Username and Password 12 | kaggle_info = {'UserName': "my_username", 'Password': "my_password"} 13 | 14 | # Attempts to download the CSV file. Gets rejected because we are not logged in. 15 | r = requests.get(data_url) 16 | 17 | # Login to Kaggle and retrieve the data. 18 | r = requests.post(r.url, data = kaggle_info, prefetch = False) 19 | 20 | # Writes the data to a local file one chunk at a time. 21 | f = open(local_filename, 'w') 22 | for chunk in r.iter_content(chunk_size = 512 * 1024): # Reads 512KB at a time into memory 23 | if chunk: # filter out keep-alive new chunks 24 | f.write(chunk) 25 | f.close() 26 | -------------------------------------------------------------------------------- /utils/linked_list.py: -------------------------------------------------------------------------------- 1 | 2 | def reverse( list_head, previous = None): 3 | """ 4 | assume .next is present 5 | """ 6 | if not list_head.next: 7 | list_head.next = previous 8 | return 9 | else: 10 | reverse( list_head.next, list_head ) 11 | list_head.next = previous if previous else None 12 | 13 | 14 | class Linked( object ): 15 | 16 | def __init__(self, next, value ): 17 | self.next = next 18 | self.value = value 19 | 20 | 21 | C = Linked( None, "c") 22 | B = Linked( C, "b") 23 | A = Linked( B, "a") 24 | -------------------------------------------------------------------------------- /utils/lyungbox_test.py: -------------------------------------------------------------------------------- 1 | #lyungBoxTest 2 | 3 | 4 | import numpy.ma as ma 5 | import numpy as np 6 | import scipy.stats as stats 7 | import scipy.stats.mstats as mstats 8 | 9 | 10 | def LyungBoxTest(ts, tested_lag, significance = 0.95 ): 11 | """ 12 | ts: a time series. 13 | tested_lag: is the lag being tested, but must be an int. 14 | """ 15 | tested_lag = int(tested_lag) 16 | f_ts = ts 17 | f_ts = f_ts - f_ts.mean() 18 | n = f_ts.shape[0] 19 | Q = 0 20 | for i in range(1, tested_lag+1 ): 21 | lagged_f_ts = f_ts.shift(i) 22 | m_f_ts = ma.masked_array( lagged_f_ts, mask = np.isnan( lagged_f_ts ) ) 23 | Q += mstats.pearsonr( f_ts, m_f_ts)[0]**2/(n-i) 24 | 25 | Q = Q*n*(n+2) 26 | t = stats.chi2(tested_lag).ppf( significance ) 27 | if Q < t: 28 | print "%d | Not enough evidence to reject Null: Q = %.4f < %.4f"%(tested_lag, Q,t) 29 | #print "Not enough evidence to reject "+func.__name__ + " " + series_name+" as not %d autocorrelated according to the Lyung Box test. 
--------------------------------------------------------------------------------
/utils/mean_average_precision.py:
--------------------------------------------------------------------------------
1 | # mean average precision
2 | """
3 | % This function computes the average precision of predicted values. The
4 | % average precision is hella-confusing at first glance. Here's what Kaggle
5 | % has to say:
6 | %
7 | % The true scores are sorted (descending) according to the order of the
8 | % submission (only the order of the submission matters). In each row,
9 | % we then compute the cumulative (from the top up to that row)
10 | % "True Scores Ordered by Submission" divided by the cumulative "True
11 | % Scores Ordered By True Scores", where that quotient is called the
12 | % precision at row n. The final score is the average of the precision
13 | % at row n (over all n).
14 | %
15 | %
16 | % Ok, so say the true scores, sorted, are 3, 2.3, 1.6. And I predicted
17 | % the order 3, 1.6, 2.3. Then the average prec. is mean( 3/3,
18 | % (3+1.6)/(3+2.3), (3+1.6+2.3)/(3+2.3+1.6) ), which is about 0.956.
19 | %
20 | %
21 | """
22 | import numpy as np
23 | 
24 | def MAP(true_scores, predictive_scores):
25 |     true_values_sorted = true_scores.copy()
26 |     true_values_sorted = true_values_sorted[np.argsort(-true_values_sorted)]
27 | 
28 |     ix = np.argsort(-predictive_scores)
29 | 
30 |     true_values_sorted_by_prediction = true_scores[ix]
31 | 
32 |     # float division, in case the scores are integer-valued
33 |     score = np.mean(true_values_sorted_by_prediction.cumsum() / true_values_sorted.cumsum().astype(float))
34 |     return score
35 | 
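To make the docstring's worked example concrete, here is a small check. The arrays and the import line are hypothetical, chosen only to match the numbers above:

    import numpy as np
    from mean_average_precision import MAP  # assumes utils/ is on the Python path

    true_scores = np.array([3.0, 2.3, 1.6])
    predicted   = np.array([3.0, 1.6, 2.3])  # this submission orders the items so their true scores come out 3, 1.6, 2.3

    print MAP(true_scores, predicted)
    # mean( 3/3, (3 + 1.6)/(3 + 2.3), (3 + 1.6 + 2.3)/(3 + 2.3 + 1.6) ) = 0.9560...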
--------------------------------------------------------------------------------
/utils/memorize.py:
--------------------------------------------------------------------------------
1 | """
2 | A memoizing decorator: use it on recursive functions to cache previous calls.
3 | """
4 | 
5 | 
6 | class memorize(object):
7 | 
8 |     def __init__(self, func):
9 |         self.func = func
10 |         self.cache = {}
11 | 
12 |     def __call__(self, *args):
13 |         try:
14 |             return self.cache[args]
15 |         except KeyError:
16 |             self.cache[args] = self.func(*args)
17 |             return self.cache[args]
18 | 
19 |     def __repr__(self):
20 |         return self.func.__doc__ or repr(self.func)
21 | 
--------------------------------------------------------------------------------
/utils/power_set.py:
--------------------------------------------------------------------------------
1 | # Cameron Davidson-Pilon, 2012
2 | 
3 | # iterative solution: the bits of the counter k decide which elements are in the subset
4 | 
5 | def power_set(lst):
6 |     n = len(lst)
7 |     for k in range(2**n):
8 | 
9 |         subset = []
10 |         i = k
11 | 
12 |         for j in range(n):
13 |             if i % 2:
14 |                 subset.append(lst[j])
15 |             i = i >> 1
16 | 
17 |         print subset
18 | 
19 | 
20 | # recursive solution: each element is either left out of, or added to, the growing prefix
21 | 
22 | def power_set_recursive(lst, prefix=[]):
23 |     if not lst:
24 |         print prefix
25 |         return
26 | 
27 |     # subsets that exclude lst[0]
28 |     power_set_recursive(lst[1:], prefix)
29 |     # subsets that include lst[0]
30 |     power_set_recursive(lst[1:], prefix + [lst[0]])
31 | 
--------------------------------------------------------------------------------
/utils/primes.py:
--------------------------------------------------------------------------------
1 | import math
2 | 
3 | def primes_up_to(max_num):
4 |     current_primes = []
5 |     for num in range(2, max_num):
6 |         if prime(num, current_primes):
7 |             current_primes.append(num)
8 |     return current_primes
9 | 
10 | def prime(n, current_primes):
11 |     # only need to check prime divisors up to sqrt(n)
12 |     for i in current_primes:
13 |         if i > math.sqrt(n):
14 |             break
15 |         if n % i == 0:
16 |             return False
17 |     return True
18 | 
19 | 
20 | print primes_up_to(25)
21 | 
22 | 
23 | print primes_up_to(100)
--------------------------------------------------------------------------------
/utils/qq_plot.py:
--------------------------------------------------------------------------------
1 | 
2 | import matplotlib.pyplot as plt
3 | import scipy.stats as stats
4 | import numpy as np
5 | 
6 | def qq_plot(data):
7 |     plt.figure()
8 |     # probplot returns the theoretical quantiles, the ordered data, and a least-squares fit
9 |     (osm, osr), (slope, inter, r) = stats.probplot(data, sparams=[data.mean(), data.std()], dist='norm', fit=True)
10 |     x_ = np.array([osm.min(), osm.max()])
11 | 
12 |     plt.plot(x_, x_, label="Line y=x")
13 |     plt.plot(x_, slope * x_ + inter, label="Least-squares fit")
14 |     plt.scatter(osm, osr)
15 |     plt.xlabel("Theoretical quantiles")
16 |     plt.ylabel("Observed values")
17 |     plt.legend(loc="best")
18 |     plt.show()
--------------------------------------------------------------------------------
/utils/sample.py:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | # sample a file by taking every nth line
4 | # usage:
5 | #   $ python sample.py n myfile.txt mysampledfile.txt
6 | #
7 | #
8 | 
9 | import sys
10 | 
11 | 
12 | def sample(n, infile, outfile):
13 | 
14 |     n = int(n)
15 |     try:
16 |         ifile = open(infile, 'r')
17 |     except IOError:
18 |         print "Could not open file %s" % infile
19 |         raise
20 | 
21 |     ofile = open(outfile, 'w')
22 | 
23 |     i = 0
24 |     for line in ifile:
25 |         if i % n == 0:
26 |             ofile.write(line)
27 |         i += 1
28 | 
29 |     ifile.close()
30 |     ofile.close()
31 | 
32 | 
33 | if __name__ == "__main__":
34 |     sample(*sys.argv[1:])
35 |     print "Completed"
--------------------------------------------------------------------------------
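As a small sanity check of sample() above, the following sketch writes a ten-line file and keeps every third line. The file names and the import are illustrative assumptions:

    from sample import sample  # assumes utils/ is on the Python path

    with open("numbers.txt", "w") as f:
        for i in range(10):
            f.write("%d\n" % i)

    sample(3, "numbers.txt", "every_third.txt")  # every_third.txt now holds 0, 3, 6, 9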