├── firstform ├── tests │ ├── __init__.py │ ├── tools.py │ └── app_tests.py ├── firstform │ ├── __init__.py │ └── __init__.pyc ├── bin │ ├── app.pyc │ ├── tools.py │ ├── __init__.py │ └── app.py ├── templates │ ├── layout.html │ ├── index.html │ └── hello_form.html └── setup.py ├── gothonweb ├── bin │ ├── __init__.py │ ├── app.py │ └── map.py ├── tests │ ├── __init__.py │ ├── tools.py │ ├── app_tests.py │ └── map_tests.py ├── templates │ ├── you_died.html │ ├── layout.html │ └── show_room.html ├── sessions │ ├── 40ad7454d4b4cbacedaa449f7e2c8fb04165ecf4 │ ├── 5524b4c1828de273b8ae4c70bbbe0e631e031e4a │ └── 6adbe20488a3ffd0040abc4ac06991d1d79c97d0 └── setup.py ├── .gitignore ├── tutorials ├── exercism_py3 │ ├── leap │ │ ├── .cache │ │ │ └── v │ │ │ │ └── cache │ │ │ │ └── lastfailed │ │ ├── year5.py │ │ ├── leap.py │ │ ├── year4.py │ │ ├── year.py │ │ ├── leap_test.py │ │ └── README.md │ ├── hello-world │ │ ├── .cache │ │ │ └── v │ │ │ │ └── cache │ │ │ │ └── lastfailed │ │ ├── hello_world2.py │ │ ├── hello_world.py │ │ ├── hello_world_test.py │ │ ├── hello_world_test2.py │ │ └── README.md │ ├── Ex5_hamming │ │ ├── hamming2.py │ │ └── hamming.py │ ├── dna │ │ ├── dna2.py │ │ └── dna.py │ ├── word_count │ │ ├── wordcount3.py │ │ ├── wordcount2.py │ │ ├── README.md │ │ └── word_count_test.py │ └── pangram │ │ ├── pangram.py │ │ ├── pangram2.py │ │ └── pangram_detailed.py ├── ThinkBayes │ ├── thinkbayesLoco.png │ ├── thinkbayesLoco2.png │ ├── thinkbayeseuro.png │ ├── thinkbayeseuro2.png │ ├── thinkbayesprice.png │ ├── thinkbayesprice2.png │ ├── thinkbayesprice3.png │ ├── .ipynb_checkpoints │ │ ├── 046-ImplimentingSuite-checkpoint.ipynb │ │ ├── 056 - Chap6DecisionAnalysis-checkpoint.ipynb │ │ ├── 046-Suite_m&m-checkpoint.ipynb │ │ ├── 046-MontyHall_framework-checkpoint.ipynb │ │ ├── 047-Dice-checkpoint.ipynb │ │ ├── 049-Credible_intervals_cdfs-checkpoint.ipynb │ │ └── 043-Distributions-checkpoint.ipynb │ ├── 056 - Chap6DecisionAnalysis.ipynb │ ├── 046-ImplimentingSuite.ipynb │ ├── 046-Suite_m&m.ipynb │ ├── 046-MontyHall_framework.ipynb │ ├── 047-Dice.ipynb │ ├── 049-Credible_intervals_cdfs.ipynb │ └── 043-Distributions.ipynb ├── algorithms │ ├── notebooks │ │ ├── .ipynb_checkpoints │ │ │ ├── 068-Lesson2-checkpoint.ipynb │ │ │ └── Lesson1-checkpoint.ipynb │ │ ├── 068-Lesson2.ipynb │ │ └── Lesson1.ipynb │ └── scripts │ │ ├── L1_Eulerian_Q10.py │ │ └── L1_EulerianPath.py ├── KaggleNLP │ └── word_vectors.py ├── K-means │ └── kmeans.py ├── Samsung │ └── notebooks │ │ ├── 029-Samsung_cleanup.ipynb │ │ └── 031-Samsung_cleanup.ipynb └── 026-Linear_Regression_Analysis.ipynb ├── windspeed ├── plots │ ├── WSahel.png │ ├── 038-62124Sebha.png │ └── 038-62124Sebha_2.png ├── scripts │ ├── 012-ws_tseries.py │ ├── 030-group_tseries.py │ ├── 037-group_tseries.py │ ├── 013-ws_tseries.py │ ├── 038-group_tseries.py │ ├── 039-group_tseries.py │ └── 040-group_tseries.py └── notebooks │ └── 010_1-windspeed.ipynb ├── SQL └── galaXQL_17.sql ├── 001-git-basics.md ├── monkeylearn └── 015-selectdata.py ├── DSFromScratch ├── Chap13 │ └── machine_learning.py └── Chap6 │ ├── 064-Chap6.ipynb │ └── .ipynb_checkpoints │ └── 064-Chap6-checkpoint.ipynb ├── Titanic └── bin │ ├── clean_test.py │ └── clean_test_53.py └── TOdo.md /firstform/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gothonweb/bin/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /gothonweb/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /firstform/firstform/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | 3 | API_key.txt -------------------------------------------------------------------------------- /tutorials/exercism_py3/leap/.cache/v/cache/lastfailed: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /tutorials/exercism_py3/hello-world/.cache/v/cache/lastfailed: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /firstform/bin/app.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SophMC/notechain/HEAD/firstform/bin/app.pyc -------------------------------------------------------------------------------- /windspeed/plots/WSahel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SophMC/notechain/HEAD/windspeed/plots/WSahel.png -------------------------------------------------------------------------------- /firstform/firstform/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SophMC/notechain/HEAD/firstform/firstform/__init__.pyc -------------------------------------------------------------------------------- /windspeed/plots/038-62124Sebha.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SophMC/notechain/HEAD/windspeed/plots/038-62124Sebha.png -------------------------------------------------------------------------------- /tutorials/exercism_py3/Ex5_hamming/hamming2.py: -------------------------------------------------------------------------------- 1 | def distance(dna1, dna2): 2 | return sum(d1 != d2 for d1, d2 in zip(dna1, dna2)) -------------------------------------------------------------------------------- /windspeed/plots/038-62124Sebha_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SophMC/notechain/HEAD/windspeed/plots/038-62124Sebha_2.png -------------------------------------------------------------------------------- /tutorials/ThinkBayes/thinkbayesLoco.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SophMC/notechain/HEAD/tutorials/ThinkBayes/thinkbayesLoco.png -------------------------------------------------------------------------------- /tutorials/ThinkBayes/thinkbayesLoco2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SophMC/notechain/HEAD/tutorials/ThinkBayes/thinkbayesLoco2.png -------------------------------------------------------------------------------- /tutorials/ThinkBayes/thinkbayeseuro.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SophMC/notechain/HEAD/tutorials/ThinkBayes/thinkbayeseuro.png -------------------------------------------------------------------------------- /tutorials/ThinkBayes/thinkbayeseuro2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SophMC/notechain/HEAD/tutorials/ThinkBayes/thinkbayeseuro2.png -------------------------------------------------------------------------------- /tutorials/ThinkBayes/thinkbayesprice.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SophMC/notechain/HEAD/tutorials/ThinkBayes/thinkbayesprice.png -------------------------------------------------------------------------------- /tutorials/ThinkBayes/thinkbayesprice2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SophMC/notechain/HEAD/tutorials/ThinkBayes/thinkbayesprice2.png -------------------------------------------------------------------------------- /tutorials/ThinkBayes/thinkbayesprice3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SophMC/notechain/HEAD/tutorials/ThinkBayes/thinkbayesprice3.png -------------------------------------------------------------------------------- /gothonweb/templates/you_died.html: -------------------------------------------------------------------------------- 1 |
<h1>You Died!</h1>
2 | 
3 | <p>Looks like you bit the dust.</p>
4 | <p><a href="/">Play Again</a></p>
5 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/dna/dna2.py: -------------------------------------------------------------------------------- 1 | DNA_TO_RNA = str.maketrans("GCTA", "CGAU") 2 | 3 | def to_rna(dna): 4 | return dna.translate(DNA_TO_RNA) -------------------------------------------------------------------------------- /tutorials/ThinkBayes/.ipynb_checkpoints/046-ImplimentingSuite-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /tutorials/algorithms/notebooks/.ipynb_checkpoints/068-Lesson2-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /tutorials/ThinkBayes/.ipynb_checkpoints/056 - Chap6DecisionAnalysis-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/Ex5_hamming/hamming.py: -------------------------------------------------------------------------------- 1 | def distance(x,y): 2 | count=0 3 | for i,x in enumerate(x): 4 | if x != y[i]: count += 1 5 | return count 6 | 7 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/hello-world/hello_world2.py: -------------------------------------------------------------------------------- 1 | # 2 | # Skeleton file for the Python "Hello World" exercise. 3 | # 4 | 5 | def hello(name=''): 6 | return 'Hello, %s!' 
% (name or 'World') -------------------------------------------------------------------------------- /tutorials/exercism_py3/dna/dna.py: -------------------------------------------------------------------------------- 1 | 2 | def to_rna(dna): 3 | 4 | d={'G':'C','C':'G','T':'A','A':'U'} 5 | 6 | p = list(dna) 7 | return ''.join([d[m] for m in p]) 8 | 9 | -------------------------------------------------------------------------------- /gothonweb/sessions/40ad7454d4b4cbacedaa449f7e2c8fb04165ecf4: -------------------------------------------------------------------------------- 1 | KGRwMQpTJ2lwJwpwMgpWMTI3LjAuMC4xCnAzCnNTJ3Jvb20nCnA0Ck5zUydzZXNzaW9uX2lkJwpw 2 | NQpTJzQwYWQ3NDU0ZDRiNGNiYWNlZGFhNDQ5ZjdlMmM4ZmIwNDE2NWVjZjQnCnA2CnMu 3 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/word_count/wordcount3.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | import re 3 | 4 | 5 | def word_count(phrase): 6 | return Counter(re.findall(r"[\w]+", phrase.lower().replace('_', ' '))) -------------------------------------------------------------------------------- /tutorials/exercism_py3/pangram/pangram.py: -------------------------------------------------------------------------------- 1 | 2 | # -*- coding: UTF-8 -*- 3 | 4 | import re 5 | 6 | def is_pangram(s): 7 | 8 | letters = re.sub('[^a-zA-Z]','',s) 9 | 10 | return len(list(set(letters.lower())))== 26 11 | 12 | -------------------------------------------------------------------------------- /firstform/templates/layout.html: -------------------------------------------------------------------------------- 1 | $def with (content) 2 | 3 | 4 | 5 | My first form 6 | 7 | 8 | 9 | 10 | 11 | $:content 12 | 13 | 14 | -------------------------------------------------------------------------------- /gothonweb/templates/layout.html: -------------------------------------------------------------------------------- 1 | $def with (content) 2 | 3 | 4 | 5 | Interactive Game 6 | 7 | 8 | 9 | 10 | 11 | $:content 12 | 13 | 14 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/leap/year5.py: -------------------------------------------------------------------------------- 1 | 2 | def is_leap_year(year): 3 | return year % 4 == 0 and (year % 100 != 0 or year % 400 == 0) 4 | 5 | if __name__ == '__main__': 6 | 7 | year = int(input('Type in a year to test if it is a leap year\n> ')) 8 | is_leap_year(year) 9 | 10 | -------------------------------------------------------------------------------- /SQL/galaXQL_17.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO hilight 2 | SELECT stars.starid AS starid 3 | FROM stars 4 | LEFT OUTER JOIN planets ON stars.starid == planets.starid 5 | LEFT OUTER JOIN moons ON planets.planetid == moons.planetid 6 | GROUP BY stars.starid ORDER BY (COUNT(planets.planetid) + COUNT(moons.moonid)) 7 | DESC 8 | LIMIT 1 -------------------------------------------------------------------------------- /tutorials/exercism_py3/leap/leap.py: -------------------------------------------------------------------------------- 1 | 2 | def is_leap_year(year): 3 | if (year%400 !=0) & (year%4 != 0) & (year%100 != 0): 4 | print ("%d is not a leap year" % year) 5 | return False 6 | else: 7 | print ("%d is a leap year!" 
% year) 8 | return True 9 | 10 | year = int(input('Type in a year to test if it is a leap year\n> ')) 11 | is_leap_year(year) -------------------------------------------------------------------------------- /firstform/templates/index.html: -------------------------------------------------------------------------------- 1 | $def with (greeting) 2 | 3 | $if greeting: 4 | I would just like to say \ 5 | $greeting. 6 | $else: 7 | Hello, world! 8 | 10 |
<p><a href="/hello">Input Form</a> takes you back to the 11 | submission form.</p>
12 | 13 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/leap/year4.py: -------------------------------------------------------------------------------- 1 | 2 | def is_leap_year(year): 3 | 4 | if year % 4 ==0 and year % 100 != 0 or year % 400 == 0: 5 | print ("%d is a leap year! "% year) 6 | return True 7 | 8 | else: 9 | print ("%d is not a leap year" % year) 10 | return False 11 | 12 | if __name__ == '__main__': 13 | 14 | year = int(input('Type in a year to test if it is a leap year\n> ')) 15 | is_leap_year(year) 16 | 17 | -------------------------------------------------------------------------------- /firstform/templates/hello_form.html: -------------------------------------------------------------------------------- 1 |
<h1>Fill Out This Form, Please</h1>
2 | 3 | 4 | <form action="/hello" method="POST">
5 | 6 | A Greeting: <input type="text" name="greet">
7 | <br/>
8 | <br/>
9 | Your Name: <input type="text" name="name">
10 | <br/>
11 | <input type="submit">
12 | 13 | </form>
14 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/hello-world/hello_world.py: -------------------------------------------------------------------------------- 1 | # 2 | # Skeleton file for the Python "Hello World" exercise. 3 | # 4 | def hello(name=''): 5 | 6 | if name == '': 7 | greeting = "Hello, World!" 8 | print (greeting) 9 | return greeting 10 | 11 | else: 12 | greeting = 'Hello, %s!' % name 13 | print (greeting) 14 | return greeting 15 | 16 | if __name__ == '__main__': 17 | 18 | name = input('What is your name?\n> ') 19 | hello(name) -------------------------------------------------------------------------------- /firstform/setup.py: -------------------------------------------------------------------------------- 1 | try: 2 | from setuptools import setup 3 | except ImportError: 4 | from distutils.core import setup 5 | 6 | config = { 7 | 'description': 'My Project', 8 | 'author': 'Sophie Cowie', 9 | 'url': 'URL to get it at.', 10 | 'download_url': 'Where to download it.', 11 | 'author_email': 'sophie_cowie@hotmail.com', 12 | 'version': '0.1', 13 | 'install_requires': ['nose'], 14 | 'packages': ['NAME'], 15 | 'scripts': [], 16 | 'name': 'gothonweb' 17 | } 18 | 19 | setup(**config) -------------------------------------------------------------------------------- /gothonweb/setup.py: -------------------------------------------------------------------------------- 1 | try: 2 | from setuptools import setup 3 | except ImportError: 4 | from distutils.core import setup 5 | 6 | config = { 7 | 'description': 'My Project', 8 | 'author': 'Sophie Cowie', 9 | 'url': 'URL to get it at.', 10 | 'download_url': 'Where to download it.', 11 | 'author_email': 'sophie_cowie@hotmail.com', 12 | 'version': '0.1', 13 | 'install_requires': ['nose'], 14 | 'packages': ['NAME'], 15 | 'scripts': [], 16 | 'name': 'projectname' 17 | } 18 | 19 | setup(**config) -------------------------------------------------------------------------------- /tutorials/exercism_py3/leap/year.py: -------------------------------------------------------------------------------- 1 | 2 | def is_leap_year(year): 3 | 4 | b = (year%4 ==0) 5 | c = (year%100 != 0) 6 | d = (year%400 == 0) 7 | 8 | if b == True and c == True or d == True: 9 | print ("%d is a leap year! "% year) 10 | return True 11 | 12 | else: 13 | print ("%d is not a leap year" % year) 14 | return False 15 | 16 | if __name__ == '__main__': 17 | 18 | year = int(input('Type in a year to test if it is a leap year\n> ')) 19 | is_leap_year(year) 20 | 21 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/pangram/pangram2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | ALPHABET = 'abcdefghijklmnopqrstuvwxyz ' 4 | 5 | 6 | def is_pangram(s): 7 | 8 | 9 | return set(list(s.lower())) >= set(ALPHABET) 10 | 11 | if __name__ == '__main__': 12 | 13 | #is_pangram('the quick brown fox jumps over the lazy dog') 14 | # When I declare the encoding at the beginning, it doesnt throw up an error 15 | # with string here. 16 | string = 'Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich.' 
17 | #new = string.encode('utf-8') 18 | is_pangram(string) -------------------------------------------------------------------------------- /001-git-basics.md: -------------------------------------------------------------------------------- 1 | Some basic git commands I used today to set this up: 2 | 3 | 4 | `git init` initialises a local repository 5 | 6 | 7 | `git add` stages the work to Index 8 | 9 | `git commit -m "comment"` saves the work to the repository 10 | 11 | 12 | Now the locally saved work can be added to the remote repository. 13 | 14 | First you want to connect to the remote server: 15 | `git remote add origin git@github.com:SophMC/notechain` 16 | 17 | 18 | `git push -u origin master` -u is added the first time, after that you just 19 | need to be inside the local repo that you want to push and type 20 | `git push` 21 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/leap/leap_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from year5 import is_leap_year 4 | 5 | 6 | class YearTest(unittest.TestCase): 7 | def test_leap_year(self): 8 | self.assertIs(is_leap_year(1996), True) 9 | 10 | def test_non_leap_year(self): 11 | self.assertIs(is_leap_year(1997), False) 12 | 13 | def test_non_leap_even_year(self): 14 | self.assertIs(is_leap_year(1998), False) 15 | 16 | def test_century(self): 17 | self.assertIs(is_leap_year(1900), False) 18 | 19 | def test_exceptional_century(self): 20 | self.assertIs(is_leap_year(2400), True) 21 | 22 | if __name__ == '__main__': 23 | unittest.main() 24 | -------------------------------------------------------------------------------- /firstform/bin/tools.py: -------------------------------------------------------------------------------- 1 | from nose.tools import * 2 | import re 3 | 4 | def assert_response(resp, contains=None, matches=None,headers=None, 5 | status="200"): 6 | assert status in resp.status, \ 7 | "Expected response %r not in %r" \ 8 | % (status, resp.status) 9 | 10 | if status == "200": 11 | assert resp.data, "Response data is empty." 12 | 13 | if contains: 14 | assert contains in resp.data, "Response does not contain %r"\ 15 | % contains 16 | 17 | if matches: 18 | reg = re.compile(matches) 19 | assert reg.matches(resp.data), "Response does not match %r"\ 20 | % matches 21 | 22 | if headers: 23 | assert_equal(resp.headers,headers) -------------------------------------------------------------------------------- /firstform/bin/__init__.py: -------------------------------------------------------------------------------- 1 | from nose.tools import * 2 | import re 3 | 4 | def assert_response(resp, contains=None, matches=None,headers=None, 5 | status="200"): 6 | assert status in resp.status, \ 7 | "Expected response %r not in %r" \ 8 | % (status, resp.status) 9 | 10 | if status == "200": 11 | assert resp.data, "Response data is empty." 
12 | 13 | if contains: 14 | assert contains in resp.data, "Response does not contain %r"\ 15 | % contains 16 | 17 | if matches: 18 | reg = re.compile(matches) 19 | assert reg.matches(resp.data), "Response does not match %r"\ 20 | % matches 21 | 22 | if headers: 23 | assert_equal(resp.headers,headers) -------------------------------------------------------------------------------- /firstform/tests/tools.py: -------------------------------------------------------------------------------- 1 | from nose.tools import * 2 | import re 3 | 4 | 5 | 6 | def assert_response(resp, contains=None, matches=None,headers=None, 7 | status="200"): 8 | assert status in resp.status, \ 9 | "Expected response %r not in %r" \ 10 | % (status, resp.status) 11 | 12 | if status == "200": 13 | assert resp.data, "Response data is empty." 14 | 15 | if contains: 16 | assert contains in resp.data, "Response does not contain %r"\ 17 | % contains 18 | 19 | if matches: 20 | reg = re.compile(matches) 21 | assert reg.matches(resp.data), "Response does not match %r"\ 22 | % matches 23 | 24 | if headers: 25 | assert_equal(resp.headers,headers) -------------------------------------------------------------------------------- /gothonweb/tests/tools.py: -------------------------------------------------------------------------------- 1 | from nose.tools import * 2 | import re 3 | 4 | def assert_response(resp, contains=None, matches=None,headers=None, 5 | status="200"): 6 | assert status in resp.status, \ 7 | "Expected response %r not in %r" \ 8 | % (status, resp.status) 9 | 10 | if status == "200": 11 | assert resp.data, "Response data is empty." 12 | 13 | if contains: 14 | #confirm that number x, is in resp.data and if now print out.."Response 15 | #does not contain...." 16 | assert contains in resp.data, "Response does not contain %r"\ 17 | % contains 18 | 19 | if matches: 20 | reg = re.compile(matches) 21 | assert reg.matches(resp.data), "Response does not match %r"\ 22 | % matches 23 | 24 | if headers: 25 | assert_equal(resp.headers,headers) -------------------------------------------------------------------------------- /tutorials/exercism_py3/word_count/wordcount2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re 4 | 5 | def word_count(sentence): 6 | 7 | sentence = re.sub('[,_]',' ',sentence) 8 | 9 | # ^ to substitute things that are NOT \s(spaces) and \w(alphanumeric 10 | # characters-letters). r, means raw string notation. 11 | sentence = re.sub(r'[^\s\w_]+', '', sentence.lower()) 12 | f = sentence.split() 13 | 14 | # Make a dictionary to store the pairs 15 | p = {} 16 | 17 | for x in f: 18 | 19 | # \b before and after helps to preserve whole words. 20 | matches = re.findall((r'\b%s\b'%x),' '.join(x for x in f)) 21 | 22 | #match the key to the value in the dictionary 23 | p[x] = len(matches) 24 | 25 | return p 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/word_count/README.md: -------------------------------------------------------------------------------- 1 | # Word Count 2 | 3 | Write a program that given a phrase can count the occurrences of each word in that phrase. 4 | 5 | For example for the input `"olly olly in come free"` 6 | 7 | ```plain 8 | olly: 2 9 | in: 1 10 | come: 1 11 | free: 1 12 | ``` 13 | 14 | 15 | ### Submitting Exercises 16 | 17 | Note that, when trying to submit an exercise, make sure the solution is in the `exercism/python/` directory. 
18 | 19 | For example, if you're submitting `bob.py` for the Bob exercise, the submit command would be something like `exercism submit /python/bob/bob.py`. 20 | 21 | 22 | For more detailed information about running tests, code style and linting, 23 | please see the [help page](http://exercism.io/languages/python). 24 | 25 | ## Source 26 | 27 | This is a classic toy problem, but we were reminded of it by seeing it in the Go Tour. 28 | -------------------------------------------------------------------------------- /firstform/tests/app_tests.py: -------------------------------------------------------------------------------- 1 | from nose.tools import * 2 | #How to import an application and run it directly for the automated test! 3 | #Important! 4 | from bin.app import app 5 | #From dir tests, import assert_response function from tools.py 6 | from tests.tools import assert_response 7 | 8 | def test_index(): 9 | # check that we get a 404 on the / URL 10 | resp = app.request("/") 11 | assert_response(resp,status="404") 12 | 13 | #test our first GET request to /hello 14 | resp = app.request("/hello") 15 | assert_response(resp) 16 | 17 | #make sure default values work for the form 18 | resp = app.request("/hello", method="POST") 19 | assert_response(resp, contains="Nobody") 20 | 21 | # test that we get expected values 22 | data = {'name':'Zed','greet':'Hola'} 23 | resp = app.request("/hello", method="POST",data=data) 24 | assert_response(resp,contains="Zed") -------------------------------------------------------------------------------- /gothonweb/tests/app_tests.py: -------------------------------------------------------------------------------- 1 | from nose.tools import * 2 | #How to import an application and run it directly for the automated test! 3 | #Important! 
4 | from bin.app import app 5 | #From dir tests, import assert_response function from tools.py 6 | from tests.tools import assert_response 7 | 8 | def test_index(): 9 | # check that we get a 404 on the / URL 10 | resp = app.request("/") 11 | assert_response(resp,status="404") 12 | 13 | #test our first GET request to /hello 14 | resp = app.request("/game") 15 | assert_response(resp) 16 | 17 | #make sure default values work for the form 18 | #resp = app.request("/game", method="POST") 19 | #assert_response(resp, action=None) 20 | 21 | # test that we get expected values 22 | #data = {'name':'Zed','greet':'Hola'} 23 | #resp = app.request("/hello", method="POST",data=data) 24 | #assert_response(resp,contains="Zed") -------------------------------------------------------------------------------- /tutorials/algorithms/notebooks/068-Lesson2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "m $\\in \\Theta$ (n)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [] 18 | } 19 | ], 20 | "metadata": { 21 | "anaconda-cloud": {}, 22 | "kernelspec": { 23 | "display_name": "Python [Root]", 24 | "language": "python", 25 | "name": "Python [Root]" 26 | }, 27 | "language_info": { 28 | "codemirror_mode": { 29 | "name": "ipython", 30 | "version": 3 31 | }, 32 | "file_extension": ".py", 33 | "mimetype": "text/x-python", 34 | "name": "python", 35 | "nbconvert_exporter": "python", 36 | "pygments_lexer": "ipython3", 37 | "version": "3.5.2" 38 | } 39 | }, 40 | "nbformat": 4, 41 | "nbformat_minor": 0 42 | } 43 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/hello-world/hello_world_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import unicode_literals 4 | import unittest 5 | 6 | import hello_world2 7 | 8 | 9 | class HelloWorldTests(unittest.TestCase): 10 | 11 | def test_hello_without_name(self): 12 | self.assertEqual( 13 | 'Hello, World!', 14 | hello_world2.hello() 15 | ) 16 | 17 | def test_hello_with_sample_name(self): 18 | self.assertEqual( 19 | 'Hello, Alice!', 20 | hello_world2.hello('Alice') 21 | ) 22 | 23 | def test_hello_with_other_sample_name(self): 24 | self.assertEqual( 25 | 'Hello, Bob!', 26 | hello_world2.hello('Bob') 27 | ) 28 | 29 | def test_hello_with_umlaut_name(self): 30 | self.assertEqual( 31 | 'Hello, Jürgen!', 32 | hello_world2.hello('Jürgen') 33 | ) 34 | 35 | if __name__ == '__main__': 36 | unittest.main() 37 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/hello-world/hello_world_test2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import unicode_literals 4 | import unittest 5 | 6 | import hello_world2 7 | 8 | 9 | class HelloWorldTests(unittest.TestCase): 10 | 11 | def test_hello_without_name(self): 12 | self.assertEqual( 13 | 'Hello, World!', 14 | hello_world2.hello() 15 | ) 16 | 17 | def test_hello_with_sample_name(self): 18 | self.assertEqual( 19 | 'Hello, Alice!', 20 | hello_world2.hello('Alice') 21 | ) 22 | 23 | def test_hello_with_other_sample_name(self): 24 | self.assertEqual( 25 | 'Hello, Bob!', 26 | hello_world2.hello('Bob') 27 | ) 28 | 29 | def 
test_hello_with_umlaut_name(self): 30 | self.assertEqual( 31 | 'Hello, Jürgen!', 32 | hello_world2.hello('Jürgen') 33 | ) 34 | 35 | if __name__ == '__main__': 36 | unittest.main() 37 | -------------------------------------------------------------------------------- /firstform/bin/app.py: -------------------------------------------------------------------------------- 1 | import web 2 | 3 | #This is mapping /hello to the class index. 4 | #Whenever someone types in /hello they will 5 | #get sent to the index class first. 6 | urls = ( 7 | '/hello', 'index' 8 | ) 9 | 10 | '''Whenever /hello is accessed while this app is running, it will begin 11 | a chain of processes starting from here. /hello is the key for index''' 12 | 13 | app = web.application(urls, globals()) 14 | 15 | render = web.template.render('templates/', base="layout") 16 | 17 | class index: 18 | def GET(self): 19 | #use render to display a page from the hello_form.html template 20 | return render.hello_form() 21 | #name="Nobody is the default if the information is not given 22 | #inputs=(name="Nobody") 23 | def POST(self): 24 | form = web.input(name="Nobody",greet="Hello") 25 | #forgot to put brackets around form.greet form.name! 26 | greeting = "%s, %s" % (form.greet, form.name) 27 | return render.index(greeting = greeting) 28 | 29 | if __name__ == "__main__": 30 | app.run() -------------------------------------------------------------------------------- /gothonweb/templates/show_room.html: -------------------------------------------------------------------------------- 1 | $def with (room) 2 | 3 |
<h1>$room.name</h1>
4 | 
5 | <pre>
6 | $room.description
7 | </pre>
8 | 9 | $if room.name == "death": 10 | <p><a href="/">Play Again?</a></p>
11 | 12 | 13 | $if room.name == "Central Corridor": 14 | <p>
15 | <form action="/game" method="POST">
16 | Write 1,2 or 3 in the box \
17 | 18 | <input type="text" name="action"> <input type="submit">
19 | </form></p>
20 | 21 | 22 | $if room.name == "Laser Weapon Armory": 23 | <p>
24 | <form action="/game" method="POST">
25 | 26 | Guess the code \
27 | 28 | <input type="text" name="action"> <input type="submit">
29 | </form></p>
30 | 31 | $if room.name == "The Bridge": 32 | <p>
33 | <form action="/game" method="POST">
34 | <input type="text" name="action"> \
35 | 36 | <input type="submit">
37 | </form></p>
38 | 39 | 40 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/pangram/pangram_detailed.py: -------------------------------------------------------------------------------- 1 | 2 | # -*- coding: UTF-8 -*- 3 | 4 | import re 5 | 6 | def is_pangram(s): 7 | 8 | 9 | # create a regular expression object(regex) to pull out only letters 10 | # from chars. ^ matches start of the string. 11 | regex = re.compile('[^a-zA-Z]') 12 | 13 | # Use regex object to substitute anything that doesn't match the pattern. 14 | # is the same as letters = re.sub('[^a-zA-Z]','',s) 15 | letters = regex.sub('', s) 16 | 17 | 18 | #break up the sentence into characters and extract the unique values 19 | if len(list(set(letters.lower())))== 26: 20 | 21 | print(list(set(letters))) 22 | print('This is a pangram') 23 | return True 24 | else: 25 | print(list(set(letters))) 26 | print('This is not a pangram') 27 | return False 28 | 29 | 30 | if __name__ == '__main__': 31 | 32 | #is_pangram('the quick brown fox jumps over the lazy dog') 33 | is_pangram('Victor jagt zwölf Boxkämpfer quer über den großen Sylter' 34 | 'Deich.') 35 | 36 | #set(list(s.lower())) >= set(ALPHABET) -------------------------------------------------------------------------------- /monkeylearn/015-selectdata.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | import requests 4 | 5 | with open('API_key.txt') as f: 6 | API_KEY = f.read().strip() 7 | 8 | API_KEY = API_read 9 | 10 | raw_df = pd.read_csv('indeed_edin.csv', encoding='utf-8', 11 | error_bad_lines=False) 12 | #turnstilelink_link_1/_text 13 | 14 | df = raw_df[['location_value', 'turnstilelink_link_1/_text', 15 | 'summary_description']] 16 | df.columns = ['location', 'title', 'description'] 17 | 18 | content_df = list(df.title + ' ' + df.description) 19 | 20 | categories = [] 21 | step = 150 22 | for start in xrange(0, len(content_df), step): 23 | end = start + step 24 | 25 | response = requests.post( 26 | 27 | "https://api.monkeylearn.com/v2/classifiers/cl_4PFzSWVR/classify/", 28 | data=json.dumps({'text_list': content_df[start:end]}), 29 | headers={'Authorization': 'Token {}'.format(API_KEY), 30 | 'Content-Type': 'application/json'}).json() 31 | 32 | # We go through the results of the API call, storing the result on a list. 33 | for category in response['result']: 34 | categories.append(category[0]['label']) 35 | 36 | augmented_df = df.join(pd.DataFrame(categories, columns=['category'])) 37 | augmented_df.to_csv('indeed_aug.csv', encoding='utf-8', index=False, 38 | header=False) -------------------------------------------------------------------------------- /tutorials/exercism_py3/leap/README.md: -------------------------------------------------------------------------------- 1 | # Leap 2 | 3 | Write a program that will take a year and report if it is a leap year. 4 | 5 | The tricky thing here is that a leap year in the Gregorian calendar occurs: 6 | 7 | ```plain 8 | on every year that is evenly divisible by 4 9 | except every year that is evenly divisible by 100 10 | unless the year is also evenly divisible by 400 11 | ``` 12 | 13 | For example, 1997 is not a leap year, but 1996 is. 1900 is not a leap 14 | year, but 2000 is. 15 | 16 | If your language provides a method in the standard library that does 17 | this look-up, pretend it doesn't exist and implement it yourself. 
18 | 19 | ## Notes 20 | 21 | Though our exercise adopts some very simple rules, there is more to 22 | learn! 23 | 24 | For a delightful, four minute explanation of the whole leap year 25 | phenomenon, go watch [this youtube video][video]. 26 | 27 | [video]: http://www.youtube.com/watch?v=xX96xng7sAE 28 | 29 | ### Submitting Exercises 30 | 31 | Note that, when trying to submit an exercise, make sure the solution is in the `exercism/python/` directory. 32 | 33 | For example, if you're submitting `bob.py` for the Bob exercise, the submit command would be something like `exercism submit /python/bob/bob.py`. 34 | 35 | 36 | For more detailed information about running tests, code style and linting, 37 | please see the [help page](http://exercism.io/languages/python). 38 | 39 | ## Source 40 | 41 | JavaRanch Cattle Drive, exercise 3 [http://www.javaranch.com/leap.jsp](http://www.javaranch.com/leap.jsp) 42 | -------------------------------------------------------------------------------- /DSFromScratch/Chap13/machine_learning.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | import math, random 3 | 4 | # 5 | # data splitting 6 | # 7 | 8 | def split_data(data, prob): 9 | """split data into fractions [prob, 1 - prob]""" 10 | results = [], [] 11 | for row in data: 12 | results[0 if random.random() < prob else 1].append(row) 13 | return results 14 | 15 | def train_test_split(x, y, test_pct): 16 | data = list(zip(x, y)) # pair corresponding values 17 | train, test = split_data(data, 1 - test_pct) # split the dataset of pairs 18 | x_train, y_train = list(zip(*train)) # magical un-zip trick 19 | x_test, y_test = list(zip(*test)) 20 | return x_train, x_test, y_train, y_test 21 | 22 | # 23 | # correctness 24 | # 25 | 26 | def accuracy(tp, fp, fn, tn): 27 | correct = tp + tn 28 | total = tp + fp + fn + tn 29 | return correct / total 30 | 31 | def precision(tp, fp, fn, tn): 32 | return tp / (tp + fp) 33 | 34 | def recall(tp, fp, fn, tn): 35 | return tp / (tp + fn) 36 | 37 | def f1_score(tp, fp, fn, tn): 38 | p = precision(tp, fp, fn, tn) 39 | r = recall(tp, fp, fn, tn) 40 | 41 | return 2 * p * r / (p + r) 42 | 43 | if __name__ == "__main__": 44 | 45 | print("accuracy(70, 4930, 13930, 981070)", accuracy(70, 4930, 13930, 46 | 981070)) 47 | print("precision(70, 4930, 13930, 981070)", precision(70, 4930, 13930, 48 | 981070)) 49 | print("recall(70, 4930, 13930, 981070)", recall(70, 4930, 13930, 981070)) 50 | print("f1_score(70, 4930, 13930, 981070)", f1_score(70, 4930, 13930, 51 | 981070)) -------------------------------------------------------------------------------- /gothonweb/tests/map_tests.py: -------------------------------------------------------------------------------- 1 | from nose.tools import * 2 | #from map file in dir bin import everything in map file. 3 | #This originally had just a class but as it has class instances also 4 | # defined, we want to import everything in the file. 5 | from bin.map import * 6 | 7 | def test_room(): 8 | gold = Room("GoldRoom","""This room has gold in it you can grab. 
There's a 9 | door to the north.""") 10 | assert_equal(gold.name, "GoldRoom") 11 | assert_equal(gold.paths,{}) 12 | 13 | def test_room_paths(): 14 | center = Room("Center", "Test room in the center.") 15 | north = Room("North", "Test room in the north.") 16 | south = Room("South", "Test room in the south.") 17 | 18 | center.add_paths({'north': north, 'south': south}) 19 | assert_equal(center.go('north'), north) 20 | assert_equal(center.go('south'), south) 21 | 22 | 23 | def test_map(): 24 | start = Room("Start", "You can go west and down a hole.") 25 | west = Room("Trees", "There are trees here, you can go east.") 26 | down = Room("Dungeon", "It's dark down here, you can go up.") 27 | 28 | start.add_paths({'west': west, 'down': down}) 29 | west.add_paths({'east': start}) 30 | down.add_paths({'up': start}) 31 | 32 | assert_equal(start.go('west'), west) 33 | assert_equal(start.go('west').go('east'), start) 34 | assert_equal(start.go('down').go('up'), start) 35 | 36 | def test_gothon_game_map(): 37 | assert_equal(START.go('shoot!'), generic_death) 38 | assert_equal(START.go('dodge!'), generic_death) 39 | 40 | room = START.go('tell a joke') 41 | assert_equal(room, laser_weapon_armory) -------------------------------------------------------------------------------- /Titanic/bin/clean_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | df = pd.read_csv('/home/sophie/projects/Titanic/data/test.csv', header=0) 5 | 6 | # Change Sex column to 1/0 in Gender 7 | df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(float) 8 | 9 | #Drop columns 10 | df = df.drop(['Name','Cabin','Ticket','Sex'], axis=1) 11 | 12 | # Remove any rows which have a nan in the Embarked or Fare column 13 | df = df.dropna(subset = ['Embarked','Fare']) 14 | 15 | # Turn Embarked into float numbers 16 | df['Embarked'] = df['Embarked'].map({'C': 1 ,'Q': 2 ,'S': 3}).astype(float) 17 | 18 | 19 | ###Make guesses for Age. Use the medians for each class 20 | #Make a table filled with zeros 21 | median_ages = np.zeros((2,3)) # male/female for each class 22 | 23 | # Loop over the table to fill in the values 24 | for i in range(0, 2): 25 | for j in range(0, 3): 26 | median_ages[i,j] = df[(df['Gender'] == i) & (df['Pclass'] == j + 27 | 1)]['Age'].dropna().median() 28 | 29 | # Make a copy of Age 30 | df['AgeFill'] = df['Age'] 31 | 32 | 33 | # Fill the new column with the correct values. 34 | for i in range(0, 2): 35 | for j in range(0, 3): 36 | # we need df.loc here to specify the row AND the column. 37 | # only where age is null, gender is 1/0 and class is 1-3, that AgeFill 38 | # will be set to the median age. 39 | df.loc[(df.Age.isnull()) & (df.Gender == i) & (df.Pclass == j + 1), 40 | 'AgeFill'] = median_ages[i,j] 41 | 42 | # We can drop the Age column now we have AgeFill 43 | df = df.drop(['Age'], axis=1) 44 | 45 | # Transform the whole dataframe into floats. 46 | df= df.astype(float) 47 | 48 | #Output this to csv to be read in for predicting values. 
49 | df.to_csv('/home/sophie/projects/Titanic/data/clean_test.csv', sep = " ", index 50 | = False) 51 | 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /windspeed/scripts/012-ws_tseries.py: -------------------------------------------------------------------------------- 1 | #The aim of this script is to produce a timeseries of windspeed for each 2 | #station, with lines for winds at 0000, 0600, 1200 and 1800 3 | 4 | import glob,os 5 | import pandas as pd 6 | 7 | #change the directory in here first 8 | os.chdir("/home/sophie/projects/windspeed/data/") 9 | fname_list = glob.glob('*allwinds.txt') 10 | 11 | def read_file(fname): 12 | '''take a file and read it into a dataframe''' 13 | 14 | print """ %s please select the index of the following files to make a plot \ 15 | of: """ % list(enumerate(fname)) 16 | location = int(raw_input("> ")) 17 | 18 | date_spec = {'date_time': [0,1,2]} 19 | column_names=["year","month","day","hour","ws"] 20 | dtype={"year":int,"month":int,"day":int,"hour":int,"ws":float} 21 | 22 | print fname[location] 23 | wind = pd.read_csv(fname[location], sep=" ",parse_dates=date_spec, 24 | keep_date_col=True, names=column_names, index_col=False) 25 | #Dealing with hour - going from 600, 1200 etc to 6,12, 18 26 | wind["hour"]=(wind["hour"]/100).astype(int) 27 | 28 | #adding a date_time column with timestamp data 29 | wind['date_time'] = pd.to_datetime(wind.date_time) + \ 30 | wind.hour.astype('timedelta64[h]') 31 | 32 | print "here the data from %s will be split up" % fname_list[location] 33 | print "location index= %d" %location 34 | print "wind dataframe= %r" %wind[0:5] 35 | #data_subs(wind,location) 36 | return data_subs(wind,location) 37 | 38 | 39 | def data_subs(wind,location): 40 | '''Takes a dataframe and splits it into four new dataframes ready for 41 | plotting''' 42 | print wind[0:5] 43 | print location 44 | #print "here the data from %s will be split up" % fname_list[location] 45 | pass 46 | 47 | if __name__ == "__main__": 48 | 49 | data = read_file(fname_list) 50 | #data_subs(wind, location) -------------------------------------------------------------------------------- /tutorials/ThinkBayes/056 - Chap6DecisionAnalysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Chap 6: Decision Analysis\n", 8 | "\n", 9 | "How to decide on the price of a showcase?\n", 10 | "Bayesian thinking towards an answer:\n", 11 | "1) Prior beliefs on what the showcase prices could be: Analyse previous prices on the show.\n", 12 | "2) Likelihood/Update: Seeing the prizes, how should you update? i.e. How to interpret the data?\n", 13 | "3) Results from Update on the Prior: the Posterior. How to choose from the posterior distribution?\n", 14 | "\n", 15 | "All of these steps require subjective decisions. \n", 16 | "\n", 17 | "**Modeling the contestants**\n", 18 | "If you were a contestant on the show you could use this distribution (fig 6.1) to quantify your prior belief about the price of each showcase (before you even see the prizes). \n", 19 | "To Update, we have to answer these questions:\n", 20 | "\n", 21 | "1) What data should we consider and how should we quantify it?\n", 22 | "2) Can we compute a likelihood function; ie.e for each hypo value of `price`, can we compute the conditional likelihood of the data?" 
23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [] 33 | } 34 | ], 35 | "metadata": { 36 | "kernelspec": { 37 | "display_name": "Python 3", 38 | "language": "python", 39 | "name": "python3" 40 | }, 41 | "language_info": { 42 | "codemirror_mode": { 43 | "name": "ipython", 44 | "version": 3 45 | }, 46 | "file_extension": ".py", 47 | "mimetype": "text/x-python", 48 | "name": "python", 49 | "nbconvert_exporter": "python", 50 | "pygments_lexer": "ipython3", 51 | "version": "3.5.1" 52 | } 53 | }, 54 | "nbformat": 4, 55 | "nbformat_minor": 0 56 | } 57 | -------------------------------------------------------------------------------- /tutorials/KaggleNLP/word_vectors.py: -------------------------------------------------------------------------------- 1 | # Download the punkt tokenizer for sentence splitting 2 | import nltk.data 3 | 4 | # Import various modules for string cleaning 5 | from bs4 import BeautifulSoup 6 | import re 7 | from nltk.corpus import stopwords 8 | import pandas as pd 9 | 10 | #Load the punkt tokenizer 11 | tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 12 | 13 | # Define a function to split a review into parsed sentences 14 | def review_to_sentences(review, tokenizer, remove_stopwords=False): 15 | """Split a review into parsed sentences. Returns a list of sentences, 16 | where each sentence is a list of words""" 17 | # 1. Use the NLTK tokenizer to split the paragraph into sentences 18 | raw_sentences = tokenizer.tokenize(review.strip()) 19 | 20 | # 2. Loop over each sentence 21 | sentences = [] 22 | for raw_sentence in raw_sentences: 23 | # If a sentence is empty, skip it 24 | if len(raw_sentence) > 0: 25 | # Otherwise, call review_to_wordlist to get a list of words 26 | sentences.append(review_to_wordlist(raw_sentence, remove_stopwords)) 27 | 28 | # Return the list of sentences (each sentence is a list of words, so this 29 | # returns a list of lists) 30 | return sentences 31 | 32 | 33 | def review_to_wordlist(review, remove_stopwords=False): 34 | """Convert a document to a sequence of words, optionally removing stop words 35 | Returns a list of words""" 36 | # 1. Remove HTML 37 | review_text = BeautifulSoup(review).get_text() 38 | # 2. Remove non-letters 39 | review_text = re.sub("[^a-zA-Z]"," ", review_text) 40 | # 3. Convert words to lower case and split them 41 | words = review_text.lower().split() 42 | # 4. Optionally remove stop words (false by default) 43 | if remove_stopwords: 44 | stops = set(stopwords.words("english")) 45 | words = [w for w in words if not w in stops] 46 | # 5. 
Return a list of words 47 | return(words) 48 | 49 | -------------------------------------------------------------------------------- /tutorials/K-means/kmeans.py: -------------------------------------------------------------------------------- 1 | # supporting lib for kmeans clustering 2 | # Nitin Borwankar 3 | # Open Data Science Training 4 | 5 | import numpy as np 6 | from scipy.cluster.vq import kmeans,vq 7 | from scipy.spatial.distance import cdist 8 | import matplotlib.pyplot as plt 9 | 10 | 11 | def load_data(fName = '../datasets/UN4col.csv'): 12 | fp = open(fName) 13 | XX = np.loadtxt(fp) 14 | fp.close() 15 | return XX 16 | 17 | 18 | def run_kmeans(X, n=10): 19 | _K = range(1,n) 20 | 21 | # scipy.cluster.vq.kmeans 22 | _KM = [kmeans(X,k) for k in _K] # apply kmeans 1 to 10 23 | _centroids = [cent for (cent,var) in _KM] # cluster centroids 24 | 25 | _D_k = [cdist(X, cent, 'euclidean') for cent in _centroids] 26 | 27 | _cIdx = [np.argmin(D,axis=1) for D in _D_k] 28 | _dist = [np.min(D,axis=1) for D in _D_k] 29 | _avgWithinSS = [sum(d)/X.shape[0] for d in _dist] 30 | 31 | return (_K, _KM, _centroids, _D_k, _cIdx, _dist, _avgWithinSS) 32 | 33 | def plot_elbow_curve(kIdx, K, avgWithinSS): 34 | fig = plt.figure() 35 | ax = fig.add_subplot(111) 36 | ax.plot(K, avgWithinSS, 'b*-') 37 | ax.plot(K[kIdx], avgWithinSS[kIdx], marker='o', markersize=12, 38 | markeredgewidth=2, markeredgecolor='r', markerfacecolor='None') 39 | plt.grid(True) 40 | plt.xlabel('Number of clusters') 41 | plt.ylabel('Average within-cluster sum of squares') 42 | tt = plt.title('Elbow for KMeans clustering') 43 | return(fig,ax) 44 | 45 | def plot_clusters(orig,pred,nx,ny,legend=True): 46 | data = orig 47 | import matplotlib.pyplot as plt 48 | ylabels = { 0:'Male life expectancy in yrs',1:'Female life expectancy in yrs',2:'Infant mortality, per 1000'} 49 | # plot data into three clusters based on value of c 50 | p0 = plt.plot(data[pred==0,nx],data[pred==0,ny],'ro',label='Underdeveloped') 51 | p2 = plt.plot(data[pred==2,nx],data[pred==2,ny],'go',label='Developing') 52 | p1 = plt.plot(data[pred==1,nx],data[pred==1,ny],'bo',label='Developed') 53 | 54 | lx = p1[0].axes.set_xlabel('Per Capita GDP in US$') 55 | ly = p1[0].axes.set_ylabel(ylabels[ny]) 56 | tt= plt.title('UN countries Dataset, KMeans clustering with K=3') 57 | if legend: 58 | ll=plt.legend() 59 | return (p0,p1,p2) 60 | 61 | -------------------------------------------------------------------------------- /gothonweb/bin/app.py: -------------------------------------------------------------------------------- 1 | import web 2 | import map 3 | 4 | urls = ("/game", "GameEngine", "/", "Index") 5 | 6 | app = web.application(urls, globals()) 7 | 8 | #little hack so that debug mode works with sessions 9 | # 10 | if web.config.get('_session') is None: 11 | store = web.session.DiskStore('sessions') 12 | session = web.session.Session(app, store, initializer={'room': None}) 13 | 14 | web.config._session = session 15 | else: 16 | session = web.config._session 17 | 18 | render = web.template.render('templates/', base="layout") 19 | 20 | class Index(object): 21 | def GET(self): 22 | # this is used to "setup" the session with starting values 23 | #Give us the first session.room = central_corridor 24 | session.room = map.START 25 | #Sends you on your way to GameEngine class 26 | web.seeother("/game") 27 | 28 | class GameEngine(object): 29 | #inside the html page you have standard 30 | def GET(self): 31 | #session.room should = TRUE, either because it has been through Index, 32 | 
#or been given another link 33 | if session.room: 34 | #make html page from show_room.html. Take session.room as the 35 | #variable in the html page, accessed by $ 36 | return render.show_room(room=session.room) 37 | else: 38 | # why is this here? do you need it? 39 | #if something is passed to session.room which is not recognised, it 40 | #won't fail 41 | return render.you_died() 42 | 43 | def POST(self): 44 | #inside <form>
you can pass method= which takes a function such as GET 45 | #or POST(as they are defined in this app) 46 | #if action is not given a value in the form, it will automatically be 47 | #None 48 | # 49 | form = web.input(action=None) 50 | 51 | if session.room and form.action: 52 | # 53 | session.room = session.room.go(form.action) 54 | web.seeother("/game") 55 | #if session.room = laser_weapon_armory and form.action != '123' and 56 | # count < 10: 57 | #count =+ 1 58 | #session.room = session.room.go(form.action) 59 | #web.seeother("/game") 60 | #if session.room = laser_weapon_armory and form.action != '123' and 61 | #count >= 10: 62 | #session.room = None 63 | #web.seeother("/game") 64 | 65 | else: 66 | web.seeother("/game") 67 | 68 | if __name__ == "__main__": 69 | app.run() -------------------------------------------------------------------------------- /tutorials/exercism_py3/hello-world/README.md: -------------------------------------------------------------------------------- 1 | # Hello World 2 | 3 | Write a program that greets the user by name, or by saying "Hello, World!" if no name is given. 4 | 5 | ["Hello, World!"](http://en.wikipedia.org/wiki/%22Hello,_world!%22_program) is the traditional first program for beginning programming in a new language. 6 | 7 | **Note:** You can skip this exercise by running: 8 | 9 | exercism skip $LANGUAGE hello-world 10 | 11 | ## Specification 12 | 13 | The `Hello World!` program will greet me, the caller. 14 | 15 | If I tell the program my name is Alice, it will greet me by saying "Hello, Alice!". 16 | 17 | If I neglect to give it my name, it will greet me by saying "Hello, World!" 18 | 19 | ## Test-Driven Development 20 | 21 | As programmers mature, they eventually want to test their code. 22 | 23 | Here at Exercism we simulate [Test-Driven Development](http://en.wikipedia.org/wiki/Test-driven_development) (TDD), where you write your tests before writing any functionality. The simulation comes in the form of a pre-written test suite, which will signal that you have solved the problem. 24 | 25 | It will also provide you with a safety net to explore other solutions without breaking the functionality. 26 | 27 | ### A typical TDD workflow on Exercism: 28 | 29 | 1. Run the test file and pick one test that's failing. 30 | 2. Write some code to fix the test you picked. 31 | 3. Re-run the tests to confirm the test is now passing. 32 | 4. Repeat from step 1. 33 | 5. Submit your solution (`exercism submit /path/to/file`) 34 | 35 | ## Instructions 36 | 37 | Submissions are encouraged to be general, within reason. Having said that, it's also important not to over-engineer a solution. 38 | 39 | It's important to remember that the goal is to make code as expressive and readable as we can. However, solutions to the hello-world exercise will not be reviewed by a person, but by rikki- the robot, who will offer an encouraging word. 40 | 41 | ### Submitting Exercises 42 | 43 | Note that, when trying to submit an exercise, make sure the solution is in the `exercism/python/` directory. 44 | 45 | For example, if you're submitting `bob.py` for the Bob exercise, the submit command would be something like `exercism submit /python/bob/bob.py`. 46 | 47 | 48 | For more detailed information about running tests, code style and linting, 49 | please see the [help page](http://exercism.io/languages/python). 
50 | 51 | ## Source 52 | 53 | This is a program to introduce users to using Exercism [http://en.wikipedia.org/wiki/%22Hello,_world!%22_program](http://en.wikipedia.org/wiki/%22Hello,_world!%22_program) 54 | -------------------------------------------------------------------------------- /tutorials/algorithms/scripts/L1_Eulerian_Q10.py: -------------------------------------------------------------------------------- 1 | # Taken from 2 | #https://discussions.udacity.com/t/problem-set-1-challenge-find-eulerian-tour/ 3 | #26214/8 4 | 5 | #### What this script does 6 | #- Goes through the edges fo the graph in order, trying each edge as the 7 | #starting point. 8 | #- If the staring point leads to a complete eulerian tour, it returns it. 9 | #- Otherwise, it tries again starting with the next edge in the graph 10 | 11 | #- While going through the tour, it selects the next edge as the first edge it 12 | #comes to in the graph that leaves from the current node (that hasn't already 13 | #been used). 14 | #- Is there a better selection method to find the best edge out of all possible 15 | #edges form the current node? 16 | 17 | 18 | 19 | def get_degree(graph): 20 | degree = {} 21 | for x, y in graph: 22 | degree[x] = degree.get(x, 0) + 1 23 | degree[y] = degree.get(y, 0) + 1 24 | return degree 25 | 26 | def eulerian_tour_is_possible(graph): 27 | degree = get_degree(graph) 28 | odd = 0 29 | for entry in degree: 30 | if degree[entry] % 2 != 0: 31 | odd += 1 32 | if odd == 0: return True 33 | return False 34 | 35 | def find_next_edge(node, graph): 36 | edges = find_all_edges(node, graph) 37 | for edge in edges: 38 | if node in edge: 39 | return edge 40 | return None 41 | 42 | def find_all_edges(node, graph): 43 | edges = [] 44 | for edge in graph: 45 | if node in edge: 46 | edges.append(edge) 47 | return edges 48 | 49 | def find_eulerian_tour(graph): 50 | if eulerian_tour_is_possible(graph): 51 | for i in range(len(graph)): 52 | tour = [] 53 | graph_copy = graph[:] # make copy of graph to do work on 54 | start_edge = graph_copy.pop(i) # change starting edge as loop 55 | # iterates 56 | tour.append(start_edge[0]) 57 | tour.append(start_edge[1]) 58 | while len(graph_copy) > 0: 59 | edge = find_next_edge(tour[-1], graph_copy) 60 | if edge == None: break # we've reached a node where no more 61 | # possible edges exist 62 | if tour[-1] == edge[0]: 63 | tour.append(edge[1]) 64 | else: 65 | tour.append(edge[0]) 66 | graph_copy.pop(graph_copy.index(edge)) 67 | if graph_copy == []: return tour # we've used all edges, tour 68 | # found! 69 | return None 70 | else: 71 | return None 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /tutorials/algorithms/scripts/L1_EulerianPath.py: -------------------------------------------------------------------------------- 1 | # Needs python 2.7 2 | # Eulerian Tour Ver 1 3 | # 4 | # Write a function, `create_tour` that takes as 5 | # input a list of nodes 6 | # and outputs a list of tuples representing 7 | # edges between nodes that have an Eulerian tour. 
8 | # 9 | 10 | def edge(x,y): 11 | """Ensures that each set of edges in the tour 12 | goes from a lower value to a higher value""" 13 | return (x, y) if x < y else (y, x) 14 | 15 | def create_tour(nodes): 16 | """For each node create edges which incrementally 17 | increase """ 18 | tour = [] 19 | l = len(nodes) 20 | for i in range(l): 21 | t = edge(nodes[i], nodes[(i+1) % l]) 22 | print nodes 23 | print t 24 | tour.append(t) 25 | return tour 26 | 27 | 28 | ############ 29 | 30 | def get_degree(tour): 31 | degree = {} 32 | for x, y in tour: 33 | degree[x] = degree.get(x, 0) + 1 34 | degree[y] = degree.get(y, 0) + 1 35 | return degree 36 | 37 | def check_edge(t, b, nodes): 38 | """ 39 | t: tuple representing an edge 40 | b: origin node 41 | nodes: set of nodes already visited 42 | 43 | if we can get to a new node from `b` following `t` 44 | then return that node, else return None 45 | """ 46 | if t[0] == b: 47 | if t[1] not in nodes: 48 | return t[1] 49 | elif t[1] == b: 50 | if t[0] not in nodes: 51 | return t[0] 52 | return None 53 | 54 | def connected_nodes(tour): 55 | """return the set of nodes reachable from 56 | the first node in `tour`""" 57 | a = tour[0][0] 58 | nodes = set([a]) 59 | explore = set([a]) 60 | while len(explore) > 0: 61 | # see what other nodes we can reach 62 | b = explore.pop() 63 | for t in tour: 64 | node = check_edge(t, b, nodes) 65 | if node is None: 66 | continue 67 | nodes.add(node) 68 | explore.add(node) 69 | return nodes 70 | 71 | def is_eulerian_tour(nodes, tour): 72 | # all nodes must be even degree 73 | # and every node must be in graph 74 | degree = get_degree(tour) 75 | for node in nodes: 76 | try: 77 | d = degree[node] 78 | if d % 2 == 1: 79 | print "Node %s has odd degree" % node 80 | return False 81 | except KeyError: 82 | print "Node %s was not in your tour" % node 83 | return False 84 | connected = connected_nodes(tour) 85 | if len(connected) == len(nodes): 86 | return True 87 | else: 88 | print "Your graph wasn't connected" 89 | return False 90 | 91 | def test(): 92 | nodes = [20, 21, 22, 23, 24, 25] 93 | tour = create_tour(nodes) 94 | return is_eulerian_tour(nodes, tour) 95 | 96 | print test() -------------------------------------------------------------------------------- /tutorials/exercism_py3/word_count/word_count_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | 4 | from wordcount2 import word_count 5 | 6 | 7 | # to be backwards compatible with the old Python 2.X 8 | def decode_if_needed(string): 9 | try: 10 | return string.decode('utf-8') 11 | except AttributeError: 12 | return string 13 | 14 | 15 | class WordCountTests(unittest.TestCase): 16 | 17 | def test_count_one_word(self): 18 | self.assertEqual( 19 | {'word': 1}, 20 | word_count('word') 21 | ) 22 | 23 | def test_count_one_of_each(self): 24 | self.assertEqual( 25 | {'one': 1, 'of': 1, 'each': 1}, 26 | word_count('one of each') 27 | ) 28 | 29 | def test_count_multiple_occurences(self): 30 | self.assertEqual( 31 | {'one': 1, 'fish': 4, 'two': 1, 'red': 1, 'blue': 1}, 32 | word_count('one fish two fish red fish blue fish') 33 | ) 34 | 35 | def test_preserves_punctuation(self): 36 | self.assertEqual( 37 | {'car': 1, 'carpet': 1, 'as': 1, 'java': 1, 'javascript': 1}, 38 | word_count('car : carpet as java : javascript!!&@$%^&') 39 | ) 40 | 41 | def test_include_numbers(self): 42 | self.assertEqual( 43 | {'testing': 2, '1': 1, '2': 1}, 44 | word_count('testing 1 2 testing') 45 | ) 46 | 47 | def 
test_mixed_case(self): 48 | self.assertEqual( 49 | [2, 3], 50 | sorted(list(word_count('go Go GO Stop stop').values())) 51 | ) 52 | 53 | def test_multiple_spaces(self): 54 | self.assertEqual( 55 | {'wait': 1, 'for': 1, 'it': 1}, 56 | word_count('wait for it') 57 | ) 58 | 59 | def test_newlines(self): 60 | self.assertEqual( 61 | {'rah': 2, 'ah': 3, 'roma': 2, 'ma': 1, 'ga': 2, 'oh': 1, 'la': 2, 62 | 'want': 1, 'your': 1, 'bad': 1, 'romance': 1}, 63 | word_count('rah rah ah ah ah\nroma roma ma\n' 64 | 'ga ga oh la la\nwant your bad romance') 65 | ) 66 | 67 | def test_tabs(self): 68 | self.assertEqual( 69 | {'rah': 2, 'ah': 3, 'roma': 2, 'ma': 1, 'ga': 2, 'oh': 1, 'la': 2, 70 | 'want': 1, 'your': 1, 'bad': 1, 'romance': 1}, 71 | word_count('rah rah ah ah ah\troma roma ma\tga ga oh la la\t' 72 | 'want your bad romance') 73 | ) 74 | 75 | def test_non_alphanumeric(self): 76 | self.assertEqual( 77 | {'hey': 1, 'my': 1, 'spacebar': 1, 'is': 1, 'broken': 1}, 78 | word_count('hey,my_spacebar_is_broken.') 79 | ) 80 | 81 | def test_unicode(self): 82 | self.assertEqual( 83 | {decode_if_needed('до'): 1, decode_if_needed('свидания'): 1}, 84 | word_count('до🖖свидания!') 85 | ) 86 | 87 | if __name__ == '__main__': 88 | unittest.main() 89 | -------------------------------------------------------------------------------- /windspeed/scripts/030-group_tseries.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import datetime as datetime 4 | import matplotlib.pyplot as plt 5 | 6 | # Creating a panel of timeseries for each group of stations. 7 | 8 | # Panel will have a timeseries of 00,06,12,18 ws if that hour has at least 14 9 | # obs per month. 10 | 11 | # An average over the group will be an extra plot in the panel. 12 | 13 | NAl=['60525Biskra','60549Mecheria','60550Elbayadh', 14 | '60555Touggourt','60559ElOued','60566Ghardaia','60580Ouargla', 15 | '60581HassiMessaoud'] 16 | 17 | CSar=['60607Timimoun','60611InAmenas','60620Adrar','60630InSalah', 18 | '62103Ghadames','62124Sebha'] 19 | 20 | WSa=['61223Tombouctou','61226Gao','61230NioroDuSahel','61498Kiffa', 21 | '61499AiounElAtrouss','61492Kaedi','61497Nema','61450Tidjika'] 22 | 23 | CSal=['61024Agadez','61045Goure','61052Niamey','64753Faya', 24 | '61017Bilma'] 25 | 26 | Egy=['62387Minya','62393Asyut','62405Luxor','62414Asswan', 27 | '62420Baharia','62423Farafra','62435Kharga'] 28 | 29 | Sud=['62600WadiHalfa','62640AbuHamed','62650Dongola','62660Karima', 30 | '62680Atbara'] 31 | 32 | 33 | stations=[NAl,CSar,WSa,CSal,Egy,Sud] 34 | 35 | group_names={'NAlgeria':NAl,'CSahara':CSar,'WSahel':WSa,'CSahel':CSal, 36 | 'Egypt':Egy,'Sudan':Sud} 37 | 38 | 39 | def read_file(fname): 40 | '''put the station name into read_file and read_file will return a 41 | dataFrame called wind which has the following columns a dataframe with a 42 | datetime index''' 43 | 44 | 45 | column_names=["year","month","day","hour","ws"] 46 | dtype={"year":int,"month":int,"day":int,"hour":int,"ws":float} 47 | 48 | datafile='/home/sophie/projects/windspeed/data/%s_allwinds.txt' %fname 49 | 50 | # specify the columns you want to group together. Can't include hour at 51 | # this point as it is not in the right format. 52 | date_spec = {'date_time': [0,1,2]} 53 | 54 | # when you use keep_dat_col it keeps them as objects, not as the dtype you 55 | # read them in as. 
55 | # read them in as. 56 | wind = pd.read_csv(datafile, sep=" ", names=column_names, 57 | parse_dates=date_spec, keep_date_col=True, index_col=False ) 58 | 59 | # Dealing with hour - going from 600, 1200 etc to 6, 12, 18 60 | wind["hour"]=(wind["hour"]/100).astype(int) 61 | 62 | # combining year, month, day that were parsed together into date_time with 63 | # hour, which is now in the correct format. 64 | wind['date_time'] = pd.to_datetime(wind.date_time) + \ 65 | wind.hour.astype('timedelta64[h]') 66 | 67 | # make datetime the index before making subsections. 68 | wind.index = wind['date_time'] 69 | 70 | # Adds extra columns where the value is kept if it meets the isin() criteria, 71 | # NaN if it doesn't. 72 | wind['ws_0']= wind['ws'][wind['hour'].isin([0])] 73 | wind['ws_06']= wind['ws'][wind['hour'].isin([6])] 74 | wind['ws_12']= wind['ws'][wind['hour'].isin([12])] 75 | wind['ws_18']= wind['ws'][wind['hour'].isin([18])] 76 | 77 | return wind 78 | 79 | def group_mean(group): 80 | '''loop over the items in the group list, calling read_file on each one, 81 | and return the group average''' 82 | pass 83 | 84 | def plot_tseries(): 85 | '''set up n+1 subplots where n is the number of stations in the group. Fill 86 | in each plot with the timeseries from each station and then a mean of all 87 | the stations. Output to an image file.''' 88 | pass 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /tutorials/Samsung/notebooks/029-Samsung_cleanup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 109, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "Populating the interactive namespace from numpy and matplotlib\n", 15 | " name\n", 16 | "1 tBodyAcc-mean()\n", 17 | "2 tBodyAcc-mean()\n", 18 | "3 tBodyAcc-mean()\n", 19 | "4 tBodyAcc-std()\n", 20 | "5 tBodyAcc-std()\n", 21 | " name\n", 22 | "1 tBodyAcc-mean()\n", 23 | "4 tBodyAcc-std()\n", 24 | "7 tBodyAcc-mad()\n", 25 | "10 tBodyAcc-max()\n", 26 | "13 tBodyAcc-min()\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "%pylab inline\n", 32 | "import pandas as pd\n", 33 | "\n", 34 | "# \n", 35 | "df = pd.read_csv('/home/sophie/projects/Samsung/data/UCI_HAR_Dataset/UCI_HAR_Dataset/features_copy.txt',sep=\" \",\n", 36 | " names = ['name'], dtype='str')\n", 37 | "\n", 38 | "print df[0:5] # Shows us some duplicates\n", 39 | "\n", 40 | "# This works to drop duplicate rows. Have to specify the column name.
\n", 41 | "df.drop_duplicates(['name'],inplace=True)\n", 42 | "\n", 43 | "print df[0:5] # Line above removes the duplicates" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 110, 49 | "metadata": { 50 | "collapsed": false 51 | }, 52 | "outputs": [ 53 | { 54 | "name": "stdout", 55 | "output_type": "stream", 56 | "text": [ 57 | " name\n", 58 | "303 fBodyAcc-bandsEnergy()-18\n", 59 | "304 fBodyAcc-bandsEnergy()-916\n", 60 | "305 fBodyAcc-bandsEnergy()-1724\n", 61 | "306 fBodyAcc-bandsEnergy()-2532\n", 62 | "307 fBodyAcc-bandsEnergy()-3340\n", 63 | "308 fBodyAcc-bandsEnergy()-4148\n", 64 | "309 fBodyAcc-bandsEnergy()-4956\n", 65 | "310 fBodyAcc-bandsEnergy()-5764\n", 66 | "311 fBodyAcc-bandsEnergy()-116\n", 67 | "312 fBodyAcc-bandsEnergy()-1732\n", 68 | "Empty DataFrame\n", 69 | "Columns: [name]\n", 70 | "Index: []\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "# Print out the lines that have numbers in them.\n", 76 | "print df[df.name.str.contains('[0-9]')][0:10] # You have to specify the column. \n", 77 | "\n", 78 | "\n", 79 | "# remove numbers, brackets and \"-\" from all columns\n", 80 | "\n", 81 | "df.name = df.name.str.replace('[()]', '') # remove brackets\n", 82 | "df.name = df.name.str.replace('-','') # remove -\n", 83 | "df.name = df.name.str.replace('[0-9]','') # remove any numbers\n", 84 | "\n", 85 | "# Select something which has a number in it.\n", 86 | "print df[df.name.str.contains('[0-9]')] # This is empty now.\n" 87 | ] 88 | } 89 | ], 90 | "metadata": { 91 | "kernelspec": { 92 | "display_name": "Python 2", 93 | "language": "python", 94 | "name": "python2" 95 | }, 96 | "language_info": { 97 | "codemirror_mode": { 98 | "name": "ipython", 99 | "version": 2 100 | }, 101 | "file_extension": ".py", 102 | "mimetype": "text/x-python", 103 | "name": "python", 104 | "nbconvert_exporter": "python", 105 | "pygments_lexer": "ipython2", 106 | "version": "2.7.11" 107 | } 108 | }, 109 | "nbformat": 4, 110 | "nbformat_minor": 0 111 | } 112 | -------------------------------------------------------------------------------- /tutorials/ThinkBayes/046-ImplimentingSuite.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "How to encapsulate the framework in an object - A Suite is a Pmf that provides \\__init\\__, Update and Print:" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 5, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "# This tells Python of that additional module import path. \n", 19 | "import os\n", 20 | "import sys\n", 21 | "module_path = os.path.abspath(os.path.join('..'))\n", 22 | "if module_path not in sys.path:\n", 23 | " sys.path.append(module_path)\n", 24 | " \n", 25 | "from thinkbayes import Pmf" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 6, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "class Suite(Pmf):\n", 37 | " '''Represents a suite of hypotheses and their probabilities.'''\n", 38 | " def __init__(self, hypo=tuple()):\n", 39 | " '''Initializes the distribution.'''\n", 40 | " def Update(self,data):\n", 41 | " '''Updates each hypothesis based on the data'''\n", 42 | " def Print(self):\n", 43 | " '''Prints the hypothese and their probabilities.'''" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "Suite is implemented in thinkbayes.py. 
To use Suite, write a class that inherits from it and provides Likelihood.\n", 51 | "e.g. using the Monty Hall problem" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 7, 57 | "metadata": { 58 | "collapsed": true 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "from thinkbayes import Suite" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 8, 68 | "metadata": { 69 | "collapsed": true 70 | }, 71 | "outputs": [], 72 | "source": [ 73 | "class Monty(Suite):\n", 74 | " def Likelihood(self, data, hypo):\n", 75 | " if hypo == data:\n", 76 | " return 0 \n", 77 | " elif hypo == 'A':\n", 78 | " return 0.5\n", 79 | " else:\n", 80 | " return 1" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "And, to use the class:" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 10, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "A 0.3333333333333333\n", 102 | "B 0.6666666666666666\n", 103 | "C 0.0\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "suite = Monty('ABC')\n", 109 | "suite.Update('C')\n", 110 | "suite.Print()" 111 | ] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.5.1" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 0 135 | } 136 | -------------------------------------------------------------------------------- /tutorials/ThinkBayes/046-Suite_m&m.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#### Using the Suite framework to solve the M&M problem\n", 8 | "\n", 9 | "- Two bags of m&ms (one from 94 and one from 96), with different proportions of colors.\n", 10 | "- You get an m&m from each bag. A yellow and a green, but you don't know which bag they came from. \n", 11 | "\n", 12 | "What is the probability that the yellow is from bag 1?"
13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 20, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import os\n", 24 | "import sys\n", 25 | "module_path = os.path.abspath(os.path.join('..'))\n", 26 | "if module_path not in sys.path:\n", 27 | " sys.path.append(module_path)\n", 28 | " \n", 29 | "from thinkbayes import Pmf, Suite" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "First, encode the color mixes from before and after 1995:" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 21, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "class M_and_M(Suite):\n", 48 | " \n", 49 | " # Encode the color mixes from before and after 1995\n", 50 | " mix94 = dict(brown=30, yellow=20, red=20, green=10, orange=10, tan=10)\n", 51 | " mix96 = dict(blue=24, green=20, orange=16, yellow=14, red=13)\n", 52 | " \n", 53 | " #Next, encode the hypotheses\n", 54 | " hypoA = dict(bag1=mix94, bag2=mix96)\n", 55 | " hypoB = dict(bag1=mix96, bag2=mix94)\n", 56 | " \n", 57 | " # Map the name of the hypothesis to the representation\n", 58 | " hypotheses = dict(A=hypoA, B=hypoB)\n", 59 | " \n", 60 | " # In this case the hypothesis, hypo, is a string, either A or B. The data is a tuple that specifies a bag and a color.\n", 61 | " def Likelihood(self, data, hypo):\n", 62 | " bag, color = data\n", 63 | " mix = self.hypotheses[hypo][bag]\n", 64 | " like = mix[color]\n", 65 | " return like\n", 66 | " " 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "Code to create the Suite and update it:" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 22, 79 | "metadata": { 80 | "collapsed": false 81 | }, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "A 0.7407407407407407\n", 88 | "B 0.2592592592592592\n" 89 | ] 90 | } 91 | ], 92 | "source": [ 93 | "suite = M_and_M('AB') # All of the hypotheses are passed to suite.\n", 94 | "\n", 95 | "suite.Update(('bag1','yellow')) # This tuple is unpacked inside Likelihood into bag, color. \n", 96 | "suite.Update(('bag2','green'))\n", 97 | "\n", 98 | "suite.Print()" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "We have picked a yellow from bag1 and a green from bag2. \n", 106 | "A is the probability that bag1 = 94, bag2 = 96 \n", 107 | "B is the probability that bag1 = 96, bag2 = 94 \n", 108 | "\n", 109 | "The posterior probability of A is approximately 20/27 - same result as earlier.
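Concretely, the likelihood of the data under A is 20 (yellow from the 1994 mix) x 20 (green from the 1996 mix) = 400, and under B it is 14 x 10 = 140, so the posterior for A is 400/(400+140) = 20/27.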
" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "collapsed": true 117 | }, 118 | "outputs": [], 119 | "source": [] 120 | } 121 | ], 122 | "metadata": { 123 | "kernelspec": { 124 | "display_name": "Python 3", 125 | "language": "python", 126 | "name": "python3" 127 | }, 128 | "language_info": { 129 | "codemirror_mode": { 130 | "name": "ipython", 131 | "version": 3 132 | }, 133 | "file_extension": ".py", 134 | "mimetype": "text/x-python", 135 | "name": "python", 136 | "nbconvert_exporter": "python", 137 | "pygments_lexer": "ipython3", 138 | "version": "3.5.1" 139 | } 140 | }, 141 | "nbformat": 4, 142 | "nbformat_minor": 0 143 | } 144 | -------------------------------------------------------------------------------- /tutorials/ThinkBayes/.ipynb_checkpoints/046-Suite_m&m-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#### Using the Suite framework to solve the M&M problem\n", 8 | "\n", 9 | "- Two bags of m&ms (one from 94 and one from 96), with different proportions of colors.\n", 10 | "- You get an m&m from each bag. A yellow and a green, but you don't know which bag they came from. \n", 11 | "\n", 12 | "What is the probability that the yellow is from bag 1?" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 20, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import os\n", 24 | "import sys\n", 25 | "module_path = os.path.abspath(os.path.join('..'))\n", 26 | "if module_path not in sys.path:\n", 27 | " sys.path.append(module_path)\n", 28 | " \n", 29 | "from thinkbayes import Pmf, Suite" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "First, encode the color mixes from before and after 1995:" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 21, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "class M_and_M(Suite):\n", 48 | " \n", 49 | " # Encode the color mixes from before and after 1995\n", 50 | " mix94 = dict(brown=30, yellow=20, red=20, green=10, orange=10, tan=10)\n", 51 | " mix96 = dict(blue=24, green=20, orange=16, yellow=14, red=13)\n", 52 | " \n", 53 | " #Next, encode the hypotheses\n", 54 | " hypoA = dict(bag1=mix94, bag2=mix96)\n", 55 | " hypoB = dict(bag1=mix96, bag2=mix94)\n", 56 | " \n", 57 | " # Map the name of the hypothesis to the representation\n", 58 | " hypotheses = dict(A=hypoA, B=hypoB)\n", 59 | " \n", 60 | " # In this case the hypothesis, hypo, is astring, either A or B. 
The data is a tuple that spcifies a bag and a color.\n", 61 | " def Likelihood(self, data, hypo):\n", 62 | " bag, color = data\n", 63 | " mix = self.hypotheses[hypo][bag]\n", 64 | " like = mix[color]\n", 65 | " return like\n", 66 | " " 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "Code to create the Suite and update it:" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 22, 79 | "metadata": { 80 | "collapsed": false 81 | }, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "A 0.7407407407407407\n", 88 | "B 0.2592592592592592\n" 89 | ] 90 | } 91 | ], 92 | "source": [ 93 | "suite = M_and_M('AB') # All of the hypotheses are passed to suite.\n", 94 | "\n", 95 | "suite.Update(('bag1','yellow')) # This tuple is unpacked inside Likelihood into bag, color. \n", 96 | "suite.Update(('bag2','green'))\n", 97 | "\n", 98 | "suite.Print()" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "We have picked a yellow from bag1 and a green from bag2. \n", 106 | "A is the probability that bag1 = 94, bag2 = 96 \n", 107 | "B is the probability that bag1 = 96, bag2 = 94 \n", 108 | "\n", 109 | "The posterior probability of A is approximately 20/27 - same result as earlier. " 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "collapsed": true 117 | }, 118 | "outputs": [], 119 | "source": [] 120 | } 121 | ], 122 | "metadata": { 123 | "kernelspec": { 124 | "display_name": "Python 3", 125 | "language": "python", 126 | "name": "python3" 127 | }, 128 | "language_info": { 129 | "codemirror_mode": { 130 | "name": "ipython", 131 | "version": 3 132 | }, 133 | "file_extension": ".py", 134 | "mimetype": "text/x-python", 135 | "name": "python", 136 | "nbconvert_exporter": "python", 137 | "pygments_lexer": "ipython3", 138 | "version": "3.5.1" 139 | } 140 | }, 141 | "nbformat": 4, 142 | "nbformat_minor": 0 143 | } 144 | -------------------------------------------------------------------------------- /windspeed/scripts/037-group_tseries.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import datetime as datetime 4 | import matplotlib.pyplot as plt 5 | 6 | # Creating a panel of timeseries for each group of stations. 7 | 8 | # Panel will have a timeseries of 00,06,12,18 ws if that hour has at least 14 9 | # obs per month. 10 | 11 | # An average over the group will be an extra plot in the panel. 
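# A minimal sketch of the screening idea, assuming a DataFrame `wind` indexed
# by datetime with a 'ws' column (the same names used below): a monthly mean
# that is only reported when the month has enough observations could be
# written as
#
#     monthly = wind['ws'].resample('M').agg(
#         lambda x: x.mean() if x.count() >= 14 else np.nan)
#
# The meanf/sdf helpers below apply the same kind of guard inside a
# groupby().agg() instead.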
12 | 13 | NAl=['60525Biskra','60549Mecheria','60550Elbayadh', 14 | '60555Touggourt','60559ElOued','60566Ghardaia','60580Ouargla', 15 | '60581HassiMessaoud'] 16 | 17 | CSar=['60607Timimoun','60611InAmenas','60620Adrar','60630InSalah', 18 | '62103Ghadames','62124Sebha'] 19 | 20 | WSa=['61223Tombouctou','61226Gao','61230NioroDuSahel','61498Kiffa', 21 | '61499AiounElAtrouss','61492Kaedi','61497Nema','61450Tidjika'] 22 | 23 | CSal=['61024Agadez','61045Goure','61052Niamey','64753Faya', 24 | '61017Bilma'] 25 | 26 | Egy=['62387Minya','62393Asyut','62405Luxor','62414Asswan', 27 | '62420Baharia','62423Farafra','62435Kharga'] 28 | 29 | Sud=['62600WadiHalfa','62640AbuHamed','62650Dongola','62660Karima', 30 | '62680Atbara'] 31 | 32 | 33 | stations=[NAl,CSar,WSa,CSal,Egy,Sud] 34 | 35 | group_names={'NAlgeria':NAl,'CSahara':CSar,'WSahel':WSa,'CSahel':CSal, 36 | 'Egypt':Egy,'Sudan':Sud} 37 | 38 | 39 | 40 | # Could these two functions be turned into lambda functions? 41 | # Would that be preferable or are these fine? 42 | 43 | def meanf(x): 44 | if x.count() > 10: # NB: the header asks for at least 14 obs; this checks > 10 45 | return x.mean() 46 | 47 | def sdf(x): 48 | if x.count() > 10: 49 | return x.std() 50 | 51 | def read_file(fname): 52 | '''take a station name and return a DataFrame (wind_group) of monthly 53 | means and standard deviations of ws, overall and at 00, 06, 12 and 18, 54 | grouped by year and month''' 55 | 56 | 57 | column_names=["year","month","day","hour","ws"] 58 | dtype={"year":int,"month":int,"day":int,"hour":int,"ws":float} 59 | 60 | datafile='/home/sophie/projects/windspeed/data/%s_allwinds.txt' %fname 61 | 62 | # specify the columns you want to group together. Can't include hour at 63 | # this point as it is not in the right format. 64 | date_spec = {'date_time': [0,1,2]} 65 | 66 | # when you use keep_date_col it keeps them as objects, not as the dtype you 67 | # read them in as. 68 | wind = pd.read_csv(datafile, sep=" ", names=column_names, 69 | parse_dates=date_spec, keep_date_col=True, index_col=False ) 70 | 71 | # Dealing with hour - going from 600, 1200 etc to 6, 12, 18 72 | wind["hour"]=(wind["hour"]/100).astype(int) 73 | 74 | # combining year, month, day that were parsed together into date_time with 75 | # hour, which is now in the correct format. 76 | wind['date_time'] = pd.to_datetime(wind.date_time) + \ 77 | wind.hour.astype('timedelta64[h]') 78 | 79 | # make datetime the index before making subsections. 80 | wind.index = wind['date_time'] 81 | 82 | # Adds extra columns where the value is kept if it meets the isin() criteria, 83 | # NaN if it doesn't. 84 | wind['ws_0']= wind['ws'][wind['hour'].isin([0])] 85 | wind['ws_06']= wind['ws'][wind['hour'].isin([6])] 86 | wind['ws_12']= wind['ws'][wind['hour'].isin([12])] 87 | wind['ws_18']= wind['ws'][wind['hour'].isin([18])] 88 | 89 | group = wind.groupby(['year', 'month']) 90 | 91 | wind_group = group['ws','ws_0','ws_06','ws_12','ws_18'].agg([meanf,sdf]) 92 | 93 | return wind_group 94 | 95 | 96 | def plot_tseries(name, group): 97 | '''set up n+1 subplots where n is the number of stations in the group. Fill in 98 | each plot with the timeseries from each station and then a mean of all the 99 | stations.
Output to a png file named after the group.''' 100 | 101 | 102 | fig = plt.figure(figsize=(10,10)) 103 | 104 | for i in range(len(group)): 105 | 106 | #read the file in for plotting 107 | wind_group = read_file(group[i]) 108 | 109 | # fig.add_subplot(nrows, ncols, num) 110 | 111 | ax = fig.add_subplot(int((len(group)+1)/2), 2, i+1) 112 | 113 | plt.title(s=group[i], fontsize=15) 114 | 115 | wind_group.ws_0['meanf']['1990':'1994'].plot(figsize=(8,8), c = 'm') 116 | wind_group.ws_06['meanf']['1990':'1994'].plot(figsize=(8,8), c = 'r') 117 | wind_group.ws_12['meanf']['1990':'1994'].plot(figsize=(8,8), c = 'b') 118 | wind_group.ws_18['meanf']['1990':'1994'].plot(figsize=(8,8), c = 'c') 119 | 120 | ax.legend(loc=4,prop={'size':6}) 121 | 122 | # layout, group title and save happen once, after the loop 123 | plt.tight_layout() # very nice! stops the titles overlapping 124 | 125 | fig.suptitle(name) # was group_strings[i], which was never defined 126 | 127 | fig.savefig('/home/sophie/projects/windspeed/output/%s.png' % name, 128 | dpi=125) 129 | 130 | if __name__ == '__main__': 131 | 132 | for name, group in group_names.items(): plot_tseries(name, group) -------------------------------------------------------------------------------- /windspeed/scripts/013-ws_tseries.py: -------------------------------------------------------------------------------- 1 | #The aim of this script is to produce a timeseries of windspeed for each 2 | #station, with lines for winds at 0000, 0600, 1200 and 1800 3 | 4 | import glob,os 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | class Analysis(object): 10 | '''take a file and read it into a dataframe 11 | then ask the user if they want to show statistics 12 | or make plots''' 13 | 14 | 15 | def __init__(self): 16 | 17 | #df with all values 18 | self.wind=wind 19 | 20 | #why can't I do this here? 21 | #wind_00['hour'] = wind[wind['hour'].isin([0000])] 22 | 23 | 24 | def split_df(self,wind): 25 | '''Split the original dataframe up into hours''' 26 | 27 | #self.wind_00= wind[wind['hour'].isin([0000])] 28 | #print self.wind_00['hour'][0:5] 29 | wind_00=wind[wind['hour'].isin([0])] 30 | wind_06=wind[wind['hour'].isin([6])] 31 | wind_12=wind[wind['hour'].isin([12])] 32 | wind_18=wind[wind['hour'].isin([18])] 33 | #do I need to have return here? 34 | 35 | #do you want to look at some stats for these? 36 | stats = raw_input("Do you want to see some data stats for the " 37 | "hours 00,06,12,18, y/n ? \n> ") 38 | if stats.lower() in ('y', 'yes'): # `== 'y' or 'Y'` was always True 39 | self.investigate(wind, wind_00, wind_06, wind_12, wind_18) 40 | 41 | else: pass 42 | 43 | #Ask user if they want a timeseries plot 44 | plots = raw_input("Do you want to look at a timeseries plot y/n ?" 45 | "\n>") 46 | 47 | if plots.lower() in ('y', 'yes'): 48 | self.plot_tseries(wind, wind_00, wind_06, wind_12, wind_18) 49 | 50 | else: pass 51 | 52 | def plot_tseries(self, wind, wind_00, wind_06, wind_12, wind_18): 53 | 54 | plt.plot(wind['date_time'],wind['ws']) 55 | 56 | #labels 57 | plt.xlabel("Time") 58 | plt.ylabel("wind-speed", size=10) 59 | 60 | 61 | #shrink the font so the x ticks stay readable 62 | plt.rc("font", size=7) 63 | 64 | 65 | #chopping the file extension off to put in the name of the image file 66 | fname = fname_list[location][:-4] 67 | 68 | #print the plot to the screen 69 | plt.show() 70 | 71 | #Ask user if they want to save the plot in a file 72 | qu = raw_input("Do you want to save the timeseries in a png y/n ?"
73 | "\n>") 74 | 75 | if qu == 'y' or 'Y' or 'Yes': 76 | path = '/home/sophie/projects/windspeed/output/' 77 | plt.savefig(path+'%stseries.png' % fname, format='png') 78 | 79 | else: pass 80 | 81 | def plot_hist(self, wind, wind_00, wind_06, wind_12, wind_18): 82 | pass 83 | 84 | 85 | def investigate(self, wind, wind_00, wind_06, wind_12, wind_18): 86 | 87 | print "-" * 10 88 | print "00 subset:" 89 | print wind_00.describe(percentiles=[.05,0.5,0.95]) 90 | print "-" * 10 91 | print "06 subset:" 92 | print wind_06.describe(percentiles=[.05,0.5,0.95]) 93 | print "-" * 10 94 | print "12 subset:" 95 | print wind_12.describe(percentiles=[.05,0.5,0.95]) 96 | print "-" * 10 97 | print "18 subset: " 98 | print wind_18.describe(percentiles=[.05,0.5,0.95]) 99 | print "-" * 10 100 | 101 | 102 | 103 | if __name__ == "__main__": 104 | 105 | #change the directory in here first 106 | os.chdir("/home/sophie/projects/windspeed/data/") 107 | fname_list = glob.glob('*allwinds.txt') 108 | 109 | #Choose a station from the list. 110 | print """ %s please select the index of the following files to make a plot\ 111 | of: """ % list(enumerate(fname_list)) 112 | 113 | location = int(raw_input("> ")) 114 | 115 | ##Group first 3 columns into a datetime object 116 | date_spec = {'date_time': [0,1,2]} 117 | column_names=["year","month","day","hour","ws"] 118 | 119 | #specify the data type of each column 120 | dtype={"year":int,"month":int,"day":int,"hour":int,"ws":float} 121 | 122 | #read in the data into a dataframe called wind 123 | wind = pd.read_csv(fname_list[location], sep=" ",parse_dates=date_spec, 124 | keep_date_col=True, names=column_names, index_col=False) 125 | 126 | #using keep_date_col=True puts forgets the dtypes specified for the columns 127 | #so we need to change them again here. 128 | wind[['year','month','day']]=wind[['year','month','day']].astype(int) 129 | 130 | #Dealing with hour - going from 600, 1200 etc to 6,12, 18 131 | wind["hour"]=(wind["hour"]/100).astype(int) 132 | 133 | #adding a date_time column with timestamp data 134 | wind['date_time'] = pd.to_datetime(wind.date_time) + \ 135 | wind.hour.astype('timedelta64[h]') 136 | 137 | b = Analysis() 138 | b.split_df(wind) 139 | -------------------------------------------------------------------------------- /gothonweb/bin/map.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Room(object): 4 | 5 | def __init__(self,name,description): 6 | self.name = name 7 | self.description = description 8 | self.paths = {} 9 | 10 | def go(self,direction): 11 | """ takes a direction and puts it into the 12 | empty paths dictionary. get() returns a value 13 | for a given key""" 14 | return self.paths.get(direction, None) 15 | 16 | def add_paths(self, paths): 17 | #adds the key-values from paths into paths 18 | self.paths.update(paths) 19 | 20 | central_corridor = Room("Central Corridor", 21 | """ 22 | The Gothons of Planet Percal #25 have invaded your ship and destroyed 23 | your entire crew. You are the last surviving member and your last 24 | mission is to get the neutron destruct bomb from the Weapons Armory, 25 | put it in the bridge, and blow the ship up after getting into an 26 | escape pod. 27 | 28 | You're running down the central corridor to the Weapons Armory when 29 | a Gothon jumps out, red scaly skin, dark grimy teeth, and evil clown costume 30 | flowing around his hate filled body. He's blocking the door to the 31 | Armory and about to pull a weapon to blast you. 
You can either: 1) eat him, 32 | 2) kick him in the nuts, or 3) blast him first. 33 | """) 34 | 35 | 36 | laser_weapon_armory = Room("Laser Weapon Armory", 37 | """ 38 | Lucky for you they made you learn Gothon insults in the academy. 39 | You tell the one Gothon joke you know: 40 | Lbhe zbgure vf fb sng, jura fur fvgf nebhaq gur ubhfr, fur fvgf nebhaq gur 41 | ubhfr. 42 | The Gothon stops, tries not to laugh, then busts out laughing and can't 43 | move. 44 | While he's laughing you run up and shoot him square in the head 45 | putting him down, then jump through the Weapon Armory door. 46 | 47 | You do a dive roll into the Weapon Armory, crouch and scan the room 48 | for more Gothons that might be hiding. It's dead quiet, too quiet. 49 | You stand up and run to the far side of the room and find the 50 | neutron bomb in its container. There's a keypad lock on the box 51 | and you need the code to get the bomb out. If you get the code 52 | wrong 10 times then the lock closes forever and you can't 53 | get the bomb. The code is 3 digits. (hint: the first two digits are 13) 54 | """) 55 | 56 | 57 | the_bridge = Room("The Bridge", 58 | """ 59 | The container clicks open and the seal breaks, letting gas out. 60 | You grab the neutron bomb and run as fast as you can to the 61 | bridge where you must place it in the right spot. 62 | 63 | You burst onto the Bridge with the neutron destruct bomb 64 | under your arm and surprise 5 Gothons who are trying to 65 | take control of the ship. Each of them has an even uglier 66 | clown costume than the last. They haven't pulled their 67 | weapons out yet, as they see the active bomb under your 68 | arm and don't want to set it off. 69 | """) 70 | 71 | 72 | escape_pod = Room("Escape Pod", 73 | """ 74 | You point your blaster at the bomb under your arm 75 | and the Gothons put their hands up and start to sweat. 76 | You inch backward to the door, open it, and then carefully 77 | place the bomb on the floor, pointing your blaster at it. 78 | You then jump back through the door, punch the close button 79 | and blast the lock so the Gothons can't get out. 80 | Now that the bomb is placed you run to the escape pod to 81 | get off this tin can. 82 | 83 | You rush through the ship desperately trying to make it to 84 | the escape pod before the whole ship explodes. It seems like 85 | hardly any Gothons are on the ship, so your run is clear of 86 | interference. You get to the chamber with the escape pods, and 87 | now need to pick one to take. Some of them could be damaged 88 | but you don't have time to look. There's 5 pods, which one 89 | do you take? 90 | """) 91 | 92 | 93 | the_end_winner = Room("The End", 94 | """ 95 | You jump into pod 2 and hit the eject button. 96 | The pod easily slides out into space heading to 97 | the planet below. As it flies to the planet, you look 98 | back and see your ship implode then explode like a 99 | bright star, taking out the Gothon ship at the same 100 | time. You won! 101 | """) 102 | 103 | 104 | the_end_loser = Room("The End", 105 | """ 106 | You jump into a random pod and hit the eject button. 107 | The pod escapes out into the void of space, then 108 | implodes as the hull ruptures, crushing your body 109 | into jam jelly.
110 | """ 111 | ) 112 | 113 | escape_pod.add_paths({ 114 | '2': the_end_winner, 115 | '*': the_end_loser 116 | }) 117 | 118 | generic_death = Room("death", "You died.") 119 | 120 | the_bridge.add_paths({ 121 | 'throw the bomb': generic_death, 122 | 'slowly place the bomb': escape_pod 123 | }) 124 | 125 | laser_weapon_armory.add_paths({ 126 | '132': the_bridge, 127 | '*': laser_weapon_armory 128 | }) 129 | 130 | central_corridor.add_paths({ 131 | '1': generic_death, 132 | '2': laser_weapon_armory, 133 | '3': laser_weapon_armory 134 | }) 135 | 136 | START = central_corridor 137 | 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /tutorials/ThinkBayes/046-MontyHall_framework.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Here we are setting up the framework to investigate what happens to the probability that the car is behind A,B,C depending on some new data. This new data is Monty opening door B and there being no car behind it. The likelihood that this new data is factored into the Likelihood function. " 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 20, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "# This tells Python of that additional module import path. \n", 19 | "import os\n", 20 | "import sys\n", 21 | "module_path = os.path.abspath(os.path.join('..'))\n", 22 | "if module_path not in sys.path:\n", 23 | " sys.path.append(module_path)\n", 24 | " \n", 25 | "from thinkbayes import Pmf" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "In this example the car is equally likely to be behind doors A, B or C for the PRIOR, p(H). \n", 33 | "Working out the Likelihood is tricky:\n", 34 | " - If car is behind A, there is a 50% chance that Monty will pick B or C and both have no car.\n", 35 | " - If car is behind B, there is a 0% chance than Monty will pick B and there will be no car!\n", 36 | " - If car is behind C, there is a 100% chance that Monty will pick B and there will be no car. You have picked A so he has not choice but to pick C. \n", 37 | "\n", 38 | "Likelihood is set up here that if you pick B, likelihood will be 0.5 ,0 ,1 for ABC. \n", 39 | "What happens if data is not B? What are we saying in real terms if we do that? \n", 40 | "As the hypothesis is that \"car is behind door x\" then the liklihood that Monty chooses that door and there is no car behind it is always going to be 0. That leaves an equal chance of the car being behind B or C. 
" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 16, 46 | "metadata": { 47 | "collapsed": false 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "class Monty(Pmf):\n", 52 | " \n", 53 | " def __init__(self,hypos):\n", 54 | " Pmf.__init__(self)\n", 55 | " for hypo in hypos:\n", 56 | " self.Set(hypo,1)\n", 57 | " self.Normalize()\n", 58 | " \n", 59 | " def Update(self,data):\n", 60 | " for hypo in self.Values():\n", 61 | " like = self.Likelihood(data,hypo)\n", 62 | " self.Mult(hypo, like)\n", 63 | " self.Normalize()\n", 64 | " \n", 65 | " # So far code is the same as in the Cookie problem\n", 66 | " # Likelihood, however, requires some work:\n", 67 | " \n", 68 | " def Likelihood(self, data, hypo):\n", 69 | " if hypo == data:\n", 70 | " return 0 \n", 71 | " elif hypo == 'A':\n", 72 | " return 0.5\n", 73 | " else:\n", 74 | " return 1 " 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 17, 80 | "metadata": { 81 | "collapsed": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "hypos = 'ABC' # for hypo in hypos will break this up into 'A', 'B', 'C'.\n", 86 | "pmf = Monty(hypos) # Class Monty inherits from class Pmf." 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 18, 92 | "metadata": { 93 | "collapsed": true 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "# Calling update is pretty much the same\n", 98 | "data = 'A'\n", 99 | "pmf.Update(data)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 19, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "C 0.5\n", 114 | "B 0.5\n", 115 | "A 0.0\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "# Now to print out the results (Same as Cookie problem)\n", 121 | "for hypo, prob in pmf.Items():\n", 122 | " print (hypo, prob)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "The only thing that is different here is that writing Likelihood is a little more complicated. " 130 | ] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": { 135 | "display_name": "Python 3", 136 | "language": "python", 137 | "name": "python3" 138 | }, 139 | "language_info": { 140 | "codemirror_mode": { 141 | "name": "ipython", 142 | "version": 3 143 | }, 144 | "file_extension": ".py", 145 | "mimetype": "text/x-python", 146 | "name": "python", 147 | "nbconvert_exporter": "python", 148 | "pygments_lexer": "ipython3", 149 | "version": "3.5.1" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 0 154 | } 155 | -------------------------------------------------------------------------------- /tutorials/ThinkBayes/.ipynb_checkpoints/046-MontyHall_framework-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Here we are setting up the framework to investigate what happens to the probability that the car is behind A,B,C depending on some new data. This new data is Monty opening door B and there being no car behind it. The likelihood that this new data is factored into the Likelihood function. " 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 20, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "# This tells Python of that additional module import path. 
\n", 19 | "import os\n", 20 | "import sys\n", 21 | "module_path = os.path.abspath(os.path.join('..'))\n", 22 | "if module_path not in sys.path:\n", 23 | " sys.path.append(module_path)\n", 24 | " \n", 25 | "from thinkbayes import Pmf" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "In this example the car is equally likely to be behind doors A, B or C for the PRIOR, p(H). \n", 33 | "Working out the Likelihood is tricky:\n", 34 | " - If car is behind A, there is a 50% chance that Monty will pick B or C and both have no car.\n", 35 | " - If car is behind B, there is a 0% chance than Monty will pick B and there will be no car!\n", 36 | " - If car is behind C, there is a 100% chance that Monty will pick B and there will be no car. You have picked A so he has not choice but to pick C. \n", 37 | "\n", 38 | "Likelihood is set up here that if you pick B, likelihood will be 0.5 ,0 ,1 for ABC. \n", 39 | "What happens if data is not B? What are we saying in real terms if we do that? \n", 40 | "As the hypothesis is that \"car is behind door x\" then the liklihood that Monty chooses that door and there is no car behind it is always going to be 0. That leaves an equal chance of the car being behind B or C. " 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 16, 46 | "metadata": { 47 | "collapsed": false 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "class Monty(Pmf):\n", 52 | " \n", 53 | " def __init__(self,hypos):\n", 54 | " Pmf.__init__(self)\n", 55 | " for hypo in hypos:\n", 56 | " self.Set(hypo,1)\n", 57 | " self.Normalize()\n", 58 | " \n", 59 | " def Update(self,data):\n", 60 | " for hypo in self.Values():\n", 61 | " like = self.Likelihood(data,hypo)\n", 62 | " self.Mult(hypo, like)\n", 63 | " self.Normalize()\n", 64 | " \n", 65 | " # So far code is the same as in the Cookie problem\n", 66 | " # Likelihood, however, requires some work:\n", 67 | " \n", 68 | " def Likelihood(self, data, hypo):\n", 69 | " if hypo == data:\n", 70 | " return 0 \n", 71 | " elif hypo == 'A':\n", 72 | " return 0.5\n", 73 | " else:\n", 74 | " return 1 " 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 17, 80 | "metadata": { 81 | "collapsed": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "hypos = 'ABC' # for hypo in hypos will break this up into 'A', 'B', 'C'.\n", 86 | "pmf = Monty(hypos) # Class Monty inherits from class Pmf." 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 18, 92 | "metadata": { 93 | "collapsed": true 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "# Calling update is pretty much the same\n", 98 | "data = 'A'\n", 99 | "pmf.Update(data)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 19, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "C 0.5\n", 114 | "B 0.5\n", 115 | "A 0.0\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "# Now to print out the results (Same as Cookie problem)\n", 121 | "for hypo, prob in pmf.Items():\n", 122 | " print (hypo, prob)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "The only thing that is different here is that writing Likelihood is a little more complicated. 
" 130 | ] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": { 135 | "display_name": "Python 3", 136 | "language": "python", 137 | "name": "python3" 138 | }, 139 | "language_info": { 140 | "codemirror_mode": { 141 | "name": "ipython", 142 | "version": 3 143 | }, 144 | "file_extension": ".py", 145 | "mimetype": "text/x-python", 146 | "name": "python", 147 | "nbconvert_exporter": "python", 148 | "pygments_lexer": "ipython3", 149 | "version": "3.5.1" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 0 154 | } 155 | -------------------------------------------------------------------------------- /tutorials/ThinkBayes/.ipynb_checkpoints/047-Dice-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### The Dice Problem\n", 8 | "\n", 9 | "I have 5 dice - a 4-sided, 6-sided, 8-sided and 20-sided. \n", 10 | "If I select a die from the box at random and get a 6, what is the probability that it was each of the dice.\n", 11 | "Here we will: \n", 12 | "1) Choose a representation for the hypotheses \n", 13 | "2) Choose a representation for the data \n", 14 | "3) Write a likelihood function \n", 15 | "\n", 16 | "Previously we used strings to represent hypotheses and data, here we will use numbers. \n", 17 | "Specifically 4,6,8,12 and 20 to represent hypotheses:" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 7, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import os\n", 29 | "import sys\n", 30 | "module_path = os.path.abspath(os.path.join('..'))\n", 31 | "if module_path not in sys.path:\n", 32 | " sys.path.append(module_path)\n", 33 | " \n", 34 | "from thinkbayes import Pmf, Suite" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 8, 40 | "metadata": { 41 | "collapsed": false 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "class Dice(Suite):\n", 46 | " \n", 47 | " #notice there are no class variables here. \n", 48 | " \n", 49 | " def Likelihood(self, data, hypo):\n", 50 | " if hypo < data:\n", 51 | " return 0 \n", 52 | " else:\n", 53 | " return 1.0/hypo\n", 54 | "\n", 55 | "\n", 56 | "# We use integers to represent hypotheses\n", 57 | "suite = Dice([4 ,6 ,8 ,12 ,20 ])" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "If hypo < data this means the roll is greater than the number of sides on the die. This is not possible to the likelihood is 0.\n", 65 | "\n", 66 | "Otherwise the question is, \"Given that there are hypo sides, what is the chance of rolling data?\" \n", 67 | "The answer is 1/hypo, regardless of data." 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 9, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [ 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "4 0.0\n", 82 | "6 0.3921568627450979\n", 83 | "8 0.2941176470588235\n", 84 | "12 0.19607843137254896\n", 85 | "20 0.11764705882352941\n" 86 | ] 87 | } 88 | ], 89 | "source": [ 90 | "# Update hypothesis if I roll a 6\n", 91 | "suite.Update(6)\n", 92 | "suite.Print()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "After we roll a 6, the probability for the 4-sided die is 0. 
6-sided is most likely, though there is still almost a 12% chance for the 20-sided die" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 13, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "4 0.0\n", 114 | "6 0.0\n", 115 | "8 0.9965835404647062\n", 116 | "12 0.0034137843517224823\n", 117 | "20 2.6751835712673812e-06\n", 118 | "-----\n", 119 | "4 0.0\n", 120 | "6 0.0\n", 121 | "8 0.9977204760988618\n", 122 | "12 0.0022784526069342752\n", 123 | "20 1.0712942038485414e-06\n", 124 | "-----\n", 125 | "4 0.0\n", 126 | "6 0.0\n", 127 | "8 0.9984794472645385\n", 128 | "12 0.0015201238918041802\n", 129 | "20 4.2884365717293543e-07\n", 130 | "-----\n", 131 | "4 0.0\n", 132 | "6 0.0\n", 133 | "8 0.9989858984203864\n", 134 | "12 0.0010139299551430545\n", 135 | "20 1.7162447051522972e-07\n", 136 | "-----\n", 137 | "4 0.0\n", 138 | "6 0.0\n", 139 | "8 0.9993237494202397\n", 140 | "12 0.0006761819067551149\n", 141 | "20 6.867300515001654e-08\n", 142 | "-----\n", 143 | "4 0.0\n", 144 | "6 0.0\n", 145 | "8 0.999549082940396\n", 146 | "12 0.00045088958420803747\n", 147 | "20 2.7475395980645105e-08\n", 148 | "-----\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "# What if we roll a few more times an get 6,8,7,7,5 and 4 from the same dice?\n", 154 | "for roll in [6,8,7,7,5,4]:\n", 155 | " suite.Update(roll)\n", 156 | " suite.Print()\n", 157 | " print('-----')\n" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "Now the probability is 94% that we roll the 8-sided die and less than 1% for the 20-sided." 165 | ] 166 | } 167 | ], 168 | "metadata": { 169 | "kernelspec": { 170 | "display_name": "Python 3", 171 | "language": "python", 172 | "name": "python3" 173 | }, 174 | "language_info": { 175 | "codemirror_mode": { 176 | "name": "ipython", 177 | "version": 3 178 | }, 179 | "file_extension": ".py", 180 | "mimetype": "text/x-python", 181 | "name": "python", 182 | "nbconvert_exporter": "python", 183 | "pygments_lexer": "ipython3", 184 | "version": "3.5.1" 185 | } 186 | }, 187 | "nbformat": 4, 188 | "nbformat_minor": 0 189 | } 190 | -------------------------------------------------------------------------------- /tutorials/ThinkBayes/047-Dice.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "### The Dice Problem\n", 10 | "\n", 11 | "I have 5 dice - a 4-sided, 6-sided, 8-sided and 20-sided. \n", 12 | "If I select a die from the box at random and get a 6, what is the probability that it was each of the dice.\n", 13 | "Here we will: \n", 14 | "1) Choose a representation for the hypotheses \n", 15 | "2) Choose a representation for the data \n", 16 | "3) Write a likelihood function \n", 17 | "\n", 18 | "Previously we used strings to represent hypotheses and data, here we will use numbers. 
\n", 19 | "Specifically 4,6,8,12 and 20 to represent hypotheses:" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 7, 25 | "metadata": { 26 | "collapsed": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "import os\n", 31 | "import sys\n", 32 | "module_path = os.path.abspath(os.path.join('..'))\n", 33 | "if module_path not in sys.path:\n", 34 | " sys.path.append(module_path)\n", 35 | " \n", 36 | "from thinkbayes import Pmf, Suite" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 8, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "class Dice(Suite):\n", 48 | " \n", 49 | " #notice there are no class variables here. \n", 50 | " \n", 51 | " def Likelihood(self, data, hypo):\n", 52 | " if hypo < data:\n", 53 | " return 0 \n", 54 | " else:\n", 55 | " return 1.0/hypo\n", 56 | "\n", 57 | "\n", 58 | "# We use integers to represent hypotheses\n", 59 | "suite = Dice([4 ,6 ,8 ,12 ,20 ])" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "If hypo < data this means the roll is greater than the number of sides on the die. This is not possible to the likelihood is 0.\n", 67 | "\n", 68 | "Otherwise the question is, \"Given that there are hypo sides, what is the chance of rolling data?\" \n", 69 | "The answer is 1/hypo, regardless of data." 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 9, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "4 0.0\n", 84 | "6 0.3921568627450979\n", 85 | "8 0.2941176470588235\n", 86 | "12 0.19607843137254896\n", 87 | "20 0.11764705882352941\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "# Update hypothesis if I roll a 6\n", 93 | "suite.Update(6)\n", 94 | "suite.Print()" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "After we roll a 6, the probability for the 4-sided die is 0. 
6-sided is most likely, though there is still almost a 12% chance for the 20-sided die" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 13, 107 | "metadata": { 108 | "collapsed": false 109 | }, 110 | "outputs": [ 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "4 0.0\n", 116 | "6 0.0\n", 117 | "8 0.9965835404647062\n", 118 | "12 0.0034137843517224823\n", 119 | "20 2.6751835712673812e-06\n", 120 | "-----\n", 121 | "4 0.0\n", 122 | "6 0.0\n", 123 | "8 0.9977204760988618\n", 124 | "12 0.0022784526069342752\n", 125 | "20 1.0712942038485414e-06\n", 126 | "-----\n", 127 | "4 0.0\n", 128 | "6 0.0\n", 129 | "8 0.9984794472645385\n", 130 | "12 0.0015201238918041802\n", 131 | "20 4.2884365717293543e-07\n", 132 | "-----\n", 133 | "4 0.0\n", 134 | "6 0.0\n", 135 | "8 0.9989858984203864\n", 136 | "12 0.0010139299551430545\n", 137 | "20 1.7162447051522972e-07\n", 138 | "-----\n", 139 | "4 0.0\n", 140 | "6 0.0\n", 141 | "8 0.9993237494202397\n", 142 | "12 0.0006761819067551149\n", 143 | "20 6.867300515001654e-08\n", 144 | "-----\n", 145 | "4 0.0\n", 146 | "6 0.0\n", 147 | "8 0.999549082940396\n", 148 | "12 0.00045088958420803747\n", 149 | "20 2.7475395980645105e-08\n", 150 | "-----\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "# What if we roll a few more times and get 6,8,7,7,5 and 4 from the same die?\n", 156 | "for roll in [6,8,7,7,5,4]:\n", 157 | " suite.Update(roll)\n", 158 | " suite.Print()\n", 159 | " print('-----')\n" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "Now the probability is over 99.9% that we rolled the 8-sided die; the 12-sided is below 0.05% and the 20-sided is effectively zero." 167 | ] 168 | } 169 | ], 170 | "metadata": { 171 | "kernelspec": { 172 | "display_name": "Python 3", 173 | "language": "python", 174 | "name": "python3" 175 | }, 176 | "language_info": { 177 | "codemirror_mode": { 178 | "name": "ipython", 179 | "version": 3 180 | }, 181 | "file_extension": ".py", 182 | "mimetype": "text/x-python", 183 | "name": "python", 184 | "nbconvert_exporter": "python", 185 | "pygments_lexer": "ipython3", 186 | "version": "3.5.1" 187 | } 188 | }, 189 | "nbformat": 4, 190 | "nbformat_minor": 0 191 | } 192 | -------------------------------------------------------------------------------- /Titanic/bin/clean_test_53.py: -------------------------------------------------------------------------------- 1 | # This script will clean up the test data for the Titanic competition using a 2 | # similar method to the notebook https://www.kaggle.com/creepykoala/ 3 | # titanic/study-of-tree-and-forest-algorithms which I have already applied to 4 | # the training data 5 | 6 | # Import libraries 7 | 8 | import numpy as np 9 | from numpy.random import random_integers 10 | import pandas as pd 11 | import matplotlib.pyplot as plt 12 | import sklearn 13 | from sklearn.cross_validation import train_test_split 14 | from sklearn.tree import DecisionTreeClassifier 15 | from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier 16 | from scipy.stats import pointbiserialr, spearmanr 17 | 18 | # Load the test data 19 | df = pd.read_csv('/home/sophie/projects/Titanic/data/test.csv', header=0) 20 | 21 | # People with stronger titles tend to have more help on board. Hence, we will 22 | # categorize passengers based on titles.
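# The extraction below works on names shaped like "Braund, Mr. Owen Harris":
# split(',')[1] gives " Mr. Owen Harris", split('.')[0] gives " Mr", and
# strip() leaves "Mr", which is then mapped through Title_Dictionary.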
23 | Title_Dictionary = { 24 | "Capt": "Officer", 25 | "Col": "Officer", 26 | "Major": "Officer", 27 | "Jonkheer": "Royalty", 28 | "Don": "Royalty", 29 | "Sir" : "Royalty", 30 | "Dr": "Officer", 31 | "Rev": "Officer", 32 | "the Countess":"Royalty", 33 | "Dona": "Royalty", 34 | "Mme": "Mrs", 35 | "Mlle": "Miss", 36 | "Ms": "Mrs", 37 | "Mr" : "Mr", 38 | "Mrs" : "Mrs", 39 | "Miss" : "Miss", 40 | "Master" : "Master", 41 | "Lady" : "Royalty" 42 | } 43 | df['Title'] = df['Name'].apply(lambda x: 44 | Title_Dictionary[x.split(',')[1].split('.')[0].strip()]) 45 | 46 | # Extract the letters from the beginning of each element of 'Ticket' 47 | def Ticket_Prefix(s): 48 | s = s.split()[0] # split() with no arguments splits on whitespace 49 | if s.isdigit(): 50 | return 'NoClue' 51 | else: 52 | return s 53 | 54 | df['TicketPrefix'] = df['Ticket'].apply(lambda x: Ticket_Prefix(x)) 55 | 56 | # Make an array where null values are False. 57 | mask_Age = df.Age.notnull() 58 | 59 | # New dataframe where all rows have a value for age. 60 | Age_Sex_Title_Pclass = df.loc[mask_Age, ["Age", "Title", "Sex", "Pclass"]] 61 | 62 | # Groupby object to group by Title, Pclass and Sex 63 | Filler_Ages_1 = Age_Sex_Title_Pclass.groupby(by = ["Title", "Pclass", 64 | "Sex"]).median() 65 | 66 | # This moves both Sex and Pclass into column headers and does so in that order. 67 | Filler_Ages = Filler_Ages_1.Age.unstack(level = -1).unstack(level = -1) 68 | 69 | mask_Age = df.Age.isnull() # A mask where null values are True 70 | 71 | # New DataFrame with missing values for age 72 | Age_Sex_Title_Pclass_missing = df.loc[mask_Age, ["Title", "Sex", "Pclass"]] 73 | 74 | # Look-up function for the calculated median ages. 75 | def Age_filler(row): 76 | if row.Sex == "female": 77 | age = Filler_Ages.female.loc[row["Title"], row["Pclass"]] 78 | return age 79 | elif row.Sex == "male": 80 | age = Filler_Ages.male.loc[row["Title"], row["Pclass"]] 81 | return age 82 | 83 | # Make a new column on the "missing" dataframe and add the median value to 84 | # each row. 85 | Age_Sex_Title_Pclass_missing["Age"] = Age_Sex_Title_Pclass_missing.apply( 86 | Age_filler, axis=1) 87 | 88 | 89 | # Re-form the 'Age' column from the known and filled-in parts. 90 | df["Age"] = pd.concat([Age_Sex_Title_Pclass["Age"], 91 | Age_Sex_Title_Pclass_missing["Age"]]) 92 | 93 | # Fill missing fares with the mean of all fares. 94 | df['Fare'] = df['Fare'].fillna(value=df.Fare.mean()) 95 | 96 | df['FamilySize'] = df['SibSp'] + df['Parch'] 97 | df = df.drop(['Ticket', 'Cabin'], axis=1) 98 | 99 | # get_dummies splits a categorical column into separate 0/1 indicator 100 | # columns, one per category. 
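# For example (illustrative toy data, not part of this dataset):
#   pd.get_dummies(pd.Series(['male', 'female', 'male']), prefix='Sex')
#   gives columns Sex_female = [0, 1, 0] and Sex_male = [1, 0, 1].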
101 | dummies_Sex = pd.get_dummies(df['Sex'], prefix='Sex') 102 | 103 | # Making dummies for the other categorical features 104 | dummies_Embarked = pd.get_dummies(df['Embarked'], prefix = 'Embarked') 105 | dummies_Pclass = pd.get_dummies(df['Pclass'], prefix = 'Pclass') 106 | dummies_Titles = pd.get_dummies(df['Title'], prefix='Title') 107 | dummies_TicketPrefix = pd.get_dummies(df['TicketPrefix'], prefix='TicketPrefix') 108 | 109 | # Make new dataframes which have the dummies added on to the end 110 | df = pd.concat([df,dummies_Sex, dummies_Embarked, dummies_Pclass, 111 | dummies_Titles, dummies_TicketPrefix], axis = 1) 112 | 113 | # Drop the categorical data 114 | df = df.drop(['Sex', 'Embarked','Pclass','Title','Name','TicketPrefix'], axis=1) 115 | 116 | # Set PassengerId as the index: 117 | df = df.set_index(['PassengerId']) 118 | 119 | # FEATURE SELECTION 120 | # To select features we correlate each feature against Survived in the 121 | # training data. We need to use different algorithms for the different data 122 | # types: 123 | # - Spearman-Rank correlation for nominal vs nominal data 124 | # - Point-Biserial correlation for nominal vs continuous data 125 | 126 | best_features = df[['Title_Mr', 'Sex_male', 'Sex_female', 'Title_Mrs', 127 | 'Title_Miss', 'Pclass_3', 'Pclass_1', 'Fare', 'Embarked_C', 128 | 'Embarked_S']] 129 | 130 | # Output this to csv to be read in for making a prediction 131 | best_features.to_csv('/home/sophie/projects/Titanic/data/clean_test_53.csv', 132 | sep = " ") 133 | 134 | -------------------------------------------------------------------------------- /TOdo.md: -------------------------------------------------------------------------------- 1 | # Learning resources to work through 2 | 3 | ### General Data Science stats 4 | 5 | - Do [this](http://nbviewer.jupyter.org/github/nborwankar/LearnDataScience/tree/master/notebooks/) before the Machine Learning below. 6 | 7 | - [Harvard CS109 course](http://cs109.github.io/2015/pages/videos.html). Lots of tutorials and lectures covering everything from pandas and web 8 | scraping to Bayesian stats. 9 | - perhaps start with the homeworks and labs, which also have solutions [here](https://github.com/cs109/content) 10 | 11 | #### Bayesian Statistics 12 | 13 | - Very important to get a decent grounding in this: [Probabilistic Programming and Bayesian Methods for 14 | Hackers](http://nbviewer.jupyter.org/github/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/blob/master/ 15 | Prologue/Prologue.ipynb) 16 | 17 | - Recommended reading from S2DS: [Think Bayes](http://www.greenteapress.com/thinkbayes/thinkbayes.pdf). 18 | - Some problems, and solutions, from Allen Downey's 19 | [blog](http://allendowney.blogspot.co.uk/2011/10/all-your-bayes-are-belong-to-us.html) 20 | 21 | - From the Harvard course: 22 | - [Bayesian Tomatoes](http://nbviewer.jupyter.org/github/cs109/content/blob/master/HW3.ipynb) 23 | - Lab 6: [Bayesianism, with MCMC](http://nbviewer.jupyter.org/github/cs109/content/blob/master/labs/lab6/BayesLinear.ipynb) 24 | 25 | 26 | ### Machine Learning 27 | 28 | - Work through the ISLR book (downloaded) with [this](https://github.com/JWarmenhoven/ISLR-python) Python repo for guidance. 29 | 30 | - Learn and understand [this introduction](http://www.astroml.org/sklearn_tutorial/general_concepts.html) from the Python library scikit-learn. I 31 | also 32 | like this [reference map](http://scikit-learn.org/stable/tutorial/machine_learning_map/index.html).
33 | 34 | - Clone [this](https://github.com/ogrisel/sklearn_pycon2014) repository and work through it. 35 | 36 | - Do the MonkeyLearn tutorials [here](https://blog.monkeylearn.com/getting-actionable-insights-from-reviews-using-machine-learning-part1/) 37 | and [here](https://blog.monkeylearn.com/hacker-news-categorizer-with-monkeylearn/) 38 | 39 | 40 | 41 | 42 | 43 | - [Very quick and dirty introduction to Random Forests](http://blog.yhat.com/posts/random-forests-in-python.html) using Python and iris 44 | data. 45 | 46 | - From the Harvard data science course, Lab 4: [Scikit-Learn, Regression, and 47 | PCA](http://nbviewer.jupyter.org/github/cs109/content/blob/master/labs/lab4/Lab4full.ipynb) 48 | 49 | - [Statistical Natural Language Processing](http://nbviewer.jupyter.org/url/norvig.com/ipython/How%20to%20Do%20Things%20with%20Words.ipynb) 50 | 51 | - A cheatsheet of sorts for scikit-learn in Python using pandas - ['Python Machine 52 | Learning'](https://github.com/rasbt/python-machine-learning-book) 53 | 54 | - `scikit-learn` for large data sets: [article](https://www.opendatascience.com/blog/riding-on-large-data-with-scikit-learn/) and 55 | [tutorial](https://github.com/rasbt/pattern_classification/blob/master/machine_learning/scikit-learn/outofcore_modelpersistence.ipynb) 56 | 57 | ### Social Network Analysis 58 | 59 | - Udacity course on [Algorithms](https://classroom.udacity.com/courses/cs215/lessons/48311839/concepts/486877000923) 60 | 61 | 62 | ### SQL 63 | 64 | - Complete [this](http://sol.gfxile.net/g3/) excellent tutorial playing around with astronomy data. Done! 65 | 66 | - [SQL Zoo](http://sqlzoo.net/) is a good tutorial site. Work through these. 67 | 68 | - Do [this](http://www.sqlcourse.com/) basic tutorial, followed by [this](http://www.sqlcourse2.com/) more advanced one. Looks good, with 69 | lots of questions (+ answers!). 70 | 71 | - Dip into [this](http://dev.mysql.com/doc/refman/5.5/en/examples.html) tutorial, which goes through common queries in MySQL. 72 | 73 | 74 | ### Pandas and Python 75 | 76 | - Start working through [exercism.io](http://exercism.io/languages/python#exercises) 77 | 78 | - [How to think like a computer scientist](http://interactivepython.org/runestone/static/thinkcspy/toc.html) with clear explanations and 79 | videos; it tests you as you go through. 80 | 81 | - Work through [these Python 3 tutorial videos](https://www.youtube.com/playlist?list=PL1A2CSdiySGJd0LJRRSwQZbPZaDP0q67j). They are nice 82 | and short. 83 | 84 | - [Matplotlib visualisation tutorial](https://www.dataquest.io/blog/matplotlib-tutorial/) which incorporates sentiment analysis, with 85 | suggested further exercises. 86 | 87 | - [Statistical Natural Language Processing](http://nbviewer.jupyter.org/url/norvig.com/ipython/How%20to%20Do%20Things%20with%20Words.ipynb) 88 | 89 | ### Kaggle 90 | 91 | - Do the Titanic competition. 92 | - Start with this for [Python](https://www.kaggle.com/c/titanic/details/getting-started-with-python) and [then 93 | again](https://www.kaggle.com/c/titanic/details/getting-started-with-python-ii), but using pandas. 
94 | - Try tutorial in [Machine Learning DataBase 95 | (MLDB)](https://docs.mldb.ai/ipy/notebooks/_demos/_latest/Predicting%20Titanic%20Survival.html) 96 | 97 | ### Visualisation 98 | 99 | - Python interactive visualization library that targets modern web browsers for presentation:[Bokeh](http://bokeh.pydata.org/en/latest/) 100 | - [Plotly](https://plot.ly/api/) 101 | - Getting started with [d3](https://github.com/d3/d3/wiki/Tutorials) 102 | 103 | ### git 104 | 105 | - Work through [this](http://gitreal.codeschool.com/?utm_source=github&utm_medium=codeschool_option&utm_campaign=trygit) course. 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /gothonweb/sessions/5524b4c1828de273b8ae4c70bbbe0e631e031e4a: -------------------------------------------------------------------------------- 1 | KGRwMQpTJ2lwJwpwMgpWMTI3LjAuMC4xCnAzCnNTJ3Jvb20nCnA0CmNjb3B5X3JlZwpfcmVjb25z 2 | dHJ1Y3RvcgpwNQooY21hcApSb29tCnA2CmNfX2J1aWx0aW5fXwpvYmplY3QKcDcKTnRScDgKKGRw 3 | OQpTJ3BhdGhzJwpwMTAKKGRwMTEKUycxJwpnNQooZzYKZzcKTnRScDEyCihkcDEzCmcxMAooZHAx 4 | NApzUyduYW1lJwpwMTUKUydkZWF0aCcKcDE2CnNTJ2Rlc2NyaXB0aW9uJwpwMTcKUydZb3UgZGll 5 | ZC4nCnAxOApzYnNTJzMnCmc1CihnNgpnNwpOdFJwMTkKKGRwMjAKZzEwCihkcDIxClMnMTMyJwpw 6 | MjIKZzUKKGc2Cmc3Ck50UnAyMwooZHAyNApnMTAKKGRwMjUKUyd0aHJvdyB0aGUgYm9tYicKcDI2 7 | CmcxMgpzUydzbG93bHkgcGxhY2UgdGhlIGJvbWInCnAyNwpnNQooZzYKZzcKTnRScDI4CihkcDI5 8 | CmcxMAooZHAzMApTJyonCmc1CihnNgpnNwpOdFJwMzEKKGRwMzIKZzEwCihkcDMzCnNnMTUKUydU 9 | aGUgRW5kJwpwMzQKc2cxNwpTJ1xuWW91IGp1bXAgaW50byBhIHJhbmRvbSBwb2QgYW5kIGhpdCB0 10 | aGUgZWplY3QgYnV0dG9uLlxuVGhlIHBvZCBlc2NhcGVzIG91dCBpbnRvIHRoZSB2b2lkIG9mIHNw 11 | YWNlLCB0aGVuXG5pbXBsb2RlcyBhcyB0aGUgaHVsbCBydXB0dXJlcywgY3J1c2hpbmcgeW91ciBi 12 | b2R5XG5pbnRvIGphbSBqZWxseS5cbicKcDM1CnNic1MnMicKZzUKKGc2Cmc3Ck50UnAzNgooZHAz 13 | NwpnMTAKKGRwMzgKc2cxNQpnMzQKc2cxNwpTJ1xuWW91IGp1bXAgaW50byBwb2QgMiBhbmQgaGl0 14 | IHRoZSBlamVjdCBidXR0b24uXG5UaGUgcG9kIGVhc2lseSBzbGlkZXMgb3V0IGludG8gc3BhY2Ug 15 | aGVhZGluZyB0b1xudGhlIHBsYW5ldCBiZWxvdy4gIEFzIGl0IGZsaWVzIHRvIHRoZSBwbGFuZXQs 16 | IHlvdSBsb29rXG5iYWNrIGFuZCBzZWUgeW91ciBzaGlwIGltcGxvZGUgdGhlbiBleHBsb2RlIGxp 17 | a2UgYVxuYnJpZ2h0IHN0YXIsIHRha2luZyBvdXQgdGhlIEdvdGhvbiBzaGlwIGF0IHRoZSBzYW1l 18 | XG50aW1lLiAgWW91IHdvbiFcbicKcDM5CnNic3NnMTUKUydFc2NhcGUgUG9kJwpwNDAKc2cxNwpT 19 | IlxuWW91IHBvaW50IHlvdXIgYmxhc3RlciBhdCB0aGUgYm9tYiB1bmRlciB5b3VyIGFybVxuYW5k 20 | IHRoZSBHb3Rob25zIHB1dCB0aGVpciBoYW5kcyB1cCBhbmQgc3RhcnQgdG8gc3dlYXQuXG5Zb3Ug 21 | aW5jaCBiYWNrd2FyZCB0byB0aGUgZG9vciwgb3BlbiBpdCwgYW5kIHRoZW4gY2FyZWZ1bGx5XG5w 22 | bGFjZSB0aGUgYm9tYiBvbiB0aGUgZmxvb3IsIHBvaW50aW5nIHlvdXIgYmxhc3RlciBhdCBpdC5c 23 | bllvdSB0aGVuIGp1bXAgYmFjayB0aHJvdWdoIHRoZSBkb29yLCBwdW5jaCB0aGUgY2xvc2UgYnV0 24 | dG9uXG5hbmQgYmxhc3QgdGhlIGxvY2sgc28gdGhlIEdvdGhvbnMgY2FuJ3QgZ2V0IG91dC5cbk5v 25 | dyB0aGF0IHRoZSBib21iIGlzIHBsYWNlZCB5b3UgcnVuIHRvIHRoZSBlc2NhcGUgcG9kIHRvXG5n 26 | ZXQgb2ZmIHRoaXMgdGluIGNhbi5cblxuWW91IHJ1c2ggdGhyb3VnaCB0aGUgc2hpcCBkZXNwZXJh 27 | dGVseSB0cnlpbmcgdG8gbWFrZSBpdCB0b1xudGhlIGVzY2FwZSBwb2QgYmVmb3JlIHRoZSB3aG9s 28 | ZSBzaGlwIGV4cGxvZGVzLiAgSXQgc2VlbXMgbGlrZVxuaGFyZGx5IGFueSBHb3Rob25zIGFyZSBv 29 | biB0aGUgc2hpcCwgc28geW91ciBydW4gaXMgY2xlYXIgb2ZcbmludGVyZmVyZW5jZS4gIFlvdSBn 30 | ZXQgdG8gdGhlIGNoYW1iZXIgd2l0aCB0aGUgZXNjYXBlIHBvZHMsIGFuZFxubm93IG5lZWQgdG8g 31 | cGljayBvbmUgdG8gdGFrZS4gIFNvbWUgb2YgdGhlbSBjb3VsZCBiZSBkYW1hZ2VkXG5idXQgeW91 32 | IGRvbid0IGhhdmUgdGltZSB0byBsb29rLiAgVGhlcmUncyA1IHBvZHMsIHdoaWNoIG9uZVxuZG8g 33 | eW91IHRha2U/XG4iCnA0MQpzYnNzZzE1ClMnVGhlIEJyaWRnZScKcDQyCnNnMTcKUyJcblRoZSBj 34 | 
b250YWluZXIgY2xpY2tzIG9wZW4gYW5kIHRoZSBzZWFsIGJyZWFrcywgbGV0dGluZyBnYXMgb3V0 35 | LlxuWW91IGdyYWIgdGhlIG5ldXRyb24gYm9tYiBhbmQgcnVuIGFzIGZhc3QgYXMgeW91IGNhbiB0 36 | byB0aGVcbmJyaWRnZSB3aGVyZSB5b3UgbXVzdCBwbGFjZSBpdCBpbiB0aGUgcmlnaHQgc3BvdC5c 37 | blxuWW91IGJ1cnN0IG9udG8gdGhlIEJyaWRnZSB3aXRoIHRoZSBuZXRyb24gZGVzdHJ1Y3QgYm9t 38 | YlxudW5kZXIgeW91ciBhcm0gYW5kIHN1cnByaXNlIDUgR290aG9ucyB3aG8gYXJlIHRyeWluZyB0 39 | b1xudGFrZSBjb250cm9sIG9mIHRoZSBzaGlwLiAgRWFjaCBvZiB0aGVtIGhhcyBhbiBldmVuIHVn 40 | bGllclxuY2xvd24gY29zdHVtZSB0aGFuIHRoZSBsYXN0LiAgVGhleSBoYXZlbid0IHB1bGxlZCB0 41 | aGVpclxud2VhcG9ucyBvdXQgeWV0LCBhcyB0aGV5IHNlZSB0aGUgYWN0aXZlIGJvbWIgdW5kZXIg 42 | eW91clxuYXJtIGFuZCBkb24ndCB3YW50IHRvIHNldCBpdCBvZmYuXG4iCnA0MwpzYnNTJyonCmcx 43 | OQpzc2cxNQpTJ0xhc2VyIFdlYXBvbiBBcm1vcnknCnA0NApzZzE3ClMiXG5MdWNreSBmb3IgeW91 44 | IHRoZXkgbWFkZSB5b3UgbGVhcm4gR290aG9uIGluc3VsdHMgaW4gdGhlIGFjYWRlbXkuXG5Zb3Ug 45 | dGVsbCB0aGUgb25lIEdvdGhvbiBqb2tlIHlvdSBrbm93OlxuTGJoZSB6Ymd1cmUgdmYgZmIgc25n 46 | LCBqdXJhIGZ1ciBmdmdmIG5lYmhhcSBndXIgdWJoZnIsIGZ1ciBmdmdmIG5lYmhhcSBndXIgXG51 47 | Ymhmci5cblRoZSBHb3Rob24gc3RvcHMsIHRyaWVzIG5vdCB0byBsYXVnaCwgdGhlbiBidXN0cyBv 48 | dXQgbGF1Z2hpbmcgYW5kIGNhbid0IFxubW92ZS5cbldoaWxlIGhlJ3MgbGF1Z2hpbmcgeW91IHJ1 49 | biB1cCBhbmQgc2hvb3QgaGltIHNxdWFyZSBpbiB0aGUgaGVhZFxucHV0dGluZyBoaW0gZG93biwg 50 | dGhlbiBqdW1wIHRocm91Z2ggdGhlIFdlYXBvbiBBcm1vcnkgZG9vci5cblxuWW91IGRvIGEgZGl2 51 | ZSByb2xsIGludG8gdGhlIFdlYXBvbiBBcm1vcnksIGNyb3VjaCBhbmQgc2NhbiB0aGUgcm9vbVxu 52 | Zm9yIG1vcmUgR290aG9ucyB0aGF0IG1pZ2h0IGJlIGhpZGluZy4gIEl0J3MgZGVhZCBxdWlldCwg 53 | dG9vIHF1aWV0LlxuWW91IHN0YW5kIHVwIGFuZCBydW4gdG8gdGhlIGZhciBzaWRlIG9mIHRoZSBy 54 | b29tIGFuZCBmaW5kIHRoZVxubmV1dHJvbiBib21iIGluIGl0cyBjb250YWluZXIuICBUaGVyZSdz 55 | IGEga2V5cGFkIGxvY2sgb24gdGhlIGJveFxuYW5kIHlvdSBuZWVkIHRoZSBjb2RlIHRvIGdldCB0 56 | aGUgYm9tYiBvdXQuICBJZiB5b3UgZ2V0IHRoZSBjb2RlXG53cm9uZyAxMCB0aW1lcyB0aGVuIHRo 57 | ZSBsb2NrIGNsb3NlcyBmb3JldmVyIGFuZCB5b3UgY2FuJ3RcbmdldCB0aGUgYm9tYi4gIFRoZSBj 58 | b2RlIGlzIDMgZGlnaXRzLiAoaGludDogdGhlIGZpcnN0IHR3byBkaWdpdHMgYXJlIDEzKVxuIgpw 59 | NDUKc2JzUycyJwpnMTkKc3NnMTUKUydDZW50cmFsIENvcnJpZG9yJwpwNDYKc2cxNwpTIlxuVGhl 60 | IEdvdGhvbnMgb2YgUGxhbmV0IFBlcmNhbCAjMjUgaGF2ZSBpbnZhZGVkIHlvdXIgc2hpcCBhbmQg 61 | ZGVzdHJveWVkXG55b3VyIGVudGlyZSBjcmV3LiAgWW91IGFyZSB0aGUgbGFzdCBzdXJ2aXZpbmcg 62 | bWVtYmVyIGFuZCB5b3VyIGxhc3Rcbm1pc3Npb24gaXMgdG8gZ2V0IHRoZSBuZXV0cm9uIGRlc3Ry 63 | dWN0IGJvbWIgZnJvbSB0aGUgV2VhcG9ucyBBcm1vcnksXG5wdXQgaXQgaW4gdGhlIGJyaWRnZSwg 64 | YW5kIGJsb3cgdGhlIHNoaXAgdXAgYWZ0ZXIgZ2V0dGluZyBpbnRvIGFuIFxuZXNjYXBlIHBvZC5c 65 | blxuWW91J3JlIHJ1bm5pbmcgZG93biB0aGUgY2VudHJhbCBjb3JyaWRvciB0byB0aGUgV2VhcG9u 66 | cyBBcm1vcnkgd2hlblxuYSBHb3Rob24ganVtcHMgb3V0LCByZWQgc2NhbHkgc2tpbiwgZGFyayBn 67 | cmlteSB0ZWV0aCwgYW5kIGV2aWwgY2xvd24gY29zdHVtZVxuZmxvd2luZyBhcm91bmQgaGlzIGhh 68 | dGUgZmlsbGVkIGJvZHkuICBIZSdzIGJsb2NraW5nIHRoZSBkb29yIHRvIHRoZVxuQXJtb3J5IGFu 69 | ZCBhYm91dCB0byBwdWxsIGEgd2VhcG9uIHRvIGJsYXN0IHlvdS4gWW91IGNhbiBlaXRoZXI6IDEp 70 | IGVhdCBoaW0sIFxuMikga2ljayBoaW0gaW4gdGhlIG51dHMsIG9yIDMpIGJsYXN0IGhpbSBmaXJz 71 | dC5cbiIKcDQ3CnNic1Mnc2Vzc2lvbl9pZCcKcDQ4ClMnNTUyNGI0YzE4MjhkZTI3M2I4YWU0Yzcw 72 | YmJiZTBlNjMxZTAzMWU0YScKcDQ5CnMu 73 | -------------------------------------------------------------------------------- /gothonweb/sessions/6adbe20488a3ffd0040abc4ac06991d1d79c97d0: -------------------------------------------------------------------------------- 1 | KGRwMQpTJ2lwJwpwMgpWMTI3LjAuMC4xCnAzCnNTJ3Jvb20nCnA0CmNjb3B5X3JlZwpfcmVjb25z 2 | dHJ1Y3RvcgpwNQooY21hcApSb29tCnA2CmNfX2J1aWx0aW5fXwpvYmplY3QKcDcKTnRScDgKKGRw 3 | 
OQpTJ3BhdGhzJwpwMTAKKGRwMTEKUycxJwpnNQooZzYKZzcKTnRScDEyCihkcDEzCmcxMAooZHAx 4 | NApzUyduYW1lJwpwMTUKUydkZWF0aCcKcDE2CnNTJ2Rlc2NyaXB0aW9uJwpwMTcKUydZb3UgZGll 5 | ZC4nCnAxOApzYnNTJzMnCmc1CihnNgpnNwpOdFJwMTkKKGRwMjAKZzEwCihkcDIxClMnMTMyJwpw 6 | MjIKZzUKKGc2Cmc3Ck50UnAyMwooZHAyNApnMTAKKGRwMjUKUyd0aHJvdyB0aGUgYm9tYicKcDI2 7 | CmcxMgpzUydzbG93bHkgcGxhY2UgdGhlIGJvbWInCnAyNwpnNQooZzYKZzcKTnRScDI4CihkcDI5 8 | CmcxMAooZHAzMApTJzInCmc1CihnNgpnNwpOdFJwMzEKKGRwMzIKZzEwCihkcDMzCnNnMTUKUydU 9 | aGUgRW5kJwpwMzQKc2cxNwpTJ1xuWW91IGp1bXAgaW50byBwb2QgMiBhbmQgaGl0IHRoZSBlamVj 10 | dCBidXR0b24uXG5UaGUgcG9kIGVhc2lseSBzbGlkZXMgb3V0IGludG8gc3BhY2UgaGVhZGluZyB0 11 | b1xudGhlIHBsYW5ldCBiZWxvdy4gIEFzIGl0IGZsaWVzIHRvIHRoZSBwbGFuZXQsIHlvdSBsb29r 12 | XG5iYWNrIGFuZCBzZWUgeW91ciBzaGlwIGltcGxvZGUgdGhlbiBleHBsb2RlIGxpa2UgYVxuYnJp 13 | Z2h0IHN0YXIsIHRha2luZyBvdXQgdGhlIEdvdGhvbiBzaGlwIGF0IHRoZSBzYW1lXG50aW1lLiAg 14 | WW91IHdvbiFcbicKcDM1CnNic1MnKicKZzUKKGc2Cmc3Ck50UnAzNgooZHAzNwpnMTAKKGRwMzgK 15 | c2cxNQpnMzQKc2cxNwpTJ1xuWW91IGp1bXAgaW50byBhIHJhbmRvbSBwb2QgYW5kIGhpdCB0aGUg 16 | ZWplY3QgYnV0dG9uLlxuVGhlIHBvZCBlc2NhcGVzIG91dCBpbnRvIHRoZSB2b2lkIG9mIHNwYWNl 17 | LCB0aGVuXG5pbXBsb2RlcyBhcyB0aGUgaHVsbCBydXB0dXJlcywgY3J1c2hpbmcgeW91ciBib2R5 18 | XG5pbnRvIGphbSBqZWxseS5cbicKcDM5CnNic3NnMTUKUydFc2NhcGUgUG9kJwpwNDAKc2cxNwpT 19 | IlxuWW91IHBvaW50IHlvdXIgYmxhc3RlciBhdCB0aGUgYm9tYiB1bmRlciB5b3VyIGFybVxuYW5k 20 | IHRoZSBHb3Rob25zIHB1dCB0aGVpciBoYW5kcyB1cCBhbmQgc3RhcnQgdG8gc3dlYXQuXG5Zb3Ug 21 | aW5jaCBiYWNrd2FyZCB0byB0aGUgZG9vciwgb3BlbiBpdCwgYW5kIHRoZW4gY2FyZWZ1bGx5XG5w 22 | bGFjZSB0aGUgYm9tYiBvbiB0aGUgZmxvb3IsIHBvaW50aW5nIHlvdXIgYmxhc3RlciBhdCBpdC5c 23 | bllvdSB0aGVuIGp1bXAgYmFjayB0aHJvdWdoIHRoZSBkb29yLCBwdW5jaCB0aGUgY2xvc2UgYnV0 24 | dG9uXG5hbmQgYmxhc3QgdGhlIGxvY2sgc28gdGhlIEdvdGhvbnMgY2FuJ3QgZ2V0IG91dC5cbk5v 25 | dyB0aGF0IHRoZSBib21iIGlzIHBsYWNlZCB5b3UgcnVuIHRvIHRoZSBlc2NhcGUgcG9kIHRvXG5n 26 | ZXQgb2ZmIHRoaXMgdGluIGNhbi5cblxuWW91IHJ1c2ggdGhyb3VnaCB0aGUgc2hpcCBkZXNwZXJh 27 | dGVseSB0cnlpbmcgdG8gbWFrZSBpdCB0b1xudGhlIGVzY2FwZSBwb2QgYmVmb3JlIHRoZSB3aG9s 28 | ZSBzaGlwIGV4cGxvZGVzLiAgSXQgc2VlbXMgbGlrZVxuaGFyZGx5IGFueSBHb3Rob25zIGFyZSBv 29 | biB0aGUgc2hpcCwgc28geW91ciBydW4gaXMgY2xlYXIgb2ZcbmludGVyZmVyZW5jZS4gIFlvdSBn 30 | ZXQgdG8gdGhlIGNoYW1iZXIgd2l0aCB0aGUgZXNjYXBlIHBvZHMsIGFuZFxubm93IG5lZWQgdG8g 31 | cGljayBvbmUgdG8gdGFrZS4gIFNvbWUgb2YgdGhlbSBjb3VsZCBiZSBkYW1hZ2VkXG5idXQgeW91 32 | IGRvbid0IGhhdmUgdGltZSB0byBsb29rLiAgVGhlcmUncyA1IHBvZHMsIHdoaWNoIG9uZVxuZG8g 33 | eW91IHRha2U/XG4iCnA0MQpzYnNzZzE1ClMnVGhlIEJyaWRnZScKcDQyCnNnMTcKUyJcblRoZSBj 34 | b250YWluZXIgY2xpY2tzIG9wZW4gYW5kIHRoZSBzZWFsIGJyZWFrcywgbGV0dGluZyBnYXMgb3V0 35 | LlxuWW91IGdyYWIgdGhlIG5ldXRyb24gYm9tYiBhbmQgcnVuIGFzIGZhc3QgYXMgeW91IGNhbiB0 36 | byB0aGVcbmJyaWRnZSB3aGVyZSB5b3UgbXVzdCBwbGFjZSBpdCBpbiB0aGUgcmlnaHQgc3BvdC5c 37 | blxuWW91IGJ1cnN0IG9udG8gdGhlIEJyaWRnZSB3aXRoIHRoZSBuZXRyb24gZGVzdHJ1Y3QgYm9t 38 | YlxudW5kZXIgeW91ciBhcm0gYW5kIHN1cnByaXNlIDUgR290aG9ucyB3aG8gYXJlIHRyeWluZyB0 39 | b1xudGFrZSBjb250cm9sIG9mIHRoZSBzaGlwLiAgRWFjaCBvZiB0aGVtIGhhcyBhbiBldmVuIHVn 40 | bGllclxuY2xvd24gY29zdHVtZSB0aGFuIHRoZSBsYXN0LiAgVGhleSBoYXZlbid0IHB1bGxlZCB0 41 | aGVpclxud2VhcG9ucyBvdXQgeWV0LCBhcyB0aGV5IHNlZSB0aGUgYWN0aXZlIGJvbWIgdW5kZXIg 42 | eW91clxuYXJtIGFuZCBkb24ndCB3YW50IHRvIHNldCBpdCBvZmYuXG4iCnA0MwpzYnNTJyonCmcx 43 | OQpzc2cxNQpTJ0xhc2VyIFdlYXBvbiBBcm1vcnknCnA0NApzZzE3ClMiXG5MdWNreSBmb3IgeW91 44 | IHRoZXkgbWFkZSB5b3UgbGVhcm4gR290aG9uIGluc3VsdHMgaW4gdGhlIGFjYWRlbXkuXG5Zb3Ug 45 | dGVsbCB0aGUgb25lIEdvdGhvbiBqb2tlIHlvdSBrbm93OlxuTGJoZSB6Ymd1cmUgdmYgZmIgc25n 46 | 
LCBqdXJhIGZ1ciBmdmdmIG5lYmhhcSBndXIgdWJoZnIsIGZ1ciBmdmdmIG5lYmhhcSBndXIgXG51 47 | Ymhmci5cblRoZSBHb3Rob24gc3RvcHMsIHRyaWVzIG5vdCB0byBsYXVnaCwgdGhlbiBidXN0cyBv 48 | dXQgbGF1Z2hpbmcgYW5kIGNhbid0IFxubW92ZS5cbldoaWxlIGhlJ3MgbGF1Z2hpbmcgeW91IHJ1 49 | biB1cCBhbmQgc2hvb3QgaGltIHNxdWFyZSBpbiB0aGUgaGVhZFxucHV0dGluZyBoaW0gZG93biwg 50 | dGhlbiBqdW1wIHRocm91Z2ggdGhlIFdlYXBvbiBBcm1vcnkgZG9vci5cblxuWW91IGRvIGEgZGl2 51 | ZSByb2xsIGludG8gdGhlIFdlYXBvbiBBcm1vcnksIGNyb3VjaCBhbmQgc2NhbiB0aGUgcm9vbVxu 52 | Zm9yIG1vcmUgR290aG9ucyB0aGF0IG1pZ2h0IGJlIGhpZGluZy4gIEl0J3MgZGVhZCBxdWlldCwg 53 | dG9vIHF1aWV0LlxuWW91IHN0YW5kIHVwIGFuZCBydW4gdG8gdGhlIGZhciBzaWRlIG9mIHRoZSBy 54 | b29tIGFuZCBmaW5kIHRoZVxubmV1dHJvbiBib21iIGluIGl0cyBjb250YWluZXIuICBUaGVyZSdz 55 | IGEga2V5cGFkIGxvY2sgb24gdGhlIGJveFxuYW5kIHlvdSBuZWVkIHRoZSBjb2RlIHRvIGdldCB0 56 | aGUgYm9tYiBvdXQuICBJZiB5b3UgZ2V0IHRoZSBjb2RlXG53cm9uZyAxMCB0aW1lcyB0aGVuIHRo 57 | ZSBsb2NrIGNsb3NlcyBmb3JldmVyIGFuZCB5b3UgY2FuJ3RcbmdldCB0aGUgYm9tYi4gIFRoZSBj 58 | b2RlIGlzIDMgZGlnaXRzLiAoaGludDogdGhlIGZpcnN0IHR3byBkaWdpdHMgYXJlIDEzKVxuIgpw 59 | NDUKc2JzUycyJwpnMTkKc3NnMTUKUydDZW50cmFsIENvcnJpZG9yJwpwNDYKc2cxNwpTIlxuVGhl 60 | IEdvdGhvbnMgb2YgUGxhbmV0IFBlcmNhbCAjMjUgaGF2ZSBpbnZhZGVkIHlvdXIgc2hpcCBhbmQg 61 | ZGVzdHJveWVkXG55b3VyIGVudGlyZSBjcmV3LiAgWW91IGFyZSB0aGUgbGFzdCBzdXJ2aXZpbmcg 62 | bWVtYmVyIGFuZCB5b3VyIGxhc3Rcbm1pc3Npb24gaXMgdG8gZ2V0IHRoZSBuZXV0cm9uIGRlc3Ry 63 | dWN0IGJvbWIgZnJvbSB0aGUgV2VhcG9ucyBBcm1vcnksXG5wdXQgaXQgaW4gdGhlIGJyaWRnZSwg 64 | YW5kIGJsb3cgdGhlIHNoaXAgdXAgYWZ0ZXIgZ2V0dGluZyBpbnRvIGFuIFxuZXNjYXBlIHBvZC5c 65 | blxuWW91J3JlIHJ1bm5pbmcgZG93biB0aGUgY2VudHJhbCBjb3JyaWRvciB0byB0aGUgV2VhcG9u 66 | cyBBcm1vcnkgd2hlblxuYSBHb3Rob24ganVtcHMgb3V0LCByZWQgc2NhbHkgc2tpbiwgZGFyayBn 67 | cmlteSB0ZWV0aCwgYW5kIGV2aWwgY2xvd24gY29zdHVtZVxuZmxvd2luZyBhcm91bmQgaGlzIGhh 68 | dGUgZmlsbGVkIGJvZHkuICBIZSdzIGJsb2NraW5nIHRoZSBkb29yIHRvIHRoZVxuQXJtb3J5IGFu 69 | ZCBhYm91dCB0byBwdWxsIGEgd2VhcG9uIHRvIGJsYXN0IHlvdS4gWW91IGNhbiBlaXRoZXI6IDEp 70 | IGVhdCBoaW0sIFxuMikga2ljayBoaW0gaW4gdGhlIG51dHMsIG9yIDMpIGJsYXN0IGhpbSBmaXJz 71 | dC5cbiIKcDQ3CnNic1Mnc2Vzc2lvbl9pZCcKcDQ4ClMnNmFkYmUyMDQ4OGEzZmZkMDA0MGFiYzRh 72 | YzA2OTkxZDFkNzljOTdkMCcKcDQ5CnMu 73 | -------------------------------------------------------------------------------- /tutorials/algorithms/notebooks/Lesson1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Lesson 1: Case Study" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "def naive(a,b):\n", 19 | " x = a\n", 20 | " y = b\n", 21 | " z = 0\n", 22 | " while x > 0:\n", 23 | " z = z + y\n", 24 | " x = x - 1\n", 25 | " return z" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 8, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "516\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "print(naive(43,12))" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 9, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "17.0\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "print(102/6)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 12, 69 | "metadata": { 70 | "collapsed": false 71 | }, 72 | "outputs": [ 
73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "24\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "import math\n", 83 | "\n", 84 | "print(math.factorial(4))" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "#### Russian Peasants Algorithm\n", 92 | "(Ancient Egyption Multiplication)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 7, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "def russian(a,b):\n", 104 | " x = a\n", 105 | " y = b\n", 106 | " z = 0\n", 107 | " while x > 0:\n", 108 | " if x % 2 == 1: z = z + y # if x is odd add y to z\n", 109 | " y = y << 1\n", 110 | " x = x >> 1\n", 111 | " return z\n", 112 | " " 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 17, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "140\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "print(russian(20,7))" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "binary numbers:\n", 139 | "1(2$^0$), 2(2$^1$), 4(2$^2$), 8(2$^3$), 16(2$^4$), 32(2$^5$), 64(2$^6$), 128(2$^7$), 256(2$^8$) \n", 140 | "\n" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 16, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "256\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "print(2**8)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 19, 165 | "metadata": { 166 | "collapsed": false 167 | }, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "24.5\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "print(int(49)/int(2))" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 21, 184 | "metadata": { 185 | "collapsed": false 186 | }, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "1\n", 193 | "None\n", 194 | "9\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "import math\n", 200 | "\n", 201 | "def time(n):\n", 202 | " \"\"\" Return the number of steps \n", 203 | " necessary to calculate\n", 204 | " `print countdown(n)`\"\"\"\n", 205 | " steps = 0\n", 206 | " \n", 207 | " if n >= 10: steps = math.ceil(n/5.0)*2 + 3\n", 208 | " if (n < 10) & (n > 5) : steps = 7.0\n", 209 | " if (n <= 5) : steps = 5.0\n", 210 | " \n", 211 | " # answer.\n", 212 | " #steps = 3 + 2 * math.ceil(n/5.0)\n", 213 | " \n", 214 | " return steps\n", 215 | "\n", 216 | "def countdown(x):\n", 217 | " y = 0\n", 218 | " while x > 0:\n", 219 | " x = x - 5\n", 220 | " y = y + 1\n", 221 | " print (y)\n", 222 | "\n", 223 | "print (countdown(5))\n", 224 | "print (time(12))" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": { 231 | "collapsed": true 232 | }, 233 | "outputs": [], 234 | "source": [] 235 | } 236 | ], 237 | "metadata": { 238 | "kernelspec": { 239 | "display_name": "Python [Root]", 240 | "language": "python", 241 | "name": "Python [Root]" 242 | }, 243 | "language_info": { 244 | "codemirror_mode": { 245 | "name": "ipython", 246 | "version": 3 247 | }, 248 | "file_extension": ".py", 249 | "mimetype": "text/x-python", 250 | "name": "python", 251 | "nbconvert_exporter": "python", 
252 | "pygments_lexer": "ipython3", 253 | "version": "3.5.2" 254 | } 255 | }, 256 | "nbformat": 4, 257 | "nbformat_minor": 0 258 | } 259 | -------------------------------------------------------------------------------- /tutorials/algorithms/notebooks/.ipynb_checkpoints/Lesson1-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Lesson 1: Case Study" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "def naive(a,b):\n", 19 | " x = a\n", 20 | " y = b\n", 21 | " z = 0\n", 22 | " while x > 0:\n", 23 | " z = z + y\n", 24 | " x = x - 1\n", 25 | " return z" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 8, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "516\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "print(naive(43,12))" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 9, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "17.0\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "print(102/6)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 12, 69 | "metadata": { 70 | "collapsed": false 71 | }, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "24\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "import math\n", 83 | "\n", 84 | "print(math.factorial(4))" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "#### Russian Peasants Algorithm\n", 92 | "(Ancient Egyption Multiplication)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 7, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "def russian(a,b):\n", 104 | " x = a\n", 105 | " y = b\n", 106 | " z = 0\n", 107 | " while x > 0:\n", 108 | " if x % 2 == 1: z = z + y # if x is odd add y to z\n", 109 | " y = y << 1\n", 110 | " x = x >> 1\n", 111 | " return z\n", 112 | " " 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 17, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "140\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "print(russian(20,7))" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "binary numbers:\n", 139 | "1(2$^0$), 2(2$^1$), 4(2$^2$), 8(2$^3$), 16(2$^4$), 32(2$^5$), 64(2$^6$), 128(2$^7$), 256(2$^8$) \n", 140 | "\n" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 16, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "256\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "print(2**8)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 19, 165 | "metadata": { 166 | "collapsed": false 167 | }, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "24.5\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "print(int(49)/int(2))" 179 | ] 180 | }, 181 | { 182 | 
"cell_type": "code", 183 | "execution_count": 21, 184 | "metadata": { 185 | "collapsed": false 186 | }, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "1\n", 193 | "None\n", 194 | "9\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "import math\n", 200 | "\n", 201 | "def time(n):\n", 202 | " \"\"\" Return the number of steps \n", 203 | " necessary to calculate\n", 204 | " `print countdown(n)`\"\"\"\n", 205 | " steps = 0\n", 206 | " \n", 207 | " if n >= 10: steps = math.ceil(n/5.0)*2 + 3\n", 208 | " if (n < 10) & (n > 5) : steps = 7.0\n", 209 | " if (n <= 5) : steps = 5.0\n", 210 | " \n", 211 | " # answer.\n", 212 | " #steps = 3 + 2 * math.ceil(n/5.0)\n", 213 | " \n", 214 | " return steps\n", 215 | "\n", 216 | "def countdown(x):\n", 217 | " y = 0\n", 218 | " while x > 0:\n", 219 | " x = x - 5\n", 220 | " y = y + 1\n", 221 | " print (y)\n", 222 | "\n", 223 | "print (countdown(5))\n", 224 | "print (time(12))" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": { 231 | "collapsed": true 232 | }, 233 | "outputs": [], 234 | "source": [] 235 | } 236 | ], 237 | "metadata": { 238 | "kernelspec": { 239 | "display_name": "Python [Root]", 240 | "language": "python", 241 | "name": "Python [Root]" 242 | }, 243 | "language_info": { 244 | "codemirror_mode": { 245 | "name": "ipython", 246 | "version": 3 247 | }, 248 | "file_extension": ".py", 249 | "mimetype": "text/x-python", 250 | "name": "python", 251 | "nbconvert_exporter": "python", 252 | "pygments_lexer": "ipython3", 253 | "version": "3.5.2" 254 | } 255 | }, 256 | "nbformat": 4, 257 | "nbformat_minor": 0 258 | } 259 | -------------------------------------------------------------------------------- /tutorials/026-Linear_Regression_Analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### This notebook will use modelling software to generate the model coefficients a0, a1 and a2 to investigate FICO Score and Loan Amount as predictors of Interest Rate" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 44, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "Populating the interactive namespace from numpy and matplotlib\n", 22 | "[[735]\n", 23 | " [715]]\n", 24 | "[[20000]\n", 25 | " [19200]]\n", 26 | "[[ 1.00000000e+00 7.35000000e+02 2.00000000e+04]\n", 27 | " [ 1.00000000e+00 7.15000000e+02 1.92000000e+04]\n", 28 | " [ 1.00000000e+00 6.95000000e+02 1.00000000e+04]\n", 29 | " ..., \n", 30 | " [ 1.00000000e+00 6.80000000e+02 1.00000000e+04]\n", 31 | " [ 1.00000000e+00 6.75000000e+02 6.00000000e+03]\n", 32 | " [ 1.00000000e+00 6.70000000e+02 9.00000000e+03]] [[ 735 20000]\n", 33 | " [ 715 19200]\n", 34 | " [ 695 10000]\n", 35 | " ..., \n", 36 | " [ 680 10000]\n", 37 | " [ 675 6000]\n", 38 | " [ 670 9000]]\n", 39 | "Coefficients: [ 0.7232804 -0.00087589]\n", 40 | "Intercepts: 1.97716000896e-06\n", 41 | "P-Values: [ 0.00000000e+00 0.00000000e+00 3.00521465e-98]\n", 42 | "R-Squared: 0.644760522744\n" 43 | ] 44 | }, 45 | { 46 | "name": "stderr", 47 | "output_type": "stream", 48 | "text": [ 49 | "WARNING: pylab import has clobbered these variables: ['f']\n", 50 | "`%matplotlib` prevents importing * from pylab and numpy\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "%pylab inline\n", 56 | "import pylab 
as pl\n", 57 | "import numpy as np\n", 58 | "import pandas as pd\n", 59 | "import statsmodels.api as sm\n", 60 | "\n", 61 | "# import the cleaned up dataset\n", 62 | "df = pd.read_csv('/home/sophie/projects/LendingClub/data/clean_LD.csv')\n", 63 | "\n", 64 | "intrate = df['Interest.Rate']\n", 65 | "loanamt = df['Amount.Requested']\n", 66 | "fico = df['FICO.Score']\n", 67 | "\n", 68 | "# reshape the data from a pandas Series to columns\n", 69 | "# the dependent variable\n", 70 | "# This creates a 2D array, with T turning it from (1,1867) to (1867,1)\n", 71 | "y = np.matrix(intrate).T # I think T does the same as transpose()\n", 72 | "\n", 73 | "# the independent variables shaped as columns\n", 74 | "x1 = np.matrix(fico).transpose()\n", 75 | "x2 = np.matrix(loanamt).transpose()\n", 76 | "\n", 77 | "# put the two columns together to create an input matrix\n", 78 | "# if we had n independent variables we would have n columns here\n", 79 | "x = np.column_stack([x1,x2]) # column_stack takes a sequence fo 1-D arrays and stacks them as columns.\n", 80 | "\n", 81 | "print x[0:2,0] # to access x1\n", 82 | "print x[0:2,1] # to access x2\n", 83 | "\n", 84 | "# create a linear model and fit it to the data\n", 85 | "X = sm.add_constant(x) # adds a column of 1s (the first column) to the x (2D stacked data)\n", 86 | "model = sm.OLS(y,X) # creates an ordinary least squares model. Y = response variable, X, should include an intercept.\n", 87 | "\n", 88 | "# f is a A RegressionResults class instance. The list of attributes are found \n", 89 | "# here http://statsmodels.sourceforge.net/devel/generated/statsmodels.regression.linear_model.RegressionResults.html\n", 90 | "f = model.fit() # fit is one of the methods which can be applied to an OLS object\n", 91 | "\n", 92 | "print 'Coefficients: ', f.params[0:2] # linear coefficients that minimize the least squares criterion. a1 and a2\n", 93 | "print 'Intercepts: ', f.params[2] # a0\n", 94 | "print 'P-Values: ', f.pvalues\n", 95 | "print 'R-Squared: ', f.rsquared\n", 96 | "\n" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": { 102 | "collapsed": true 103 | }, 104 | "source": [ 105 | "Coefficients: contains $a_1$ and $a_2$\n", 106 | "Intercept: is at $a_0$\n", 107 | "\n", 108 | "Next, we need to work out how reliable the numbers are. \n", 109 | "P-values are probabilities we can use to do this and to be confident we want it to be close to 0.\n", 110 | "Convention is p < 0.05. If it is more, we have less confidence using that dimension in modelling and predicting.\n", 111 | "\n", 112 | "$R^2$ : How much variance in the data is captured by the model. \n", 113 | "$R$ : coefficient of correlation between independent variables and dependent variable. How much Y depends on the seperate X's. 
Lies between -1 and 1, so $R^2$ lies between 0 and 1.\n", 114 | "We want a high $R^2$.\n", 115 | "\n", 116 | "We have created a linear multivariate regression model for Interest Rate, which is well described by the parameters above.\n" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": { 123 | "collapsed": true 124 | }, 125 | "outputs": [], 126 | "source": [] 127 | } 128 | ], 129 | "metadata": { 130 | "kernelspec": { 131 | "display_name": "Python 2", 132 | "language": "python", 133 | "name": "python2" 134 | }, 135 | "language_info": { 136 | "codemirror_mode": { 137 | "name": "ipython", 138 | "version": 2 139 | }, 140 | "file_extension": ".py", 141 | "mimetype": "text/x-python", 142 | "name": "python", 143 | "nbconvert_exporter": "python", 144 | "pygments_lexer": "ipython2", 145 | "version": "2.7.11" 146 | } 147 | }, 148 | "nbformat": 4, 149 | "nbformat_minor": 0 150 | } 151 | -------------------------------------------------------------------------------- /DSFromScratch/Chap6/064-Chap6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Data Science from Scratch\n", 8 | "\n", 9 | "#### Chapter 6: Probability" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "#### Dependence and Independence\n", 17 | "\n", 18 | "Two events *E* and *F* are independent if the probability that they both happen is the product of the probabilities that each one happens:\n", 19 | "\n", 20 | "`P(E,F) = P(E)P(F)`" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "#### Conditional Probability\n", 28 | "\n", 29 | "We define the probability of `E` conditional on `F` as:\n", 30 | "`P(E|F) = P(E,F)/P(F)`\n", 31 | "\n", 32 | "This is the probability that E happens given that we know that `F` happens. \n", 33 | "Often rewritten as:\n", 34 | "\n", 35 | "`P(E,F) = P(E|F)P(F)`\n", 36 | "\n", 37 | "When `E` and `F` are independent this reduces to `P(E|F) = P(E)`: knowing that `F` occurred gives us no additional information about whether `E` occurred." 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "outputs": [], 56 | "source": [] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "#### Bayes's Theorem\n", 63 | "\n", 64 | "Imagine a certain disease that affects 1 in every 10,000 people. And imagine\n", 65 | "that there is a test for this disease that gives the correct result (“diseased” if you have\n", 66 | "the disease, “nondiseased” if you don’t) 99% of the time.\n", 67 | "\n", 68 | "A more intuitive way to see this is to imagine a population of 1 million people. You’d\n", 69 | "expect 100 of them to have the disease, and 99 of those 100 to test positive. On the\n", 70 | "other hand, you’d expect 999,900 of them not to have the disease, and 9,999 of those\n", 71 | "to test positive. 
Which means that" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": { 87 | "collapsed": true 88 | }, 89 | "outputs": [], 90 | "source": [] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "#### Random Variables" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": { 103 | "collapsed": true 104 | }, 105 | "outputs": [], 106 | "source": [] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "#### Continuous Distributions" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "collapsed": true 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "\n" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "collapsed": true 131 | }, 132 | "outputs": [], 133 | "source": [] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "#### The Normal Distribution" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": { 146 | "collapsed": true 147 | }, 148 | "outputs": [], 149 | "source": [] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": { 155 | "collapsed": true 156 | }, 157 | "outputs": [], 158 | "source": [] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "#### The Central Limit Theorem" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "collapsed": true 172 | }, 173 | "outputs": [], 174 | "source": [] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": { 180 | "collapsed": true 181 | }, 182 | "outputs": [], 183 | "source": [] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": { 189 | "collapsed": true 190 | }, 191 | "outputs": [], 192 | "source": [] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": { 207 | "collapsed": true 208 | }, 209 | "outputs": [], 210 | "source": [] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "collapsed": true 217 | }, 218 | "outputs": [], 219 | "source": [] 220 | } 221 | ], 222 | "metadata": { 223 | "kernelspec": { 224 | "display_name": "Python [Root]", 225 | "language": "python", 226 | "name": "Python [Root]" 227 | }, 228 | "language_info": { 229 | "codemirror_mode": { 230 | "name": "ipython", 231 | "version": 3 232 | }, 233 | "file_extension": ".py", 234 | "mimetype": "text/x-python", 235 | "name": "python", 236 | "nbconvert_exporter": "python", 237 | "pygments_lexer": "ipython3", 238 | "version": "3.5.2" 239 | } 240 | }, 241 | "nbformat": 4, 242 | "nbformat_minor": 0 243 | } 244 | -------------------------------------------------------------------------------- /windspeed/scripts/038-group_tseries.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import datetime as datetime 4 | 
import matplotlib.pyplot as plt 5 | 6 | # Creating a panel of timeseries for each group of stations. 7 | 8 | # Panel will have a timeseries of 00,06,12,18 ws if that hour has at least 14 9 | # obs per month. 10 | 11 | # An average over the group will be an extra plot in the panel. 12 | 13 | NAl=['60525Biskra','60549Mecheria','60550Elbayadh', 14 | '60555Touggourt','60559ElOued','60566Ghardaia','60580Ouargla', 15 | '60581HassiMessaoud'] 16 | 17 | CSar=['60607Timimoun','60611InAmenas','60620Adrar','60630InSalah', 18 | '62103Ghadames','62124Sebha'] 19 | 20 | WSa=['61223Tombouctou','61226Gao','61230NioroDuSahel','61498Kiffa', 21 | '61499AiounElAtrouss','61492Kaedi','61497Nema','61450Tidjika'] 22 | 23 | CSal=['61024Agadez','61045Goure','61052Niamey','64753Faya', 24 | '61017Bilma'] 25 | 26 | Egy=['62387Minya','62393Asyut','62405Luxor','62414Asswan', 27 | '62420Baharia','62423Farafra','62435Kharga'] 28 | 29 | Sud=['62600WadiHalfa','62640AbuHamed','62650Dongola','62660Karima', 30 | '62680Atbara'] 31 | 32 | 33 | stations=[NAl,CSar,WSa,CSal,Egy,Sud] 34 | #stations = [CSar, WSa] 35 | 36 | group_names={'NAlgeria':NAl,'CSahara':CSar,'WSahel':WSa,'CSahel':CSal, 37 | 'Egypt':Egy,'Sudan':Sud} 38 | 39 | group_strings=['NAlgeria','CSahara','WSahel','CSahel', 'Egypt','Sudan'] 40 | #group_strings=['CSahara','WSahel'] 41 | 42 | 43 | # Could these two functions be turned into lambda functions? 44 | # Would that be preferable or are these fine? 45 | 46 | def meanf(x): 47 | if x.count() > 10: 48 | return x.mean() 49 | 50 | def sdf(x): 51 | if x.count() > 10: 52 | return x.std() 53 | 54 | def read_file(fname): 55 | '''Given a station name, read that station's data file and return a 56 | DataFrame of windspeed observations, aggregated by year and month, with a 57 | datetime index.''' 58 | 59 | 60 | column_names=["year","month","day","hour","ws"] 61 | dtype={"year":int,"month":int,"day":int,"hour":int,"ws":float} 62 | 63 | datafile='/home/sophie/projects/windspeed/data/%s_allwinds.txt' %fname 64 | 65 | # specify the columns you want to group together. Can't include hour at 66 | # this point as it is not in the right format. 67 | date_spec = {'date_time': [0,1,2]} 68 | 69 | # when you use keep_date_col it keeps them as objects, not as the dtype you 70 | # read them in as. 71 | wind = pd.read_csv(datafile, sep=" ", names=column_names, 72 | parse_dates=date_spec, keep_date_col=True, index_col=False ) 73 | 74 | # Dealing with hour - going from 600, 1200, etc. to 6, 12, 18 75 | wind["hour"]=(wind["hour"]/100).astype(int) 76 | 77 | # combining year, month, day that were parsed together into date_time with 78 | # hour, which is now in the correct format. 79 | wind['date_time'] = pd.to_datetime(wind.date_time) + \ 80 | wind.hour.astype('timedelta64[h]') 81 | 82 | # make datetime the index before making subsections. 83 | wind.index = wind['date_time'] 84 | 85 | # Adds extra columns where the value is kept if it meets the isin() 86 | # criteria, NaN if it doesn't. 87 | wind['ws_0']= wind['ws'][wind['hour'].isin([0])] 88 | wind['ws_06']= wind['ws'][wind['hour'].isin([6])] 89 | wind['ws_12']= wind['ws'][wind['hour'].isin([12])] 90 | wind['ws_18']= wind['ws'][wind['hour'].isin([18])] 91 | 92 | group = wind.groupby(['year', 'month']) 93 | 94 | wind_group = group['ws','ws_0','ws_06','ws_12','ws_18'].agg([meanf,sdf]) 95 | 96 | return wind_group 97 | 98 | 99 | def plot_tseries(group): 100 | '''set up n+1 subplots where n is number of stations in the group. 
Fill in 101 | each plot with timeseries from each station and then a mean of all the 102 | stations. Output to a png file.''' 103 | 104 | 105 | fig = plt.figure(figsize=(10,10)) 106 | 107 | for i in range(len(group)): 108 | 109 | #just for testing, see what group we are on 110 | print(group_strings[j]) 111 | print(type(group)) 112 | 113 | #read the file in for plotting 114 | wind_group = read_file(group[i]) 115 | 116 | 117 | 118 | # Dump the month part of the index to make the xaxis less crowded 119 | wind_group.index = wind_group.index.droplevel(['month']) 120 | 121 | # fig.add_subplot(nrows, ncols, num) 122 | 123 | ax = fig.add_subplot(int((len(group)+1)/2), 2, i+1) 124 | 125 | plt.title(s=group[i], fontsize=15) 126 | 127 | # May not need the if statements if I can solve the x problem below. 128 | if len(wind_group.ws_0['meanf']) != 0: 129 | wind_group.ws_0['meanf']['1990':'1994'].plot(figsize=(8,8), c = 'm') 130 | 131 | if len(wind_group.ws_06['meanf']) != 0: 132 | wind_group.ws_06['meanf']['1990':'1994'].plot(figsize=(8,8), c ='r') 133 | 134 | if len(wind_group.ws_12['meanf']) != 0: 135 | wind_group.ws_12['meanf']['1990':'1994'].plot(figsize=(8,8), c ='b') 136 | 137 | if len(wind_group.ws_18['meanf']) != 0: 138 | wind_group.ws_18['meanf']['1990':'1994'].plot(figsize=(8,8), c='c') 139 | 140 | ax.legend(loc=4,bbox_to_anchor=(0.95, 1.05),labels 141 | = ['00','06','12','18'],prop={'size':6}) 142 | 143 | plt.tight_layout() # very nice! stops the titles overlapping 144 | fig.suptitle(group_strings[j]) 145 | fig.savefig('/home/sophie/projects/windspeed/' 146 | 'output/%s.png'%(group_strings[j]),dpi=125) 147 | 148 | if __name__ == '__main__': 149 | 150 | # x is coming as a list and we need it as just an object name. 151 | for j,x in enumerate(stations): plot_tseries(x) 152 | #plot_tseries(CSar) 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | -------------------------------------------------------------------------------- /windspeed/scripts/039-group_tseries.py: -------------------------------------------------------------------------------- 
"outputs": [], 210 | "source": [] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "collapsed": true 217 | }, 218 | "outputs": [], 219 | "source": [] 220 | } 221 | ], 222 | "metadata": { 223 | "kernelspec": { 224 | "display_name": "Python [Root]", 225 | "language": "python", 226 | "name": "Python [Root]" 227 | }, 228 | "language_info": { 229 | "codemirror_mode": { 230 | "name": "ipython", 231 | "version": 3 232 | }, 233 | "file_extension": ".py", 234 | "mimetype": "text/x-python", 235 | "name": "python", 236 | "nbconvert_exporter": "python", 237 | "pygments_lexer": "ipython3", 238 | "version": "3.5.2" 239 | } 240 | }, 241 | "nbformat": 4, 242 | "nbformat_minor": 0 243 | } 244 | -------------------------------------------------------------------------------- /windspeed/scripts/039-group_tseries.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import datetime as datetime 4 | import matplotlib.pyplot as plt 5 | 6 | # Creating a panel of timeseries for each group of stations. 7 | 8 | # Panel will have a timeseries of 00,06,12,18 ws if that hour has at least 14 9 | # obs per month. 10 | 11 | # An average over the group will be an extra plot in the panel. 12 | 13 | NAl=['60525Biskra','60549Mecheria','60550Elbayadh', 14 | '60555Touggourt','60559ElOued','60566Ghardaia', 15 | '60580Ouargla','60581HassiMessaoud'] 16 | 17 | 18 | CSar=['60607Timimoun','60611InAmenas','60620Adrar','60630InSalah', 19 | '62103Ghadames','62124Sebha'] 20 | 21 | WSa=['61223Tombouctou','61226Gao','61230NioroDuSahel','61498Kiffa', 22 | '61499AiounElAtrouss','61492Kaedi','61497Nema','61450Tidjika'] 23 | 24 | CSal=['61024Agadez','61045Goure','61052Niamey','64753Faya', 25 | '61017Bilma'] 26 | 27 | Egy=['62387Minya','62393Asyut','62405Luxor','62414Asswan', 28 | '62420Baharia','62423Farafra','62435Kharga'] 29 | 30 | Sud=['62600WadiHalfa','62640AbuHamed','62650Dongola','62660Karima', 31 | '62680Atbara'] 32 | 33 | 34 | stations=[NAl,CSar,WSa,CSal,Egy,Sud] 35 | #stations = [CSar, WSa] 36 | 37 | group_names={'NAlgeria':NAl,'CSahara':CSar,'WSahel':WSa,'CSahel':CSal, 38 | 'Egypt':Egy,'Sudan':Sud} 39 | 40 | group_strings=['NAlgeria','CSahara','WSahel','CSahel', 'Egypt','Sudan'] 41 | #group_strings=['CSahara','WSahel'] 42 | 43 | 44 | # Could these two functions be turned into lambda functions? 45 | # Would that be preferable or are these fine? 46 | 47 | def meanf(x): 48 | if x.count() > 10: 49 | return x.mean() 50 | 51 | def sdf(x): 52 | if x.count() > 10: 53 | return x.std() 54 | 55 | def read_file(fname): 56 | '''put the station name into read_file and read_file will return a 57 | dataFrame called wind which has the following columns a dataframe with a 58 | datetime index''' 59 | 60 | 61 | column_names=["year","month","day","hour","ws"] 62 | dtype={"year":int,"month":int,"day":int,"hour":int,"ws":float} 63 | 64 | datafile='/home/sophie/projects/windspeed/data/%s_allwinds.txt' %fname 65 | 66 | # specify the columns you want to group together. Can't include hour at 67 | # this point as it is not in the right format. 68 | date_spec = {'date_time': [0,1,2]} 69 | 70 | # when you use keep_dat_col it keeps them as objects, not as the dtype you 71 | # read them in as. 
72 | wind = pd.read_csv(datafile, sep=" ", names=column_names, 73 | parse_dates=date_spec, keep_date_col=True, index_col=False ) 74 | 75 | # Dealing with hour - going from 600, 1200 etc to 6,12, 18 76 | wind["hour"]=(wind["hour"]/100).astype(int) 77 | 78 | # combining year, month, day that were parsed together into date_time with 79 | # hour, which is now in the correct format. 80 | wind['date_time'] = pd.to_datetime(wind.date_time) + \ 81 | wind.hour.astype('timedelta64[h]') 82 | 83 | # make datetime the index before making subsections. 84 | wind.index = wind['date_time'] 85 | 86 | # Adds extra rows where value is kept if it meets isin() criteria. Nan if 87 | # it doesn't. 88 | wind['ws_0']= wind['ws'][wind['hour'].isin([0])] 89 | wind['ws_06']= wind['ws'][wind['hour'].isin([6])] 90 | wind['ws_12']= wind['ws'][wind['hour'].isin([12])] 91 | wind['ws_18']= wind['ws'][wind['hour'].isin([18])] 92 | 93 | group = wind.groupby(['year', 'month']) 94 | 95 | wind_group = group['ws','ws_0','ws_06','ws_12','ws_18'].agg([meanf,sdf]) 96 | 97 | return wind_group 98 | 99 | 100 | def plot_tseries(group): 101 | '''set up n+1 subplots where n is number of stations in the group. Fill in 102 | each plot with timeseries from each station and then a mean of all the 103 | stations. Output to file eps.''' 104 | 105 | 106 | fig = plt.figure(figsize=(10,10)) 107 | 108 | for i in range(len(group)): 109 | 110 | #just for testing, see what group we are on 111 | print(group_strings[0]) 112 | print(type(group)) 113 | 114 | #read the file in for plotting 115 | wind_group = read_file(group[i]) 116 | 117 | # check that there is data for the time period of interest 118 | assert len(wind_group['1990':'1994']) != 0, ('No data for %s in this' 119 | 'time period so no plot!'% group[i]) 120 | 121 | if len(wind_group['1990':'1994']) != 0: 122 | # Dump the month part of the index to make the xaxis less crowded 123 | wind_group.index = wind_group.index.droplevel(['month']) 124 | 125 | # fig.add_subplot(nrows, ncols, num) 126 | 127 | ax = fig.add_subplot(int((len(group)+1)/2), 2, i+1) 128 | 129 | plt.title(s=group[i], fontsize=15) 130 | 131 | # May not need the if statements if I can solve the x problem below. 132 | # No, I do, so if there are no data in that time period it will be 133 | # caught - as in Ouargla! 134 | #print(len(wind_group.ws_0['meanf'])) 135 | 136 | wind_group.ws_0['meanf']['1990':'1994'].plot(figsize=(8,8),c='m') 137 | wind_group.ws_06['meanf']['1990':'1994'].plot(figsize=(8,8), c='r') 138 | wind_group.ws_12['meanf']['1990':'1994'].plot(figsize=(8,8),c='b') 139 | wind_group.ws_18['meanf']['1990':'1994'].plot(figsize=(8,8), c='c') 140 | 141 | ax.legend(loc=4,bbox_to_anchor=(0.95, 1.05),labels 142 | = ['00','06','12','18'],prop={'size':6}) 143 | 144 | plt.tight_layout() # very nice! stops the titles overlapping 145 | fig.suptitle(group_strings[0]) 146 | fig.savefig('/home/sophie/projects/windspeed/' 147 | 'output/%s.png'%(group_strings[0]),dpi=125) 148 | 149 | if __name__ == '__main__': 150 | 151 | # x is coming as a list and we need it as just an object name. 
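    # A sketch of one way round this (my suggestion, not what the script
    # currently does): iterate over group_names instead, so each call gets
    # the station list and its label together rather than relying on a
    # module-level index into group_strings:
    # for name, grp in group_names.items():
    #     plot_tseries(grp)   # with `name` passed through for the title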
152 | #for j,x in enumerate(stations): plot_tseries(x) 153 | plot_tseries(NAl) 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /tutorials/ThinkBayes/049-Credible_intervals_cdfs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Credible Intervals\n", 8 | "\n", 9 | "Once you have computed a posterior distribution, it is often useful to summarize the results with a single point estimate or an interval. For point estimates it is common to use the mean, median, or the value with maximum likelihood.\n", 10 | "\n", 11 | "A **credible interval** is a range of values with a given probability (here 90%) that the unknown value falls between its endpoints. \n", 12 | "\n", 13 | "To compute a **credible interval** add up the probabilities in the posterior distribution and record the values that correspond to the 5th and 95th percentiles.\n", 14 | "\n", 15 | "We can use ThinkBayes to do this, starting with a simple Percentile function:" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 6, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "def Percentile(pmf, percentage):\n", 27 | " p = percentage / 100.0\n", 28 | " total = 0\n", 29 | " for val, prob in pmf.Items():\n", 30 | " total += prob\n", 31 | " if total >= p:\n", 32 | " return val" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "Now import the locomotive suite of hypotheses so we can apply the Percentile function to it. " 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 11, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "import os\n", 51 | "import sys\n", 52 | "module_path = os.path.abspath(os.path.join('..'))\n", 53 | "if module_path not in sys.path:\n", 54 | " sys.path.append(module_path)\n", 55 | " \n", 56 | "from thinkbayes import Pmf, Suite" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 12, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "# Taken from the first \"Estimation\" tutorial\n", 68 | "class Dice(Suite): \n", 69 | " def Likelihood(self, data, hypo):\n", 70 | " if hypo < data:\n", 71 | " return 0 \n", 72 | " else:\n", 73 | " return 1.0/hypo\n", 74 | "\n", 75 | "# The likelihood function is the same in the Train as the Dice\n", 76 | "class Train(Dice):\n", 77 | " def __init__(self, hypos, alpha = 1.0): # Adding alpha to the arguments\n", 78 | " Pmf.__init__(self)\n", 79 | " for hypo in hypos:\n", 80 | " self.Set(hypo, hypo**(-alpha)) # adding in the power law here to alter the prior\n", 81 | " self.Normalize()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 13, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "hypos = range(1, 1001) # PRIOR p(H)\n", 93 | "suite = Train(hypos)\n", 94 | "\n", 95 | "for data in [60, 30, 90]:\n", 96 | " suite.Update(data)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "Now we can use the Percentile function we defined above."
104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 10, 109 | "metadata": { 110 | "collapsed": false 111 | }, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "(91, 242)\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "# To use Percentile\n", 123 | "interval = Percentile(suite, 5), Percentile(suite, 95)\n", 124 | "print (interval)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "For the locomotive problem, using a power law prior and three observed trains, the 90% credible interval is (91, 242) - (5th, 95th percentiles). This very wide range correctly suggests the massive uncertainty in how many trains there are all together. " 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "### Cumulative distribution functions\n", 139 | "\n", 140 | "In the previous section we computed percentiles by iterating through the values and probabilities in a Pmf. If we need to compute more than a few percentiles, it is more efficient to use a cumulative distribution function (Cdf).\n", 141 | "\n", 142 | "Cdfs and Pmfs are equivalent in the sense that they contain the same information about the distribution, and you can convert one to the other. The advantage of the Cdf is that you can compute percentiles more efficiently.\n", 143 | "\n", 144 | "thinkbayes provides a Cdf class that represents a cumulative distribution function. Pmf provides a method that makes the corresponding Cdf:" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 15, 150 | "metadata": { 151 | "collapsed": false 152 | }, 153 | "outputs": [ 154 | { 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "(91, 242)\n" 159 | ] 160 | } 161 | ], 162 | "source": [ 163 | "cdf = suite.MakeCdf()\n", 164 | "\n", 165 | "# Cdf provides a function named Percentile\n", 166 | "interval = cdf.Percentile(5), cdf.Percentile(95)\n", 167 | "\n", 168 | "print(interval)" 169 | ] 170 | } 171 | ], 172 | "metadata": { 173 | "kernelspec": { 174 | "display_name": "Python 2", 175 | "language": "python", 176 | "name": "python2" 177 | }, 178 | "language_info": { 179 | "codemirror_mode": { 180 | "name": "ipython", 181 | "version": 2 182 | }, 183 | "file_extension": ".py", 184 | "mimetype": "text/x-python", 185 | "name": "python", 186 | "nbconvert_exporter": "python", 187 | "pygments_lexer": "ipython2", 188 | "version": "2.7.11" 189 | } 190 | }, 191 | "nbformat": 4, 192 | "nbformat_minor": 0 193 | } 194 | --------------------------------------------------------------------------------
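To make the Pmf-versus-Cdf efficiency point from that notebook concrete, here is a small self-contained sketch, independent of thinkbayes (the function and variable names are mine): build the cumulative sums once, then answer each percentile query with a binary search instead of rescanning the whole Pmf.

import bisect

def make_cdf(pmf_items):
    # pmf_items: iterable of (value, probability) pairs; values sortable.
    values, cumprobs = [], []
    total = 0.0
    for val, prob in sorted(pmf_items):
        total += prob
        values.append(val)
        cumprobs.append(total)
    return values, cumprobs

def percentile(values, cumprobs, percentage):
    # One O(log n) binary search per query, instead of an O(n) Pmf scan.
    i = bisect.bisect_left(cumprobs, percentage / 100.0)
    return values[i]

# e.g. a fair six-sided die:
vals, cps = make_cdf((v, 1 / 6.0) for v in range(1, 7))
print(percentile(vals, cps, 5), percentile(vals, cps, 95))   # -> 1 6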
\n", 12 | "\n", 13 | "To compute a **credible interval** add up the probabilities in the posterior distribution and record the values that correspond to the 5th and 95th percentiles.\n", 14 | "\n", 15 | "We can use ThinkBayes" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 6, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "def Percentile(pmf, percentage):\n", 27 | " p = percentage / 100.0\n", 28 | " total = 0\n", 29 | " for val, prob in pmf.Items():\n", 30 | " total += prob\n", 31 | " if total >= p:\n", 32 | " return val" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "Now import the locomotive suite of hypotheses so we can apply the Percentile function to it. " 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 11, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "import os\n", 51 | "import sys\n", 52 | "module_path = os.path.abspath(os.path.join('..'))\n", 53 | "if module_path not in sys.path:\n", 54 | " sys.path.append(module_path)\n", 55 | " \n", 56 | "from thinkbayes import Pmf, Suite" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 12, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "# Taken from the first \"Estimation\" tutorial\n", 68 | "class Dice(Suite): \n", 69 | " def Likelihood(self, data, hypo):\n", 70 | " if hypo < data:\n", 71 | " return 0 \n", 72 | " else:\n", 73 | " return 1.0/hypo\n", 74 | "\n", 75 | "# The likelihood function is the same in the Train as the Dice\n", 76 | "class Train(Dice):\n", 77 | " def __init__(self, hypos, alpha = 1.0): # Adding alpha to the arguments\n", 78 | " Pmf.__init__(self)\n", 79 | " for hypo in hypos:\n", 80 | " self.Set(hypo, hypo**(-alpha)) # adding in the power law here to alter the prior\n", 81 | " self.Normalize()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 13, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "hypos = range(1, 1001) # PRIOR p(H)\n", 93 | "suite = Train(hypos)\n", 94 | "\n", 95 | "for data in [60, 30, 90]:\n", 96 | " suite.Update(data)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "Now we can use the Percentile function we defined above." 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 10, 109 | "metadata": { 110 | "collapsed": false 111 | }, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "(91, 242)\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "# To use Percentile\n", 123 | "interval = Percentile(suite, 5), Percentile(suite, 95)\n", 124 | "print (interval)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "For the locomotive problem, using a power law prior and 3 trains, the 90% credible interval is (91, 243) - (5th ,95th). This very wide range correctly suggests the massive uncertainty in how many trains there are all together. " 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "### Cumulative distribution functions\n", 139 | "\n", 140 | "In the previous section we computed percentiles by iterating through the values and probabilities in a Pmf. 
If we need to compute more than a few percentiles, it is more efficient to use a cumulative distribution function (Cdf).\n", 141 | "\n", 142 | "Cdfs and Pmfs are equivalent in the sense that they contain the same information about the distribution, and you can convert on to the other. The advantage of the Cdf is that you can compute percentiles more efficiently.\n", 143 | "\n", 144 | "thinkbayes provides a Cdf class that represents a cumulative distribution function. Pmf provides a method that makes the corresponsing Cdf:" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 15, 150 | "metadata": { 151 | "collapsed": false 152 | }, 153 | "outputs": [ 154 | { 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "(91, 242)\n" 159 | ] 160 | } 161 | ], 162 | "source": [ 163 | "cdf = suite.MakeCdf()\n", 164 | "\n", 165 | "# Cdf provides a function named Percentile\n", 166 | "interval = cdf.Percentile(5), cdf.Percentile(95)\n", 167 | "\n", 168 | "print(interval)" 169 | ] 170 | } 171 | ], 172 | "metadata": { 173 | "kernelspec": { 174 | "display_name": "Python 3", 175 | "language": "python", 176 | "name": "python3" 177 | }, 178 | "language_info": { 179 | "codemirror_mode": { 180 | "name": "ipython", 181 | "version": 3 182 | }, 183 | "file_extension": ".py", 184 | "mimetype": "text/x-python", 185 | "name": "python", 186 | "nbconvert_exporter": "python", 187 | "pygments_lexer": "ipython3", 188 | "version": "3.5.1" 189 | } 190 | }, 191 | "nbformat": 4, 192 | "nbformat_minor": 0 193 | } 194 | -------------------------------------------------------------------------------- /windspeed/notebooks/010_1-windspeed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 184, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | " year month day hour ws\n", 15 | "count 29372.000000 29372.000000 29372.000000 29372.000000 29372.000000\n", 16 | "mean 1997.568058 6.460813 15.689160 1192.411140 5.479576\n", 17 | "std 7.985148 3.380070 8.809184 437.121055 1.993366\n", 18 | "min 1984.000000 1.000000 1.000000 0.000000 0.077814\n", 19 | "25% 1991.000000 4.000000 8.000000 900.000000 4.090943\n", 20 | "50% 1997.000000 6.000000 16.000000 1200.000000 5.421245\n", 21 | "75% 2005.000000 9.000000 23.000000 1500.000000 6.842833\n", 22 | "max 2012.000000 12.000000 31.000000 2100.000000 15.215400\n", 23 | " year month day\n", 24 | "0 1984 3 1\n", 25 | "1 1984 3 1\n", 26 | "2 1984 3 1\n", 27 | "3 1984 3 2\n", 28 | "4 1984 3 2\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "import pandas as pd\n", 34 | "import numpy as np\n", 35 | "from datetime import datetime\n", 36 | "\n", 37 | "column_names=[\"year\",\"month\",\"day\",\"hour\",\"ws\"]\n", 38 | "dtype={\"year\":int,\"month\":int,\"day\":int,\"hour\":int,\"ws\":float}\n", 39 | "\n", 40 | "date_spec = {'date_time': [0,1,2]}\n", 41 | "\n", 42 | "datafile='/home/sophie/projects/windspeed/data/61401BirMoghrein_allwinds.txt'\n", 43 | "\n", 44 | "#using infer_datetime_format=True didn't help\n", 45 | "#when you use keep_dat_col it keeps them as objects, not as the dtype you read them in as.\n", 46 | "wind = pd.read_csv(datafile, sep=\" \", names=column_names, index_col=False ) \n", 47 | "print wind.describe()\n", 48 | "print wind[['year','month','day']][0:5]" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 185, 54 | 
"metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "[1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998\n", 63 | " 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012]\n", 64 | "[ 3 4 5 6 7 8 9 10 11 12 1 2]\n", 65 | "[ 1 2 3 4 5 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 31 6 7\n", 66 | " 8 9 10 11 12 30]\n", 67 | "[ 600 1200 1800 900 1500 2100 0 300]\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "#checking what the unique values are in each column\n", 73 | "#A good check when you suspect discrete values\n", 74 | "for x in range(0,4): print wind[column_names[x]].unique()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 186, 80 | "metadata": { 81 | "collapsed": false 82 | }, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | "0 06\n", 89 | "1 12\n", 90 | "2 18\n", 91 | "3 06\n", 92 | "4 12\n", 93 | "Name: hour, dtype: object\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "wind[\"hour\"]=(wind[\"hour\"]/100).astype(int)\n", 99 | "wind[\"hour\"] = wind.hour.map(\"{:02}\".format)\n", 100 | "\n", 101 | "year = wind['year'].apply(str)[0:5]\n", 102 | "month = wind['month'].apply(str)[0:5]\n", 103 | "day = wind['day'].apply(str)[0:5]\n", 104 | "#hour = wind['hour'].apply(str)[0:5]; print hour\n", 105 | "hour = wind['hour'][0:5]; print hour\n" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 187, 111 | "metadata": { 112 | "collapsed": false 113 | }, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "0 1984-03-10 06:00:00\n", 120 | "1 1984-03-11 02:00:00\n", 121 | "2 1984-03-11 08:00:00\n", 122 | "3 1984-03-20 06:00:00\n", 123 | "4 1984-03-21 02:00:00\n", 124 | "dtype: datetime64[ns]\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "p = pd.to_datetime(year + month + day + hour, yearfirst=True, utc=True, format='%Y%m%d%H') ; print p" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 188, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | "0 1984-03-01 06:00:00\n", 144 | "1 1984-03-01 12:00:00\n", 145 | "2 1984-03-01 18:00:00\n", 146 | "3 1984-03-02 06:00:00\n", 147 | "4 1984-03-02 12:00:00\n", 148 | "Name: date_time, dtype: datetime64[ns]\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "#specify the columns you want to group together. Can't include hour at this point as it is not in the right format. 
\n", 154 | "date_spec = {'date_time': [0,1,2]}\n", 155 | "\n", 156 | "#when you use keep_dat_col it keeps them as objects, not as the dtype you read them in as.\n", 157 | "wind = pd.read_csv(datafile, sep=\" \", names=column_names, parse_dates=date_spec, keep_date_col=True, index_col=False ) \n", 158 | "\n", 159 | "#Dealing with hour - going from 600, 1200 etc to 6,12, 18\n", 160 | "wind[\"hour\"]=(wind[\"hour\"]/100).astype(int)\n", 161 | "\n", 162 | "#combining year, month, day that were parsed together into date_time with hour, which is now in the correct format.\n", 163 | "wind['date_time'] = pd.to_datetime(wind.date_time) + wind.hour.astype('timedelta64[h]')\n", 164 | "\n", 165 | "print wind.date_time[0:5]" 166 | ] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Python 2", 172 | "language": "python", 173 | "name": "python2" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 2 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython2", 185 | "version": "2.7.11" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 0 190 | } 191 | -------------------------------------------------------------------------------- /windspeed/scripts/040-group_tseries.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import datetime as datetime 4 | import matplotlib.pyplot as plt 5 | 6 | # Creating a panel of timeseries for each group of stations. 7 | 8 | # Panel will have a timeseries of 00,06,12,18 ws if that hour has at least 14 9 | # obs per month. 10 | 11 | # An average over the group will be an extra plot in the panel. 12 | 13 | NAl=['60525Biskra','60549Mecheria','60550Elbayadh', 14 | '60555Touggourt','60559ElOued','60566Ghardaia', 15 | '60580Ouargla','60581HassiMessaoud'] 16 | 17 | 18 | CSar=['60607Timimoun','60611InAmenas','60620Adrar','60630InSalah', 19 | '62103Ghadames','62124Sebha'] 20 | 21 | WSa=['61223Tombouctou','61226Gao','61230NioroDuSahel','61498Kiffa', 22 | '61499AiounElAtrouss','61492Kaedi','61497Nema','61450Tidjika'] 23 | 24 | CSal=['61024Agadez','61045Goure','61052Niamey','64753Faya', 25 | '61017Bilma'] 26 | 27 | Egy=['62387Minya','62393Asyut','62405Luxor','62414Asswan', 28 | '62420Baharia','62423Farafra','62435Kharga'] 29 | 30 | Sud=['62600WadiHalfa','62640AbuHamed','62650Dongola','62660Karima', 31 | '62680Atbara'] 32 | 33 | 34 | stations=[NAl,CSar,WSa,CSal,Egy,Sud] 35 | #stations = [CSal] 36 | 37 | group_names={'NAlgeria':NAl,'CSahara':CSar,'WSahel':WSa,'CSahel':CSal, 38 | 'Egypt':Egy,'Sudan':Sud} 39 | 40 | group_strings=['NAlgeria','CSahara','WSahel','CSahel', 'Egypt','Sudan'] 41 | #group_strings=['CSahara','WSahel'] 42 | 43 | 44 | # Could these two functions be turned into lambda functions? 45 | # Would that be preferable or are these fine? 
46 | 47 | def meanf(x): 48 | if x.count() > 10: 49 | return x.mean() 50 | 51 | def sdf(x): 52 | if x.count() > 10: 53 | return x.std() 54 | 55 | def read_file(fname): 56 | '''pass a station name into read_file and it will return wind_group, a 57 | dataframe of monthly mean and standard deviation windspeeds indexed by 58 | (year, month)''' 59 | 60 | 61 | column_names=["year","month","day","hour","ws"] 62 | dtype={"year":int,"month":int,"day":int,"hour":int,"ws":float} 63 | 64 | datafile='/home/sophie/projects/windspeed/data/%s_allwinds.txt' %fname 65 | 66 | # specify the columns you want to group together. Can't include hour at 67 | # this point as it is not in the right format. 68 | date_spec = {'date_time': [0,1,2]} 69 | 70 | # when you use keep_date_col it keeps them as objects, not as the dtype you 71 | # read them in as. 72 | wind = pd.read_csv(datafile, sep=" ", names=column_names, 73 | parse_dates=date_spec, keep_date_col=True, index_col=False ) 74 | 75 | # Dealing with hour - going from 600, 1200 etc to 6,12, 18 76 | wind["hour"]=(wind["hour"]/100).astype(int) 77 | 78 | # combining year, month, day that were parsed together into date_time with 79 | # hour, which is now in the correct format. 80 | wind['date_time'] = pd.to_datetime(wind.date_time) + \ 81 | wind.hour.astype('timedelta64[h]') 82 | 83 | # make datetime the index before making subsections. 84 | wind.index = wind['date_time'] 85 | 86 | # drop date_time index. For some reason it caused a problem at Niamey if I 87 | # didn't. 88 | #wind.drop('date_time', axis=1, inplace=True) 89 | 90 | # Also a good idea to drop duplicate rows. 91 | # For this case, where the datetime object is the same it needs to be 92 | # dropped, otherwise it doesn't let you add more columns, as in 93 | # wind['ws_0'] etc. below 94 | wind.drop_duplicates(['date_time'],inplace=True) 95 | 96 | # Adds extra columns where the value is kept if it meets isin() criteria. 97 | # Nan if it doesn't. 98 | wind['ws_0']= wind['ws'][wind['hour'].isin([0])] 99 | wind['ws_06']= wind['ws'][wind['hour'].isin([6])] 100 | wind['ws_12']= wind['ws'][wind['hour'].isin([12])] 101 | wind['ws_18']= wind['ws'][wind['hour'].isin([18])] 102 | 103 | group = wind.groupby(['year', 'month']) 104 | 105 | wind_group = group['ws','ws_0','ws_06','ws_12','ws_18'].agg([meanf,sdf]) 106 | 107 | return wind_group 108 | 109 | 110 | def plot_tseries(group): 111 | '''set up n+1 subplots where n is number of stations in the group. Fill in 112 | each plot with timeseries from each station and then a mean of all the 113 | stations.
Output to file eps.''' 114 | 115 | 116 | fig = plt.figure(figsize=(10,10)) 117 | 118 | for i in range(len(group)): 119 | 120 | # just for testing, see what group we are on 121 | print(group_strings[j]) 122 | print(type(group)) 123 | print(group[i]) 124 | 125 | # read in one station from the group, read_file will create a group by 126 | # object ready for plotting 127 | wind_group = read_file(group[i]) 128 | 129 | # check that there is data for the time period of interest 130 | #assert len(wind_group['1990':'1994']) != 0, ('No data for %s in this ' 131 | # 'time period so no plot!'% group[i]) 132 | 133 | if len(wind_group['1990':'1994']) != 0: 134 | # Dump the month part of the index to make the xaxis less crowded 135 | wind_group.index = wind_group.index.droplevel(['month']) 136 | 137 | # fig.add_subplot(nrows, ncols, num) 138 | 139 | ax = fig.add_subplot(int((len(group)+1)/2), 2, i+1) 140 | 141 | plt.title(s=group[i], fontsize=15) 142 | 143 | # May not need the if statements if I can solve the x problem below. 144 | # No, I do, so if there are no data in that time period it will be 145 | # caught - as in Ouargla! 146 | #print(len(wind_group.ws_0['meanf'])) 147 | 148 | wind_group.ws_0['meanf']['1990':'1994'].plot(figsize=(8,8),c='m') 149 | wind_group.ws_06['meanf']['1990':'1994'].plot(figsize=(8,8), c='r') 150 | wind_group.ws_12['meanf']['1990':'1994'].plot(figsize=(8,8),c='b') 151 | wind_group.ws_18['meanf']['1990':'1994'].plot(figsize=(8,8), c='c') 152 | 153 | ax.legend(loc=4,bbox_to_anchor=(0.95, 1.05),labels 154 | = ['00','06','12','18'],prop={'size':6}) 155 | 156 | plt.tight_layout() # very nice! stops the titles overlapping 157 | fig.suptitle(group_strings[j]) 158 | fig.savefig('/home/sophie/projects/windspeed/' 159 | 'output/%s.png'%(group_strings[j]),dpi=125) 160 | 161 | if __name__ == '__main__': 162 | 163 | # x is coming as a list and we need it as just an object name. 164 | for j,x in enumerate(stations): plot_tseries(x) 165 | #plot_tseries(NAl) 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /tutorials/ThinkBayes/043-Distributions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Computational Statistics\n", 8 | "\n", 9 | "### Distributions" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "Count the number of times each word appears in a sequence" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": { 23 | "collapsed": false 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "# This tells Python of that additional module import path. \n", 28 | "import os\n", 29 | "import sys\n", 30 | "module_path = os.path.abspath(os.path.join('..'))\n", 31 | "if module_path not in sys.path:\n", 32 | " sys.path.append(module_path)\n", 33 | " \n", 34 | "from thinkbayes import Pmf # Probability mass function\n", 35 | "\n", 36 | "# Creates an instance of class Pmf (pmf) to represent the distribution of outcomes for a six-sided die:\n", 37 | "# class Pmf inherits from _DictWrapper (an object which contains a dictionary)\n", 38 | "pmf = Pmf()\n", 39 | "\n", 40 | "# Set --> {1: 1/6.0, 2:1/6.0, 3:1/6.0....}\n", 41 | "for x in [1,2,3,4,5,6]:\n", 42 | " pmf.Set(x,1/6.0) # Set function is within the _DictWrapper class. So pmf inherits it." 
43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": { 49 | "collapsed": false 50 | }, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "dict_keys([1, 2, 3, 4, 5, 6])\n", 57 | "dict_items([(1, 0.16666666666666666), (2, 0.16666666666666666), (3, 0.16666666666666666), (4, 0.16666666666666666), (5, 0.16666666666666666), (6, 0.16666666666666666)])\n", 58 | "\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "# How to access values from pmf? You need to use the right methods. i.e.\n", 64 | "# \n", 65 | "print (pmf.Values()) # just gives the keys. \n", 66 | "print (pmf.Items()) # gives the key: value pairs in the dictionary\n", 67 | "print (pmf.Render()) # create items for plotting" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 20, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "# help(pmf) # A list of the available classes and methods." 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "#### The Cookie Problem" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 4, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "pmf = Pmf()\n", 97 | "\n", 98 | "# Hypothesis B1 and B2 (Bowl 1 and Bowl 2).\n", 99 | "# This is the prior distribution (contains the priors for each hypothesis)\n", 100 | "pmf.Set('Bowl 1', 0.5) # p(B1)\n", 101 | "pmf.Set('Bowl 2', 0.5) # p(B2)\n" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "To update the distribution based on new data (vanilla cookie) we multiply each prior by the corresponding likelihood.\n", 109 | "Now we have new data - A vanilla cookie! - we can update each of B1 and B2, i.e. determining p(B1|D) and p(B2|D). 
\n", 110 | "So for B1, this would be: \n", 111 | " p(B1|D) = prior\\*Prob of Vanilla from B1/ Prob of Vanilla from either bowl \n", 112 | " p(B1|D) = p(B1)\\*p(D|B1)/p(D) \n", 113 | " p(B1) = 1/2 (there are two bowls) \n", 114 | " p(D|B1) = 3/4 (ratio is 30:10 vanilla to choc) \n", 115 | " p(D) = 5/8 (80 cookies altogether in both bowls, 50 are vanilla) \n", 116 | " So: \n", 117 | " posterior = (1/2*3/2)/(5/8) \n", 118 | " \n", 119 | " \n", 120 | " p(B2|D) = prior*Prob of Vanilla from B1/ Prob of Vanilla from either bowl\n", 121 | "\n", 122 | "The likelihood of drawing a vanilla cookie from Bowl 1 is 3/4 and Bowl 2 is 1/2.\n" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 5, 128 | "metadata": { 129 | "collapsed": true 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "# Mult get the probability for the given hypothesis and multiplies by the given likelihood\n", 134 | "pmf.Mult('Bowl 1', 0.75)\n", 135 | "pmf.Mult('Bowl 2', 0.5)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "After this update, the distribution is no longer normalized, but because these hypotheses are mutally exclusive and collectively exhaustive, we can renormalize:" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 6, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/plain": [ 155 | "0.625" 156 | ] 157 | }, 158 | "execution_count": 6, 159 | "metadata": {}, 160 | "output_type": "execute_result" 161 | } 162 | ], 163 | "source": [ 164 | "pmf.Normalize()" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": { 170 | "collapsed": true 171 | }, 172 | "source": [ 173 | "The result is a distribution that contains the posterior probability for each hypothesis, now called the POSTERIOR DISTRIBUTION" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 7, 179 | "metadata": { 180 | "collapsed": false 181 | }, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "0.6000000000000001\n", 188 | "dict_keys(['Bowl 1', 'Bowl 2'])\n" 189 | ] 190 | } 191 | ], 192 | "source": [ 193 | "# Get the posterior probability for Bowl 1.\n", 194 | "print (pmf.Prob('Bowl 1'))\n", 195 | "\n", 196 | "print (pmf.Values())" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "collapsed": true 204 | }, 205 | "outputs": [], 206 | "source": [] 207 | } 208 | ], 209 | "metadata": { 210 | "kernelspec": { 211 | "display_name": "Python 3", 212 | "language": "python", 213 | "name": "python3" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 3 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython3", 225 | "version": "3.5.1" 226 | } 227 | }, 228 | "nbformat": 4, 229 | "nbformat_minor": 0 230 | } 231 | -------------------------------------------------------------------------------- /tutorials/Samsung/notebooks/031-Samsung_cleanup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 26, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "Populating the interactive namespace from numpy 
and matplotlib\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "# Changed the order in which things are done, from the previous workbook 030-Samsung_cleanup, as it does make a difference\n", 20 | "# for later commands. \n", 21 | "\n", 22 | "%pylab inline\n", 23 | "import pandas as pd\n", 24 | "\n", 25 | "# copy 2 is just a copy of features.\n", 26 | "df = pd.read_csv('/home/sophie/projects/Samsung/data/UCI_HAR_Dataset/UCI_HAR_Dataset/features_copy2.txt',sep=\" \",\n", 27 | " names = ['name'], dtype='str')\n", 28 | "\n", 29 | "# First will drop duplicates\n", 30 | "df.drop_duplicates(['name'],inplace=True)\n", 31 | "\n", 32 | "# remove numbers, brackets, \"-\" and \",\" from all columns\n", 33 | "\n", 34 | "df.name = df.name.str.replace('[()]', '') # remove brackets\n", 35 | "df.name = df.name.str.replace('[0-9]','') # remove any numbers\n", 36 | "\n", 37 | "df.drop_duplicates(['name'],inplace=True)\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 27, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [ 47 | { 48 | "name": "stdout", 49 | "output_type": "stream", 50 | "text": [ 51 | " name\n", 52 | "345 fBodyAccJerk-mean-X\n", 53 | "346 fBodyAccJerk-mean-Y\n", 54 | "347 fBodyAccJerk-mean-Z\n", 55 | "348 fBodyAccJerk-std-X\n", 56 | "349 fBodyAccJerk-std-Y\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "# Print out lines that contain \"Jerk\" and \"Mean\" to look for reasons why I may have dumped them\n", 62 | "print df[df.name.str.contains('f.*Jerk')][0:5]\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 41, 68 | "metadata": { 69 | "collapsed": false 70 | }, 71 | "outputs": [ 72 | { 73 | "ename": "AttributeError", 74 | "evalue": "'str' object has no attribute 'str'", 75 | "output_type": "error", 76 | "traceback": [ 77 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 78 | "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", 79 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;31m# conflicting. If I can't do it this way, can just use their list.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;31m# Had to keep \"-\" in in order to only remove columns with -X, -Y, -Z\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[0mdf\u001b[0m\u001b[1;33m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcontains\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'-X|-Y|-Z|min|max|mad|sma|iqr|entropy|energy|band|Coeff'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m==\u001b[0m \u001b[0mFalse\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 5\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[1;32mprint\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# How many are left now?\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 80 | "\u001b[1;31mAttributeError\u001b[0m: 'str' object has no attribute 'str'" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "# Dropping the lines i'm confident we definitely don't need. The documentation is actually a bit confusing and \n", 86 | "# conflicting. If I can't do it this way, can just use their list. 
\n", 87 | "# Had to keep \"-\" in in order to only remove columns with -X, -Y, -Z\n", 88 | "df= df.name[df.name.str.contains('-X|-Y|-Z|min|max|mad|sma|iqr|entropy|energy|band|Coeff') == False]\n", 89 | "\n", 90 | "print len(df) # How many are left now?" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 37, 96 | "metadata": { 97 | "collapsed": false 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "# Now to get rid of Body and Mag and change mean to Mean and std to SD. \n", 102 | "# Can also remove \"-\" and \",\"\n", 103 | "\n", 104 | "df = df.str.replace('Body', '')\n", 105 | "df = df.str.replace('Mag', '')\n", 106 | "df = df.str.replace('mean', 'Mean')\n", 107 | "df = df.str.replace('std', 'SD')\n", 108 | "df = df.str.replace('-', '')\n", 109 | "df = df.str.replace(',', '')" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 2, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "35\n" 124 | ] 125 | } 126 | ], 127 | "source": [ 128 | "b = [\"tAccMean\", \"tAccSD\", \"tJerkMean\", \"tJerkSD\",\"tGyroMean\", \"tGyroSD\", \"tGyroJerkMean\", \"tGyroJerkSD\",\n", 129 | "\"fAccMean\", \"fAccSD\", \"fJerkMean\", \"fJerkSD\",\n", 130 | "\"fGyroMean\", \"fGyroSD\", \"fGyroJerkMean\", \"fGyroJerkSD\",\n", 131 | "\"fGyroMeanFreq\", \"fGyroJerkMeanFreq\", \"fAccMeanFreq\", \"fJerkMeanFreq\",\n", 132 | "\"fAccSkewness\", \"fAccKurtosis\", \"fJerkSkewness\", \"fJerkKurtosis\",\n", 133 | "\"fGyroSkewness\", \"fGyroKurtosis\", \"fGyroJerkSkewness\", \"fGyroJerkKurtosis\",\n", 134 | "\"angleAccGravity\", \"angleJerkGravity\", \"angleGyroGravity\", \"angleGyroJerkGravity\",\n", 135 | "\"angleXGravity\", \"angleYGravity\", \"angleZGravity\"]\n", 136 | "\n", 137 | "print len(b) # Not sure why this is 31, when the documentation says there were 37 in the end.\n" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": { 144 | "collapsed": true 145 | }, 146 | "outputs": [], 147 | "source": [] 148 | } 149 | ], 150 | "metadata": { 151 | "kernelspec": { 152 | "display_name": "Python 2", 153 | "language": "python", 154 | "name": "python2" 155 | }, 156 | "language_info": { 157 | "codemirror_mode": { 158 | "name": "ipython", 159 | "version": 2 160 | }, 161 | "file_extension": ".py", 162 | "mimetype": "text/x-python", 163 | "name": "python", 164 | "nbconvert_exporter": "python", 165 | "pygments_lexer": "ipython2", 166 | "version": "2.7.11" 167 | } 168 | }, 169 | "nbformat": 4, 170 | "nbformat_minor": 0 171 | } 172 | -------------------------------------------------------------------------------- /tutorials/ThinkBayes/.ipynb_checkpoints/043-Distributions-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Computational Statistics\n", 8 | "\n", 9 | "### Distributions" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "Count the number of times each word appears in a sequence" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": { 23 | "collapsed": false 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "# This tells Python of that additional module import path. 
\n", 28 | "import os\n", 29 | "import sys\n", 30 | "module_path = os.path.abspath(os.path.join('..'))\n", 31 | "if module_path not in sys.path:\n", 32 | " sys.path.append(module_path)\n", 33 | " \n", 34 | "from thinkbayes import Pmf # Probability mass function\n", 35 | "\n", 36 | "# Creates an instance of class Pmf (pmf) to represent the distribution of outcomes for a six-sided die:\n", 37 | "# class Pmf inherits from _DictWrapper (an object which contains a dictionary)\n", 38 | "pmf = Pmf()\n", 39 | "\n", 40 | "# Set --> {1: 1/6.0, 2:1/6.0, 3:1/6.0....}\n", 41 | "for x in [1,2,3,4,5,6]:\n", 42 | " pmf.Set(x,1/6.0) # Set function is within the _DictWrapper class. So pmf inherits it." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": { 49 | "collapsed": false 50 | }, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "dict_keys([1, 2, 3, 4, 5, 6])\n", 57 | "dict_items([(1, 0.16666666666666666), (2, 0.16666666666666666), (3, 0.16666666666666666), (4, 0.16666666666666666), (5, 0.16666666666666666), (6, 0.16666666666666666)])\n", 58 | "\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "# How to access values from pmf? You need to use the right methods. i.e.\n", 64 | "# \n", 65 | "print (pmf.Values()) # just gives the keys. \n", 66 | "print (pmf.Items()) # gives the key: value pairs in the dictionary\n", 67 | "print (pmf.Render()) # create items for plotting" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 20, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "# help(pmf) # A list of the available classes and methods." 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "#### The Cookie Problem" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 4, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "pmf = Pmf()\n", 97 | "\n", 98 | "# Hypothesis B1 and B2 (Bowl 1 and Bowl 2).\n", 99 | "# This is the prior distribution (contains the priors for each hypothesis)\n", 100 | "pmf.Set('Bowl 1', 0.5) # p(B1)\n", 101 | "pmf.Set('Bowl 2', 0.5) # p(B2)\n" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "To update the distribution based on new data (vanilla cookie) we multiply each prior by the corresponding likelihood.\n", 109 | "Now we have new data - A vanilla cookie! - we can update each of B1 and B2, i.e. determining p(B1|D) and p(B2|D). 
\n", 110 | "So for B1, this would be: \n", 111 | " p(B1|D) = prior\\*Prob of Vanilla from B1/ Prob of Vanilla from either bowl \n", 112 | " p(B1|D) = p(B1)\\*p(D|B1)/p(D) \n", 113 | " p(B1) = 1/2 (there are two bowls) \n", 114 | " p(D|B1) = 3/4 (ratio is 30:10 vanilla to choc) \n", 115 | " p(D) = 5/8 (80 cookies altogether in both bowls, 50 are vanilla) \n", 116 | " So: \n", 117 | " posterior = (1/2*3/2)/(5/8) \n", 118 | " \n", 119 | " \n", 120 | " p(B2|D) = prior*Prob of Vanilla from B1/ Prob of Vanilla from either bowl\n", 121 | "\n", 122 | "The likelihood of drawing a vanilla cookie from Bowl 1 is 3/4 and Bowl 2 is 1/2.\n" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 5, 128 | "metadata": { 129 | "collapsed": true 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "# Mult get the probability for the given hypothesis and multiplies by the given likelihood\n", 134 | "pmf.Mult('Bowl 1', 0.75)\n", 135 | "pmf.Mult('Bowl 2', 0.5)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "After this update, the distribution is no longer normalized, but because these hypotheses are mutally exclusive and collectively exhaustive, we can renormalize:" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 6, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/plain": [ 155 | "0.625" 156 | ] 157 | }, 158 | "execution_count": 6, 159 | "metadata": {}, 160 | "output_type": "execute_result" 161 | } 162 | ], 163 | "source": [ 164 | "pmf.Normalize()" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": { 170 | "collapsed": true 171 | }, 172 | "source": [ 173 | "The result is a distribution that contains the posterior probability for each hypothesis, now called the POSTERIOR DISTRIBUTION" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 7, 179 | "metadata": { 180 | "collapsed": false 181 | }, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "0.6000000000000001\n", 188 | "dict_keys(['Bowl 1', 'Bowl 2'])\n" 189 | ] 190 | } 191 | ], 192 | "source": [ 193 | "# Get the posterior probability for Bowl 1.\n", 194 | "print (pmf.Prob('Bowl 1'))\n", 195 | "\n", 196 | "print (pmf.Values())" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "collapsed": true 204 | }, 205 | "outputs": [], 206 | "source": [] 207 | } 208 | ], 209 | "metadata": { 210 | "kernelspec": { 211 | "display_name": "Python 3", 212 | "language": "python", 213 | "name": "python3" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 3 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython3", 225 | "version": "3.5.1" 226 | } 227 | }, 228 | "nbformat": 4, 229 | "nbformat_minor": 0 230 | } 231 | --------------------------------------------------------------------------------