├── firstform ├── tests │ ├── __init__.py │ ├── tools.py │ └── app_tests.py ├── firstform │ ├── __init__.py │ └── __init__.pyc ├── bin │ ├── app.pyc │ ├── tools.py │ ├── __init__.py │ └── app.py ├── templates │ ├── layout.html │ ├── index.html │ └── hello_form.html └── setup.py ├── gothonweb ├── bin │ ├── __init__.py │ ├── app.py │ └── map.py ├── tests │ ├── __init__.py │ ├── tools.py │ ├── app_tests.py │ └── map_tests.py ├── templates │ ├── you_died.html │ ├── layout.html │ └── show_room.html ├── sessions │ ├── 40ad7454d4b4cbacedaa449f7e2c8fb04165ecf4 │ ├── 5524b4c1828de273b8ae4c70bbbe0e631e031e4a │ └── 6adbe20488a3ffd0040abc4ac06991d1d79c97d0 └── setup.py ├── .gitignore ├── tutorials ├── exercism_py3 │ ├── leap │ │ ├── .cache │ │ │ └── v │ │ │ │ └── cache │ │ │ │ └── lastfailed │ │ ├── year5.py │ │ ├── leap.py │ │ ├── year4.py │ │ ├── year.py │ │ ├── leap_test.py │ │ └── README.md │ ├── hello-world │ │ ├── .cache │ │ │ └── v │ │ │ │ └── cache │ │ │ │ └── lastfailed │ │ ├── hello_world2.py │ │ ├── hello_world.py │ │ ├── hello_world_test.py │ │ ├── hello_world_test2.py │ │ └── README.md │ ├── Ex5_hamming │ │ ├── hamming2.py │ │ └── hamming.py │ ├── dna │ │ ├── dna2.py │ │ └── dna.py │ ├── word_count │ │ ├── wordcount3.py │ │ ├── wordcount2.py │ │ ├── README.md │ │ └── word_count_test.py │ └── pangram │ │ ├── pangram.py │ │ ├── pangram2.py │ │ └── pangram_detailed.py ├── ThinkBayes │ ├── thinkbayesLoco.png │ ├── thinkbayesLoco2.png │ ├── thinkbayeseuro.png │ ├── thinkbayeseuro2.png │ ├── thinkbayesprice.png │ ├── thinkbayesprice2.png │ ├── thinkbayesprice3.png │ ├── .ipynb_checkpoints │ │ ├── 046-ImplimentingSuite-checkpoint.ipynb │ │ ├── 056 - Chap6DecisionAnalysis-checkpoint.ipynb │ │ ├── 046-Suite_m&m-checkpoint.ipynb │ │ ├── 046-MontyHall_framework-checkpoint.ipynb │ │ ├── 047-Dice-checkpoint.ipynb │ │ ├── 049-Credible_intervals_cdfs-checkpoint.ipynb │ │ └── 043-Distributions-checkpoint.ipynb │ ├── 056 - Chap6DecisionAnalysis.ipynb │ ├── 046-ImplimentingSuite.ipynb │ ├── 046-Suite_m&m.ipynb │ ├── 046-MontyHall_framework.ipynb │ ├── 047-Dice.ipynb │ ├── 049-Credible_intervals_cdfs.ipynb │ └── 043-Distributions.ipynb ├── algorithms │ ├── notebooks │ │ ├── .ipynb_checkpoints │ │ │ ├── 068-Lesson2-checkpoint.ipynb │ │ │ └── Lesson1-checkpoint.ipynb │ │ ├── 068-Lesson2.ipynb │ │ └── Lesson1.ipynb │ └── scripts │ │ ├── L1_Eulerian_Q10.py │ │ └── L1_EulerianPath.py ├── KaggleNLP │ └── word_vectors.py ├── K-means │ └── kmeans.py ├── Samsung │ └── notebooks │ │ ├── 029-Samsung_cleanup.ipynb │ │ └── 031-Samsung_cleanup.ipynb └── 026-Linear_Regression_Analysis.ipynb ├── windspeed ├── plots │ ├── WSahel.png │ ├── 038-62124Sebha.png │ └── 038-62124Sebha_2.png ├── scripts │ ├── 012-ws_tseries.py │ ├── 030-group_tseries.py │ ├── 037-group_tseries.py │ ├── 013-ws_tseries.py │ ├── 038-group_tseries.py │ ├── 039-group_tseries.py │ └── 040-group_tseries.py └── notebooks │ └── 010_1-windspeed.ipynb ├── SQL └── galaXQL_17.sql ├── 001-git-basics.md ├── monkeylearn └── 015-selectdata.py ├── DSFromScratch ├── Chap13 │ └── machine_learning.py └── Chap6 │ ├── 064-Chap6.ipynb │ └── .ipynb_checkpoints │ └── 064-Chap6-checkpoint.ipynb ├── Titanic └── bin │ ├── clean_test.py │ └── clean_test_53.py └── TOdo.md /firstform/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gothonweb/bin/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /gothonweb/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /firstform/firstform/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | 3 | API_key.txt -------------------------------------------------------------------------------- /tutorials/exercism_py3/leap/.cache/v/cache/lastfailed: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /tutorials/exercism_py3/hello-world/.cache/v/cache/lastfailed: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /firstform/bin/app.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SophMC/notechain/HEAD/firstform/bin/app.pyc -------------------------------------------------------------------------------- /windspeed/plots/WSahel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SophMC/notechain/HEAD/windspeed/plots/WSahel.png -------------------------------------------------------------------------------- /firstform/firstform/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SophMC/notechain/HEAD/firstform/firstform/__init__.pyc -------------------------------------------------------------------------------- /windspeed/plots/038-62124Sebha.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SophMC/notechain/HEAD/windspeed/plots/038-62124Sebha.png -------------------------------------------------------------------------------- /tutorials/exercism_py3/Ex5_hamming/hamming2.py: -------------------------------------------------------------------------------- 1 | def distance(dna1, dna2): 2 | return sum(d1 != d2 for d1, d2 in zip(dna1, dna2)) -------------------------------------------------------------------------------- /windspeed/plots/038-62124Sebha_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SophMC/notechain/HEAD/windspeed/plots/038-62124Sebha_2.png -------------------------------------------------------------------------------- /tutorials/ThinkBayes/thinkbayesLoco.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SophMC/notechain/HEAD/tutorials/ThinkBayes/thinkbayesLoco.png -------------------------------------------------------------------------------- /tutorials/ThinkBayes/thinkbayesLoco2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SophMC/notechain/HEAD/tutorials/ThinkBayes/thinkbayesLoco2.png -------------------------------------------------------------------------------- /tutorials/ThinkBayes/thinkbayeseuro.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SophMC/notechain/HEAD/tutorials/ThinkBayes/thinkbayeseuro.png -------------------------------------------------------------------------------- /tutorials/ThinkBayes/thinkbayeseuro2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SophMC/notechain/HEAD/tutorials/ThinkBayes/thinkbayeseuro2.png -------------------------------------------------------------------------------- /tutorials/ThinkBayes/thinkbayesprice.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SophMC/notechain/HEAD/tutorials/ThinkBayes/thinkbayesprice.png -------------------------------------------------------------------------------- /tutorials/ThinkBayes/thinkbayesprice2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SophMC/notechain/HEAD/tutorials/ThinkBayes/thinkbayesprice2.png -------------------------------------------------------------------------------- /tutorials/ThinkBayes/thinkbayesprice3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SophMC/notechain/HEAD/tutorials/ThinkBayes/thinkbayesprice3.png -------------------------------------------------------------------------------- /gothonweb/templates/you_died.html: -------------------------------------------------------------------------------- 1 |
<h1>You Died!</h1>
2 | 
3 | <p>Looks like you bit the dust.</p>
4 | <p><a href="/">Play Again</a></p>
5 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/dna/dna2.py: -------------------------------------------------------------------------------- 1 | DNA_TO_RNA = str.maketrans("GCTA", "CGAU") 2 | 3 | def to_rna(dna): 4 | return dna.translate(DNA_TO_RNA) -------------------------------------------------------------------------------- /tutorials/ThinkBayes/.ipynb_checkpoints/046-ImplimentingSuite-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /tutorials/algorithms/notebooks/.ipynb_checkpoints/068-Lesson2-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /tutorials/ThinkBayes/.ipynb_checkpoints/056 - Chap6DecisionAnalysis-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 0 6 | } 7 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/Ex5_hamming/hamming.py: -------------------------------------------------------------------------------- 1 | def distance(x,y): 2 | count=0 3 | for i,x in enumerate(x): 4 | if x != y[i]: count += 1 5 | return count 6 | 7 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/hello-world/hello_world2.py: -------------------------------------------------------------------------------- 1 | # 2 | # Skeleton file for the Python "Hello World" exercise. 3 | # 4 | 5 | def hello(name=''): 6 | return 'Hello, %s!' 
% (name or 'World') -------------------------------------------------------------------------------- /tutorials/exercism_py3/dna/dna.py: -------------------------------------------------------------------------------- 1 | 2 | def to_rna(dna): 3 | 4 | d={'G':'C','C':'G','T':'A','A':'U'} 5 | 6 | p = list(dna) 7 | return ''.join([d[m] for m in p]) 8 | 9 | -------------------------------------------------------------------------------- /gothonweb/sessions/40ad7454d4b4cbacedaa449f7e2c8fb04165ecf4: -------------------------------------------------------------------------------- 1 | KGRwMQpTJ2lwJwpwMgpWMTI3LjAuMC4xCnAzCnNTJ3Jvb20nCnA0Ck5zUydzZXNzaW9uX2lkJwpw 2 | NQpTJzQwYWQ3NDU0ZDRiNGNiYWNlZGFhNDQ5ZjdlMmM4ZmIwNDE2NWVjZjQnCnA2CnMu 3 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/word_count/wordcount3.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | import re 3 | 4 | 5 | def word_count(phrase): 6 | return Counter(re.findall(r"[\w]+", phrase.lower().replace('_', ' '))) -------------------------------------------------------------------------------- /tutorials/exercism_py3/pangram/pangram.py: -------------------------------------------------------------------------------- 1 | 2 | # -*- coding: UTF-8 -*- 3 | 4 | import re 5 | 6 | def is_pangram(s): 7 | 8 | letters = re.sub('[^a-zA-Z]','',s) 9 | 10 | return len(list(set(letters.lower())))== 26 11 | 12 | -------------------------------------------------------------------------------- /firstform/templates/layout.html: -------------------------------------------------------------------------------- 1 | $def with (content) 2 | 3 | 4 | 5 | My first form 6 | 7 | 8 | 9 | 10 | 11 | $:content 12 | 13 | 14 | -------------------------------------------------------------------------------- /gothonweb/templates/layout.html: -------------------------------------------------------------------------------- 1 | $def with (content) 2 | 3 | 4 | 5 | Interactive Game 6 | 7 | 8 | 9 | 10 | 11 | $:content 12 | 13 | 14 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/leap/year5.py: -------------------------------------------------------------------------------- 1 | 2 | def is_leap_year(year): 3 | return year % 4 == 0 and (year % 100 != 0 or year % 400 == 0) 4 | 5 | if __name__ == '__main__': 6 | 7 | year = int(input('Type in a year to test if it is a leap year\n> ')) 8 | is_leap_year(year) 9 | 10 | -------------------------------------------------------------------------------- /SQL/galaXQL_17.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO hilight 2 | SELECT stars.starid AS starid 3 | FROM stars 4 | LEFT OUTER JOIN planets ON stars.starid == planets.starid 5 | LEFT OUTER JOIN moons ON planets.planetid == moons.planetid 6 | GROUP BY stars.starid ORDER BY (COUNT(planets.planetid) + COUNT(moons.moonid)) 7 | DESC 8 | LIMIT 1 -------------------------------------------------------------------------------- /tutorials/exercism_py3/leap/leap.py: -------------------------------------------------------------------------------- 1 | 2 | def is_leap_year(year): 3 | if (year%400 !=0) & (year%4 != 0) & (year%100 != 0): 4 | print ("%d is not a leap year" % year) 5 | return False 6 | else: 7 | print ("%d is a leap year!" 
% year) 8 | return True 9 | 10 | year = int(input('Type in a year to test if it is a leap year\n> ')) 11 | is_leap_year(year) -------------------------------------------------------------------------------- /firstform/templates/index.html: -------------------------------------------------------------------------------- 1 | $def with (greeting) 2 | 3 | $if greeting: 4 | I would just like to say \ 5 | $greeting. 6 | $else: 7 | Hello, world! 8 | 10 |
<p><a href="/hello">Input Form</a> takes you back to the 11 | submission form.</p>
12 | 13 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/leap/year4.py: -------------------------------------------------------------------------------- 1 | 2 | def is_leap_year(year): 3 | 4 | if year % 4 ==0 and year % 100 != 0 or year % 400 == 0: 5 | print ("%d is a leap year! "% year) 6 | return True 7 | 8 | else: 9 | print ("%d is not a leap year" % year) 10 | return False 11 | 12 | if __name__ == '__main__': 13 | 14 | year = int(input('Type in a year to test if it is a leap year\n> ')) 15 | is_leap_year(year) 16 | 17 | -------------------------------------------------------------------------------- /firstform/templates/hello_form.html: -------------------------------------------------------------------------------- 1 |
<h1>Fill Out This Form, Please</h1>
2 | 3 | 4 | <form action="/hello" method="POST">
5 | 6 | A Greeting: <input type="text" name="greet">
7 | <br/>
8 | <br/>
9 | Your Name: <input type="text" name="name">
10 | <br/>
11 | <input type="submit">
12 | 13 | </form>
14 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/hello-world/hello_world.py: -------------------------------------------------------------------------------- 1 | # 2 | # Skeleton file for the Python "Hello World" exercise. 3 | # 4 | def hello(name=''): 5 | 6 | if name == '': 7 | greeting = "Hello, World!" 8 | print (greeting) 9 | return greeting 10 | 11 | else: 12 | greeting = 'Hello, %s!' % name 13 | print (greeting) 14 | return greeting 15 | 16 | if __name__ == '__main__': 17 | 18 | name = input('What is your name?\n> ') 19 | hello(name) -------------------------------------------------------------------------------- /firstform/setup.py: -------------------------------------------------------------------------------- 1 | try: 2 | from setuptools import setup 3 | except ImportError: 4 | from distutils.core import setup 5 | 6 | config = { 7 | 'description': 'My Project', 8 | 'author': 'Sophie Cowie', 9 | 'url': 'URL to get it at.', 10 | 'download_url': 'Where to download it.', 11 | 'author_email': 'sophie_cowie@hotmail.com', 12 | 'version': '0.1', 13 | 'install_requires': ['nose'], 14 | 'packages': ['NAME'], 15 | 'scripts': [], 16 | 'name': 'gothonweb' 17 | } 18 | 19 | setup(**config) -------------------------------------------------------------------------------- /gothonweb/setup.py: -------------------------------------------------------------------------------- 1 | try: 2 | from setuptools import setup 3 | except ImportError: 4 | from distutils.core import setup 5 | 6 | config = { 7 | 'description': 'My Project', 8 | 'author': 'Sophie Cowie', 9 | 'url': 'URL to get it at.', 10 | 'download_url': 'Where to download it.', 11 | 'author_email': 'sophie_cowie@hotmail.com', 12 | 'version': '0.1', 13 | 'install_requires': ['nose'], 14 | 'packages': ['NAME'], 15 | 'scripts': [], 16 | 'name': 'projectname' 17 | } 18 | 19 | setup(**config) -------------------------------------------------------------------------------- /tutorials/exercism_py3/leap/year.py: -------------------------------------------------------------------------------- 1 | 2 | def is_leap_year(year): 3 | 4 | b = (year%4 ==0) 5 | c = (year%100 != 0) 6 | d = (year%400 == 0) 7 | 8 | if b == True and c == True or d == True: 9 | print ("%d is a leap year! "% year) 10 | return True 11 | 12 | else: 13 | print ("%d is not a leap year" % year) 14 | return False 15 | 16 | if __name__ == '__main__': 17 | 18 | year = int(input('Type in a year to test if it is a leap year\n> ')) 19 | is_leap_year(year) 20 | 21 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/pangram/pangram2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | 3 | ALPHABET = 'abcdefghijklmnopqrstuvwxyz ' 4 | 5 | 6 | def is_pangram(s): 7 | 8 | 9 | return set(list(s.lower())) >= set(ALPHABET) 10 | 11 | if __name__ == '__main__': 12 | 13 | #is_pangram('the quick brown fox jumps over the lazy dog') 14 | # When I declare the encoding at the beginning, it doesnt throw up an error 15 | # with string here. 16 | string = 'Victor jagt zwölf Boxkämpfer quer über den großen Sylter Deich.' 
17 | #new = string.encode('utf-8') 18 | is_pangram(string) -------------------------------------------------------------------------------- /001-git-basics.md: -------------------------------------------------------------------------------- 1 | Some basic git commands I used today to set this up: 2 | 3 | 4 | `git init` initialises a local repository 5 | 6 | 7 | `git add` stages the work to Index 8 | 9 | `git commit -m "comment"` saves the work to the repository 10 | 11 | 12 | Now the locally saved work can be added to the remote repository. 13 | 14 | First you want to connect to the remote server: 15 | `git remote add origin git@github.com:SophMC/notechain` 16 | 17 | 18 | `git push -u origin master` -u is added the first time, after that you just 19 | need to be inside the local repo that you want to push and type 20 | `git push` 21 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/leap/leap_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from year5 import is_leap_year 4 | 5 | 6 | class YearTest(unittest.TestCase): 7 | def test_leap_year(self): 8 | self.assertIs(is_leap_year(1996), True) 9 | 10 | def test_non_leap_year(self): 11 | self.assertIs(is_leap_year(1997), False) 12 | 13 | def test_non_leap_even_year(self): 14 | self.assertIs(is_leap_year(1998), False) 15 | 16 | def test_century(self): 17 | self.assertIs(is_leap_year(1900), False) 18 | 19 | def test_exceptional_century(self): 20 | self.assertIs(is_leap_year(2400), True) 21 | 22 | if __name__ == '__main__': 23 | unittest.main() 24 | -------------------------------------------------------------------------------- /firstform/bin/tools.py: -------------------------------------------------------------------------------- 1 | from nose.tools import * 2 | import re 3 | 4 | def assert_response(resp, contains=None, matches=None,headers=None, 5 | status="200"): 6 | assert status in resp.status, \ 7 | "Expected response %r not in %r" \ 8 | % (status, resp.status) 9 | 10 | if status == "200": 11 | assert resp.data, "Response data is empty." 12 | 13 | if contains: 14 | assert contains in resp.data, "Response does not contain %r"\ 15 | % contains 16 | 17 | if matches: 18 | reg = re.compile(matches) 19 | assert reg.matches(resp.data), "Response does not match %r"\ 20 | % matches 21 | 22 | if headers: 23 | assert_equal(resp.headers,headers) -------------------------------------------------------------------------------- /firstform/bin/__init__.py: -------------------------------------------------------------------------------- 1 | from nose.tools import * 2 | import re 3 | 4 | def assert_response(resp, contains=None, matches=None,headers=None, 5 | status="200"): 6 | assert status in resp.status, \ 7 | "Expected response %r not in %r" \ 8 | % (status, resp.status) 9 | 10 | if status == "200": 11 | assert resp.data, "Response data is empty." 
12 | 13 | if contains: 14 | assert contains in resp.data, "Response does not contain %r"\ 15 | % contains 16 | 17 | if matches: 18 | reg = re.compile(matches) 19 | assert reg.matches(resp.data), "Response does not match %r"\ 20 | % matches 21 | 22 | if headers: 23 | assert_equal(resp.headers,headers) -------------------------------------------------------------------------------- /firstform/tests/tools.py: -------------------------------------------------------------------------------- 1 | from nose.tools import * 2 | import re 3 | 4 | 5 | 6 | def assert_response(resp, contains=None, matches=None,headers=None, 7 | status="200"): 8 | assert status in resp.status, \ 9 | "Expected response %r not in %r" \ 10 | % (status, resp.status) 11 | 12 | if status == "200": 13 | assert resp.data, "Response data is empty." 14 | 15 | if contains: 16 | assert contains in resp.data, "Response does not contain %r"\ 17 | % contains 18 | 19 | if matches: 20 | reg = re.compile(matches) 21 | assert reg.matches(resp.data), "Response does not match %r"\ 22 | % matches 23 | 24 | if headers: 25 | assert_equal(resp.headers,headers) -------------------------------------------------------------------------------- /gothonweb/tests/tools.py: -------------------------------------------------------------------------------- 1 | from nose.tools import * 2 | import re 3 | 4 | def assert_response(resp, contains=None, matches=None,headers=None, 5 | status="200"): 6 | assert status in resp.status, \ 7 | "Expected response %r not in %r" \ 8 | % (status, resp.status) 9 | 10 | if status == "200": 11 | assert resp.data, "Response data is empty." 12 | 13 | if contains: 14 | #confirm that number x, is in resp.data and if now print out.."Response 15 | #does not contain...." 16 | assert contains in resp.data, "Response does not contain %r"\ 17 | % contains 18 | 19 | if matches: 20 | reg = re.compile(matches) 21 | assert reg.matches(resp.data), "Response does not match %r"\ 22 | % matches 23 | 24 | if headers: 25 | assert_equal(resp.headers,headers) -------------------------------------------------------------------------------- /tutorials/exercism_py3/word_count/wordcount2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import re 4 | 5 | def word_count(sentence): 6 | 7 | sentence = re.sub('[,_]',' ',sentence) 8 | 9 | # ^ to substitute things that are NOT \s(spaces) and \w(alphanumeric 10 | # characters-letters). r, means raw string notation. 11 | sentence = re.sub(r'[^\s\w_]+', '', sentence.lower()) 12 | f = sentence.split() 13 | 14 | # Make a dictionary to store the pairs 15 | p = {} 16 | 17 | for x in f: 18 | 19 | # \b before and after helps to preserve whole words. 20 | matches = re.findall((r'\b%s\b'%x),' '.join(x for x in f)) 21 | 22 | #match the key to the value in the dictionary 23 | p[x] = len(matches) 24 | 25 | return p 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/word_count/README.md: -------------------------------------------------------------------------------- 1 | # Word Count 2 | 3 | Write a program that given a phrase can count the occurrences of each word in that phrase. 4 | 5 | For example for the input `"olly olly in come free"` 6 | 7 | ```plain 8 | olly: 2 9 | in: 1 10 | come: 1 11 | free: 1 12 | ``` 13 | 14 | 15 | ### Submitting Exercises 16 | 17 | Note that, when trying to submit an exercise, make sure the solution is in the `exercism/python/` directory. 
18 | 19 | For example, if you're submitting `bob.py` for the Bob exercise, the submit command would be something like `exercism submit /python/bob/bob.py`. 20 | 21 | 22 | For more detailed information about running tests, code style and linting, 23 | please see the [help page](http://exercism.io/languages/python). 24 | 25 | ## Source 26 | 27 | This is a classic toy problem, but we were reminded of it by seeing it in the Go Tour. 28 | -------------------------------------------------------------------------------- /firstform/tests/app_tests.py: -------------------------------------------------------------------------------- 1 | from nose.tools import * 2 | #How to import an application and run it directly for the automated test! 3 | #Important! 4 | from bin.app import app 5 | #From dir tests, import assert_response function from tools.py 6 | from tests.tools import assert_response 7 | 8 | def test_index(): 9 | # check that we get a 404 on the / URL 10 | resp = app.request("/") 11 | assert_response(resp,status="404") 12 | 13 | #test our first GET request to /hello 14 | resp = app.request("/hello") 15 | assert_response(resp) 16 | 17 | #make sure default values work for the form 18 | resp = app.request("/hello", method="POST") 19 | assert_response(resp, contains="Nobody") 20 | 21 | # test that we get expected values 22 | data = {'name':'Zed','greet':'Hola'} 23 | resp = app.request("/hello", method="POST",data=data) 24 | assert_response(resp,contains="Zed") -------------------------------------------------------------------------------- /gothonweb/tests/app_tests.py: -------------------------------------------------------------------------------- 1 | from nose.tools import * 2 | #How to import an application and run it directly for the automated test! 3 | #Important! 
4 | from bin.app import app 5 | #From dir tests, import assert_response function from tools.py 6 | from tests.tools import assert_response 7 | 8 | def test_index(): 9 | # check that we get a 404 on the / URL 10 | resp = app.request("/") 11 | assert_response(resp,status="404") 12 | 13 | #test our first GET request to /hello 14 | resp = app.request("/game") 15 | assert_response(resp) 16 | 17 | #make sure default values work for the form 18 | #resp = app.request("/game", method="POST") 19 | #assert_response(resp, action=None) 20 | 21 | # test that we get expected values 22 | #data = {'name':'Zed','greet':'Hola'} 23 | #resp = app.request("/hello", method="POST",data=data) 24 | #assert_response(resp,contains="Zed") -------------------------------------------------------------------------------- /tutorials/algorithms/notebooks/068-Lesson2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "m $\\in \\Theta$ (n)" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [] 18 | } 19 | ], 20 | "metadata": { 21 | "anaconda-cloud": {}, 22 | "kernelspec": { 23 | "display_name": "Python [Root]", 24 | "language": "python", 25 | "name": "Python [Root]" 26 | }, 27 | "language_info": { 28 | "codemirror_mode": { 29 | "name": "ipython", 30 | "version": 3 31 | }, 32 | "file_extension": ".py", 33 | "mimetype": "text/x-python", 34 | "name": "python", 35 | "nbconvert_exporter": "python", 36 | "pygments_lexer": "ipython3", 37 | "version": "3.5.2" 38 | } 39 | }, 40 | "nbformat": 4, 41 | "nbformat_minor": 0 42 | } 43 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/hello-world/hello_world_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import unicode_literals 4 | import unittest 5 | 6 | import hello_world2 7 | 8 | 9 | class HelloWorldTests(unittest.TestCase): 10 | 11 | def test_hello_without_name(self): 12 | self.assertEqual( 13 | 'Hello, World!', 14 | hello_world2.hello() 15 | ) 16 | 17 | def test_hello_with_sample_name(self): 18 | self.assertEqual( 19 | 'Hello, Alice!', 20 | hello_world2.hello('Alice') 21 | ) 22 | 23 | def test_hello_with_other_sample_name(self): 24 | self.assertEqual( 25 | 'Hello, Bob!', 26 | hello_world2.hello('Bob') 27 | ) 28 | 29 | def test_hello_with_umlaut_name(self): 30 | self.assertEqual( 31 | 'Hello, Jürgen!', 32 | hello_world2.hello('Jürgen') 33 | ) 34 | 35 | if __name__ == '__main__': 36 | unittest.main() 37 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/hello-world/hello_world_test2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from __future__ import unicode_literals 4 | import unittest 5 | 6 | import hello_world2 7 | 8 | 9 | class HelloWorldTests(unittest.TestCase): 10 | 11 | def test_hello_without_name(self): 12 | self.assertEqual( 13 | 'Hello, World!', 14 | hello_world2.hello() 15 | ) 16 | 17 | def test_hello_with_sample_name(self): 18 | self.assertEqual( 19 | 'Hello, Alice!', 20 | hello_world2.hello('Alice') 21 | ) 22 | 23 | def test_hello_with_other_sample_name(self): 24 | self.assertEqual( 25 | 'Hello, Bob!', 26 | hello_world2.hello('Bob') 27 | ) 28 | 29 | def 
test_hello_with_umlaut_name(self): 30 | self.assertEqual( 31 | 'Hello, Jürgen!', 32 | hello_world2.hello('Jürgen') 33 | ) 34 | 35 | if __name__ == '__main__': 36 | unittest.main() 37 | -------------------------------------------------------------------------------- /firstform/bin/app.py: -------------------------------------------------------------------------------- 1 | import web 2 | 3 | #This is mapping /hello to the class index. 4 | #Whenever someone types in /hello they will 5 | #get sent to the index class first. 6 | urls = ( 7 | '/hello', 'index' 8 | ) 9 | 10 | '''Whenever /hello is accessed while this app is running, it will begin 11 | a chain of processes starting from here. /hello is the key for index''' 12 | 13 | app = web.application(urls, globals()) 14 | 15 | render = web.template.render('templates/', base="layout") 16 | 17 | class index: 18 | def GET(self): 19 | #use render to display a page from the hello_form.html template 20 | return render.hello_form() 21 | #name="Nobody is the default if the information is not given 22 | #inputs=(name="Nobody") 23 | def POST(self): 24 | form = web.input(name="Nobody",greet="Hello") 25 | #forgot to put brackets around form.greet form.name! 26 | greeting = "%s, %s" % (form.greet, form.name) 27 | return render.index(greeting = greeting) 28 | 29 | if __name__ == "__main__": 30 | app.run() -------------------------------------------------------------------------------- /gothonweb/templates/show_room.html: -------------------------------------------------------------------------------- 1 | $def with (room) 2 | 3 |
<h1>$room.name</h1>
4 | 
5 | <pre>
6 | $room.description
7 | </pre>
8 | 9 | $if room.name == "death": 10 | <p><a href="/">Play Again?</a></p>
11 | 12 | 13 | $if room.name == "Central Corridor": 14 | <p>
15 | <form action="/game" method="POST">
16 | Write 1,2 or 3 in the box \
17 | 18 | <input type="text" name="action"> <input type="submit">
19 | </form></p>
20 | 21 | 22 | $if room.name == "Laser Weapon Armory": 23 | <p>
24 | <form action="/game" method="POST">
25 | 26 | Guess the code \
27 | 28 | <input type="text" name="action"> <input type="submit">
29 | </form></p>
30 | 31 | $if room.name == "The Bridge": 32 | <p>
33 | <form action="/game" method="POST">
34 | <input type="text" name="action"> \
35 | 36 | <input type="submit">
37 | </form></p>
38 | 39 | 40 | -------------------------------------------------------------------------------- /tutorials/exercism_py3/pangram/pangram_detailed.py: -------------------------------------------------------------------------------- 1 | 2 | # -*- coding: UTF-8 -*- 3 | 4 | import re 5 | 6 | def is_pangram(s): 7 | 8 | 9 | # create a regular expression object(regex) to pull out only letters 10 | # from chars. ^ matches start of the string. 11 | regex = re.compile('[^a-zA-Z]') 12 | 13 | # Use regex object to substitute anything that doesn't match the pattern. 14 | # is the same as letters = re.sub('[^a-zA-Z]','',s) 15 | letters = regex.sub('', s) 16 | 17 | 18 | #break up the sentence into characters and extract the unique values 19 | if len(list(set(letters.lower())))== 26: 20 | 21 | print(list(set(letters))) 22 | print('This is a pangram') 23 | return True 24 | else: 25 | print(list(set(letters))) 26 | print('This is not a pangram') 27 | return False 28 | 29 | 30 | if __name__ == '__main__': 31 | 32 | #is_pangram('the quick brown fox jumps over the lazy dog') 33 | is_pangram('Victor jagt zwölf Boxkämpfer quer über den großen Sylter' 34 | 'Deich.') 35 | 36 | #set(list(s.lower())) >= set(ALPHABET) -------------------------------------------------------------------------------- /monkeylearn/015-selectdata.py: -------------------------------------------------------------------------------- 1 | import json 2 | import pandas as pd 3 | import requests 4 | 5 | with open('API_key.txt') as f: 6 | API_KEY = f.read().strip() 7 | 8 | API_KEY = API_read 9 | 10 | raw_df = pd.read_csv('indeed_edin.csv', encoding='utf-8', 11 | error_bad_lines=False) 12 | #turnstilelink_link_1/_text 13 | 14 | df = raw_df[['location_value', 'turnstilelink_link_1/_text', 15 | 'summary_description']] 16 | df.columns = ['location', 'title', 'description'] 17 | 18 | content_df = list(df.title + ' ' + df.description) 19 | 20 | categories = [] 21 | step = 150 22 | for start in xrange(0, len(content_df), step): 23 | end = start + step 24 | 25 | response = requests.post( 26 | 27 | "https://api.monkeylearn.com/v2/classifiers/cl_4PFzSWVR/classify/", 28 | data=json.dumps({'text_list': content_df[start:end]}), 29 | headers={'Authorization': 'Token {}'.format(API_KEY), 30 | 'Content-Type': 'application/json'}).json() 31 | 32 | # We go through the results of the API call, storing the result on a list. 33 | for category in response['result']: 34 | categories.append(category[0]['label']) 35 | 36 | augmented_df = df.join(pd.DataFrame(categories, columns=['category'])) 37 | augmented_df.to_csv('indeed_aug.csv', encoding='utf-8', index=False, 38 | header=False) -------------------------------------------------------------------------------- /tutorials/exercism_py3/leap/README.md: -------------------------------------------------------------------------------- 1 | # Leap 2 | 3 | Write a program that will take a year and report if it is a leap year. 4 | 5 | The tricky thing here is that a leap year in the Gregorian calendar occurs: 6 | 7 | ```plain 8 | on every year that is evenly divisible by 4 9 | except every year that is evenly divisible by 100 10 | unless the year is also evenly divisible by 400 11 | ``` 12 | 13 | For example, 1997 is not a leap year, but 1996 is. 1900 is not a leap 14 | year, but 2000 is. 15 | 16 | If your language provides a method in the standard library that does 17 | this look-up, pretend it doesn't exist and implement it yourself. 
18 | 19 | ## Notes 20 | 21 | Though our exercise adopts some very simple rules, there is more to 22 | learn! 23 | 24 | For a delightful, four minute explanation of the whole leap year 25 | phenomenon, go watch [this youtube video][video]. 26 | 27 | [video]: http://www.youtube.com/watch?v=xX96xng7sAE 28 | 29 | ### Submitting Exercises 30 | 31 | Note that, when trying to submit an exercise, make sure the solution is in the `exercism/python/` directory. 32 | 33 | For example, if you're submitting `bob.py` for the Bob exercise, the submit command would be something like `exercism submit /python/bob/bob.py`. 34 | 35 | 36 | For more detailed information about running tests, code style and linting, 37 | please see the [help page](http://exercism.io/languages/python). 38 | 39 | ## Source 40 | 41 | JavaRanch Cattle Drive, exercise 3 [http://www.javaranch.com/leap.jsp](http://www.javaranch.com/leap.jsp) 42 | -------------------------------------------------------------------------------- /DSFromScratch/Chap13/machine_learning.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | import math, random 3 | 4 | # 5 | # data splitting 6 | # 7 | 8 | def split_data(data, prob): 9 | """split data into fractions [prob, 1 - prob]""" 10 | results = [], [] 11 | for row in data: 12 | results[0 if random.random() < prob else 1].append(row) 13 | return results 14 | 15 | def train_test_split(x, y, test_pct): 16 | data = list(zip(x, y)) # pair corresponding values 17 | train, test = split_data(data, 1 - test_pct) # split the dataset of pairs 18 | x_train, y_train = list(zip(*train)) # magical un-zip trick 19 | x_test, y_test = list(zip(*test)) 20 | return x_train, x_test, y_train, y_test 21 | 22 | # 23 | # correctness 24 | # 25 | 26 | def accuracy(tp, fp, fn, tn): 27 | correct = tp + tn 28 | total = tp + fp + fn + tn 29 | return correct / total 30 | 31 | def precision(tp, fp, fn, tn): 32 | return tp / (tp + fp) 33 | 34 | def recall(tp, fp, fn, tn): 35 | return tp / (tp + fn) 36 | 37 | def f1_score(tp, fp, fn, tn): 38 | p = precision(tp, fp, fn, tn) 39 | r = recall(tp, fp, fn, tn) 40 | 41 | return 2 * p * r / (p + r) 42 | 43 | if __name__ == "__main__": 44 | 45 | print("accuracy(70, 4930, 13930, 981070)", accuracy(70, 4930, 13930, 46 | 981070)) 47 | print("precision(70, 4930, 13930, 981070)", precision(70, 4930, 13930, 48 | 981070)) 49 | print("recall(70, 4930, 13930, 981070)", recall(70, 4930, 13930, 981070)) 50 | print("f1_score(70, 4930, 13930, 981070)", f1_score(70, 4930, 13930, 51 | 981070)) -------------------------------------------------------------------------------- /gothonweb/tests/map_tests.py: -------------------------------------------------------------------------------- 1 | from nose.tools import * 2 | #from map file in dir bin import everything in map file. 3 | #This originally had just a class but as it has class instances also 4 | # defined, we want to import everything in the file. 5 | from bin.map import * 6 | 7 | def test_room(): 8 | gold = Room("GoldRoom","""This room has gold in it you can grab. 
There's a 9 | door to the north.""") 10 | assert_equal(gold.name, "GoldRoom") 11 | assert_equal(gold.paths,{}) 12 | 13 | def test_room_paths(): 14 | center = Room("Center", "Test room in the center.") 15 | north = Room("North", "Test room in the north.") 16 | south = Room("South", "Test room in the south.") 17 | 18 | center.add_paths({'north': north, 'south': south}) 19 | assert_equal(center.go('north'), north) 20 | assert_equal(center.go('south'), south) 21 | 22 | 23 | def test_map(): 24 | start = Room("Start", "You can go west and down a hole.") 25 | west = Room("Trees", "There are trees here, you can go east.") 26 | down = Room("Dungeon", "It's dark down here, you can go up.") 27 | 28 | start.add_paths({'west': west, 'down': down}) 29 | west.add_paths({'east': start}) 30 | down.add_paths({'up': start}) 31 | 32 | assert_equal(start.go('west'), west) 33 | assert_equal(start.go('west').go('east'), start) 34 | assert_equal(start.go('down').go('up'), start) 35 | 36 | def test_gothon_game_map(): 37 | assert_equal(START.go('shoot!'), generic_death) 38 | assert_equal(START.go('dodge!'), generic_death) 39 | 40 | room = START.go('tell a joke') 41 | assert_equal(room, laser_weapon_armory) -------------------------------------------------------------------------------- /Titanic/bin/clean_test.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | df = pd.read_csv('/home/sophie/projects/Titanic/data/test.csv', header=0) 5 | 6 | # Change Sex column to 1/0 in Gender 7 | df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(float) 8 | 9 | #Drop columns 10 | df = df.drop(['Name','Cabin','Ticket','Sex'], axis=1) 11 | 12 | # Remove any rows which have a nan in the Embarked or Fare column 13 | df = df.dropna(subset = ['Embarked','Fare']) 14 | 15 | # Turn Embarked into float numbers 16 | df['Embarked'] = df['Embarked'].map({'C': 1 ,'Q': 2 ,'S': 3}).astype(float) 17 | 18 | 19 | ###Make guesses for Age. Use the medians for each class 20 | #Make a table filled with zeros 21 | median_ages = np.zeros((2,3)) # male/female for each class 22 | 23 | # Loop over the table to fill in the values 24 | for i in range(0, 2): 25 | for j in range(0, 3): 26 | median_ages[i,j] = df[(df['Gender'] == i) & (df['Pclass'] == j + 27 | 1)]['Age'].dropna().median() 28 | 29 | # Make a copy of Age 30 | df['AgeFill'] = df['Age'] 31 | 32 | 33 | # Fill the new column with the correct values. 34 | for i in range(0, 2): 35 | for j in range(0, 3): 36 | # we need df.loc here to specify the row AND the column. 37 | # only where age is null, gender is 1/0 and class is 1-3, that AgeFill 38 | # will be set to the median age. 39 | df.loc[(df.Age.isnull()) & (df.Gender == i) & (df.Pclass == j + 1), 40 | 'AgeFill'] = median_ages[i,j] 41 | 42 | # We can drop the Age column now we have AgeFill 43 | df = df.drop(['Age'], axis=1) 44 | 45 | # Transform the whole dataframe into floats. 46 | df= df.astype(float) 47 | 48 | #Output this to csv to be read in for predicting values. 
49 | df.to_csv('/home/sophie/projects/Titanic/data/clean_test.csv', sep = " ", index 50 | = False) 51 | 52 | 53 | 54 | 55 | 56 | -------------------------------------------------------------------------------- /windspeed/scripts/012-ws_tseries.py: -------------------------------------------------------------------------------- 1 | #The aim of this script is to produce a timeseries of windspeed for each 2 | #station, with lines for winds at 0000, 0600, 1200 and 1800 3 | 4 | import glob,os 5 | import pandas as pd 6 | 7 | #change the directory in here first 8 | os.chdir("/home/sophie/projects/windspeed/data/") 9 | fname_list = glob.glob('*allwinds.txt') 10 | 11 | def read_file(fname): 12 | '''take a file and read it into a dataframe''' 13 | 14 | print """ %s please select the index of the following files to make a plot \ 15 | of: """ % list(enumerate(fname)) 16 | location = int(raw_input("> ")) 17 | 18 | date_spec = {'date_time': [0,1,2]} 19 | column_names=["year","month","day","hour","ws"] 20 | dtype={"year":int,"month":int,"day":int,"hour":int,"ws":float} 21 | 22 | print fname[location] 23 | wind = pd.read_csv(fname[location], sep=" ",parse_dates=date_spec, 24 | keep_date_col=True, names=column_names, index_col=False) 25 | #Dealing with hour - going from 600, 1200 etc to 6,12, 18 26 | wind["hour"]=(wind["hour"]/100).astype(int) 27 | 28 | #adding a date_time column with timestamp data 29 | wind['date_time'] = pd.to_datetime(wind.date_time) + \ 30 | wind.hour.astype('timedelta64[h]') 31 | 32 | print "here the data from %s will be split up" % fname_list[location] 33 | print "location index= %d" %location 34 | print "wind dataframe= %r" %wind[0:5] 35 | #data_subs(wind,location) 36 | return data_subs(wind,location) 37 | 38 | 39 | def data_subs(wind,location): 40 | '''Takes a dataframe and splits it into four new dataframes ready for 41 | plotting''' 42 | print wind[0:5] 43 | print location 44 | #print "here the data from %s will be split up" % fname_list[location] 45 | pass 46 | 47 | if __name__ == "__main__": 48 | 49 | data = read_file(fname_list) 50 | #data_subs(wind, location) -------------------------------------------------------------------------------- /tutorials/ThinkBayes/056 - Chap6DecisionAnalysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Chap 6: Decision Analysis\n", 8 | "\n", 9 | "How to decide on the price of a showcase?\n", 10 | "Bayesian thinking towards an answer:\n", 11 | "1) Prior beliefs on what the showcase prices could be: Analyse previous prices on the show.\n", 12 | "2) Likelihood/Update: Seeing the prizes, how should you update? i.e. How to interpret the data?\n", 13 | "3) Results from Update on the Prior: the Posterior. How to choose from the posterior distribution?\n", 14 | "\n", 15 | "All of these steps require subjective decisions. \n", 16 | "\n", 17 | "**Modeling the contestants**\n", 18 | "If you were a contestant on the show you could use this distribution (fig 6.1) to quantify your prior belief about the price of each showcase (before you even see the prizes). \n", 19 | "To Update, we have to answer these questions:\n", 20 | "\n", 21 | "1) What data should we consider and how should we quantify it?\n", 22 | "2) Can we compute a likelihood function; ie.e for each hypo value of `price`, can we compute the conditional likelihood of the data?" 
23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": { 29 | "collapsed": true 30 | }, 31 | "outputs": [], 32 | "source": [] 33 | } 34 | ], 35 | "metadata": { 36 | "kernelspec": { 37 | "display_name": "Python 3", 38 | "language": "python", 39 | "name": "python3" 40 | }, 41 | "language_info": { 42 | "codemirror_mode": { 43 | "name": "ipython", 44 | "version": 3 45 | }, 46 | "file_extension": ".py", 47 | "mimetype": "text/x-python", 48 | "name": "python", 49 | "nbconvert_exporter": "python", 50 | "pygments_lexer": "ipython3", 51 | "version": "3.5.1" 52 | } 53 | }, 54 | "nbformat": 4, 55 | "nbformat_minor": 0 56 | } 57 | -------------------------------------------------------------------------------- /tutorials/KaggleNLP/word_vectors.py: -------------------------------------------------------------------------------- 1 | # Download the punkt tokenizer for sentence splitting 2 | import nltk.data 3 | 4 | # Import various modules for string cleaning 5 | from bs4 import BeautifulSoup 6 | import re 7 | from nltk.corpus import stopwords 8 | import pandas as pd 9 | 10 | #Load the punkt tokenizer 11 | tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 12 | 13 | # Define a function to split a review into parsed sentences 14 | def review_to_sentences(review, tokenizer, remove_stopwords=False): 15 | """Split a review into parsed sentences. Returns a list of sentences, 16 | where each sentence is a list of words""" 17 | # 1. Use the NLTK tokenizer to split the paragraph into sentences 18 | raw_sentences = tokenizer.tokenize(review.strip()) 19 | 20 | # 2. Loop over each sentence 21 | sentences = [] 22 | for raw_sentence in raw_sentences: 23 | # If a sentence is empty, skip it 24 | if len(raw_sentence) > 0: 25 | # Otherwise, call review_to_wordlist to get a list of words 26 | sentences.append(review_to_wordlist(raw_sentence, remove_stopwords)) 27 | 28 | # Return the list of sentences (each sentence is a list of words, so this 29 | # returns a list of lists) 30 | return sentences 31 | 32 | 33 | def review_to_wordlist(review, remove_stopwords=False): 34 | """Convert a document to a sequence of words, optionally removing stop words 35 | Returns a list of words""" 36 | # 1. Remove HTML 37 | review_text = BeautifulSoup(review).get_text() 38 | # 2. Remove non-letters 39 | review_text = re.sub("[^a-zA-Z]"," ", review_text) 40 | # 3. Convert words to lower case and split them 41 | words = review_text.lower().split() 42 | # 4. Optionally remove stop words (false by default) 43 | if remove_stopwords: 44 | stops = set(stopwords.words("english")) 45 | words = [w for w in words if not w in stops] 46 | # 5. 
Return a list of words 47 | return(words) 48 | 49 | -------------------------------------------------------------------------------- /tutorials/K-means/kmeans.py: -------------------------------------------------------------------------------- 1 | # supporting lib for kmeans clustering 2 | # Nitin Borwankar 3 | # Open Data Science Training 4 | 5 | import numpy as np 6 | from scipy.cluster.vq import kmeans,vq 7 | from scipy.spatial.distance import cdist 8 | import matplotlib.pyplot as plt 9 | 10 | 11 | def load_data(fName = '../datasets/UN4col.csv'): 12 | fp = open(fName) 13 | XX = np.loadtxt(fp) 14 | fp.close() 15 | return XX 16 | 17 | 18 | def run_kmeans(X, n=10): 19 | _K = range(1,n) 20 | 21 | # scipy.cluster.vq.kmeans 22 | _KM = [kmeans(X,k) for k in _K] # apply kmeans 1 to 10 23 | _centroids = [cent for (cent,var) in _KM] # cluster centroids 24 | 25 | _D_k = [cdist(X, cent, 'euclidean') for cent in _centroids] 26 | 27 | _cIdx = [np.argmin(D,axis=1) for D in _D_k] 28 | _dist = [np.min(D,axis=1) for D in _D_k] 29 | _avgWithinSS = [sum(d)/X.shape[0] for d in _dist] 30 | 31 | return (_K, _KM, _centroids, _D_k, _cIdx, _dist, _avgWithinSS) 32 | 33 | def plot_elbow_curve(kIdx, K, avgWithinSS): 34 | fig = plt.figure() 35 | ax = fig.add_subplot(111) 36 | ax.plot(K, avgWithinSS, 'b*-') 37 | ax.plot(K[kIdx], avgWithinSS[kIdx], marker='o', markersize=12, 38 | markeredgewidth=2, markeredgecolor='r', markerfacecolor='None') 39 | plt.grid(True) 40 | plt.xlabel('Number of clusters') 41 | plt.ylabel('Average within-cluster sum of squares') 42 | tt = plt.title('Elbow for KMeans clustering') 43 | return(fig,ax) 44 | 45 | def plot_clusters(orig,pred,nx,ny,legend=True): 46 | data = orig 47 | import matplotlib.pyplot as plt 48 | ylabels = { 0:'Male life expectancy in yrs',1:'Female life expectancy in yrs',2:'Infant mortality, per 1000'} 49 | # plot data into three clusters based on value of c 50 | p0 = plt.plot(data[pred==0,nx],data[pred==0,ny],'ro',label='Underdeveloped') 51 | p2 = plt.plot(data[pred==2,nx],data[pred==2,ny],'go',label='Developing') 52 | p1 = plt.plot(data[pred==1,nx],data[pred==1,ny],'bo',label='Developed') 53 | 54 | lx = p1[0].axes.set_xlabel('Per Capita GDP in US$') 55 | ly = p1[0].axes.set_ylabel(ylabels[ny]) 56 | tt= plt.title('UN countries Dataset, KMeans clustering with K=3') 57 | if legend: 58 | ll=plt.legend() 59 | return (p0,p1,p2) 60 | 61 | -------------------------------------------------------------------------------- /gothonweb/bin/app.py: -------------------------------------------------------------------------------- 1 | import web 2 | import map 3 | 4 | urls = ("/game", "GameEngine", "/", "Index") 5 | 6 | app = web.application(urls, globals()) 7 | 8 | #little hack so that debug mode works with sessions 9 | # 10 | if web.config.get('_session') is None: 11 | store = web.session.DiskStore('sessions') 12 | session = web.session.Session(app, store, initializer={'room': None}) 13 | 14 | web.config._session = session 15 | else: 16 | session = web.config._session 17 | 18 | render = web.template.render('templates/', base="layout") 19 | 20 | class Index(object): 21 | def GET(self): 22 | # this is used to "setup" the session with starting values 23 | #Give us the first session.room = central_corridor 24 | session.room = map.START 25 | #Sends you on your way to GameEngine class 26 | web.seeother("/game") 27 | 28 | class GameEngine(object): 29 | #inside the html page you have standard 30 | def GET(self): 31 | #session.room should = TRUE, either because it has been through Index, 32 | 
#or been given another link 33 | if session.room: 34 | #make html page from show_room.html. Take session.room as the 35 | #variable in the html page, accessed by $ 36 | return render.show_room(room=session.room) 37 | else: 38 | # why is this here? do you need it? 39 | #if something is passed to session.room which is not recognised, it 40 | #won't fail 41 | return render.you_died() 42 | 43 | def POST(self): 44 | #inside <form>
you can pass method= which takes a function such as GET 45 | #or POST(as they are defined in this app) 46 | #if action is not given a value in the form, it will automatically be 47 | #None 48 | # 49 | form = web.input(action=None) 50 | 51 | if session.room and form.action: 52 | # 53 | session.room = session.room.go(form.action) 54 | web.seeother("/game") 55 | #if session.room = laser_weapon_armory and form.action != '123' and 56 | # count < 10: 57 | #count =+ 1 58 | #session.room = session.room.go(form.action) 59 | #web.seeother("/game") 60 | #if session.room = laser_weapon_armory and form.action != '123' and 61 | #count >= 10: 62 | #session.room = None 63 | #web.seeother("/game") 64 | 65 | else: 66 | web.seeother("/game") 67 | 68 | if __name__ == "__main__": 69 | app.run() -------------------------------------------------------------------------------- /tutorials/exercism_py3/hello-world/README.md: -------------------------------------------------------------------------------- 1 | # Hello World 2 | 3 | Write a program that greets the user by name, or by saying "Hello, World!" if no name is given. 4 | 5 | ["Hello, World!"](http://en.wikipedia.org/wiki/%22Hello,_world!%22_program) is the traditional first program for beginning programming in a new language. 6 | 7 | **Note:** You can skip this exercise by running: 8 | 9 | exercism skip $LANGUAGE hello-world 10 | 11 | ## Specification 12 | 13 | The `Hello World!` program will greet me, the caller. 14 | 15 | If I tell the program my name is Alice, it will greet me by saying "Hello, Alice!". 16 | 17 | If I neglect to give it my name, it will greet me by saying "Hello, World!" 18 | 19 | ## Test-Driven Development 20 | 21 | As programmers mature, they eventually want to test their code. 22 | 23 | Here at Exercism we simulate [Test-Driven Development](http://en.wikipedia.org/wiki/Test-driven_development) (TDD), where you write your tests before writing any functionality. The simulation comes in the form of a pre-written test suite, which will signal that you have solved the problem. 24 | 25 | It will also provide you with a safety net to explore other solutions without breaking the functionality. 26 | 27 | ### A typical TDD workflow on Exercism: 28 | 29 | 1. Run the test file and pick one test that's failing. 30 | 2. Write some code to fix the test you picked. 31 | 3. Re-run the tests to confirm the test is now passing. 32 | 4. Repeat from step 1. 33 | 5. Submit your solution (`exercism submit /path/to/file`) 34 | 35 | ## Instructions 36 | 37 | Submissions are encouraged to be general, within reason. Having said that, it's also important not to over-engineer a solution. 38 | 39 | It's important to remember that the goal is to make code as expressive and readable as we can. However, solutions to the hello-world exercise will not be reviewed by a person, but by rikki- the robot, who will offer an encouraging word. 40 | 41 | ### Submitting Exercises 42 | 43 | Note that, when trying to submit an exercise, make sure the solution is in the `exercism/python/` directory. 44 | 45 | For example, if you're submitting `bob.py` for the Bob exercise, the submit command would be something like `exercism submit /python/bob/bob.py`. 46 | 47 | 48 | For more detailed information about running tests, code style and linting, 49 | please see the [help page](http://exercism.io/languages/python). 
50 | 51 | ## Source 52 | 53 | This is a program to introduce users to using Exercism [http://en.wikipedia.org/wiki/%22Hello,_world!%22_program](http://en.wikipedia.org/wiki/%22Hello,_world!%22_program) 54 | -------------------------------------------------------------------------------- /tutorials/algorithms/scripts/L1_Eulerian_Q10.py: -------------------------------------------------------------------------------- 1 | # Taken from 2 | #https://discussions.udacity.com/t/problem-set-1-challenge-find-eulerian-tour/ 3 | #26214/8 4 | 5 | #### What this script does 6 | #- Goes through the edges fo the graph in order, trying each edge as the 7 | #starting point. 8 | #- If the staring point leads to a complete eulerian tour, it returns it. 9 | #- Otherwise, it tries again starting with the next edge in the graph 10 | 11 | #- While going through the tour, it selects the next edge as the first edge it 12 | #comes to in the graph that leaves from the current node (that hasn't already 13 | #been used). 14 | #- Is there a better selection method to find the best edge out of all possible 15 | #edges form the current node? 16 | 17 | 18 | 19 | def get_degree(graph): 20 | degree = {} 21 | for x, y in graph: 22 | degree[x] = degree.get(x, 0) + 1 23 | degree[y] = degree.get(y, 0) + 1 24 | return degree 25 | 26 | def eulerian_tour_is_possible(graph): 27 | degree = get_degree(graph) 28 | odd = 0 29 | for entry in degree: 30 | if degree[entry] % 2 != 0: 31 | odd += 1 32 | if odd == 0: return True 33 | return False 34 | 35 | def find_next_edge(node, graph): 36 | edges = find_all_edges(node, graph) 37 | for edge in edges: 38 | if node in edge: 39 | return edge 40 | return None 41 | 42 | def find_all_edges(node, graph): 43 | edges = [] 44 | for edge in graph: 45 | if node in edge: 46 | edges.append(edge) 47 | return edges 48 | 49 | def find_eulerian_tour(graph): 50 | if eulerian_tour_is_possible(graph): 51 | for i in range(len(graph)): 52 | tour = [] 53 | graph_copy = graph[:] # make copy of graph to do work on 54 | start_edge = graph_copy.pop(i) # change starting edge as loop 55 | # iterates 56 | tour.append(start_edge[0]) 57 | tour.append(start_edge[1]) 58 | while len(graph_copy) > 0: 59 | edge = find_next_edge(tour[-1], graph_copy) 60 | if edge == None: break # we've reached a node where no more 61 | # possible edges exist 62 | if tour[-1] == edge[0]: 63 | tour.append(edge[1]) 64 | else: 65 | tour.append(edge[0]) 66 | graph_copy.pop(graph_copy.index(edge)) 67 | if graph_copy == []: return tour # we've used all edges, tour 68 | # found! 69 | return None 70 | else: 71 | return None 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /tutorials/algorithms/scripts/L1_EulerianPath.py: -------------------------------------------------------------------------------- 1 | # Needs python 2.7 2 | # Eulerian Tour Ver 1 3 | # 4 | # Write a function, `create_tour` that takes as 5 | # input a list of nodes 6 | # and outputs a list of tuples representing 7 | # edges between nodes that have an Eulerian tour. 
8 | # 9 | 10 | def edge(x,y): 11 | """Ensures that each set of edges in the tour 12 | goes from a lower value to a higher value""" 13 | return (x, y) if x < y else (y, x) 14 | 15 | def create_tour(nodes): 16 | """For each node create edges which incrementally 17 | increase """ 18 | tour = [] 19 | l = len(nodes) 20 | for i in range(l): 21 | t = edge(nodes[i], nodes[(i+1) % l]) 22 | print nodes 23 | print t 24 | tour.append(t) 25 | return tour 26 | 27 | 28 | ############ 29 | 30 | def get_degree(tour): 31 | degree = {} 32 | for x, y in tour: 33 | degree[x] = degree.get(x, 0) + 1 34 | degree[y] = degree.get(y, 0) + 1 35 | return degree 36 | 37 | def check_edge(t, b, nodes): 38 | """ 39 | t: tuple representing an edge 40 | b: origin node 41 | nodes: set of nodes already visited 42 | 43 | if we can get to a new node from `b` following `t` 44 | then return that node, else return None 45 | """ 46 | if t[0] == b: 47 | if t[1] not in nodes: 48 | return t[1] 49 | elif t[1] == b: 50 | if t[0] not in nodes: 51 | return t[0] 52 | return None 53 | 54 | def connected_nodes(tour): 55 | """return the set of nodes reachable from 56 | the first node in `tour`""" 57 | a = tour[0][0] 58 | nodes = set([a]) 59 | explore = set([a]) 60 | while len(explore) > 0: 61 | # see what other nodes we can reach 62 | b = explore.pop() 63 | for t in tour: 64 | node = check_edge(t, b, nodes) 65 | if node is None: 66 | continue 67 | nodes.add(node) 68 | explore.add(node) 69 | return nodes 70 | 71 | def is_eulerian_tour(nodes, tour): 72 | # all nodes must be even degree 73 | # and every node must be in graph 74 | degree = get_degree(tour) 75 | for node in nodes: 76 | try: 77 | d = degree[node] 78 | if d % 2 == 1: 79 | print "Node %s has odd degree" % node 80 | return False 81 | except KeyError: 82 | print "Node %s was not in your tour" % node 83 | return False 84 | connected = connected_nodes(tour) 85 | if len(connected) == len(nodes): 86 | return True 87 | else: 88 | print "Your graph wasn't connected" 89 | return False 90 | 91 | def test(): 92 | nodes = [20, 21, 22, 23, 24, 25] 93 | tour = create_tour(nodes) 94 | return is_eulerian_tour(nodes, tour) 95 | 96 | print test() -------------------------------------------------------------------------------- /tutorials/exercism_py3/word_count/word_count_test.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | 4 | from wordcount2 import word_count 5 | 6 | 7 | # to be backwards compatible with the old Python 2.X 8 | def decode_if_needed(string): 9 | try: 10 | return string.decode('utf-8') 11 | except AttributeError: 12 | return string 13 | 14 | 15 | class WordCountTests(unittest.TestCase): 16 | 17 | def test_count_one_word(self): 18 | self.assertEqual( 19 | {'word': 1}, 20 | word_count('word') 21 | ) 22 | 23 | def test_count_one_of_each(self): 24 | self.assertEqual( 25 | {'one': 1, 'of': 1, 'each': 1}, 26 | word_count('one of each') 27 | ) 28 | 29 | def test_count_multiple_occurences(self): 30 | self.assertEqual( 31 | {'one': 1, 'fish': 4, 'two': 1, 'red': 1, 'blue': 1}, 32 | word_count('one fish two fish red fish blue fish') 33 | ) 34 | 35 | def test_preserves_punctuation(self): 36 | self.assertEqual( 37 | {'car': 1, 'carpet': 1, 'as': 1, 'java': 1, 'javascript': 1}, 38 | word_count('car : carpet as java : javascript!!&@$%^&') 39 | ) 40 | 41 | def test_include_numbers(self): 42 | self.assertEqual( 43 | {'testing': 2, '1': 1, '2': 1}, 44 | word_count('testing 1 2 testing') 45 | ) 46 | 47 | def 
test_mixed_case(self): 48 | self.assertEqual( 49 | [2, 3], 50 | sorted(list(word_count('go Go GO Stop stop').values())) 51 | ) 52 | 53 | def test_multiple_spaces(self): 54 | self.assertEqual( 55 | {'wait': 1, 'for': 1, 'it': 1}, 56 | word_count('wait for it') 57 | ) 58 | 59 | def test_newlines(self): 60 | self.assertEqual( 61 | {'rah': 2, 'ah': 3, 'roma': 2, 'ma': 1, 'ga': 2, 'oh': 1, 'la': 2, 62 | 'want': 1, 'your': 1, 'bad': 1, 'romance': 1}, 63 | word_count('rah rah ah ah ah\nroma roma ma\n' 64 | 'ga ga oh la la\nwant your bad romance') 65 | ) 66 | 67 | def test_tabs(self): 68 | self.assertEqual( 69 | {'rah': 2, 'ah': 3, 'roma': 2, 'ma': 1, 'ga': 2, 'oh': 1, 'la': 2, 70 | 'want': 1, 'your': 1, 'bad': 1, 'romance': 1}, 71 | word_count('rah rah ah ah ah\troma roma ma\tga ga oh la la\t' 72 | 'want your bad romance') 73 | ) 74 | 75 | def test_non_alphanumeric(self): 76 | self.assertEqual( 77 | {'hey': 1, 'my': 1, 'spacebar': 1, 'is': 1, 'broken': 1}, 78 | word_count('hey,my_spacebar_is_broken.') 79 | ) 80 | 81 | def test_unicode(self): 82 | self.assertEqual( 83 | {decode_if_needed('до'): 1, decode_if_needed('свидания'): 1}, 84 | word_count('до🖖свидания!') 85 | ) 86 | 87 | if __name__ == '__main__': 88 | unittest.main() 89 | -------------------------------------------------------------------------------- /windspeed/scripts/030-group_tseries.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import datetime as datetime 4 | import matplotlib.pyplot as plt 5 | 6 | # Creating a panel of timeseries for each group of stations. 7 | 8 | # Panel will have a timeseries of 00,06,12,18 ws if that hour has at least 14 9 | # obs per month. 10 | 11 | # An average over the group will be an extra plot in the panel. 12 | 13 | NAl=['60525Biskra','60549Mecheria','60550Elbayadh', 14 | '60555Touggourt','60559ElOued','60566Ghardaia','60580Ouargla', 15 | '60581HassiMessaoud'] 16 | 17 | CSar=['60607Timimoun','60611InAmenas','60620Adrar','60630InSalah', 18 | '62103Ghadames','62124Sebha'] 19 | 20 | WSa=['61223Tombouctou','61226Gao','61230NioroDuSahel','61498Kiffa', 21 | '61499AiounElAtrouss','61492Kaedi','61497Nema','61450Tidjika'] 22 | 23 | CSal=['61024Agadez','61045Goure','61052Niamey','64753Faya', 24 | '61017Bilma'] 25 | 26 | Egy=['62387Minya','62393Asyut','62405Luxor','62414Asswan', 27 | '62420Baharia','62423Farafra','62435Kharga'] 28 | 29 | Sud=['62600WadiHalfa','62640AbuHamed','62650Dongola','62660Karima', 30 | '62680Atbara'] 31 | 32 | 33 | stations=[NAl,CSar,WSa,CSal,Egy,Sud] 34 | 35 | group_names={'NAlgeria':NAl,'CSahara':CSar,'WSahel':WSa,'CSahel':CSal, 36 | 'Egypt':Egy,'Sudan':Sud} 37 | 38 | 39 | def read_file(fname): 40 | '''put the station name into read_file and read_file will return a 41 | dataFrame called wind which has the following columns a dataframe with a 42 | datetime index''' 43 | 44 | 45 | column_names=["year","month","day","hour","ws"] 46 | dtype={"year":int,"month":int,"day":int,"hour":int,"ws":float} 47 | 48 | datafile='/home/sophie/projects/windspeed/data/%s_allwinds.txt' %fname 49 | 50 | # specify the columns you want to group together. Can't include hour at 51 | # this point as it is not in the right format. 52 | date_spec = {'date_time': [0,1,2]} 53 | 54 | # when you use keep_dat_col it keeps them as objects, not as the dtype you 55 | # read them in as. 
55 | # read them in as. 56 | wind = pd.read_csv(datafile, sep=" ", names=column_names, 57 | parse_dates=date_spec, keep_date_col=True, index_col=False ) 58 | 59 | # Dealing with hour - going from 600, 1200 etc to 6, 12, 18 60 | wind["hour"]=(wind["hour"]/100).astype(int) 61 | 62 | # combining year, month, day that were parsed together into date_time with 63 | # hour, which is now in the correct format. 64 | wind['date_time'] = pd.to_datetime(wind.date_time) + \ 65 | wind.hour.astype('timedelta64[h]') 66 | 67 | # make datetime the index before making subsections. 68 | wind.index = wind['date_time'] 69 | 70 | # Adds extra columns where the value is kept if it meets the isin() criteria, 71 | # NaN if it doesn't. 72 | wind['ws_0']= wind['ws'][wind['hour'].isin([0])] 73 | wind['ws_06']= wind['ws'][wind['hour'].isin([6])] 74 | wind['ws_12']= wind['ws'][wind['hour'].isin([12])] 75 | wind['ws_18']= wind['ws'][wind['hour'].isin([18])] 76 | 77 | return wind 78 | 79 | def group_mean(group): 80 | '''loop over the items in the group list, calling read_file on each one, 81 | and return the group average''' 82 | pass 83 | 84 | def plot_tseries(): 85 | '''set up n+1 subplots where n is the number of stations in the group. Fill 86 | in each plot with the timeseries from each station and then a mean of all 87 | the stations. Output to an image file.''' 88 | pass 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /tutorials/Samsung/notebooks/029-Samsung_cleanup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 109, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "Populating the interactive namespace from numpy and matplotlib\n", 15 | " name\n", 16 | "1 tBodyAcc-mean()\n", 17 | "2 tBodyAcc-mean()\n", 18 | "3 tBodyAcc-mean()\n", 19 | "4 tBodyAcc-std()\n", 20 | "5 tBodyAcc-std()\n", 21 | " name\n", 22 | "1 tBodyAcc-mean()\n", 23 | "4 tBodyAcc-std()\n", 24 | "7 tBodyAcc-mad()\n", 25 | "10 tBodyAcc-max()\n", 26 | "13 tBodyAcc-min()\n" 27 | ] 28 | } 29 | ], 30 | "source": [ 31 | "%pylab inline\n", 32 | "import pandas as pd\n", 33 | "\n", 34 | "# \n", 35 | "df = pd.read_csv('/home/sophie/projects/Samsung/data/UCI_HAR_Dataset/UCI_HAR_Dataset/features_copy.txt',sep=\" \",\n", 36 | " names = ['name'], dtype='str')\n", 37 | "\n", 38 | "print df[0:5] # Shows us some duplicates\n", 39 | "\n", 40 | "# This works to drop duplicate rows. Have to specify the column name.
\n", 41 | "df.drop_duplicates(['name'],inplace=True)\n", 42 | "\n", 43 | "print df[0:5] # Line above removes the duplicates" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 110, 49 | "metadata": { 50 | "collapsed": false 51 | }, 52 | "outputs": [ 53 | { 54 | "name": "stdout", 55 | "output_type": "stream", 56 | "text": [ 57 | " name\n", 58 | "303 fBodyAcc-bandsEnergy()-18\n", 59 | "304 fBodyAcc-bandsEnergy()-916\n", 60 | "305 fBodyAcc-bandsEnergy()-1724\n", 61 | "306 fBodyAcc-bandsEnergy()-2532\n", 62 | "307 fBodyAcc-bandsEnergy()-3340\n", 63 | "308 fBodyAcc-bandsEnergy()-4148\n", 64 | "309 fBodyAcc-bandsEnergy()-4956\n", 65 | "310 fBodyAcc-bandsEnergy()-5764\n", 66 | "311 fBodyAcc-bandsEnergy()-116\n", 67 | "312 fBodyAcc-bandsEnergy()-1732\n", 68 | "Empty DataFrame\n", 69 | "Columns: [name]\n", 70 | "Index: []\n" 71 | ] 72 | } 73 | ], 74 | "source": [ 75 | "# Print out the lines that have numbers in them.\n", 76 | "print df[df.name.str.contains('[0-9]')][0:10] # You have to specify the column. \n", 77 | "\n", 78 | "\n", 79 | "# remove numbers, brackets and \"-\" from all columns\n", 80 | "\n", 81 | "df.name = df.name.str.replace('[()]', '') # remove brackets\n", 82 | "df.name = df.name.str.replace('-','') # remove -\n", 83 | "df.name = df.name.str.replace('[0-9]','') # remove any numbers\n", 84 | "\n", 85 | "# Select something which has a number in it.\n", 86 | "print df[df.name.str.contains('[0-9]')] # This is empty now.\n" 87 | ] 88 | } 89 | ], 90 | "metadata": { 91 | "kernelspec": { 92 | "display_name": "Python 2", 93 | "language": "python", 94 | "name": "python2" 95 | }, 96 | "language_info": { 97 | "codemirror_mode": { 98 | "name": "ipython", 99 | "version": 2 100 | }, 101 | "file_extension": ".py", 102 | "mimetype": "text/x-python", 103 | "name": "python", 104 | "nbconvert_exporter": "python", 105 | "pygments_lexer": "ipython2", 106 | "version": "2.7.11" 107 | } 108 | }, 109 | "nbformat": 4, 110 | "nbformat_minor": 0 111 | } 112 | -------------------------------------------------------------------------------- /tutorials/ThinkBayes/046-ImplimentingSuite.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "How to encapsulate the framework in an object - A Suite is a Pmf that provides \\__init\\__, Update and Print:" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 5, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "# This tells Python of that additional module import path. \n", 19 | "import os\n", 20 | "import sys\n", 21 | "module_path = os.path.abspath(os.path.join('..'))\n", 22 | "if module_path not in sys.path:\n", 23 | " sys.path.append(module_path)\n", 24 | " \n", 25 | "from thinkbayes import Pmf" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 6, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "class Suite(Pmf):\n", 37 | " '''Represents a suite of hypotheses and their probabilities.'''\n", 38 | " def __init__(self, hypo=tuple()):\n", 39 | " '''Initializes the distribution.'''\n", 40 | " def Update(self,data):\n", 41 | " '''Updates each hypothesis based on the data'''\n", 42 | " def Print(self):\n", 43 | " '''Prints the hypothese and their probabilities.'''" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "Suite is implemented in thinkbayes.py. 
To use Suite, write a class that inherits from it and provides Likelihood.\n", 51 | "e.g. using the Monty Hall problem" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 7, 57 | "metadata": { 58 | "collapsed": true 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "from thinkbayes import Suite" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 8, 68 | "metadata": { 69 | "collapsed": true 70 | }, 71 | "outputs": [], 72 | "source": [ 73 | "class Monty(Suite):\n", 74 | " def Likelihood(self, data, hypo):\n", 75 | " if hypo == data:\n", 76 | " return 0 \n", 77 | " elif hypo == 'A':\n", 78 | " return 0.5\n", 79 | " else:\n", 80 | " return 1" 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": {}, 86 | "source": [ 87 | "And, to use the class:" 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 10, 93 | "metadata": { 94 | "collapsed": false 95 | }, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "A 0.3333333333333333\n", 102 | "B 0.6666666666666666\n", 103 | "C 0.0\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "suite = Monty('ABC')\n", 109 | "suite.Update('C')\n", 110 | "suite.Print()" 111 | ] 112 | } 113 | ], 114 | "metadata": { 115 | "kernelspec": { 116 | "display_name": "Python 3", 117 | "language": "python", 118 | "name": "python3" 119 | }, 120 | "language_info": { 121 | "codemirror_mode": { 122 | "name": "ipython", 123 | "version": 3 124 | }, 125 | "file_extension": ".py", 126 | "mimetype": "text/x-python", 127 | "name": "python", 128 | "nbconvert_exporter": "python", 129 | "pygments_lexer": "ipython3", 130 | "version": "3.5.1" 131 | } 132 | }, 133 | "nbformat": 4, 134 | "nbformat_minor": 0 135 | } 136 | -------------------------------------------------------------------------------- /tutorials/ThinkBayes/046-Suite_m&m.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#### Using the Suite framework to solve the M&M problem\n", 8 | "\n", 9 | "- Two bags of m&ms (one from 94 and one from 96), with different proportions of colors.\n", 10 | "- You get an m&m from each bag. A yellow and a green, but you don't know which bag they came from. \n", 11 | "\n", 12 | "What is the probability that the yellow is from bag 1?"
13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 20, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import os\n", 24 | "import sys\n", 25 | "module_path = os.path.abspath(os.path.join('..'))\n", 26 | "if module_path not in sys.path:\n", 27 | " sys.path.append(module_path)\n", 28 | " \n", 29 | "from thinkbayes import Pmf, Suite" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "First, encode the color mixes from before and after 1995:" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 21, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "class M_and_M(Suite):\n", 48 | " \n", 49 | " # Encode the color mixes from before and after 1995\n", 50 | " mix94 = dict(brown=30, yellow=20, red=20, green=10, orange=10, tan=10)\n", 51 | " mix96 = dict(blue=24, green=20, orange=16, yellow=14, red=13)\n", 52 | " \n", 53 | " #Next, encode the hypotheses\n", 54 | " hypoA = dict(bag1=mix94, bag2=mix96)\n", 55 | " hypoB = dict(bag1=mix96, bag2=mix94)\n", 56 | " \n", 57 | " # Map the name of the hypothesis to the representation\n", 58 | " hypotheses = dict(A=hypoA, B=hypoB)\n", 59 | " \n", 60 | " # In this case the hypothesis, hypo, is a string, either A or B. The data is a tuple that specifies a bag and a color.\n", 61 | " def Likelihood(self, data, hypo):\n", 62 | " bag, color = data\n", 63 | " mix = self.hypotheses[hypo][bag]\n", 64 | " like = mix[color]\n", 65 | " return like\n", 66 | " " 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "Code to create the Suite and update it:" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 22, 79 | "metadata": { 80 | "collapsed": false 81 | }, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "A 0.7407407407407407\n", 88 | "B 0.2592592592592592\n" 89 | ] 90 | } 91 | ], 92 | "source": [ 93 | "suite = M_and_M('AB') # All of the hypotheses are passed to suite.\n", 94 | "\n", 95 | "suite.Update(('bag1','yellow')) # This tuple is unpacked inside Likelihood into bag, color. \n", 96 | "suite.Update(('bag2','green'))\n", 97 | "\n", 98 | "suite.Print()" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "We have picked a yellow from bag1 and a green from bag2. \n", 106 | "A is the probability that bag1 = 94, bag2 = 96 \n", 107 | "B is the probability that bag1 = 96, bag2 = 94 \n", 108 | "\n", 109 | "The posterior probability of A is approximately 20/27 - same result as earlier.
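Concretely, the likelihood of the data under A is 20 (yellow from the 1994 mix) x 20 (green from the 1996 mix) = 400, and under B it is 14 x 10 = 140, so the posterior for A is 400/(400+140) = 20/27.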
" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "collapsed": true 117 | }, 118 | "outputs": [], 119 | "source": [] 120 | } 121 | ], 122 | "metadata": { 123 | "kernelspec": { 124 | "display_name": "Python 3", 125 | "language": "python", 126 | "name": "python3" 127 | }, 128 | "language_info": { 129 | "codemirror_mode": { 130 | "name": "ipython", 131 | "version": 3 132 | }, 133 | "file_extension": ".py", 134 | "mimetype": "text/x-python", 135 | "name": "python", 136 | "nbconvert_exporter": "python", 137 | "pygments_lexer": "ipython3", 138 | "version": "3.5.1" 139 | } 140 | }, 141 | "nbformat": 4, 142 | "nbformat_minor": 0 143 | } 144 | -------------------------------------------------------------------------------- /tutorials/ThinkBayes/.ipynb_checkpoints/046-Suite_m&m-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#### Using the Suite framework to solve the M&M problem\n", 8 | "\n", 9 | "- Two bags of m&ms (one from 94 and one from 96), with different proportions of colors.\n", 10 | "- You get an m&m from each bag. A yellow and a green, but you don't know which bag they came from. \n", 11 | "\n", 12 | "What is the probability that the yellow is from bag 1?" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 20, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "import os\n", 24 | "import sys\n", 25 | "module_path = os.path.abspath(os.path.join('..'))\n", 26 | "if module_path not in sys.path:\n", 27 | " sys.path.append(module_path)\n", 28 | " \n", 29 | "from thinkbayes import Pmf, Suite" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "First, encode the color mixes from before and after 1995:" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 21, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "class M_and_M(Suite):\n", 48 | " \n", 49 | " # Encode the color mixes from before and after 1995\n", 50 | " mix94 = dict(brown=30, yellow=20, red=20, green=10, orange=10, tan=10)\n", 51 | " mix96 = dict(blue=24, green=20, orange=16, yellow=14, red=13)\n", 52 | " \n", 53 | " #Next, encode the hypotheses\n", 54 | " hypoA = dict(bag1=mix94, bag2=mix96)\n", 55 | " hypoB = dict(bag1=mix96, bag2=mix94)\n", 56 | " \n", 57 | " # Map the name of the hypothesis to the representation\n", 58 | " hypotheses = dict(A=hypoA, B=hypoB)\n", 59 | " \n", 60 | " # In this case the hypothesis, hypo, is astring, either A or B. 
The data is a tuple that spcifies a bag and a color.\n", 61 | " def Likelihood(self, data, hypo):\n", 62 | " bag, color = data\n", 63 | " mix = self.hypotheses[hypo][bag]\n", 64 | " like = mix[color]\n", 65 | " return like\n", 66 | " " 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "Code to create the Suite and update it:" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 22, 79 | "metadata": { 80 | "collapsed": false 81 | }, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "A 0.7407407407407407\n", 88 | "B 0.2592592592592592\n" 89 | ] 90 | } 91 | ], 92 | "source": [ 93 | "suite = M_and_M('AB') # All of the hypotheses are passed to suite.\n", 94 | "\n", 95 | "suite.Update(('bag1','yellow')) # This tuple is unpacked inside Likelihood into bag, color. \n", 96 | "suite.Update(('bag2','green'))\n", 97 | "\n", 98 | "suite.Print()" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": {}, 104 | "source": [ 105 | "We have picked a yellow from bag1 and a green from bag2. \n", 106 | "A is the probability that bag1 = 94, bag2 = 96 \n", 107 | "B is the probability that bag1 = 96, bag2 = 94 \n", 108 | "\n", 109 | "The posterior probability of A is approximately 20/27 - same result as earlier. " 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "collapsed": true 117 | }, 118 | "outputs": [], 119 | "source": [] 120 | } 121 | ], 122 | "metadata": { 123 | "kernelspec": { 124 | "display_name": "Python 3", 125 | "language": "python", 126 | "name": "python3" 127 | }, 128 | "language_info": { 129 | "codemirror_mode": { 130 | "name": "ipython", 131 | "version": 3 132 | }, 133 | "file_extension": ".py", 134 | "mimetype": "text/x-python", 135 | "name": "python", 136 | "nbconvert_exporter": "python", 137 | "pygments_lexer": "ipython3", 138 | "version": "3.5.1" 139 | } 140 | }, 141 | "nbformat": 4, 142 | "nbformat_minor": 0 143 | } 144 | -------------------------------------------------------------------------------- /windspeed/scripts/037-group_tseries.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import datetime as datetime 4 | import matplotlib.pyplot as plt 5 | 6 | # Creating a panel of timeseries for each group of stations. 7 | 8 | # Panel will have a timeseries of 00,06,12,18 ws if that hour has at least 14 9 | # obs per month. 10 | 11 | # An average over the group will be an extra plot in the panel. 
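# A minimal sketch of the screening idea, assuming a DataFrame `wind` indexed
# by datetime with a 'ws' column (the same names used below): a monthly mean
# that is only reported when the month has enough observations could be
# written as
#
#     monthly = wind['ws'].resample('M').agg(
#         lambda x: x.mean() if x.count() >= 14 else np.nan)
#
# The meanf/sdf helpers below apply the same kind of guard inside a
# groupby().agg() instead.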
12 | 13 | NAl=['60525Biskra','60549Mecheria','60550Elbayadh', 14 | '60555Touggourt','60559ElOued','60566Ghardaia','60580Ouargla', 15 | '60581HassiMessaoud'] 16 | 17 | CSar=['60607Timimoun','60611InAmenas','60620Adrar','60630InSalah', 18 | '62103Ghadames','62124Sebha'] 19 | 20 | WSa=['61223Tombouctou','61226Gao','61230NioroDuSahel','61498Kiffa', 21 | '61499AiounElAtrouss','61492Kaedi','61497Nema','61450Tidjika'] 22 | 23 | CSal=['61024Agadez','61045Goure','61052Niamey','64753Faya', 24 | '61017Bilma'] 25 | 26 | Egy=['62387Minya','62393Asyut','62405Luxor','62414Asswan', 27 | '62420Baharia','62423Farafra','62435Kharga'] 28 | 29 | Sud=['62600WadiHalfa','62640AbuHamed','62650Dongola','62660Karima', 30 | '62680Atbara'] 31 | 32 | 33 | stations=[NAl,CSar,WSa,CSal,Egy,Sud] 34 | 35 | group_names={'NAlgeria':NAl,'CSahara':CSar,'WSahel':WSa,'CSahel':CSal, 36 | 'Egypt':Egy,'Sudan':Sud} 37 | 38 | 39 | 40 | # Could these two functions be turned into lambda functions? 41 | # Would that be preferable or are these fine? 42 | 43 | def meanf(x): 44 | if x.count() > 10: # NB: the header asks for at least 14 obs; this checks > 10 45 | return x.mean() 46 | 47 | def sdf(x): 48 | if x.count() > 10: 49 | return x.std() 50 | 51 | def read_file(fname): 52 | '''take a station name and return a DataFrame (wind_group) of monthly 53 | means and standard deviations of ws, overall and at 00, 06, 12 and 18, 54 | grouped by year and month''' 55 | 56 | 57 | column_names=["year","month","day","hour","ws"] 58 | dtype={"year":int,"month":int,"day":int,"hour":int,"ws":float} 59 | 60 | datafile='/home/sophie/projects/windspeed/data/%s_allwinds.txt' %fname 61 | 62 | # specify the columns you want to group together. Can't include hour at 63 | # this point as it is not in the right format. 64 | date_spec = {'date_time': [0,1,2]} 65 | 66 | # when you use keep_date_col it keeps them as objects, not as the dtype you 67 | # read them in as. 68 | wind = pd.read_csv(datafile, sep=" ", names=column_names, 69 | parse_dates=date_spec, keep_date_col=True, index_col=False ) 70 | 71 | # Dealing with hour - going from 600, 1200 etc to 6, 12, 18 72 | wind["hour"]=(wind["hour"]/100).astype(int) 73 | 74 | # combining year, month, day that were parsed together into date_time with 75 | # hour, which is now in the correct format. 76 | wind['date_time'] = pd.to_datetime(wind.date_time) + \ 77 | wind.hour.astype('timedelta64[h]') 78 | 79 | # make datetime the index before making subsections. 80 | wind.index = wind['date_time'] 81 | 82 | # Adds extra columns where the value is kept if it meets the isin() criteria, 83 | # NaN if it doesn't. 84 | wind['ws_0']= wind['ws'][wind['hour'].isin([0])] 85 | wind['ws_06']= wind['ws'][wind['hour'].isin([6])] 86 | wind['ws_12']= wind['ws'][wind['hour'].isin([12])] 87 | wind['ws_18']= wind['ws'][wind['hour'].isin([18])] 88 | 89 | group = wind.groupby(['year', 'month']) 90 | 91 | wind_group = group['ws','ws_0','ws_06','ws_12','ws_18'].agg([meanf,sdf]) 92 | 93 | return wind_group 94 | 95 | 96 | def plot_tseries(name, group): 97 | '''set up n+1 subplots where n is the number of stations in the group. Fill in 98 | each plot with the timeseries from each station and then a mean of all the 99 | stations.
Output to a png file named after the group.''' 100 | 101 | 102 | fig = plt.figure(figsize=(10,10)) 103 | 104 | for i in range(len(group)): 105 | 106 | #read the file in for plotting 107 | wind_group = read_file(group[i]) 108 | 109 | # fig.add_subplot(nrows, ncols, num) 110 | 111 | ax = fig.add_subplot(int((len(group)+1)/2), 2, i+1) 112 | 113 | plt.title(s=group[i], fontsize=15) 114 | 115 | wind_group.ws_0['meanf']['1990':'1994'].plot(figsize=(8,8), c = 'm') 116 | wind_group.ws_06['meanf']['1990':'1994'].plot(figsize=(8,8), c = 'r') 117 | wind_group.ws_12['meanf']['1990':'1994'].plot(figsize=(8,8), c = 'b') 118 | wind_group.ws_18['meanf']['1990':'1994'].plot(figsize=(8,8), c = 'c') 119 | 120 | ax.legend(loc=4,prop={'size':6}) 121 | 122 | # layout, group title and save happen once, after the loop 123 | plt.tight_layout() # very nice! stops the titles overlapping 124 | 125 | fig.suptitle(name) # was group_strings[i], which was never defined 126 | 127 | fig.savefig('/home/sophie/projects/windspeed/output/%s.png' % name, 128 | dpi=125) 129 | 130 | if __name__ == '__main__': 131 | 132 | for name, group in group_names.items(): plot_tseries(name, group) -------------------------------------------------------------------------------- /windspeed/scripts/013-ws_tseries.py: -------------------------------------------------------------------------------- 1 | #The aim of this script is to produce a timeseries of windspeed for each 2 | #station, with lines for winds at 0000, 0600, 1200 and 1800 3 | 4 | import glob,os 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | class Analysis(object): 10 | '''take a file and read it into a dataframe 11 | then ask the user if they want to show statistics 12 | or make plots''' 13 | 14 | 15 | def __init__(self): 16 | 17 | #df with all values 18 | self.wind=wind 19 | 20 | #why can't I do this here? 21 | #wind_00['hour'] = wind[wind['hour'].isin([0000])] 22 | 23 | 24 | def split_df(self,wind): 25 | '''Split the original dataframe up into hours''' 26 | 27 | #self.wind_00= wind[wind['hour'].isin([0000])] 28 | #print self.wind_00['hour'][0:5] 29 | wind_00=wind[wind['hour'].isin([0])] 30 | wind_06=wind[wind['hour'].isin([6])] 31 | wind_12=wind[wind['hour'].isin([12])] 32 | wind_18=wind[wind['hour'].isin([18])] 33 | #do I need to have return here? 34 | 35 | #do you want to look at some stats for these? 36 | stats = raw_input("Do you want to see some data stats for the " 37 | "hours 00,06,12,18, y/n ? \n> ") 38 | if stats.lower() in ('y', 'yes'): # `== 'y' or 'Y'` was always True 39 | self.investigate(wind, wind_00, wind_06, wind_12, wind_18) 40 | 41 | else: pass 42 | 43 | #Ask user if they want a timeseries plot 44 | plots = raw_input("Do you want to look at a timeseries plot y/n ?" 45 | "\n>") 46 | 47 | if plots.lower() in ('y', 'yes'): 48 | self.plot_tseries(wind, wind_00, wind_06, wind_12, wind_18) 49 | 50 | else: pass 51 | 52 | def plot_tseries(self, wind, wind_00, wind_06, wind_12, wind_18): 53 | 54 | plt.plot(wind['date_time'],wind['ws']) 55 | 56 | #labels 57 | plt.xlabel("Time") 58 | plt.ylabel("wind-speed", size=10) 59 | 60 | 61 | #shrink the font so the x ticks stay readable 62 | plt.rc("font", size=7) 63 | 64 | 65 | #chopping the file extension off to put in the name of the image file 66 | fname = fname_list[location][:-4] 67 | 68 | #print the plot to the screen 69 | plt.show() 70 | 71 | #Ask user if they want to save the plot in a file 72 | qu = raw_input("Do you want to save the timeseries in a png y/n ?"
73 | "\n>") 74 | 75 | if qu == 'y' or 'Y' or 'Yes': 76 | path = '/home/sophie/projects/windspeed/output/' 77 | plt.savefig(path+'%stseries.png' % fname, format='png') 78 | 79 | else: pass 80 | 81 | def plot_hist(self, wind, wind_00, wind_06, wind_12, wind_18): 82 | pass 83 | 84 | 85 | def investigate(self, wind, wind_00, wind_06, wind_12, wind_18): 86 | 87 | print "-" * 10 88 | print "00 subset:" 89 | print wind_00.describe(percentiles=[.05,0.5,0.95]) 90 | print "-" * 10 91 | print "06 subset:" 92 | print wind_06.describe(percentiles=[.05,0.5,0.95]) 93 | print "-" * 10 94 | print "12 subset:" 95 | print wind_12.describe(percentiles=[.05,0.5,0.95]) 96 | print "-" * 10 97 | print "18 subset: " 98 | print wind_18.describe(percentiles=[.05,0.5,0.95]) 99 | print "-" * 10 100 | 101 | 102 | 103 | if __name__ == "__main__": 104 | 105 | #change the directory in here first 106 | os.chdir("/home/sophie/projects/windspeed/data/") 107 | fname_list = glob.glob('*allwinds.txt') 108 | 109 | #Choose a station from the list. 110 | print """ %s please select the index of the following files to make a plot\ 111 | of: """ % list(enumerate(fname_list)) 112 | 113 | location = int(raw_input("> ")) 114 | 115 | ##Group first 3 columns into a datetime object 116 | date_spec = {'date_time': [0,1,2]} 117 | column_names=["year","month","day","hour","ws"] 118 | 119 | #specify the data type of each column 120 | dtype={"year":int,"month":int,"day":int,"hour":int,"ws":float} 121 | 122 | #read in the data into a dataframe called wind 123 | wind = pd.read_csv(fname_list[location], sep=" ",parse_dates=date_spec, 124 | keep_date_col=True, names=column_names, index_col=False) 125 | 126 | #using keep_date_col=True puts forgets the dtypes specified for the columns 127 | #so we need to change them again here. 128 | wind[['year','month','day']]=wind[['year','month','day']].astype(int) 129 | 130 | #Dealing with hour - going from 600, 1200 etc to 6,12, 18 131 | wind["hour"]=(wind["hour"]/100).astype(int) 132 | 133 | #adding a date_time column with timestamp data 134 | wind['date_time'] = pd.to_datetime(wind.date_time) + \ 135 | wind.hour.astype('timedelta64[h]') 136 | 137 | b = Analysis() 138 | b.split_df(wind) 139 | -------------------------------------------------------------------------------- /gothonweb/bin/map.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Room(object): 4 | 5 | def __init__(self,name,description): 6 | self.name = name 7 | self.description = description 8 | self.paths = {} 9 | 10 | def go(self,direction): 11 | """ takes a direction and puts it into the 12 | empty paths dictionary. get() returns a value 13 | for a given key""" 14 | return self.paths.get(direction, None) 15 | 16 | def add_paths(self, paths): 17 | #adds the key-values from paths into paths 18 | self.paths.update(paths) 19 | 20 | central_corridor = Room("Central Corridor", 21 | """ 22 | The Gothons of Planet Percal #25 have invaded your ship and destroyed 23 | your entire crew. You are the last surviving member and your last 24 | mission is to get the neutron destruct bomb from the Weapons Armory, 25 | put it in the bridge, and blow the ship up after getting into an 26 | escape pod. 27 | 28 | You're running down the central corridor to the Weapons Armory when 29 | a Gothon jumps out, red scaly skin, dark grimy teeth, and evil clown costume 30 | flowing around his hate filled body. He's blocking the door to the 31 | Armory and about to pull a weapon to blast you. 
You can either: 1) eat him, 32 | 2) kick him in the nuts, or 3) blast him first. 33 | """) 34 | 35 | 36 | laser_weapon_armory = Room("Laser Weapon Armory", 37 | """ 38 | Lucky for you they made you learn Gothon insults in the academy. 39 | You tell the one Gothon joke you know: 40 | Lbhe zbgure vf fb sng, jura fur fvgf nebhaq gur ubhfr, fur fvgf nebhaq gur 41 | ubhfr. 42 | The Gothon stops, tries not to laugh, then busts out laughing and can't 43 | move. 44 | While he's laughing you run up and shoot him square in the head 45 | putting him down, then jump through the Weapon Armory door. 46 | 47 | You do a dive roll into the Weapon Armory, crouch and scan the room 48 | for more Gothons that might be hiding. It's dead quiet, too quiet. 49 | You stand up and run to the far side of the room and find the 50 | neutron bomb in its container. There's a keypad lock on the box 51 | and you need the code to get the bomb out. If you get the code 52 | wrong 10 times then the lock closes forever and you can't 53 | get the bomb. The code is 3 digits. (hint: the first two digits are 13) 54 | """) 55 | 56 | 57 | the_bridge = Room("The Bridge", 58 | """ 59 | The container clicks open and the seal breaks, letting gas out. 60 | You grab the neutron bomb and run as fast as you can to the 61 | bridge where you must place it in the right spot. 62 | 63 | You burst onto the Bridge with the neutron destruct bomb 64 | under your arm and surprise 5 Gothons who are trying to 65 | take control of the ship. Each of them has an even uglier 66 | clown costume than the last. They haven't pulled their 67 | weapons out yet, as they see the active bomb under your 68 | arm and don't want to set it off. 69 | """) 70 | 71 | 72 | escape_pod = Room("Escape Pod", 73 | """ 74 | You point your blaster at the bomb under your arm 75 | and the Gothons put their hands up and start to sweat. 76 | You inch backward to the door, open it, and then carefully 77 | place the bomb on the floor, pointing your blaster at it. 78 | You then jump back through the door, punch the close button 79 | and blast the lock so the Gothons can't get out. 80 | Now that the bomb is placed you run to the escape pod to 81 | get off this tin can. 82 | 83 | You rush through the ship desperately trying to make it to 84 | the escape pod before the whole ship explodes. It seems like 85 | hardly any Gothons are on the ship, so your run is clear of 86 | interference. You get to the chamber with the escape pods, and 87 | now need to pick one to take. Some of them could be damaged 88 | but you don't have time to look. There's 5 pods, which one 89 | do you take? 90 | """) 91 | 92 | 93 | the_end_winner = Room("The End", 94 | """ 95 | You jump into pod 2 and hit the eject button. 96 | The pod easily slides out into space heading to 97 | the planet below. As it flies to the planet, you look 98 | back and see your ship implode then explode like a 99 | bright star, taking out the Gothon ship at the same 100 | time. You won! 101 | """) 102 | 103 | 104 | the_end_loser = Room("The End", 105 | """ 106 | You jump into a random pod and hit the eject button. 107 | The pod escapes out into the void of space, then 108 | implodes as the hull ruptures, crushing your body 109 | into jam jelly.
110 | """ 111 | ) 112 | 113 | escape_pod.add_paths({ 114 | '2': the_end_winner, 115 | '*': the_end_loser 116 | }) 117 | 118 | generic_death = Room("death", "You died.") 119 | 120 | the_bridge.add_paths({ 121 | 'throw the bomb': generic_death, 122 | 'slowly place the bomb': escape_pod 123 | }) 124 | 125 | laser_weapon_armory.add_paths({ 126 | '132': the_bridge, 127 | '*': laser_weapon_armory 128 | }) 129 | 130 | central_corridor.add_paths({ 131 | '1': generic_death, 132 | '2': laser_weapon_armory, 133 | '3': laser_weapon_armory 134 | }) 135 | 136 | START = central_corridor 137 | 138 | 139 | 140 | 141 | -------------------------------------------------------------------------------- /tutorials/ThinkBayes/046-MontyHall_framework.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Here we are setting up the framework to investigate what happens to the probability that the car is behind A,B,C depending on some new data. This new data is Monty opening door B and there being no car behind it. The likelihood that this new data is factored into the Likelihood function. " 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 20, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "# This tells Python of that additional module import path. \n", 19 | "import os\n", 20 | "import sys\n", 21 | "module_path = os.path.abspath(os.path.join('..'))\n", 22 | "if module_path not in sys.path:\n", 23 | " sys.path.append(module_path)\n", 24 | " \n", 25 | "from thinkbayes import Pmf" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "In this example the car is equally likely to be behind doors A, B or C for the PRIOR, p(H). \n", 33 | "Working out the Likelihood is tricky:\n", 34 | " - If car is behind A, there is a 50% chance that Monty will pick B or C and both have no car.\n", 35 | " - If car is behind B, there is a 0% chance than Monty will pick B and there will be no car!\n", 36 | " - If car is behind C, there is a 100% chance that Monty will pick B and there will be no car. You have picked A so he has not choice but to pick C. \n", 37 | "\n", 38 | "Likelihood is set up here that if you pick B, likelihood will be 0.5 ,0 ,1 for ABC. \n", 39 | "What happens if data is not B? What are we saying in real terms if we do that? \n", 40 | "As the hypothesis is that \"car is behind door x\" then the liklihood that Monty chooses that door and there is no car behind it is always going to be 0. That leaves an equal chance of the car being behind B or C. 
" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 16, 46 | "metadata": { 47 | "collapsed": false 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "class Monty(Pmf):\n", 52 | " \n", 53 | " def __init__(self,hypos):\n", 54 | " Pmf.__init__(self)\n", 55 | " for hypo in hypos:\n", 56 | " self.Set(hypo,1)\n", 57 | " self.Normalize()\n", 58 | " \n", 59 | " def Update(self,data):\n", 60 | " for hypo in self.Values():\n", 61 | " like = self.Likelihood(data,hypo)\n", 62 | " self.Mult(hypo, like)\n", 63 | " self.Normalize()\n", 64 | " \n", 65 | " # So far code is the same as in the Cookie problem\n", 66 | " # Likelihood, however, requires some work:\n", 67 | " \n", 68 | " def Likelihood(self, data, hypo):\n", 69 | " if hypo == data:\n", 70 | " return 0 \n", 71 | " elif hypo == 'A':\n", 72 | " return 0.5\n", 73 | " else:\n", 74 | " return 1 " 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 17, 80 | "metadata": { 81 | "collapsed": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "hypos = 'ABC' # for hypo in hypos will break this up into 'A', 'B', 'C'.\n", 86 | "pmf = Monty(hypos) # Class Monty inherits from class Pmf." 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 18, 92 | "metadata": { 93 | "collapsed": true 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "# Calling update is pretty much the same\n", 98 | "data = 'A'\n", 99 | "pmf.Update(data)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 19, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "C 0.5\n", 114 | "B 0.5\n", 115 | "A 0.0\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "# Now to print out the results (Same as Cookie problem)\n", 121 | "for hypo, prob in pmf.Items():\n", 122 | " print (hypo, prob)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "The only thing that is different here is that writing Likelihood is a little more complicated. " 130 | ] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": { 135 | "display_name": "Python 3", 136 | "language": "python", 137 | "name": "python3" 138 | }, 139 | "language_info": { 140 | "codemirror_mode": { 141 | "name": "ipython", 142 | "version": 3 143 | }, 144 | "file_extension": ".py", 145 | "mimetype": "text/x-python", 146 | "name": "python", 147 | "nbconvert_exporter": "python", 148 | "pygments_lexer": "ipython3", 149 | "version": "3.5.1" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 0 154 | } 155 | -------------------------------------------------------------------------------- /tutorials/ThinkBayes/.ipynb_checkpoints/046-MontyHall_framework-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Here we are setting up the framework to investigate what happens to the probability that the car is behind A,B,C depending on some new data. This new data is Monty opening door B and there being no car behind it. The likelihood that this new data is factored into the Likelihood function. " 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 20, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "# This tells Python of that additional module import path. 
\n", 19 | "import os\n", 20 | "import sys\n", 21 | "module_path = os.path.abspath(os.path.join('..'))\n", 22 | "if module_path not in sys.path:\n", 23 | " sys.path.append(module_path)\n", 24 | " \n", 25 | "from thinkbayes import Pmf" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "In this example the car is equally likely to be behind doors A, B or C for the PRIOR, p(H). \n", 33 | "Working out the Likelihood is tricky:\n", 34 | " - If car is behind A, there is a 50% chance that Monty will pick B or C and both have no car.\n", 35 | " - If car is behind B, there is a 0% chance than Monty will pick B and there will be no car!\n", 36 | " - If car is behind C, there is a 100% chance that Monty will pick B and there will be no car. You have picked A so he has not choice but to pick C. \n", 37 | "\n", 38 | "Likelihood is set up here that if you pick B, likelihood will be 0.5 ,0 ,1 for ABC. \n", 39 | "What happens if data is not B? What are we saying in real terms if we do that? \n", 40 | "As the hypothesis is that \"car is behind door x\" then the liklihood that Monty chooses that door and there is no car behind it is always going to be 0. That leaves an equal chance of the car being behind B or C. " 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 16, 46 | "metadata": { 47 | "collapsed": false 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "class Monty(Pmf):\n", 52 | " \n", 53 | " def __init__(self,hypos):\n", 54 | " Pmf.__init__(self)\n", 55 | " for hypo in hypos:\n", 56 | " self.Set(hypo,1)\n", 57 | " self.Normalize()\n", 58 | " \n", 59 | " def Update(self,data):\n", 60 | " for hypo in self.Values():\n", 61 | " like = self.Likelihood(data,hypo)\n", 62 | " self.Mult(hypo, like)\n", 63 | " self.Normalize()\n", 64 | " \n", 65 | " # So far code is the same as in the Cookie problem\n", 66 | " # Likelihood, however, requires some work:\n", 67 | " \n", 68 | " def Likelihood(self, data, hypo):\n", 69 | " if hypo == data:\n", 70 | " return 0 \n", 71 | " elif hypo == 'A':\n", 72 | " return 0.5\n", 73 | " else:\n", 74 | " return 1 " 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 17, 80 | "metadata": { 81 | "collapsed": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "hypos = 'ABC' # for hypo in hypos will break this up into 'A', 'B', 'C'.\n", 86 | "pmf = Monty(hypos) # Class Monty inherits from class Pmf." 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 18, 92 | "metadata": { 93 | "collapsed": true 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "# Calling update is pretty much the same\n", 98 | "data = 'A'\n", 99 | "pmf.Update(data)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 19, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "C 0.5\n", 114 | "B 0.5\n", 115 | "A 0.0\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "# Now to print out the results (Same as Cookie problem)\n", 121 | "for hypo, prob in pmf.Items():\n", 122 | " print (hypo, prob)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "The only thing that is different here is that writing Likelihood is a little more complicated. 
" 130 | ] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": { 135 | "display_name": "Python 3", 136 | "language": "python", 137 | "name": "python3" 138 | }, 139 | "language_info": { 140 | "codemirror_mode": { 141 | "name": "ipython", 142 | "version": 3 143 | }, 144 | "file_extension": ".py", 145 | "mimetype": "text/x-python", 146 | "name": "python", 147 | "nbconvert_exporter": "python", 148 | "pygments_lexer": "ipython3", 149 | "version": "3.5.1" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 0 154 | } 155 | -------------------------------------------------------------------------------- /tutorials/ThinkBayes/.ipynb_checkpoints/047-Dice-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### The Dice Problem\n", 8 | "\n", 9 | "I have 5 dice - a 4-sided, 6-sided, 8-sided and 20-sided. \n", 10 | "If I select a die from the box at random and get a 6, what is the probability that it was each of the dice.\n", 11 | "Here we will: \n", 12 | "1) Choose a representation for the hypotheses \n", 13 | "2) Choose a representation for the data \n", 14 | "3) Write a likelihood function \n", 15 | "\n", 16 | "Previously we used strings to represent hypotheses and data, here we will use numbers. \n", 17 | "Specifically 4,6,8,12 and 20 to represent hypotheses:" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 7, 23 | "metadata": { 24 | "collapsed": true 25 | }, 26 | "outputs": [], 27 | "source": [ 28 | "import os\n", 29 | "import sys\n", 30 | "module_path = os.path.abspath(os.path.join('..'))\n", 31 | "if module_path not in sys.path:\n", 32 | " sys.path.append(module_path)\n", 33 | " \n", 34 | "from thinkbayes import Pmf, Suite" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 8, 40 | "metadata": { 41 | "collapsed": false 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "class Dice(Suite):\n", 46 | " \n", 47 | " #notice there are no class variables here. \n", 48 | " \n", 49 | " def Likelihood(self, data, hypo):\n", 50 | " if hypo < data:\n", 51 | " return 0 \n", 52 | " else:\n", 53 | " return 1.0/hypo\n", 54 | "\n", 55 | "\n", 56 | "# We use integers to represent hypotheses\n", 57 | "suite = Dice([4 ,6 ,8 ,12 ,20 ])" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "If hypo < data this means the roll is greater than the number of sides on the die. This is not possible to the likelihood is 0.\n", 65 | "\n", 66 | "Otherwise the question is, \"Given that there are hypo sides, what is the chance of rolling data?\" \n", 67 | "The answer is 1/hypo, regardless of data." 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 9, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [ 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "4 0.0\n", 82 | "6 0.3921568627450979\n", 83 | "8 0.2941176470588235\n", 84 | "12 0.19607843137254896\n", 85 | "20 0.11764705882352941\n" 86 | ] 87 | } 88 | ], 89 | "source": [ 90 | "# Update hypothesis if I roll a 6\n", 91 | "suite.Update(6)\n", 92 | "suite.Print()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "After we roll a 6, the probability for the 4-sided die is 0. 
6-sided is most likely, though there is still almost a 12% chance for the 20-sided die" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 13, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [ 109 | { 110 | "name": "stdout", 111 | "output_type": "stream", 112 | "text": [ 113 | "4 0.0\n", 114 | "6 0.0\n", 115 | "8 0.9965835404647062\n", 116 | "12 0.0034137843517224823\n", 117 | "20 2.6751835712673812e-06\n", 118 | "-----\n", 119 | "4 0.0\n", 120 | "6 0.0\n", 121 | "8 0.9977204760988618\n", 122 | "12 0.0022784526069342752\n", 123 | "20 1.0712942038485414e-06\n", 124 | "-----\n", 125 | "4 0.0\n", 126 | "6 0.0\n", 127 | "8 0.9984794472645385\n", 128 | "12 0.0015201238918041802\n", 129 | "20 4.2884365717293543e-07\n", 130 | "-----\n", 131 | "4 0.0\n", 132 | "6 0.0\n", 133 | "8 0.9989858984203864\n", 134 | "12 0.0010139299551430545\n", 135 | "20 1.7162447051522972e-07\n", 136 | "-----\n", 137 | "4 0.0\n", 138 | "6 0.0\n", 139 | "8 0.9993237494202397\n", 140 | "12 0.0006761819067551149\n", 141 | "20 6.867300515001654e-08\n", 142 | "-----\n", 143 | "4 0.0\n", 144 | "6 0.0\n", 145 | "8 0.999549082940396\n", 146 | "12 0.00045088958420803747\n", 147 | "20 2.7475395980645105e-08\n", 148 | "-----\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "# What if we roll a few more times an get 6,8,7,7,5 and 4 from the same dice?\n", 154 | "for roll in [6,8,7,7,5,4]:\n", 155 | " suite.Update(roll)\n", 156 | " suite.Print()\n", 157 | " print('-----')\n" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "Now the probability is 94% that we roll the 8-sided die and less than 1% for the 20-sided." 165 | ] 166 | } 167 | ], 168 | "metadata": { 169 | "kernelspec": { 170 | "display_name": "Python 3", 171 | "language": "python", 172 | "name": "python3" 173 | }, 174 | "language_info": { 175 | "codemirror_mode": { 176 | "name": "ipython", 177 | "version": 3 178 | }, 179 | "file_extension": ".py", 180 | "mimetype": "text/x-python", 181 | "name": "python", 182 | "nbconvert_exporter": "python", 183 | "pygments_lexer": "ipython3", 184 | "version": "3.5.1" 185 | } 186 | }, 187 | "nbformat": 4, 188 | "nbformat_minor": 0 189 | } 190 | -------------------------------------------------------------------------------- /tutorials/ThinkBayes/047-Dice.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "### The Dice Problem\n", 10 | "\n", 11 | "I have 5 dice - a 4-sided, 6-sided, 8-sided and 20-sided. \n", 12 | "If I select a die from the box at random and get a 6, what is the probability that it was each of the dice.\n", 13 | "Here we will: \n", 14 | "1) Choose a representation for the hypotheses \n", 15 | "2) Choose a representation for the data \n", 16 | "3) Write a likelihood function \n", 17 | "\n", 18 | "Previously we used strings to represent hypotheses and data, here we will use numbers. 
\n", 19 | "Specifically 4,6,8,12 and 20 to represent hypotheses:" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 7, 25 | "metadata": { 26 | "collapsed": true 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "import os\n", 31 | "import sys\n", 32 | "module_path = os.path.abspath(os.path.join('..'))\n", 33 | "if module_path not in sys.path:\n", 34 | " sys.path.append(module_path)\n", 35 | " \n", 36 | "from thinkbayes import Pmf, Suite" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 8, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "class Dice(Suite):\n", 48 | " \n", 49 | " #notice there are no class variables here. \n", 50 | " \n", 51 | " def Likelihood(self, data, hypo):\n", 52 | " if hypo < data:\n", 53 | " return 0 \n", 54 | " else:\n", 55 | " return 1.0/hypo\n", 56 | "\n", 57 | "\n", 58 | "# We use integers to represent hypotheses\n", 59 | "suite = Dice([4 ,6 ,8 ,12 ,20 ])" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "If hypo < data this means the roll is greater than the number of sides on the die. This is not possible to the likelihood is 0.\n", 67 | "\n", 68 | "Otherwise the question is, \"Given that there are hypo sides, what is the chance of rolling data?\" \n", 69 | "The answer is 1/hypo, regardless of data." 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 9, 75 | "metadata": { 76 | "collapsed": false 77 | }, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "4 0.0\n", 84 | "6 0.3921568627450979\n", 85 | "8 0.2941176470588235\n", 86 | "12 0.19607843137254896\n", 87 | "20 0.11764705882352941\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "# Update hypothesis if I roll a 6\n", 93 | "suite.Update(6)\n", 94 | "suite.Print()" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "After we roll a 6, the probability for the 4-sided die is 0. 
6-sided is most likely, though there is still almost a 12% chance for the 20-sided die" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 13, 107 | "metadata": { 108 | "collapsed": false 109 | }, 110 | "outputs": [ 111 | { 112 | "name": "stdout", 113 | "output_type": "stream", 114 | "text": [ 115 | "4 0.0\n", 116 | "6 0.0\n", 117 | "8 0.9965835404647062\n", 118 | "12 0.0034137843517224823\n", 119 | "20 2.6751835712673812e-06\n", 120 | "-----\n", 121 | "4 0.0\n", 122 | "6 0.0\n", 123 | "8 0.9977204760988618\n", 124 | "12 0.0022784526069342752\n", 125 | "20 1.0712942038485414e-06\n", 126 | "-----\n", 127 | "4 0.0\n", 128 | "6 0.0\n", 129 | "8 0.9984794472645385\n", 130 | "12 0.0015201238918041802\n", 131 | "20 4.2884365717293543e-07\n", 132 | "-----\n", 133 | "4 0.0\n", 134 | "6 0.0\n", 135 | "8 0.9989858984203864\n", 136 | "12 0.0010139299551430545\n", 137 | "20 1.7162447051522972e-07\n", 138 | "-----\n", 139 | "4 0.0\n", 140 | "6 0.0\n", 141 | "8 0.9993237494202397\n", 142 | "12 0.0006761819067551149\n", 143 | "20 6.867300515001654e-08\n", 144 | "-----\n", 145 | "4 0.0\n", 146 | "6 0.0\n", 147 | "8 0.999549082940396\n", 148 | "12 0.00045088958420803747\n", 149 | "20 2.7475395980645105e-08\n", 150 | "-----\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "# What if we roll a few more times and get 6,8,7,7,5 and 4 from the same die?\n", 156 | "for roll in [6,8,7,7,5,4]:\n", 157 | " suite.Update(roll)\n", 158 | " suite.Print()\n", 159 | " print('-----')\n" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "Now the probability is over 99.9% that we rolled the 8-sided die; the 12-sided is below 0.05% and the 20-sided is effectively zero." 167 | ] 168 | } 169 | ], 170 | "metadata": { 171 | "kernelspec": { 172 | "display_name": "Python 3", 173 | "language": "python", 174 | "name": "python3" 175 | }, 176 | "language_info": { 177 | "codemirror_mode": { 178 | "name": "ipython", 179 | "version": 3 180 | }, 181 | "file_extension": ".py", 182 | "mimetype": "text/x-python", 183 | "name": "python", 184 | "nbconvert_exporter": "python", 185 | "pygments_lexer": "ipython3", 186 | "version": "3.5.1" 187 | } 188 | }, 189 | "nbformat": 4, 190 | "nbformat_minor": 0 191 | } 192 | -------------------------------------------------------------------------------- /Titanic/bin/clean_test_53.py: -------------------------------------------------------------------------------- 1 | # This script will clean up the test data for the Titanic competition using a 2 | # similar method to the notebook https://www.kaggle.com/creepykoala/ 3 | # titanic/study-of-tree-and-forest-algorithms which I have already applied to 4 | # the training data 5 | 6 | # Import libraries 7 | 8 | import numpy as np 9 | from numpy.random import random_integers 10 | import pandas as pd 11 | import matplotlib.pyplot as plt 12 | import sklearn 13 | from sklearn.cross_validation import train_test_split 14 | from sklearn.tree import DecisionTreeClassifier 15 | from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier 16 | from scipy.stats import pointbiserialr, spearmanr 17 | 18 | # Load the test data 19 | df = pd.read_csv('/home/sophie/projects/Titanic/data/test.csv', header=0) 20 | 21 | # People with stronger titles tend to have more help on board. Hence, we will 22 | # categorize passengers based on titles.
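# The extraction below works on names shaped like "Braund, Mr. Owen Harris":
# split(',')[1] gives " Mr. Owen Harris", split('.')[0] gives " Mr", and
# strip() leaves "Mr", which is then mapped through Title_Dictionary.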
23 | Title_Dictionary = { 24 | "Capt": "Officer", 25 | "Col": "Officer", 26 | "Major": "Officer", 27 | "Jonkheer": "Royalty", 28 | "Don": "Royalty", 29 | "Sir" : "Royalty", 30 | "Dr": "Officer", 31 | "Rev": "Officer", 32 | "the Countess":"Royalty", 33 | "Dona": "Royalty", 34 | "Mme": "Mrs", 35 | "Mlle": "Miss", 36 | "Ms": "Mrs", 37 | "Mr" : "Mr", 38 | "Mrs" : "Mrs", 39 | "Miss" : "Miss", 40 | "Master" : "Master", 41 | "Lady" : "Royalty" 42 | } 43 | df['Title'] = df['Name'].apply(lambda x: 44 | Title_Dictionary[x.split(',')[1].split('.')[0].strip()]) 45 | 46 | # Extract the letters from the beginning of each element of 'Ticket' 47 | def Ticket_Prefix(s): 48 | s = s.split()[0] # split() with no arguments splits on whitespace 49 | if s.isdigit(): 50 | return 'NoClue' 51 | else: 52 | return s 53 | 54 | df['TicketPrefix'] = df['Ticket'].apply(lambda x: Ticket_Prefix(x)) 55 | 56 | # Make an array where null values are False. 57 | mask_Age = df.Age.notnull() 58 | 59 | # New dataframe where all rows have a value for age. 60 | Age_Sex_Title_Pclass = df.loc[mask_Age, ["Age", "Title", "Sex", "Pclass"]] 61 | 62 | # Groupby object to group by Title, Pclass and Sex 63 | Filler_Ages_1 = Age_Sex_Title_Pclass.groupby(by = ["Title", "Pclass", 64 | "Sex"]).median() 65 | 66 | # This moves both Sex and Pclass into column headers and does so in that order. 67 | Filler_Ages = Filler_Ages_1.Age.unstack(level = -1).unstack(level = -1) 68 | 69 | mask_Age = df.Age.isnull() # A mask where null values are True 70 | 71 | # New DataFrame with missing values for age 72 | Age_Sex_Title_Pclass_missing = df.loc[mask_Age, ["Title", "Sex", "Pclass"]] 73 | 74 | # Look-up function for the calculated median ages. 75 | def Age_filler(row): 76 | if row.Sex == "female": 77 | age = Filler_Ages.female.loc[row["Title"], row["Pclass"]] 78 | return age 79 | elif row.Sex == "male": 80 | age = Filler_Ages.male.loc[row["Title"], row["Pclass"]] 81 | return age 82 | 83 | # Make a new column on the "missing" dataframe and add the median value to 84 | # each row. 85 | Age_Sex_Title_Pclass_missing["Age"] = Age_Sex_Title_Pclass_missing.apply( 86 | Age_filler, axis=1) 87 | 88 | 89 | # Re-form the 'Age' column from the known and filled-in parts. 90 | df["Age"] = pd.concat([Age_Sex_Title_Pclass["Age"], 91 | Age_Sex_Title_Pclass_missing["Age"]]) 92 | 93 | # Fill missing fares with the mean of all fares. 94 | df['Fare'] = df['Fare'].fillna(value=df.Fare.mean()) 95 | 96 | df['FamilySize'] = df['SibSp'] + df['Parch'] 97 | df = df.drop(['Ticket', 'Cabin'], axis=1) 98 | 99 | # get_dummies splits a categorical column into separate 0/1 indicator 100 | # columns, one per category. 
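# For example (illustrative toy data, not part of this dataset):
#   pd.get_dummies(pd.Series(['male', 'female', 'male']), prefix='Sex')
#   gives columns Sex_female = [0, 1, 0] and Sex_male = [1, 0, 1].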
101 | dummies_Sex = pd.get_dummies(df['Sex'], prefix='Sex') 102 | 103 | # Making dummies for the other categorical features 104 | dummies_Embarked = pd.get_dummies(df['Embarked'], prefix = 'Embarked') 105 | dummies_Pclass = pd.get_dummies(df['Pclass'], prefix = 'Pclass') 106 | dummies_Titles = pd.get_dummies(df['Title'], prefix='Title') 107 | dummies_TicketPrefix = pd.get_dummies(df['TicketPrefix'], prefix='TicketPrefix') 108 | 109 | # Make new dataframes which have the dummies added on to the end 110 | df = pd.concat([df,dummies_Sex, dummies_Embarked, dummies_Pclass, 111 | dummies_Titles, dummies_TicketPrefix], axis = 1) 112 | 113 | # Drop the categorical data 114 | df = df.drop(['Sex', 'Embarked','Pclass','Title','Name','TicketPrefix'], axis=1) 115 | 116 | # Set PassengerId as the index: 117 | df = df.set_index(['PassengerId']) 118 | 119 | # FEATURE SELECTION 120 | # To select features we correlate each feature against Survived in the 121 | # training data. We need to use different algorithms for the different data 122 | # types: 123 | # - Spearman-Rank correlation for nominal vs nominal data 124 | # - Point-Biserial correlation for nominal vs continuous data 125 | 126 | best_features = df[['Title_Mr', 'Sex_male', 'Sex_female', 'Title_Mrs', 127 | 'Title_Miss', 'Pclass_3', 'Pclass_1', 'Fare', 'Embarked_C', 128 | 'Embarked_S']] 129 | 130 | # Output this to csv to be read in for making a prediction 131 | best_features.to_csv('/home/sophie/projects/Titanic/data/clean_test_53.csv', 132 | sep = " ") 133 | 134 | -------------------------------------------------------------------------------- /TOdo.md: -------------------------------------------------------------------------------- 1 | # Learning resources to work through 2 | 3 | ### General Data Science stats 4 | 5 | - Do [this](http://nbviewer.jupyter.org/github/nborwankar/LearnDataScience/tree/master/notebooks/) before the Machine Learning below. 6 | 7 | - [Harvard CS109 course](http://cs109.github.io/2015/pages/videos.html). Lots of tutorials and lectures covering everything from pandas and web 8 | scraping to Bayesian stats. 9 | - perhaps start with the homeworks and labs, which also have solutions [here](https://github.com/cs109/content) 10 | 11 | #### Bayesian Statistics 12 | 13 | - Very important to get a decent grounding in this: [Probabilistic Programming and Bayesian Methods for 14 | Hackers](http://nbviewer.jupyter.org/github/CamDavidsonPilon/Probabilistic-Programming-and-Bayesian-Methods-for-Hackers/blob/master/ 15 | Prologue/Prologue.ipynb) 16 | 17 | - Recommended reading from S2DS: [Think Bayes](http://www.greenteapress.com/thinkbayes/thinkbayes.pdf). 18 | - Some problems, and solutions, from Allen Downey's 19 | [blog](http://allendowney.blogspot.co.uk/2011/10/all-your-bayes-are-belong-to-us.html) 20 | 21 | - From the Harvard course: 22 | - [Bayesian Tomatoes](http://nbviewer.jupyter.org/github/cs109/content/blob/master/HW3.ipynb) 23 | - Lab 6: [Bayesianism, with MCMC](http://nbviewer.jupyter.org/github/cs109/content/blob/master/labs/lab6/BayesLinear.ipynb) 24 | 25 | 26 | ### Machine Learning 27 | 28 | - Work through the ISLR book (downloaded) with [this](https://github.com/JWarmenhoven/ISLR-python) Python repo for guidance. 29 | 30 | - Learn and understand [this introduction](http://www.astroml.org/sklearn_tutorial/general_concepts.html) from the Python library scikit-learn. I 31 | also 32 | like this [reference map](http://scikit-learn.org/stable/tutorial/machine_learning_map/index.html).
33 | 34 | - Clone [this](https://github.com/ogrisel/sklearn_pycon2014) repository and work through it. 35 | 36 | - Do the MonkeyLearn tutorials [here](https://blog.monkeylearn.com/getting-actionable-insights-from-reviews-using-machine-learning-part1/) 37 | and [here](https://blog.monkeylearn.com/hacker-news-categorizer-with-monkeylearn/) 38 | 39 | 40 | 41 | 42 | 43 | - [Very quick and dirty introduction to Random Forests](http://blog.yhat.com/posts/random-forests-in-python.html) using Python and iris 44 | data. 45 | 46 | - From the Harvard data science course, Lab 4: [Scikit-Learn, Regression, and 47 | PCA](http://nbviewer.jupyter.org/github/cs109/content/blob/master/labs/lab4/Lab4full.ipynb) 48 | 49 | - [Statistical Natural Language Processing](http://nbviewer.jupyter.org/url/norvig.com/ipython/How%20to%20Do%20Things%20with%20Words.ipynb) 50 | 51 | - A cheatsheet of sorts for scikit-learn in Python using pandas - ['Python Machine 52 | Learning'](https://github.com/rasbt/python-machine-learning-book) 53 | 54 | - `scikit-learn` for large data sets: [article](https://www.opendatascience.com/blog/riding-on-large-data-with-scikit-learn/) and 55 | [tutorial](https://github.com/rasbt/pattern_classification/blob/master/machine_learning/scikit-learn/outofcore_modelpersistence.ipynb) 56 | 57 | ### Social Network Analysis 58 | 59 | - Udacity course on [Algorithms](https://classroom.udacity.com/courses/cs215/lessons/48311839/concepts/486877000923) 60 | 61 | 62 | ### SQL 63 | 64 | - Complete [this](http://sol.gfxile.net/g3/) excellent tutorial playing around with astronomy data. Done! 65 | 66 | - [SQL Zoo](http://sqlzoo.net/) is a good tutorial site. Work through these. 67 | 68 | - Do [this](http://www.sqlcourse.com/) basic tutorial, followed by [this](http://www.sqlcourse2.com/) more advanced one. Looks good, with 69 | lots of questions (+ answers!). 70 | 71 | - Dip into [this](http://dev.mysql.com/doc/refman/5.5/en/examples.html) tutorial, which goes through common queries in MySQL. 72 | 73 | 74 | ### Pandas and Python 75 | 76 | - Start working through [exercism.io](http://exercism.io/languages/python#exercises) 77 | 78 | - [How to think like a computer scientist](http://interactivepython.org/runestone/static/thinkcspy/toc.html) with clear explanations and 79 | videos; it tests you as you go through. 80 | 81 | - Work through [these Python 3 tutorial videos](https://www.youtube.com/playlist?list=PL1A2CSdiySGJd0LJRRSwQZbPZaDP0q67j). They are nice 82 | and short. 83 | 84 | - [Matplotlib visualisation tutorial](https://www.dataquest.io/blog/matplotlib-tutorial/) which incorporates sentiment analysis, with 85 | suggested further exercises. 86 | 87 | - [Statistical Natural Language Processing](http://nbviewer.jupyter.org/url/norvig.com/ipython/How%20to%20Do%20Things%20with%20Words.ipynb) 88 | 89 | ### Kaggle 90 | 91 | - Do the Titanic competition. 92 | - Start with this for [Python](https://www.kaggle.com/c/titanic/details/getting-started-with-python) and [then 93 | again](https://www.kaggle.com/c/titanic/details/getting-started-with-python-ii), but using pandas. 
94 | - Try tutorial in [Machine Learning DataBase 95 | (MLDB)](https://docs.mldb.ai/ipy/notebooks/_demos/_latest/Predicting%20Titanic%20Survival.html) 96 | 97 | ### Visualisation 98 | 99 | - Python interactive visualization library that targets modern web browsers for presentation:[Bokeh](http://bokeh.pydata.org/en/latest/) 100 | - [Plotly](https://plot.ly/api/) 101 | - Getting started with [d3](https://github.com/d3/d3/wiki/Tutorials) 102 | 103 | ### git 104 | 105 | - Work through [this](http://gitreal.codeschool.com/?utm_source=github&utm_medium=codeschool_option&utm_campaign=trygit) course. 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /gothonweb/sessions/5524b4c1828de273b8ae4c70bbbe0e631e031e4a: -------------------------------------------------------------------------------- 1 | KGRwMQpTJ2lwJwpwMgpWMTI3LjAuMC4xCnAzCnNTJ3Jvb20nCnA0CmNjb3B5X3JlZwpfcmVjb25z 2 | dHJ1Y3RvcgpwNQooY21hcApSb29tCnA2CmNfX2J1aWx0aW5fXwpvYmplY3QKcDcKTnRScDgKKGRw 3 | OQpTJ3BhdGhzJwpwMTAKKGRwMTEKUycxJwpnNQooZzYKZzcKTnRScDEyCihkcDEzCmcxMAooZHAx 4 | NApzUyduYW1lJwpwMTUKUydkZWF0aCcKcDE2CnNTJ2Rlc2NyaXB0aW9uJwpwMTcKUydZb3UgZGll 5 | ZC4nCnAxOApzYnNTJzMnCmc1CihnNgpnNwpOdFJwMTkKKGRwMjAKZzEwCihkcDIxClMnMTMyJwpw 6 | MjIKZzUKKGc2Cmc3Ck50UnAyMwooZHAyNApnMTAKKGRwMjUKUyd0aHJvdyB0aGUgYm9tYicKcDI2 7 | CmcxMgpzUydzbG93bHkgcGxhY2UgdGhlIGJvbWInCnAyNwpnNQooZzYKZzcKTnRScDI4CihkcDI5 8 | CmcxMAooZHAzMApTJyonCmc1CihnNgpnNwpOdFJwMzEKKGRwMzIKZzEwCihkcDMzCnNnMTUKUydU 9 | aGUgRW5kJwpwMzQKc2cxNwpTJ1xuWW91IGp1bXAgaW50byBhIHJhbmRvbSBwb2QgYW5kIGhpdCB0 10 | aGUgZWplY3QgYnV0dG9uLlxuVGhlIHBvZCBlc2NhcGVzIG91dCBpbnRvIHRoZSB2b2lkIG9mIHNw 11 | YWNlLCB0aGVuXG5pbXBsb2RlcyBhcyB0aGUgaHVsbCBydXB0dXJlcywgY3J1c2hpbmcgeW91ciBi 12 | b2R5XG5pbnRvIGphbSBqZWxseS5cbicKcDM1CnNic1MnMicKZzUKKGc2Cmc3Ck50UnAzNgooZHAz 13 | NwpnMTAKKGRwMzgKc2cxNQpnMzQKc2cxNwpTJ1xuWW91IGp1bXAgaW50byBwb2QgMiBhbmQgaGl0 14 | IHRoZSBlamVjdCBidXR0b24uXG5UaGUgcG9kIGVhc2lseSBzbGlkZXMgb3V0IGludG8gc3BhY2Ug 15 | aGVhZGluZyB0b1xudGhlIHBsYW5ldCBiZWxvdy4gIEFzIGl0IGZsaWVzIHRvIHRoZSBwbGFuZXQs 16 | IHlvdSBsb29rXG5iYWNrIGFuZCBzZWUgeW91ciBzaGlwIGltcGxvZGUgdGhlbiBleHBsb2RlIGxp 17 | a2UgYVxuYnJpZ2h0IHN0YXIsIHRha2luZyBvdXQgdGhlIEdvdGhvbiBzaGlwIGF0IHRoZSBzYW1l 18 | XG50aW1lLiAgWW91IHdvbiFcbicKcDM5CnNic3NnMTUKUydFc2NhcGUgUG9kJwpwNDAKc2cxNwpT 19 | IlxuWW91IHBvaW50IHlvdXIgYmxhc3RlciBhdCB0aGUgYm9tYiB1bmRlciB5b3VyIGFybVxuYW5k 20 | IHRoZSBHb3Rob25zIHB1dCB0aGVpciBoYW5kcyB1cCBhbmQgc3RhcnQgdG8gc3dlYXQuXG5Zb3Ug 21 | aW5jaCBiYWNrd2FyZCB0byB0aGUgZG9vciwgb3BlbiBpdCwgYW5kIHRoZW4gY2FyZWZ1bGx5XG5w 22 | bGFjZSB0aGUgYm9tYiBvbiB0aGUgZmxvb3IsIHBvaW50aW5nIHlvdXIgYmxhc3RlciBhdCBpdC5c 23 | bllvdSB0aGVuIGp1bXAgYmFjayB0aHJvdWdoIHRoZSBkb29yLCBwdW5jaCB0aGUgY2xvc2UgYnV0 24 | dG9uXG5hbmQgYmxhc3QgdGhlIGxvY2sgc28gdGhlIEdvdGhvbnMgY2FuJ3QgZ2V0IG91dC5cbk5v 25 | dyB0aGF0IHRoZSBib21iIGlzIHBsYWNlZCB5b3UgcnVuIHRvIHRoZSBlc2NhcGUgcG9kIHRvXG5n 26 | ZXQgb2ZmIHRoaXMgdGluIGNhbi5cblxuWW91IHJ1c2ggdGhyb3VnaCB0aGUgc2hpcCBkZXNwZXJh 27 | dGVseSB0cnlpbmcgdG8gbWFrZSBpdCB0b1xudGhlIGVzY2FwZSBwb2QgYmVmb3JlIHRoZSB3aG9s 28 | ZSBzaGlwIGV4cGxvZGVzLiAgSXQgc2VlbXMgbGlrZVxuaGFyZGx5IGFueSBHb3Rob25zIGFyZSBv 29 | biB0aGUgc2hpcCwgc28geW91ciBydW4gaXMgY2xlYXIgb2ZcbmludGVyZmVyZW5jZS4gIFlvdSBn 30 | ZXQgdG8gdGhlIGNoYW1iZXIgd2l0aCB0aGUgZXNjYXBlIHBvZHMsIGFuZFxubm93IG5lZWQgdG8g 31 | cGljayBvbmUgdG8gdGFrZS4gIFNvbWUgb2YgdGhlbSBjb3VsZCBiZSBkYW1hZ2VkXG5idXQgeW91 32 | IGRvbid0IGhhdmUgdGltZSB0byBsb29rLiAgVGhlcmUncyA1IHBvZHMsIHdoaWNoIG9uZVxuZG8g 33 | eW91IHRha2U/XG4iCnA0MQpzYnNzZzE1ClMnVGhlIEJyaWRnZScKcDQyCnNnMTcKUyJcblRoZSBj 34 | 
b250YWluZXIgY2xpY2tzIG9wZW4gYW5kIHRoZSBzZWFsIGJyZWFrcywgbGV0dGluZyBnYXMgb3V0 35 | LlxuWW91IGdyYWIgdGhlIG5ldXRyb24gYm9tYiBhbmQgcnVuIGFzIGZhc3QgYXMgeW91IGNhbiB0 36 | byB0aGVcbmJyaWRnZSB3aGVyZSB5b3UgbXVzdCBwbGFjZSBpdCBpbiB0aGUgcmlnaHQgc3BvdC5c 37 | blxuWW91IGJ1cnN0IG9udG8gdGhlIEJyaWRnZSB3aXRoIHRoZSBuZXRyb24gZGVzdHJ1Y3QgYm9t 38 | YlxudW5kZXIgeW91ciBhcm0gYW5kIHN1cnByaXNlIDUgR290aG9ucyB3aG8gYXJlIHRyeWluZyB0 39 | b1xudGFrZSBjb250cm9sIG9mIHRoZSBzaGlwLiAgRWFjaCBvZiB0aGVtIGhhcyBhbiBldmVuIHVn 40 | bGllclxuY2xvd24gY29zdHVtZSB0aGFuIHRoZSBsYXN0LiAgVGhleSBoYXZlbid0IHB1bGxlZCB0 41 | aGVpclxud2VhcG9ucyBvdXQgeWV0LCBhcyB0aGV5IHNlZSB0aGUgYWN0aXZlIGJvbWIgdW5kZXIg 42 | eW91clxuYXJtIGFuZCBkb24ndCB3YW50IHRvIHNldCBpdCBvZmYuXG4iCnA0MwpzYnNTJyonCmcx 43 | OQpzc2cxNQpTJ0xhc2VyIFdlYXBvbiBBcm1vcnknCnA0NApzZzE3ClMiXG5MdWNreSBmb3IgeW91 44 | IHRoZXkgbWFkZSB5b3UgbGVhcm4gR290aG9uIGluc3VsdHMgaW4gdGhlIGFjYWRlbXkuXG5Zb3Ug 45 | dGVsbCB0aGUgb25lIEdvdGhvbiBqb2tlIHlvdSBrbm93OlxuTGJoZSB6Ymd1cmUgdmYgZmIgc25n 46 | LCBqdXJhIGZ1ciBmdmdmIG5lYmhhcSBndXIgdWJoZnIsIGZ1ciBmdmdmIG5lYmhhcSBndXIgXG51 47 | Ymhmci5cblRoZSBHb3Rob24gc3RvcHMsIHRyaWVzIG5vdCB0byBsYXVnaCwgdGhlbiBidXN0cyBv 48 | dXQgbGF1Z2hpbmcgYW5kIGNhbid0IFxubW92ZS5cbldoaWxlIGhlJ3MgbGF1Z2hpbmcgeW91IHJ1 49 | biB1cCBhbmQgc2hvb3QgaGltIHNxdWFyZSBpbiB0aGUgaGVhZFxucHV0dGluZyBoaW0gZG93biwg 50 | dGhlbiBqdW1wIHRocm91Z2ggdGhlIFdlYXBvbiBBcm1vcnkgZG9vci5cblxuWW91IGRvIGEgZGl2 51 | ZSByb2xsIGludG8gdGhlIFdlYXBvbiBBcm1vcnksIGNyb3VjaCBhbmQgc2NhbiB0aGUgcm9vbVxu 52 | Zm9yIG1vcmUgR290aG9ucyB0aGF0IG1pZ2h0IGJlIGhpZGluZy4gIEl0J3MgZGVhZCBxdWlldCwg 53 | dG9vIHF1aWV0LlxuWW91IHN0YW5kIHVwIGFuZCBydW4gdG8gdGhlIGZhciBzaWRlIG9mIHRoZSBy 54 | b29tIGFuZCBmaW5kIHRoZVxubmV1dHJvbiBib21iIGluIGl0cyBjb250YWluZXIuICBUaGVyZSdz 55 | IGEga2V5cGFkIGxvY2sgb24gdGhlIGJveFxuYW5kIHlvdSBuZWVkIHRoZSBjb2RlIHRvIGdldCB0 56 | aGUgYm9tYiBvdXQuICBJZiB5b3UgZ2V0IHRoZSBjb2RlXG53cm9uZyAxMCB0aW1lcyB0aGVuIHRo 57 | ZSBsb2NrIGNsb3NlcyBmb3JldmVyIGFuZCB5b3UgY2FuJ3RcbmdldCB0aGUgYm9tYi4gIFRoZSBj 58 | b2RlIGlzIDMgZGlnaXRzLiAoaGludDogdGhlIGZpcnN0IHR3byBkaWdpdHMgYXJlIDEzKVxuIgpw 59 | NDUKc2JzUycyJwpnMTkKc3NnMTUKUydDZW50cmFsIENvcnJpZG9yJwpwNDYKc2cxNwpTIlxuVGhl 60 | IEdvdGhvbnMgb2YgUGxhbmV0IFBlcmNhbCAjMjUgaGF2ZSBpbnZhZGVkIHlvdXIgc2hpcCBhbmQg 61 | ZGVzdHJveWVkXG55b3VyIGVudGlyZSBjcmV3LiAgWW91IGFyZSB0aGUgbGFzdCBzdXJ2aXZpbmcg 62 | bWVtYmVyIGFuZCB5b3VyIGxhc3Rcbm1pc3Npb24gaXMgdG8gZ2V0IHRoZSBuZXV0cm9uIGRlc3Ry 63 | dWN0IGJvbWIgZnJvbSB0aGUgV2VhcG9ucyBBcm1vcnksXG5wdXQgaXQgaW4gdGhlIGJyaWRnZSwg 64 | YW5kIGJsb3cgdGhlIHNoaXAgdXAgYWZ0ZXIgZ2V0dGluZyBpbnRvIGFuIFxuZXNjYXBlIHBvZC5c 65 | blxuWW91J3JlIHJ1bm5pbmcgZG93biB0aGUgY2VudHJhbCBjb3JyaWRvciB0byB0aGUgV2VhcG9u 66 | cyBBcm1vcnkgd2hlblxuYSBHb3Rob24ganVtcHMgb3V0LCByZWQgc2NhbHkgc2tpbiwgZGFyayBn 67 | cmlteSB0ZWV0aCwgYW5kIGV2aWwgY2xvd24gY29zdHVtZVxuZmxvd2luZyBhcm91bmQgaGlzIGhh 68 | dGUgZmlsbGVkIGJvZHkuICBIZSdzIGJsb2NraW5nIHRoZSBkb29yIHRvIHRoZVxuQXJtb3J5IGFu 69 | ZCBhYm91dCB0byBwdWxsIGEgd2VhcG9uIHRvIGJsYXN0IHlvdS4gWW91IGNhbiBlaXRoZXI6IDEp 70 | IGVhdCBoaW0sIFxuMikga2ljayBoaW0gaW4gdGhlIG51dHMsIG9yIDMpIGJsYXN0IGhpbSBmaXJz 71 | dC5cbiIKcDQ3CnNic1Mnc2Vzc2lvbl9pZCcKcDQ4ClMnNTUyNGI0YzE4MjhkZTI3M2I4YWU0Yzcw 72 | YmJiZTBlNjMxZTAzMWU0YScKcDQ5CnMu 73 | -------------------------------------------------------------------------------- /gothonweb/sessions/6adbe20488a3ffd0040abc4ac06991d1d79c97d0: -------------------------------------------------------------------------------- 1 | KGRwMQpTJ2lwJwpwMgpWMTI3LjAuMC4xCnAzCnNTJ3Jvb20nCnA0CmNjb3B5X3JlZwpfcmVjb25z 2 | dHJ1Y3RvcgpwNQooY21hcApSb29tCnA2CmNfX2J1aWx0aW5fXwpvYmplY3QKcDcKTnRScDgKKGRw 3 | 
OQpTJ3BhdGhzJwpwMTAKKGRwMTEKUycxJwpnNQooZzYKZzcKTnRScDEyCihkcDEzCmcxMAooZHAx 4 | NApzUyduYW1lJwpwMTUKUydkZWF0aCcKcDE2CnNTJ2Rlc2NyaXB0aW9uJwpwMTcKUydZb3UgZGll 5 | ZC4nCnAxOApzYnNTJzMnCmc1CihnNgpnNwpOdFJwMTkKKGRwMjAKZzEwCihkcDIxClMnMTMyJwpw 6 | MjIKZzUKKGc2Cmc3Ck50UnAyMwooZHAyNApnMTAKKGRwMjUKUyd0aHJvdyB0aGUgYm9tYicKcDI2 7 | CmcxMgpzUydzbG93bHkgcGxhY2UgdGhlIGJvbWInCnAyNwpnNQooZzYKZzcKTnRScDI4CihkcDI5 8 | CmcxMAooZHAzMApTJzInCmc1CihnNgpnNwpOdFJwMzEKKGRwMzIKZzEwCihkcDMzCnNnMTUKUydU 9 | aGUgRW5kJwpwMzQKc2cxNwpTJ1xuWW91IGp1bXAgaW50byBwb2QgMiBhbmQgaGl0IHRoZSBlamVj 10 | dCBidXR0b24uXG5UaGUgcG9kIGVhc2lseSBzbGlkZXMgb3V0IGludG8gc3BhY2UgaGVhZGluZyB0 11 | b1xudGhlIHBsYW5ldCBiZWxvdy4gIEFzIGl0IGZsaWVzIHRvIHRoZSBwbGFuZXQsIHlvdSBsb29r 12 | XG5iYWNrIGFuZCBzZWUgeW91ciBzaGlwIGltcGxvZGUgdGhlbiBleHBsb2RlIGxpa2UgYVxuYnJp 13 | Z2h0IHN0YXIsIHRha2luZyBvdXQgdGhlIEdvdGhvbiBzaGlwIGF0IHRoZSBzYW1lXG50aW1lLiAg 14 | WW91IHdvbiFcbicKcDM1CnNic1MnKicKZzUKKGc2Cmc3Ck50UnAzNgooZHAzNwpnMTAKKGRwMzgK 15 | c2cxNQpnMzQKc2cxNwpTJ1xuWW91IGp1bXAgaW50byBhIHJhbmRvbSBwb2QgYW5kIGhpdCB0aGUg 16 | ZWplY3QgYnV0dG9uLlxuVGhlIHBvZCBlc2NhcGVzIG91dCBpbnRvIHRoZSB2b2lkIG9mIHNwYWNl 17 | LCB0aGVuXG5pbXBsb2RlcyBhcyB0aGUgaHVsbCBydXB0dXJlcywgY3J1c2hpbmcgeW91ciBib2R5 18 | XG5pbnRvIGphbSBqZWxseS5cbicKcDM5CnNic3NnMTUKUydFc2NhcGUgUG9kJwpwNDAKc2cxNwpT 19 | IlxuWW91IHBvaW50IHlvdXIgYmxhc3RlciBhdCB0aGUgYm9tYiB1bmRlciB5b3VyIGFybVxuYW5k 20 | IHRoZSBHb3Rob25zIHB1dCB0aGVpciBoYW5kcyB1cCBhbmQgc3RhcnQgdG8gc3dlYXQuXG5Zb3Ug 21 | aW5jaCBiYWNrd2FyZCB0byB0aGUgZG9vciwgb3BlbiBpdCwgYW5kIHRoZW4gY2FyZWZ1bGx5XG5w 22 | bGFjZSB0aGUgYm9tYiBvbiB0aGUgZmxvb3IsIHBvaW50aW5nIHlvdXIgYmxhc3RlciBhdCBpdC5c 23 | bllvdSB0aGVuIGp1bXAgYmFjayB0aHJvdWdoIHRoZSBkb29yLCBwdW5jaCB0aGUgY2xvc2UgYnV0 24 | dG9uXG5hbmQgYmxhc3QgdGhlIGxvY2sgc28gdGhlIEdvdGhvbnMgY2FuJ3QgZ2V0IG91dC5cbk5v 25 | dyB0aGF0IHRoZSBib21iIGlzIHBsYWNlZCB5b3UgcnVuIHRvIHRoZSBlc2NhcGUgcG9kIHRvXG5n 26 | ZXQgb2ZmIHRoaXMgdGluIGNhbi5cblxuWW91IHJ1c2ggdGhyb3VnaCB0aGUgc2hpcCBkZXNwZXJh 27 | dGVseSB0cnlpbmcgdG8gbWFrZSBpdCB0b1xudGhlIGVzY2FwZSBwb2QgYmVmb3JlIHRoZSB3aG9s 28 | ZSBzaGlwIGV4cGxvZGVzLiAgSXQgc2VlbXMgbGlrZVxuaGFyZGx5IGFueSBHb3Rob25zIGFyZSBv 29 | biB0aGUgc2hpcCwgc28geW91ciBydW4gaXMgY2xlYXIgb2ZcbmludGVyZmVyZW5jZS4gIFlvdSBn 30 | ZXQgdG8gdGhlIGNoYW1iZXIgd2l0aCB0aGUgZXNjYXBlIHBvZHMsIGFuZFxubm93IG5lZWQgdG8g 31 | cGljayBvbmUgdG8gdGFrZS4gIFNvbWUgb2YgdGhlbSBjb3VsZCBiZSBkYW1hZ2VkXG5idXQgeW91 32 | IGRvbid0IGhhdmUgdGltZSB0byBsb29rLiAgVGhlcmUncyA1IHBvZHMsIHdoaWNoIG9uZVxuZG8g 33 | eW91IHRha2U/XG4iCnA0MQpzYnNzZzE1ClMnVGhlIEJyaWRnZScKcDQyCnNnMTcKUyJcblRoZSBj 34 | b250YWluZXIgY2xpY2tzIG9wZW4gYW5kIHRoZSBzZWFsIGJyZWFrcywgbGV0dGluZyBnYXMgb3V0 35 | LlxuWW91IGdyYWIgdGhlIG5ldXRyb24gYm9tYiBhbmQgcnVuIGFzIGZhc3QgYXMgeW91IGNhbiB0 36 | byB0aGVcbmJyaWRnZSB3aGVyZSB5b3UgbXVzdCBwbGFjZSBpdCBpbiB0aGUgcmlnaHQgc3BvdC5c 37 | blxuWW91IGJ1cnN0IG9udG8gdGhlIEJyaWRnZSB3aXRoIHRoZSBuZXRyb24gZGVzdHJ1Y3QgYm9t 38 | YlxudW5kZXIgeW91ciBhcm0gYW5kIHN1cnByaXNlIDUgR290aG9ucyB3aG8gYXJlIHRyeWluZyB0 39 | b1xudGFrZSBjb250cm9sIG9mIHRoZSBzaGlwLiAgRWFjaCBvZiB0aGVtIGhhcyBhbiBldmVuIHVn 40 | bGllclxuY2xvd24gY29zdHVtZSB0aGFuIHRoZSBsYXN0LiAgVGhleSBoYXZlbid0IHB1bGxlZCB0 41 | aGVpclxud2VhcG9ucyBvdXQgeWV0LCBhcyB0aGV5IHNlZSB0aGUgYWN0aXZlIGJvbWIgdW5kZXIg 42 | eW91clxuYXJtIGFuZCBkb24ndCB3YW50IHRvIHNldCBpdCBvZmYuXG4iCnA0MwpzYnNTJyonCmcx 43 | OQpzc2cxNQpTJ0xhc2VyIFdlYXBvbiBBcm1vcnknCnA0NApzZzE3ClMiXG5MdWNreSBmb3IgeW91 44 | IHRoZXkgbWFkZSB5b3UgbGVhcm4gR290aG9uIGluc3VsdHMgaW4gdGhlIGFjYWRlbXkuXG5Zb3Ug 45 | dGVsbCB0aGUgb25lIEdvdGhvbiBqb2tlIHlvdSBrbm93OlxuTGJoZSB6Ymd1cmUgdmYgZmIgc25n 46 | 
LCBqdXJhIGZ1ciBmdmdmIG5lYmhhcSBndXIgdWJoZnIsIGZ1ciBmdmdmIG5lYmhhcSBndXIgXG51 47 | Ymhmci5cblRoZSBHb3Rob24gc3RvcHMsIHRyaWVzIG5vdCB0byBsYXVnaCwgdGhlbiBidXN0cyBv 48 | dXQgbGF1Z2hpbmcgYW5kIGNhbid0IFxubW92ZS5cbldoaWxlIGhlJ3MgbGF1Z2hpbmcgeW91IHJ1 49 | biB1cCBhbmQgc2hvb3QgaGltIHNxdWFyZSBpbiB0aGUgaGVhZFxucHV0dGluZyBoaW0gZG93biwg 50 | dGhlbiBqdW1wIHRocm91Z2ggdGhlIFdlYXBvbiBBcm1vcnkgZG9vci5cblxuWW91IGRvIGEgZGl2 51 | ZSByb2xsIGludG8gdGhlIFdlYXBvbiBBcm1vcnksIGNyb3VjaCBhbmQgc2NhbiB0aGUgcm9vbVxu 52 | Zm9yIG1vcmUgR290aG9ucyB0aGF0IG1pZ2h0IGJlIGhpZGluZy4gIEl0J3MgZGVhZCBxdWlldCwg 53 | dG9vIHF1aWV0LlxuWW91IHN0YW5kIHVwIGFuZCBydW4gdG8gdGhlIGZhciBzaWRlIG9mIHRoZSBy 54 | b29tIGFuZCBmaW5kIHRoZVxubmV1dHJvbiBib21iIGluIGl0cyBjb250YWluZXIuICBUaGVyZSdz 55 | IGEga2V5cGFkIGxvY2sgb24gdGhlIGJveFxuYW5kIHlvdSBuZWVkIHRoZSBjb2RlIHRvIGdldCB0 56 | aGUgYm9tYiBvdXQuICBJZiB5b3UgZ2V0IHRoZSBjb2RlXG53cm9uZyAxMCB0aW1lcyB0aGVuIHRo 57 | ZSBsb2NrIGNsb3NlcyBmb3JldmVyIGFuZCB5b3UgY2FuJ3RcbmdldCB0aGUgYm9tYi4gIFRoZSBj 58 | b2RlIGlzIDMgZGlnaXRzLiAoaGludDogdGhlIGZpcnN0IHR3byBkaWdpdHMgYXJlIDEzKVxuIgpw 59 | NDUKc2JzUycyJwpnMTkKc3NnMTUKUydDZW50cmFsIENvcnJpZG9yJwpwNDYKc2cxNwpTIlxuVGhl 60 | IEdvdGhvbnMgb2YgUGxhbmV0IFBlcmNhbCAjMjUgaGF2ZSBpbnZhZGVkIHlvdXIgc2hpcCBhbmQg 61 | ZGVzdHJveWVkXG55b3VyIGVudGlyZSBjcmV3LiAgWW91IGFyZSB0aGUgbGFzdCBzdXJ2aXZpbmcg 62 | bWVtYmVyIGFuZCB5b3VyIGxhc3Rcbm1pc3Npb24gaXMgdG8gZ2V0IHRoZSBuZXV0cm9uIGRlc3Ry 63 | dWN0IGJvbWIgZnJvbSB0aGUgV2VhcG9ucyBBcm1vcnksXG5wdXQgaXQgaW4gdGhlIGJyaWRnZSwg 64 | YW5kIGJsb3cgdGhlIHNoaXAgdXAgYWZ0ZXIgZ2V0dGluZyBpbnRvIGFuIFxuZXNjYXBlIHBvZC5c 65 | blxuWW91J3JlIHJ1bm5pbmcgZG93biB0aGUgY2VudHJhbCBjb3JyaWRvciB0byB0aGUgV2VhcG9u 66 | cyBBcm1vcnkgd2hlblxuYSBHb3Rob24ganVtcHMgb3V0LCByZWQgc2NhbHkgc2tpbiwgZGFyayBn 67 | cmlteSB0ZWV0aCwgYW5kIGV2aWwgY2xvd24gY29zdHVtZVxuZmxvd2luZyBhcm91bmQgaGlzIGhh 68 | dGUgZmlsbGVkIGJvZHkuICBIZSdzIGJsb2NraW5nIHRoZSBkb29yIHRvIHRoZVxuQXJtb3J5IGFu 69 | ZCBhYm91dCB0byBwdWxsIGEgd2VhcG9uIHRvIGJsYXN0IHlvdS4gWW91IGNhbiBlaXRoZXI6IDEp 70 | IGVhdCBoaW0sIFxuMikga2ljayBoaW0gaW4gdGhlIG51dHMsIG9yIDMpIGJsYXN0IGhpbSBmaXJz 71 | dC5cbiIKcDQ3CnNic1Mnc2Vzc2lvbl9pZCcKcDQ4ClMnNmFkYmUyMDQ4OGEzZmZkMDA0MGFiYzRh 72 | YzA2OTkxZDFkNzljOTdkMCcKcDQ5CnMu 73 | -------------------------------------------------------------------------------- /tutorials/algorithms/notebooks/Lesson1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Lesson 1: Case Study" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "def naive(a,b):\n", 19 | " x = a\n", 20 | " y = b\n", 21 | " z = 0\n", 22 | " while x > 0:\n", 23 | " z = z + y\n", 24 | " x = x - 1\n", 25 | " return z" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 8, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "516\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "print(naive(43,12))" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 9, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "17.0\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "print(102/6)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 12, 69 | "metadata": { 70 | "collapsed": false 71 | }, 72 | "outputs": [ 
73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "24\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "import math\n", 83 | "\n", 84 | "print(math.factorial(4))" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "#### Russian Peasants Algorithm\n", 92 | "(Ancient Egyption Multiplication)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 7, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "def russian(a,b):\n", 104 | " x = a\n", 105 | " y = b\n", 106 | " z = 0\n", 107 | " while x > 0:\n", 108 | " if x % 2 == 1: z = z + y # if x is odd add y to z\n", 109 | " y = y << 1\n", 110 | " x = x >> 1\n", 111 | " return z\n", 112 | " " 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 17, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "140\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "print(russian(20,7))" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "binary numbers:\n", 139 | "1(2$^0$), 2(2$^1$), 4(2$^2$), 8(2$^3$), 16(2$^4$), 32(2$^5$), 64(2$^6$), 128(2$^7$), 256(2$^8$) \n", 140 | "\n" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 16, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "256\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "print(2**8)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 19, 165 | "metadata": { 166 | "collapsed": false 167 | }, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "24.5\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "print(int(49)/int(2))" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 21, 184 | "metadata": { 185 | "collapsed": false 186 | }, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "1\n", 193 | "None\n", 194 | "9\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "import math\n", 200 | "\n", 201 | "def time(n):\n", 202 | " \"\"\" Return the number of steps \n", 203 | " necessary to calculate\n", 204 | " `print countdown(n)`\"\"\"\n", 205 | " steps = 0\n", 206 | " \n", 207 | " if n >= 10: steps = math.ceil(n/5.0)*2 + 3\n", 208 | " if (n < 10) & (n > 5) : steps = 7.0\n", 209 | " if (n <= 5) : steps = 5.0\n", 210 | " \n", 211 | " # answer.\n", 212 | " #steps = 3 + 2 * math.ceil(n/5.0)\n", 213 | " \n", 214 | " return steps\n", 215 | "\n", 216 | "def countdown(x):\n", 217 | " y = 0\n", 218 | " while x > 0:\n", 219 | " x = x - 5\n", 220 | " y = y + 1\n", 221 | " print (y)\n", 222 | "\n", 223 | "print (countdown(5))\n", 224 | "print (time(12))" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": { 231 | "collapsed": true 232 | }, 233 | "outputs": [], 234 | "source": [] 235 | } 236 | ], 237 | "metadata": { 238 | "kernelspec": { 239 | "display_name": "Python [Root]", 240 | "language": "python", 241 | "name": "Python [Root]" 242 | }, 243 | "language_info": { 244 | "codemirror_mode": { 245 | "name": "ipython", 246 | "version": 3 247 | }, 248 | "file_extension": ".py", 249 | "mimetype": "text/x-python", 250 | "name": "python", 251 | "nbconvert_exporter": "python", 
252 | "pygments_lexer": "ipython3", 253 | "version": "3.5.2" 254 | } 255 | }, 256 | "nbformat": 4, 257 | "nbformat_minor": 0 258 | } 259 | -------------------------------------------------------------------------------- /tutorials/algorithms/notebooks/.ipynb_checkpoints/Lesson1-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Lesson 1: Case Study" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "def naive(a,b):\n", 19 | " x = a\n", 20 | " y = b\n", 21 | " z = 0\n", 22 | " while x > 0:\n", 23 | " z = z + y\n", 24 | " x = x - 1\n", 25 | " return z" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 8, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "516\n" 40 | ] 41 | } 42 | ], 43 | "source": [ 44 | "print(naive(43,12))" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 9, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "17.0\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "print(102/6)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 12, 69 | "metadata": { 70 | "collapsed": false 71 | }, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "24\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "import math\n", 83 | "\n", 84 | "print(math.factorial(4))" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "#### Russian Peasants Algorithm\n", 92 | "(Ancient Egyption Multiplication)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 7, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "def russian(a,b):\n", 104 | " x = a\n", 105 | " y = b\n", 106 | " z = 0\n", 107 | " while x > 0:\n", 108 | " if x % 2 == 1: z = z + y # if x is odd add y to z\n", 109 | " y = y << 1\n", 110 | " x = x >> 1\n", 111 | " return z\n", 112 | " " 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 17, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "140\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "print(russian(20,7))" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "binary numbers:\n", 139 | "1(2$^0$), 2(2$^1$), 4(2$^2$), 8(2$^3$), 16(2$^4$), 32(2$^5$), 64(2$^6$), 128(2$^7$), 256(2$^8$) \n", 140 | "\n" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 16, 146 | "metadata": { 147 | "collapsed": false 148 | }, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "256\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "print(2**8)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 19, 165 | "metadata": { 166 | "collapsed": false 167 | }, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "24.5\n" 174 | ] 175 | } 176 | ], 177 | "source": [ 178 | "print(int(49)/int(2))" 179 | ] 180 | }, 181 | { 182 | 
"cell_type": "code", 183 | "execution_count": 21, 184 | "metadata": { 185 | "collapsed": false 186 | }, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "1\n", 193 | "None\n", 194 | "9\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "import math\n", 200 | "\n", 201 | "def time(n):\n", 202 | " \"\"\" Return the number of steps \n", 203 | " necessary to calculate\n", 204 | " `print countdown(n)`\"\"\"\n", 205 | " steps = 0\n", 206 | " \n", 207 | " if n >= 10: steps = math.ceil(n/5.0)*2 + 3\n", 208 | " if (n < 10) & (n > 5) : steps = 7.0\n", 209 | " if (n <= 5) : steps = 5.0\n", 210 | " \n", 211 | " # answer.\n", 212 | " #steps = 3 + 2 * math.ceil(n/5.0)\n", 213 | " \n", 214 | " return steps\n", 215 | "\n", 216 | "def countdown(x):\n", 217 | " y = 0\n", 218 | " while x > 0:\n", 219 | " x = x - 5\n", 220 | " y = y + 1\n", 221 | " print (y)\n", 222 | "\n", 223 | "print (countdown(5))\n", 224 | "print (time(12))" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": { 231 | "collapsed": true 232 | }, 233 | "outputs": [], 234 | "source": [] 235 | } 236 | ], 237 | "metadata": { 238 | "kernelspec": { 239 | "display_name": "Python [Root]", 240 | "language": "python", 241 | "name": "Python [Root]" 242 | }, 243 | "language_info": { 244 | "codemirror_mode": { 245 | "name": "ipython", 246 | "version": 3 247 | }, 248 | "file_extension": ".py", 249 | "mimetype": "text/x-python", 250 | "name": "python", 251 | "nbconvert_exporter": "python", 252 | "pygments_lexer": "ipython3", 253 | "version": "3.5.2" 254 | } 255 | }, 256 | "nbformat": 4, 257 | "nbformat_minor": 0 258 | } 259 | -------------------------------------------------------------------------------- /tutorials/026-Linear_Regression_Analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### This notebook will use modelling software to generate the model coefficients a0, a1 and a2 to investigate FICO Score and Loan Amount as predictors of Interest Rate" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 44, 13 | "metadata": { 14 | "collapsed": false 15 | }, 16 | "outputs": [ 17 | { 18 | "name": "stdout", 19 | "output_type": "stream", 20 | "text": [ 21 | "Populating the interactive namespace from numpy and matplotlib\n", 22 | "[[735]\n", 23 | " [715]]\n", 24 | "[[20000]\n", 25 | " [19200]]\n", 26 | "[[ 1.00000000e+00 7.35000000e+02 2.00000000e+04]\n", 27 | " [ 1.00000000e+00 7.15000000e+02 1.92000000e+04]\n", 28 | " [ 1.00000000e+00 6.95000000e+02 1.00000000e+04]\n", 29 | " ..., \n", 30 | " [ 1.00000000e+00 6.80000000e+02 1.00000000e+04]\n", 31 | " [ 1.00000000e+00 6.75000000e+02 6.00000000e+03]\n", 32 | " [ 1.00000000e+00 6.70000000e+02 9.00000000e+03]] [[ 735 20000]\n", 33 | " [ 715 19200]\n", 34 | " [ 695 10000]\n", 35 | " ..., \n", 36 | " [ 680 10000]\n", 37 | " [ 675 6000]\n", 38 | " [ 670 9000]]\n", 39 | "Coefficients: [ 0.7232804 -0.00087589]\n", 40 | "Intercepts: 1.97716000896e-06\n", 41 | "P-Values: [ 0.00000000e+00 0.00000000e+00 3.00521465e-98]\n", 42 | "R-Squared: 0.644760522744\n" 43 | ] 44 | }, 45 | { 46 | "name": "stderr", 47 | "output_type": "stream", 48 | "text": [ 49 | "WARNING: pylab import has clobbered these variables: ['f']\n", 50 | "`%matplotlib` prevents importing * from pylab and numpy\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "%pylab inline\n", 56 | "import pylab 
as pl\n", 57 | "import numpy as np\n", 58 | "import pandas as pd\n", 59 | "import statsmodels.api as sm\n", 60 | "\n", 61 | "# import the cleaned up dataset\n", 62 | "df = pd.read_csv('/home/sophie/projects/LendingClub/data/clean_LD.csv')\n", 63 | "\n", 64 | "intrate = df['Interest.Rate']\n", 65 | "loanamt = df['Amount.Requested']\n", 66 | "fico = df['FICO.Score']\n", 67 | "\n", 68 | "# reshape the data from a pandas Series to columns\n", 69 | "# the dependent variable\n", 70 | "# This creates a 2D array, with T turning it from (1,1867) to (1867,1)\n", 71 | "y = np.matrix(intrate).T # I think T does the same as transpose()\n", 72 | "\n", 73 | "# the independent variables shaped as columns\n", 74 | "x1 = np.matrix(fico).transpose()\n", 75 | "x2 = np.matrix(loanamt).transpose()\n", 76 | "\n", 77 | "# put the two columns together to create an input matrix\n", 78 | "# if we had n independent variables we would have n columns here\n", 79 | "x = np.column_stack([x1,x2]) # column_stack takes a sequence fo 1-D arrays and stacks them as columns.\n", 80 | "\n", 81 | "print x[0:2,0] # to access x1\n", 82 | "print x[0:2,1] # to access x2\n", 83 | "\n", 84 | "# create a linear model and fit it to the data\n", 85 | "X = sm.add_constant(x) # adds a column of 1s (the first column) to the x (2D stacked data)\n", 86 | "model = sm.OLS(y,X) # creates an ordinary least squares model. Y = response variable, X, should include an intercept.\n", 87 | "\n", 88 | "# f is a A RegressionResults class instance. The list of attributes are found \n", 89 | "# here http://statsmodels.sourceforge.net/devel/generated/statsmodels.regression.linear_model.RegressionResults.html\n", 90 | "f = model.fit() # fit is one of the methods which can be applied to an OLS object\n", 91 | "\n", 92 | "print 'Coefficients: ', f.params[0:2] # linear coefficients that minimize the least squares criterion. a1 and a2\n", 93 | "print 'Intercepts: ', f.params[2] # a0\n", 94 | "print 'P-Values: ', f.pvalues\n", 95 | "print 'R-Squared: ', f.rsquared\n", 96 | "\n" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": { 102 | "collapsed": true 103 | }, 104 | "source": [ 105 | "Coefficients: contains $a_1$ and $a_2$\n", 106 | "Intercept: is at $a_0$\n", 107 | "\n", 108 | "Next, we need to work out how reliable the numbers are. \n", 109 | "P-values are probabilities we can use to do this and to be confident we want it to be close to 0.\n", 110 | "Convention is p < 0.05. If it is more, we have less confidence using that dimension in modelling and predicting.\n", 111 | "\n", 112 | "$R^2$ : How much variance in the data is captured by the model. \n", 113 | "$R$ : coefficient of correlation between independent variables and dependent variable. How much Y depends on the seperate X's. 
Lies between -1 and 1, so $R^2$ lies between 0 and 1.\n", 114 | "We want a high $R^2$.\n", 115 | "\n", 116 | "We have created a linear multivariate regression model for Interest Rate, which is well described by the parameters above.\n" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": { 123 | "collapsed": true 124 | }, 125 | "outputs": [], 126 | "source": [] 127 | } 128 | ], 129 | "metadata": { 130 | "kernelspec": { 131 | "display_name": "Python 2", 132 | "language": "python", 133 | "name": "python2" 134 | }, 135 | "language_info": { 136 | "codemirror_mode": { 137 | "name": "ipython", 138 | "version": 2 139 | }, 140 | "file_extension": ".py", 141 | "mimetype": "text/x-python", 142 | "name": "python", 143 | "nbconvert_exporter": "python", 144 | "pygments_lexer": "ipython2", 145 | "version": "2.7.11" 146 | } 147 | }, 148 | "nbformat": 4, 149 | "nbformat_minor": 0 150 | } 151 | -------------------------------------------------------------------------------- /DSFromScratch/Chap6/064-Chap6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Data Science from Scratch\n", 8 | "\n", 9 | "#### Chapter 6: Probability" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "#### Dependence and Independence\n", 17 | "\n", 18 | "Two events *E* and *F* are independent if the probability that they both happen is the product of the probabilities that each one happens:\n", 19 | "\n", 20 | "`P(E,F) = P(E)P(F)`" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": {}, 26 | "source": [ 27 | "#### Conditional Probability\n", 28 | "\n", 29 | "We define the probability of `E` conditional on `F` as:\n", 30 | "`P(E|F) = P(E,F)/P(F)`\n", 31 | "\n", 32 | "This is the probability that E happens given that we know that `F` happens. \n", 33 | "Often rewritten as:\n", 34 | "\n", 35 | "`P(E,F) = P(E|F)P(F)`\n", 36 | "\n", 37 | "When `E` and `F` are independent this reduces to `P(E|F) = P(E)`: knowing that `F` occurred gives us no additional information about whether `E` occurred." 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "outputs": [], 56 | "source": [] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "#### Bayes's Theorem\n", 63 | "\n", 64 | "Imagine a certain disease that affects 1 in every 10,000 people. And imagine\n", 65 | "that there is a test for this disease that gives the correct result (“diseased” if you have\n", 66 | "the disease, “nondiseased” if you don’t) 99% of the time.\n", 67 | "\n", 68 | "A more intuitive way to see this is to imagine a population of 1 million people. You’d\n", 69 | "expect 100 of them to have the disease, and 99 of those 100 to test positive. On the\n", 70 | "other hand, you’d expect 999,900 of them not to have the disease, and 9,999 of those\n", 71 | "to test positive. 
Which means that" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": { 78 | "collapsed": true 79 | }, 80 | "outputs": [], 81 | "source": [] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": { 87 | "collapsed": true 88 | }, 89 | "outputs": [], 90 | "source": [] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "#### Random Variables" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": { 103 | "collapsed": true 104 | }, 105 | "outputs": [], 106 | "source": [] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "#### Continuous Distributions" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "collapsed": true 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "\n" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "collapsed": true 131 | }, 132 | "outputs": [], 133 | "source": [] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "#### The Normal Distribution" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": { 146 | "collapsed": true 147 | }, 148 | "outputs": [], 149 | "source": [] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": { 155 | "collapsed": true 156 | }, 157 | "outputs": [], 158 | "source": [] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "#### The Central Limit Theorem" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": { 171 | "collapsed": true 172 | }, 173 | "outputs": [], 174 | "source": [] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": null, 179 | "metadata": { 180 | "collapsed": true 181 | }, 182 | "outputs": [], 183 | "source": [] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": { 189 | "collapsed": true 190 | }, 191 | "outputs": [], 192 | "source": [] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": null, 206 | "metadata": { 207 | "collapsed": true 208 | }, 209 | "outputs": [], 210 | "source": [] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "collapsed": true 217 | }, 218 | "outputs": [], 219 | "source": [] 220 | } 221 | ], 222 | "metadata": { 223 | "kernelspec": { 224 | "display_name": "Python [Root]", 225 | "language": "python", 226 | "name": "Python [Root]" 227 | }, 228 | "language_info": { 229 | "codemirror_mode": { 230 | "name": "ipython", 231 | "version": 3 232 | }, 233 | "file_extension": ".py", 234 | "mimetype": "text/x-python", 235 | "name": "python", 236 | "nbconvert_exporter": "python", 237 | "pygments_lexer": "ipython3", 238 | "version": "3.5.2" 239 | } 240 | }, 241 | "nbformat": 4, 242 | "nbformat_minor": 0 243 | } 244 | -------------------------------------------------------------------------------- /windspeed/scripts/038-group_tseries.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import datetime as datetime 4 | 
import matplotlib.pyplot as plt 5 | 6 | # Creating a panel of timeseries for each group of stations. 7 | 8 | # Panel will have a timeseries of 00,06,12,18 ws if that hour has at least 14 9 | # obs per month. 10 | 11 | # An average over the group will be an extra plot in the panel. 12 | 13 | NAl=['60525Biskra','60549Mecheria','60550Elbayadh', 14 | '60555Touggourt','60559ElOued','60566Ghardaia','60580Ouargla', 15 | '60581HassiMessaoud'] 16 | 17 | CSar=['60607Timimoun','60611InAmenas','60620Adrar','60630InSalah', 18 | '62103Ghadames','62124Sebha'] 19 | 20 | WSa=['61223Tombouctou','61226Gao','61230NioroDuSahel','61498Kiffa', 21 | '61499AiounElAtrouss','61492Kaedi','61497Nema','61450Tidjika'] 22 | 23 | CSal=['61024Agadez','61045Goure','61052Niamey','64753Faya', 24 | '61017Bilma'] 25 | 26 | Egy=['62387Minya','62393Asyut','62405Luxor','62414Asswan', 27 | '62420Baharia','62423Farafra','62435Kharga'] 28 | 29 | Sud=['62600WadiHalfa','62640AbuHamed','62650Dongola','62660Karima', 30 | '62680Atbara'] 31 | 32 | 33 | stations=[NAl,CSar,WSa,CSal,Egy,Sud] 34 | #stations = [CSar, WSa] 35 | 36 | group_names={'NAlgeria':NAl,'CSahara':CSar,'WSahel':WSa,'CSahel':CSal, 37 | 'Egypt':Egy,'Sudan':Sud} 38 | 39 | group_strings=['NAlgeria','CSahara','WSahel','CSahel', 'Egypt','Sudan'] 40 | #group_strings=['CSahara','WSahel'] 41 | 42 | 43 | # Could these two functions be turned into lambda functions? 44 | # Would that be preferable or are these fine? 45 | 46 | def meanf(x): 47 | if x.count() > 10: 48 | return x.mean() 49 | 50 | def sdf(x): 51 | if x.count() > 10: 52 | return x.std() 53 | 54 | def read_file(fname): 55 | '''Given a station name, read that station's data file and return a 56 | DataFrame of windspeed observations, aggregated by year and month, with a 57 | datetime index.''' 58 | 59 | 60 | column_names=["year","month","day","hour","ws"] 61 | dtype={"year":int,"month":int,"day":int,"hour":int,"ws":float} 62 | 63 | datafile='/home/sophie/projects/windspeed/data/%s_allwinds.txt' %fname 64 | 65 | # specify the columns you want to group together. Can't include hour at 66 | # this point as it is not in the right format. 67 | date_spec = {'date_time': [0,1,2]} 68 | 69 | # when you use keep_date_col it keeps them as objects, not as the dtype you 70 | # read them in as. 71 | wind = pd.read_csv(datafile, sep=" ", names=column_names, 72 | parse_dates=date_spec, keep_date_col=True, index_col=False ) 73 | 74 | # Dealing with hour - going from 600, 1200, etc. to 6, 12, 18 75 | wind["hour"]=(wind["hour"]/100).astype(int) 76 | 77 | # combining year, month, day that were parsed together into date_time with 78 | # hour, which is now in the correct format. 79 | wind['date_time'] = pd.to_datetime(wind.date_time) + \ 80 | wind.hour.astype('timedelta64[h]') 81 | 82 | # make datetime the index before making subsections. 83 | wind.index = wind['date_time'] 84 | 85 | # Adds extra columns where the value is kept if it meets the isin() 86 | # criteria, NaN if it doesn't. 87 | wind['ws_0']= wind['ws'][wind['hour'].isin([0])] 88 | wind['ws_06']= wind['ws'][wind['hour'].isin([6])] 89 | wind['ws_12']= wind['ws'][wind['hour'].isin([12])] 90 | wind['ws_18']= wind['ws'][wind['hour'].isin([18])] 91 | 92 | group = wind.groupby(['year', 'month']) 93 | 94 | wind_group = group['ws','ws_0','ws_06','ws_12','ws_18'].agg([meanf,sdf]) 95 | 96 | return wind_group 97 | 98 | 99 | def plot_tseries(group): 100 | '''set up n+1 subplots where n is number of stations in the group. 
Fill in 101 | each plot with timeseries from each station and then a mean of all the 102 | stations. Output to a png file.''' 103 | 104 | 105 | fig = plt.figure(figsize=(10,10)) 106 | 107 | for i in range(len(group)): 108 | 109 | #just for testing, see what group we are on 110 | print(group_strings[j]) 111 | print(type(group)) 112 | 113 | #read the file in for plotting 114 | wind_group = read_file(group[i]) 115 | 116 | 117 | 118 | # Dump the month part of the index to make the xaxis less crowded 119 | wind_group.index = wind_group.index.droplevel(['month']) 120 | 121 | # fig.add_subplot(nrows, ncols, num) 122 | 123 | ax = fig.add_subplot(int((len(group)+1)/2), 2, i+1) 124 | 125 | plt.title(s=group[i], fontsize=15) 126 | 127 | # May not need the if statements if I can solve the x problem below. 128 | if len(wind_group.ws_0['meanf']) != 0: 129 | wind_group.ws_0['meanf']['1990':'1994'].plot(figsize=(8,8), c = 'm') 130 | 131 | if len(wind_group.ws_06['meanf']) != 0: 132 | wind_group.ws_06['meanf']['1990':'1994'].plot(figsize=(8,8), c ='r') 133 | 134 | if len(wind_group.ws_12['meanf']) != 0: 135 | wind_group.ws_12['meanf']['1990':'1994'].plot(figsize=(8,8), c ='b') 136 | 137 | if len(wind_group.ws_18['meanf']) != 0: 138 | wind_group.ws_18['meanf']['1990':'1994'].plot(figsize=(8,8), c='c') 139 | 140 | ax.legend(loc=4,bbox_to_anchor=(0.95, 1.05),labels 141 | = ['00','06','12','18'],prop={'size':6}) 142 | 143 | plt.tight_layout() # very nice! stops the titles overlapping 144 | fig.suptitle(group_strings[j]) 145 | fig.savefig('/home/sophie/projects/windspeed/' 146 | 'output/%s.png'%(group_strings[j]),dpi=125) 147 | 148 | if __name__ == '__main__': 149 | 150 | # x is coming as a list and we need it as just an object name. 151 | for j,x in enumerate(stations): plot_tseries(x) 152 | #plot_tseries(CSar) 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | -------------------------------------------------------------------------------- /windspeed/scripts/039-group_tseries.py: -------------------------------------------------------------------------------- 
"outputs": [], 210 | "source": [] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": { 216 | "collapsed": true 217 | }, 218 | "outputs": [], 219 | "source": [] 220 | } 221 | ], 222 | "metadata": { 223 | "kernelspec": { 224 | "display_name": "Python [Root]", 225 | "language": "python", 226 | "name": "Python [Root]" 227 | }, 228 | "language_info": { 229 | "codemirror_mode": { 230 | "name": "ipython", 231 | "version": 3 232 | }, 233 | "file_extension": ".py", 234 | "mimetype": "text/x-python", 235 | "name": "python", 236 | "nbconvert_exporter": "python", 237 | "pygments_lexer": "ipython3", 238 | "version": "3.5.2" 239 | } 240 | }, 241 | "nbformat": 4, 242 | "nbformat_minor": 0 243 | } 244 | -------------------------------------------------------------------------------- /windspeed/scripts/039-group_tseries.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import datetime as datetime 4 | import matplotlib.pyplot as plt 5 | 6 | # Creating a panel of timeseries for each group of stations. 7 | 8 | # Panel will have a timeseries of 00,06,12,18 ws if that hour has at least 14 9 | # obs per month. 10 | 11 | # An average over the group will be an extra plot in the panel. 12 | 13 | NAl=['60525Biskra','60549Mecheria','60550Elbayadh', 14 | '60555Touggourt','60559ElOued','60566Ghardaia', 15 | '60580Ouargla','60581HassiMessaoud'] 16 | 17 | 18 | CSar=['60607Timimoun','60611InAmenas','60620Adrar','60630InSalah', 19 | '62103Ghadames','62124Sebha'] 20 | 21 | WSa=['61223Tombouctou','61226Gao','61230NioroDuSahel','61498Kiffa', 22 | '61499AiounElAtrouss','61492Kaedi','61497Nema','61450Tidjika'] 23 | 24 | CSal=['61024Agadez','61045Goure','61052Niamey','64753Faya', 25 | '61017Bilma'] 26 | 27 | Egy=['62387Minya','62393Asyut','62405Luxor','62414Asswan', 28 | '62420Baharia','62423Farafra','62435Kharga'] 29 | 30 | Sud=['62600WadiHalfa','62640AbuHamed','62650Dongola','62660Karima', 31 | '62680Atbara'] 32 | 33 | 34 | stations=[NAl,CSar,WSa,CSal,Egy,Sud] 35 | #stations = [CSar, WSa] 36 | 37 | group_names={'NAlgeria':NAl,'CSahara':CSar,'WSahel':WSa,'CSahel':CSal, 38 | 'Egypt':Egy,'Sudan':Sud} 39 | 40 | group_strings=['NAlgeria','CSahara','WSahel','CSahel', 'Egypt','Sudan'] 41 | #group_strings=['CSahara','WSahel'] 42 | 43 | 44 | # Could these two functions be turned into lambda functions? 45 | # Would that be preferable or are these fine? 46 | 47 | def meanf(x): 48 | if x.count() > 10: 49 | return x.mean() 50 | 51 | def sdf(x): 52 | if x.count() > 10: 53 | return x.std() 54 | 55 | def read_file(fname): 56 | '''put the station name into read_file and read_file will return a 57 | dataFrame called wind which has the following columns a dataframe with a 58 | datetime index''' 59 | 60 | 61 | column_names=["year","month","day","hour","ws"] 62 | dtype={"year":int,"month":int,"day":int,"hour":int,"ws":float} 63 | 64 | datafile='/home/sophie/projects/windspeed/data/%s_allwinds.txt' %fname 65 | 66 | # specify the columns you want to group together. Can't include hour at 67 | # this point as it is not in the right format. 68 | date_spec = {'date_time': [0,1,2]} 69 | 70 | # when you use keep_dat_col it keeps them as objects, not as the dtype you 71 | # read them in as. 
72 | wind = pd.read_csv(datafile, sep=" ", names=column_names, 73 | parse_dates=date_spec, keep_date_col=True, index_col=False ) 74 | 75 | # Dealing with hour - going from 600, 1200 etc to 6,12, 18 76 | wind["hour"]=(wind["hour"]/100).astype(int) 77 | 78 | # combining year, month, day that were parsed together into date_time with 79 | # hour, which is now in the correct format. 80 | wind['date_time'] = pd.to_datetime(wind.date_time) + \ 81 | wind.hour.astype('timedelta64[h]') 82 | 83 | # make datetime the index before making subsections. 84 | wind.index = wind['date_time'] 85 | 86 | # Adds extra rows where value is kept if it meets isin() criteria. Nan if 87 | # it doesn't. 88 | wind['ws_0']= wind['ws'][wind['hour'].isin([0])] 89 | wind['ws_06']= wind['ws'][wind['hour'].isin([6])] 90 | wind['ws_12']= wind['ws'][wind['hour'].isin([12])] 91 | wind['ws_18']= wind['ws'][wind['hour'].isin([18])] 92 | 93 | group = wind.groupby(['year', 'month']) 94 | 95 | wind_group = group['ws','ws_0','ws_06','ws_12','ws_18'].agg([meanf,sdf]) 96 | 97 | return wind_group 98 | 99 | 100 | def plot_tseries(group): 101 | '''set up n+1 subplots where n is number of stations in the group. Fill in 102 | each plot with timeseries from each station and then a mean of all the 103 | stations. Output to file eps.''' 104 | 105 | 106 | fig = plt.figure(figsize=(10,10)) 107 | 108 | for i in range(len(group)): 109 | 110 | #just for testing, see what group we are on 111 | print(group_strings[0]) 112 | print(type(group)) 113 | 114 | #read the file in for plotting 115 | wind_group = read_file(group[i]) 116 | 117 | # check that there is data for the time period of interest 118 | assert len(wind_group['1990':'1994']) != 0, ('No data for %s in this' 119 | 'time period so no plot!'% group[i]) 120 | 121 | if len(wind_group['1990':'1994']) != 0: 122 | # Dump the month part of the index to make the xaxis less crowded 123 | wind_group.index = wind_group.index.droplevel(['month']) 124 | 125 | # fig.add_subplot(nrows, ncols, num) 126 | 127 | ax = fig.add_subplot(int((len(group)+1)/2), 2, i+1) 128 | 129 | plt.title(s=group[i], fontsize=15) 130 | 131 | # May not need the if statements if I can solve the x problem below. 132 | # No, I do, so if there are no data in that time period it will be 133 | # caught - as in Ouargla! 134 | #print(len(wind_group.ws_0['meanf'])) 135 | 136 | wind_group.ws_0['meanf']['1990':'1994'].plot(figsize=(8,8),c='m') 137 | wind_group.ws_06['meanf']['1990':'1994'].plot(figsize=(8,8), c='r') 138 | wind_group.ws_12['meanf']['1990':'1994'].plot(figsize=(8,8),c='b') 139 | wind_group.ws_18['meanf']['1990':'1994'].plot(figsize=(8,8), c='c') 140 | 141 | ax.legend(loc=4,bbox_to_anchor=(0.95, 1.05),labels 142 | = ['00','06','12','18'],prop={'size':6}) 143 | 144 | plt.tight_layout() # very nice! stops the titles overlapping 145 | fig.suptitle(group_strings[0]) 146 | fig.savefig('/home/sophie/projects/windspeed/' 147 | 'output/%s.png'%(group_strings[0]),dpi=125) 148 | 149 | if __name__ == '__main__': 150 | 151 | # x is coming as a list and we need it as just an object name. 
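    # A sketch of one way round this (my suggestion, not what the script
    # currently does): iterate over group_names instead, so each call gets
    # the station list and its label together rather than relying on a
    # module-level index into group_strings:
    # for name, grp in group_names.items():
    #     plot_tseries(grp)   # with `name` passed through for the title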
152 | #for j,x in enumerate(stations): plot_tseries(x) 153 | plot_tseries(NAl) 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | -------------------------------------------------------------------------------- /tutorials/ThinkBayes/049-Credible_intervals_cdfs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### Credible Intervals\n", 8 | "\n", 9 | "Once you have computed a posterior distribution, it is often useful to summarize the results with a single point estimate or an interval. For point estimates it is common to use the mean, median, or the value with maximum likelihood.\n", 10 | "\n", 11 | "A **credible interval** is a range of values with a given probability (here 90%) that the unknown value falls between its endpoints. \n", 12 | "\n", 13 | "To compute a **credible interval** add up the probabilities in the posterior distribution and record the values that correspond to the 5th and 95th percentiles.\n", 14 | "\n", 15 | "We can use ThinkBayes to do this, starting with a simple Percentile function:" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 6, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "def Percentile(pmf, percentage):\n", 27 | " p = percentage / 100.0\n", 28 | " total = 0\n", 29 | " for val, prob in pmf.Items():\n", 30 | " total += prob\n", 31 | " if total >= p:\n", 32 | " return val" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "Now import the locomotive suite of hypotheses so we can apply the Percentile function to it. " 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 11, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "import os\n", 51 | "import sys\n", 52 | "module_path = os.path.abspath(os.path.join('..'))\n", 53 | "if module_path not in sys.path:\n", 54 | " sys.path.append(module_path)\n", 55 | " \n", 56 | "from thinkbayes import Pmf, Suite" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 12, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "# Taken from the first \"Estimation\" tutorial\n", 68 | "class Dice(Suite): \n", 69 | " def Likelihood(self, data, hypo):\n", 70 | " if hypo < data:\n", 71 | " return 0 \n", 72 | " else:\n", 73 | " return 1.0/hypo\n", 74 | "\n", 75 | "# The likelihood function is the same in the Train as the Dice\n", 76 | "class Train(Dice):\n", 77 | " def __init__(self, hypos, alpha = 1.0): # Adding alpha to the arguments\n", 78 | " Pmf.__init__(self)\n", 79 | " for hypo in hypos:\n", 80 | " self.Set(hypo, hypo**(-alpha)) # adding in the power law here to alter the prior\n", 81 | " self.Normalize()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 13, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "hypos = range(1, 1001) # PRIOR p(H)\n", 93 | "suite = Train(hypos)\n", 94 | "\n", 95 | "for data in [60, 30, 90]:\n", 96 | " suite.Update(data)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "Now we can use the Percentile function we defined above."
104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 10, 109 | "metadata": { 110 | "collapsed": false 111 | }, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "(91, 242)\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "# To use Percentile\n", 123 | "interval = Percentile(suite, 5), Percentile(suite, 95)\n", 124 | "print (interval)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "For the locomotive problem, using a power law prior and three observed trains, the 90% credible interval is (91, 242) - (5th, 95th percentiles). This very wide range correctly suggests the massive uncertainty in how many trains there are all together. " 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "### Cumulative distribution functions\n", 139 | "\n", 140 | "In the previous section we computed percentiles by iterating through the values and probabilities in a Pmf. If we need to compute more than a few percentiles, it is more efficient to use a cumulative distribution function (Cdf).\n", 141 | "\n", 142 | "Cdfs and Pmfs are equivalent in the sense that they contain the same information about the distribution, and you can convert one to the other. The advantage of the Cdf is that you can compute percentiles more efficiently.\n", 143 | "\n", 144 | "thinkbayes provides a Cdf class that represents a cumulative distribution function. Pmf provides a method that makes the corresponding Cdf:" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 15, 150 | "metadata": { 151 | "collapsed": false 152 | }, 153 | "outputs": [ 154 | { 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "(91, 242)\n" 159 | ] 160 | } 161 | ], 162 | "source": [ 163 | "cdf = suite.MakeCdf()\n", 164 | "\n", 165 | "# Cdf provides a function named Percentile\n", 166 | "interval = cdf.Percentile(5), cdf.Percentile(95)\n", 167 | "\n", 168 | "print(interval)" 169 | ] 170 | } 171 | ], 172 | "metadata": { 173 | "kernelspec": { 174 | "display_name": "Python 2", 175 | "language": "python", 176 | "name": "python2" 177 | }, 178 | "language_info": { 179 | "codemirror_mode": { 180 | "name": "ipython", 181 | "version": 2 182 | }, 183 | "file_extension": ".py", 184 | "mimetype": "text/x-python", 185 | "name": "python", 186 | "nbconvert_exporter": "python", 187 | "pygments_lexer": "ipython2", 188 | "version": "2.7.11" 189 | } 190 | }, 191 | "nbformat": 4, 192 | "nbformat_minor": 0 193 | } 194 | --------------------------------------------------------------------------------
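To make the Pmf-versus-Cdf efficiency point from that notebook concrete, here is a small self-contained sketch, independent of thinkbayes (the function and variable names are mine): build the cumulative sums once, then answer each percentile query with a binary search instead of rescanning the whole Pmf.

import bisect

def make_cdf(pmf_items):
    # pmf_items: iterable of (value, probability) pairs; values sortable.
    values, cumprobs = [], []
    total = 0.0
    for val, prob in sorted(pmf_items):
        total += prob
        values.append(val)
        cumprobs.append(total)
    return values, cumprobs

def percentile(values, cumprobs, percentage):
    # One O(log n) binary search per query, instead of an O(n) Pmf scan.
    i = bisect.bisect_left(cumprobs, percentage / 100.0)
    return values[i]

# e.g. a fair six-sided die:
vals, cps = make_cdf((v, 1 / 6.0) for v in range(1, 7))
print(percentile(vals, cps, 5), percentile(vals, cps, 95))   # -> 1 6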
\n", 12 | "\n", 13 | "To compute a **credible interval** add up the probabilities in the posterior distribution and record the values that correspond to the 5th and 95th percentiles.\n", 14 | "\n", 15 | "We can use ThinkBayes" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 6, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "def Percentile(pmf, percentage):\n", 27 | " p = percentage / 100.0\n", 28 | " total = 0\n", 29 | " for val, prob in pmf.Items():\n", 30 | " total += prob\n", 31 | " if total >= p:\n", 32 | " return val" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "Now import the locomotive suite of hypotheses so we can apply the Percentile function to it. " 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 11, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "import os\n", 51 | "import sys\n", 52 | "module_path = os.path.abspath(os.path.join('..'))\n", 53 | "if module_path not in sys.path:\n", 54 | " sys.path.append(module_path)\n", 55 | " \n", 56 | "from thinkbayes import Pmf, Suite" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 12, 62 | "metadata": { 63 | "collapsed": true 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "# Taken from the first \"Estimation\" tutorial\n", 68 | "class Dice(Suite): \n", 69 | " def Likelihood(self, data, hypo):\n", 70 | " if hypo < data:\n", 71 | " return 0 \n", 72 | " else:\n", 73 | " return 1.0/hypo\n", 74 | "\n", 75 | "# The likelihood function is the same in the Train as the Dice\n", 76 | "class Train(Dice):\n", 77 | " def __init__(self, hypos, alpha = 1.0): # Adding alpha to the arguments\n", 78 | " Pmf.__init__(self)\n", 79 | " for hypo in hypos:\n", 80 | " self.Set(hypo, hypo**(-alpha)) # adding in the power law here to alter the prior\n", 81 | " self.Normalize()" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 13, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "hypos = range(1, 1001) # PRIOR p(H)\n", 93 | "suite = Train(hypos)\n", 94 | "\n", 95 | "for data in [60, 30, 90]:\n", 96 | " suite.Update(data)" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "Now we can use the Percentile function we defined above." 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 10, 109 | "metadata": { 110 | "collapsed": false 111 | }, 112 | "outputs": [ 113 | { 114 | "name": "stdout", 115 | "output_type": "stream", 116 | "text": [ 117 | "(91, 242)\n" 118 | ] 119 | } 120 | ], 121 | "source": [ 122 | "# To use Percentile\n", 123 | "interval = Percentile(suite, 5), Percentile(suite, 95)\n", 124 | "print (interval)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": {}, 130 | "source": [ 131 | "For the locomotive problem, using a power law prior and 3 trains, the 90% credible interval is (91, 243) - (5th ,95th). This very wide range correctly suggests the massive uncertainty in how many trains there are all together. " 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "### Cumulative distribution functions\n", 139 | "\n", 140 | "In the previous section we computed percentiles by iterating through the values and probabilities in a Pmf. 
If we need to compute more than a few percentiles, it is more efficient to use a cumulative distribution function (Cdf).\n", 141 | "\n", 142 | "Cdfs and Pmfs are equivalent in the sense that they contain the same information about the distribution, and you can convert on to the other. The advantage of the Cdf is that you can compute percentiles more efficiently.\n", 143 | "\n", 144 | "thinkbayes provides a Cdf class that represents a cumulative distribution function. Pmf provides a method that makes the corresponsing Cdf:" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 15, 150 | "metadata": { 151 | "collapsed": false 152 | }, 153 | "outputs": [ 154 | { 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "(91, 242)\n" 159 | ] 160 | } 161 | ], 162 | "source": [ 163 | "cdf = suite.MakeCdf()\n", 164 | "\n", 165 | "# Cdf provides a function named Percentile\n", 166 | "interval = cdf.Percentile(5), cdf.Percentile(95)\n", 167 | "\n", 168 | "print(interval)" 169 | ] 170 | } 171 | ], 172 | "metadata": { 173 | "kernelspec": { 174 | "display_name": "Python 3", 175 | "language": "python", 176 | "name": "python3" 177 | }, 178 | "language_info": { 179 | "codemirror_mode": { 180 | "name": "ipython", 181 | "version": 3 182 | }, 183 | "file_extension": ".py", 184 | "mimetype": "text/x-python", 185 | "name": "python", 186 | "nbconvert_exporter": "python", 187 | "pygments_lexer": "ipython3", 188 | "version": "3.5.1" 189 | } 190 | }, 191 | "nbformat": 4, 192 | "nbformat_minor": 0 193 | } 194 | -------------------------------------------------------------------------------- /windspeed/notebooks/010_1-windspeed.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 184, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | " year month day hour ws\n", 15 | "count 29372.000000 29372.000000 29372.000000 29372.000000 29372.000000\n", 16 | "mean 1997.568058 6.460813 15.689160 1192.411140 5.479576\n", 17 | "std 7.985148 3.380070 8.809184 437.121055 1.993366\n", 18 | "min 1984.000000 1.000000 1.000000 0.000000 0.077814\n", 19 | "25% 1991.000000 4.000000 8.000000 900.000000 4.090943\n", 20 | "50% 1997.000000 6.000000 16.000000 1200.000000 5.421245\n", 21 | "75% 2005.000000 9.000000 23.000000 1500.000000 6.842833\n", 22 | "max 2012.000000 12.000000 31.000000 2100.000000 15.215400\n", 23 | " year month day\n", 24 | "0 1984 3 1\n", 25 | "1 1984 3 1\n", 26 | "2 1984 3 1\n", 27 | "3 1984 3 2\n", 28 | "4 1984 3 2\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "import pandas as pd\n", 34 | "import numpy as np\n", 35 | "from datetime import datetime\n", 36 | "\n", 37 | "column_names=[\"year\",\"month\",\"day\",\"hour\",\"ws\"]\n", 38 | "dtype={\"year\":int,\"month\":int,\"day\":int,\"hour\":int,\"ws\":float}\n", 39 | "\n", 40 | "date_spec = {'date_time': [0,1,2]}\n", 41 | "\n", 42 | "datafile='/home/sophie/projects/windspeed/data/61401BirMoghrein_allwinds.txt'\n", 43 | "\n", 44 | "#using infer_datetime_format=True didn't help\n", 45 | "#when you use keep_dat_col it keeps them as objects, not as the dtype you read them in as.\n", 46 | "wind = pd.read_csv(datafile, sep=\" \", names=column_names, index_col=False ) \n", 47 | "print wind.describe()\n", 48 | "print wind[['year','month','day']][0:5]" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 185, 54 | 
"metadata": { 55 | "collapsed": false 56 | }, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "[1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998\n", 63 | " 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012]\n", 64 | "[ 3 4 5 6 7 8 9 10 11 12 1 2]\n", 65 | "[ 1 2 3 4 5 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 31 6 7\n", 66 | " 8 9 10 11 12 30]\n", 67 | "[ 600 1200 1800 900 1500 2100 0 300]\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "#checking what the unique values are in each column\n", 73 | "#A good check when you suspect discrete values\n", 74 | "for x in range(0,4): print wind[column_names[x]].unique()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 186, 80 | "metadata": { 81 | "collapsed": false 82 | }, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | "0 06\n", 89 | "1 12\n", 90 | "2 18\n", 91 | "3 06\n", 92 | "4 12\n", 93 | "Name: hour, dtype: object\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "wind[\"hour\"]=(wind[\"hour\"]/100).astype(int)\n", 99 | "wind[\"hour\"] = wind.hour.map(\"{:02}\".format)\n", 100 | "\n", 101 | "year = wind['year'].apply(str)[0:5]\n", 102 | "month = wind['month'].apply(str)[0:5]\n", 103 | "day = wind['day'].apply(str)[0:5]\n", 104 | "#hour = wind['hour'].apply(str)[0:5]; print hour\n", 105 | "hour = wind['hour'][0:5]; print hour\n" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 187, 111 | "metadata": { 112 | "collapsed": false 113 | }, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "0 1984-03-10 06:00:00\n", 120 | "1 1984-03-11 02:00:00\n", 121 | "2 1984-03-11 08:00:00\n", 122 | "3 1984-03-20 06:00:00\n", 123 | "4 1984-03-21 02:00:00\n", 124 | "dtype: datetime64[ns]\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "p = pd.to_datetime(year + month + day + hour, yearfirst=True, utc=True, format='%Y%m%d%H') ; print p" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 188, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | "0 1984-03-01 06:00:00\n", 144 | "1 1984-03-01 12:00:00\n", 145 | "2 1984-03-01 18:00:00\n", 146 | "3 1984-03-02 06:00:00\n", 147 | "4 1984-03-02 12:00:00\n", 148 | "Name: date_time, dtype: datetime64[ns]\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "#specify the columns you want to group together. Can't include hour at this point as it is not in the right format. 
\n", 154 | "date_spec = {'date_time': [0,1,2]}\n", 155 | "\n", 156 | "#when you use keep_dat_col it keeps them as objects, not as the dtype you read them in as.\n", 157 | "wind = pd.read_csv(datafile, sep=\" \", names=column_names, parse_dates=date_spec, keep_date_col=True, index_col=False ) \n", 158 | "\n", 159 | "#Dealing with hour - going from 600, 1200 etc to 6,12, 18\n", 160 | "wind[\"hour\"]=(wind[\"hour\"]/100).astype(int)\n", 161 | "\n", 162 | "#combining year, month, day that were parsed together into date_time with hour, which is now in the correct format.\n", 163 | "wind['date_time'] = pd.to_datetime(wind.date_time) + wind.hour.astype('timedelta64[h]')\n", 164 | "\n", 165 | "print wind.date_time[0:5]" 166 | ] 167 | } 168 | ], 169 | "metadata": { 170 | "kernelspec": { 171 | "display_name": "Python 2", 172 | "language": "python", 173 | "name": "python2" 174 | }, 175 | "language_info": { 176 | "codemirror_mode": { 177 | "name": "ipython", 178 | "version": 2 179 | }, 180 | "file_extension": ".py", 181 | "mimetype": "text/x-python", 182 | "name": "python", 183 | "nbconvert_exporter": "python", 184 | "pygments_lexer": "ipython2", 185 | "version": "2.7.11" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 0 190 | } 191 | -------------------------------------------------------------------------------- /windspeed/scripts/040-group_tseries.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import datetime as datetime 4 | import matplotlib.pyplot as plt 5 | 6 | # Creating a panel of timeseries for each group of stations. 7 | 8 | # Panel will have a timeseries of 00,06,12,18 ws if that hour has at least 14 9 | # obs per month. 10 | 11 | # An average over the group will be an extra plot in the panel. 12 | 13 | NAl=['60525Biskra','60549Mecheria','60550Elbayadh', 14 | '60555Touggourt','60559ElOued','60566Ghardaia', 15 | '60580Ouargla','60581HassiMessaoud'] 16 | 17 | 18 | CSar=['60607Timimoun','60611InAmenas','60620Adrar','60630InSalah', 19 | '62103Ghadames','62124Sebha'] 20 | 21 | WSa=['61223Tombouctou','61226Gao','61230NioroDuSahel','61498Kiffa', 22 | '61499AiounElAtrouss','61492Kaedi','61497Nema','61450Tidjika'] 23 | 24 | CSal=['61024Agadez','61045Goure','61052Niamey','64753Faya', 25 | '61017Bilma'] 26 | 27 | Egy=['62387Minya','62393Asyut','62405Luxor','62414Asswan', 28 | '62420Baharia','62423Farafra','62435Kharga'] 29 | 30 | Sud=['62600WadiHalfa','62640AbuHamed','62650Dongola','62660Karima', 31 | '62680Atbara'] 32 | 33 | 34 | stations=[NAl,CSar,WSa,CSal,Egy,Sud] 35 | #stations = [CSal] 36 | 37 | group_names={'NAlgeria':NAl,'CSahara':CSar,'WSahel':WSa,'CSahel':CSal, 38 | 'Egypt':Egy,'Sudan':Sud} 39 | 40 | group_strings=['NAlgeria','CSahara','WSahel','CSahel', 'Egypt','Sudan'] 41 | #group_strings=['CSahara','WSahel'] 42 | 43 | 44 | # Could these two functions be turned into lambda functions? 45 | # Would that be preferable or are these fine? 
46 | 47 | def meanf(x): 48 | if x.count() > 10: 49 | return x.mean() 50 | 51 | def sdf(x): 52 | if x.count() > 10: 53 | return x.std() 54 | 55 | def read_file(fname): 56 | '''pass a station name into read_file and it will return wind_group, a 57 | dataframe of monthly mean and standard deviation windspeeds indexed by 58 | (year, month)''' 59 | 60 | 61 | column_names=["year","month","day","hour","ws"] 62 | dtype={"year":int,"month":int,"day":int,"hour":int,"ws":float} 63 | 64 | datafile='/home/sophie/projects/windspeed/data/%s_allwinds.txt' %fname 65 | 66 | # specify the columns you want to group together. Can't include hour at 67 | # this point as it is not in the right format. 68 | date_spec = {'date_time': [0,1,2]} 69 | 70 | # when you use keep_date_col it keeps them as objects, not as the dtype you 71 | # read them in as. 72 | wind = pd.read_csv(datafile, sep=" ", names=column_names, 73 | parse_dates=date_spec, keep_date_col=True, index_col=False ) 74 | 75 | # Dealing with hour - going from 600, 1200 etc to 6,12, 18 76 | wind["hour"]=(wind["hour"]/100).astype(int) 77 | 78 | # combining year, month, day that were parsed together into date_time with 79 | # hour, which is now in the correct format. 80 | wind['date_time'] = pd.to_datetime(wind.date_time) + \ 81 | wind.hour.astype('timedelta64[h]') 82 | 83 | # make datetime the index before making subsections. 84 | wind.index = wind['date_time'] 85 | 86 | # drop date_time index. For some reason it caused a problem at Niamey if I 87 | # didn't. 88 | #wind.drop('date_time', axis=1, inplace=True) 89 | 90 | # Also a good idea to drop duplicate rows. 91 | # For this case, where the datetime object is the same it needs to be 92 | # dropped, otherwise it doesn't let you add more columns, as in 93 | # wind['ws_0'] etc. below 94 | wind.drop_duplicates(['date_time'],inplace=True) 95 | 96 | # Adds extra columns where the value is kept if it meets isin() criteria. 97 | # Nan if it doesn't. 98 | wind['ws_0']= wind['ws'][wind['hour'].isin([0])] 99 | wind['ws_06']= wind['ws'][wind['hour'].isin([6])] 100 | wind['ws_12']= wind['ws'][wind['hour'].isin([12])] 101 | wind['ws_18']= wind['ws'][wind['hour'].isin([18])] 102 | 103 | group = wind.groupby(['year', 'month']) 104 | 105 | wind_group = group['ws','ws_0','ws_06','ws_12','ws_18'].agg([meanf,sdf]) 106 | 107 | return wind_group 108 | 109 | 110 | def plot_tseries(group): 111 | '''set up n+1 subplots where n is number of stations in the group. Fill in 112 | each plot with timeseries from each station and then a mean of all the 113 | stations.
Output to file eps.''' 114 | 115 | 116 | fig = plt.figure(figsize=(10,10)) 117 | 118 | for i in range(len(group)): 119 | 120 | # just for testing, see what group we are on 121 | print(group_strings[j]) 122 | print(type(group)) 123 | print(group[i]) 124 | 125 | # read in one station from the group, read_file will create a group by 126 | # object ready for plotting 127 | wind_group = read_file(group[i]) 128 | 129 | # check that there is data for the time period of interest 130 | #assert len(wind_group['1990':'1994']) != 0, ('No data for %s in this ' 131 | # 'time period so no plot!'% group[i]) 132 | 133 | if len(wind_group['1990':'1994']) != 0: 134 | # Dump the month part of the index to make the xaxis less crowded 135 | wind_group.index = wind_group.index.droplevel(['month']) 136 | 137 | # fig.add_subplot(nrows, ncols, num) 138 | 139 | ax = fig.add_subplot(int((len(group)+1)/2), 2, i+1) 140 | 141 | plt.title(s=group[i], fontsize=15) 142 | 143 | # May not need the if statements if I can solve the x problem below. 144 | # No, I do, so if there are no data in that time period it will be 145 | # caught - as in Ouargla! 146 | #print(len(wind_group.ws_0['meanf'])) 147 | 148 | wind_group.ws_0['meanf']['1990':'1994'].plot(figsize=(8,8),c='m') 149 | wind_group.ws_06['meanf']['1990':'1994'].plot(figsize=(8,8), c='r') 150 | wind_group.ws_12['meanf']['1990':'1994'].plot(figsize=(8,8),c='b') 151 | wind_group.ws_18['meanf']['1990':'1994'].plot(figsize=(8,8), c='c') 152 | 153 | ax.legend(loc=4,bbox_to_anchor=(0.95, 1.05),labels 154 | = ['00','06','12','18'],prop={'size':6}) 155 | 156 | plt.tight_layout() # very nice! stops the titles overlapping 157 | fig.suptitle(group_strings[j]) 158 | fig.savefig('/home/sophie/projects/windspeed/' 159 | 'output/%s.png'%(group_strings[j]),dpi=125) 160 | 161 | if __name__ == '__main__': 162 | 163 | # x is coming as a list and we need it as just an object name. 164 | for j,x in enumerate(stations): plot_tseries(x) 165 | #plot_tseries(NAl) 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | -------------------------------------------------------------------------------- /tutorials/ThinkBayes/043-Distributions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Computational Statistics\n", 8 | "\n", 9 | "### Distributions" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "Count the number of times each word appears in a sequence" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": { 23 | "collapsed": false 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "# This tells Python of that additional module import path. \n", 28 | "import os\n", 29 | "import sys\n", 30 | "module_path = os.path.abspath(os.path.join('..'))\n", 31 | "if module_path not in sys.path:\n", 32 | " sys.path.append(module_path)\n", 33 | " \n", 34 | "from thinkbayes import Pmf # Probability mass function\n", 35 | "\n", 36 | "# Creates an instance of class Pmf (pmf) to represent the distribution of outcomes for a six-sided die:\n", 37 | "# class Pmf inherits from _DictWrapper (an object which contains a dictionary)\n", 38 | "pmf = Pmf()\n", 39 | "\n", 40 | "# Set --> {1: 1/6.0, 2:1/6.0, 3:1/6.0....}\n", 41 | "for x in [1,2,3,4,5,6]:\n", 42 | " pmf.Set(x,1/6.0) # Set function is within the _DictWrapper class. So pmf inherits it." 
43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": { 49 | "collapsed": false 50 | }, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "dict_keys([1, 2, 3, 4, 5, 6])\n", 57 | "dict_items([(1, 0.16666666666666666), (2, 0.16666666666666666), (3, 0.16666666666666666), (4, 0.16666666666666666), (5, 0.16666666666666666), (6, 0.16666666666666666)])\n", 58 | "\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "# How to access values from pmf? You need to use the right methods. i.e.\n", 64 | "# \n", 65 | "print (pmf.Values()) # just gives the keys. \n", 66 | "print (pmf.Items()) # gives the key: value pairs in the dictionary\n", 67 | "print (pmf.Render()) # create items for plotting" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 20, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "# help(pmf) # A list of the available classes and methods." 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "#### The Cookie Problem" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 4, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "pmf = Pmf()\n", 97 | "\n", 98 | "# Hypothesis B1 and B2 (Bowl 1 and Bowl 2).\n", 99 | "# This is the prior distribution (contains the priors for each hypothesis)\n", 100 | "pmf.Set('Bowl 1', 0.5) # p(B1)\n", 101 | "pmf.Set('Bowl 2', 0.5) # p(B2)\n" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "To update the distribution based on new data (vanilla cookie) we multiply each prior by the corresponding likelihood.\n", 109 | "Now we have new data - A vanilla cookie! - we can update each of B1 and B2, i.e. determining p(B1|D) and p(B2|D). 
\n", 110 | "So for B1, this would be: \n", 111 | " p(B1|D) = prior\\*Prob of Vanilla from B1/ Prob of Vanilla from either bowl \n", 112 | " p(B1|D) = p(B1)\\*p(D|B1)/p(D) \n", 113 | " p(B1) = 1/2 (there are two bowls) \n", 114 | " p(D|B1) = 3/4 (ratio is 30:10 vanilla to choc) \n", 115 | " p(D) = 5/8 (80 cookies altogether in both bowls, 50 are vanilla) \n", 116 | " So: \n", 117 | " posterior = (1/2*3/2)/(5/8) \n", 118 | " \n", 119 | " \n", 120 | " p(B2|D) = prior*Prob of Vanilla from B1/ Prob of Vanilla from either bowl\n", 121 | "\n", 122 | "The likelihood of drawing a vanilla cookie from Bowl 1 is 3/4 and Bowl 2 is 1/2.\n" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 5, 128 | "metadata": { 129 | "collapsed": true 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "# Mult get the probability for the given hypothesis and multiplies by the given likelihood\n", 134 | "pmf.Mult('Bowl 1', 0.75)\n", 135 | "pmf.Mult('Bowl 2', 0.5)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "After this update, the distribution is no longer normalized, but because these hypotheses are mutally exclusive and collectively exhaustive, we can renormalize:" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 6, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/plain": [ 155 | "0.625" 156 | ] 157 | }, 158 | "execution_count": 6, 159 | "metadata": {}, 160 | "output_type": "execute_result" 161 | } 162 | ], 163 | "source": [ 164 | "pmf.Normalize()" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": { 170 | "collapsed": true 171 | }, 172 | "source": [ 173 | "The result is a distribution that contains the posterior probability for each hypothesis, now called the POSTERIOR DISTRIBUTION" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 7, 179 | "metadata": { 180 | "collapsed": false 181 | }, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "0.6000000000000001\n", 188 | "dict_keys(['Bowl 1', 'Bowl 2'])\n" 189 | ] 190 | } 191 | ], 192 | "source": [ 193 | "# Get the posterior probability for Bowl 1.\n", 194 | "print (pmf.Prob('Bowl 1'))\n", 195 | "\n", 196 | "print (pmf.Values())" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "collapsed": true 204 | }, 205 | "outputs": [], 206 | "source": [] 207 | } 208 | ], 209 | "metadata": { 210 | "kernelspec": { 211 | "display_name": "Python 3", 212 | "language": "python", 213 | "name": "python3" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 3 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython3", 225 | "version": "3.5.1" 226 | } 227 | }, 228 | "nbformat": 4, 229 | "nbformat_minor": 0 230 | } 231 | -------------------------------------------------------------------------------- /tutorials/Samsung/notebooks/031-Samsung_cleanup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 26, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "Populating the interactive namespace from numpy 
and matplotlib\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "# Changed the order in which things are done, from the previous workbook 030-Samsung_cleanup, as it does make a difference\n", 20 | "# for later commands. \n", 21 | "\n", 22 | "%pylab inline\n", 23 | "import pandas as pd\n", 24 | "\n", 25 | "# copy 2 is just a copy of features.\n", 26 | "df = pd.read_csv('/home/sophie/projects/Samsung/data/UCI_HAR_Dataset/UCI_HAR_Dataset/features_copy2.txt',sep=\" \",\n", 27 | " names = ['name'], dtype='str')\n", 28 | "\n", 29 | "# First will drop duplicates\n", 30 | "df.drop_duplicates(['name'],inplace=True)\n", 31 | "\n", 32 | "# remove numbers, brackets, \"-\" and \",\" from all columns\n", 33 | "\n", 34 | "df.name = df.name.str.replace('[()]', '') # remove brackets\n", 35 | "df.name = df.name.str.replace('[0-9]','') # remove any numbers\n", 36 | "\n", 37 | "df.drop_duplicates(['name'],inplace=True)\n" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 27, 43 | "metadata": { 44 | "collapsed": false 45 | }, 46 | "outputs": [ 47 | { 48 | "name": "stdout", 49 | "output_type": "stream", 50 | "text": [ 51 | " name\n", 52 | "345 fBodyAccJerk-mean-X\n", 53 | "346 fBodyAccJerk-mean-Y\n", 54 | "347 fBodyAccJerk-mean-Z\n", 55 | "348 fBodyAccJerk-std-X\n", 56 | "349 fBodyAccJerk-std-Y\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "# Print out lines that contain \"Jerk\" and \"Mean\" to look for reasons why I may have dumped them\n", 62 | "print df[df.name.str.contains('f.*Jerk')][0:5]\n" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 41, 68 | "metadata": { 69 | "collapsed": false 70 | }, 71 | "outputs": [ 72 | { 73 | "ename": "AttributeError", 74 | "evalue": "'str' object has no attribute 'str'", 75 | "output_type": "error", 76 | "traceback": [ 77 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 78 | "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", 79 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 2\u001b[0m \u001b[1;31m# conflicting. If I can't do it this way, can just use their list.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;31m# Had to keep \"-\" in in order to only remove columns with -X, -Y, -Z\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m \u001b[0mdf\u001b[0m\u001b[1;33m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mname\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mstr\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcontains\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'-X|-Y|-Z|min|max|mad|sma|iqr|entropy|energy|band|Coeff'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m==\u001b[0m \u001b[0mFalse\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 5\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 6\u001b[0m \u001b[1;32mprint\u001b[0m \u001b[0mlen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;31m# How many are left now?\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 80 | "\u001b[1;31mAttributeError\u001b[0m: 'str' object has no attribute 'str'" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "# Dropping the lines i'm confident we definitely don't need. The documentation is actually a bit confusing and \n", 86 | "# conflicting. If I can't do it this way, can just use their list. 
\n", 87 | "# Had to keep \"-\" in in order to only remove columns with -X, -Y, -Z\n", 88 | "df= df.name[df.name.str.contains('-X|-Y|-Z|min|max|mad|sma|iqr|entropy|energy|band|Coeff') == False]\n", 89 | "\n", 90 | "print len(df) # How many are left now?" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 37, 96 | "metadata": { 97 | "collapsed": false 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "# Now to get rid of Body and Mag and change mean to Mean and std to SD. \n", 102 | "# Can also remove \"-\" and \",\"\n", 103 | "\n", 104 | "df = df.str.replace('Body', '')\n", 105 | "df = df.str.replace('Mag', '')\n", 106 | "df = df.str.replace('mean', 'Mean')\n", 107 | "df = df.str.replace('std', 'SD')\n", 108 | "df = df.str.replace('-', '')\n", 109 | "df = df.str.replace(',', '')" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 2, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "35\n" 124 | ] 125 | } 126 | ], 127 | "source": [ 128 | "b = [\"tAccMean\", \"tAccSD\", \"tJerkMean\", \"tJerkSD\",\"tGyroMean\", \"tGyroSD\", \"tGyroJerkMean\", \"tGyroJerkSD\",\n", 129 | "\"fAccMean\", \"fAccSD\", \"fJerkMean\", \"fJerkSD\",\n", 130 | "\"fGyroMean\", \"fGyroSD\", \"fGyroJerkMean\", \"fGyroJerkSD\",\n", 131 | "\"fGyroMeanFreq\", \"fGyroJerkMeanFreq\", \"fAccMeanFreq\", \"fJerkMeanFreq\",\n", 132 | "\"fAccSkewness\", \"fAccKurtosis\", \"fJerkSkewness\", \"fJerkKurtosis\",\n", 133 | "\"fGyroSkewness\", \"fGyroKurtosis\", \"fGyroJerkSkewness\", \"fGyroJerkKurtosis\",\n", 134 | "\"angleAccGravity\", \"angleJerkGravity\", \"angleGyroGravity\", \"angleGyroJerkGravity\",\n", 135 | "\"angleXGravity\", \"angleYGravity\", \"angleZGravity\"]\n", 136 | "\n", 137 | "print len(b) # Not sure why this is 31, when the documentation says there were 37 in the end.\n" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": { 144 | "collapsed": true 145 | }, 146 | "outputs": [], 147 | "source": [] 148 | } 149 | ], 150 | "metadata": { 151 | "kernelspec": { 152 | "display_name": "Python 2", 153 | "language": "python", 154 | "name": "python2" 155 | }, 156 | "language_info": { 157 | "codemirror_mode": { 158 | "name": "ipython", 159 | "version": 2 160 | }, 161 | "file_extension": ".py", 162 | "mimetype": "text/x-python", 163 | "name": "python", 164 | "nbconvert_exporter": "python", 165 | "pygments_lexer": "ipython2", 166 | "version": "2.7.11" 167 | } 168 | }, 169 | "nbformat": 4, 170 | "nbformat_minor": 0 171 | } 172 | -------------------------------------------------------------------------------- /tutorials/ThinkBayes/.ipynb_checkpoints/043-Distributions-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Computational Statistics\n", 8 | "\n", 9 | "### Distributions" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "Count the number of times each word appears in a sequence" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": { 23 | "collapsed": false 24 | }, 25 | "outputs": [], 26 | "source": [ 27 | "# This tells Python of that additional module import path. 
\n", 28 | "import os\n", 29 | "import sys\n", 30 | "module_path = os.path.abspath(os.path.join('..'))\n", 31 | "if module_path not in sys.path:\n", 32 | " sys.path.append(module_path)\n", 33 | " \n", 34 | "from thinkbayes import Pmf # Probability mass function\n", 35 | "\n", 36 | "# Creates an instance of class Pmf (pmf) to represent the distribution of outcomes for a six-sided die:\n", 37 | "# class Pmf inherits from _DictWrapper (an object which contains a dictionary)\n", 38 | "pmf = Pmf()\n", 39 | "\n", 40 | "# Set --> {1: 1/6.0, 2:1/6.0, 3:1/6.0....}\n", 41 | "for x in [1,2,3,4,5,6]:\n", 42 | " pmf.Set(x,1/6.0) # Set function is within the _DictWrapper class. So pmf inherits it." 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 | "execution_count": 3, 48 | "metadata": { 49 | "collapsed": false 50 | }, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "dict_keys([1, 2, 3, 4, 5, 6])\n", 57 | "dict_items([(1, 0.16666666666666666), (2, 0.16666666666666666), (3, 0.16666666666666666), (4, 0.16666666666666666), (5, 0.16666666666666666), (6, 0.16666666666666666)])\n", 58 | "\n" 59 | ] 60 | } 61 | ], 62 | "source": [ 63 | "# How to access values from pmf? You need to use the right methods. i.e.\n", 64 | "# \n", 65 | "print (pmf.Values()) # just gives the keys. \n", 66 | "print (pmf.Items()) # gives the key: value pairs in the dictionary\n", 67 | "print (pmf.Render()) # create items for plotting" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 20, 73 | "metadata": { 74 | "collapsed": false 75 | }, 76 | "outputs": [], 77 | "source": [ 78 | "# help(pmf) # A list of the available classes and methods." 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "#### The Cookie Problem" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 4, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "pmf = Pmf()\n", 97 | "\n", 98 | "# Hypothesis B1 and B2 (Bowl 1 and Bowl 2).\n", 99 | "# This is the prior distribution (contains the priors for each hypothesis)\n", 100 | "pmf.Set('Bowl 1', 0.5) # p(B1)\n", 101 | "pmf.Set('Bowl 2', 0.5) # p(B2)\n" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "To update the distribution based on new data (vanilla cookie) we multiply each prior by the corresponding likelihood.\n", 109 | "Now we have new data - A vanilla cookie! - we can update each of B1 and B2, i.e. determining p(B1|D) and p(B2|D). 
\n", 110 | "So for B1, this would be: \n", 111 | " p(B1|D) = prior\\*Prob of Vanilla from B1/ Prob of Vanilla from either bowl \n", 112 | " p(B1|D) = p(B1)\\*p(D|B1)/p(D) \n", 113 | " p(B1) = 1/2 (there are two bowls) \n", 114 | " p(D|B1) = 3/4 (ratio is 30:10 vanilla to choc) \n", 115 | " p(D) = 5/8 (80 cookies altogether in both bowls, 50 are vanilla) \n", 116 | " So: \n", 117 | " posterior = (1/2*3/2)/(5/8) \n", 118 | " \n", 119 | " \n", 120 | " p(B2|D) = prior*Prob of Vanilla from B1/ Prob of Vanilla from either bowl\n", 121 | "\n", 122 | "The likelihood of drawing a vanilla cookie from Bowl 1 is 3/4 and Bowl 2 is 1/2.\n" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 5, 128 | "metadata": { 129 | "collapsed": true 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "# Mult get the probability for the given hypothesis and multiplies by the given likelihood\n", 134 | "pmf.Mult('Bowl 1', 0.75)\n", 135 | "pmf.Mult('Bowl 2', 0.5)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "After this update, the distribution is no longer normalized, but because these hypotheses are mutally exclusive and collectively exhaustive, we can renormalize:" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 6, 148 | "metadata": { 149 | "collapsed": false 150 | }, 151 | "outputs": [ 152 | { 153 | "data": { 154 | "text/plain": [ 155 | "0.625" 156 | ] 157 | }, 158 | "execution_count": 6, 159 | "metadata": {}, 160 | "output_type": "execute_result" 161 | } 162 | ], 163 | "source": [ 164 | "pmf.Normalize()" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": { 170 | "collapsed": true 171 | }, 172 | "source": [ 173 | "The result is a distribution that contains the posterior probability for each hypothesis, now called the POSTERIOR DISTRIBUTION" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 7, 179 | "metadata": { 180 | "collapsed": false 181 | }, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "0.6000000000000001\n", 188 | "dict_keys(['Bowl 1', 'Bowl 2'])\n" 189 | ] 190 | } 191 | ], 192 | "source": [ 193 | "# Get the posterior probability for Bowl 1.\n", 194 | "print (pmf.Prob('Bowl 1'))\n", 195 | "\n", 196 | "print (pmf.Values())" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": null, 202 | "metadata": { 203 | "collapsed": true 204 | }, 205 | "outputs": [], 206 | "source": [] 207 | } 208 | ], 209 | "metadata": { 210 | "kernelspec": { 211 | "display_name": "Python 3", 212 | "language": "python", 213 | "name": "python3" 214 | }, 215 | "language_info": { 216 | "codemirror_mode": { 217 | "name": "ipython", 218 | "version": 3 219 | }, 220 | "file_extension": ".py", 221 | "mimetype": "text/x-python", 222 | "name": "python", 223 | "nbconvert_exporter": "python", 224 | "pygments_lexer": "ipython3", 225 | "version": "3.5.1" 226 | } 227 | }, 228 | "nbformat": 4, 229 | "nbformat_minor": 0 230 | } 231 | --------------------------------------------------------------------------------