├── Machine Learning ├── naive_bayes │ ├── vsm.png │ ├── terms.png │ ├── terms2.png │ └── callibration.png ├── clustering │ └── WineKMC.xlsx ├── logistic_regression │ ├── images │ │ ├── bias.png │ │ ├── data.png │ │ ├── knn1.png │ │ ├── knn2.png │ │ ├── linreg.png │ │ ├── linsep.png │ │ ├── pcanim.gif │ │ ├── reshape.jpg │ │ ├── sklearn2.jpg │ │ ├── train-cv2.png │ │ ├── train-cv3.png │ │ ├── onelinesplit.png │ │ ├── sklearntrans.jpg │ │ ├── train-test.png │ │ ├── train-validate-test.png │ │ ├── complexity-error-plot.png │ │ ├── complexity-error-reg.png │ │ ├── train-validate-test3.png │ │ └── train-validate-test-cont.png │ └── .gitignore └── linear_regression │ ├── images │ ├── shuttle.png │ ├── cs109gitflow3.png │ └── conditionalmean.png │ └── .gitignore ├── Capstone Project ├── Final Report │ ├── Report.pdf │ └── Slide Deck.pdf ├── Capstone Project Proposal.pdf ├── Data │ └── notes.txt ├── Notebooks │ └── Classification_Baseline.ipynb └── .ipynb_checkpoints │ ├── Classification_Baseline-checkpoint.ipynb │ └── Classification_Adding_Shots-checkpoint.ipynb ├── Data Wrangling ├── data_wrangling_json │ ├── .DS_Store │ ├── data │ │ ├── .DS_Store │ │ └── world_bank_projects_less.json │ └── .ipynb_checkpoints │ │ └── sliderule_dsi_xml_exercise-checkpoint.ipynb └── data_wrangling_xml │ └── data_wrangling_xml │ ├── .DS_Store │ ├── sliderule_dsi_xml_exercise.ipynb │ └── .ipynb_checkpoints │ └── sliderule_dsi_xml_exercise-checkpoint.ipynb ├── Inferential Statistics ├── statistics project 1 │ ├── .DS_Store │ ├── data │ │ ├── .DS_Store │ │ └── human_body_temperature.csv │ └── .ipynb_checkpoints │ │ ├── sliderule_dsi_inferential_statistics_exercise_1-checkpoint.ipynb │ │ └── sliderule_dsi_inferential_statistics_exercise_2-checkpoint.ipynb ├── statistics project 2 │ ├── .DS_Store │ ├── data │ │ ├── .DS_Store │ │ └── us_job_market_discrimination.dta │ ├── .ipynb_checkpoints │ │ ├── sliderule_dsi_inferential_statistics_exercise_1-checkpoint.ipynb │ │ └── sliderule_dsi_inferential_statistics_exercise_2-checkpoint.ipynb │ └── sliderule_dsi_inferential_statistics_exercise_2.ipynb └── statistics project 3 │ ├── .DS_Store │ ├── data │ └── .DS_Store │ └── .ipynb_checkpoints │ ├── sliderule_dsi_inferential_statistics_exercise_1-checkpoint.ipynb │ └── sliderule_dsi_inferential_statistics_exercise_2-checkpoint.ipynb └── README.md /Machine Learning/naive_bayes/vsm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/naive_bayes/vsm.png -------------------------------------------------------------------------------- /Capstone Project/Final Report/Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Capstone Project/Final Report/Report.pdf -------------------------------------------------------------------------------- /Machine Learning/clustering/WineKMC.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/clustering/WineKMC.xlsx -------------------------------------------------------------------------------- /Machine Learning/naive_bayes/terms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/naive_bayes/terms.png -------------------------------------------------------------------------------- 
/Machine Learning/naive_bayes/terms2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/naive_bayes/terms2.png -------------------------------------------------------------------------------- /Capstone Project/Final Report/Slide Deck.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Capstone Project/Final Report/Slide Deck.pdf -------------------------------------------------------------------------------- /Data Wrangling/data_wrangling_json/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Data Wrangling/data_wrangling_json/.DS_Store -------------------------------------------------------------------------------- /Machine Learning/naive_bayes/callibration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/naive_bayes/callibration.png -------------------------------------------------------------------------------- /Capstone Project/Capstone Project Proposal.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Capstone Project/Capstone Project Proposal.pdf -------------------------------------------------------------------------------- /Data Wrangling/data_wrangling_json/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Data Wrangling/data_wrangling_json/data/.DS_Store -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/bias.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/bias.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/data.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/knn1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/knn1.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/knn2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/knn2.png -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 1/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Inferential Statistics/statistics project 1/.DS_Store -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 
2/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Inferential Statistics/statistics project 2/.DS_Store -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 3/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Inferential Statistics/statistics project 3/.DS_Store -------------------------------------------------------------------------------- /Machine Learning/linear_regression/images/shuttle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/linear_regression/images/shuttle.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/linreg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/linreg.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/linsep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/linsep.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/pcanim.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/pcanim.gif -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/reshape.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/reshape.jpg -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/sklearn2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/sklearn2.jpg -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/train-cv2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/train-cv2.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/train-cv3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/train-cv3.png -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 1/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Inferential Statistics/statistics project 1/data/.DS_Store 
-------------------------------------------------------------------------------- /Inferential Statistics/statistics project 2/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Inferential Statistics/statistics project 2/data/.DS_Store -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 3/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Inferential Statistics/statistics project 3/data/.DS_Store -------------------------------------------------------------------------------- /Machine Learning/linear_regression/images/cs109gitflow3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/linear_regression/images/cs109gitflow3.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/onelinesplit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/onelinesplit.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/sklearntrans.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/sklearntrans.jpg -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/train-test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/train-test.png -------------------------------------------------------------------------------- /Data Wrangling/data_wrangling_xml/data_wrangling_xml/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Data Wrangling/data_wrangling_xml/data_wrangling_xml/.DS_Store -------------------------------------------------------------------------------- /Machine Learning/linear_regression/images/conditionalmean.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/linear_regression/images/conditionalmean.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/train-validate-test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/train-validate-test.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/complexity-error-plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/complexity-error-plot.png -------------------------------------------------------------------------------- /Machine 
Learning/logistic_regression/images/complexity-error-reg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/complexity-error-reg.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/train-validate-test3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/train-validate-test3.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/train-validate-test-cont.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/train-validate-test-cont.png -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 2/data/us_job_market_discrimination.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Inferential Statistics/statistics project 2/data/us_job_market_discrimination.dta -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | #Ipython 60 | .ipynb_checkpoints/ 61 | # Created by .ignore support plugin (hsz.mobi) 62 | ### OSX template 63 | .DS_Store 64 | .AppleDouble 65 | .LSOverride 66 | 67 | # Icon must end with two \r 68 | Icon 69 | 70 | # Thumbnails 71 | ._* 72 | 73 | # Files that might appear in the root of a volume 74 | .DocumentRevisions-V100 75 | .fseventsd 76 | .Spotlight-V100 77 | .TemporaryItems 78 | .Trashes 79 | .VolumeIcon.icns 80 | 81 | # Directories potentially created on remote AFP share 82 | .AppleDB 83 | .AppleDesktop 84 | Network Trash Folder 85 | Temporary Items 86 | .apdisk 87 | 88 | #Temporary data 89 | tempdata/ 90 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Springboard 2 | 3 | Projects done as a part of Springboard's Data Science Intensive curriculum. 
4 | 5 | ### Capstone Project: Football, Goals and Machine Learning 6 | An attempt to model the highly unpredictable English Premier League and predict the results of each match. 7 | 8 | ### Data Story 9 | Do home teams really have an advantage in football? Is this advantage shrinking in the English Premier League? How predictable are football leagues anyway? Data to the rescue! 10 | 11 | ### Data Wrangling 12 | Practice cleaning up messy data using pandas - XML, JSON, raw text and working with databases. 13 | 14 | ### Inferential Statistics 15 | Useful inferential statistics for drawing conclusions and predicting outcomes. 16 | Contains three miniprojects: 17 | * Human Body Temperature - hypothesis testing, confidence intervals, and statistical significance 18 | * Examining Racial Discrimination - does race have a significant impact on the rate of callbacks? 19 | * Reducing Hospital Readmissions - statistical analysis to reduce readmissions to hospitals. 20 | 21 | ### Machine Learning 22 | Exploring various machine learning models, their advantages and limitations. 23 | Contains the following miniprojects: 24 | * Boston House Pricing - predicting housing prices in Boston using linear regression 25 | * Heights and Weights - using logistic regression to classify gender 26 | * Predicting Movie Ratings - using the Naive Bayes algorithm to predict movie ratings from their reviews 27 | * Customer Segmentation - applying k-means clustering and associated evaluation metrics to partitioning problems 28 | -------------------------------------------------------------------------------- /Machine Learning/linear_regression/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | #Ipython 60 | .ipynb_checkpoints/ 61 | # Created by .ignore support plugin (hsz.mobi) 62 | ### OSX template 63 | .DS_Store 64 | .AppleDouble 65 | .LSOverride 66 | 67 | # Icon must end with two \r 68 | Icon 69 | 70 | # Thumbnails 71 | ._* 72 | 73 | # Files that might appear in the root of a volume 74 | .DocumentRevisions-V100 75 | .fseventsd 76 | .Spotlight-V100 77 | .TemporaryItems 78 | .Trashes 79 | .VolumeIcon.icns 80 | 81 | # Directories potentially created on remote AFP share 82 | .AppleDB 83 | .AppleDesktop 84 | Network Trash Folder 85 | Temporary Items 86 | .apdisk 87 | 88 | #Temporary data 89 | hw1/tempdata/ 90 | hw1/.ipynb_checkpoints/ 91 | 92 | -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 1/data/human_body_temperature.csv: -------------------------------------------------------------------------------- 1 | temperature,gender,heart_rate 2 | 99.3,F,68.0 3 | 98.4,F,81.0 4 | 97.8,M,73.0 5 | 99.2,F,66.0 6 | 98.0,F,73.0 7 | 99.2,M,83.0 8 | 98.0,M,71.0 9 | 98.8,M,78.0 10 | 98.4,F,84.0 11 | 98.6,F,86.0 12 | 98.8,F,89.0 13 | 96.7,F,62.0 14 | 98.2,M,72.0 15 | 98.7,F,79.0 16 | 97.8,F,77.0 17 | 98.8,F,83.0 18 | 98.3,F,79.0 19 | 98.2,M,64.0 20 | 97.2,F,68.0 21 | 99.4,M,70.0 22 | 98.3,F,78.0 23 | 98.2,M,71.0 24 | 98.6,M,70.0 25 | 98.4,M,68.0 26 | 97.8,M,65.0 27 | 98.0,F,87.0 28 | 97.8,F,62.0 29 | 98.2,F,69.0 30 | 98.4,F,73.0 31 | 98.1,M,67.0 32 | 98.3,M,86.0 33 | 97.6,F,61.0 34 | 98.5,M,71.0 35 | 98.6,M,82.0 36 | 99.3,M,63.0 37 | 99.5,M,75.0 38 | 99.1,M,71.0 39 | 98.3,M,72.0 40 | 97.9,F,79.0 41 | 96.4,F,69.0 42 | 98.4,F,79.0 43 | 98.4,M,82.0 44 | 96.9,M,74.0 45 | 97.2,M,64.0 46 | 99.0,F,79.0 47 | 97.9,F,69.0 48 | 97.4,M,72.0 49 | 97.4,M,68.0 50 | 97.9,M,76.0 51 | 97.1,M,82.0 52 | 98.9,F,76.0 53 | 98.3,F,80.0 54 | 98.5,F,83.0 55 | 98.6,M,78.0 56 | 98.2,F,73.0 57 | 98.6,F,82.0 58 | 98.8,F,70.0 59 | 98.2,M,66.0 60 | 98.2,F,65.0 61 | 97.6,M,73.0 62 | 99.1,F,80.0 63 | 98.4,M,84.0 64 | 98.2,F,57.0 65 | 98.6,M,83.0 66 | 98.7,F,65.0 67 | 97.4,M,70.0 68 | 97.4,F,57.0 69 | 98.6,M,77.0 70 | 98.7,F,82.0 71 | 98.9,M,80.0 72 | 98.1,F,81.0 73 | 97.7,F,61.0 74 | 98.0,M,78.0 75 | 98.8,M,81.0 76 | 99.0,M,75.0 77 | 98.8,M,78.0 78 | 98.0,F,76.0 79 | 98.4,M,70.0 80 | 97.4,M,78.0 81 | 97.6,M,74.0 82 | 98.8,F,73.0 83 | 98.0,M,67.0 84 | 97.5,M,70.0 85 | 99.2,F,77.0 86 | 98.6,F,85.0 87 | 97.1,M,75.0 88 | 98.6,F,77.0 89 | 98.0,M,78.0 90 | 98.7,M,73.0 91 | 98.1,M,73.0 92 | 97.8,M,74.0 93 | 100.0,F,78.0 94 | 98.8,F,84.0 95 | 97.1,M,73.0 96 | 97.8,M,58.0 97 | 96.8,F,75.0 98 | 99.9,F,79.0 99 | 98.7,F,64.0 100 | 98.8,F,64.0 101 | 98.0,M,74.0 102 | 99.0,M,81.0 103 | 98.5,M,68.0 104 | 98.0,F,78.0 105 | 99.4,F,77.0 106 | 97.6,M,69.0 107 | 96.7,M,71.0 108 | 97.0,M,80.0 109 | 98.6,M,66.0 110 | 98.7,F,72.0 111 | 97.3,M,69.0 112 | 98.8,F,69.0 113 | 98.0,F,89.0 114 | 98.2,F,64.0 115 | 99.1,F,74.0 116 | 99.0,M,79.0 117 | 98.0,M,64.0 118 | 100.8,F,77.0 119 | 97.8,F,71.0 120 | 98.7,M,78.0 121 | 98.4,F,74.0 122 | 97.7,F,84.0 123 | 97.9,F,68.0 124 | 99.0,F,81.0 125 | 97.2,F,66.0 126 | 97.5,M,75.0 127 | 96.3,M,70.0 128 | 
97.7,M,77.0 129 | 98.2,F,73.0 130 | 97.9,M,72.0 131 | 98.7,F,59.0 132 | -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 1/.ipynb_checkpoints/sliderule_dsi_inferential_statistics_exercise_1-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## What is the true normal human body temperature? \n", 8 | "\n", 9 | "#### Background\n", 10 | "\n", 11 | "The mean normal body temperature was held to be 37$^{\\circ}$C or 98.6$^{\\circ}$F for more than 120 years since it was first conceptualized and reported by Carl Wunderlich in a famous 1868 book. In 1992, this value was revised to 36.8$^{\\circ}$C or 98.2$^{\\circ}$F. \n", 12 | "\n", 13 | "#### Exercise\n", 14 | "In this exercise, you will analyze a dataset of human body temperatures and employ the concepts of hypothesis testing, confidence intervals, and statistical significance to answer the following questions:\n", 15 | "\n", 16 | "1. Is the distribution of body temperatures normal? \n", 17 | "2. Is the true population mean really 98.6 degrees F?\n", 18 | "3. At what temperature should we consider someone's temperature to be \"abnormal\"?\n", 19 | "4. Is there a significant difference between males and females in normal temperature?\n", 20 | "\n", 21 | "\n", 22 | "\n", 23 | "#### Resources\n", 24 | "\n", 25 | "+ Information and data sources: http://www.amstat.org/publications/jse/datasets/normtemp.txt, http://www.amstat.org/publications/jse/jse_data_archive.htm\n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 58, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "import pandas as pd" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 62, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "df = pd.read_csv('data/human_body_temperature.csv')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "source": [ 56 | "# Exercise\n", 57 | "\n", 58 | "Answer the following questions in this notebook and submit to your Github account. \n", 59 | "\n", 60 | "1. Is the distribution of body temperatures normal? \n", 61 | " - Remember that this is a condition for the CLT, and hence the statistical tests we are using, to apply. \n", 62 | "2. Is the true population mean really 98.6 degrees F?\n", 63 | " - Bring out the one sample hypothesis test! In this situation, is it approriate to apply a z-test or a t-test? How will the result be different?\n", 64 | "3. At what temperature should we consider someone's temperature to be \"abnormal\"?\n", 65 | " - Start by computing the margin of error and confidence interval.\n", 66 | "4. Is there a significant difference between males and females in normal temperature?\n", 67 | " - Set up and solve for a two sample hypothesis testing." 
68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": true 75 | }, 76 | "outputs": [], 77 | "source": [] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 2", 83 | "language": "python", 84 | "name": "python2" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 2 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython2", 96 | "version": "2.7.9" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 0 101 | } 102 | -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 2/.ipynb_checkpoints/sliderule_dsi_inferential_statistics_exercise_1-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## What is the true normal human body temperature? \n", 8 | "\n", 9 | "#### Background\n", 10 | "\n", 11 | "The mean normal body temperature was held to be 37$^{\\circ}$C or 98.6$^{\\circ}$F for more than 120 years since it was first conceptualized and reported by Carl Wunderlich in a famous 1868 book. In 1992, this value was revised to 36.8$^{\\circ}$C or 98.2$^{\\circ}$F. \n", 12 | "\n", 13 | "#### Exercise\n", 14 | "In this exercise, you will analyze a dataset of human body temperatures and employ the concepts of hypothesis testing, confidence intervals, and statistical significance to answer the following questions:\n", 15 | "\n", 16 | "1. Is the distribution of body temperatures normal? \n", 17 | "2. Is the true population mean really 98.6 degrees F?\n", 18 | "3. At what temperature should we consider someone's temperature to be \"abnormal\"?\n", 19 | "4. Is there a significant difference between males and females in normal temperature?\n", 20 | "\n", 21 | "\n", 22 | "\n", 23 | "#### Resources\n", 24 | "\n", 25 | "+ Information and data sources: http://www.amstat.org/publications/jse/datasets/normtemp.txt, http://www.amstat.org/publications/jse/jse_data_archive.htm\n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 58, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "import pandas as pd" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 62, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "df = pd.read_csv('data/human_body_temperature.csv')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "source": [ 56 | "# Exercise\n", 57 | "\n", 58 | "Answer the following questions in this notebook and submit to your Github account. \n", 59 | "\n", 60 | "1. Is the distribution of body temperatures normal? \n", 61 | " - Remember that this is a condition for the CLT, and hence the statistical tests we are using, to apply. \n", 62 | "2. Is the true population mean really 98.6 degrees F?\n", 63 | " - Bring out the one sample hypothesis test! In this situation, is it approriate to apply a z-test or a t-test? How will the result be different?\n", 64 | "3. At what temperature should we consider someone's temperature to be \"abnormal\"?\n", 65 | " - Start by computing the margin of error and confidence interval.\n", 66 | "4. 
Is there a significant difference between males and females in normal temperature?\n", 67 | " - Set up and solve for a two sample hypothesis testing." 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": true 75 | }, 76 | "outputs": [], 77 | "source": [] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 2", 83 | "language": "python", 84 | "name": "python2" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 2 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython2", 96 | "version": "2.7.9" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 0 101 | } 102 | -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 3/.ipynb_checkpoints/sliderule_dsi_inferential_statistics_exercise_1-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## What is the true normal human body temperature? \n", 8 | "\n", 9 | "#### Background\n", 10 | "\n", 11 | "The mean normal body temperature was held to be 37$^{\\circ}$C or 98.6$^{\\circ}$F for more than 120 years since it was first conceptualized and reported by Carl Wunderlich in a famous 1868 book. In 1992, this value was revised to 36.8$^{\\circ}$C or 98.2$^{\\circ}$F. \n", 12 | "\n", 13 | "#### Exercise\n", 14 | "In this exercise, you will analyze a dataset of human body temperatures and employ the concepts of hypothesis testing, confidence intervals, and statistical significance to answer the following questions:\n", 15 | "\n", 16 | "1. Is the distribution of body temperatures normal? \n", 17 | "2. Is the true population mean really 98.6 degrees F?\n", 18 | "3. At what temperature should we consider someone's temperature to be \"abnormal\"?\n", 19 | "4. Is there a significant difference between males and females in normal temperature?\n", 20 | "\n", 21 | "\n", 22 | "\n", 23 | "#### Resources\n", 24 | "\n", 25 | "+ Information and data sources: http://www.amstat.org/publications/jse/datasets/normtemp.txt, http://www.amstat.org/publications/jse/jse_data_archive.htm\n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 58, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "import pandas as pd" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 62, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "df = pd.read_csv('data/human_body_temperature.csv')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "source": [ 56 | "# Exercise\n", 57 | "\n", 58 | "Answer the following questions in this notebook and submit to your Github account. \n", 59 | "\n", 60 | "1. Is the distribution of body temperatures normal? \n", 61 | " - Remember that this is a condition for the CLT, and hence the statistical tests we are using, to apply. \n", 62 | "2. Is the true population mean really 98.6 degrees F?\n", 63 | " - Bring out the one sample hypothesis test! In this situation, is it approriate to apply a z-test or a t-test? How will the result be different?\n", 64 | "3. 
At what temperature should we consider someone's temperature to be \"abnormal\"?\n", 65 | " - Start by computing the margin of error and confidence interval.\n", 66 | "4. Is there a significant difference between males and females in normal temperature?\n", 67 | " - Set up and solve for a two sample hypothesis testing." 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": true 75 | }, 76 | "outputs": [], 77 | "source": [] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 2", 83 | "language": "python", 84 | "name": "python2" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 2 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython2", 96 | "version": "2.7.9" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 0 101 | } 102 | -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 1/.ipynb_checkpoints/sliderule_dsi_inferential_statistics_exercise_2-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "### Examining racial discrimination in the US job market\n", 9 | "\n", 10 | "#### Background\n", 11 | "Racial discrimination continues to be pervasive in cultures throughout the world. Researchers examined the level of racial discrimination in the United States labor market by randomly assigning identical résumés black-sounding or white-sounding names and observing the impact on requests for interviews from employers.\n", 12 | "\n", 13 | "#### Data\n", 14 | "In the dataset provided, each row represents a resume. The 'race' column has two values, 'b' and 'w', indicating black-sounding and white-sounding. 
The column 'call' has two values, 1 and 0, indicating whether the resume received a call from employers or not.\n", 15 | "\n", 16 | "Note that the 'b' and 'w' values in race are assigned randomly to the resumes.\n", 17 | "\n", 18 | "#### Exercise\n", 19 | "Perform a statistical analysis to establish whether race has a significant impact on the rate of callbacks for resumes.\n", 20 | "\n", 21 | "\n", 22 | "#### Resources\n", 23 | "+ Experiment information and data source: http://www.povertyactionlab.org/evaluation/discrimination-job-market-united-states\n", 24 | "+ Scipy statistical methods: http://docs.scipy.org/doc/scipy/reference/stats.html " 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "****" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import pandas as pd\n", 43 | "import numpy as np\n", 44 | "from scipy import stats" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "data = pd.io.stata.read_stata('data/us_job_market_discrimination.dta')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "157.0" 69 | ] 70 | }, 71 | "execution_count": 4, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "# number of callbacks for balck-sounding names\n", 78 | "sum(data[data.race=='b'].call)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": { 84 | "collapsed": true 85 | }, 86 | "source": [ 87 | "****\n", 88 | "\n", 89 | "# Exercise\n", 90 | "\n", 91 | " 1. What test is appropriate for this problem? Does CLT apply?\n", 92 | " 2. What are the null and alternate hypotheses?\n", 93 | " 3. Compute margin of error, confidence interval, and p-value.\n", 94 | " 4. 
Discuss statistical significance.\n", 95 | " \n", 96 | "You can include written notes in notebook cells using Markdown: \n", 97 | " - In the control panel at the top, choose Cell > Cell Type > Markdown\n", 98 | " - Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 99 | " \n", 100 | "****" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [] 111 | } 112 | ], 113 | "metadata": { 114 | "kernelspec": { 115 | "display_name": "Python 2", 116 | "language": "python", 117 | "name": "python2" 118 | }, 119 | "language_info": { 120 | "codemirror_mode": { 121 | "name": "ipython", 122 | "version": 2 123 | }, 124 | "file_extension": ".py", 125 | "mimetype": "text/x-python", 126 | "name": "python", 127 | "nbconvert_exporter": "python", 128 | "pygments_lexer": "ipython2", 129 | "version": "2.7.9" 130 | } 131 | }, 132 | "nbformat": 4, 133 | "nbformat_minor": 0 134 | } 135 | -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 3/.ipynb_checkpoints/sliderule_dsi_inferential_statistics_exercise_2-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "### Examining racial discrimination in the US job market\n", 9 | "\n", 10 | "#### Background\n", 11 | "Racial discrimination continues to be pervasive in cultures throughout the world. Researchers examined the level of racial discrimination in the United States labor market by randomly assigning identical résumés black-sounding or white-sounding names and observing the impact on requests for interviews from employers.\n", 12 | "\n", 13 | "#### Data\n", 14 | "In the dataset provided, each row represents a resume. The 'race' column has two values, 'b' and 'w', indicating black-sounding and white-sounding. 
The column 'call' has two values, 1 and 0, indicating whether the resume received a call from employers or not.\n", 15 | "\n", 16 | "Note that the 'b' and 'w' values in race are assigned randomly to the resumes.\n", 17 | "\n", 18 | "#### Exercise\n", 19 | "Perform a statistical analysis to establish whether race has a significant impact on the rate of callbacks for resumes.\n", 20 | "\n", 21 | "\n", 22 | "#### Resources\n", 23 | "+ Experiment information and data source: http://www.povertyactionlab.org/evaluation/discrimination-job-market-united-states\n", 24 | "+ Scipy statistical methods: http://docs.scipy.org/doc/scipy/reference/stats.html " 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "****" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import pandas as pd\n", 43 | "import numpy as np\n", 44 | "from scipy import stats" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "data = pd.io.stata.read_stata('data/us_job_market_discrimination.dta')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "157.0" 69 | ] 70 | }, 71 | "execution_count": 4, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "# number of callbacks for balck-sounding names\n", 78 | "sum(data[data.race=='b'].call)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": { 84 | "collapsed": true 85 | }, 86 | "source": [ 87 | "****\n", 88 | "\n", 89 | "# Exercise\n", 90 | "\n", 91 | " 1. What test is appropriate for this problem? Does CLT apply?\n", 92 | " 2. What are the null and alternate hypotheses?\n", 93 | " 3. Compute margin of error, confidence interval, and p-value.\n", 94 | " 4. 
Discuss statistical significance.\n", 95 | " \n", 96 | "You can include written notes in notebook cells using Markdown: \n", 97 | " - In the control panel at the top, choose Cell > Cell Type > Markdown\n", 98 | " - Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 99 | " \n", 100 | "****" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [] 111 | } 112 | ], 113 | "metadata": { 114 | "kernelspec": { 115 | "display_name": "Python 2", 116 | "language": "python", 117 | "name": "python2" 118 | }, 119 | "language_info": { 120 | "codemirror_mode": { 121 | "name": "ipython", 122 | "version": 2 123 | }, 124 | "file_extension": ".py", 125 | "mimetype": "text/x-python", 126 | "name": "python", 127 | "nbconvert_exporter": "python", 128 | "pygments_lexer": "ipython2", 129 | "version": "2.7.9" 130 | } 131 | }, 132 | "nbformat": 4, 133 | "nbformat_minor": 0 134 | } 135 | -------------------------------------------------------------------------------- /Data Wrangling/data_wrangling_json/.ipynb_checkpoints/sliderule_dsi_xml_exercise-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# XML example and exercise\n", 8 | "****\n", 9 | "+ study examples of accessing nodes in XML tree structure \n", 10 | "+ work on exercise to be completed and submitted\n", 11 | "****\n", 12 | "+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html\n", 13 | "+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial\n", 14 | "****" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "from xml.etree import ElementTree as ET" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## XML example\n", 33 | "\n", 34 | "+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": { 41 | "collapsed": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "document_tree = ET.parse( './data/mondial_database_less.xml' )" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": { 52 | "collapsed": false 53 | }, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "Albania\n", 60 | "Greece\n", 61 | "Macedonia\n", 62 | "Serbia\n", 63 | "Montenegro\n", 64 | "Kosovo\n", 65 | "Andorra\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "# print names of all countries\n", 71 | "for child in document_tree.getroot():\n", 72 | " print child.find('name').text" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [ 82 | { 83 | "name": "stdout", 84 | "output_type": "stream", 85 | "text": [ 86 | "* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë\n", 87 | "* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes\n", 88 | "* Macedonia: Skopje, Kumanovo\n", 89 | "* Serbia: Beograd, Novi Sad, Niš\n", 90 | "* 
Montenegro: Podgorica\n", 91 | "* Kosovo: Prishtine\n", 92 | "* Andorra: Andorra la Vella\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "# print names of all countries and their cities\n", 98 | "for element in document_tree.iterfind('country'):\n", 99 | " print '* ' + element.find('name').text + ':',\n", 100 | " capitals_string = ''\n", 101 | " for subelement in element.getiterator('city'):\n", 102 | " capitals_string += subelement.find('name').text + ', '\n", 103 | " print capitals_string[:-2]" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "****\n", 111 | "## XML exercise\n", 112 | "\n", 113 | "Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find\n", 114 | "\n", 115 | "1. 10 countries with the lowest infant mortality rates\n", 116 | "2. 10 cities with the largest population\n", 117 | "3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)\n", 118 | "4. name and country of a) longest river, b) largest lake and c) airport at highest elevation" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 6, 124 | "metadata": { 125 | "collapsed": true 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "document = ET.parse( './data/mondial_database.xml' )" 130 | ] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": { 135 | "display_name": "Python 2", 136 | "language": "python", 137 | "name": "python2" 138 | }, 139 | "language_info": { 140 | "codemirror_mode": { 141 | "name": "ipython", 142 | "version": 2 143 | }, 144 | "file_extension": ".py", 145 | "mimetype": "text/x-python", 146 | "name": "python", 147 | "nbconvert_exporter": "python", 148 | "pygments_lexer": "ipython2", 149 | "version": "2.7.9" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 0 154 | } 155 | -------------------------------------------------------------------------------- /Capstone Project/Data/notes.txt: -------------------------------------------------------------------------------- 1 | Notes for Football Data 2 | 3 | All data is in csv format, ready for use within standard spreadsheet applications. Please note that some abbreviations are no longer in use (in particular odds from specific bookmakers no longer used) and refer to data collected in earlier seasons. 
For a current list of what bookmakers are included in the dataset please visit http://www.football-data.co.uk/matches.php 4 | 5 | Key to results data: 6 | 7 | Div = League Division 8 | Date = Match Date (dd/mm/yy) 9 | HomeTeam = Home Team 10 | AwayTeam = Away Team 11 | FTHG = Full Time Home Team Goals 12 | FTAG = Full Time Away Team Goals 13 | FTR = Full Time Result (H=Home Win, D=Draw, A=Away Win) 14 | HTHG = Half Time Home Team Goals 15 | HTAG = Half Time Away Team Goals 16 | HTR = Half Time Result (H=Home Win, D=Draw, A=Away Win) 17 | 18 | Match Statistics (where available) 19 | Attendance = Crowd Attendance 20 | Referee = Match Referee 21 | HS = Home Team Shots 22 | AS = Away Team Shots 23 | HST = Home Team Shots on Target 24 | AST = Away Team Shots on Target 25 | HHW = Home Team Hit Woodwork 26 | AHW = Away Team Hit Woodwork 27 | HC = Home Team Corners 28 | AC = Away Team Corners 29 | HF = Home Team Fouls Committed 30 | AF = Away Team Fouls Committed 31 | HO = Home Team Offsides 32 | AO = Away Team Offsides 33 | HY = Home Team Yellow Cards 34 | AY = Away Team Yellow Cards 35 | HR = Home Team Red Cards 36 | AR = Away Team Red Cards 37 | HBP = Home Team Bookings Points (10 = yellow, 25 = red) 38 | ABP = Away Team Bookings Points (10 = yellow, 25 = red) 39 | 40 | Key to 1X2 (match) betting odds data: 41 | 42 | B365H = Bet365 home win odds 43 | B365D = Bet365 draw odds 44 | B365A = Bet365 away win odds 45 | BSH = Blue Square home win odds 46 | BSD = Blue Square draw odds 47 | BSA = Blue Square away win odds 48 | BWH = Bet&Win home win odds 49 | BWD = Bet&Win draw odds 50 | BWA = Bet&Win away win odds 51 | GBH = Gamebookers home win odds 52 | GBD = Gamebookers draw odds 53 | GBA = Gamebookers away win odds 54 | IWH = Interwetten home win odds 55 | IWD = Interwetten draw odds 56 | IWA = Interwetten away win odds 57 | LBH = Ladbrokes home win odds 58 | LBD = Ladbrokes draw odds 59 | LBA = Ladbrokes away win odds 60 | PSH = Pinnacle home win odds 61 | PSD = Pinnacle draw odds 62 | PSA = Pinnacle away win odds 63 | SOH = Sporting Odds home win odds 64 | SOD = Sporting Odds draw odds 65 | SOA = Sporting Odds away win odds 66 | SBH = Sportingbet home win odds 67 | SBD = Sportingbet draw odds 68 | SBA = Sportingbet away win odds 69 | SJH = Stan James home win odds 70 | SJD = Stan James draw odds 71 | SJA = Stan James away win odds 72 | SYH = Stanleybet home win odds 73 | SYD = Stanleybet draw odds 74 | SYA = Stanleybet away win odds 75 | VCH = VC Bet home win odds 76 | VCD = VC Bet draw odds 77 | VCA = VC Bet away win odds 78 | WHH = William Hill home win odds 79 | WHD = William Hill draw odds 80 | WHA = William Hill away win odds 81 | 82 | Bb1X2 = Number of BetBrain bookmakers used to calculate match odds averages and maximums 83 | BbMxH = Betbrain maximum home win odds 84 | BbAvH = Betbrain average home win odds 85 | BbMxD = Betbrain maximum draw odds 86 | BbAvD = Betbrain average draw win odds 87 | BbMxA = Betbrain maximum away win odds 88 | BbAvA = Betbrain average away win odds 89 | 90 | 91 | 92 | Key to total goals betting odds: 93 | 94 | BbOU = Number of BetBrain bookmakers used to calculate over/under 2.5 goals (total goals) averages and maximums 95 | BbMx>2.5 = Betbrain maximum over 2.5 goals 96 | BbAv>2.5 = Betbrain average over 2.5 goals 97 | BbMx<2.5 = Betbrain maximum under 2.5 goals 98 | BbAv<2.5 = Betbrain average under 2.5 goals 99 | 100 | GB>2.5 = Gamebookers over 2.5 goals 101 | GB<2.5 = Gamebookers under 2.5 goals 102 | B365>2.5 = Bet365 over 2.5 goals 103 | B365<2.5 = Bet365 
under 2.5 goals 104 | 105 | 106 | Key to Asian handicap betting odds: 107 | 108 | BbAH = Number of BetBrain bookmakers used to calculate Asian handicap averages and maximums 109 | BbAHh = Betbrain size of handicap (home team) 110 | BbMxAHH = Betbrain maximum Asian handicap home team odds 111 | BbAvAHH = Betbrain average Asian handicap home team odds 112 | BbMxAHA = Betbrain maximum Asian handicap away team odds 113 | BbAvAHA = Betbrain average Asian handicap away team odds 114 | 115 | GBAHH = Gamebookers Asian handicap home team odds 116 | GBAHA = Gamebookers Asian handicap away team odds 117 | GBAH = Gamebookers size of handicap (home team) 118 | LBAHH = Ladbrokes Asian handicap home team odds 119 | LBAHA = Ladbrokes Asian handicap away team odds 120 | LBAH = Ladbrokes size of handicap (home team) 121 | B365AHH = Bet365 Asian handicap home team odds 122 | B365AHA = Bet365 Asian handicap away team odds 123 | B365AH = Bet365 size of handicap (home team) 124 | 125 | 126 | Closing odds (last odds before match starts) 127 | 128 | PSCH = Pinnacle closing home win odds 129 | PSCD = Pinnacle closing draw odds 130 | PSCA = Pinnacle closing away win odds 131 | 132 | Football-Data would like to acknowledge the following sources which have been utilised in the compilation of Football-Data's results and odds files. 133 | 134 | Historical results: 135 | International Soccer Server - http://sunsite.tut.fi/rec/riku/soccer.html 136 | European Football - http://www.eurofootball.be/ 137 | RSSSF Archive - http://www.rsssf.com/ 138 | 139 | Current results (full time, half time) 140 | TBWSport - http://www.tbwsport.com 141 | Livescore - http://www.livescore.com 142 | 143 | Match statistics 144 | Sportinglife, ESPN Soccer, Bundesliga.de, Gazzetta.it and Football.fr 145 | 146 | Bookmakers betting odds 147 | Betbrain - http://www.betbrain.com 148 | Betbase - http://www.betbase.info 149 | 150 | Betting odds for weekend games are collected Friday afternoons, and on Tuesday afternoons for midweek games. 151 | 152 | Additional match statistics (corners, shots, bookings, referee etc.) for the 2000/01 and 2001/02 seasons for the English, Scottish and German leagues were provided by Sports.com (now under new ownership and no longer available).
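As a minimal illustration of how the column key above can be used, the sketch below loads one season of results and looks at the home-advantage question raised in the README. The filename `E0.csv` is an assumed example download from football-data.co.uk, not a file shipped in this repository.

```python
import pandas as pd

# Load one season of match results (assumed local download from football-data.co.uk).
matches = pd.read_csv('E0.csv')

# FTR is the full-time result: H = home win, D = draw, A = away win (see key above).
result_share = matches['FTR'].value_counts(normalize=True)

print("Share of home wins:", round(result_share.get('H', 0.0), 3))
print("Share of draws:    ", round(result_share.get('D', 0.0), 3))
print("Share of away wins:", round(result_share.get('A', 0.0), 3))

# Average goal margin from the home side's point of view (FTHG - FTAG).
print("Mean home goal margin:", (matches['FTHG'] - matches['FTAG']).mean())
```

If home advantage exists, the share of 'H' results and the mean home goal margin should both sit clearly above the corresponding away-side figures.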
153 | 154 | -------------------------------------------------------------------------------- /Data Wrangling/data_wrangling_json/data/world_bank_projects_less.json: -------------------------------------------------------------------------------- 1 | [{ "_id" : { "$oid" : "52b213b38594d8a2be17c780" }, "approvalfy" : 1999, "board_approval_month" : "November", "boardapprovaldate" : "2013-11-12T00:00:00Z", "borrower" : "FEDERAL DEMOCRATIC REPUBLIC OF ETHIOPIA", "closingdate" : "2018-07-07T00:00:00Z", "country_namecode" : "Federal Democratic Republic of Ethiopia!$!ET", "countrycode" : "ET", "countryname" : "Federal Democratic Republic of Ethiopia", "countryshortname" : "Ethiopia", "docty" : "Project Information Document,Indigenous Peoples Plan,Project Information Document", "envassesmentcategorycode" : "C", "grantamt" : 0, "ibrdcommamt" : 0, "id" : "P129828", "idacommamt" : 130000000, "impagency" : "MINISTRY OF EDUCATION", "lendinginstr" : "Investment Project Financing", "lendinginstrtype" : "IN", "lendprojectcost" : 550000000, "majorsector_percent" : [ { "Name" : "Education", "Percent" : 46 }, { "Name" : "Education", "Percent" : 26 }, { "Name" : "Public Administration, Law, and Justice", "Percent" : 16 }, { "Name" : "Education", "Percent" : 12 } ], "mjsector_namecode" : [ { "name" : "Education", "code" : "EX" }, { "name" : "Education", "code" : "EX" }, { "name" : "Public Administration, Law, and Justice", "code" : "BX" }, { "name" : "Education", "code" : "EX" } ], "mjtheme" : [ "Human development" ], "mjtheme_namecode" : [ { "name" : "Human development", "code" : "8" }, { "name" : "", "code" : "11" } ], "mjthemecode" : "8,11", "prodline" : "PE", "prodlinetext" : "IBRD/IDA", "productlinetype" : "L", "project_abstract" : { "cdata" : "The development objective of the Second Phase of General Education Quality Improvement Project for Ethiopia is to improve learning conditions in primary and secondary schools and strengthen institutions at different levels of educational administration. The project has six components. The first component is curriculum, textbooks, assessment, examinations, and inspection. This component will support improvement of learning conditions in grades KG-12 by providing increased access to teaching and learning materials and through improvements to the curriculum by assessing the strengths and weaknesses of the current curriculum. This component has following four sub-components: (i) curriculum reform and implementation; (ii) teaching and learning materials; (iii) assessment and examinations; and (iv) inspection. The second component is teacher development program (TDP). This component will support improvements in learning conditions in both primary and secondary schools by advancing the quality of teaching in general education through: (a) enhancing the training of pre-service teachers in teacher education institutions; and (b) improving the quality of in-service teacher training. This component has following three sub-components: (i) pre-service teacher training; (ii) in-service teacher training; and (iii) licensing and relicensing of teachers and school leaders. The third component is school improvement plan. This component will support the strengthening of school planning in order to improve learning outcomes, and to partly fund the school improvement plans through school grants. It has following two sub-components: (i) school improvement plan; and (ii) school grants. 
The fourth component is management and capacity building, including education management information systems (EMIS). This component will support management and capacity building aspect of the project. This component has following three sub-components: (i) capacity building for education planning and management; (ii) capacity building for school planning and management; and (iii) EMIS. The fifth component is improving the quality of learning and teaching in secondary schools and universities through the use of information and communications technology (ICT). It has following five sub-components: (i) national policy and institution for ICT in general education; (ii) national ICT infrastructure improvement plan for general education; (iii) develop an integrated monitoring, evaluation, and learning system specifically for the ICT component; (iv) teacher professional development in the use of ICT; and (v) provision of limited number of e-Braille display readers with the possibility to scale up to all secondary education schools based on the successful implementation and usage of the readers. The sixth component is program coordination, monitoring and evaluation, and communication. It will support institutional strengthening by developing capacities in all aspects of program coordination, monitoring and evaluation; a new sub-component on communications will support information sharing for better management and accountability. It has following three sub-components: (i) program coordination; (ii) monitoring and evaluation (M and E); and (iii) communication." }, "project_name" : "Ethiopia General Education Quality Improvement Project II", "projectdocs" : [ { "DocTypeDesc" : "Project Information Document (PID), Vol.", "DocType" : "PID", "EntityID" : "090224b081e545fb_1_0", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=090224b081e545fb_1_0", "DocDate" : "28-AUG-2013" }, { "DocTypeDesc" : "Indigenous Peoples Plan (IP), Vol.1 of 1", "DocType" : "IP", "EntityID" : "000442464_20130920111729", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000442464_20130920111729", "DocDate" : "01-JUL-2013" }, { "DocTypeDesc" : "Project Information Document (PID), Vol.", "DocType" : "PID", "EntityID" : "090224b0817b19e2_1_0", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=090224b0817b19e2_1_0", "DocDate" : "22-NOV-2012" } ], "projectfinancialtype" : "IDA", "projectstatusdisplay" : "Active", "regionname" : "Africa", "sector" : [ { "Name" : "Primary education" }, { "Name" : "Secondary education" }, { "Name" : "Public administration- Other social services" }, { "Name" : "Tertiary education" } ], "sector1" : { "Name" : "Primary education", "Percent" : 46 }, "sector2" : { "Name" : "Secondary education", "Percent" : 26 }, "sector3" : { "Name" : "Public administration- Other social services", "Percent" : 16 }, "sector4" : { "Name" : "Tertiary education", "Percent" : 12 }, "sector_namecode" : [ { "name" : "Primary education", "code" : "EP" }, { "name" : "Secondary education", "code" : "ES" }, { "name" : "Public administration- Other social services", "code" : "BS" }, { "name" : "Tertiary education", "code" : "ET" } ], "sectorcode" : "ET,BS,ES,EP", "source" : "IBRD", "status" : "Active", "supplementprojectflg" : "N", "theme1" : { "Name" : "Education for all", "Percent" : 100 }, "theme_namecode" : [ { "name" : "Education for all", "code" : "65" } ], "themecode" : "65", "totalamt" : 130000000, "totalcommamt" : 130000000, "url" : 
"http://www.worldbank.org/projects/P129828/ethiopia-general-education-quality-improvement-project-ii?lang=en" }, 2 | { "_id" : { "$oid" : "52b213b38594d8a2be17c781" }, "approvalfy" : 2015, "board_approval_month" : "November", "boardapprovaldate" : "2013-11-04T00:00:00Z", "borrower" : "GOVERNMENT OF TUNISIA", "country_namecode" : "Republic of Tunisia!$!TN", "countrycode" : "TN", "countryname" : "Republic of Tunisia", "countryshortname" : "Tunisia", "docty" : "Project Information Document,Integrated Safeguards Data Sheet,Integrated Safeguards Data Sheet,Project Information Document,Integrated Safeguards Data Sheet,Project Information Document", "envassesmentcategorycode" : "C", "grantamt" : 4700000, "ibrdcommamt" : 0, "id" : "P144674", "idacommamt" : 0, "impagency" : "MINISTRY OF FINANCE", "lendinginstr" : "Specific Investment Loan", "lendinginstrtype" : "IN", "lendprojectcost" : 5700000, "majorsector_percent" : [ { "Name" : "Public Administration, Law, and Justice", "Percent" : 70 }, { "Name" : "Public Administration, Law, and Justice", "Percent" : 30 } ], "mjsector_namecode" : [ { "name" : "Public Administration, Law, and Justice", "code" : "BX" }, { "name" : "Public Administration, Law, and Justice", "code" : "BX" } ], "mjtheme" : [ "Economic management", "Social protection and risk management" ], "mjtheme_namecode" : [ { "name" : "Economic management", "code" : "1" }, { "name" : "Social protection and risk management", "code" : "6" } ], "mjthemecode" : "1,6", "prodline" : "RE", "prodlinetext" : "Recipient Executed Activities", "productlinetype" : "L", "project_name" : "TN: DTF Social Protection Reforms Support", "projectdocs" : [ { "DocTypeDesc" : "Project Information Document (PID), Vol.1 of 1", "DocType" : "PID", "EntityID" : "000333037_20131024115616", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000333037_20131024115616", "DocDate" : "29-MAR-2013" }, { "DocTypeDesc" : "Integrated Safeguards Data Sheet (ISDS), Vol.1 of 1", "DocType" : "ISDS", "EntityID" : "000356161_20131024151611", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000356161_20131024151611", "DocDate" : "29-MAR-2013" }, { "DocTypeDesc" : "Integrated Safeguards Data Sheet (ISDS), Vol.1 of 1", "DocType" : "ISDS", "EntityID" : "000442464_20131031112136", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000442464_20131031112136", "DocDate" : "29-MAR-2013" }, { "DocTypeDesc" : "Project Information Document (PID), Vol.1 of 1", "DocType" : "PID", "EntityID" : "000333037_20131031105716", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000333037_20131031105716", "DocDate" : "29-MAR-2013" }, { "DocTypeDesc" : "Integrated Safeguards Data Sheet (ISDS), Vol.1 of 1", "DocType" : "ISDS", "EntityID" : "000356161_20130305113209", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000356161_20130305113209", "DocDate" : "16-JAN-2013" }, { "DocTypeDesc" : "Project Information Document (PID), Vol.1 of 1", "DocType" : "PID", "EntityID" : "000356161_20130305113716", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000356161_20130305113716", "DocDate" : "16-JAN-2013" } ], "projectfinancialtype" : "OTHER", "projectstatusdisplay" : "Active", "regionname" : "Middle East and North Africa", "sector" : [ { "Name" : "Public administration- Other social services" }, { "Name" : "General public administration sector" } ], "sector1" : { "Name" : "Public administration- Other 
social services", "Percent" : 70 }, "sector2" : { "Name" : "General public administration sector", "Percent" : 30 }, "sector_namecode" : [ { "name" : "Public administration- Other social services", "code" : "BS" }, { "name" : "General public administration sector", "code" : "BZ" } ], "sectorcode" : "BZ,BS", "source" : "IBRD", "status" : "Active", "supplementprojectflg" : "N", "theme1" : { "Name" : "Other economic management", "Percent" : 30 }, "theme_namecode" : [ { "name" : "Other economic management", "code" : "24" }, { "name" : "Social safety nets", "code" : "54" } ], "themecode" : "54,24", "totalamt" : 0, "totalcommamt" : 4700000, "url" : "http://www.worldbank.org/projects/P144674?lang=en" } 3 | ] 4 | -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 2/sliderule_dsi_inferential_statistics_exercise_2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "### Examining racial discrimination in the US job market\n", 9 | "\n", 10 | "#### Background\n", 11 | "Racial discrimination continues to be pervasive in cultures throughout the world. Researchers examined the level of racial discrimination in the United States labor market by randomly assigning identical résumés black-sounding or white-sounding names and observing the impact on requests for interviews from employers.\n", 12 | "\n", 13 | "#### Data\n", 14 | "In the dataset provided, each row represents a resume. The 'race' column has two values, 'b' and 'w', indicating black-sounding and white-sounding. The column 'call' has two values, 1 and 0, indicating whether the resume received a call from employers or not.\n", 15 | "\n", 16 | "Note that the 'b' and 'w' values in race are assigned randomly to the resumes.\n", 17 | "\n", 18 | "#### Exercise\n", 19 | "You will perform a statistical analysis to establish whether race has a significant impact on the rate of callbacks for resumes.\n", 20 | "\n", 21 | "Answer the following questions **in this notebook below and submit to your Github account**. \n", 22 | "\n", 23 | " 1. What test is appropriate for this problem? Does CLT apply?\n", 24 | " 2. What are the null and alternate hypotheses?\n", 25 | " 3. Compute margin of error, confidence interval, and p-value.\n", 26 | " 4. 
Discuss statistical significance.\n", 27 | "\n", 28 | "You can include written notes in notebook cells using Markdown: \n", 29 | " - In the control panel at the top, choose Cell > Cell Type > Markdown\n", 30 | " - Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 31 | "\n", 32 | "\n", 33 | "#### Resources\n", 34 | "+ Experiment information and data source: http://www.povertyactionlab.org/evaluation/discrimination-job-market-united-states\n", 35 | "+ Scipy statistical methods: http://docs.scipy.org/doc/scipy/reference/stats.html \n", 36 | "+ Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 37 | "\n", 38 | "****" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 42, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "import pandas as pd\n", 50 | "import numpy as np\n", 51 | "from scipy import stats\n", 52 | "import math" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "data = pd.io.stata.read_stata('data/us_job_market_discrimination.dta')" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 19, 69 | "metadata": { 70 | "collapsed": false 71 | }, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "157.0" 77 | ] 78 | }, 79 | "execution_count": 19, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "# number of callbacks for black-sounding names\n", 86 | "sum(data[data.race=='b'].call)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 10, 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "text/html": [ 99 | "
\n", 100 | "\n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | "
idadeducationofjobsyearsexphonorsvolunteermilitaryempholesoccupspecific...compreqorgreqmanuftranscombankrealtradebusserviceothservicemissindownership
0b1426000117...1.00.01.00.00.00.00.00.00.0
1b13360110316...1.00.01.00.00.00.00.00.00.0
\n", 178 | "

2 rows × 65 columns

\n", 179 | "
" 180 | ], 181 | "text/plain": [ 182 | " id ad education ofjobs yearsexp honors volunteer military empholes \\\n", 183 | "0 b 1 4 2 6 0 0 0 1 \n", 184 | "1 b 1 3 3 6 0 1 1 0 \n", 185 | "\n", 186 | " occupspecific ... compreq orgreq manuf transcom bankreal trade \\\n", 187 | "0 17 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n", 188 | "1 316 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n", 189 | "\n", 190 | " busservice othservice missind ownership \n", 191 | "0 0.0 0.0 0.0 \n", 192 | "1 0.0 0.0 0.0 \n", 193 | "\n", 194 | "[2 rows x 65 columns]" 195 | ] 196 | }, 197 | "execution_count": 10, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "data.columns\n", 204 | "data.head(2)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "#### What test is appropriate for this problem? Does CLT apply?\n", 212 | "\n", 213 | "Let's being by looking the number of observations where race = b and race = w. We can also check the number of such observations that have received a call back and the ones that didn't. Using these factors, we will be able to create a contingency table. \n", 214 | "\n", 215 | "Hence, the problem boils down to comparison of two proportions. \n", 216 | "If certain conditions are satisfied, we can also perform the Fischer's Exact Test using the contingency table.\n" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 73, 222 | "metadata": { 223 | "collapsed": false 224 | }, 225 | "outputs": [ 226 | { 227 | "name": "stdout", 228 | "output_type": "stream", 229 | "text": [ 230 | "Number of observations where race is b : 2435\n", 231 | "Number of observations where race is w : 2435\n" 232 | ] 233 | } 234 | ], 235 | "source": [ 236 | "data_b = data[data.race=='b']\n", 237 | "data_w = data[data.race=='w']\n", 238 | "num_b = len(data_b)\n", 239 | "num_w = len(data_w)\n", 240 | "print \"Number of observations where race is b : \",num_b\n", 241 | "print \"Number of observations where race is w : \",num_w" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "Since there can be only two states for the 'call' variable, we can arbitrarily assign getting a call back as \"success\" and not getting a call back as a failure. " 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 74, 254 | "metadata": { 255 | "collapsed": false 256 | }, 257 | "outputs": [ 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | "157 235\n" 263 | ] 264 | } 265 | ], 266 | "source": [ 267 | "b_success = len(data_b[data_b.call == 1])\n", 268 | "w_success = len(data_w[data_w.call == 1])\n", 269 | "print b_success, w_success" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "Let's also calculate the proportion $\\hat{p}_b$ of black sounding names getting a callback and the proportion $\\hat{p}_w$ white sounding names getting a call back." 
277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 75, 282 | "metadata": { 283 | "collapsed": false 284 | }, 285 | "outputs": [ 286 | { 287 | "name": "stdout", 288 | "output_type": "stream", 289 | "text": [ 290 | "Proportion of black sounding names getting a callback : 0.064476386037\n", 291 | "Proportion of white sounding names getting a callback : 0.0965092402464\n" 292 | ] 293 | } 294 | ], 295 | "source": [ 296 | "p_b = 1.0 * b_success/num_b\n", 297 | "p_w = 1.0 * w_success/num_w\n", 298 | "print \"Proportion of black sounding names getting a callback : \",p_b\n", 299 | "print \"Proportion of white sounding names getting a callback : \",p_w" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "Also, to make sure the samples are big enough such that we can use a normal distribution to model difference between \n", 307 | "proportions, we need to check if $n*p$ and $n*(1-p)$ are greater than 10. This is a conclusion from the Central Limit Theorem. " 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 76, 313 | "metadata": { 314 | "collapsed": false 315 | }, 316 | "outputs": [ 317 | { 318 | "name": "stdout", 319 | "output_type": "stream", 320 | "text": [ 321 | "157.0\n", 322 | "2278.0\n", 323 | "---\n", 324 | "235.0\n", 325 | "2200.0\n" 326 | ] 327 | } 328 | ], 329 | "source": [ 330 | "print num_b * p_b\n", 331 | "print num_b * (1-p_b)\n", 332 | "print \"---\"\n", 333 | "print num_b * p_w\n", 334 | "print num_b * (1-p_w)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "Since, all the values are above 10, we can use the normal distribution to model differences between proportions." 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": {}, 347 | "source": [ 348 | "Now that we have all the required data, we formulate the null and alternate hypotheses.\n", 349 | "\n", 350 | "$H_0\\:is \\: p_b = p_w\\\\\n", 351 | "H_A \\:is \\: p_b \\neq p_w$\n", 352 | "\n", 353 | "The Standard Error for the sample statistic is given by\n", 354 | "$\\sqrt{\\frac{\\hat{p}_b(1-\\hat{p}_b)}{n_b} + \\frac{\\hat{p}_w(1-\\hat{p}_w)}{n_w}} $\n", 355 | "\n", 356 | "We can use the z-statistic to place a confidence interval on this sample statistic.Hence, the margin of error is \n", 357 | "$Z_{\\alpha/2} * SE$. For a 95% confidence interval, the z-value is 1.96. 
\n", 358 | "\n", 359 | "The confidence interval, subsequently, is $\\hat{p}_b - \\hat{p}_w \\pm {Z_{\\alpha/2} * SE}$" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 77, 365 | "metadata": { 366 | "collapsed": false 367 | }, 368 | "outputs": [ 369 | { 370 | "name": "stdout", 371 | "output_type": "stream", 372 | "text": [ 373 | "Margin of error = 0.0152554063499\n" 374 | ] 375 | } 376 | ], 377 | "source": [ 378 | "z = 1.96\n", 379 | "margin = z * math.sqrt( ( p_w*(1-p_w) / num_b) + (p_b*(1-p_b)/num_w) )\n", 380 | "\n", 381 | "print \"Margin of error = \", margin" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 78, 387 | "metadata": { 388 | "collapsed": false 389 | }, 390 | "outputs": [ 391 | { 392 | "name": "stdout", 393 | "output_type": "stream", 394 | "text": [ 395 | "The confidence interval is given by : 0.00213225776367 to 0.0619334506552\n" 396 | ] 397 | } 398 | ], 399 | "source": [ 400 | "print \"The confidence interval is given by :\", p_w-p_b-z*margin,\"to\", p_w-p_b+z*margin" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "metadata": {}, 406 | "source": [ 407 | "0 is not in this confidence interval. Nevertheless, let's go ahead and calculating the p-value." 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 79, 413 | "metadata": { 414 | "collapsed": false 415 | }, 416 | "outputs": [ 417 | { 418 | "data": { 419 | "text/plain": [ 420 | "(-4.1084121524343464, 3.9838868375850767e-05)" 421 | ] 422 | }, 423 | "execution_count": 79, 424 | "metadata": {}, 425 | "output_type": "execute_result" 426 | } 427 | ], 428 | "source": [ 429 | "from statsmodels.stats.proportion import proportions_ztest as pz\n", 430 | "pz(np.array([b_success,w_success]),np.array([num_b,num_w]),value=0)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "The second value is the p-value and it is much lesser than 0.05. Hence, we can reject the null hypothesis. \n", 438 | "\n", 439 | "#### Significance of the result\n", 440 | "What does it practically mean to reject the null hypothesis? Our null hypothesis was that the proportion of black sounding names getting a call back is equal to the number of white sounding names getting a call back. After analysis, we have decided to reject it. This means that, in reality, there is a significant difference in the number of call backs ; white sounding names getting more call backs." 
441 | ] 442 | } 443 | ], 444 | "metadata": { 445 | "kernelspec": { 446 | "display_name": "Python [Root]", 447 | "language": "python", 448 | "name": "Python [Root]" 449 | }, 450 | "language_info": { 451 | "codemirror_mode": { 452 | "name": "ipython", 453 | "version": 2 454 | }, 455 | "file_extension": ".py", 456 | "mimetype": "text/x-python", 457 | "name": "python", 458 | "nbconvert_exporter": "python", 459 | "pygments_lexer": "ipython2", 460 | "version": "2.7.12" 461 | } 462 | }, 463 | "nbformat": 4, 464 | "nbformat_minor": 0 465 | } 466 | -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 2/.ipynb_checkpoints/sliderule_dsi_inferential_statistics_exercise_2-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "### Examining racial discrimination in the US job market\n", 9 | "\n", 10 | "#### Background\n", 11 | "Racial discrimination continues to be pervasive in cultures throughout the world. Researchers examined the level of racial discrimination in the United States labor market by randomly assigning identical résumés black-sounding or white-sounding names and observing the impact on requests for interviews from employers.\n", 12 | "\n", 13 | "#### Data\n", 14 | "In the dataset provided, each row represents a resume. The 'race' column has two values, 'b' and 'w', indicating black-sounding and white-sounding. The column 'call' has two values, 1 and 0, indicating whether the resume received a call from employers or not.\n", 15 | "\n", 16 | "Note that the 'b' and 'w' values in race are assigned randomly to the resumes.\n", 17 | "\n", 18 | "#### Exercise\n", 19 | "You will perform a statistical analysis to establish whether race has a significant impact on the rate of callbacks for resumes.\n", 20 | "\n", 21 | "Answer the following questions **in this notebook below and submit to your Github account**. \n", 22 | "\n", 23 | " 1. What test is appropriate for this problem? Does CLT apply?\n", 24 | " 2. What are the null and alternate hypotheses?\n", 25 | " 3. Compute margin of error, confidence interval, and p-value.\n", 26 | " 4. 
441 | ] 442 | } 443 | ], 444 | "metadata": { 445 | "kernelspec": { 446 | "display_name": "Python [Root]", 447 | "language": "python", 448 | "name": "Python [Root]" 449 | }, 450 | "language_info": { 451 | "codemirror_mode": { 452 | "name": "ipython", 453 | "version": 2 454 | }, 455 | "file_extension": ".py", 456 | "mimetype": "text/x-python", 457 | "name": "python", 458 | "nbconvert_exporter": "python", 459 | "pygments_lexer": "ipython2", 460 | "version": "2.7.12" 461 | } 462 | }, 463 | "nbformat": 4, 464 | "nbformat_minor": 0 465 | } 466 | -------------------------------------------------------------------------------- /Capstone Project/Notebooks/Classification_Baseline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Creating a baseline for classification\n", 8 | "\n", 9 | "\n", 10 | "Notebook attempting to predict the result (Home win, away win, draw) of any fixture given the teams that are playing it based on their performance in the previous season. We use multiclass classification to predict the results of the matches. More feature engineering on the data might lead us to better results." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 12, 16 | "metadata": { 17 | "collapsed": true 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "import pandas as pd\n", 22 | "import numpy as np\n", 23 | "import scipy.stats as scipy\n", 24 | "import random" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "### Load the data" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 15, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "df = pd.read_csv(\"./Data/England/E0_13.csv\")\n", 43 | "df_14 = pd.read_csv(\"./Data/England/E0_14.csv\")" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 16, 49 | "metadata": { 50 | "collapsed": false 51 | }, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/plain": [ 56 | "Index([u'Div', u'Date', u'HomeTeam', u'AwayTeam', u'FTHG', u'FTAG', u'FTR',\n", 57 | " u'HTHG', u'HTAG', u'HTR', u'Referee', u'HS', u'AS', u'HST', u'AST',\n", 58 | " u'HF', u'AF', u'HC', u'AC', u'HY', u'AY', u'HR', u'AR', u'B365H',\n", 59 | " u'B365D', u'B365A', u'BWH', u'BWD', u'BWA', u'IWH', u'IWD', u'IWA',\n", 60 | " u'LBH', u'LBD', u'LBA', u'PSH', u'PSD', u'PSA', u'WHH', u'WHD', u'WHA',\n", 61 | " u'SJH', u'SJD', u'SJA', u'VCH', u'VCD', u'VCA', u'Bb1X2', u'BbMxH',\n", 62 | " u'BbAvH', u'BbMxD', u'BbAvD', u'BbMxA', u'BbAvA', u'BbOU', u'BbMx>2.5',\n", 63 | " u'BbAv>2.5', u'BbMx<2.5', u'BbAv<2.5', u'BbAH', u'BbAHh', u'BbMxAHH',\n", 64 | " u'BbAvAHH', u'BbMxAHA', u'BbAvAHA', u'PSCH', u'PSCD', u'PSCA'],\n", 65 | " dtype='object')" 66 | ] 67 | }, 68 | "execution_count": 16, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "df.columns" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "### Cleaning\n", 82 | "\n", 83 | "We do not need information about division, data, referee and the betting odds from various companies for this method. 
" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 65, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "res_13 = df.ix[:,:23]\n", 95 | "res_13 = res_13.drop(['Div','Date','Referee'],axis=1)\n", 96 | "res_14 = df_14.ix[:,:23]\n", 97 | "res_14 = res_14.drop(['Div','Date','Referee'],axis=1)\n", 98 | "table_features = df.ix[:,:7]\n", 99 | "table_features = table_features.drop(['FTHG','FTAG','Div','Date'],axis=1)\n", 100 | "bet_13 = df.ix[:,23:]" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 19, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "from math import log" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 20, 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "entropy = -((0.32 * log(0.32,3)) + (0.20 * log(0.20,3)) + (0.47 * log(0.47,3)))" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 21, 128 | "metadata": { 129 | "collapsed": false 130 | }, 131 | "outputs": [ 132 | { 133 | "data": { 134 | "text/plain": [ 135 | "0.947893245378005" 136 | ] 137 | }, 138 | "execution_count": 21, 139 | "metadata": {}, 140 | "output_type": "execute_result" 141 | } 142 | ], 143 | "source": [ 144 | "entropy" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 22, 150 | "metadata": { 151 | "collapsed": false, 152 | "scrolled": true 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "res_13.head()\n", 157 | "feature_table = df.ix[:,:23]" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 23, 163 | "metadata": { 164 | "collapsed": false 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "#Team, Home Goals Score, Away Goals Score, Attack Strength, Home Goals Conceded, Away Goals Conceded, Defensive Strength\n", 169 | "table_13 = pd.DataFrame(columns=('Team','HGS','AGS','HAS','AAS','HGC','AGC','HDS','ADS'))" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 24, 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [ 179 | { 180 | "name": "stdout", 181 | "output_type": "stream", 182 | "text": [ 183 | "Average number of goals at home 1.57368421053\n", 184 | "Average number of goals away 1.19473684211\n", 185 | "Average number of goals conceded at home 1.57368421053\n", 186 | "Average number of goals conceded away 1.19473684211\n" 187 | ] 188 | } 189 | ], 190 | "source": [ 191 | "avg_home_scored_13 = res_13.FTHG.sum() / 380.0\n", 192 | "avg_away_scored_13 = res_13.FTAG.sum() / 380.0\n", 193 | "avg_home_conceded_13 = avg_away_scored_13\n", 194 | "avg_away_conceded_13 = avg_home_scored_13\n", 195 | "print \"Average number of goals at home\",avg_home_scored_13\n", 196 | "print \"Average number of goals away\", avg_away_scored_13\n", 197 | "print \"Average number of goals conceded at home\",avg_away_conceded_13\n", 198 | "print \"Average number of goals conceded away\",avg_home_conceded_13\n" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 25, 204 | "metadata": { 205 | "collapsed": false 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "res_home = res_13.groupby('HomeTeam')\n", 210 | "res_away = res_13.groupby('AwayTeam')" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 26, 216 | "metadata": { 217 | "collapsed": false 218 | }, 219 | "outputs": [ 220 | { 221 | "data": { 222 | "text/html": [ 223 | "
\n", 224 | "\n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | "
TeamHGSAGSHASAASHGCAGCHDSADS
0Arsenal3632NaNNaN1130NaNNaN
1Aston Villa2217NaNNaN2932NaNNaN
2Cardiff2012NaNNaN3539NaNNaN
3Chelsea4328NaNNaN1116NaNNaN
4Crystal Palace1815NaNNaN2325NaNNaN
\n", 302 | "
" 303 | ], 304 | "text/plain": [ 305 | " Team HGS AGS HAS AAS HGC AGC HDS ADS\n", 306 | "0 Arsenal 36 32 NaN NaN 11 30 NaN NaN\n", 307 | "1 Aston Villa 22 17 NaN NaN 29 32 NaN NaN\n", 308 | "2 Cardiff 20 12 NaN NaN 35 39 NaN NaN\n", 309 | "3 Chelsea 43 28 NaN NaN 11 16 NaN NaN\n", 310 | "4 Crystal Palace 18 15 NaN NaN 23 25 NaN NaN" 311 | ] 312 | }, 313 | "execution_count": 26, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "table_13.Team = res_home.HomeTeam.all().values\n", 320 | "table_13.HGS = res_home.FTHG.sum().values\n", 321 | "table_13.HGC = res_home.FTAG.sum().values\n", 322 | "table_13.AGS = res_away.FTAG.sum().values\n", 323 | "table_13.AGC = res_away.FTHG.sum().values\n", 324 | "table_13.head()" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 27, 330 | "metadata": { 331 | "collapsed": false 332 | }, 333 | "outputs": [ 334 | { 335 | "data": { 336 | "text/html": [ 337 | "
\n", 338 | "\n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | "
TeamHGSAGSHASAASHGCAGCHDSADS
0Arsenal36321.2040131.40969211300.4845811.003344
1Aston Villa22170.7357860.74889929321.2775331.070234
2Cardiff20120.6688960.52863435391.5418501.304348
3Chelsea43281.4381271.23348011160.4845810.535117
4Crystal Palace18150.6020070.66079323251.0132160.836120
\n", 416 | "
" 417 | ], 418 | "text/plain": [ 419 | " Team HGS AGS HAS AAS HGC AGC HDS ADS\n", 420 | "0 Arsenal 36 32 1.204013 1.409692 11 30 0.484581 1.003344\n", 421 | "1 Aston Villa 22 17 0.735786 0.748899 29 32 1.277533 1.070234\n", 422 | "2 Cardiff 20 12 0.668896 0.528634 35 39 1.541850 1.304348\n", 423 | "3 Chelsea 43 28 1.438127 1.233480 11 16 0.484581 0.535117\n", 424 | "4 Crystal Palace 18 15 0.602007 0.660793 23 25 1.013216 0.836120" 425 | ] 426 | }, 427 | "execution_count": 27, 428 | "metadata": {}, 429 | "output_type": "execute_result" 430 | } 431 | ], 432 | "source": [ 433 | "table_13.HAS = (table_13.HGS / 19.0) / avg_home_scored_13\n", 434 | "table_13.AAS = (table_13.AGS / 19.0) / avg_away_scored_13\n", 435 | "table_13.HDS = (table_13.HGC / 19.0) / avg_home_conceded_13\n", 436 | "table_13.ADS = (table_13.AGC / 19.0) / avg_away_conceded_13\n", 437 | "table_13.head()" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 28, 443 | "metadata": { 444 | "collapsed": false 445 | }, 446 | "outputs": [], 447 | "source": [ 448 | "feature_table = feature_table[['HomeTeam','AwayTeam','FTR']]\n", 449 | "f_HAS = []\n", 450 | "f_HDS = []\n", 451 | "f_AAS = []\n", 452 | "f_ADS = []\n", 453 | "for index,row in feature_table.iterrows():\n", 454 | " f_HAS.append(table_13[table_13['Team'] == row['HomeTeam']]['HAS'].values[0])\n", 455 | " f_HDS.append(table_13[table_13['Team'] == row['HomeTeam']]['HDS'].values[0])\n", 456 | " f_AAS.append(table_13[table_13['Team'] == row['HomeTeam']]['AAS'].values[0])\n", 457 | " f_ADS.append(table_13[table_13['Team'] == row['HomeTeam']]['ADS'].values[0])\n", 458 | " \n", 459 | "feature_table['HAS'] = f_HAS\n", 460 | "feature_table['HDS'] = f_HDS\n", 461 | "feature_table['AAS'] = f_AAS\n", 462 | "feature_table['ADS'] = f_ADS" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 29, 468 | "metadata": { 469 | "collapsed": false 470 | }, 471 | "outputs": [ 472 | { 473 | "data": { 474 | "text/html": [ 475 | "
\n", 476 | "\n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | "
HomeTeamAwayTeamFTRHASHDSAASADS
0ArsenalAston VillaA1.2040130.4845811.4096921.003344
1LiverpoolStokeH1.7725750.7929522.1145371.070234
2NorwichEvertonD0.5685620.7929520.4845811.471572
3SunderlandFulhamA0.7023411.1894270.8810571.103679
4SwanseaMan UnitedA1.1036791.1453740.9251100.936455
\n", 542 | "
" 543 | ], 544 | "text/plain": [ 545 | " HomeTeam AwayTeam FTR HAS HDS AAS ADS\n", 546 | "0 Arsenal Aston Villa A 1.204013 0.484581 1.409692 1.003344\n", 547 | "1 Liverpool Stoke H 1.772575 0.792952 2.114537 1.070234\n", 548 | "2 Norwich Everton D 0.568562 0.792952 0.484581 1.471572\n", 549 | "3 Sunderland Fulham A 0.702341 1.189427 0.881057 1.103679\n", 550 | "4 Swansea Man United A 1.103679 1.145374 0.925110 0.936455" 551 | ] 552 | }, 553 | "execution_count": 29, 554 | "metadata": {}, 555 | "output_type": "execute_result" 556 | } 557 | ], 558 | "source": [ 559 | "feature_table.head()" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 30, 565 | "metadata": { 566 | "collapsed": true 567 | }, 568 | "outputs": [], 569 | "source": [ 570 | "def transformResult(row):\n", 571 | " if(row.FTR == 'H'):\n", 572 | " return 1\n", 573 | " elif(row.FTR == 'A'):\n", 574 | " return -1\n", 575 | " else:\n", 576 | " return 0" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": 31, 582 | "metadata": { 583 | "collapsed": false 584 | }, 585 | "outputs": [], 586 | "source": [ 587 | "feature_table[\"Result\"] = feature_table.apply(lambda row: transformResult(row),axis=1)" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 32, 593 | "metadata": { 594 | "collapsed": false 595 | }, 596 | "outputs": [ 597 | { 598 | "data": { 599 | "text/html": [ 600 | "
\n", 601 | "\n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | "
HomeTeamAwayTeamFTRHASHDSAASADSResult
0ArsenalAston VillaA1.2040130.4845811.4096921.003344-1
1LiverpoolStokeH1.7725750.7929522.1145371.0702341
2NorwichEvertonD0.5685620.7929520.4845811.4715720
3SunderlandFulhamA0.7023411.1894270.8810571.103679-1
4SwanseaMan UnitedA1.1036791.1453740.9251100.936455-1
\n", 673 | "
" 674 | ], 675 | "text/plain": [ 676 | " HomeTeam AwayTeam FTR HAS HDS AAS ADS Result\n", 677 | "0 Arsenal Aston Villa A 1.204013 0.484581 1.409692 1.003344 -1\n", 678 | "1 Liverpool Stoke H 1.772575 0.792952 2.114537 1.070234 1\n", 679 | "2 Norwich Everton D 0.568562 0.792952 0.484581 1.471572 0\n", 680 | "3 Sunderland Fulham A 0.702341 1.189427 0.881057 1.103679 -1\n", 681 | "4 Swansea Man United A 1.103679 1.145374 0.925110 0.936455 -1" 682 | ] 683 | }, 684 | "execution_count": 32, 685 | "metadata": {}, 686 | "output_type": "execute_result" 687 | } 688 | ], 689 | "source": [ 690 | "feature_table.head()" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": 33, 696 | "metadata": { 697 | "collapsed": true 698 | }, 699 | "outputs": [], 700 | "source": [ 701 | "X_train = feature_table[['HAS','HDS','AAS','ADS']]\n", 702 | "y_train = feature_table['Result']" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": 34, 708 | "metadata": { 709 | "collapsed": false 710 | }, 711 | "outputs": [], 712 | "source": [ 713 | "from sklearn.tree import DecisionTreeClassifier\n", 714 | "from sklearn.naive_bayes import MultinomialNB\n", 715 | "from xgboost import XGBClassifier\n", 716 | "from sklearn.neighbors import KNeighborsClassifier\n", 717 | "from sklearn.multiclass import OneVsRestClassifier\n", 718 | "\n", 719 | "from sklearn.linear_model import LogisticRegression\n", 720 | "from sklearn.metrics import accuracy_score" 721 | ] 722 | }, 723 | { 724 | "cell_type": "markdown", 725 | "metadata": {}, 726 | "source": [ 727 | "## Randomized Model as Benchmark" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": 41, 733 | "metadata": { 734 | "collapsed": false 735 | }, 736 | "outputs": [], 737 | "source": [ 738 | "outcome_list = [-1,0,1]\n", 739 | "y_pred = []\n", 740 | "for i in xrange(1,381):\n", 741 | " y_pred.append(random.choice(outcome_list))\n", 742 | " " 743 | ] 744 | }, 745 | { 746 | "cell_type": "code", 747 | "execution_count": 42, 748 | "metadata": { 749 | "collapsed": false 750 | }, 751 | "outputs": [ 752 | { 753 | "data": { 754 | "text/plain": [ 755 | "0.32631578947368423" 756 | ] 757 | }, 758 | "execution_count": 42, 759 | "metadata": {}, 760 | "output_type": "execute_result" 761 | } 762 | ], 763 | "source": [ 764 | "accuracy_score(y_train,y_pred)" 765 | ] 766 | }, 767 | { 768 | "cell_type": "markdown", 769 | "metadata": {}, 770 | "source": [ 771 | "With just a random model, we get an accuracy of 33% on an average which is expected since there are three outcomes to any fixture. 
" 772 | ] 773 | }, 774 | { 775 | "cell_type": "markdown", 776 | "metadata": {}, 777 | "source": [ 778 | "### Classifiers" 779 | ] 780 | }, 781 | { 782 | "cell_type": "code", 783 | "execution_count": 87, 784 | "metadata": { 785 | "collapsed": false 786 | }, 787 | "outputs": [ 788 | { 789 | "data": { 790 | "text/plain": [ 791 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", 792 | " metric_params=None, n_jobs=1, n_neighbors=15, p=2,\n", 793 | " weights='uniform')" 794 | ] 795 | }, 796 | "execution_count": 87, 797 | "metadata": {}, 798 | "output_type": "execute_result" 799 | } 800 | ], 801 | "source": [ 802 | "clf1 = DecisionTreeClassifier()\n", 803 | "clf2 = XGBClassifier()\n", 804 | "clf3 = KNeighborsClassifier(n_neighbors=15)\n", 805 | "clf3.fit(X_train,y_train)" 806 | ] 807 | }, 808 | { 809 | "cell_type": "code", 810 | "execution_count": 88, 811 | "metadata": { 812 | "collapsed": false 813 | }, 814 | "outputs": [ 815 | { 816 | "data": { 817 | "text/plain": [ 818 | "0.54736842105263162" 819 | ] 820 | }, 821 | "execution_count": 88, 822 | "metadata": {}, 823 | "output_type": "execute_result" 824 | } 825 | ], 826 | "source": [ 827 | "y_pred = clf3.predict(X_train)\n", 828 | "accuracy_score(y_pred,y_train)" 829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": null, 834 | "metadata": { 835 | "collapsed": true 836 | }, 837 | "outputs": [], 838 | "source": [] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": null, 843 | "metadata": { 844 | "collapsed": true 845 | }, 846 | "outputs": [], 847 | "source": [] 848 | } 849 | ], 850 | "metadata": { 851 | "kernelspec": { 852 | "display_name": "Python [Root]", 853 | "language": "python", 854 | "name": "Python [Root]" 855 | }, 856 | "language_info": { 857 | "codemirror_mode": { 858 | "name": "ipython", 859 | "version": 2 860 | }, 861 | "file_extension": ".py", 862 | "mimetype": "text/x-python", 863 | "name": "python", 864 | "nbconvert_exporter": "python", 865 | "pygments_lexer": "ipython2", 866 | "version": "2.7.12" 867 | } 868 | }, 869 | "nbformat": 4, 870 | "nbformat_minor": 0 871 | } 872 | -------------------------------------------------------------------------------- /Capstone Project/.ipynb_checkpoints/Classification_Baseline-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Creating a baseline for classification\n", 8 | "\n", 9 | "\n", 10 | "Notebook attempting to predict the result (Home win, away win, draw) of any fixture given the teams that are playing it based on their performance in the previous season. We use multiclass classification to predict the results of the matches. More feature engineering on the data might lead us to better results." 
TeamHGSAGSHASAASHGCAGCHDSADS
0Arsenal3632NaNNaN1130NaNNaN
1Aston Villa2217NaNNaN2932NaNNaN
2Cardiff2012NaNNaN3539NaNNaN
3Chelsea4328NaNNaN1116NaNNaN
4Crystal Palace1815NaNNaN2325NaNNaN
\n", 302 | "
" 303 | ], 304 | "text/plain": [ 305 | " Team HGS AGS HAS AAS HGC AGC HDS ADS\n", 306 | "0 Arsenal 36 32 NaN NaN 11 30 NaN NaN\n", 307 | "1 Aston Villa 22 17 NaN NaN 29 32 NaN NaN\n", 308 | "2 Cardiff 20 12 NaN NaN 35 39 NaN NaN\n", 309 | "3 Chelsea 43 28 NaN NaN 11 16 NaN NaN\n", 310 | "4 Crystal Palace 18 15 NaN NaN 23 25 NaN NaN" 311 | ] 312 | }, 313 | "execution_count": 26, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "table_13.Team = res_home.HomeTeam.all().values\n", 320 | "table_13.HGS = res_home.FTHG.sum().values\n", 321 | "table_13.HGC = res_home.FTAG.sum().values\n", 322 | "table_13.AGS = res_away.FTAG.sum().values\n", 323 | "table_13.AGC = res_away.FTHG.sum().values\n", 324 | "table_13.head()" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 27, 330 | "metadata": { 331 | "collapsed": false 332 | }, 333 | "outputs": [ 334 | { 335 | "data": { 336 | "text/html": [ 337 | "
\n", 338 | "\n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | "
TeamHGSAGSHASAASHGCAGCHDSADS
0Arsenal36321.2040131.40969211300.4845811.003344
1Aston Villa22170.7357860.74889929321.2775331.070234
2Cardiff20120.6688960.52863435391.5418501.304348
3Chelsea43281.4381271.23348011160.4845810.535117
4Crystal Palace18150.6020070.66079323251.0132160.836120
\n", 416 | "
" 417 | ], 418 | "text/plain": [ 419 | " Team HGS AGS HAS AAS HGC AGC HDS ADS\n", 420 | "0 Arsenal 36 32 1.204013 1.409692 11 30 0.484581 1.003344\n", 421 | "1 Aston Villa 22 17 0.735786 0.748899 29 32 1.277533 1.070234\n", 422 | "2 Cardiff 20 12 0.668896 0.528634 35 39 1.541850 1.304348\n", 423 | "3 Chelsea 43 28 1.438127 1.233480 11 16 0.484581 0.535117\n", 424 | "4 Crystal Palace 18 15 0.602007 0.660793 23 25 1.013216 0.836120" 425 | ] 426 | }, 427 | "execution_count": 27, 428 | "metadata": {}, 429 | "output_type": "execute_result" 430 | } 431 | ], 432 | "source": [ 433 | "table_13.HAS = (table_13.HGS / 19.0) / avg_home_scored_13\n", 434 | "table_13.AAS = (table_13.AGS / 19.0) / avg_away_scored_13\n", 435 | "table_13.HDS = (table_13.HGC / 19.0) / avg_home_conceded_13\n", 436 | "table_13.ADS = (table_13.AGC / 19.0) / avg_away_conceded_13\n", 437 | "table_13.head()" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 28, 443 | "metadata": { 444 | "collapsed": false 445 | }, 446 | "outputs": [], 447 | "source": [ 448 | "feature_table = feature_table[['HomeTeam','AwayTeam','FTR']]\n", 449 | "f_HAS = []\n", 450 | "f_HDS = []\n", 451 | "f_AAS = []\n", 452 | "f_ADS = []\n", 453 | "for index,row in feature_table.iterrows():\n", 454 | " f_HAS.append(table_13[table_13['Team'] == row['HomeTeam']]['HAS'].values[0])\n", 455 | " f_HDS.append(table_13[table_13['Team'] == row['HomeTeam']]['HDS'].values[0])\n", 456 | " f_AAS.append(table_13[table_13['Team'] == row['HomeTeam']]['AAS'].values[0])\n", 457 | " f_ADS.append(table_13[table_13['Team'] == row['HomeTeam']]['ADS'].values[0])\n", 458 | " \n", 459 | "feature_table['HAS'] = f_HAS\n", 460 | "feature_table['HDS'] = f_HDS\n", 461 | "feature_table['AAS'] = f_AAS\n", 462 | "feature_table['ADS'] = f_ADS" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 29, 468 | "metadata": { 469 | "collapsed": false 470 | }, 471 | "outputs": [ 472 | { 473 | "data": { 474 | "text/html": [ 475 | "
\n", 476 | "\n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | "
HomeTeamAwayTeamFTRHASHDSAASADS
0ArsenalAston VillaA1.2040130.4845811.4096921.003344
1LiverpoolStokeH1.7725750.7929522.1145371.070234
2NorwichEvertonD0.5685620.7929520.4845811.471572
3SunderlandFulhamA0.7023411.1894270.8810571.103679
4SwanseaMan UnitedA1.1036791.1453740.9251100.936455
\n", 542 | "
" 543 | ], 544 | "text/plain": [ 545 | " HomeTeam AwayTeam FTR HAS HDS AAS ADS\n", 546 | "0 Arsenal Aston Villa A 1.204013 0.484581 1.409692 1.003344\n", 547 | "1 Liverpool Stoke H 1.772575 0.792952 2.114537 1.070234\n", 548 | "2 Norwich Everton D 0.568562 0.792952 0.484581 1.471572\n", 549 | "3 Sunderland Fulham A 0.702341 1.189427 0.881057 1.103679\n", 550 | "4 Swansea Man United A 1.103679 1.145374 0.925110 0.936455" 551 | ] 552 | }, 553 | "execution_count": 29, 554 | "metadata": {}, 555 | "output_type": "execute_result" 556 | } 557 | ], 558 | "source": [ 559 | "feature_table.head()" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 30, 565 | "metadata": { 566 | "collapsed": true 567 | }, 568 | "outputs": [], 569 | "source": [ 570 | "def transformResult(row):\n", 571 | " if(row.FTR == 'H'):\n", 572 | " return 1\n", 573 | " elif(row.FTR == 'A'):\n", 574 | " return -1\n", 575 | " else:\n", 576 | " return 0" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": 31, 582 | "metadata": { 583 | "collapsed": false 584 | }, 585 | "outputs": [], 586 | "source": [ 587 | "feature_table[\"Result\"] = feature_table.apply(lambda row: transformResult(row),axis=1)" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 32, 593 | "metadata": { 594 | "collapsed": false 595 | }, 596 | "outputs": [ 597 | { 598 | "data": { 599 | "text/html": [ 600 | "
\n", 601 | "\n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | "
HomeTeamAwayTeamFTRHASHDSAASADSResult
0ArsenalAston VillaA1.2040130.4845811.4096921.003344-1
1LiverpoolStokeH1.7725750.7929522.1145371.0702341
2NorwichEvertonD0.5685620.7929520.4845811.4715720
3SunderlandFulhamA0.7023411.1894270.8810571.103679-1
4SwanseaMan UnitedA1.1036791.1453740.9251100.936455-1
\n", 673 | "
" 674 | ], 675 | "text/plain": [ 676 | " HomeTeam AwayTeam FTR HAS HDS AAS ADS Result\n", 677 | "0 Arsenal Aston Villa A 1.204013 0.484581 1.409692 1.003344 -1\n", 678 | "1 Liverpool Stoke H 1.772575 0.792952 2.114537 1.070234 1\n", 679 | "2 Norwich Everton D 0.568562 0.792952 0.484581 1.471572 0\n", 680 | "3 Sunderland Fulham A 0.702341 1.189427 0.881057 1.103679 -1\n", 681 | "4 Swansea Man United A 1.103679 1.145374 0.925110 0.936455 -1" 682 | ] 683 | }, 684 | "execution_count": 32, 685 | "metadata": {}, 686 | "output_type": "execute_result" 687 | } 688 | ], 689 | "source": [ 690 | "feature_table.head()" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": 33, 696 | "metadata": { 697 | "collapsed": true 698 | }, 699 | "outputs": [], 700 | "source": [ 701 | "X_train = feature_table[['HAS','HDS','AAS','ADS']]\n", 702 | "y_train = feature_table['Result']" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": null, 708 | "metadata": { 709 | "collapsed": true 710 | }, 711 | "outputs": [], 712 | "source": [] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "execution_count": 34, 717 | "metadata": { 718 | "collapsed": false 719 | }, 720 | "outputs": [], 721 | "source": [ 722 | "from sklearn.tree import DecisionTreeClassifier\n", 723 | "from sklearn.naive_bayes import MultinomialNB\n", 724 | "from xgboost import XGBClassifier\n", 725 | "from sklearn.neighbors import KNeighborsClassifier\n", 726 | "from sklearn.multiclass import OneVsRestClassifier\n", 727 | "\n", 728 | "from sklearn.linear_model import LogisticRegression\n", 729 | "from sklearn.metrics import accuracy_score" 730 | ] 731 | }, 732 | { 733 | "cell_type": "markdown", 734 | "metadata": {}, 735 | "source": [ 736 | "## Randomized Model as Benchmark" 737 | ] 738 | }, 739 | { 740 | "cell_type": "code", 741 | "execution_count": 41, 742 | "metadata": { 743 | "collapsed": false 744 | }, 745 | "outputs": [], 746 | "source": [ 747 | "outcome_list = [-1,0,1]\n", 748 | "y_pred = []\n", 749 | "for i in xrange(1,381):\n", 750 | " y_pred.append(random.choice(outcome_list))\n", 751 | " " 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": 42, 757 | "metadata": { 758 | "collapsed": false 759 | }, 760 | "outputs": [ 761 | { 762 | "data": { 763 | "text/plain": [ 764 | "0.32631578947368423" 765 | ] 766 | }, 767 | "execution_count": 42, 768 | "metadata": {}, 769 | "output_type": "execute_result" 770 | } 771 | ], 772 | "source": [ 773 | "accuracy_score(y_train,y_pred)" 774 | ] 775 | }, 776 | { 777 | "cell_type": "markdown", 778 | "metadata": {}, 779 | "source": [ 780 | "With just a random model, we get an accuracy of 33% on an average which is expected since there are three outcomes to any fixture. 
" 781 | ] 782 | }, 783 | { 784 | "cell_type": "markdown", 785 | "metadata": {}, 786 | "source": [ 787 | "### Classifiers" 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": 87, 793 | "metadata": { 794 | "collapsed": false 795 | }, 796 | "outputs": [ 797 | { 798 | "data": { 799 | "text/plain": [ 800 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", 801 | " metric_params=None, n_jobs=1, n_neighbors=15, p=2,\n", 802 | " weights='uniform')" 803 | ] 804 | }, 805 | "execution_count": 87, 806 | "metadata": {}, 807 | "output_type": "execute_result" 808 | } 809 | ], 810 | "source": [ 811 | "clf1 = DecisionTreeClassifier()\n", 812 | "clf2 = XGBClassifier()\n", 813 | "clf3 = KNeighborsClassifier(n_neighbors=15)\n", 814 | "clf3.fit(X_train,y_train)" 815 | ] 816 | }, 817 | { 818 | "cell_type": "code", 819 | "execution_count": 88, 820 | "metadata": { 821 | "collapsed": false 822 | }, 823 | "outputs": [ 824 | { 825 | "data": { 826 | "text/plain": [ 827 | "0.54736842105263162" 828 | ] 829 | }, 830 | "execution_count": 88, 831 | "metadata": {}, 832 | "output_type": "execute_result" 833 | } 834 | ], 835 | "source": [ 836 | "y_pred = clf3.predict(X_train)\n", 837 | "accuracy_score(y_pred,y_train)" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": null, 843 | "metadata": { 844 | "collapsed": true 845 | }, 846 | "outputs": [], 847 | "source": [] 848 | }, 849 | { 850 | "cell_type": "code", 851 | "execution_count": null, 852 | "metadata": { 853 | "collapsed": true 854 | }, 855 | "outputs": [], 856 | "source": [] 857 | } 858 | ], 859 | "metadata": { 860 | "kernelspec": { 861 | "display_name": "Python [Root]", 862 | "language": "python", 863 | "name": "Python [Root]" 864 | }, 865 | "language_info": { 866 | "codemirror_mode": { 867 | "name": "ipython", 868 | "version": 2 869 | }, 870 | "file_extension": ".py", 871 | "mimetype": "text/x-python", 872 | "name": "python", 873 | "nbconvert_exporter": "python", 874 | "pygments_lexer": "ipython2", 875 | "version": "2.7.12" 876 | } 877 | }, 878 | "nbformat": 4, 879 | "nbformat_minor": 0 880 | } 881 | -------------------------------------------------------------------------------- /Data Wrangling/data_wrangling_xml/data_wrangling_xml/sliderule_dsi_xml_exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# XML example and exercise\n", 8 | "****\n", 9 | "+ study examples of accessing nodes in XML tree structure \n", 10 | "+ work on exercise to be completed and submitted\n", 11 | "****\n", 12 | "+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html\n", 13 | "+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial\n", 14 | "****" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "from xml.etree import ElementTree as ET" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## XML example\n", 33 | "\n", 34 | "+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": { 41 | "collapsed": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "document_tree = ET.parse( './data/mondial_database_less.xml' )" 46 | ] 47 | }, 48 
| { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": { 52 | "collapsed": false 53 | }, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "Albania\n", 60 | "Greece\n", 61 | "Macedonia\n", 62 | "Serbia\n", 63 | "Montenegro\n", 64 | "Kosovo\n", 65 | "Andorra\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "# print names of all countries\n", 71 | "for child in document_tree.getroot():\n", 72 | " print child.find('name').text" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [ 82 | { 83 | "name": "stdout", 84 | "output_type": "stream", 85 | "text": [ 86 | "* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë\n", 87 | "* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes\n", 88 | "* Macedonia: Skopje, Kumanovo\n", 89 | "* Serbia: Beograd, Novi Sad, Niš\n", 90 | "* Montenegro: Podgorica\n", 91 | "* Kosovo: Prishtine\n", 92 | "* Andorra: Andorra la Vella\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "# print names of all countries and their cities\n", 98 | "for element in document_tree.iterfind('country'):\n", 99 | " print '* ' + element.find('name').text + ':',\n", 100 | " capitals_string = ''\n", 101 | " for subelement in element.getiterator('city'):\n", 102 | " capitals_string += subelement.find('name').text + ', '\n", 103 | " print capitals_string[:-2]" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "****\n", 111 | "## XML exercise\n", 112 | "\n", 113 | "Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find\n", 114 | "\n", 115 | "1. 10 countries with the lowest infant mortality rates\n", 116 | "2. 10 cities with the largest population\n", 117 | "3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)\n", 118 | "4. name and country of a) longest river, b) largest lake and c) airport at highest elevation" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 6, 124 | "metadata": { 125 | "collapsed": false 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "document = ET.parse( './data/mondial_database.xml' )" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 7, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "#1. 10 countries with the lowest infant mortality rates\n", 141 | "country_im = []\n", 142 | "for country in document.iterfind('country'):\n", 143 | " if country.find('infant_mortality') is not None:\n", 144 | " country_im.append([country.find('name').text,country.find('infant_mortality').text])\n", 145 | " " 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 8, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "import pandas as pd\n", 157 | "im = pd.DataFrame(country_im)\n", 158 | "im.columns = [\"country\",\"infant_moratality\"]" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 9, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [ 168 | { 169 | "data": { 170 | "text/html": [ 171 | "
\n", 172 | "\n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | "
countryinfant_moratality
36Monaco1.81
90Japan2.13
109Bermuda2.48
34Norway2.48
98Singapore2.53
35Sweden2.60
8Czech Republic2.63
72Hong Kong2.73
73Macao3.13
39Iceland3.15
\n", 233 | "
" 234 | ], 235 | "text/plain": [ 236 | " country infant_moratality\n", 237 | "36 Monaco 1.81\n", 238 | "90 Japan 2.13\n", 239 | "109 Bermuda 2.48\n", 240 | "34 Norway 2.48\n", 241 | "98 Singapore 2.53\n", 242 | "35 Sweden 2.60\n", 243 | "8 Czech Republic 2.63\n", 244 | "72 Hong Kong 2.73\n", 245 | "73 Macao 3.13\n", 246 | "39 Iceland 3.15" 247 | ] 248 | }, 249 | "execution_count": 9, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "im.infant_moratality = im.infant_moratality.astype(float)\n", 256 | "im = im.sort_values(by='infant_moratality')\n", 257 | "im.head(10)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 10, 263 | "metadata": { 264 | "collapsed": true 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "#2. 10 cities with the largest population\n", 269 | "populations = []\n", 270 | "for country in document.iterfind('country'):\n", 271 | " if country.find('population') is not None:\n", 272 | " populations.append([country.find('name').text,country.find('population').text])\n", 273 | " " 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 11, 279 | "metadata": { 280 | "collapsed": false 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "pop = pd.DataFrame(populations)\n", 285 | "pop.columns = [\"country\",\"population\"]" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 12, 291 | "metadata": { 292 | "collapsed": false 293 | }, 294 | "outputs": [ 295 | { 296 | "data": { 297 | "text/html": [ 298 | "
\n", 299 | "\n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | "
countrypopulation
166Pitcairn68.0
83Cocos Islands628.0
41Holy See840.0
121Cayman Islands933.0
138Sint Maarten1497.0
170Tokelau1570.0
39Gibraltar1816.0
186Falkland Islands2043.0
159Nauru2066.0
52Svalbard2116.0
\n", 360 | "
" 361 | ], 362 | "text/plain": [ 363 | " country population\n", 364 | "166 Pitcairn 68.0\n", 365 | "83 Cocos Islands 628.0\n", 366 | "41 Holy See 840.0\n", 367 | "121 Cayman Islands 933.0\n", 368 | "138 Sint Maarten 1497.0\n", 369 | "170 Tokelau 1570.0\n", 370 | "39 Gibraltar 1816.0\n", 371 | "186 Falkland Islands 2043.0\n", 372 | "159 Nauru 2066.0\n", 373 | "52 Svalbard 2116.0" 374 | ] 375 | }, 376 | "execution_count": 12, 377 | "metadata": {}, 378 | "output_type": "execute_result" 379 | } 380 | ], 381 | "source": [ 382 | "pop.population = pop.population.astype(float)\n", 383 | "pop = pop.sort_values(by = \"population\")\n", 384 | "pop.head(10)" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 13, 390 | "metadata": { 391 | "collapsed": false 392 | }, 393 | "outputs": [ 394 | { 395 | "data": { 396 | "text/html": [ 397 | "
\n", 398 | "\n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | "
countriescountry_percentagecountry_popethnicitypopulationyear
0Albania95.02800138Albanian26601312011
1Albania3.02800138Greek840042011
2Greece93.010816286Greek100591452011
3Macedonia64.22059794Macedonian13223872011
4Macedonia25.22059794Albanian5190682011
5Macedonia3.92059794Turkish803312011
6Macedonia2.72059794Gypsy556142011
7Macedonia1.82059794Serb370762011
8Serbia82.97120666Serb59030322011
9Serbia0.97120666Montenegrin640852011
\n", 503 | "
" 504 | ], 505 | "text/plain": [ 506 | " countries country_percentage country_pop ethnicity population year\n", 507 | "0 Albania 95.0 2800138 Albanian 2660131 2011\n", 508 | "1 Albania 3.0 2800138 Greek 84004 2011\n", 509 | "2 Greece 93.0 10816286 Greek 10059145 2011\n", 510 | "3 Macedonia 64.2 2059794 Macedonian 1322387 2011\n", 511 | "4 Macedonia 25.2 2059794 Albanian 519068 2011\n", 512 | "5 Macedonia 3.9 2059794 Turkish 80331 2011\n", 513 | "6 Macedonia 2.7 2059794 Gypsy 55614 2011\n", 514 | "7 Macedonia 1.8 2059794 Serb 37076 2011\n", 515 | "8 Serbia 82.9 7120666 Serb 5903032 2011\n", 516 | "9 Serbia 0.9 7120666 Montenegrin 64085 2011" 517 | ] 518 | }, 519 | "execution_count": 13, 520 | "metadata": {}, 521 | "output_type": "execute_result" 522 | } 523 | ], 524 | "source": [ 525 | "#3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)\n", 526 | "data = [] \n", 527 | "\n", 528 | "for country in document.findall('country'):\n", 529 | " for node in list(country):\n", 530 | " if node.tag == 'name':\n", 531 | " co = node.text\n", 532 | " elif node.tag == 'population':\n", 533 | " # the last listed population statistic is used\n", 534 | " pop = int(node.text)\n", 535 | " #meas = node.attrib['measured'] --leads to an error, potentially unpopulated at times\n", 536 | " yr = int(node.attrib['year'])\n", 537 | " elif node.tag == 'ethnicgroup':\n", 538 | " eth = node.text\n", 539 | " perc = float(node.attrib['percentage'])\n", 540 | " epop = int(pop * perc / 100.)\n", 541 | " \n", 542 | " data.append({'countries':co, 'country_pop':pop, 'year':yr,\n", 543 | " 'ethnicity':eth, 'country_percentage':perc, 'population':epop})\n", 544 | " \n", 545 | "df = pd.DataFrame(data)\n", 546 | "df.head(10)" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": 15, 552 | "metadata": { 553 | "collapsed": false 554 | }, 555 | "outputs": [ 556 | { 557 | "data": { 558 | "text/html": [ 559 | "
\n", 560 | "\n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | "
ethnicitypopulation
0Han Chinese1245058800
1Indo-Aryan871815583
2European494872201
3African318325104
4Dravidian302713744
5Mestizo157734349
6Bengali146776916
7Russian131856989
8Japanese126534212
9Malay121993548
\n", 621 | "
" 622 | ], 623 | "text/plain": [ 624 | " ethnicity population\n", 625 | "0 Han Chinese 1245058800\n", 626 | "1 Indo-Aryan 871815583\n", 627 | "2 European 494872201\n", 628 | "3 African 318325104\n", 629 | "4 Dravidian 302713744\n", 630 | "5 Mestizo 157734349\n", 631 | "6 Bengali 146776916\n", 632 | "7 Russian 131856989\n", 633 | "8 Japanese 126534212\n", 634 | "9 Malay 121993548" 635 | ] 636 | }, 637 | "execution_count": 15, 638 | "metadata": {}, 639 | "output_type": "execute_result" 640 | } 641 | ], 642 | "source": [ 643 | "df.groupby('ethnicity').population.sum().sort_values(ascending=False).head(10).reset_index()" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": 14, 649 | "metadata": { 650 | "collapsed": true 651 | }, 652 | "outputs": [], 653 | "source": [ 654 | "#4. name and country of a) longest river " 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 17, 660 | "metadata": { 661 | "collapsed": false 662 | }, 663 | "outputs": [ 664 | { 665 | "data": { 666 | "text/html": [ 667 | "
\n", 668 | "\n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | "
countrylengthname
161CO6448Amazonas
\n", 686 | "
" 687 | ], 688 | "text/plain": [ 689 | " country length name\n", 690 | "161 CO 6448 Amazonas" 691 | ] 692 | }, 693 | "execution_count": 17, 694 | "metadata": {}, 695 | "output_type": "execute_result" 696 | } 697 | ], 698 | "source": [ 699 | "rivers_list=[]\n", 700 | "rivers_df = pd.DataFrame()\n", 701 | "for rivers in document.iterfind('river'):\n", 702 | " try:\n", 703 | " rivers_list.append({'name':rivers.find('name').text, 'length':int(rivers.find('length').text), 'country':rivers.find('located').attrib['country']})\n", 704 | " except:\n", 705 | " next\n", 706 | "rivers_df = pd.DataFrame(rivers_list)\n", 707 | "rivers_df.sort_values(by = 'length', ascending=False).head(1)" 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": null, 713 | "metadata": { 714 | "collapsed": true 715 | }, 716 | "outputs": [], 717 | "source": [ 718 | "#b) largest lake" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": 19, 724 | "metadata": { 725 | "collapsed": false 726 | }, 727 | "outputs": [ 728 | { 729 | "data": { 730 | "text/html": [ 731 | "
\n", 732 | "\n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | "
areacountryname
42386400RCaspian Sea
\n", 750 | "
" 751 | ], 752 | "text/plain": [ 753 | " area country name\n", 754 | "42 386400 R Caspian Sea" 755 | ] 756 | }, 757 | "execution_count": 19, 758 | "metadata": {}, 759 | "output_type": "execute_result" 760 | } 761 | ], 762 | "source": [ 763 | "lake_list=[]\n", 764 | "lake_df = pd.DataFrame()\n", 765 | "for lakes in document.iterfind('lake'):\n", 766 | " try:\n", 767 | " lake_list.append({'name':lakes.find('name').text, 'area':int(lakes.find('area').text), 'country':lakes.find('located').attrib['country']})\n", 768 | " except:\n", 769 | " next\n", 770 | "lakes_df = pd.DataFrame(lake_list)\n", 771 | "lakes_df.sort_values(by = 'area', ascending=False).head(1)" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": null, 777 | "metadata": { 778 | "collapsed": true 779 | }, 780 | "outputs": [], 781 | "source": [ 782 | "#c) airport at highest elevation" 783 | ] 784 | }, 785 | { 786 | "cell_type": "code", 787 | "execution_count": 20, 788 | "metadata": { 789 | "collapsed": false 790 | }, 791 | "outputs": [ 792 | { 793 | "name": "stderr", 794 | "output_type": "stream", 795 | "text": [ 796 | "/home/sibi/acad/prog_tools/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:9: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)\n" 797 | ] 798 | }, 799 | { 800 | "data": { 801 | "text/html": [ 802 | "
\n", 803 | "\n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | "
countryelevationname
80BOL4063El Alto Intl
\n", 821 | "
" 822 | ], 823 | "text/plain": [ 824 | " country elevation name\n", 825 | "80 BOL 4063 El Alto Intl" 826 | ] 827 | }, 828 | "execution_count": 20, 829 | "metadata": {}, 830 | "output_type": "execute_result" 831 | } 832 | ], 833 | "source": [ 834 | "ap_list=[]\n", 835 | "ap_df = pd.DataFrame()\n", 836 | "for ap in document.iterfind('airport'):\n", 837 | " try:\n", 838 | " ap_list.append({'name':ap.find('name').text, 'elevation':int(ap.find('elevation').text), 'country':ap.attrib['country']})\n", 839 | " except:\n", 840 | " next\n", 841 | "ap_df = pd.DataFrame(ap_list)\n", 842 | "ap_df.sort('elevation', ascending=False).head(1)\n" 843 | ] 844 | }, 845 | { 846 | "cell_type": "code", 847 | "execution_count": null, 848 | "metadata": { 849 | "collapsed": true 850 | }, 851 | "outputs": [], 852 | "source": [] 853 | } 854 | ], 855 | "metadata": { 856 | "kernelspec": { 857 | "display_name": "Python [Root]", 858 | "language": "python", 859 | "name": "Python [Root]" 860 | }, 861 | "language_info": { 862 | "codemirror_mode": { 863 | "name": "ipython", 864 | "version": 2 865 | }, 866 | "file_extension": ".py", 867 | "mimetype": "text/x-python", 868 | "name": "python", 869 | "nbconvert_exporter": "python", 870 | "pygments_lexer": "ipython2", 871 | "version": "2.7.12" 872 | } 873 | }, 874 | "nbformat": 4, 875 | "nbformat_minor": 0 876 | } 877 | -------------------------------------------------------------------------------- /Data Wrangling/data_wrangling_xml/data_wrangling_xml/.ipynb_checkpoints/sliderule_dsi_xml_exercise-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# XML example and exercise\n", 8 | "****\n", 9 | "+ study examples of accessing nodes in XML tree structure \n", 10 | "+ work on exercise to be completed and submitted\n", 11 | "****\n", 12 | "+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html\n", 13 | "+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial\n", 14 | "****" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "from xml.etree import ElementTree as ET" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## XML example\n", 33 | "\n", 34 | "+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": { 41 | "collapsed": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "document_tree = ET.parse( './data/mondial_database_less.xml' )" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": { 52 | "collapsed": false 53 | }, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "Albania\n", 60 | "Greece\n", 61 | "Macedonia\n", 62 | "Serbia\n", 63 | "Montenegro\n", 64 | "Kosovo\n", 65 | "Andorra\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "# print names of all countries\n", 71 | "for child in document_tree.getroot():\n", 72 | " print child.find('name').text" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [ 82 | { 83 | "name": "stdout", 84 | "output_type": "stream", 85 | "text": [ 86 | "* Albania: Tirana, Shkodër, Durrës, 
Vlorë, Elbasan, Korçë\n", 87 | "* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes\n", 88 | "* Macedonia: Skopje, Kumanovo\n", 89 | "* Serbia: Beograd, Novi Sad, Niš\n", 90 | "* Montenegro: Podgorica\n", 91 | "* Kosovo: Prishtine\n", 92 | "* Andorra: Andorra la Vella\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "# print names of all countries and their cities\n", 98 | "for element in document_tree.iterfind('country'):\n", 99 | " print '* ' + element.find('name').text + ':',\n", 100 | " capitals_string = ''\n", 101 | " for subelement in element.getiterator('city'):\n", 102 | " capitals_string += subelement.find('name').text + ', '\n", 103 | " print capitals_string[:-2]" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "****\n", 111 | "## XML exercise\n", 112 | "\n", 113 | "Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find\n", 114 | "\n", 115 | "1. 10 countries with the lowest infant mortality rates\n", 116 | "2. 10 cities with the largest population\n", 117 | "3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)\n", 118 | "4. name and country of a) longest river, b) largest lake and c) airport at highest elevation" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 6, 124 | "metadata": { 125 | "collapsed": false 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "document = ET.parse( './data/mondial_database.xml' )" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 7, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "#1. 10 countries with the lowest infant mortality rates\n", 141 | "country_im = []\n", 142 | "for country in document.iterfind('country'):\n", 143 | " if country.find('infant_mortality') is not None:\n", 144 | " country_im.append([country.find('name').text,country.find('infant_mortality').text])\n", 145 | " " 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 8, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "import pandas as pd\n", 157 | "im = pd.DataFrame(country_im)\n", 158 | "im.columns = [\"country\",\"infant_moratality\"]" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 9, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [ 168 | { 169 | "data": { 170 | "text/html": [ 171 | "
\n", 172 | "\n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | "
countryinfant_moratality
36Monaco1.81
90Japan2.13
109Bermuda2.48
34Norway2.48
98Singapore2.53
35Sweden2.60
8Czech Republic2.63
72Hong Kong2.73
73Macao3.13
39Iceland3.15
\n", 233 | "
" 234 | ], 235 | "text/plain": [ 236 | " country infant_moratality\n", 237 | "36 Monaco 1.81\n", 238 | "90 Japan 2.13\n", 239 | "109 Bermuda 2.48\n", 240 | "34 Norway 2.48\n", 241 | "98 Singapore 2.53\n", 242 | "35 Sweden 2.60\n", 243 | "8 Czech Republic 2.63\n", 244 | "72 Hong Kong 2.73\n", 245 | "73 Macao 3.13\n", 246 | "39 Iceland 3.15" 247 | ] 248 | }, 249 | "execution_count": 9, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "im.infant_moratality = im.infant_moratality.astype(float)\n", 256 | "im = im.sort_values(by='infant_moratality')\n", 257 | "im.head(10)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 10, 263 | "metadata": { 264 | "collapsed": true 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "#2. 10 cities with the largest population\n", 269 | "populations = []\n", 270 | "for country in document.iterfind('country'):\n", 271 | " if country.find('population') is not None:\n", 272 | " populations.append([country.find('name').text,country.find('population').text])\n", 273 | " " 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 11, 279 | "metadata": { 280 | "collapsed": false 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "pop = pd.DataFrame(populations)\n", 285 | "pop.columns = [\"country\",\"population\"]" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 12, 291 | "metadata": { 292 | "collapsed": false 293 | }, 294 | "outputs": [ 295 | { 296 | "data": { 297 | "text/html": [ 298 | "
\n", 299 | "\n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | "
countrypopulation
166Pitcairn68.0
83Cocos Islands628.0
41Holy See840.0
121Cayman Islands933.0
138Sint Maarten1497.0
170Tokelau1570.0
39Gibraltar1816.0
186Falkland Islands2043.0
159Nauru2066.0
52Svalbard2116.0
\n", 360 | "
" 361 | ], 362 | "text/plain": [ 363 | " country population\n", 364 | "166 Pitcairn 68.0\n", 365 | "83 Cocos Islands 628.0\n", 366 | "41 Holy See 840.0\n", 367 | "121 Cayman Islands 933.0\n", 368 | "138 Sint Maarten 1497.0\n", 369 | "170 Tokelau 1570.0\n", 370 | "39 Gibraltar 1816.0\n", 371 | "186 Falkland Islands 2043.0\n", 372 | "159 Nauru 2066.0\n", 373 | "52 Svalbard 2116.0" 374 | ] 375 | }, 376 | "execution_count": 12, 377 | "metadata": {}, 378 | "output_type": "execute_result" 379 | } 380 | ], 381 | "source": [ 382 | "pop.population = pop.population.astype(float)\n", 383 | "pop = pop.sort_values(by = \"population\")\n", 384 | "pop.head(10)" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 13, 390 | "metadata": { 391 | "collapsed": false 392 | }, 393 | "outputs": [ 394 | { 395 | "data": { 396 | "text/html": [ 397 | "
\n", 398 | "\n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | "
countriescountry_percentagecountry_popethnicitypopulationyear
0Albania95.02800138Albanian26601312011
1Albania3.02800138Greek840042011
2Greece93.010816286Greek100591452011
3Macedonia64.22059794Macedonian13223872011
4Macedonia25.22059794Albanian5190682011
5Macedonia3.92059794Turkish803312011
6Macedonia2.72059794Gypsy556142011
7Macedonia1.82059794Serb370762011
8Serbia82.97120666Serb59030322011
9Serbia0.97120666Montenegrin640852011
\n", 503 | "
" 504 | ], 505 | "text/plain": [ 506 | " countries country_percentage country_pop ethnicity population year\n", 507 | "0 Albania 95.0 2800138 Albanian 2660131 2011\n", 508 | "1 Albania 3.0 2800138 Greek 84004 2011\n", 509 | "2 Greece 93.0 10816286 Greek 10059145 2011\n", 510 | "3 Macedonia 64.2 2059794 Macedonian 1322387 2011\n", 511 | "4 Macedonia 25.2 2059794 Albanian 519068 2011\n", 512 | "5 Macedonia 3.9 2059794 Turkish 80331 2011\n", 513 | "6 Macedonia 2.7 2059794 Gypsy 55614 2011\n", 514 | "7 Macedonia 1.8 2059794 Serb 37076 2011\n", 515 | "8 Serbia 82.9 7120666 Serb 5903032 2011\n", 516 | "9 Serbia 0.9 7120666 Montenegrin 64085 2011" 517 | ] 518 | }, 519 | "execution_count": 13, 520 | "metadata": {}, 521 | "output_type": "execute_result" 522 | } 523 | ], 524 | "source": [ 525 | "#3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)\n", 526 | "data = [] \n", 527 | "\n", 528 | "for country in document.findall('country'):\n", 529 | " for node in list(country):\n", 530 | " if node.tag == 'name':\n", 531 | " co = node.text\n", 532 | " elif node.tag == 'population':\n", 533 | " # the last listed population statistic is used\n", 534 | " pop = int(node.text)\n", 535 | " #meas = node.attrib['measured'] --leads to an error, potentially unpopulated at times\n", 536 | " yr = int(node.attrib['year'])\n", 537 | " elif node.tag == 'ethnicgroup':\n", 538 | " eth = node.text\n", 539 | " perc = float(node.attrib['percentage'])\n", 540 | " epop = int(pop * perc / 100.)\n", 541 | " \n", 542 | " data.append({'countries':co, 'country_pop':pop, 'year':yr,\n", 543 | " 'ethnicity':eth, 'country_percentage':perc, 'population':epop})\n", 544 | " \n", 545 | "df = pd.DataFrame(data)\n", 546 | "df.head(10)" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": 15, 552 | "metadata": { 553 | "collapsed": false 554 | }, 555 | "outputs": [ 556 | { 557 | "data": { 558 | "text/html": [ 559 | "
\n", 560 | "\n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | "
ethnicitypopulation
0Han Chinese1245058800
1Indo-Aryan871815583
2European494872201
3African318325104
4Dravidian302713744
5Mestizo157734349
6Bengali146776916
7Russian131856989
8Japanese126534212
9Malay121993548
\n", 621 | "
" 622 | ], 623 | "text/plain": [ 624 | " ethnicity population\n", 625 | "0 Han Chinese 1245058800\n", 626 | "1 Indo-Aryan 871815583\n", 627 | "2 European 494872201\n", 628 | "3 African 318325104\n", 629 | "4 Dravidian 302713744\n", 630 | "5 Mestizo 157734349\n", 631 | "6 Bengali 146776916\n", 632 | "7 Russian 131856989\n", 633 | "8 Japanese 126534212\n", 634 | "9 Malay 121993548" 635 | ] 636 | }, 637 | "execution_count": 15, 638 | "metadata": {}, 639 | "output_type": "execute_result" 640 | } 641 | ], 642 | "source": [ 643 | "df.groupby('ethnicity').population.sum().sort_values(ascending=False).head(10).reset_index()" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": 14, 649 | "metadata": { 650 | "collapsed": true 651 | }, 652 | "outputs": [], 653 | "source": [ 654 | "#4. name and country of a) longest river " 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 17, 660 | "metadata": { 661 | "collapsed": false 662 | }, 663 | "outputs": [ 664 | { 665 | "data": { 666 | "text/html": [ 667 | "
\n", 668 | "\n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | "
countrylengthname
161CO6448Amazonas
\n", 686 | "
" 687 | ], 688 | "text/plain": [ 689 | " country length name\n", 690 | "161 CO 6448 Amazonas" 691 | ] 692 | }, 693 | "execution_count": 17, 694 | "metadata": {}, 695 | "output_type": "execute_result" 696 | } 697 | ], 698 | "source": [ 699 | "rivers_list=[]\n", 700 | "rivers_df = pd.DataFrame()\n", 701 | "for rivers in document.iterfind('river'):\n", 702 | " try:\n", 703 | " rivers_list.append({'name':rivers.find('name').text, 'length':int(rivers.find('length').text), 'country':rivers.find('located').attrib['country']})\n", 704 | " except:\n", 705 | " next\n", 706 | "rivers_df = pd.DataFrame(rivers_list)\n", 707 | "rivers_df.sort_values(by = 'length', ascending=False).head(1)" 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": null, 713 | "metadata": { 714 | "collapsed": true 715 | }, 716 | "outputs": [], 717 | "source": [ 718 | "#b) largest lake" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": 19, 724 | "metadata": { 725 | "collapsed": false 726 | }, 727 | "outputs": [ 728 | { 729 | "data": { 730 | "text/html": [ 731 | "
\n", 732 | "\n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | "
areacountryname
42386400RCaspian Sea
\n", 750 | "
" 751 | ], 752 | "text/plain": [ 753 | " area country name\n", 754 | "42 386400 R Caspian Sea" 755 | ] 756 | }, 757 | "execution_count": 19, 758 | "metadata": {}, 759 | "output_type": "execute_result" 760 | } 761 | ], 762 | "source": [ 763 | "lake_list=[]\n", 764 | "lake_df = pd.DataFrame()\n", 765 | "for lakes in document.iterfind('lake'):\n", 766 | " try:\n", 767 | " lake_list.append({'name':lakes.find('name').text, 'area':int(lakes.find('area').text), 'country':lakes.find('located').attrib['country']})\n", 768 | " except:\n", 769 | " next\n", 770 | "lakes_df = pd.DataFrame(lake_list)\n", 771 | "lakes_df.sort_values(by = 'area', ascending=False).head(1)" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": null, 777 | "metadata": { 778 | "collapsed": true 779 | }, 780 | "outputs": [], 781 | "source": [ 782 | "#c) airport at highest elevation" 783 | ] 784 | }, 785 | { 786 | "cell_type": "code", 787 | "execution_count": 20, 788 | "metadata": { 789 | "collapsed": false 790 | }, 791 | "outputs": [ 792 | { 793 | "name": "stderr", 794 | "output_type": "stream", 795 | "text": [ 796 | "/home/sibi/acad/prog_tools/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:9: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)\n" 797 | ] 798 | }, 799 | { 800 | "data": { 801 | "text/html": [ 802 | "
\n", 803 | "\n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | "
countryelevationname
80BOL4063El Alto Intl
\n", 821 | "
" 822 | ], 823 | "text/plain": [ 824 | " country elevation name\n", 825 | "80 BOL 4063 El Alto Intl" 826 | ] 827 | }, 828 | "execution_count": 20, 829 | "metadata": {}, 830 | "output_type": "execute_result" 831 | } 832 | ], 833 | "source": [ 834 | "ap_list=[]\n", 835 | "ap_df = pd.DataFrame()\n", 836 | "for ap in document.iterfind('airport'):\n", 837 | " try:\n", 838 | " ap_list.append({'name':ap.find('name').text, 'elevation':int(ap.find('elevation').text), 'country':ap.attrib['country']})\n", 839 | " except:\n", 840 | " next\n", 841 | "ap_df = pd.DataFrame(ap_list)\n", 842 | "ap_df.sort('elevation', ascending=False).head(1)\n" 843 | ] 844 | }, 845 | { 846 | "cell_type": "code", 847 | "execution_count": null, 848 | "metadata": { 849 | "collapsed": true 850 | }, 851 | "outputs": [], 852 | "source": [] 853 | } 854 | ], 855 | "metadata": { 856 | "kernelspec": { 857 | "display_name": "Python [Root]", 858 | "language": "python", 859 | "name": "Python [Root]" 860 | }, 861 | "language_info": { 862 | "codemirror_mode": { 863 | "name": "ipython", 864 | "version": 2 865 | }, 866 | "file_extension": ".py", 867 | "mimetype": "text/x-python", 868 | "name": "python", 869 | "nbconvert_exporter": "python", 870 | "pygments_lexer": "ipython2", 871 | "version": "2.7.12" 872 | } 873 | }, 874 | "nbformat": 4, 875 | "nbformat_minor": 0 876 | } 877 | -------------------------------------------------------------------------------- /Capstone Project/.ipynb_checkpoints/Classification_Adding_Shots-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import scipy.stats as scipy" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 3, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "df = pd.read_csv(\"./Data/E0_13.csv\")\n", 25 | "df_14 = pd.read_csv(\"./Data/E0_14.csv\")" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 4, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/plain": [ 38 | "Index([u'Div', u'Date', u'HomeTeam', u'AwayTeam', u'FTHG', u'FTAG', u'FTR',\n", 39 | " u'HTHG', u'HTAG', u'HTR', u'Referee', u'HS', u'AS', u'HST', u'AST',\n", 40 | " u'HF', u'AF', u'HC', u'AC', u'HY', u'AY', u'HR', u'AR', u'B365H',\n", 41 | " u'B365D', u'B365A', u'BWH', u'BWD', u'BWA', u'IWH', u'IWD', u'IWA',\n", 42 | " u'LBH', u'LBD', u'LBA', u'PSH', u'PSD', u'PSA', u'WHH', u'WHD', u'WHA',\n", 43 | " u'SJH', u'SJD', u'SJA', u'VCH', u'VCD', u'VCA', u'Bb1X2', u'BbMxH',\n", 44 | " u'BbAvH', u'BbMxD', u'BbAvD', u'BbMxA', u'BbAvA', u'BbOU', u'BbMx>2.5',\n", 45 | " u'BbAv>2.5', u'BbMx<2.5', u'BbAv<2.5', u'BbAH', u'BbAHh', u'BbMxAHH',\n", 46 | " u'BbAvAHH', u'BbMxAHA', u'BbAvAHA', u'PSCH', u'PSCD', u'PSCA'],\n", 47 | " dtype='object')" 48 | ] 49 | }, 50 | "execution_count": 4, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "df.columns" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 5, 62 | "metadata": { 63 | "collapsed": false 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "res_13 = df.ix[:,:23]\n", 68 | "res_13 = res_13.drop(['Div','Date','Referee'],axis=1)\n", 69 | "res_14 = df_14.ix[:,:23]\n", 70 | "res_14 = res_14.drop(['Div','Date','Referee'],axis=1)\n", 71 
| "table_features = df.ix[:,:7]\n", 72 | "table_features = table_features.drop(['FTHG','FTAG','Div','Date'],axis=1)\n", 73 | "bet_13 = df.ix[:,23:]\n" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 6, 79 | "metadata": { 80 | "collapsed": false, 81 | "scrolled": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "res_13.head()\n", 86 | "feature_table = df.ix[:,:23]" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 7, 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "#Team, Home Goals Score, Away Goals Score, Attack Strength, Home Goals Conceded, Away Goals Conceded, Defensive Strength\n", 98 | "table_13 = pd.DataFrame(columns=('Team','HGS','AGS','HAS','AAS','HGC','AGC','HDS','ADS'))" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 8, 104 | "metadata": { 105 | "collapsed": false 106 | }, 107 | "outputs": [ 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "Average number of goals at home 1.57368421053\n", 113 | "Average number of goals away 1.19473684211\n", 114 | "Average number of goals conceded at home 1.57368421053\n", 115 | "Average number of goals conceded away 1.19473684211\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "avg_home_scored_13 = res_13.FTHG.sum() / 380.0\n", 121 | "avg_away_scored_13 = res_13.FTAG.sum() / 380.0\n", 122 | "avg_home_conceded_13 = avg_away_scored_13\n", 123 | "avg_away_conceded_13 = avg_home_scored_13\n", 124 | "print \"Average number of goals at home\",avg_home_scored_13\n", 125 | "print \"Average number of goals away\", avg_away_scored_13\n", 126 | "print \"Average number of goals conceded at home\",avg_away_conceded_13\n", 127 | "print \"Average number of goals conceded away\",avg_home_conceded_13\n" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 9, 133 | "metadata": { 134 | "collapsed": false 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "res_home = res_13.groupby('HomeTeam')\n", 139 | "res_away = res_13.groupby('AwayTeam')" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 10, 145 | "metadata": { 146 | "collapsed": false 147 | }, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "('Arsenal',\n", 153 | " HomeTeam AwayTeam FTHG FTAG FTR HTHG HTAG HTR HS AS HST \\\n", 154 | " 0 Arsenal Aston Villa 1 3 A 1 1 D 16 9 4 \n", 155 | " 27 Arsenal Tottenham 1 0 H 1 0 H 12 14 5 \n", 156 | " 46 Arsenal Stoke 3 1 H 2 1 H 16 9 8 \n", 157 | " 70 Arsenal Norwich 4 1 H 1 0 H 20 12 11 \n", 158 | " 90 Arsenal Liverpool 2 0 H 1 0 H 12 12 7 \n", 159 | " 110 Arsenal Southampton 2 0 H 1 0 H 9 10 4 \n", 160 | " 131 Arsenal Hull 2 0 H 1 0 H 20 7 7 \n", 161 | " 147 Arsenal Everton 1 1 D 0 0 D 11 12 5 \n", 162 | " 169 Arsenal Chelsea 0 0 D 0 0 D 7 13 2 \n", 163 | " 190 Arsenal Cardiff 2 0 H 0 0 D 28 8 6 \n", 164 | " 210 Arsenal Fulham 2 0 H 0 0 D 22 8 8 \n", 165 | " 237 Arsenal Crystal Palace 2 0 H 0 0 D 11 10 6 \n", 166 | " 254 Arsenal Man United 0 0 D 0 0 D 17 6 5 \n", 167 | " 258 Arsenal Sunderland 4 1 H 3 0 H 12 7 9 \n", 168 | " 301 Arsenal Swansea 2 2 D 0 1 A 13 8 4 \n", 169 | " 306 Arsenal Man City 1 1 D 0 1 A 10 15 3 \n", 170 | " 334 Arsenal West Ham 3 1 H 1 1 D 14 12 8 \n", 171 | " 356 Arsenal Newcastle 3 0 H 2 0 H 20 8 8 \n", 172 | " 364 Arsenal West Brom 1 0 H 1 0 H 15 11 4 \n", 173 | " \n", 174 | " AST HF AF HC AC HY AY HR AR \n", 175 | " 0 4 15 18 4 3 4 5 1 0 \n", 176 | " 27 4 15 14 3 6 2 2 0 0 \n", 177 | " 46 3 8 15 6 7 0 
2 0 0 \n", 178 | " 70 6 8 7 10 1 0 0 0 0 \n", 179 | " 90 4 11 7 3 5 2 1 0 0 \n", 180 | " 110 4 10 14 5 6 0 3 0 0 \n", 181 | " 131 2 9 6 11 1 0 0 0 0 \n", 182 | " 147 4 13 11 3 2 0 4 0 0 \n", 183 | " 169 4 7 11 8 6 2 1 0 0 \n", 184 | " 190 2 6 11 12 2 1 2 0 0 \n", 185 | " 210 2 5 7 4 4 0 0 0 0 \n", 186 | " 237 2 9 14 6 5 1 2 0 0 \n", 187 | " 254 2 10 14 5 5 1 2 0 0 \n", 188 | " 258 3 10 11 7 3 0 1 0 0 \n", 189 | " 301 2 11 12 7 0 0 1 0 0 \n", 190 | " 306 4 8 11 6 6 1 4 0 0 \n", 191 | " 334 2 14 12 4 3 2 2 0 0 \n", 192 | " 356 3 9 8 14 0 3 2 0 0 \n", 193 | " 364 1 9 6 9 10 2 2 0 0 )" 194 | ] 195 | }, 196 | "execution_count": 10, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | } 200 | ], 201 | "source": [ 202 | "list(res_home)[0]" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 11, 208 | "metadata": { 209 | "collapsed": false 210 | }, 211 | "outputs": [ 212 | { 213 | "data": { 214 | "text/html": [ 215 | "
\n", 216 | "\n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | "
TeamHGSAGSHASAASHGCAGCHDSADS
0Arsenal3632NaNNaN1130NaNNaN
1Aston Villa2217NaNNaN2932NaNNaN
2Cardiff2012NaNNaN3539NaNNaN
3Chelsea4328NaNNaN1116NaNNaN
4Crystal Palace1815NaNNaN2325NaNNaN
\n", 294 | "
" 295 | ], 296 | "text/plain": [ 297 | " Team HGS AGS HAS AAS HGC AGC HDS ADS\n", 298 | "0 Arsenal 36 32 NaN NaN 11 30 NaN NaN\n", 299 | "1 Aston Villa 22 17 NaN NaN 29 32 NaN NaN\n", 300 | "2 Cardiff 20 12 NaN NaN 35 39 NaN NaN\n", 301 | "3 Chelsea 43 28 NaN NaN 11 16 NaN NaN\n", 302 | "4 Crystal Palace 18 15 NaN NaN 23 25 NaN NaN" 303 | ] 304 | }, 305 | "execution_count": 11, 306 | "metadata": {}, 307 | "output_type": "execute_result" 308 | } 309 | ], 310 | "source": [ 311 | "table_13.Team = res_home.HomeTeam.all().values\n", 312 | "table_13.HGS = res_home.FTHG.sum().values\n", 313 | "table_13.HGC = res_home.FTAG.sum().values\n", 314 | "table_13.AGS = res_away.FTAG.sum().values\n", 315 | "table_13.AGC = res_away.FTHG.sum().values\n", 316 | "table_13.head()" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 12, 322 | "metadata": { 323 | "collapsed": false 324 | }, 325 | "outputs": [ 326 | { 327 | "data": { 328 | "text/html": [ 329 | "
\n", 330 | "\n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | "
TeamHGSAGSHASAASHGCAGCHDSADS
0Arsenal36321.2040131.40969211300.4845811.003344
1Aston Villa22170.7357860.74889929321.2775331.070234
2Cardiff20120.6688960.52863435391.5418501.304348
3Chelsea43281.4381271.23348011160.4845810.535117
4Crystal Palace18150.6020070.66079323251.0132160.836120
\n", 408 | "
" 409 | ], 410 | "text/plain": [ 411 | " Team HGS AGS HAS AAS HGC AGC HDS ADS\n", 412 | "0 Arsenal 36 32 1.204013 1.409692 11 30 0.484581 1.003344\n", 413 | "1 Aston Villa 22 17 0.735786 0.748899 29 32 1.277533 1.070234\n", 414 | "2 Cardiff 20 12 0.668896 0.528634 35 39 1.541850 1.304348\n", 415 | "3 Chelsea 43 28 1.438127 1.233480 11 16 0.484581 0.535117\n", 416 | "4 Crystal Palace 18 15 0.602007 0.660793 23 25 1.013216 0.836120" 417 | ] 418 | }, 419 | "execution_count": 12, 420 | "metadata": {}, 421 | "output_type": "execute_result" 422 | } 423 | ], 424 | "source": [ 425 | "table_13.HAS = (table_13.HGS / 19.0) / avg_home_scored_13\n", 426 | "table_13.AAS = (table_13.AGS / 19.0) / avg_away_scored_13\n", 427 | "table_13.HDS = (table_13.HGC / 19.0) / avg_home_conceded_13\n", 428 | "table_13.ADS = (table_13.AGC / 19.0) / avg_away_conceded_13\n", 429 | "table_13.head()" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 15, 435 | "metadata": { 436 | "collapsed": false 437 | }, 438 | "outputs": [], 439 | "source": [ 440 | "feature_table = feature_table[['HomeTeam','AwayTeam','FTR','HST','AST']]\n", 441 | "f_HAS = []\n", 442 | "f_HDS = []\n", 443 | "f_AAS = []\n", 444 | "f_ADS = []\n", 445 | "for index,row in feature_table.iterrows():\n", 446 | " f_HAS.append(table_13[table_13['Team'] == row['HomeTeam']]['HAS'].values[0])\n", 447 | " f_HDS.append(table_13[table_13['Team'] == row['HomeTeam']]['HDS'].values[0])\n", 448 | " f_AAS.append(table_13[table_13['Team'] == row['HomeTeam']]['AAS'].values[0])\n", 449 | " f_ADS.append(table_13[table_13['Team'] == row['HomeTeam']]['ADS'].values[0])\n", 450 | " \n", 451 | "feature_table['HAS'] = f_HAS\n", 452 | "feature_table['HDS'] = f_HDS\n", 453 | "feature_table['AAS'] = f_AAS\n", 454 | "feature_table['ADS'] = f_ADS" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 16, 460 | "metadata": { 461 | "collapsed": false 462 | }, 463 | "outputs": [ 464 | { 465 | "data": { 466 | "text/html": [ 467 | "
\n", 468 | "\n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | "
HomeTeamAwayTeamFTRHSTASTHASHDSAASADS
0ArsenalAston VillaA441.2040130.4845811.4096921.003344
1LiverpoolStokeH1141.7725750.7929522.1145371.070234
2NorwichEvertonD260.5685620.7929520.4845811.471572
3SunderlandFulhamA310.7023411.1894270.8810571.103679
4SwanseaMan UnitedA671.1036791.1453740.9251100.936455
\n", 546 | "
" 547 | ], 548 | "text/plain": [ 549 | " HomeTeam AwayTeam FTR HST AST HAS HDS AAS \\\n", 550 | "0 Arsenal Aston Villa A 4 4 1.204013 0.484581 1.409692 \n", 551 | "1 Liverpool Stoke H 11 4 1.772575 0.792952 2.114537 \n", 552 | "2 Norwich Everton D 2 6 0.568562 0.792952 0.484581 \n", 553 | "3 Sunderland Fulham A 3 1 0.702341 1.189427 0.881057 \n", 554 | "4 Swansea Man United A 6 7 1.103679 1.145374 0.925110 \n", 555 | "\n", 556 | " ADS \n", 557 | "0 1.003344 \n", 558 | "1 1.070234 \n", 559 | "2 1.471572 \n", 560 | "3 1.103679 \n", 561 | "4 0.936455 " 562 | ] 563 | }, 564 | "execution_count": 16, 565 | "metadata": {}, 566 | "output_type": "execute_result" 567 | } 568 | ], 569 | "source": [ 570 | "feature_table.head()" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 17, 576 | "metadata": { 577 | "collapsed": true 578 | }, 579 | "outputs": [], 580 | "source": [ 581 | "def transformResult(row):\n", 582 | " if(row.FTR == 'H'):\n", 583 | " return 1\n", 584 | " elif(row.FTR == 'A'):\n", 585 | " return -1\n", 586 | " else:\n", 587 | " return 0" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 18, 593 | "metadata": { 594 | "collapsed": false 595 | }, 596 | "outputs": [], 597 | "source": [ 598 | "feature_table[\"Result\"] = feature_table.apply(lambda row: transformResult(row),axis=1)" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": 19, 604 | "metadata": { 605 | "collapsed": false 606 | }, 607 | "outputs": [ 608 | { 609 | "data": { 610 | "text/html": [ 611 | "
\n", 612 | "\n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | "
HomeTeamAwayTeamFTRHSTASTHASHDSAASADSResult
0ArsenalAston VillaA441.2040130.4845811.4096921.003344-1
1LiverpoolStokeH1141.7725750.7929522.1145371.0702341
2NorwichEvertonD260.5685620.7929520.4845811.4715720
3SunderlandFulhamA310.7023411.1894270.8810571.103679-1
4SwanseaMan UnitedA671.1036791.1453740.9251100.936455-1
\n", 696 | "
" 697 | ], 698 | "text/plain": [ 699 | " HomeTeam AwayTeam FTR HST AST HAS HDS AAS \\\n", 700 | "0 Arsenal Aston Villa A 4 4 1.204013 0.484581 1.409692 \n", 701 | "1 Liverpool Stoke H 11 4 1.772575 0.792952 2.114537 \n", 702 | "2 Norwich Everton D 2 6 0.568562 0.792952 0.484581 \n", 703 | "3 Sunderland Fulham A 3 1 0.702341 1.189427 0.881057 \n", 704 | "4 Swansea Man United A 6 7 1.103679 1.145374 0.925110 \n", 705 | "\n", 706 | " ADS Result \n", 707 | "0 1.003344 -1 \n", 708 | "1 1.070234 1 \n", 709 | "2 1.471572 0 \n", 710 | "3 1.103679 -1 \n", 711 | "4 0.936455 -1 " 712 | ] 713 | }, 714 | "execution_count": 19, 715 | "metadata": {}, 716 | "output_type": "execute_result" 717 | } 718 | ], 719 | "source": [ 720 | "feature_table.head()" 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": 21, 726 | "metadata": { 727 | "collapsed": true 728 | }, 729 | "outputs": [], 730 | "source": [ 731 | "X_train = feature_table[['HST','AST','HAS','HDS','AAS','ADS']]\n", 732 | "y_train = feature_table['Result']" 733 | ] 734 | }, 735 | { 736 | "cell_type": "code", 737 | "execution_count": 27, 738 | "metadata": { 739 | "collapsed": false 740 | }, 741 | "outputs": [], 742 | "source": [ 743 | "from sklearn.tree import DecisionTreeClassifier\n", 744 | "from sklearn.naive_bayes import MultinomialNB\n", 745 | "from xgboost import XGBClassifier\n", 746 | "from sklearn.metrics import accuracy_score\n", 747 | "from sklearn.model_selection import cross_val_score\n" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": 43, 753 | "metadata": { 754 | "collapsed": false 755 | }, 756 | "outputs": [ 757 | { 758 | "data": { 759 | "text/plain": [ 760 | "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n", 761 | " max_features=None, max_leaf_nodes=None,\n", 762 | " min_impurity_split=1e-07, min_samples_leaf=1,\n", 763 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", 764 | " presort=False, random_state=None, splitter='best')" 765 | ] 766 | }, 767 | "execution_count": 43, 768 | "metadata": {}, 769 | "output_type": "execute_result" 770 | } 771 | ], 772 | "source": [ 773 | "clf = DecisionTreeClassifier()\n", 774 | "clf2 = MultinomialNB()\n", 775 | "clf3 = XGBClassifier()\n", 776 | "clf.fit(X_train,y_train)" 777 | ] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "execution_count": 45, 782 | "metadata": { 783 | "collapsed": false 784 | }, 785 | "outputs": [], 786 | "source": [ 787 | "# y_pred = clf3.predict(X_train)\n", 788 | "accuracy_score(y_pred,y_train)\n", 789 | "scores = cross_val_score(clf2, X_train, y_train, cv=10)\n" 790 | ] 791 | }, 792 | { 793 | "cell_type": "code", 794 | "execution_count": 46, 795 | "metadata": { 796 | "collapsed": false 797 | }, 798 | "outputs": [ 799 | { 800 | "name": "stdout", 801 | "output_type": "stream", 802 | "text": [ 803 | "[ 0.56410256 0.64102564 0.56410256 0.57894737 0.65789474 0.65789474\n", 804 | " 0.65789474 0.65789474 0.54054054 0.75 ]\n", 805 | "0.627029762556\n" 806 | ] 807 | } 808 | ], 809 | "source": [ 810 | "print scores\n", 811 | "print scores.mean()" 812 | ] 813 | } 814 | ], 815 | "metadata": { 816 | "kernelspec": { 817 | "display_name": "Python [Root]", 818 | "language": "python", 819 | "name": "Python [Root]" 820 | }, 821 | "language_info": { 822 | "codemirror_mode": { 823 | "name": "ipython", 824 | "version": 2 825 | }, 826 | "file_extension": ".py", 827 | "mimetype": "text/x-python", 828 | "name": "python", 829 | "nbconvert_exporter": "python", 830 | "pygments_lexer": "ipython2", 831 | 
"version": "2.7.12" 832 | } 833 | }, 834 | "nbformat": 4, 835 | "nbformat_minor": 0 836 | } 837 | --------------------------------------------------------------------------------