├── Machine Learning
│   ├── naive_bayes
│   │   ├── vsm.png
│   │   ├── terms.png
│   │   ├── terms2.png
│   │   └── callibration.png
│   ├── clustering
│   │   └── WineKMC.xlsx
│   ├── logistic_regression
│   │   ├── images
│   │   │   ├── bias.png
│   │   │   ├── data.png
│   │   │   ├── knn1.png
│   │   │   ├── knn2.png
│   │   │   ├── linreg.png
│   │   │   ├── linsep.png
│   │   │   ├── pcanim.gif
│   │   │   ├── reshape.jpg
│   │   │   ├── sklearn2.jpg
│   │   │   ├── train-cv2.png
│   │   │   ├── train-cv3.png
│   │   │   ├── onelinesplit.png
│   │   │   ├── sklearntrans.jpg
│   │   │   ├── train-test.png
│   │   │   ├── train-validate-test.png
│   │   │   ├── complexity-error-plot.png
│   │   │   ├── complexity-error-reg.png
│   │   │   ├── train-validate-test3.png
│   │   │   └── train-validate-test-cont.png
│   │   └── .gitignore
│   └── linear_regression
│       ├── images
│       │   ├── shuttle.png
│       │   ├── cs109gitflow3.png
│       │   └── conditionalmean.png
│       └── .gitignore
├── Capstone Project
│   ├── Final Report
│   │   ├── Report.pdf
│   │   └── Slide Deck.pdf
│   ├── Capstone Project Proposal.pdf
│   ├── Data
│   │   └── notes.txt
│   ├── Notebooks
│   │   └── Classification_Baseline.ipynb
│   └── .ipynb_checkpoints
│       ├── Classification_Baseline-checkpoint.ipynb
│       └── Classification_Adding_Shots-checkpoint.ipynb
├── Data Wrangling
│   ├── data_wrangling_json
│   │   ├── .DS_Store
│   │   ├── data
│   │   │   ├── .DS_Store
│   │   │   └── world_bank_projects_less.json
│   │   └── .ipynb_checkpoints
│   │       └── sliderule_dsi_xml_exercise-checkpoint.ipynb
│   └── data_wrangling_xml
│       └── data_wrangling_xml
│           ├── .DS_Store
│           ├── sliderule_dsi_xml_exercise.ipynb
│           └── .ipynb_checkpoints
│               └── sliderule_dsi_xml_exercise-checkpoint.ipynb
├── Inferential Statistics
│   ├── statistics project 1
│   │   ├── .DS_Store
│   │   ├── data
│   │   │   ├── .DS_Store
│   │   │   └── human_body_temperature.csv
│   │   └── .ipynb_checkpoints
│   │       ├── sliderule_dsi_inferential_statistics_exercise_1-checkpoint.ipynb
│   │       └── sliderule_dsi_inferential_statistics_exercise_2-checkpoint.ipynb
│   ├── statistics project 2
│   │   ├── .DS_Store
│   │   ├── data
│   │   │   ├── .DS_Store
│   │   │   └── us_job_market_discrimination.dta
│   │   ├── .ipynb_checkpoints
│   │   │   ├── sliderule_dsi_inferential_statistics_exercise_1-checkpoint.ipynb
│   │   │   └── sliderule_dsi_inferential_statistics_exercise_2-checkpoint.ipynb
│   │   └── sliderule_dsi_inferential_statistics_exercise_2.ipynb
│   └── statistics project 3
│       ├── .DS_Store
│       ├── data
│       │   └── .DS_Store
│       └── .ipynb_checkpoints
│           ├── sliderule_dsi_inferential_statistics_exercise_1-checkpoint.ipynb
│           └── sliderule_dsi_inferential_statistics_exercise_2-checkpoint.ipynb
└── README.md
/Machine Learning/naive_bayes/vsm.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/naive_bayes/vsm.png
--------------------------------------------------------------------------------
/Capstone Project/Final Report/Report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Capstone Project/Final Report/Report.pdf
--------------------------------------------------------------------------------
/Machine Learning/clustering/WineKMC.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/clustering/WineKMC.xlsx
--------------------------------------------------------------------------------
/Machine Learning/naive_bayes/terms.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/naive_bayes/terms.png
--------------------------------------------------------------------------------
/Machine Learning/naive_bayes/terms2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/naive_bayes/terms2.png
--------------------------------------------------------------------------------
/Capstone Project/Final Report/Slide Deck.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Capstone Project/Final Report/Slide Deck.pdf
--------------------------------------------------------------------------------
/Data Wrangling/data_wrangling_json/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Data Wrangling/data_wrangling_json/.DS_Store
--------------------------------------------------------------------------------
/Machine Learning/naive_bayes/callibration.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/naive_bayes/callibration.png
--------------------------------------------------------------------------------
/Capstone Project/Capstone Project Proposal.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Capstone Project/Capstone Project Proposal.pdf
--------------------------------------------------------------------------------
/Data Wrangling/data_wrangling_json/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Data Wrangling/data_wrangling_json/data/.DS_Store
--------------------------------------------------------------------------------
/Machine Learning/logistic_regression/images/bias.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/bias.png
--------------------------------------------------------------------------------
/Machine Learning/logistic_regression/images/data.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/data.png
--------------------------------------------------------------------------------
/Machine Learning/logistic_regression/images/knn1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/knn1.png
--------------------------------------------------------------------------------
/Machine Learning/logistic_regression/images/knn2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/knn2.png
--------------------------------------------------------------------------------
/Inferential Statistics/statistics project 1/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Inferential Statistics/statistics project 1/.DS_Store
--------------------------------------------------------------------------------
/Inferential Statistics/statistics project 2/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Inferential Statistics/statistics project 2/.DS_Store
--------------------------------------------------------------------------------
/Inferential Statistics/statistics project 3/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Inferential Statistics/statistics project 3/.DS_Store
--------------------------------------------------------------------------------
/Machine Learning/linear_regression/images/shuttle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/linear_regression/images/shuttle.png
--------------------------------------------------------------------------------
/Machine Learning/logistic_regression/images/linreg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/linreg.png
--------------------------------------------------------------------------------
/Machine Learning/logistic_regression/images/linsep.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/linsep.png
--------------------------------------------------------------------------------
/Machine Learning/logistic_regression/images/pcanim.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/pcanim.gif
--------------------------------------------------------------------------------
/Machine Learning/logistic_regression/images/reshape.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/reshape.jpg
--------------------------------------------------------------------------------
/Machine Learning/logistic_regression/images/sklearn2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/sklearn2.jpg
--------------------------------------------------------------------------------
/Machine Learning/logistic_regression/images/train-cv2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/train-cv2.png
--------------------------------------------------------------------------------
/Machine Learning/logistic_regression/images/train-cv3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/train-cv3.png
--------------------------------------------------------------------------------
/Inferential Statistics/statistics project 1/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Inferential Statistics/statistics project 1/data/.DS_Store
--------------------------------------------------------------------------------
/Inferential Statistics/statistics project 2/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Inferential Statistics/statistics project 2/data/.DS_Store
--------------------------------------------------------------------------------
/Inferential Statistics/statistics project 3/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Inferential Statistics/statistics project 3/data/.DS_Store
--------------------------------------------------------------------------------
/Machine Learning/linear_regression/images/cs109gitflow3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/linear_regression/images/cs109gitflow3.png
--------------------------------------------------------------------------------
/Machine Learning/logistic_regression/images/onelinesplit.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/onelinesplit.png
--------------------------------------------------------------------------------
/Machine Learning/logistic_regression/images/sklearntrans.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/sklearntrans.jpg
--------------------------------------------------------------------------------
/Machine Learning/logistic_regression/images/train-test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/train-test.png
--------------------------------------------------------------------------------
/Data Wrangling/data_wrangling_xml/data_wrangling_xml/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Data Wrangling/data_wrangling_xml/data_wrangling_xml/.DS_Store
--------------------------------------------------------------------------------
/Machine Learning/linear_regression/images/conditionalmean.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/linear_regression/images/conditionalmean.png
--------------------------------------------------------------------------------
/Machine Learning/logistic_regression/images/train-validate-test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/train-validate-test.png
--------------------------------------------------------------------------------
/Machine Learning/logistic_regression/images/complexity-error-plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/complexity-error-plot.png
--------------------------------------------------------------------------------
/Machine Learning/logistic_regression/images/complexity-error-reg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/complexity-error-reg.png
--------------------------------------------------------------------------------
/Machine Learning/logistic_regression/images/train-validate-test3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/train-validate-test3.png
--------------------------------------------------------------------------------
/Machine Learning/logistic_regression/images/train-validate-test-cont.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/train-validate-test-cont.png
--------------------------------------------------------------------------------
/Inferential Statistics/statistics project 2/data/us_job_market_discrimination.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Inferential Statistics/statistics project 2/data/us_job_market_discrimination.dta
--------------------------------------------------------------------------------
/Machine Learning/logistic_regression/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 |
26 | # PyInstaller
27 | # Usually these files are written by a python script from a template
28 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 |
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 |
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *.cover
45 |
46 | # Translations
47 | *.mo
48 | *.pot
49 |
50 | # Django stuff:
51 | *.log
52 |
53 | # Sphinx documentation
54 | docs/_build/
55 |
56 | # PyBuilder
57 | target/
58 |
59 | #Ipython
60 | .ipynb_checkpoints/
61 | # Created by .ignore support plugin (hsz.mobi)
62 | ### OSX template
63 | .DS_Store
64 | .AppleDouble
65 | .LSOverride
66 |
67 | # Icon must end with two \r
68 | Icon
69 |
70 | # Thumbnails
71 | ._*
72 |
73 | # Files that might appear in the root of a volume
74 | .DocumentRevisions-V100
75 | .fseventsd
76 | .Spotlight-V100
77 | .TemporaryItems
78 | .Trashes
79 | .VolumeIcon.icns
80 |
81 | # Directories potentially created on remote AFP share
82 | .AppleDB
83 | .AppleDesktop
84 | Network Trash Folder
85 | Temporary Items
86 | .apdisk
87 |
88 | #Temporary data
89 | tempdata/
90 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Springboard
2 |
3 | Projects completed as part of Springboard's Data Science Intensive curriculum.
4 |
5 | ### Capstone Project: Football, Goals and Machine Learning
6 | An attempt to model the highly unpredictable English Premier League and predict the results of each match.
7 |
8 | ### Data Story
9 | Do home teams really have an advantage in football? Is that advantage shrinking in the English Premier League? How predictable are football leagues anyway? Data to the rescue!
10 |
11 | ### Data Wrangling
12 | Practice cleaning up messy data with pandas: XML, JSON, raw text, and databases.
13 |
14 | ### Inferential Statistics
15 | Useful inferential statistics for drawing conclusions and predicting outcomes.
16 | Contains three mini-projects:
17 | * Human Body Temperature - hypothesis testing, confidence intervals, and statistical significance
18 | * Examining Racial Discrimination - does race have a significant impact on the rate of callbacks?
19 | * Reducing Hospital Readmissions - statistical analysis of hospital readmission rates and how to reduce them
20 |
21 | ### Machine Learning
22 | Explores various machine learning models, their advantages, and their limitations.
23 | Contains the following mini-projects:
24 | * Boston House Pricing - predicting housing prices in Boston using linear regression
25 | * Heights and Weights - using logistic regression to classify gender
26 | * Predicting Movie Ratings - using the naive Bayes algorithm to predict movie ratings from their reviews
27 | * Customer Segmentation - applying k-means clustering and associated evaluation metrics to partitioning problems
28 |
--------------------------------------------------------------------------------
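As a rough illustration of the kind of pipeline the "Predicting Movie Ratings" mini-project describes, here is a minimal scikit-learn sketch of naive Bayes over bag-of-words counts. It is not taken from the repository; the reviews and labels below are invented toy data.

```python
# Hedged sketch of a bag-of-words naive Bayes text classifier; toy data only.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

reviews = ["great film, loved it", "terrible plot and acting",
           "wonderful performances", "boring and slow"]
labels = ["good", "bad", "good", "bad"]

# Vectorize the text into token counts, then fit a multinomial naive Bayes model.
model = make_pipeline(CountVectorizer(), MultinomialNB())
model.fit(reviews, labels)
print(model.predict(["slow but wonderful"]))
```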
/Machine Learning/linear_regression/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 |
5 | # C extensions
6 | *.so
7 |
8 | # Distribution / packaging
9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 |
26 | # PyInstaller
27 | # Usually these files are written by a python script from a template
28 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 |
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 |
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *.cover
45 |
46 | # Translations
47 | *.mo
48 | *.pot
49 |
50 | # Django stuff:
51 | *.log
52 |
53 | # Sphinx documentation
54 | docs/_build/
55 |
56 | # PyBuilder
57 | target/
58 |
59 | #Ipython
60 | .ipynb_checkpoints/
61 | # Created by .ignore support plugin (hsz.mobi)
62 | ### OSX template
63 | .DS_Store
64 | .AppleDouble
65 | .LSOverride
66 |
67 | # Icon must end with two \r
68 | Icon
69 |
70 | # Thumbnails
71 | ._*
72 |
73 | # Files that might appear in the root of a volume
74 | .DocumentRevisions-V100
75 | .fseventsd
76 | .Spotlight-V100
77 | .TemporaryItems
78 | .Trashes
79 | .VolumeIcon.icns
80 |
81 | # Directories potentially created on remote AFP share
82 | .AppleDB
83 | .AppleDesktop
84 | Network Trash Folder
85 | Temporary Items
86 | .apdisk
87 |
88 | #Temporary data
89 | hw1/tempdata/
90 | hw1/.ipynb_checkpoints/
91 |
92 |
--------------------------------------------------------------------------------
/Inferential Statistics/statistics project 1/data/human_body_temperature.csv:
--------------------------------------------------------------------------------
1 | temperature,gender,heart_rate
2 | 99.3,F,68.0
3 | 98.4,F,81.0
4 | 97.8,M,73.0
5 | 99.2,F,66.0
6 | 98.0,F,73.0
7 | 99.2,M,83.0
8 | 98.0,M,71.0
9 | 98.8,M,78.0
10 | 98.4,F,84.0
11 | 98.6,F,86.0
12 | 98.8,F,89.0
13 | 96.7,F,62.0
14 | 98.2,M,72.0
15 | 98.7,F,79.0
16 | 97.8,F,77.0
17 | 98.8,F,83.0
18 | 98.3,F,79.0
19 | 98.2,M,64.0
20 | 97.2,F,68.0
21 | 99.4,M,70.0
22 | 98.3,F,78.0
23 | 98.2,M,71.0
24 | 98.6,M,70.0
25 | 98.4,M,68.0
26 | 97.8,M,65.0
27 | 98.0,F,87.0
28 | 97.8,F,62.0
29 | 98.2,F,69.0
30 | 98.4,F,73.0
31 | 98.1,M,67.0
32 | 98.3,M,86.0
33 | 97.6,F,61.0
34 | 98.5,M,71.0
35 | 98.6,M,82.0
36 | 99.3,M,63.0
37 | 99.5,M,75.0
38 | 99.1,M,71.0
39 | 98.3,M,72.0
40 | 97.9,F,79.0
41 | 96.4,F,69.0
42 | 98.4,F,79.0
43 | 98.4,M,82.0
44 | 96.9,M,74.0
45 | 97.2,M,64.0
46 | 99.0,F,79.0
47 | 97.9,F,69.0
48 | 97.4,M,72.0
49 | 97.4,M,68.0
50 | 97.9,M,76.0
51 | 97.1,M,82.0
52 | 98.9,F,76.0
53 | 98.3,F,80.0
54 | 98.5,F,83.0
55 | 98.6,M,78.0
56 | 98.2,F,73.0
57 | 98.6,F,82.0
58 | 98.8,F,70.0
59 | 98.2,M,66.0
60 | 98.2,F,65.0
61 | 97.6,M,73.0
62 | 99.1,F,80.0
63 | 98.4,M,84.0
64 | 98.2,F,57.0
65 | 98.6,M,83.0
66 | 98.7,F,65.0
67 | 97.4,M,70.0
68 | 97.4,F,57.0
69 | 98.6,M,77.0
70 | 98.7,F,82.0
71 | 98.9,M,80.0
72 | 98.1,F,81.0
73 | 97.7,F,61.0
74 | 98.0,M,78.0
75 | 98.8,M,81.0
76 | 99.0,M,75.0
77 | 98.8,M,78.0
78 | 98.0,F,76.0
79 | 98.4,M,70.0
80 | 97.4,M,78.0
81 | 97.6,M,74.0
82 | 98.8,F,73.0
83 | 98.0,M,67.0
84 | 97.5,M,70.0
85 | 99.2,F,77.0
86 | 98.6,F,85.0
87 | 97.1,M,75.0
88 | 98.6,F,77.0
89 | 98.0,M,78.0
90 | 98.7,M,73.0
91 | 98.1,M,73.0
92 | 97.8,M,74.0
93 | 100.0,F,78.0
94 | 98.8,F,84.0
95 | 97.1,M,73.0
96 | 97.8,M,58.0
97 | 96.8,F,75.0
98 | 99.9,F,79.0
99 | 98.7,F,64.0
100 | 98.8,F,64.0
101 | 98.0,M,74.0
102 | 99.0,M,81.0
103 | 98.5,M,68.0
104 | 98.0,F,78.0
105 | 99.4,F,77.0
106 | 97.6,M,69.0
107 | 96.7,M,71.0
108 | 97.0,M,80.0
109 | 98.6,M,66.0
110 | 98.7,F,72.0
111 | 97.3,M,69.0
112 | 98.8,F,69.0
113 | 98.0,F,89.0
114 | 98.2,F,64.0
115 | 99.1,F,74.0
116 | 99.0,M,79.0
117 | 98.0,M,64.0
118 | 100.8,F,77.0
119 | 97.8,F,71.0
120 | 98.7,M,78.0
121 | 98.4,F,74.0
122 | 97.7,F,84.0
123 | 97.9,F,68.0
124 | 99.0,F,81.0
125 | 97.2,F,66.0
126 | 97.5,M,75.0
127 | 96.3,M,70.0
128 | 97.7,M,77.0
129 | 98.2,F,73.0
130 | 97.9,M,72.0
131 | 98.7,F,59.0
132 |
--------------------------------------------------------------------------------
/Inferential Statistics/statistics project 1/.ipynb_checkpoints/sliderule_dsi_inferential_statistics_exercise_1-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## What is the true normal human body temperature? \n",
8 | "\n",
9 | "#### Background\n",
10 | "\n",
11 | "The mean normal body temperature was held to be 37$^{\\circ}$C or 98.6$^{\\circ}$F for more than 120 years since it was first conceptualized and reported by Carl Wunderlich in a famous 1868 book. In 1992, this value was revised to 36.8$^{\\circ}$C or 98.2$^{\\circ}$F. \n",
12 | "\n",
13 | "#### Exercise\n",
14 | "In this exercise, you will analyze a dataset of human body temperatures and employ the concepts of hypothesis testing, confidence intervals, and statistical significance to answer the following questions:\n",
15 | "\n",
16 | "1. Is the distribution of body temperatures normal? \n",
17 | "2. Is the true population mean really 98.6 degrees F?\n",
18 | "3. At what temperature should we consider someone's temperature to be \"abnormal\"?\n",
19 | "4. Is there a significant difference between males and females in normal temperature?\n",
20 | "\n",
21 | "\n",
22 | "\n",
23 | "#### Resources\n",
24 | "\n",
25 | "+ Information and data sources: http://www.amstat.org/publications/jse/datasets/normtemp.txt, http://www.amstat.org/publications/jse/jse_data_archive.htm\n"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 58,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "import pandas as pd"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 62,
42 | "metadata": {
43 | "collapsed": false
44 | },
45 | "outputs": [],
46 | "source": [
47 | "df = pd.read_csv('data/human_body_temperature.csv')"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {
53 | "collapsed": true
54 | },
55 | "source": [
56 | "# Exercise\n",
57 | "\n",
58 | "Answer the following questions in this notebook and submit to your Github account. \n",
59 | "\n",
60 | "1. Is the distribution of body temperatures normal? \n",
61 | " - Remember that this is a condition for the CLT, and hence the statistical tests we are using, to apply. \n",
62 | "2. Is the true population mean really 98.6 degrees F?\n",
63 |     "   - Bring out the one-sample hypothesis test! In this situation, is it appropriate to apply a z-test or a t-test? How will the result be different?\n",
64 | "3. At what temperature should we consider someone's temperature to be \"abnormal\"?\n",
65 | " - Start by computing the margin of error and confidence interval.\n",
66 | "4. Is there a significant difference between males and females in normal temperature?\n",
67 |     "   - Set up and solve a two-sample hypothesis test."
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {
74 | "collapsed": true
75 | },
76 | "outputs": [],
77 | "source": []
78 | }
79 | ],
80 | "metadata": {
81 | "kernelspec": {
82 | "display_name": "Python 2",
83 | "language": "python",
84 | "name": "python2"
85 | },
86 | "language_info": {
87 | "codemirror_mode": {
88 | "name": "ipython",
89 | "version": 2
90 | },
91 | "file_extension": ".py",
92 | "mimetype": "text/x-python",
93 | "name": "python",
94 | "nbconvert_exporter": "python",
95 | "pygments_lexer": "ipython2",
96 | "version": "2.7.9"
97 | }
98 | },
99 | "nbformat": 4,
100 | "nbformat_minor": 0
101 | }
102 |
--------------------------------------------------------------------------------
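The checkpoint above only loads the data; a minimal sketch (not part of the repository) of the analyses the exercise asks for, using scipy.stats against the CSV shipped in statistics project 1/data, might look like this:

```python
# Hedged sketch of the exercise steps: normality check, one-sample test against
# 98.6 F, a 95% confidence interval, and a two-sample comparison by gender.
import numpy as np
import pandas as pd
from scipy import stats

df = pd.read_csv('data/human_body_temperature.csv')
temps = df['temperature']

# 1. Is the distribution roughly normal?
print(stats.normaltest(temps))

# 2. One-sample test of the mean against the historical 98.6 F value
#    (with n = 130 the t- and z-statistics are nearly identical).
print(stats.ttest_1samp(temps, 98.6))

# 3. Margin of error and 95% confidence interval for the mean.
se = temps.std(ddof=1) / np.sqrt(len(temps))
margin = 1.96 * se
print(temps.mean() - margin, temps.mean() + margin)

# 4. Two-sample comparison of male and female temperatures.
male = df.loc[df.gender == 'M', 'temperature']
female = df.loc[df.gender == 'F', 'temperature']
print(stats.ttest_ind(male, female, equal_var=False))
```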
/Inferential Statistics/statistics project 2/.ipynb_checkpoints/sliderule_dsi_inferential_statistics_exercise_1-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## What is the true normal human body temperature? \n",
8 | "\n",
9 | "#### Background\n",
10 | "\n",
11 | "The mean normal body temperature was held to be 37$^{\\circ}$C or 98.6$^{\\circ}$F for more than 120 years since it was first conceptualized and reported by Carl Wunderlich in a famous 1868 book. In 1992, this value was revised to 36.8$^{\\circ}$C or 98.2$^{\\circ}$F. \n",
12 | "\n",
13 | "#### Exercise\n",
14 | "In this exercise, you will analyze a dataset of human body temperatures and employ the concepts of hypothesis testing, confidence intervals, and statistical significance to answer the following questions:\n",
15 | "\n",
16 | "1. Is the distribution of body temperatures normal? \n",
17 | "2. Is the true population mean really 98.6 degrees F?\n",
18 | "3. At what temperature should we consider someone's temperature to be \"abnormal\"?\n",
19 | "4. Is there a significant difference between males and females in normal temperature?\n",
20 | "\n",
21 | "\n",
22 | "\n",
23 | "#### Resources\n",
24 | "\n",
25 | "+ Information and data sources: http://www.amstat.org/publications/jse/datasets/normtemp.txt, http://www.amstat.org/publications/jse/jse_data_archive.htm\n"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 58,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "import pandas as pd"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 62,
42 | "metadata": {
43 | "collapsed": false
44 | },
45 | "outputs": [],
46 | "source": [
47 | "df = pd.read_csv('data/human_body_temperature.csv')"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {
53 | "collapsed": true
54 | },
55 | "source": [
56 | "# Exercise\n",
57 | "\n",
58 | "Answer the following questions in this notebook and submit to your Github account. \n",
59 | "\n",
60 | "1. Is the distribution of body temperatures normal? \n",
61 | " - Remember that this is a condition for the CLT, and hence the statistical tests we are using, to apply. \n",
62 | "2. Is the true population mean really 98.6 degrees F?\n",
63 |     "   - Bring out the one-sample hypothesis test! In this situation, is it appropriate to apply a z-test or a t-test? How will the result be different?\n",
64 | "3. At what temperature should we consider someone's temperature to be \"abnormal\"?\n",
65 | " - Start by computing the margin of error and confidence interval.\n",
66 | "4. Is there a significant difference between males and females in normal temperature?\n",
67 |     "   - Set up and solve a two-sample hypothesis test."
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {
74 | "collapsed": true
75 | },
76 | "outputs": [],
77 | "source": []
78 | }
79 | ],
80 | "metadata": {
81 | "kernelspec": {
82 | "display_name": "Python 2",
83 | "language": "python",
84 | "name": "python2"
85 | },
86 | "language_info": {
87 | "codemirror_mode": {
88 | "name": "ipython",
89 | "version": 2
90 | },
91 | "file_extension": ".py",
92 | "mimetype": "text/x-python",
93 | "name": "python",
94 | "nbconvert_exporter": "python",
95 | "pygments_lexer": "ipython2",
96 | "version": "2.7.9"
97 | }
98 | },
99 | "nbformat": 4,
100 | "nbformat_minor": 0
101 | }
102 |
--------------------------------------------------------------------------------
/Inferential Statistics/statistics project 3/.ipynb_checkpoints/sliderule_dsi_inferential_statistics_exercise_1-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## What is the true normal human body temperature? \n",
8 | "\n",
9 | "#### Background\n",
10 | "\n",
11 | "The mean normal body temperature was held to be 37$^{\\circ}$C or 98.6$^{\\circ}$F for more than 120 years since it was first conceptualized and reported by Carl Wunderlich in a famous 1868 book. In 1992, this value was revised to 36.8$^{\\circ}$C or 98.2$^{\\circ}$F. \n",
12 | "\n",
13 | "#### Exercise\n",
14 | "In this exercise, you will analyze a dataset of human body temperatures and employ the concepts of hypothesis testing, confidence intervals, and statistical significance to answer the following questions:\n",
15 | "\n",
16 | "1. Is the distribution of body temperatures normal? \n",
17 | "2. Is the true population mean really 98.6 degrees F?\n",
18 | "3. At what temperature should we consider someone's temperature to be \"abnormal\"?\n",
19 | "4. Is there a significant difference between males and females in normal temperature?\n",
20 | "\n",
21 | "\n",
22 | "\n",
23 | "#### Resources\n",
24 | "\n",
25 | "+ Information and data sources: http://www.amstat.org/publications/jse/datasets/normtemp.txt, http://www.amstat.org/publications/jse/jse_data_archive.htm\n"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 58,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "import pandas as pd"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 62,
42 | "metadata": {
43 | "collapsed": false
44 | },
45 | "outputs": [],
46 | "source": [
47 | "df = pd.read_csv('data/human_body_temperature.csv')"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {
53 | "collapsed": true
54 | },
55 | "source": [
56 | "# Exercise\n",
57 | "\n",
58 | "Answer the following questions in this notebook and submit to your Github account. \n",
59 | "\n",
60 | "1. Is the distribution of body temperatures normal? \n",
61 | " - Remember that this is a condition for the CLT, and hence the statistical tests we are using, to apply. \n",
62 | "2. Is the true population mean really 98.6 degrees F?\n",
63 |     "   - Bring out the one-sample hypothesis test! In this situation, is it appropriate to apply a z-test or a t-test? How will the result be different?\n",
64 | "3. At what temperature should we consider someone's temperature to be \"abnormal\"?\n",
65 | " - Start by computing the margin of error and confidence interval.\n",
66 | "4. Is there a significant difference between males and females in normal temperature?\n",
67 |     "   - Set up and solve a two-sample hypothesis test."
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": null,
73 | "metadata": {
74 | "collapsed": true
75 | },
76 | "outputs": [],
77 | "source": []
78 | }
79 | ],
80 | "metadata": {
81 | "kernelspec": {
82 | "display_name": "Python 2",
83 | "language": "python",
84 | "name": "python2"
85 | },
86 | "language_info": {
87 | "codemirror_mode": {
88 | "name": "ipython",
89 | "version": 2
90 | },
91 | "file_extension": ".py",
92 | "mimetype": "text/x-python",
93 | "name": "python",
94 | "nbconvert_exporter": "python",
95 | "pygments_lexer": "ipython2",
96 | "version": "2.7.9"
97 | }
98 | },
99 | "nbformat": 4,
100 | "nbformat_minor": 0
101 | }
102 |
--------------------------------------------------------------------------------
/Inferential Statistics/statistics project 1/.ipynb_checkpoints/sliderule_dsi_inferential_statistics_exercise_2-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "\n",
8 | "### Examining racial discrimination in the US job market\n",
9 | "\n",
10 | "#### Background\n",
11 | "Racial discrimination continues to be pervasive in cultures throughout the world. Researchers examined the level of racial discrimination in the United States labor market by randomly assigning identical résumés black-sounding or white-sounding names and observing the impact on requests for interviews from employers.\n",
12 | "\n",
13 | "#### Data\n",
14 | "In the dataset provided, each row represents a resume. The 'race' column has two values, 'b' and 'w', indicating black-sounding and white-sounding. The column 'call' has two values, 1 and 0, indicating whether the resume received a call from employers or not.\n",
15 | "\n",
16 | "Note that the 'b' and 'w' values in race are assigned randomly to the resumes.\n",
17 | "\n",
18 | "#### Exercise\n",
19 | "Perform a statistical analysis to establish whether race has a significant impact on the rate of callbacks for resumes.\n",
20 | "\n",
21 | "\n",
22 | "#### Resources\n",
23 | "+ Experiment information and data source: http://www.povertyactionlab.org/evaluation/discrimination-job-market-united-states\n",
24 | "+ Scipy statistical methods: http://docs.scipy.org/doc/scipy/reference/stats.html "
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "****"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 2,
37 | "metadata": {
38 | "collapsed": true
39 | },
40 | "outputs": [],
41 | "source": [
42 | "import pandas as pd\n",
43 | "import numpy as np\n",
44 | "from scipy import stats"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 3,
50 | "metadata": {
51 | "collapsed": false
52 | },
53 | "outputs": [],
54 | "source": [
55 | "data = pd.io.stata.read_stata('data/us_job_market_discrimination.dta')"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 4,
61 | "metadata": {
62 | "collapsed": false
63 | },
64 | "outputs": [
65 | {
66 | "data": {
67 | "text/plain": [
68 | "157.0"
69 | ]
70 | },
71 | "execution_count": 4,
72 | "metadata": {},
73 | "output_type": "execute_result"
74 | }
75 | ],
76 | "source": [
77 |     "# number of callbacks for black-sounding names\n",
78 | "sum(data[data.race=='b'].call)"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {
84 | "collapsed": true
85 | },
86 | "source": [
87 | "****\n",
88 | "\n",
89 | "# Exercise\n",
90 | "\n",
91 | " 1. What test is appropriate for this problem? Does CLT apply?\n",
92 | " 2. What are the null and alternate hypotheses?\n",
93 | " 3. Compute margin of error, confidence interval, and p-value.\n",
94 | " 4. Discuss statistical significance.\n",
95 | " \n",
96 | "You can include written notes in notebook cells using Markdown: \n",
97 | " - In the control panel at the top, choose Cell > Cell Type > Markdown\n",
98 | " - Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n",
99 | " \n",
100 | "****"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {
107 | "collapsed": true
108 | },
109 | "outputs": [],
110 | "source": []
111 | }
112 | ],
113 | "metadata": {
114 | "kernelspec": {
115 | "display_name": "Python 2",
116 | "language": "python",
117 | "name": "python2"
118 | },
119 | "language_info": {
120 | "codemirror_mode": {
121 | "name": "ipython",
122 | "version": 2
123 | },
124 | "file_extension": ".py",
125 | "mimetype": "text/x-python",
126 | "name": "python",
127 | "nbconvert_exporter": "python",
128 | "pygments_lexer": "ipython2",
129 | "version": "2.7.9"
130 | }
131 | },
132 | "nbformat": 4,
133 | "nbformat_minor": 0
134 | }
135 |
--------------------------------------------------------------------------------
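The exercise asks which test is appropriate for the callback rates; one natural choice is a two-proportion z-test. A minimal sketch (not part of the repository), reusing the Stata file the checkpoint already loads:

```python
# Hedged sketch of a two-proportion z-test for callback rates by race.
import numpy as np
import pandas as pd
from scipy import stats

data = pd.io.stata.read_stata('data/us_job_market_discrimination.dta')
b = data[data.race == 'b'].call
w = data[data.race == 'w'].call

# Pooled proportion under the null hypothesis of equal callback rates.
p_pool = (b.sum() + w.sum()) / float(len(b) + len(w))
se = np.sqrt(p_pool * (1 - p_pool) * (1.0 / len(b) + 1.0 / len(w)))
z = (w.mean() - b.mean()) / se
p_value = 2 * stats.norm.sf(abs(z))  # two-sided p-value
print(z, p_value)
```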
/Inferential Statistics/statistics project 3/.ipynb_checkpoints/sliderule_dsi_inferential_statistics_exercise_2-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "\n",
8 | "### Examining racial discrimination in the US job market\n",
9 | "\n",
10 | "#### Background\n",
11 | "Racial discrimination continues to be pervasive in cultures throughout the world. Researchers examined the level of racial discrimination in the United States labor market by randomly assigning identical résumés black-sounding or white-sounding names and observing the impact on requests for interviews from employers.\n",
12 | "\n",
13 | "#### Data\n",
14 | "In the dataset provided, each row represents a resume. The 'race' column has two values, 'b' and 'w', indicating black-sounding and white-sounding. The column 'call' has two values, 1 and 0, indicating whether the resume received a call from employers or not.\n",
15 | "\n",
16 | "Note that the 'b' and 'w' values in race are assigned randomly to the resumes.\n",
17 | "\n",
18 | "#### Exercise\n",
19 | "Perform a statistical analysis to establish whether race has a significant impact on the rate of callbacks for resumes.\n",
20 | "\n",
21 | "\n",
22 | "#### Resources\n",
23 | "+ Experiment information and data source: http://www.povertyactionlab.org/evaluation/discrimination-job-market-united-states\n",
24 | "+ Scipy statistical methods: http://docs.scipy.org/doc/scipy/reference/stats.html "
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "****"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 2,
37 | "metadata": {
38 | "collapsed": true
39 | },
40 | "outputs": [],
41 | "source": [
42 | "import pandas as pd\n",
43 | "import numpy as np\n",
44 | "from scipy import stats"
45 | ]
46 | },
47 | {
48 | "cell_type": "code",
49 | "execution_count": 3,
50 | "metadata": {
51 | "collapsed": false
52 | },
53 | "outputs": [],
54 | "source": [
55 | "data = pd.io.stata.read_stata('data/us_job_market_discrimination.dta')"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 4,
61 | "metadata": {
62 | "collapsed": false
63 | },
64 | "outputs": [
65 | {
66 | "data": {
67 | "text/plain": [
68 | "157.0"
69 | ]
70 | },
71 | "execution_count": 4,
72 | "metadata": {},
73 | "output_type": "execute_result"
74 | }
75 | ],
76 | "source": [
77 |     "# number of callbacks for black-sounding names\n",
78 | "sum(data[data.race=='b'].call)"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {
84 | "collapsed": true
85 | },
86 | "source": [
87 | "****\n",
88 | "\n",
89 | "# Exercise\n",
90 | "\n",
91 | " 1. What test is appropriate for this problem? Does CLT apply?\n",
92 | " 2. What are the null and alternate hypotheses?\n",
93 | " 3. Compute margin of error, confidence interval, and p-value.\n",
94 | " 4. Discuss statistical significance.\n",
95 | " \n",
96 | "You can include written notes in notebook cells using Markdown: \n",
97 | " - In the control panel at the top, choose Cell > Cell Type > Markdown\n",
98 | " - Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n",
99 | " \n",
100 | "****"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {
107 | "collapsed": true
108 | },
109 | "outputs": [],
110 | "source": []
111 | }
112 | ],
113 | "metadata": {
114 | "kernelspec": {
115 | "display_name": "Python 2",
116 | "language": "python",
117 | "name": "python2"
118 | },
119 | "language_info": {
120 | "codemirror_mode": {
121 | "name": "ipython",
122 | "version": 2
123 | },
124 | "file_extension": ".py",
125 | "mimetype": "text/x-python",
126 | "name": "python",
127 | "nbconvert_exporter": "python",
128 | "pygments_lexer": "ipython2",
129 | "version": "2.7.9"
130 | }
131 | },
132 | "nbformat": 4,
133 | "nbformat_minor": 0
134 | }
135 |
--------------------------------------------------------------------------------
/Data Wrangling/data_wrangling_json/.ipynb_checkpoints/sliderule_dsi_xml_exercise-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# XML example and exercise\n",
8 | "****\n",
9 | "+ study examples of accessing nodes in XML tree structure \n",
10 | "+ work on exercise to be completed and submitted\n",
11 | "****\n",
12 | "+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html\n",
13 | "+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial\n",
14 | "****"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "from xml.etree import ElementTree as ET"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "## XML example\n",
33 | "\n",
34 | "+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 3,
40 | "metadata": {
41 | "collapsed": true
42 | },
43 | "outputs": [],
44 | "source": [
45 | "document_tree = ET.parse( './data/mondial_database_less.xml' )"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 4,
51 | "metadata": {
52 | "collapsed": false
53 | },
54 | "outputs": [
55 | {
56 | "name": "stdout",
57 | "output_type": "stream",
58 | "text": [
59 | "Albania\n",
60 | "Greece\n",
61 | "Macedonia\n",
62 | "Serbia\n",
63 | "Montenegro\n",
64 | "Kosovo\n",
65 | "Andorra\n"
66 | ]
67 | }
68 | ],
69 | "source": [
70 | "# print names of all countries\n",
71 | "for child in document_tree.getroot():\n",
72 | " print child.find('name').text"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 5,
78 | "metadata": {
79 | "collapsed": false
80 | },
81 | "outputs": [
82 | {
83 | "name": "stdout",
84 | "output_type": "stream",
85 | "text": [
86 | "* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë\n",
87 | "* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes\n",
88 | "* Macedonia: Skopje, Kumanovo\n",
89 | "* Serbia: Beograd, Novi Sad, Niš\n",
90 | "* Montenegro: Podgorica\n",
91 | "* Kosovo: Prishtine\n",
92 | "* Andorra: Andorra la Vella\n"
93 | ]
94 | }
95 | ],
96 | "source": [
97 | "# print names of all countries and their cities\n",
98 | "for element in document_tree.iterfind('country'):\n",
99 | " print '* ' + element.find('name').text + ':',\n",
100 | " capitals_string = ''\n",
101 | " for subelement in element.getiterator('city'):\n",
102 | " capitals_string += subelement.find('name').text + ', '\n",
103 | " print capitals_string[:-2]"
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {},
109 | "source": [
110 | "****\n",
111 | "## XML exercise\n",
112 | "\n",
113 |     "Using data in 'data/mondial_database.xml', the examples above, and referring to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find\n",
114 | "\n",
115 | "1. 10 countries with the lowest infant mortality rates\n",
116 | "2. 10 cities with the largest population\n",
117 | "3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)\n",
118 | "4. name and country of a) longest river, b) largest lake and c) airport at highest elevation"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 6,
124 | "metadata": {
125 | "collapsed": true
126 | },
127 | "outputs": [],
128 | "source": [
129 | "document = ET.parse( './data/mondial_database.xml' )"
130 | ]
131 | }
132 | ],
133 | "metadata": {
134 | "kernelspec": {
135 | "display_name": "Python 2",
136 | "language": "python",
137 | "name": "python2"
138 | },
139 | "language_info": {
140 | "codemirror_mode": {
141 | "name": "ipython",
142 | "version": 2
143 | },
144 | "file_extension": ".py",
145 | "mimetype": "text/x-python",
146 | "name": "python",
147 | "nbconvert_exporter": "python",
148 | "pygments_lexer": "ipython2",
149 | "version": "2.7.9"
150 | }
151 | },
152 | "nbformat": 4,
153 | "nbformat_minor": 0
154 | }
155 |
--------------------------------------------------------------------------------
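The checkpoint stops right after parsing mondial_database.xml. As a sketch (not part of the repository) of the first exercise task, assuming each <country> element carries an <infant_mortality> child as in the Mondial schema:

```python
# Hedged sketch: the 10 countries with the lowest infant mortality rate.
from xml.etree import ElementTree as ET

document = ET.parse('./data/mondial_database.xml')

rates = []
for country in document.iterfind('country'):
    name = country.find('name').text
    im = country.find('infant_mortality')  # assumed tag; not every country has one
    if im is not None:
        rates.append((float(im.text), name))

# Sort ascending by rate and print the ten lowest.
for rate, name in sorted(rates)[:10]:
    print('%s: %.2f' % (name, rate))
```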
/Capstone Project/Data/notes.txt:
--------------------------------------------------------------------------------
1 | Notes for Football Data
2 |
3 | All data is in csv format, ready for use within standard spreadsheet applications. Please note that some abbreviations are no longer in use (in particular odds from specific bookmakers no longer used) and refer to data collected in earlier seasons. For a current list of what bookmakers are included in the dataset please visit http://www.football-data.co.uk/matches.php
4 |
5 | Key to results data:
6 |
7 | Div = League Division
8 | Date = Match Date (dd/mm/yy)
9 | HomeTeam = Home Team
10 | AwayTeam = Away Team
11 | FTHG = Full Time Home Team Goals
12 | FTAG = Full Time Away Team Goals
13 | FTR = Full Time Result (H=Home Win, D=Draw, A=Away Win)
14 | HTHG = Half Time Home Team Goals
15 | HTAG = Half Time Away Team Goals
16 | HTR = Half Time Result (H=Home Win, D=Draw, A=Away Win)
17 |
18 | Match Statistics (where available)
19 | Attendance = Crowd Attendance
20 | Referee = Match Referee
21 | HS = Home Team Shots
22 | AS = Away Team Shots
23 | HST = Home Team Shots on Target
24 | AST = Away Team Shots on Target
25 | HHW = Home Team Hit Woodwork
26 | AHW = Away Team Hit Woodwork
27 | HC = Home Team Corners
28 | AC = Away Team Corners
29 | HF = Home Team Fouls Committed
30 | AF = Away Team Fouls Committed
31 | HO = Home Team Offsides
32 | AO = Away Team Offsides
33 | HY = Home Team Yellow Cards
34 | AY = Away Team Yellow Cards
35 | HR = Home Team Red Cards
36 | AR = Away Team Red Cards
37 | HBP = Home Team Bookings Points (10 = yellow, 25 = red)
38 | ABP = Away Team Bookings Points (10 = yellow, 25 = red)
39 |
40 | Key to 1X2 (match) betting odds data:
41 |
42 | B365H = Bet365 home win odds
43 | B365D = Bet365 draw odds
44 | B365A = Bet365 away win odds
45 | BSH = Blue Square home win odds
46 | BSD = Blue Square draw odds
47 | BSA = Blue Square away win odds
48 | BWH = Bet&Win home win odds
49 | BWD = Bet&Win draw odds
50 | BWA = Bet&Win away win odds
51 | GBH = Gamebookers home win odds
52 | GBD = Gamebookers draw odds
53 | GBA = Gamebookers away win odds
54 | IWH = Interwetten home win odds
55 | IWD = Interwetten draw odds
56 | IWA = Interwetten away win odds
57 | LBH = Ladbrokes home win odds
58 | LBD = Ladbrokes draw odds
59 | LBA = Ladbrokes away win odds
60 | PSH = Pinnacle home win odds
61 | PSD = Pinnacle draw odds
62 | PSA = Pinnacle away win odds
63 | SOH = Sporting Odds home win odds
64 | SOD = Sporting Odds draw odds
65 | SOA = Sporting Odds away win odds
66 | SBH = Sportingbet home win odds
67 | SBD = Sportingbet draw odds
68 | SBA = Sportingbet away win odds
69 | SJH = Stan James home win odds
70 | SJD = Stan James draw odds
71 | SJA = Stan James away win odds
72 | SYH = Stanleybet home win odds
73 | SYD = Stanleybet draw odds
74 | SYA = Stanleybet away win odds
75 | VCH = VC Bet home win odds
76 | VCD = VC Bet draw odds
77 | VCA = VC Bet away win odds
78 | WHH = William Hill home win odds
79 | WHD = William Hill draw odds
80 | WHA = William Hill away win odds
81 |
82 | Bb1X2 = Number of BetBrain bookmakers used to calculate match odds averages and maximums
83 | BbMxH = Betbrain maximum home win odds
84 | BbAvH = Betbrain average home win odds
85 | BbMxD = Betbrain maximum draw odds
86 | BbAvD = Betbrain average draw odds
87 | BbMxA = Betbrain maximum away win odds
88 | BbAvA = Betbrain average away win odds
89 |
90 |
91 |
92 | Key to total goals betting odds:
93 |
94 | BbOU = Number of BetBrain bookmakers used to calculate over/under 2.5 goals (total goals) averages and maximums
95 | BbMx>2.5 = Betbrain maximum over 2.5 goals
96 | BbAv>2.5 = Betbrain average over 2.5 goals
97 | BbMx<2.5 = Betbrain maximum under 2.5 goals
98 | BbAv<2.5 = Betbrain average under 2.5 goals
99 |
100 | GB>2.5 = Gamebookers over 2.5 goals
101 | GB<2.5 = Gamebookers under 2.5 goals
102 | B365>2.5 = Bet365 over 2.5 goals
103 | B365<2.5 = Bet365 under 2.5 goals
104 |
105 |
106 | Key to Asian handicap betting odds:
107 |
108 | BbAH = Number of BetBrain bookmakers used to calculate Asian handicap averages and maximums
109 | BbAHh = Betbrain size of handicap (home team)
110 | BbMxAHH = Betbrain maximum Asian handicap home team odds
111 | BbAvAHH = Betbrain average Asian handicap home team odds
112 | BbMxAHA = Betbrain maximum Asian handicap away team odds
113 | BbAvAHA = Betbrain average Asian handicap away team odds
114 |
115 | GBAHH = Gamebookers Asian handicap home team odds
116 | GBAHA = Gamebookers Asian handicap away team odds
117 | GBAH = Gamebookers size of handicap (home team)
118 | LBAHH = Ladbrokes Asian handicap home team odds
119 | LBAHA = Ladbrokes Asian handicap away team odds
120 | LBAH = Ladbrokes size of handicap (home team)
121 | B365AHH = Bet365 Asian handicap home team odds
122 | B365AHA = Bet365 Asian handicap away team odds
123 | B365AH = Bet365 size of handicap (home team)
124 |
125 |
126 | Closing odds (last odds before match starts)
127 |
128 | PSCH = Pinnacle closing home win odds
129 | PSCD = Pinnacle closing draw odds
130 | PSCA = Pinnacle closing away win odds
131 |
132 | Football-Data would like to acknowledge the following sources which have been utilised in the compilation of Football-Data's results and odds files.
133 |
134 | Historical results:
135 | International Soccer Server - http://sunsite.tut.fi/rec/riku/soccer.html
136 | European Football - http://www.eurofootball.be/
137 | RSSSF Archive - http://www.rsssf.com/
138 |
139 | Current results (full time, half time)
140 | TBWSport - http://www.tbwsport.com
141 | Livescore - http://www.livescore.com
142 |
143 | Match statistics
144 | Sportinglife, ESPN Soccer, Bundesliga.de, Gazzetta.it and Football.fr
145 |
146 | Bookmakers betting odds
147 | Betbrain - http://www.betbrain.com
148 | Betbase - http://www.betbase.info
149 |
150 | Betting odds for weekend games are collected Friday afternoons, and on Tuesday afternoons for midweek games.
151 |
152 | Additional match statistics (corners, shots, bookings, referee etc.) for the 2000/01 and 2001/02 seasons for the English, Scottish and German leagues were provided by Sports.com (now under new ownership and no longer available).
153 |
154 |
--------------------------------------------------------------------------------
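The notes above document the column layout of the football-data.co.uk results files. A minimal sketch (not part of the repository) of loading one season and checking the home advantage the README asks about; 'E0.csv' is a placeholder name for whichever season file is downloaded:

```python
# Hedged sketch: share of home wins / draws / away wins and average goals,
# using the FTR, FTHG and FTAG columns described in the notes.
import pandas as pd

matches = pd.read_csv('E0.csv')  # placeholder filename

# Full-time result: H = home win, D = draw, A = away win.
print(matches['FTR'].value_counts(normalize=True))

# Average goals scored by the home and away sides.
print(matches[['FTHG', 'FTAG']].mean())
```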
/Data Wrangling/data_wrangling_json/data/world_bank_projects_less.json:
--------------------------------------------------------------------------------
1 | [{ "_id" : { "$oid" : "52b213b38594d8a2be17c780" }, "approvalfy" : 1999, "board_approval_month" : "November", "boardapprovaldate" : "2013-11-12T00:00:00Z", "borrower" : "FEDERAL DEMOCRATIC REPUBLIC OF ETHIOPIA", "closingdate" : "2018-07-07T00:00:00Z", "country_namecode" : "Federal Democratic Republic of Ethiopia!$!ET", "countrycode" : "ET", "countryname" : "Federal Democratic Republic of Ethiopia", "countryshortname" : "Ethiopia", "docty" : "Project Information Document,Indigenous Peoples Plan,Project Information Document", "envassesmentcategorycode" : "C", "grantamt" : 0, "ibrdcommamt" : 0, "id" : "P129828", "idacommamt" : 130000000, "impagency" : "MINISTRY OF EDUCATION", "lendinginstr" : "Investment Project Financing", "lendinginstrtype" : "IN", "lendprojectcost" : 550000000, "majorsector_percent" : [ { "Name" : "Education", "Percent" : 46 }, { "Name" : "Education", "Percent" : 26 }, { "Name" : "Public Administration, Law, and Justice", "Percent" : 16 }, { "Name" : "Education", "Percent" : 12 } ], "mjsector_namecode" : [ { "name" : "Education", "code" : "EX" }, { "name" : "Education", "code" : "EX" }, { "name" : "Public Administration, Law, and Justice", "code" : "BX" }, { "name" : "Education", "code" : "EX" } ], "mjtheme" : [ "Human development" ], "mjtheme_namecode" : [ { "name" : "Human development", "code" : "8" }, { "name" : "", "code" : "11" } ], "mjthemecode" : "8,11", "prodline" : "PE", "prodlinetext" : "IBRD/IDA", "productlinetype" : "L", "project_abstract" : { "cdata" : "The development objective of the Second Phase of General Education Quality Improvement Project for Ethiopia is to improve learning conditions in primary and secondary schools and strengthen institutions at different levels of educational administration. The project has six components. The first component is curriculum, textbooks, assessment, examinations, and inspection. This component will support improvement of learning conditions in grades KG-12 by providing increased access to teaching and learning materials and through improvements to the curriculum by assessing the strengths and weaknesses of the current curriculum. This component has following four sub-components: (i) curriculum reform and implementation; (ii) teaching and learning materials; (iii) assessment and examinations; and (iv) inspection. The second component is teacher development program (TDP). This component will support improvements in learning conditions in both primary and secondary schools by advancing the quality of teaching in general education through: (a) enhancing the training of pre-service teachers in teacher education institutions; and (b) improving the quality of in-service teacher training. This component has following three sub-components: (i) pre-service teacher training; (ii) in-service teacher training; and (iii) licensing and relicensing of teachers and school leaders. The third component is school improvement plan. This component will support the strengthening of school planning in order to improve learning outcomes, and to partly fund the school improvement plans through school grants. It has following two sub-components: (i) school improvement plan; and (ii) school grants. The fourth component is management and capacity building, including education management information systems (EMIS). This component will support management and capacity building aspect of the project. 
This component has following three sub-components: (i) capacity building for education planning and management; (ii) capacity building for school planning and management; and (iii) EMIS. The fifth component is improving the quality of learning and teaching in secondary schools and universities through the use of information and communications technology (ICT). It has following five sub-components: (i) national policy and institution for ICT in general education; (ii) national ICT infrastructure improvement plan for general education; (iii) develop an integrated monitoring, evaluation, and learning system specifically for the ICT component; (iv) teacher professional development in the use of ICT; and (v) provision of limited number of e-Braille display readers with the possibility to scale up to all secondary education schools based on the successful implementation and usage of the readers. The sixth component is program coordination, monitoring and evaluation, and communication. It will support institutional strengthening by developing capacities in all aspects of program coordination, monitoring and evaluation; a new sub-component on communications will support information sharing for better management and accountability. It has following three sub-components: (i) program coordination; (ii) monitoring and evaluation (M and E); and (iii) communication." }, "project_name" : "Ethiopia General Education Quality Improvement Project II", "projectdocs" : [ { "DocTypeDesc" : "Project Information Document (PID), Vol.", "DocType" : "PID", "EntityID" : "090224b081e545fb_1_0", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=090224b081e545fb_1_0", "DocDate" : "28-AUG-2013" }, { "DocTypeDesc" : "Indigenous Peoples Plan (IP), Vol.1 of 1", "DocType" : "IP", "EntityID" : "000442464_20130920111729", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000442464_20130920111729", "DocDate" : "01-JUL-2013" }, { "DocTypeDesc" : "Project Information Document (PID), Vol.", "DocType" : "PID", "EntityID" : "090224b0817b19e2_1_0", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=090224b0817b19e2_1_0", "DocDate" : "22-NOV-2012" } ], "projectfinancialtype" : "IDA", "projectstatusdisplay" : "Active", "regionname" : "Africa", "sector" : [ { "Name" : "Primary education" }, { "Name" : "Secondary education" }, { "Name" : "Public administration- Other social services" }, { "Name" : "Tertiary education" } ], "sector1" : { "Name" : "Primary education", "Percent" : 46 }, "sector2" : { "Name" : "Secondary education", "Percent" : 26 }, "sector3" : { "Name" : "Public administration- Other social services", "Percent" : 16 }, "sector4" : { "Name" : "Tertiary education", "Percent" : 12 }, "sector_namecode" : [ { "name" : "Primary education", "code" : "EP" }, { "name" : "Secondary education", "code" : "ES" }, { "name" : "Public administration- Other social services", "code" : "BS" }, { "name" : "Tertiary education", "code" : "ET" } ], "sectorcode" : "ET,BS,ES,EP", "source" : "IBRD", "status" : "Active", "supplementprojectflg" : "N", "theme1" : { "Name" : "Education for all", "Percent" : 100 }, "theme_namecode" : [ { "name" : "Education for all", "code" : "65" } ], "themecode" : "65", "totalamt" : 130000000, "totalcommamt" : 130000000, "url" : "http://www.worldbank.org/projects/P129828/ethiopia-general-education-quality-improvement-project-ii?lang=en" },
2 | { "_id" : { "$oid" : "52b213b38594d8a2be17c781" }, "approvalfy" : 2015, "board_approval_month" : "November", "boardapprovaldate" : "2013-11-04T00:00:00Z", "borrower" : "GOVERNMENT OF TUNISIA", "country_namecode" : "Republic of Tunisia!$!TN", "countrycode" : "TN", "countryname" : "Republic of Tunisia", "countryshortname" : "Tunisia", "docty" : "Project Information Document,Integrated Safeguards Data Sheet,Integrated Safeguards Data Sheet,Project Information Document,Integrated Safeguards Data Sheet,Project Information Document", "envassesmentcategorycode" : "C", "grantamt" : 4700000, "ibrdcommamt" : 0, "id" : "P144674", "idacommamt" : 0, "impagency" : "MINISTRY OF FINANCE", "lendinginstr" : "Specific Investment Loan", "lendinginstrtype" : "IN", "lendprojectcost" : 5700000, "majorsector_percent" : [ { "Name" : "Public Administration, Law, and Justice", "Percent" : 70 }, { "Name" : "Public Administration, Law, and Justice", "Percent" : 30 } ], "mjsector_namecode" : [ { "name" : "Public Administration, Law, and Justice", "code" : "BX" }, { "name" : "Public Administration, Law, and Justice", "code" : "BX" } ], "mjtheme" : [ "Economic management", "Social protection and risk management" ], "mjtheme_namecode" : [ { "name" : "Economic management", "code" : "1" }, { "name" : "Social protection and risk management", "code" : "6" } ], "mjthemecode" : "1,6", "prodline" : "RE", "prodlinetext" : "Recipient Executed Activities", "productlinetype" : "L", "project_name" : "TN: DTF Social Protection Reforms Support", "projectdocs" : [ { "DocTypeDesc" : "Project Information Document (PID), Vol.1 of 1", "DocType" : "PID", "EntityID" : "000333037_20131024115616", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000333037_20131024115616", "DocDate" : "29-MAR-2013" }, { "DocTypeDesc" : "Integrated Safeguards Data Sheet (ISDS), Vol.1 of 1", "DocType" : "ISDS", "EntityID" : "000356161_20131024151611", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000356161_20131024151611", "DocDate" : "29-MAR-2013" }, { "DocTypeDesc" : "Integrated Safeguards Data Sheet (ISDS), Vol.1 of 1", "DocType" : "ISDS", "EntityID" : "000442464_20131031112136", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000442464_20131031112136", "DocDate" : "29-MAR-2013" }, { "DocTypeDesc" : "Project Information Document (PID), Vol.1 of 1", "DocType" : "PID", "EntityID" : "000333037_20131031105716", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000333037_20131031105716", "DocDate" : "29-MAR-2013" }, { "DocTypeDesc" : "Integrated Safeguards Data Sheet (ISDS), Vol.1 of 1", "DocType" : "ISDS", "EntityID" : "000356161_20130305113209", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000356161_20130305113209", "DocDate" : "16-JAN-2013" }, { "DocTypeDesc" : "Project Information Document (PID), Vol.1 of 1", "DocType" : "PID", "EntityID" : "000356161_20130305113716", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000356161_20130305113716", "DocDate" : "16-JAN-2013" } ], "projectfinancialtype" : "OTHER", "projectstatusdisplay" : "Active", "regionname" : "Middle East and North Africa", "sector" : [ { "Name" : "Public administration- Other social services" }, { "Name" : "General public administration sector" } ], "sector1" : { "Name" : "Public administration- Other social services", "Percent" : 70 }, "sector2" : { "Name" : "General public administration sector", "Percent" : 30 }, 
"sector_namecode" : [ { "name" : "Public administration- Other social services", "code" : "BS" }, { "name" : "General public administration sector", "code" : "BZ" } ], "sectorcode" : "BZ,BS", "source" : "IBRD", "status" : "Active", "supplementprojectflg" : "N", "theme1" : { "Name" : "Other economic management", "Percent" : 30 }, "theme_namecode" : [ { "name" : "Other economic management", "code" : "24" }, { "name" : "Social safety nets", "code" : "54" } ], "themecode" : "54,24", "totalamt" : 0, "totalcommamt" : 4700000, "url" : "http://www.worldbank.org/projects/P144674?lang=en" }
3 | ]
4 |
--------------------------------------------------------------------------------
/Inferential Statistics/statistics project 2/sliderule_dsi_inferential_statistics_exercise_2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "\n",
8 | "### Examining racial discrimination in the US job market\n",
9 | "\n",
10 | "#### Background\n",
11 | "Racial discrimination continues to be pervasive in cultures throughout the world. Researchers examined the level of racial discrimination in the United States labor market by randomly assigning identical résumés black-sounding or white-sounding names and observing the impact on requests for interviews from employers.\n",
12 | "\n",
13 | "#### Data\n",
14 | "In the dataset provided, each row represents a resume. The 'race' column has two values, 'b' and 'w', indicating black-sounding and white-sounding. The column 'call' has two values, 1 and 0, indicating whether the resume received a call from employers or not.\n",
15 | "\n",
16 | "Note that the 'b' and 'w' values in race are assigned randomly to the resumes.\n",
17 | "\n",
18 | "#### Exercise\n",
19 | "You will perform a statistical analysis to establish whether race has a significant impact on the rate of callbacks for resumes.\n",
20 | "\n",
21 | "Answer the following questions **in this notebook below and submit to your Github account**. \n",
22 | "\n",
23 | " 1. What test is appropriate for this problem? Does CLT apply?\n",
24 | " 2. What are the null and alternate hypotheses?\n",
25 | " 3. Compute margin of error, confidence interval, and p-value.\n",
26 | " 4. Discuss statistical significance.\n",
27 | "\n",
28 | "You can include written notes in notebook cells using Markdown: \n",
29 | " - In the control panel at the top, choose Cell > Cell Type > Markdown\n",
30 | " - Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n",
31 | "\n",
32 | "\n",
33 | "#### Resources\n",
34 | "+ Experiment information and data source: http://www.povertyactionlab.org/evaluation/discrimination-job-market-united-states\n",
35 | "+ Scipy statistical methods: http://docs.scipy.org/doc/scipy/reference/stats.html \n",
36 | "+ Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n",
37 | "\n",
38 | "****"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 42,
44 | "metadata": {
45 | "collapsed": true
46 | },
47 | "outputs": [],
48 | "source": [
49 | "import pandas as pd\n",
50 | "import numpy as np\n",
51 | "from scipy import stats\n",
52 | "import math"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 2,
58 | "metadata": {
59 | "collapsed": false
60 | },
61 | "outputs": [],
62 | "source": [
63 | "data = pd.io.stata.read_stata('data/us_job_market_discrimination.dta')"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 19,
69 | "metadata": {
70 | "collapsed": false
71 | },
72 | "outputs": [
73 | {
74 | "data": {
75 | "text/plain": [
76 | "157.0"
77 | ]
78 | },
79 | "execution_count": 19,
80 | "metadata": {},
81 | "output_type": "execute_result"
82 | }
83 | ],
84 | "source": [
85 | "# number of callbacks for black-sounding names\n",
86 | "sum(data[data.race=='b'].call)"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 10,
92 | "metadata": {
93 | "collapsed": false
94 | },
95 | "outputs": [
96 | {
97 | "data": {
181 | "text/plain": [
182 | " id ad education ofjobs yearsexp honors volunteer military empholes \\\n",
183 | "0 b 1 4 2 6 0 0 0 1 \n",
184 | "1 b 1 3 3 6 0 1 1 0 \n",
185 | "\n",
186 | " occupspecific ... compreq orgreq manuf transcom bankreal trade \\\n",
187 | "0 17 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n",
188 | "1 316 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n",
189 | "\n",
190 | " busservice othservice missind ownership \n",
191 | "0 0.0 0.0 0.0 \n",
192 | "1 0.0 0.0 0.0 \n",
193 | "\n",
194 | "[2 rows x 65 columns]"
195 | ]
196 | },
197 | "execution_count": 10,
198 | "metadata": {},
199 | "output_type": "execute_result"
200 | }
201 | ],
202 | "source": [
203 | "data.columns\n",
204 | "data.head(2)"
205 | ]
206 | },
207 | {
208 | "cell_type": "markdown",
209 | "metadata": {},
210 | "source": [
211 | "#### What test is appropriate for this problem? Does CLT apply?\n",
212 | "\n",
213 | "Let's begin by looking at the number of observations where race = b and race = w. We can also count how many observations in each group received a callback and how many did not. Using these counts, we will be able to create a contingency table. \n",
214 | "\n",
215 | "Hence, the problem boils down to a comparison of two proportions. \n",
216 | "If certain conditions are satisfied, we can also perform Fisher's exact test using the contingency table (a quick sketch of this test appears below, once the callback counts have been computed).\n"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": 73,
222 | "metadata": {
223 | "collapsed": false
224 | },
225 | "outputs": [
226 | {
227 | "name": "stdout",
228 | "output_type": "stream",
229 | "text": [
230 | "Number of observations where race is b : 2435\n",
231 | "Number of observations where race is w : 2435\n"
232 | ]
233 | }
234 | ],
235 | "source": [
236 | "data_b = data[data.race=='b']\n",
237 | "data_w = data[data.race=='w']\n",
238 | "num_b = len(data_b)\n",
239 | "num_w = len(data_w)\n",
240 | "print \"Number of observations where race is b : \",num_b\n",
241 | "print \"Number of observations where race is w : \",num_w"
242 | ]
243 | },
244 | {
245 | "cell_type": "markdown",
246 | "metadata": {},
247 | "source": [
248 | "Since there can be only two states for the 'call' variable, we can arbitrarily treat getting a callback as a \"success\" and not getting one as a \"failure\". "
249 | ]
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": 74,
254 | "metadata": {
255 | "collapsed": false
256 | },
257 | "outputs": [
258 | {
259 | "name": "stdout",
260 | "output_type": "stream",
261 | "text": [
262 | "157 235\n"
263 | ]
264 | }
265 | ],
266 | "source": [
267 | "b_success = len(data_b[data_b.call == 1])\n",
268 | "w_success = len(data_w[data_w.call == 1])\n",
269 | "print b_success, w_success"
270 | ]
271 | },
272 | {
273 | "cell_type": "markdown",
274 | "metadata": {},
275 | "source": [
276 | "Let's also calculate the proportion $\hat{p}_b$ of black-sounding names getting a callback and the proportion $\hat{p}_w$ of white-sounding names getting a callback."
277 | ]
278 | },
279 | {
280 | "cell_type": "code",
281 | "execution_count": 75,
282 | "metadata": {
283 | "collapsed": false
284 | },
285 | "outputs": [
286 | {
287 | "name": "stdout",
288 | "output_type": "stream",
289 | "text": [
290 | "Proportion of black sounding names getting a callback : 0.064476386037\n",
291 | "Proportion of white sounding names getting a callback : 0.0965092402464\n"
292 | ]
293 | }
294 | ],
295 | "source": [
296 | "p_b = 1.0 * b_success/num_b\n",
297 | "p_w = 1.0 * w_success/num_w\n",
298 | "print \"Proportion of black sounding names getting a callback : \",p_b\n",
299 | "print \"Proportion of white sounding names getting a callback : \",p_w"
300 | ]
301 | },
302 | {
303 | "cell_type": "markdown",
304 | "metadata": {},
305 | "source": [
306 | "Also, to make sure the samples are big enough that we can use a normal distribution to model the difference between \n",
307 | "proportions, we need to check that $n\hat{p}$ and $n(1-\hat{p})$ are greater than 10 for each group. This success-failure condition justifies the normal approximation that follows from the Central Limit Theorem. "
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": 76,
313 | "metadata": {
314 | "collapsed": false
315 | },
316 | "outputs": [
317 | {
318 | "name": "stdout",
319 | "output_type": "stream",
320 | "text": [
321 | "157.0\n",
322 | "2278.0\n",
323 | "---\n",
324 | "235.0\n",
325 | "2200.0\n"
326 | ]
327 | }
328 | ],
329 | "source": [
330 | "print num_b * p_b\n",
331 | "print num_b * (1-p_b)\n",
332 | "print \"---\"\n",
333 | "print num_w * p_w\n",
334 | "print num_w * (1-p_w)"
335 | ]
336 | },
337 | {
338 | "cell_type": "markdown",
339 | "metadata": {},
340 | "source": [
341 | "Since all the values are above 10, we can use the normal distribution to model the difference between proportions."
342 | ]
343 | },
344 | {
345 | "cell_type": "markdown",
346 | "metadata": {},
347 | "source": [
348 | "Now that we have all the required data, we formulate the null and alternate hypotheses.\n",
349 | "\n",
350 | "$H_0\\:is \\: p_b = p_w\\\\\n",
351 | "H_A \\:is \\: p_b \\neq p_w$\n",
352 | "\n",
353 | "The Standard Error for the sample statistic is given by\n",
354 | "$\\sqrt{\\frac{\\hat{p}_b(1-\\hat{p}_b)}{n_b} + \\frac{\\hat{p}_w(1-\\hat{p}_w)}{n_w}} $\n",
355 | "\n",
356 | "We can use the z-statistic to place a confidence interval on this sample statistic. Hence, the margin of error is \n",
357 | "$Z_{\alpha/2} * SE$. For a 95% confidence interval, the z-value is 1.96. \n",
358 | "\n",
359 | "The confidence interval, subsequently, is $\hat{p}_w - \hat{p}_b \pm {Z_{\alpha/2} * SE}$"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": 77,
365 | "metadata": {
366 | "collapsed": false
367 | },
368 | "outputs": [
369 | {
370 | "name": "stdout",
371 | "output_type": "stream",
372 | "text": [
373 | "Margin of error = 0.0152554063499\n"
374 | ]
375 | }
376 | ],
377 | "source": [
378 | "z = 1.96\n",
379 | "margin = z * math.sqrt( (p_w*(1-p_w)/num_w) + (p_b*(1-p_b)/num_b) )\n",
380 | "\n",
381 | "print \"Margin of error = \", margin"
382 | ]
383 | },
384 | {
385 | "cell_type": "code",
386 | "execution_count": 78,
387 | "metadata": {
388 | "collapsed": false
389 | },
390 | "outputs": [
391 | {
392 | "name": "stdout",
393 | "output_type": "stream",
394 | "text": [
395 | "The confidence interval is given by : 0.0167774478595 to 0.0472882605593\n"
396 | ]
397 | }
398 | ],
399 | "source": [
400 | "print \"The confidence interval is given by :\", p_w-p_b-margin,\"to\", p_w-p_b+margin"
401 | ]
402 | },
403 | {
404 | "cell_type": "markdown",
405 | "metadata": {},
406 | "source": [
407 | "0 is not in this confidence interval, which already points to a significant difference. Let's also go ahead and calculate the p-value."
408 | ]
409 | },
410 | {
411 | "cell_type": "code",
412 | "execution_count": 79,
413 | "metadata": {
414 | "collapsed": false
415 | },
416 | "outputs": [
417 | {
418 | "data": {
419 | "text/plain": [
420 | "(-4.1084121524343464, 3.9838868375850767e-05)"
421 | ]
422 | },
423 | "execution_count": 79,
424 | "metadata": {},
425 | "output_type": "execute_result"
426 | }
427 | ],
428 | "source": [
429 | "from statsmodels.stats.proportion import proportions_ztest as pz\n",
430 | "pz(np.array([b_success,w_success]),np.array([num_b,num_w]),value=0)"
431 | ]
432 | },
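{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an added cross-check (a sketch, not part of the original analysis), the same two-proportion z-statistic can be computed by hand using the proportion pooled under the null hypothesis; it should closely match the statsmodels result above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Added cross-check: pooled two-proportion z-test computed by hand\n",
"p_pool = 1.0 * (b_success + w_success) / (num_b + num_w)\n",
"se_pool = math.sqrt(p_pool * (1 - p_pool) * (1.0/num_b + 1.0/num_w))\n",
"z_stat = (p_b - p_w) / se_pool\n",
"p_val = 2 * stats.norm.cdf(-abs(z_stat))  # two-sided p-value\n",
"z_stat, p_val"
]
},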
433 | {
434 | "cell_type": "markdown",
435 | "metadata": {},
436 | "source": [
437 | "The second value is the p-value, and it is much smaller than 0.05. Hence, we can reject the null hypothesis. \n",
438 | "\n",
439 | "#### Significance of the result\n",
440 | "What does it practically mean to reject the null hypothesis? Our null hypothesis was that the proportion of black-sounding names getting a callback is equal to the proportion of white-sounding names getting a callback. After the analysis, we reject it. This means there is a statistically significant difference in callback rates, with white-sounding names receiving more callbacks."
441 | ]
442 | }
443 | ],
444 | "metadata": {
445 | "kernelspec": {
446 | "display_name": "Python [Root]",
447 | "language": "python",
448 | "name": "Python [Root]"
449 | },
450 | "language_info": {
451 | "codemirror_mode": {
452 | "name": "ipython",
453 | "version": 2
454 | },
455 | "file_extension": ".py",
456 | "mimetype": "text/x-python",
457 | "name": "python",
458 | "nbconvert_exporter": "python",
459 | "pygments_lexer": "ipython2",
460 | "version": "2.7.12"
461 | }
462 | },
463 | "nbformat": 4,
464 | "nbformat_minor": 0
465 | }
466 |
--------------------------------------------------------------------------------
/Capstone Project/Notebooks/Classification_Baseline.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Creating a baseline for classification\n",
8 | "\n",
9 | "\n",
10 | "This notebook attempts to predict the result (home win, away win, or draw) of a fixture, given the two teams playing it, based on their performance in the previous season. We use multiclass classification to predict the match results; more feature engineering on the data should lead to better results."
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 12,
16 | "metadata": {
17 | "collapsed": true
18 | },
19 | "outputs": [],
20 | "source": [
21 | "import pandas as pd\n",
22 | "import numpy as np\n",
23 | "import scipy.stats as scipy\n",
24 | "import random"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "### Load the data"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 15,
37 | "metadata": {
38 | "collapsed": false
39 | },
40 | "outputs": [],
41 | "source": [
42 | "df = pd.read_csv(\"./Data/England/E0_13.csv\")\n",
43 | "df_14 = pd.read_csv(\"./Data/England/E0_14.csv\")"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 16,
49 | "metadata": {
50 | "collapsed": false
51 | },
52 | "outputs": [
53 | {
54 | "data": {
55 | "text/plain": [
56 | "Index([u'Div', u'Date', u'HomeTeam', u'AwayTeam', u'FTHG', u'FTAG', u'FTR',\n",
57 | " u'HTHG', u'HTAG', u'HTR', u'Referee', u'HS', u'AS', u'HST', u'AST',\n",
58 | " u'HF', u'AF', u'HC', u'AC', u'HY', u'AY', u'HR', u'AR', u'B365H',\n",
59 | " u'B365D', u'B365A', u'BWH', u'BWD', u'BWA', u'IWH', u'IWD', u'IWA',\n",
60 | " u'LBH', u'LBD', u'LBA', u'PSH', u'PSD', u'PSA', u'WHH', u'WHD', u'WHA',\n",
61 | " u'SJH', u'SJD', u'SJA', u'VCH', u'VCD', u'VCA', u'Bb1X2', u'BbMxH',\n",
62 | " u'BbAvH', u'BbMxD', u'BbAvD', u'BbMxA', u'BbAvA', u'BbOU', u'BbMx>2.5',\n",
63 | " u'BbAv>2.5', u'BbMx<2.5', u'BbAv<2.5', u'BbAH', u'BbAHh', u'BbMxAHH',\n",
64 | " u'BbAvAHH', u'BbMxAHA', u'BbAvAHA', u'PSCH', u'PSCD', u'PSCA'],\n",
65 | " dtype='object')"
66 | ]
67 | },
68 | "execution_count": 16,
69 | "metadata": {},
70 | "output_type": "execute_result"
71 | }
72 | ],
73 | "source": [
74 | "df.columns"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "### Cleaning\n",
82 | "\n",
83 | "We do not need information about the division, date, referee, or the betting odds from the various bookmakers for this method. "
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 65,
89 | "metadata": {
90 | "collapsed": false
91 | },
92 | "outputs": [],
93 | "source": [
94 | "res_13 = df.ix[:,:23]\n",
95 | "res_13 = res_13.drop(['Div','Date','Referee'],axis=1)\n",
96 | "res_14 = df_14.ix[:,:23]\n",
97 | "res_14 = res_14.drop(['Div','Date','Referee'],axis=1)\n",
98 | "table_features = df.ix[:,:7]\n",
99 | "table_features = table_features.drop(['FTHG','FTAG','Div','Date'],axis=1)\n",
100 | "bet_13 = df.ix[:,23:]"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 19,
106 | "metadata": {
107 | "collapsed": true
108 | },
109 | "outputs": [],
110 | "source": [
111 | "from math import log"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 20,
117 | "metadata": {
118 | "collapsed": false
119 | },
120 | "outputs": [],
121 | "source": [
122 | "entropy = -((0.32 * log(0.32,3)) + (0.20 * log(0.20,3)) + (0.47 * log(0.47,3)))"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 21,
128 | "metadata": {
129 | "collapsed": false
130 | },
131 | "outputs": [
132 | {
133 | "data": {
134 | "text/plain": [
135 | "0.947893245378005"
136 | ]
137 | },
138 | "execution_count": 21,
139 | "metadata": {},
140 | "output_type": "execute_result"
141 | }
142 | ],
143 | "source": [
144 | "entropy"
145 | ]
146 | },
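{
"cell_type": "markdown",
"metadata": {},
"source": [
"The entropy above uses hard-coded outcome probabilities (0.32, 0.20, 0.47). As an added sketch (not part of the original analysis), the same base-3 entropy can be computed directly from the observed distribution of full-time results in res_13.FTR."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Added sketch: base-3 entropy of the observed full-time result distribution (H/D/A)\n",
"counts = res_13.FTR.value_counts()\n",
"probs = counts / float(counts.sum())\n",
"-sum(p * log(p, 3) for p in probs)"
]
},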
147 | {
148 | "cell_type": "code",
149 | "execution_count": 22,
150 | "metadata": {
151 | "collapsed": false,
152 | "scrolled": true
153 | },
154 | "outputs": [],
155 | "source": [
156 | "res_13.head()\n",
157 | "feature_table = df.ix[:,:23]"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": 23,
163 | "metadata": {
164 | "collapsed": false
165 | },
166 | "outputs": [],
167 | "source": [
168 | "# Columns: Team, Home/Away Goals Scored (HGS, AGS), Home/Away Attack Strength (HAS, AAS), Home/Away Goals Conceded (HGC, AGC), Home/Away Defensive Strength (HDS, ADS)\n",
169 | "table_13 = pd.DataFrame(columns=('Team','HGS','AGS','HAS','AAS','HGC','AGC','HDS','ADS'))"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 24,
175 | "metadata": {
176 | "collapsed": false
177 | },
178 | "outputs": [
179 | {
180 | "name": "stdout",
181 | "output_type": "stream",
182 | "text": [
183 | "Average number of goals at home 1.57368421053\n",
184 | "Average number of goals away 1.19473684211\n",
185 | "Average number of goals conceded at home 1.19473684211\n",
186 | "Average number of goals conceded away 1.57368421053\n"
187 | ]
188 | }
189 | ],
190 | "source": [
191 | "avg_home_scored_13 = res_13.FTHG.sum() / 380.0\n",
192 | "avg_away_scored_13 = res_13.FTAG.sum() / 380.0\n",
193 | "avg_home_conceded_13 = avg_away_scored_13\n",
194 | "avg_away_conceded_13 = avg_home_scored_13\n",
195 | "print \"Average number of goals at home\",avg_home_scored_13\n",
196 | "print \"Average number of goals away\", avg_away_scored_13\n",
197 | "print \"Average number of goals conceded at home\",avg_home_conceded_13\n",
198 | "print \"Average number of goals conceded away\",avg_away_conceded_13\n"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": 25,
204 | "metadata": {
205 | "collapsed": false
206 | },
207 | "outputs": [],
208 | "source": [
209 | "res_home = res_13.groupby('HomeTeam')\n",
210 | "res_away = res_13.groupby('AwayTeam')"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": 26,
216 | "metadata": {
217 | "collapsed": false
218 | },
219 | "outputs": [
220 | {
221 | "data": {
304 | "text/plain": [
305 | " Team HGS AGS HAS AAS HGC AGC HDS ADS\n",
306 | "0 Arsenal 36 32 NaN NaN 11 30 NaN NaN\n",
307 | "1 Aston Villa 22 17 NaN NaN 29 32 NaN NaN\n",
308 | "2 Cardiff 20 12 NaN NaN 35 39 NaN NaN\n",
309 | "3 Chelsea 43 28 NaN NaN 11 16 NaN NaN\n",
310 | "4 Crystal Palace 18 15 NaN NaN 23 25 NaN NaN"
311 | ]
312 | },
313 | "execution_count": 26,
314 | "metadata": {},
315 | "output_type": "execute_result"
316 | }
317 | ],
318 | "source": [
319 | "table_13.Team = res_home.HomeTeam.all().values\n",
320 | "table_13.HGS = res_home.FTHG.sum().values\n",
321 | "table_13.HGC = res_home.FTAG.sum().values\n",
322 | "table_13.AGS = res_away.FTAG.sum().values\n",
323 | "table_13.AGC = res_away.FTHG.sum().values\n",
324 | "table_13.head()"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": 27,
330 | "metadata": {
331 | "collapsed": false
332 | },
333 | "outputs": [
334 | {
335 | "data": {
418 | "text/plain": [
419 | " Team HGS AGS HAS AAS HGC AGC HDS ADS\n",
420 | "0 Arsenal 36 32 1.204013 1.409692 11 30 0.484581 1.003344\n",
421 | "1 Aston Villa 22 17 0.735786 0.748899 29 32 1.277533 1.070234\n",
422 | "2 Cardiff 20 12 0.668896 0.528634 35 39 1.541850 1.304348\n",
423 | "3 Chelsea 43 28 1.438127 1.233480 11 16 0.484581 0.535117\n",
424 | "4 Crystal Palace 18 15 0.602007 0.660793 23 25 1.013216 0.836120"
425 | ]
426 | },
427 | "execution_count": 27,
428 | "metadata": {},
429 | "output_type": "execute_result"
430 | }
431 | ],
432 | "source": [
433 | "table_13.HAS = (table_13.HGS / 19.0) / avg_home_scored_13\n",
434 | "table_13.AAS = (table_13.AGS / 19.0) / avg_away_scored_13\n",
435 | "table_13.HDS = (table_13.HGC / 19.0) / avg_home_conceded_13\n",
436 | "table_13.ADS = (table_13.AGC / 19.0) / avg_away_conceded_13\n",
437 | "table_13.head()"
438 | ]
439 | },
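{
"cell_type": "markdown",
"metadata": {},
"source": [
"By construction each strength column is a ratio to the league average, so as a quick added sanity check (not part of the original analysis) the column means should all come out close to 1."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Added sanity check: attack/defence strength ratios should average to roughly 1 across teams\n",
"table_13[['HAS', 'AAS', 'HDS', 'ADS']].mean()"
]
},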
440 | {
441 | "cell_type": "code",
442 | "execution_count": 28,
443 | "metadata": {
444 | "collapsed": false
445 | },
446 | "outputs": [],
447 | "source": [
448 | "feature_table = feature_table[['HomeTeam','AwayTeam','FTR']]\n",
449 | "f_HAS = []\n",
450 | "f_HDS = []\n",
451 | "f_AAS = []\n",
452 | "f_ADS = []\n",
453 | "for index,row in feature_table.iterrows():\n",
454 | " f_HAS.append(table_13[table_13['Team'] == row['HomeTeam']]['HAS'].values[0])\n",
455 | " f_HDS.append(table_13[table_13['Team'] == row['HomeTeam']]['HDS'].values[0])\n",
456 | " f_AAS.append(table_13[table_13['Team'] == row['AwayTeam']]['AAS'].values[0])\n",
457 | " f_ADS.append(table_13[table_13['Team'] == row['AwayTeam']]['ADS'].values[0])\n",
458 | " \n",
459 | "feature_table['HAS'] = f_HAS\n",
460 | "feature_table['HDS'] = f_HDS\n",
461 | "feature_table['AAS'] = f_AAS\n",
462 | "feature_table['ADS'] = f_ADS"
463 | ]
464 | },
465 | {
466 | "cell_type": "code",
467 | "execution_count": 29,
468 | "metadata": {
469 | "collapsed": false
470 | },
471 | "outputs": [
472 | {
473 | "data": {
544 | "text/plain": [
545 | " HomeTeam AwayTeam FTR HAS HDS AAS ADS\n",
546 | "0 Arsenal Aston Villa A 1.204013 0.484581 1.409692 1.003344\n",
547 | "1 Liverpool Stoke H 1.772575 0.792952 2.114537 1.070234\n",
548 | "2 Norwich Everton D 0.568562 0.792952 0.484581 1.471572\n",
549 | "3 Sunderland Fulham A 0.702341 1.189427 0.881057 1.103679\n",
550 | "4 Swansea Man United A 1.103679 1.145374 0.925110 0.936455"
551 | ]
552 | },
553 | "execution_count": 29,
554 | "metadata": {},
555 | "output_type": "execute_result"
556 | }
557 | ],
558 | "source": [
559 | "feature_table.head()"
560 | ]
561 | },
562 | {
563 | "cell_type": "code",
564 | "execution_count": 30,
565 | "metadata": {
566 | "collapsed": true
567 | },
568 | "outputs": [],
569 | "source": [
570 | "def transformResult(row):\n",
571 | " if(row.FTR == 'H'):\n",
572 | " return 1\n",
573 | " elif(row.FTR == 'A'):\n",
574 | " return -1\n",
575 | " else:\n",
576 | " return 0"
577 | ]
578 | },
579 | {
580 | "cell_type": "code",
581 | "execution_count": 31,
582 | "metadata": {
583 | "collapsed": false
584 | },
585 | "outputs": [],
586 | "source": [
587 | "feature_table[\"Result\"] = feature_table.apply(lambda row: transformResult(row),axis=1)"
588 | ]
589 | },
590 | {
591 | "cell_type": "code",
592 | "execution_count": 32,
593 | "metadata": {
594 | "collapsed": false
595 | },
596 | "outputs": [
597 | {
598 | "data": {
675 | "text/plain": [
676 | " HomeTeam AwayTeam FTR HAS HDS AAS ADS Result\n",
677 | "0 Arsenal Aston Villa A 1.204013 0.484581 1.409692 1.003344 -1\n",
678 | "1 Liverpool Stoke H 1.772575 0.792952 2.114537 1.070234 1\n",
679 | "2 Norwich Everton D 0.568562 0.792952 0.484581 1.471572 0\n",
680 | "3 Sunderland Fulham A 0.702341 1.189427 0.881057 1.103679 -1\n",
681 | "4 Swansea Man United A 1.103679 1.145374 0.925110 0.936455 -1"
682 | ]
683 | },
684 | "execution_count": 32,
685 | "metadata": {},
686 | "output_type": "execute_result"
687 | }
688 | ],
689 | "source": [
690 | "feature_table.head()"
691 | ]
692 | },
693 | {
694 | "cell_type": "code",
695 | "execution_count": 33,
696 | "metadata": {
697 | "collapsed": true
698 | },
699 | "outputs": [],
700 | "source": [
701 | "X_train = feature_table[['HAS','HDS','AAS','ADS']]\n",
702 | "y_train = feature_table['Result']"
703 | ]
704 | },
705 | {
706 | "cell_type": "code",
707 | "execution_count": 34,
708 | "metadata": {
709 | "collapsed": false
710 | },
711 | "outputs": [],
712 | "source": [
713 | "from sklearn.tree import DecisionTreeClassifier\n",
714 | "from sklearn.naive_bayes import MultinomialNB\n",
715 | "from xgboost import XGBClassifier\n",
716 | "from sklearn.neighbors import KNeighborsClassifier\n",
717 | "from sklearn.multiclass import OneVsRestClassifier\n",
718 | "\n",
719 | "from sklearn.linear_model import LogisticRegression\n",
720 | "from sklearn.metrics import accuracy_score"
721 | ]
722 | },
723 | {
724 | "cell_type": "markdown",
725 | "metadata": {},
726 | "source": [
727 | "## Randomized Model as Benchmark"
728 | ]
729 | },
730 | {
731 | "cell_type": "code",
732 | "execution_count": 41,
733 | "metadata": {
734 | "collapsed": false
735 | },
736 | "outputs": [],
737 | "source": [
738 | "outcome_list = [-1,0,1]\n",
739 | "y_pred = []\n",
740 | "for i in xrange(1,381):\n",
741 | " y_pred.append(random.choice(outcome_list))\n",
742 | " "
743 | ]
744 | },
745 | {
746 | "cell_type": "code",
747 | "execution_count": 42,
748 | "metadata": {
749 | "collapsed": false
750 | },
751 | "outputs": [
752 | {
753 | "data": {
754 | "text/plain": [
755 | "0.32631578947368423"
756 | ]
757 | },
758 | "execution_count": 42,
759 | "metadata": {},
760 | "output_type": "execute_result"
761 | }
762 | ],
763 | "source": [
764 | "accuracy_score(y_train,y_pred)"
765 | ]
766 | },
767 | {
768 | "cell_type": "markdown",
769 | "metadata": {},
770 | "source": [
771 | "With just a random model, we get an accuracy of about 33% on average, which is expected since every fixture has three possible outcomes. "
772 | ]
773 | },
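{
"cell_type": "markdown",
"metadata": {},
"source": [
"As another naive benchmark (an added sketch, not part of the original analysis), we can always predict a home win, the most common single outcome in a typical season; this usually beats random guessing and is a fairer bar for the classifiers below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Added sketch: majority-class baseline that always predicts a home win (encoded as 1)\n",
"y_majority = [1] * len(y_train)\n",
"accuracy_score(y_train, y_majority)"
]
},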
774 | {
775 | "cell_type": "markdown",
776 | "metadata": {},
777 | "source": [
778 | "### Classifiers"
779 | ]
780 | },
781 | {
782 | "cell_type": "code",
783 | "execution_count": 87,
784 | "metadata": {
785 | "collapsed": false
786 | },
787 | "outputs": [
788 | {
789 | "data": {
790 | "text/plain": [
791 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
792 | " metric_params=None, n_jobs=1, n_neighbors=15, p=2,\n",
793 | " weights='uniform')"
794 | ]
795 | },
796 | "execution_count": 87,
797 | "metadata": {},
798 | "output_type": "execute_result"
799 | }
800 | ],
801 | "source": [
802 | "clf1 = DecisionTreeClassifier()\n",
803 | "clf2 = XGBClassifier()\n",
804 | "clf3 = KNeighborsClassifier(n_neighbors=15)\n",
805 | "clf3.fit(X_train,y_train)"
806 | ]
807 | },
808 | {
809 | "cell_type": "code",
810 | "execution_count": 88,
811 | "metadata": {
812 | "collapsed": false
813 | },
814 | "outputs": [
815 | {
816 | "data": {
817 | "text/plain": [
818 | "0.54736842105263162"
819 | ]
820 | },
821 | "execution_count": 88,
822 | "metadata": {},
823 | "output_type": "execute_result"
824 | }
825 | ],
826 | "source": [
827 | "y_pred = clf3.predict(X_train)\n",
828 | "accuracy_score(y_pred,y_train)"
829 | ]
830 | },
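{
"cell_type": "markdown",
"metadata": {},
"source": [
"The accuracy above is measured on the training data, so it is likely optimistic. As an added sketch (assuming scikit-learn >= 0.18, which provides sklearn.model_selection), k-fold cross-validation gives a less optimistic estimate of out-of-sample accuracy for the same model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Added sketch: 10-fold cross-validated accuracy for the KNN model\n",
"from sklearn.model_selection import cross_val_score  # assumes scikit-learn >= 0.18\n",
"cv_scores = cross_val_score(KNeighborsClassifier(n_neighbors=15), X_train, y_train, cv=10)\n",
"cv_scores.mean()"
]
},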
831 | {
832 | "cell_type": "code",
833 | "execution_count": null,
834 | "metadata": {
835 | "collapsed": true
836 | },
837 | "outputs": [],
838 | "source": []
839 | },
840 | {
841 | "cell_type": "code",
842 | "execution_count": null,
843 | "metadata": {
844 | "collapsed": true
845 | },
846 | "outputs": [],
847 | "source": []
848 | }
849 | ],
850 | "metadata": {
851 | "kernelspec": {
852 | "display_name": "Python [Root]",
853 | "language": "python",
854 | "name": "Python [Root]"
855 | },
856 | "language_info": {
857 | "codemirror_mode": {
858 | "name": "ipython",
859 | "version": 2
860 | },
861 | "file_extension": ".py",
862 | "mimetype": "text/x-python",
863 | "name": "python",
864 | "nbconvert_exporter": "python",
865 | "pygments_lexer": "ipython2",
866 | "version": "2.7.12"
867 | }
868 | },
869 | "nbformat": 4,
870 | "nbformat_minor": 0
871 | }
872 |
--------------------------------------------------------------------------------
/Capstone Project/.ipynb_checkpoints/Classification_Baseline-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Creating a baseline for classification\n",
8 | "\n",
9 | "\n",
10 | "This notebook attempts to predict the result (home win, away win, or draw) of a fixture, given the two teams playing it, based on their performance in the previous season. We use multiclass classification to predict the match results; more feature engineering on the data should lead to better results."
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 12,
16 | "metadata": {
17 | "collapsed": true
18 | },
19 | "outputs": [],
20 | "source": [
21 | "import pandas as pd\n",
22 | "import numpy as np\n",
23 | "import scipy.stats as scipy\n",
24 | "import random"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "### Load the data"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 15,
37 | "metadata": {
38 | "collapsed": false
39 | },
40 | "outputs": [],
41 | "source": [
42 | "df = pd.read_csv(\"./Data/England/E0_13.csv\")\n",
43 | "df_14 = pd.read_csv(\"./Data/England/E0_14.csv\")"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 16,
49 | "metadata": {
50 | "collapsed": false
51 | },
52 | "outputs": [
53 | {
54 | "data": {
55 | "text/plain": [
56 | "Index([u'Div', u'Date', u'HomeTeam', u'AwayTeam', u'FTHG', u'FTAG', u'FTR',\n",
57 | " u'HTHG', u'HTAG', u'HTR', u'Referee', u'HS', u'AS', u'HST', u'AST',\n",
58 | " u'HF', u'AF', u'HC', u'AC', u'HY', u'AY', u'HR', u'AR', u'B365H',\n",
59 | " u'B365D', u'B365A', u'BWH', u'BWD', u'BWA', u'IWH', u'IWD', u'IWA',\n",
60 | " u'LBH', u'LBD', u'LBA', u'PSH', u'PSD', u'PSA', u'WHH', u'WHD', u'WHA',\n",
61 | " u'SJH', u'SJD', u'SJA', u'VCH', u'VCD', u'VCA', u'Bb1X2', u'BbMxH',\n",
62 | " u'BbAvH', u'BbMxD', u'BbAvD', u'BbMxA', u'BbAvA', u'BbOU', u'BbMx>2.5',\n",
63 | " u'BbAv>2.5', u'BbMx<2.5', u'BbAv<2.5', u'BbAH', u'BbAHh', u'BbMxAHH',\n",
64 | " u'BbAvAHH', u'BbMxAHA', u'BbAvAHA', u'PSCH', u'PSCD', u'PSCA'],\n",
65 | " dtype='object')"
66 | ]
67 | },
68 | "execution_count": 16,
69 | "metadata": {},
70 | "output_type": "execute_result"
71 | }
72 | ],
73 | "source": [
74 | "df.columns"
75 | ]
76 | },
77 | {
78 | "cell_type": "markdown",
79 | "metadata": {},
80 | "source": [
81 | "### Cleaning\n",
82 | "\n",
83 | "We do not need information about division, data, referee and the betting odds from various companies for this method. "
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 65,
89 | "metadata": {
90 | "collapsed": false
91 | },
92 | "outputs": [],
93 | "source": [
94 | "res_13 = df.ix[:,:23]\n",
95 | "res_13 = res_13.drop(['Div','Date','Referee'],axis=1)\n",
96 | "res_14 = df_14.ix[:,:23]\n",
97 | "res_14 = res_14.drop(['Div','Date','Referee'],axis=1)\n",
98 | "table_features = df.ix[:,:7]\n",
99 | "table_features = table_features.drop(['FTHG','FTAG','Div','Date'],axis=1)\n",
100 | "bet_13 = df.ix[:,23:]"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 19,
106 | "metadata": {
107 | "collapsed": true
108 | },
109 | "outputs": [],
110 | "source": [
111 | "from math import log"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 20,
117 | "metadata": {
118 | "collapsed": false
119 | },
120 | "outputs": [],
121 | "source": [
122 | "entropy = -((0.32 * log(0.32,3)) + (0.20 * log(0.20,3)) + (0.47 * log(0.47,3)))"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 21,
128 | "metadata": {
129 | "collapsed": false
130 | },
131 | "outputs": [
132 | {
133 | "data": {
134 | "text/plain": [
135 | "0.947893245378005"
136 | ]
137 | },
138 | "execution_count": 21,
139 | "metadata": {},
140 | "output_type": "execute_result"
141 | }
142 | ],
143 | "source": [
144 | "entropy"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": 22,
150 | "metadata": {
151 | "collapsed": false,
152 | "scrolled": true
153 | },
154 | "outputs": [],
155 | "source": [
156 | "res_13.head()\n",
157 | "feature_table = df.ix[:,:23]"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": 23,
163 | "metadata": {
164 | "collapsed": false
165 | },
166 | "outputs": [],
167 | "source": [
168 | "#Team, Home Goals Score, Away Goals Score, Attack Strength, Home Goals Conceded, Away Goals Conceded, Defensive Strength\n",
169 | "table_13 = pd.DataFrame(columns=('Team','HGS','AGS','HAS','AAS','HGC','AGC','HDS','ADS'))"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 24,
175 | "metadata": {
176 | "collapsed": false
177 | },
178 | "outputs": [
179 | {
180 | "name": "stdout",
181 | "output_type": "stream",
182 | "text": [
183 | "Average number of goals at home 1.57368421053\n",
184 | "Average number of goals away 1.19473684211\n",
185 | "Average number of goals conceded at home 1.57368421053\n",
186 | "Average number of goals conceded away 1.19473684211\n"
187 | ]
188 | }
189 | ],
190 | "source": [
191 | "avg_home_scored_13 = res_13.FTHG.sum() / 380.0\n",
192 | "avg_away_scored_13 = res_13.FTAG.sum() / 380.0\n",
193 | "avg_home_conceded_13 = avg_away_scored_13\n",
194 | "avg_away_conceded_13 = avg_home_scored_13\n",
195 | "print \"Average number of goals at home\",avg_home_scored_13\n",
196 | "print \"Average number of goals away\", avg_away_scored_13\n",
197 | "print \"Average number of goals conceded at home\",avg_away_conceded_13\n",
198 | "print \"Average number of goals conceded away\",avg_home_conceded_13\n"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": 25,
204 | "metadata": {
205 | "collapsed": false
206 | },
207 | "outputs": [],
208 | "source": [
209 | "res_home = res_13.groupby('HomeTeam')\n",
210 | "res_away = res_13.groupby('AwayTeam')"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": 26,
216 | "metadata": {
217 | "collapsed": false
218 | },
219 | "outputs": [
220 | {
221 | "data": {
222 | "text/html": [
223 | "\n",
224 | "
\n",
225 | " \n",
226 | " \n",
227 | " | \n",
228 | " Team | \n",
229 | " HGS | \n",
230 | " AGS | \n",
231 | " HAS | \n",
232 | " AAS | \n",
233 | " HGC | \n",
234 | " AGC | \n",
235 | " HDS | \n",
236 | " ADS | \n",
237 | "
\n",
238 | " \n",
239 | " \n",
240 | " \n",
241 | " | 0 | \n",
242 | " Arsenal | \n",
243 | " 36 | \n",
244 | " 32 | \n",
245 | " NaN | \n",
246 | " NaN | \n",
247 | " 11 | \n",
248 | " 30 | \n",
249 | " NaN | \n",
250 | " NaN | \n",
251 | "
\n",
252 | " \n",
253 | " | 1 | \n",
254 | " Aston Villa | \n",
255 | " 22 | \n",
256 | " 17 | \n",
257 | " NaN | \n",
258 | " NaN | \n",
259 | " 29 | \n",
260 | " 32 | \n",
261 | " NaN | \n",
262 | " NaN | \n",
263 | "
\n",
264 | " \n",
265 | " | 2 | \n",
266 | " Cardiff | \n",
267 | " 20 | \n",
268 | " 12 | \n",
269 | " NaN | \n",
270 | " NaN | \n",
271 | " 35 | \n",
272 | " 39 | \n",
273 | " NaN | \n",
274 | " NaN | \n",
275 | "
\n",
276 | " \n",
277 | " | 3 | \n",
278 | " Chelsea | \n",
279 | " 43 | \n",
280 | " 28 | \n",
281 | " NaN | \n",
282 | " NaN | \n",
283 | " 11 | \n",
284 | " 16 | \n",
285 | " NaN | \n",
286 | " NaN | \n",
287 | "
\n",
288 | " \n",
289 | " | 4 | \n",
290 | " Crystal Palace | \n",
291 | " 18 | \n",
292 | " 15 | \n",
293 | " NaN | \n",
294 | " NaN | \n",
295 | " 23 | \n",
296 | " 25 | \n",
297 | " NaN | \n",
298 | " NaN | \n",
299 | "
\n",
300 | " \n",
301 | "
\n",
302 | "
"
303 | ],
304 | "text/plain": [
305 | " Team HGS AGS HAS AAS HGC AGC HDS ADS\n",
306 | "0 Arsenal 36 32 NaN NaN 11 30 NaN NaN\n",
307 | "1 Aston Villa 22 17 NaN NaN 29 32 NaN NaN\n",
308 | "2 Cardiff 20 12 NaN NaN 35 39 NaN NaN\n",
309 | "3 Chelsea 43 28 NaN NaN 11 16 NaN NaN\n",
310 | "4 Crystal Palace 18 15 NaN NaN 23 25 NaN NaN"
311 | ]
312 | },
313 | "execution_count": 26,
314 | "metadata": {},
315 | "output_type": "execute_result"
316 | }
317 | ],
318 | "source": [
319 | "table_13.Team = res_home.HomeTeam.all().values\n",
320 | "table_13.HGS = res_home.FTHG.sum().values\n",
321 | "table_13.HGC = res_home.FTAG.sum().values\n",
322 | "table_13.AGS = res_away.FTAG.sum().values\n",
323 | "table_13.AGC = res_away.FTHG.sum().values\n",
324 | "table_13.head()"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": 27,
330 | "metadata": {
331 | "collapsed": false
332 | },
333 | "outputs": [
334 | {
335 | "data": {
336 | "text/html": [
337 | "\n",
338 | "
\n",
339 | " \n",
340 | " \n",
341 | " | \n",
342 | " Team | \n",
343 | " HGS | \n",
344 | " AGS | \n",
345 | " HAS | \n",
346 | " AAS | \n",
347 | " HGC | \n",
348 | " AGC | \n",
349 | " HDS | \n",
350 | " ADS | \n",
351 | "
\n",
352 | " \n",
353 | " \n",
354 | " \n",
355 | " | 0 | \n",
356 | " Arsenal | \n",
357 | " 36 | \n",
358 | " 32 | \n",
359 | " 1.204013 | \n",
360 | " 1.409692 | \n",
361 | " 11 | \n",
362 | " 30 | \n",
363 | " 0.484581 | \n",
364 | " 1.003344 | \n",
365 | "
\n",
366 | " \n",
367 | " | 1 | \n",
368 | " Aston Villa | \n",
369 | " 22 | \n",
370 | " 17 | \n",
371 | " 0.735786 | \n",
372 | " 0.748899 | \n",
373 | " 29 | \n",
374 | " 32 | \n",
375 | " 1.277533 | \n",
376 | " 1.070234 | \n",
377 | "
\n",
378 | " \n",
379 | " | 2 | \n",
380 | " Cardiff | \n",
381 | " 20 | \n",
382 | " 12 | \n",
383 | " 0.668896 | \n",
384 | " 0.528634 | \n",
385 | " 35 | \n",
386 | " 39 | \n",
387 | " 1.541850 | \n",
388 | " 1.304348 | \n",
389 | "
\n",
390 | " \n",
391 | " | 3 | \n",
392 | " Chelsea | \n",
393 | " 43 | \n",
394 | " 28 | \n",
395 | " 1.438127 | \n",
396 | " 1.233480 | \n",
397 | " 11 | \n",
398 | " 16 | \n",
399 | " 0.484581 | \n",
400 | " 0.535117 | \n",
401 | "
\n",
402 | " \n",
403 | " | 4 | \n",
404 | " Crystal Palace | \n",
405 | " 18 | \n",
406 | " 15 | \n",
407 | " 0.602007 | \n",
408 | " 0.660793 | \n",
409 | " 23 | \n",
410 | " 25 | \n",
411 | " 1.013216 | \n",
412 | " 0.836120 | \n",
413 | "
\n",
414 | " \n",
415 | "
\n",
416 | "
"
417 | ],
418 | "text/plain": [
419 | " Team HGS AGS HAS AAS HGC AGC HDS ADS\n",
420 | "0 Arsenal 36 32 1.204013 1.409692 11 30 0.484581 1.003344\n",
421 | "1 Aston Villa 22 17 0.735786 0.748899 29 32 1.277533 1.070234\n",
422 | "2 Cardiff 20 12 0.668896 0.528634 35 39 1.541850 1.304348\n",
423 | "3 Chelsea 43 28 1.438127 1.233480 11 16 0.484581 0.535117\n",
424 | "4 Crystal Palace 18 15 0.602007 0.660793 23 25 1.013216 0.836120"
425 | ]
426 | },
427 | "execution_count": 27,
428 | "metadata": {},
429 | "output_type": "execute_result"
430 | }
431 | ],
432 | "source": [
433 | "table_13.HAS = (table_13.HGS / 19.0) / avg_home_scored_13\n",
434 | "table_13.AAS = (table_13.AGS / 19.0) / avg_away_scored_13\n",
435 | "table_13.HDS = (table_13.HGC / 19.0) / avg_home_conceded_13\n",
436 | "table_13.ADS = (table_13.AGC / 19.0) / avg_away_conceded_13\n",
437 | "table_13.head()"
438 | ]
439 | },
440 | {
441 | "cell_type": "code",
442 | "execution_count": 28,
443 | "metadata": {
444 | "collapsed": false
445 | },
446 | "outputs": [],
447 | "source": [
448 | "feature_table = feature_table[['HomeTeam','AwayTeam','FTR']]\n",
449 | "f_HAS = []\n",
450 | "f_HDS = []\n",
451 | "f_AAS = []\n",
452 | "f_ADS = []\n",
453 | "for index,row in feature_table.iterrows():\n",
454 | " f_HAS.append(table_13[table_13['Team'] == row['HomeTeam']]['HAS'].values[0])\n",
455 | " f_HDS.append(table_13[table_13['Team'] == row['HomeTeam']]['HDS'].values[0])\n",
456 | " f_AAS.append(table_13[table_13['Team'] == row['HomeTeam']]['AAS'].values[0])\n",
457 | " f_ADS.append(table_13[table_13['Team'] == row['HomeTeam']]['ADS'].values[0])\n",
458 | " \n",
459 | "feature_table['HAS'] = f_HAS\n",
460 | "feature_table['HDS'] = f_HDS\n",
461 | "feature_table['AAS'] = f_AAS\n",
462 | "feature_table['ADS'] = f_ADS"
463 | ]
464 | },
465 | {
466 | "cell_type": "code",
467 | "execution_count": 29,
468 | "metadata": {
469 | "collapsed": false
470 | },
471 | "outputs": [
472 | {
473 | "data": {
474 | "text/html": [
475 | "\n",
476 | "
\n",
477 | " \n",
478 | " \n",
479 | " | \n",
480 | " HomeTeam | \n",
481 | " AwayTeam | \n",
482 | " FTR | \n",
483 | " HAS | \n",
484 | " HDS | \n",
485 | " AAS | \n",
486 | " ADS | \n",
487 | "
\n",
488 | " \n",
489 | " \n",
490 | " \n",
491 | " | 0 | \n",
492 | " Arsenal | \n",
493 | " Aston Villa | \n",
494 | " A | \n",
495 | " 1.204013 | \n",
496 | " 0.484581 | \n",
497 | " 1.409692 | \n",
498 | " 1.003344 | \n",
499 | "
\n",
500 | " \n",
501 | " | 1 | \n",
502 | " Liverpool | \n",
503 | " Stoke | \n",
504 | " H | \n",
505 | " 1.772575 | \n",
506 | " 0.792952 | \n",
507 | " 2.114537 | \n",
508 | " 1.070234 | \n",
509 | "
\n",
510 | " \n",
511 | " | 2 | \n",
512 | " Norwich | \n",
513 | " Everton | \n",
514 | " D | \n",
515 | " 0.568562 | \n",
516 | " 0.792952 | \n",
517 | " 0.484581 | \n",
518 | " 1.471572 | \n",
519 | "
\n",
520 | " \n",
521 | " | 3 | \n",
522 | " Sunderland | \n",
523 | " Fulham | \n",
524 | " A | \n",
525 | " 0.702341 | \n",
526 | " 1.189427 | \n",
527 | " 0.881057 | \n",
528 | " 1.103679 | \n",
529 | "
\n",
530 | " \n",
531 | " | 4 | \n",
532 | " Swansea | \n",
533 | " Man United | \n",
534 | " A | \n",
535 | " 1.103679 | \n",
536 | " 1.145374 | \n",
537 | " 0.925110 | \n",
538 | " 0.936455 | \n",
539 | "
\n",
540 | " \n",
541 | "
\n",
542 | "
"
543 | ],
544 | "text/plain": [
545 | " HomeTeam AwayTeam FTR HAS HDS AAS ADS\n",
546 | "0 Arsenal Aston Villa A 1.204013 0.484581 1.409692 1.003344\n",
547 | "1 Liverpool Stoke H 1.772575 0.792952 2.114537 1.070234\n",
548 | "2 Norwich Everton D 0.568562 0.792952 0.484581 1.471572\n",
549 | "3 Sunderland Fulham A 0.702341 1.189427 0.881057 1.103679\n",
550 | "4 Swansea Man United A 1.103679 1.145374 0.925110 0.936455"
551 | ]
552 | },
553 | "execution_count": 29,
554 | "metadata": {},
555 | "output_type": "execute_result"
556 | }
557 | ],
558 | "source": [
559 | "feature_table.head()"
560 | ]
561 | },
562 | {
563 | "cell_type": "code",
564 | "execution_count": 30,
565 | "metadata": {
566 | "collapsed": true
567 | },
568 | "outputs": [],
569 | "source": [
570 | "def transformResult(row):\n",
571 | " if(row.FTR == 'H'):\n",
572 | " return 1\n",
573 | " elif(row.FTR == 'A'):\n",
574 | " return -1\n",
575 | " else:\n",
576 | " return 0"
577 | ]
578 | },
579 | {
580 | "cell_type": "code",
581 | "execution_count": 31,
582 | "metadata": {
583 | "collapsed": false
584 | },
585 | "outputs": [],
586 | "source": [
587 | "feature_table[\"Result\"] = feature_table.apply(lambda row: transformResult(row),axis=1)"
588 | ]
589 | },
590 | {
591 | "cell_type": "code",
592 | "execution_count": 32,
593 | "metadata": {
594 | "collapsed": false
595 | },
596 | "outputs": [
597 | {
598 | "data": {
599 | "text/html": [
600 | "\n",
601 | "
\n",
602 | " \n",
603 | " \n",
604 | " | \n",
605 | " HomeTeam | \n",
606 | " AwayTeam | \n",
607 | " FTR | \n",
608 | " HAS | \n",
609 | " HDS | \n",
610 | " AAS | \n",
611 | " ADS | \n",
612 | " Result | \n",
613 | "
\n",
614 | " \n",
615 | " \n",
616 | " \n",
617 | " | 0 | \n",
618 | " Arsenal | \n",
619 | " Aston Villa | \n",
620 | " A | \n",
621 | " 1.204013 | \n",
622 | " 0.484581 | \n",
623 | " 1.409692 | \n",
624 | " 1.003344 | \n",
625 | " -1 | \n",
626 | "
\n",
627 | " \n",
628 | " | 1 | \n",
629 | " Liverpool | \n",
630 | " Stoke | \n",
631 | " H | \n",
632 | " 1.772575 | \n",
633 | " 0.792952 | \n",
634 | " 2.114537 | \n",
635 | " 1.070234 | \n",
636 | " 1 | \n",
637 | "
\n",
638 | " \n",
639 | " | 2 | \n",
640 | " Norwich | \n",
641 | " Everton | \n",
642 | " D | \n",
643 | " 0.568562 | \n",
644 | " 0.792952 | \n",
645 | " 0.484581 | \n",
646 | " 1.471572 | \n",
647 | " 0 | \n",
648 | "
\n",
649 | " \n",
650 | " | 3 | \n",
651 | " Sunderland | \n",
652 | " Fulham | \n",
653 | " A | \n",
654 | " 0.702341 | \n",
655 | " 1.189427 | \n",
656 | " 0.881057 | \n",
657 | " 1.103679 | \n",
658 | " -1 | \n",
659 | "
\n",
660 | " \n",
661 | " | 4 | \n",
662 | " Swansea | \n",
663 | " Man United | \n",
664 | " A | \n",
665 | " 1.103679 | \n",
666 | " 1.145374 | \n",
667 | " 0.925110 | \n",
668 | " 0.936455 | \n",
669 | " -1 | \n",
670 | "
\n",
671 | " \n",
672 | "
\n",
673 | "
"
674 | ],
675 | "text/plain": [
676 | " HomeTeam AwayTeam FTR HAS HDS AAS ADS Result\n",
677 | "0 Arsenal Aston Villa A 1.204013 0.484581 1.409692 1.003344 -1\n",
678 | "1 Liverpool Stoke H 1.772575 0.792952 2.114537 1.070234 1\n",
679 | "2 Norwich Everton D 0.568562 0.792952 0.484581 1.471572 0\n",
680 | "3 Sunderland Fulham A 0.702341 1.189427 0.881057 1.103679 -1\n",
681 | "4 Swansea Man United A 1.103679 1.145374 0.925110 0.936455 -1"
682 | ]
683 | },
684 | "execution_count": 32,
685 | "metadata": {},
686 | "output_type": "execute_result"
687 | }
688 | ],
689 | "source": [
690 | "feature_table.head()"
691 | ]
692 | },
693 | {
694 | "cell_type": "code",
695 | "execution_count": 33,
696 | "metadata": {
697 | "collapsed": true
698 | },
699 | "outputs": [],
700 | "source": [
701 | "X_train = feature_table[['HAS','HDS','AAS','ADS']]\n",
702 | "y_train = feature_table['Result']"
703 | ]
704 | },
705 | {
706 | "cell_type": "code",
707 | "execution_count": null,
708 | "metadata": {
709 | "collapsed": true
710 | },
711 | "outputs": [],
712 | "source": []
713 | },
714 | {
715 | "cell_type": "code",
716 | "execution_count": 34,
717 | "metadata": {
718 | "collapsed": false
719 | },
720 | "outputs": [],
721 | "source": [
722 | "from sklearn.tree import DecisionTreeClassifier\n",
723 | "from sklearn.naive_bayes import MultinomialNB\n",
724 | "from xgboost import XGBClassifier\n",
725 | "from sklearn.neighbors import KNeighborsClassifier\n",
726 | "from sklearn.multiclass import OneVsRestClassifier\n",
727 | "\n",
728 | "from sklearn.linear_model import LogisticRegression\n",
729 | "from sklearn.metrics import accuracy_score"
730 | ]
731 | },
732 | {
733 | "cell_type": "markdown",
734 | "metadata": {},
735 | "source": [
736 | "## Randomized Model as Benchmark"
737 | ]
738 | },
739 | {
740 | "cell_type": "code",
741 | "execution_count": 41,
742 | "metadata": {
743 | "collapsed": false
744 | },
745 | "outputs": [],
746 | "source": [
747 | "outcome_list = [-1,0,1]\n",
748 | "y_pred = []\n",
749 | "for i in xrange(1,381):\n",
750 | " y_pred.append(random.choice(outcome_list))\n",
751 | " "
752 | ]
753 | },
754 | {
755 | "cell_type": "code",
756 | "execution_count": 42,
757 | "metadata": {
758 | "collapsed": false
759 | },
760 | "outputs": [
761 | {
762 | "data": {
763 | "text/plain": [
764 | "0.32631578947368423"
765 | ]
766 | },
767 | "execution_count": 42,
768 | "metadata": {},
769 | "output_type": "execute_result"
770 | }
771 | ],
772 | "source": [
773 | "accuracy_score(y_train,y_pred)"
774 | ]
775 | },
776 | {
777 | "cell_type": "markdown",
778 | "metadata": {},
779 | "source": [
780 | "With just a random model, we get an accuracy of 33% on an average which is expected since there are three outcomes to any fixture. "
781 | ]
782 | },
783 | {
784 | "cell_type": "markdown",
785 | "metadata": {},
786 | "source": [
787 | "### Classifiers"
788 | ]
789 | },
790 | {
791 | "cell_type": "code",
792 | "execution_count": 87,
793 | "metadata": {
794 | "collapsed": false
795 | },
796 | "outputs": [
797 | {
798 | "data": {
799 | "text/plain": [
800 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n",
801 | " metric_params=None, n_jobs=1, n_neighbors=15, p=2,\n",
802 | " weights='uniform')"
803 | ]
804 | },
805 | "execution_count": 87,
806 | "metadata": {},
807 | "output_type": "execute_result"
808 | }
809 | ],
810 | "source": [
811 | "clf1 = DecisionTreeClassifier()\n",
812 | "clf2 = XGBClassifier()\n",
813 | "clf3 = KNeighborsClassifier(n_neighbors=15)\n",
814 | "clf3.fit(X_train,y_train)"
815 | ]
816 | },
817 | {
818 | "cell_type": "code",
819 | "execution_count": 88,
820 | "metadata": {
821 | "collapsed": false
822 | },
823 | "outputs": [
824 | {
825 | "data": {
826 | "text/plain": [
827 | "0.54736842105263162"
828 | ]
829 | },
830 | "execution_count": 88,
831 | "metadata": {},
832 | "output_type": "execute_result"
833 | }
834 | ],
835 | "source": [
836 | "y_pred = clf3.predict(X_train)\n",
837 | "accuracy_score(y_pred,y_train)"
838 | ]
839 | },
840 | {
841 | "cell_type": "code",
842 | "execution_count": null,
843 | "metadata": {
844 | "collapsed": true
845 | },
846 | "outputs": [],
847 | "source": []
848 | },
849 | {
850 | "cell_type": "code",
851 | "execution_count": null,
852 | "metadata": {
853 | "collapsed": true
854 | },
855 | "outputs": [],
856 | "source": []
857 | }
858 | ],
859 | "metadata": {
860 | "kernelspec": {
861 | "display_name": "Python [Root]",
862 | "language": "python",
863 | "name": "Python [Root]"
864 | },
865 | "language_info": {
866 | "codemirror_mode": {
867 | "name": "ipython",
868 | "version": 2
869 | },
870 | "file_extension": ".py",
871 | "mimetype": "text/x-python",
872 | "name": "python",
873 | "nbconvert_exporter": "python",
874 | "pygments_lexer": "ipython2",
875 | "version": "2.7.12"
876 | }
877 | },
878 | "nbformat": 4,
879 | "nbformat_minor": 0
880 | }
881 |
--------------------------------------------------------------------------------
/Data Wrangling/data_wrangling_xml/data_wrangling_xml/sliderule_dsi_xml_exercise.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# XML example and exercise\n",
8 | "****\n",
9 | "+ study examples of accessing nodes in XML tree structure \n",
10 | "+ work on exercise to be completed and submitted\n",
11 | "****\n",
12 | "+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html\n",
13 | "+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial\n",
14 | "****"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "from xml.etree import ElementTree as ET"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "## XML example\n",
33 | "\n",
34 | "+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 3,
40 | "metadata": {
41 | "collapsed": true
42 | },
43 | "outputs": [],
44 | "source": [
45 | "document_tree = ET.parse( './data/mondial_database_less.xml' )"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 4,
51 | "metadata": {
52 | "collapsed": false
53 | },
54 | "outputs": [
55 | {
56 | "name": "stdout",
57 | "output_type": "stream",
58 | "text": [
59 | "Albania\n",
60 | "Greece\n",
61 | "Macedonia\n",
62 | "Serbia\n",
63 | "Montenegro\n",
64 | "Kosovo\n",
65 | "Andorra\n"
66 | ]
67 | }
68 | ],
69 | "source": [
70 | "# print names of all countries\n",
71 | "for child in document_tree.getroot():\n",
72 | " print child.find('name').text"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 5,
78 | "metadata": {
79 | "collapsed": false
80 | },
81 | "outputs": [
82 | {
83 | "name": "stdout",
84 | "output_type": "stream",
85 | "text": [
86 | "* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë\n",
87 | "* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes\n",
88 | "* Macedonia: Skopje, Kumanovo\n",
89 | "* Serbia: Beograd, Novi Sad, Niš\n",
90 | "* Montenegro: Podgorica\n",
91 | "* Kosovo: Prishtine\n",
92 | "* Andorra: Andorra la Vella\n"
93 | ]
94 | }
95 | ],
96 | "source": [
97 | "# print names of all countries and their cities\n",
98 | "for element in document_tree.iterfind('country'):\n",
99 | " print '* ' + element.find('name').text + ':',\n",
100 | " capitals_string = ''\n",
101 | " for subelement in element.getiterator('city'):\n",
102 | " capitals_string += subelement.find('name').text + ', '\n",
103 | " print capitals_string[:-2]"
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {},
109 | "source": [
110 | "****\n",
111 | "## XML exercise\n",
112 | "\n",
113 | "Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find\n",
114 | "\n",
115 | "1. 10 countries with the lowest infant mortality rates\n",
116 | "2. 10 cities with the largest population\n",
117 | "3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)\n",
118 | "4. name and country of a) longest river, b) largest lake and c) airport at highest elevation"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 6,
124 | "metadata": {
125 | "collapsed": false
126 | },
127 | "outputs": [],
128 | "source": [
129 | "document = ET.parse( './data/mondial_database.xml' )"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 7,
135 | "metadata": {
136 | "collapsed": false
137 | },
138 | "outputs": [],
139 | "source": [
140 | "#1. 10 countries with the lowest infant mortality rates\n",
141 | "country_im = []\n",
142 | "for country in document.iterfind('country'):\n",
143 | " if country.find('infant_mortality') is not None:\n",
144 | " country_im.append([country.find('name').text,country.find('infant_mortality').text])\n",
145 | " "
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 8,
151 | "metadata": {
152 | "collapsed": false
153 | },
154 | "outputs": [],
155 | "source": [
156 | "import pandas as pd\n",
157 | "im = pd.DataFrame(country_im)\n",
158 | "im.columns = [\"country\",\"infant_moratality\"]"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 9,
164 | "metadata": {
165 | "collapsed": false
166 | },
167 | "outputs": [
168 | {
169 | "data": {
170 | "text/html": [
171 | "\n",
172 | "
\n",
173 | " \n",
174 | " \n",
175 | " | \n",
176 | " country | \n",
177 | " infant_moratality | \n",
178 | "
\n",
179 | " \n",
180 | " \n",
181 | " \n",
182 | " | 36 | \n",
183 | " Monaco | \n",
184 | " 1.81 | \n",
185 | "
\n",
186 | " \n",
187 | " | 90 | \n",
188 | " Japan | \n",
189 | " 2.13 | \n",
190 | "
\n",
191 | " \n",
192 | " | 109 | \n",
193 | " Bermuda | \n",
194 | " 2.48 | \n",
195 | "
\n",
196 | " \n",
197 | " | 34 | \n",
198 | " Norway | \n",
199 | " 2.48 | \n",
200 | "
\n",
201 | " \n",
202 | " | 98 | \n",
203 | " Singapore | \n",
204 | " 2.53 | \n",
205 | "
\n",
206 | " \n",
207 | " | 35 | \n",
208 | " Sweden | \n",
209 | " 2.60 | \n",
210 | "
\n",
211 | " \n",
212 | " | 8 | \n",
213 | " Czech Republic | \n",
214 | " 2.63 | \n",
215 | "
\n",
216 | " \n",
217 | " | 72 | \n",
218 | " Hong Kong | \n",
219 | " 2.73 | \n",
220 | "
\n",
221 | " \n",
222 | " | 73 | \n",
223 | " Macao | \n",
224 | " 3.13 | \n",
225 | "
\n",
226 | " \n",
227 | " | 39 | \n",
228 | " Iceland | \n",
229 | " 3.15 | \n",
230 | "
\n",
231 | " \n",
232 | "
\n",
233 | "
"
234 | ],
235 | "text/plain": [
236 | " country infant_moratality\n",
237 | "36 Monaco 1.81\n",
238 | "90 Japan 2.13\n",
239 | "109 Bermuda 2.48\n",
240 | "34 Norway 2.48\n",
241 | "98 Singapore 2.53\n",
242 | "35 Sweden 2.60\n",
243 | "8 Czech Republic 2.63\n",
244 | "72 Hong Kong 2.73\n",
245 | "73 Macao 3.13\n",
246 | "39 Iceland 3.15"
247 | ]
248 | },
249 | "execution_count": 9,
250 | "metadata": {},
251 | "output_type": "execute_result"
252 | }
253 | ],
254 | "source": [
255 | "im.infant_moratality = im.infant_moratality.astype(float)\n",
256 | "im = im.sort_values(by='infant_moratality')\n",
257 | "im.head(10)"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": 10,
263 | "metadata": {
264 | "collapsed": true
265 | },
266 | "outputs": [],
267 | "source": [
268 | "#2. 10 cities with the largest population\n",
269 | "populations = []\n",
270 | "for country in document.iterfind('country'):\n",
271 | " if country.find('population') is not None:\n",
272 | " populations.append([country.find('name').text,country.find('population').text])\n",
273 | " "
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 11,
279 | "metadata": {
280 | "collapsed": false
281 | },
282 | "outputs": [],
283 | "source": [
284 | "pop = pd.DataFrame(populations)\n",
285 | "pop.columns = [\"country\",\"population\"]"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": 12,
291 | "metadata": {
292 | "collapsed": false
293 | },
294 | "outputs": [
295 | {
296 | "data": {
297 | "text/html": [
298 | "\n",
299 | "
\n",
300 | " \n",
301 | " \n",
302 | " | \n",
303 | " country | \n",
304 | " population | \n",
305 | "
\n",
306 | " \n",
307 | " \n",
308 | " \n",
309 | " | 166 | \n",
310 | " Pitcairn | \n",
311 | " 68.0 | \n",
312 | "
\n",
313 | " \n",
314 | " | 83 | \n",
315 | " Cocos Islands | \n",
316 | " 628.0 | \n",
317 | "
\n",
318 | " \n",
319 | " | 41 | \n",
320 | " Holy See | \n",
321 | " 840.0 | \n",
322 | "
\n",
323 | " \n",
324 | " | 121 | \n",
325 | " Cayman Islands | \n",
326 | " 933.0 | \n",
327 | "
\n",
328 | " \n",
329 | " | 138 | \n",
330 | " Sint Maarten | \n",
331 | " 1497.0 | \n",
332 | "
\n",
333 | " \n",
334 | " | 170 | \n",
335 | " Tokelau | \n",
336 | " 1570.0 | \n",
337 | "
\n",
338 | " \n",
339 | " | 39 | \n",
340 | " Gibraltar | \n",
341 | " 1816.0 | \n",
342 | "
\n",
343 | " \n",
344 | " | 186 | \n",
345 | " Falkland Islands | \n",
346 | " 2043.0 | \n",
347 | "
\n",
348 | " \n",
349 | " | 159 | \n",
350 | " Nauru | \n",
351 | " 2066.0 | \n",
352 | "
\n",
353 | " \n",
354 | " | 52 | \n",
355 | " Svalbard | \n",
356 | " 2116.0 | \n",
357 | "
\n",
358 | " \n",
359 | "
\n",
360 | "
"
361 | ],
362 | "text/plain": [
363 | " country population\n",
364 | "166 Pitcairn 68.0\n",
365 | "83 Cocos Islands 628.0\n",
366 | "41 Holy See 840.0\n",
367 | "121 Cayman Islands 933.0\n",
368 | "138 Sint Maarten 1497.0\n",
369 | "170 Tokelau 1570.0\n",
370 | "39 Gibraltar 1816.0\n",
371 | "186 Falkland Islands 2043.0\n",
372 | "159 Nauru 2066.0\n",
373 | "52 Svalbard 2116.0"
374 | ]
375 | },
376 | "execution_count": 12,
377 | "metadata": {},
378 | "output_type": "execute_result"
379 | }
380 | ],
381 | "source": [
382 | "pop.population = pop.population.astype(float)\n",
383 | "pop = pop.sort_values(by = \"population\")\n",
384 | "pop.head(10)"
385 | ]
386 | },
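{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch (assumptions noted in the comments, not run): question 2 asks for the ten largest *cities*, whereas the cell above collects one country-level `population` entry per country and sorts it in ascending order. One way to answer the question as posed is to walk every `city` element and keep its most recent `population` figure."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Hypothetical city-level version of question 2; assumes population nodes carry a 'year' attribute.\n",
"city_rows = []\n",
"for country in document.iterfind('country'):\n",
"    country_name = country.find('name').text\n",
"    for city in country.iter('city'):                # iter() also reaches cities nested inside provinces\n",
"        pops = city.findall('population')\n",
"        if not pops:\n",
"            continue\n",
"        latest = max(pops, key=lambda p: int(p.attrib.get('year', 0)))\n",
"        city_rows.append({'city': city.find('name').text,\n",
"                          'country': country_name,\n",
"                          'population': int(latest.text)})\n",
"cities = pd.DataFrame(city_rows)\n",
"cities.sort_values(by='population', ascending=False).head(10)"
]
},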
387 | {
388 | "cell_type": "code",
389 | "execution_count": 13,
390 | "metadata": {
391 | "collapsed": false
392 | },
393 | "outputs": [
394 | {
395 | "data": {
396 | "text/html": [
397 | "\n",
398 | "
\n",
399 | " \n",
400 | " \n",
401 | " | \n",
402 | " countries | \n",
403 | " country_percentage | \n",
404 | " country_pop | \n",
405 | " ethnicity | \n",
406 | " population | \n",
407 | " year | \n",
408 | "
\n",
409 | " \n",
410 | " \n",
411 | " \n",
412 | " | 0 | \n",
413 | " Albania | \n",
414 | " 95.0 | \n",
415 | " 2800138 | \n",
416 | " Albanian | \n",
417 | " 2660131 | \n",
418 | " 2011 | \n",
419 | "
\n",
420 | " \n",
421 | " | 1 | \n",
422 | " Albania | \n",
423 | " 3.0 | \n",
424 | " 2800138 | \n",
425 | " Greek | \n",
426 | " 84004 | \n",
427 | " 2011 | \n",
428 | "
\n",
429 | " \n",
430 | " | 2 | \n",
431 | " Greece | \n",
432 | " 93.0 | \n",
433 | " 10816286 | \n",
434 | " Greek | \n",
435 | " 10059145 | \n",
436 | " 2011 | \n",
437 | "
\n",
438 | " \n",
439 | " | 3 | \n",
440 | " Macedonia | \n",
441 | " 64.2 | \n",
442 | " 2059794 | \n",
443 | " Macedonian | \n",
444 | " 1322387 | \n",
445 | " 2011 | \n",
446 | "
\n",
447 | " \n",
448 | " | 4 | \n",
449 | " Macedonia | \n",
450 | " 25.2 | \n",
451 | " 2059794 | \n",
452 | " Albanian | \n",
453 | " 519068 | \n",
454 | " 2011 | \n",
455 | "
\n",
456 | " \n",
457 | " | 5 | \n",
458 | " Macedonia | \n",
459 | " 3.9 | \n",
460 | " 2059794 | \n",
461 | " Turkish | \n",
462 | " 80331 | \n",
463 | " 2011 | \n",
464 | "
\n",
465 | " \n",
466 | " | 6 | \n",
467 | " Macedonia | \n",
468 | " 2.7 | \n",
469 | " 2059794 | \n",
470 | " Gypsy | \n",
471 | " 55614 | \n",
472 | " 2011 | \n",
473 | "
\n",
474 | " \n",
475 | " | 7 | \n",
476 | " Macedonia | \n",
477 | " 1.8 | \n",
478 | " 2059794 | \n",
479 | " Serb | \n",
480 | " 37076 | \n",
481 | " 2011 | \n",
482 | "
\n",
483 | " \n",
484 | " | 8 | \n",
485 | " Serbia | \n",
486 | " 82.9 | \n",
487 | " 7120666 | \n",
488 | " Serb | \n",
489 | " 5903032 | \n",
490 | " 2011 | \n",
491 | "
\n",
492 | " \n",
493 | " | 9 | \n",
494 | " Serbia | \n",
495 | " 0.9 | \n",
496 | " 7120666 | \n",
497 | " Montenegrin | \n",
498 | " 64085 | \n",
499 | " 2011 | \n",
500 | "
\n",
501 | " \n",
502 | "
\n",
503 | "
"
504 | ],
505 | "text/plain": [
506 | " countries country_percentage country_pop ethnicity population year\n",
507 | "0 Albania 95.0 2800138 Albanian 2660131 2011\n",
508 | "1 Albania 3.0 2800138 Greek 84004 2011\n",
509 | "2 Greece 93.0 10816286 Greek 10059145 2011\n",
510 | "3 Macedonia 64.2 2059794 Macedonian 1322387 2011\n",
511 | "4 Macedonia 25.2 2059794 Albanian 519068 2011\n",
512 | "5 Macedonia 3.9 2059794 Turkish 80331 2011\n",
513 | "6 Macedonia 2.7 2059794 Gypsy 55614 2011\n",
514 | "7 Macedonia 1.8 2059794 Serb 37076 2011\n",
515 | "8 Serbia 82.9 7120666 Serb 5903032 2011\n",
516 | "9 Serbia 0.9 7120666 Montenegrin 64085 2011"
517 | ]
518 | },
519 | "execution_count": 13,
520 | "metadata": {},
521 | "output_type": "execute_result"
522 | }
523 | ],
524 | "source": [
525 | "#3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)\n",
526 | "data = [] \n",
527 | "\n",
528 | "for country in document.findall('country'):\n",
529 | " for node in list(country):\n",
530 | " if node.tag == 'name':\n",
531 | " co = node.text\n",
532 | " elif node.tag == 'population':\n",
533 | " # the last listed population statistic is used\n",
534 | " pop = int(node.text)\n",
535 | " #meas = node.attrib['measured'] --leads to an error, potentially unpopulated at times\n",
536 | " yr = int(node.attrib['year'])\n",
537 | " elif node.tag == 'ethnicgroup':\n",
538 | " eth = node.text\n",
539 | " perc = float(node.attrib['percentage'])\n",
540 | " epop = int(pop * perc / 100.)\n",
541 | " \n",
542 | " data.append({'countries':co, 'country_pop':pop, 'year':yr,\n",
543 | " 'ethnicity':eth, 'country_percentage':perc, 'population':epop})\n",
544 | " \n",
545 | "df = pd.DataFrame(data)\n",
546 | "df.head(10)"
547 | ]
548 | },
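{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch (not run): the loop above keeps whichever `population` node happens to appear last in the file for each country. That is normally the most recent figure, but the choice can be made explicit by picking the node with the largest `year` attribute."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Hypothetical explicit 'latest year' lookup instead of relying on document order.\n",
"latest_pop = {}\n",
"for country in document.findall('country'):\n",
"    pops = country.findall('population')\n",
"    if pops:\n",
"        newest = max(pops, key=lambda p: int(p.attrib.get('year', 0)))\n",
"        latest_pop[country.find('name').text] = int(newest.text)\n",
"len(latest_pop)"
]
},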
549 | {
550 | "cell_type": "code",
551 | "execution_count": 15,
552 | "metadata": {
553 | "collapsed": false
554 | },
555 | "outputs": [
556 | {
557 | "data": {
558 | "text/html": [
559 | "\n",
560 | "
\n",
561 | " \n",
562 | " \n",
563 | " | \n",
564 | " ethnicity | \n",
565 | " population | \n",
566 | "
\n",
567 | " \n",
568 | " \n",
569 | " \n",
570 | " | 0 | \n",
571 | " Han Chinese | \n",
572 | " 1245058800 | \n",
573 | "
\n",
574 | " \n",
575 | " | 1 | \n",
576 | " Indo-Aryan | \n",
577 | " 871815583 | \n",
578 | "
\n",
579 | " \n",
580 | " | 2 | \n",
581 | " European | \n",
582 | " 494872201 | \n",
583 | "
\n",
584 | " \n",
585 | " | 3 | \n",
586 | " African | \n",
587 | " 318325104 | \n",
588 | "
\n",
589 | " \n",
590 | " | 4 | \n",
591 | " Dravidian | \n",
592 | " 302713744 | \n",
593 | "
\n",
594 | " \n",
595 | " | 5 | \n",
596 | " Mestizo | \n",
597 | " 157734349 | \n",
598 | "
\n",
599 | " \n",
600 | " | 6 | \n",
601 | " Bengali | \n",
602 | " 146776916 | \n",
603 | "
\n",
604 | " \n",
605 | " | 7 | \n",
606 | " Russian | \n",
607 | " 131856989 | \n",
608 | "
\n",
609 | " \n",
610 | " | 8 | \n",
611 | " Japanese | \n",
612 | " 126534212 | \n",
613 | "
\n",
614 | " \n",
615 | " | 9 | \n",
616 | " Malay | \n",
617 | " 121993548 | \n",
618 | "
\n",
619 | " \n",
620 | "
\n",
621 | "
"
622 | ],
623 | "text/plain": [
624 | " ethnicity population\n",
625 | "0 Han Chinese 1245058800\n",
626 | "1 Indo-Aryan 871815583\n",
627 | "2 European 494872201\n",
628 | "3 African 318325104\n",
629 | "4 Dravidian 302713744\n",
630 | "5 Mestizo 157734349\n",
631 | "6 Bengali 146776916\n",
632 | "7 Russian 131856989\n",
633 | "8 Japanese 126534212\n",
634 | "9 Malay 121993548"
635 | ]
636 | },
637 | "execution_count": 15,
638 | "metadata": {},
639 | "output_type": "execute_result"
640 | }
641 | ],
642 | "source": [
643 | "df.groupby('ethnicity').population.sum().sort_values(ascending=False).head(10).reset_index()"
644 | ]
645 | },
646 | {
647 | "cell_type": "code",
648 | "execution_count": 14,
649 | "metadata": {
650 | "collapsed": true
651 | },
652 | "outputs": [],
653 | "source": [
654 | "#4. name and country of a) longest river "
655 | ]
656 | },
657 | {
658 | "cell_type": "code",
659 | "execution_count": 17,
660 | "metadata": {
661 | "collapsed": false
662 | },
663 | "outputs": [
664 | {
665 | "data": {
666 | "text/html": [
667 | "\n",
668 | "
\n",
669 | " \n",
670 | " \n",
671 | " | \n",
672 | " country | \n",
673 | " length | \n",
674 | " name | \n",
675 | "
\n",
676 | " \n",
677 | " \n",
678 | " \n",
679 | " | 161 | \n",
680 | " CO | \n",
681 | " 6448 | \n",
682 | " Amazonas | \n",
683 | "
\n",
684 | " \n",
685 | "
\n",
686 | "
"
687 | ],
688 | "text/plain": [
689 | " country length name\n",
690 | "161 CO 6448 Amazonas"
691 | ]
692 | },
693 | "execution_count": 17,
694 | "metadata": {},
695 | "output_type": "execute_result"
696 | }
697 | ],
698 | "source": [
699 | "rivers_list=[]\n",
700 | "rivers_df = pd.DataFrame()\n",
701 | "for rivers in document.iterfind('river'):\n",
702 | " try:\n",
703 | " rivers_list.append({'name':rivers.find('name').text, 'length':int(rivers.find('length').text), 'country':rivers.find('located').attrib['country']})\n",
704 | " except:\n",
705 | " next\n",
706 | "rivers_df = pd.DataFrame(rivers_list)\n",
707 | "rivers_df.sort_values(by = 'length', ascending=False).head(1)"
708 | ]
709 | },
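{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch (not run, and it assumes the river element carries a `country` attribute the way the airport elements below do): `rivers.find('located')` only looks at the first `located` child, so a river that crosses several countries is reduced to one of them, and rivers without that child are skipped by the bare `except`. Reporting the element's own country codes keeps all of them."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Hypothetical variant that keeps every country code attached to the river element.\n",
"river_rows = []\n",
"for river in document.iterfind('river'):\n",
"    length = river.find('length')\n",
"    if length is None:\n",
"        continue                                     # some rivers have no recorded length\n",
"    river_rows.append({'name': river.find('name').text,\n",
"                       'length': float(length.text),\n",
"                       'countries': river.attrib.get('country', '')})\n",
"pd.DataFrame(river_rows).sort_values(by='length', ascending=False).head(1)"
]
},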
710 | {
711 | "cell_type": "code",
712 | "execution_count": null,
713 | "metadata": {
714 | "collapsed": true
715 | },
716 | "outputs": [],
717 | "source": [
718 | "#b) largest lake"
719 | ]
720 | },
721 | {
722 | "cell_type": "code",
723 | "execution_count": 19,
724 | "metadata": {
725 | "collapsed": false
726 | },
727 | "outputs": [
728 | {
729 | "data": {
730 | "text/html": [
731 | "\n",
732 | "
\n",
733 | " \n",
734 | " \n",
735 | " | \n",
736 | " area | \n",
737 | " country | \n",
738 | " name | \n",
739 | "
\n",
740 | " \n",
741 | " \n",
742 | " \n",
743 | " | 42 | \n",
744 | " 386400 | \n",
745 | " R | \n",
746 | " Caspian Sea | \n",
747 | "
\n",
748 | " \n",
749 | "
\n",
750 | "
"
751 | ],
752 | "text/plain": [
753 | " area country name\n",
754 | "42 386400 R Caspian Sea"
755 | ]
756 | },
757 | "execution_count": 19,
758 | "metadata": {},
759 | "output_type": "execute_result"
760 | }
761 | ],
762 | "source": [
763 | "lake_list=[]\n",
764 | "lake_df = pd.DataFrame()\n",
765 | "for lakes in document.iterfind('lake'):\n",
766 | " try:\n",
767 | " lake_list.append({'name':lakes.find('name').text, 'area':int(lakes.find('area').text), 'country':lakes.find('located').attrib['country']})\n",
768 | " except:\n",
769 | " next\n",
770 | "lakes_df = pd.DataFrame(lake_list)\n",
771 | "lakes_df.sort_values(by = 'area', ascending=False).head(1)"
772 | ]
773 | },
774 | {
775 | "cell_type": "code",
776 | "execution_count": null,
777 | "metadata": {
778 | "collapsed": true
779 | },
780 | "outputs": [],
781 | "source": [
782 | "#c) airport at highest elevation"
783 | ]
784 | },
785 | {
786 | "cell_type": "code",
787 | "execution_count": 20,
788 | "metadata": {
789 | "collapsed": false
790 | },
791 | "outputs": [
799 | {
800 | "data": {
801 | "text/html": [
802 | "\n",
803 | "
\n",
804 | " \n",
805 | " \n",
806 | " | \n",
807 | " country | \n",
808 | " elevation | \n",
809 | " name | \n",
810 | "
\n",
811 | " \n",
812 | " \n",
813 | " \n",
814 | " | 80 | \n",
815 | " BOL | \n",
816 | " 4063 | \n",
817 | " El Alto Intl | \n",
818 | "
\n",
819 | " \n",
820 | "
\n",
821 | "
"
822 | ],
823 | "text/plain": [
824 | " country elevation name\n",
825 | "80 BOL 4063 El Alto Intl"
826 | ]
827 | },
828 | "execution_count": 20,
829 | "metadata": {},
830 | "output_type": "execute_result"
831 | }
832 | ],
833 | "source": [
834 | "ap_list=[]\n",
835 | "ap_df = pd.DataFrame()\n",
836 | "for ap in document.iterfind('airport'):\n",
837 | " try:\n",
838 | " ap_list.append({'name':ap.find('name').text, 'elevation':int(ap.find('elevation').text), 'country':ap.attrib['country']})\n",
839 | " except:\n",
840 | " next\n",
841 | "ap_df = pd.DataFrame(ap_list)\n",
842 | "ap_df.sort('elevation', ascending=False).head(1)\n"
843 | ]
844 | },
845 | {
846 | "cell_type": "code",
847 | "execution_count": null,
848 | "metadata": {
849 | "collapsed": true
850 | },
851 | "outputs": [],
852 | "source": []
853 | }
854 | ],
855 | "metadata": {
856 | "kernelspec": {
857 | "display_name": "Python [Root]",
858 | "language": "python",
859 | "name": "Python [Root]"
860 | },
861 | "language_info": {
862 | "codemirror_mode": {
863 | "name": "ipython",
864 | "version": 2
865 | },
866 | "file_extension": ".py",
867 | "mimetype": "text/x-python",
868 | "name": "python",
869 | "nbconvert_exporter": "python",
870 | "pygments_lexer": "ipython2",
871 | "version": "2.7.12"
872 | }
873 | },
874 | "nbformat": 4,
875 | "nbformat_minor": 0
876 | }
877 |
--------------------------------------------------------------------------------
/Data Wrangling/data_wrangling_xml/data_wrangling_xml/.ipynb_checkpoints/sliderule_dsi_xml_exercise-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# XML example and exercise\n",
8 | "****\n",
9 | "+ study examples of accessing nodes in XML tree structure \n",
10 | "+ work on exercise to be completed and submitted\n",
11 | "****\n",
12 | "+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html\n",
13 | "+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial\n",
14 | "****"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 2,
20 | "metadata": {
21 | "collapsed": true
22 | },
23 | "outputs": [],
24 | "source": [
25 | "from xml.etree import ElementTree as ET"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {},
31 | "source": [
32 | "## XML example\n",
33 | "\n",
34 | "+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 3,
40 | "metadata": {
41 | "collapsed": true
42 | },
43 | "outputs": [],
44 | "source": [
45 | "document_tree = ET.parse( './data/mondial_database_less.xml' )"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 4,
51 | "metadata": {
52 | "collapsed": false
53 | },
54 | "outputs": [
55 | {
56 | "name": "stdout",
57 | "output_type": "stream",
58 | "text": [
59 | "Albania\n",
60 | "Greece\n",
61 | "Macedonia\n",
62 | "Serbia\n",
63 | "Montenegro\n",
64 | "Kosovo\n",
65 | "Andorra\n"
66 | ]
67 | }
68 | ],
69 | "source": [
70 | "# print names of all countries\n",
71 | "for child in document_tree.getroot():\n",
72 | " print child.find('name').text"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 5,
78 | "metadata": {
79 | "collapsed": false
80 | },
81 | "outputs": [
82 | {
83 | "name": "stdout",
84 | "output_type": "stream",
85 | "text": [
86 | "* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë\n",
87 | "* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes\n",
88 | "* Macedonia: Skopje, Kumanovo\n",
89 | "* Serbia: Beograd, Novi Sad, Niš\n",
90 | "* Montenegro: Podgorica\n",
91 | "* Kosovo: Prishtine\n",
92 | "* Andorra: Andorra la Vella\n"
93 | ]
94 | }
95 | ],
96 | "source": [
97 | "# print names of all countries and their cities\n",
98 | "for element in document_tree.iterfind('country'):\n",
99 | " print '* ' + element.find('name').text + ':',\n",
100 | " capitals_string = ''\n",
101 | " for subelement in element.getiterator('city'):\n",
102 | " capitals_string += subelement.find('name').text + ', '\n",
103 | " print capitals_string[:-2]"
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {},
109 | "source": [
110 | "****\n",
111 | "## XML exercise\n",
112 | "\n",
113 | "Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find\n",
114 | "\n",
115 | "1. 10 countries with the lowest infant mortality rates\n",
116 | "2. 10 cities with the largest population\n",
117 | "3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)\n",
118 | "4. name and country of a) longest river, b) largest lake and c) airport at highest elevation"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 6,
124 | "metadata": {
125 | "collapsed": false
126 | },
127 | "outputs": [],
128 | "source": [
129 | "document = ET.parse( './data/mondial_database.xml' )"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 7,
135 | "metadata": {
136 | "collapsed": false
137 | },
138 | "outputs": [],
139 | "source": [
140 | "#1. 10 countries with the lowest infant mortality rates\n",
141 | "country_im = []\n",
142 | "for country in document.iterfind('country'):\n",
143 | " if country.find('infant_mortality') is not None:\n",
144 | " country_im.append([country.find('name').text,country.find('infant_mortality').text])\n",
145 | " "
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 8,
151 | "metadata": {
152 | "collapsed": false
153 | },
154 | "outputs": [],
155 | "source": [
156 | "import pandas as pd\n",
157 | "im = pd.DataFrame(country_im)\n",
158 | "im.columns = [\"country\",\"infant_moratality\"]"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 9,
164 | "metadata": {
165 | "collapsed": false
166 | },
167 | "outputs": [
168 | {
169 | "data": {
170 | "text/html": [
171 | "\n",
172 | "
\n",
173 | " \n",
174 | " \n",
175 | " | \n",
176 | " country | \n",
177 | " infant_moratality | \n",
178 | "
\n",
179 | " \n",
180 | " \n",
181 | " \n",
182 | " | 36 | \n",
183 | " Monaco | \n",
184 | " 1.81 | \n",
185 | "
\n",
186 | " \n",
187 | " | 90 | \n",
188 | " Japan | \n",
189 | " 2.13 | \n",
190 | "
\n",
191 | " \n",
192 | " | 109 | \n",
193 | " Bermuda | \n",
194 | " 2.48 | \n",
195 | "
\n",
196 | " \n",
197 | " | 34 | \n",
198 | " Norway | \n",
199 | " 2.48 | \n",
200 | "
\n",
201 | " \n",
202 | " | 98 | \n",
203 | " Singapore | \n",
204 | " 2.53 | \n",
205 | "
\n",
206 | " \n",
207 | " | 35 | \n",
208 | " Sweden | \n",
209 | " 2.60 | \n",
210 | "
\n",
211 | " \n",
212 | " | 8 | \n",
213 | " Czech Republic | \n",
214 | " 2.63 | \n",
215 | "
\n",
216 | " \n",
217 | " | 72 | \n",
218 | " Hong Kong | \n",
219 | " 2.73 | \n",
220 | "
\n",
221 | " \n",
222 | " | 73 | \n",
223 | " Macao | \n",
224 | " 3.13 | \n",
225 | "
\n",
226 | " \n",
227 | " | 39 | \n",
228 | " Iceland | \n",
229 | " 3.15 | \n",
230 | "
\n",
231 | " \n",
232 | "
\n",
233 | "
"
234 | ],
235 | "text/plain": [
236 | " country infant_moratality\n",
237 | "36 Monaco 1.81\n",
238 | "90 Japan 2.13\n",
239 | "109 Bermuda 2.48\n",
240 | "34 Norway 2.48\n",
241 | "98 Singapore 2.53\n",
242 | "35 Sweden 2.60\n",
243 | "8 Czech Republic 2.63\n",
244 | "72 Hong Kong 2.73\n",
245 | "73 Macao 3.13\n",
246 | "39 Iceland 3.15"
247 | ]
248 | },
249 | "execution_count": 9,
250 | "metadata": {},
251 | "output_type": "execute_result"
252 | }
253 | ],
254 | "source": [
255 | "im.infant_moratality = im.infant_moratality.astype(float)\n",
256 | "im = im.sort_values(by='infant_moratality')\n",
257 | "im.head(10)"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": 10,
263 | "metadata": {
264 | "collapsed": true
265 | },
266 | "outputs": [],
267 | "source": [
268 | "#2. 10 cities with the largest population\n",
269 | "populations = []\n",
270 | "for country in document.iterfind('country'):\n",
271 | " if country.find('population') is not None:\n",
272 | " populations.append([country.find('name').text,country.find('population').text])\n",
273 | " "
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 11,
279 | "metadata": {
280 | "collapsed": false
281 | },
282 | "outputs": [],
283 | "source": [
284 | "pop = pd.DataFrame(populations)\n",
285 | "pop.columns = [\"country\",\"population\"]"
286 | ]
287 | },
288 | {
289 | "cell_type": "code",
290 | "execution_count": 12,
291 | "metadata": {
292 | "collapsed": false
293 | },
294 | "outputs": [
295 | {
296 | "data": {
297 | "text/html": [
298 | "\n",
299 | "
\n",
300 | " \n",
301 | " \n",
302 | " | \n",
303 | " country | \n",
304 | " population | \n",
305 | "
\n",
306 | " \n",
307 | " \n",
308 | " \n",
309 | " | 166 | \n",
310 | " Pitcairn | \n",
311 | " 68.0 | \n",
312 | "
\n",
313 | " \n",
314 | " | 83 | \n",
315 | " Cocos Islands | \n",
316 | " 628.0 | \n",
317 | "
\n",
318 | " \n",
319 | " | 41 | \n",
320 | " Holy See | \n",
321 | " 840.0 | \n",
322 | "
\n",
323 | " \n",
324 | " | 121 | \n",
325 | " Cayman Islands | \n",
326 | " 933.0 | \n",
327 | "
\n",
328 | " \n",
329 | " | 138 | \n",
330 | " Sint Maarten | \n",
331 | " 1497.0 | \n",
332 | "
\n",
333 | " \n",
334 | " | 170 | \n",
335 | " Tokelau | \n",
336 | " 1570.0 | \n",
337 | "
\n",
338 | " \n",
339 | " | 39 | \n",
340 | " Gibraltar | \n",
341 | " 1816.0 | \n",
342 | "
\n",
343 | " \n",
344 | " | 186 | \n",
345 | " Falkland Islands | \n",
346 | " 2043.0 | \n",
347 | "
\n",
348 | " \n",
349 | " | 159 | \n",
350 | " Nauru | \n",
351 | " 2066.0 | \n",
352 | "
\n",
353 | " \n",
354 | " | 52 | \n",
355 | " Svalbard | \n",
356 | " 2116.0 | \n",
357 | "
\n",
358 | " \n",
359 | "
\n",
360 | "
"
361 | ],
362 | "text/plain": [
363 | " country population\n",
364 | "166 Pitcairn 68.0\n",
365 | "83 Cocos Islands 628.0\n",
366 | "41 Holy See 840.0\n",
367 | "121 Cayman Islands 933.0\n",
368 | "138 Sint Maarten 1497.0\n",
369 | "170 Tokelau 1570.0\n",
370 | "39 Gibraltar 1816.0\n",
371 | "186 Falkland Islands 2043.0\n",
372 | "159 Nauru 2066.0\n",
373 | "52 Svalbard 2116.0"
374 | ]
375 | },
376 | "execution_count": 12,
377 | "metadata": {},
378 | "output_type": "execute_result"
379 | }
380 | ],
381 | "source": [
382 | "pop.population = pop.population.astype(float)\n",
383 | "pop = pop.sort_values(by = \"population\")\n",
384 | "pop.head(10)"
385 | ]
386 | },
387 | {
388 | "cell_type": "code",
389 | "execution_count": 13,
390 | "metadata": {
391 | "collapsed": false
392 | },
393 | "outputs": [
394 | {
395 | "data": {
396 | "text/html": [
397 | "\n",
398 | "
\n",
399 | " \n",
400 | " \n",
401 | " | \n",
402 | " countries | \n",
403 | " country_percentage | \n",
404 | " country_pop | \n",
405 | " ethnicity | \n",
406 | " population | \n",
407 | " year | \n",
408 | "
\n",
409 | " \n",
410 | " \n",
411 | " \n",
412 | " | 0 | \n",
413 | " Albania | \n",
414 | " 95.0 | \n",
415 | " 2800138 | \n",
416 | " Albanian | \n",
417 | " 2660131 | \n",
418 | " 2011 | \n",
419 | "
\n",
420 | " \n",
421 | " | 1 | \n",
422 | " Albania | \n",
423 | " 3.0 | \n",
424 | " 2800138 | \n",
425 | " Greek | \n",
426 | " 84004 | \n",
427 | " 2011 | \n",
428 | "
\n",
429 | " \n",
430 | " | 2 | \n",
431 | " Greece | \n",
432 | " 93.0 | \n",
433 | " 10816286 | \n",
434 | " Greek | \n",
435 | " 10059145 | \n",
436 | " 2011 | \n",
437 | "
\n",
438 | " \n",
439 | " | 3 | \n",
440 | " Macedonia | \n",
441 | " 64.2 | \n",
442 | " 2059794 | \n",
443 | " Macedonian | \n",
444 | " 1322387 | \n",
445 | " 2011 | \n",
446 | "
\n",
447 | " \n",
448 | " | 4 | \n",
449 | " Macedonia | \n",
450 | " 25.2 | \n",
451 | " 2059794 | \n",
452 | " Albanian | \n",
453 | " 519068 | \n",
454 | " 2011 | \n",
455 | "
\n",
456 | " \n",
457 | " | 5 | \n",
458 | " Macedonia | \n",
459 | " 3.9 | \n",
460 | " 2059794 | \n",
461 | " Turkish | \n",
462 | " 80331 | \n",
463 | " 2011 | \n",
464 | "
\n",
465 | " \n",
466 | " | 6 | \n",
467 | " Macedonia | \n",
468 | " 2.7 | \n",
469 | " 2059794 | \n",
470 | " Gypsy | \n",
471 | " 55614 | \n",
472 | " 2011 | \n",
473 | "
\n",
474 | " \n",
475 | " | 7 | \n",
476 | " Macedonia | \n",
477 | " 1.8 | \n",
478 | " 2059794 | \n",
479 | " Serb | \n",
480 | " 37076 | \n",
481 | " 2011 | \n",
482 | "
\n",
483 | " \n",
484 | " | 8 | \n",
485 | " Serbia | \n",
486 | " 82.9 | \n",
487 | " 7120666 | \n",
488 | " Serb | \n",
489 | " 5903032 | \n",
490 | " 2011 | \n",
491 | "
\n",
492 | " \n",
493 | " | 9 | \n",
494 | " Serbia | \n",
495 | " 0.9 | \n",
496 | " 7120666 | \n",
497 | " Montenegrin | \n",
498 | " 64085 | \n",
499 | " 2011 | \n",
500 | "
\n",
501 | " \n",
502 | "
\n",
503 | "
"
504 | ],
505 | "text/plain": [
506 | " countries country_percentage country_pop ethnicity population year\n",
507 | "0 Albania 95.0 2800138 Albanian 2660131 2011\n",
508 | "1 Albania 3.0 2800138 Greek 84004 2011\n",
509 | "2 Greece 93.0 10816286 Greek 10059145 2011\n",
510 | "3 Macedonia 64.2 2059794 Macedonian 1322387 2011\n",
511 | "4 Macedonia 25.2 2059794 Albanian 519068 2011\n",
512 | "5 Macedonia 3.9 2059794 Turkish 80331 2011\n",
513 | "6 Macedonia 2.7 2059794 Gypsy 55614 2011\n",
514 | "7 Macedonia 1.8 2059794 Serb 37076 2011\n",
515 | "8 Serbia 82.9 7120666 Serb 5903032 2011\n",
516 | "9 Serbia 0.9 7120666 Montenegrin 64085 2011"
517 | ]
518 | },
519 | "execution_count": 13,
520 | "metadata": {},
521 | "output_type": "execute_result"
522 | }
523 | ],
524 | "source": [
525 | "#3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)\n",
526 | "data = [] \n",
527 | "\n",
528 | "for country in document.findall('country'):\n",
529 | " for node in list(country):\n",
530 | " if node.tag == 'name':\n",
531 | " co = node.text\n",
532 | " elif node.tag == 'population':\n",
533 | " # the last listed population statistic is used\n",
534 | " pop = int(node.text)\n",
535 | " #meas = node.attrib['measured'] --leads to an error, potentially unpopulated at times\n",
536 | " yr = int(node.attrib['year'])\n",
537 | " elif node.tag == 'ethnicgroup':\n",
538 | " eth = node.text\n",
539 | " perc = float(node.attrib['percentage'])\n",
540 | " epop = int(pop * perc / 100.)\n",
541 | " \n",
542 | " data.append({'countries':co, 'country_pop':pop, 'year':yr,\n",
543 | " 'ethnicity':eth, 'country_percentage':perc, 'population':epop})\n",
544 | " \n",
545 | "df = pd.DataFrame(data)\n",
546 | "df.head(10)"
547 | ]
548 | },
549 | {
550 | "cell_type": "code",
551 | "execution_count": 15,
552 | "metadata": {
553 | "collapsed": false
554 | },
555 | "outputs": [
556 | {
557 | "data": {
558 | "text/html": [
559 | "\n",
560 | "
\n",
561 | " \n",
562 | " \n",
563 | " | \n",
564 | " ethnicity | \n",
565 | " population | \n",
566 | "
\n",
567 | " \n",
568 | " \n",
569 | " \n",
570 | " | 0 | \n",
571 | " Han Chinese | \n",
572 | " 1245058800 | \n",
573 | "
\n",
574 | " \n",
575 | " | 1 | \n",
576 | " Indo-Aryan | \n",
577 | " 871815583 | \n",
578 | "
\n",
579 | " \n",
580 | " | 2 | \n",
581 | " European | \n",
582 | " 494872201 | \n",
583 | "
\n",
584 | " \n",
585 | " | 3 | \n",
586 | " African | \n",
587 | " 318325104 | \n",
588 | "
\n",
589 | " \n",
590 | " | 4 | \n",
591 | " Dravidian | \n",
592 | " 302713744 | \n",
593 | "
\n",
594 | " \n",
595 | " | 5 | \n",
596 | " Mestizo | \n",
597 | " 157734349 | \n",
598 | "
\n",
599 | " \n",
600 | " | 6 | \n",
601 | " Bengali | \n",
602 | " 146776916 | \n",
603 | "
\n",
604 | " \n",
605 | " | 7 | \n",
606 | " Russian | \n",
607 | " 131856989 | \n",
608 | "
\n",
609 | " \n",
610 | " | 8 | \n",
611 | " Japanese | \n",
612 | " 126534212 | \n",
613 | "
\n",
614 | " \n",
615 | " | 9 | \n",
616 | " Malay | \n",
617 | " 121993548 | \n",
618 | "
\n",
619 | " \n",
620 | "
\n",
621 | "
"
622 | ],
623 | "text/plain": [
624 | " ethnicity population\n",
625 | "0 Han Chinese 1245058800\n",
626 | "1 Indo-Aryan 871815583\n",
627 | "2 European 494872201\n",
628 | "3 African 318325104\n",
629 | "4 Dravidian 302713744\n",
630 | "5 Mestizo 157734349\n",
631 | "6 Bengali 146776916\n",
632 | "7 Russian 131856989\n",
633 | "8 Japanese 126534212\n",
634 | "9 Malay 121993548"
635 | ]
636 | },
637 | "execution_count": 15,
638 | "metadata": {},
639 | "output_type": "execute_result"
640 | }
641 | ],
642 | "source": [
643 | "df.groupby('ethnicity').population.sum().sort_values(ascending=False).head(10).reset_index()"
644 | ]
645 | },
646 | {
647 | "cell_type": "code",
648 | "execution_count": 14,
649 | "metadata": {
650 | "collapsed": true
651 | },
652 | "outputs": [],
653 | "source": [
654 | "#4. name and country of a) longest river "
655 | ]
656 | },
657 | {
658 | "cell_type": "code",
659 | "execution_count": 17,
660 | "metadata": {
661 | "collapsed": false
662 | },
663 | "outputs": [
664 | {
665 | "data": {
666 | "text/html": [
667 | "\n",
668 | "
\n",
669 | " \n",
670 | " \n",
671 | " | \n",
672 | " country | \n",
673 | " length | \n",
674 | " name | \n",
675 | "
\n",
676 | " \n",
677 | " \n",
678 | " \n",
679 | " | 161 | \n",
680 | " CO | \n",
681 | " 6448 | \n",
682 | " Amazonas | \n",
683 | "
\n",
684 | " \n",
685 | "
\n",
686 | "
"
687 | ],
688 | "text/plain": [
689 | " country length name\n",
690 | "161 CO 6448 Amazonas"
691 | ]
692 | },
693 | "execution_count": 17,
694 | "metadata": {},
695 | "output_type": "execute_result"
696 | }
697 | ],
698 | "source": [
699 | "rivers_list=[]\n",
700 | "rivers_df = pd.DataFrame()\n",
701 | "for rivers in document.iterfind('river'):\n",
702 | " try:\n",
703 | " rivers_list.append({'name':rivers.find('name').text, 'length':int(rivers.find('length').text), 'country':rivers.find('located').attrib['country']})\n",
704 | " except:\n",
705 | " next\n",
706 | "rivers_df = pd.DataFrame(rivers_list)\n",
707 | "rivers_df.sort_values(by = 'length', ascending=False).head(1)"
708 | ]
709 | },
710 | {
711 | "cell_type": "code",
712 | "execution_count": null,
713 | "metadata": {
714 | "collapsed": true
715 | },
716 | "outputs": [],
717 | "source": [
718 | "#b) largest lake"
719 | ]
720 | },
721 | {
722 | "cell_type": "code",
723 | "execution_count": 19,
724 | "metadata": {
725 | "collapsed": false
726 | },
727 | "outputs": [
728 | {
729 | "data": {
730 | "text/html": [
731 | "\n",
732 | "
\n",
733 | " \n",
734 | " \n",
735 | " | \n",
736 | " area | \n",
737 | " country | \n",
738 | " name | \n",
739 | "
\n",
740 | " \n",
741 | " \n",
742 | " \n",
743 | " | 42 | \n",
744 | " 386400 | \n",
745 | " R | \n",
746 | " Caspian Sea | \n",
747 | "
\n",
748 | " \n",
749 | "
\n",
750 | "
"
751 | ],
752 | "text/plain": [
753 | " area country name\n",
754 | "42 386400 R Caspian Sea"
755 | ]
756 | },
757 | "execution_count": 19,
758 | "metadata": {},
759 | "output_type": "execute_result"
760 | }
761 | ],
762 | "source": [
763 | "lake_list=[]\n",
764 | "lake_df = pd.DataFrame()\n",
765 | "for lakes in document.iterfind('lake'):\n",
766 | " try:\n",
767 | " lake_list.append({'name':lakes.find('name').text, 'area':int(lakes.find('area').text), 'country':lakes.find('located').attrib['country']})\n",
768 | " except:\n",
769 | " next\n",
770 | "lakes_df = pd.DataFrame(lake_list)\n",
771 | "lakes_df.sort_values(by = 'area', ascending=False).head(1)"
772 | ]
773 | },
774 | {
775 | "cell_type": "code",
776 | "execution_count": null,
777 | "metadata": {
778 | "collapsed": true
779 | },
780 | "outputs": [],
781 | "source": [
782 | "#c) airport at highest elevation"
783 | ]
784 | },
785 | {
786 | "cell_type": "code",
787 | "execution_count": 20,
788 | "metadata": {
789 | "collapsed": false
790 | },
791 | "outputs": [
799 | {
800 | "data": {
801 | "text/html": [
802 | "\n",
803 | "
\n",
804 | " \n",
805 | " \n",
806 | " | \n",
807 | " country | \n",
808 | " elevation | \n",
809 | " name | \n",
810 | "
\n",
811 | " \n",
812 | " \n",
813 | " \n",
814 | " | 80 | \n",
815 | " BOL | \n",
816 | " 4063 | \n",
817 | " El Alto Intl | \n",
818 | "
\n",
819 | " \n",
820 | "
\n",
821 | "
"
822 | ],
823 | "text/plain": [
824 | " country elevation name\n",
825 | "80 BOL 4063 El Alto Intl"
826 | ]
827 | },
828 | "execution_count": 20,
829 | "metadata": {},
830 | "output_type": "execute_result"
831 | }
832 | ],
833 | "source": [
834 | "ap_list=[]\n",
835 | "ap_df = pd.DataFrame()\n",
836 | "for ap in document.iterfind('airport'):\n",
837 | " try:\n",
838 | " ap_list.append({'name':ap.find('name').text, 'elevation':int(ap.find('elevation').text), 'country':ap.attrib['country']})\n",
839 | " except:\n",
840 | " next\n",
841 | "ap_df = pd.DataFrame(ap_list)\n",
842 | "ap_df.sort('elevation', ascending=False).head(1)\n"
843 | ]
844 | },
845 | {
846 | "cell_type": "code",
847 | "execution_count": null,
848 | "metadata": {
849 | "collapsed": true
850 | },
851 | "outputs": [],
852 | "source": []
853 | }
854 | ],
855 | "metadata": {
856 | "kernelspec": {
857 | "display_name": "Python [Root]",
858 | "language": "python",
859 | "name": "Python [Root]"
860 | },
861 | "language_info": {
862 | "codemirror_mode": {
863 | "name": "ipython",
864 | "version": 2
865 | },
866 | "file_extension": ".py",
867 | "mimetype": "text/x-python",
868 | "name": "python",
869 | "nbconvert_exporter": "python",
870 | "pygments_lexer": "ipython2",
871 | "version": "2.7.12"
872 | }
873 | },
874 | "nbformat": 4,
875 | "nbformat_minor": 0
876 | }
877 |
--------------------------------------------------------------------------------
/Capstone Project/.ipynb_checkpoints/Classification_Adding_Shots-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import pandas as pd\n",
12 | "import numpy as np\n",
13 | "import scipy.stats as scipy"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 3,
19 | "metadata": {
20 | "collapsed": true
21 | },
22 | "outputs": [],
23 | "source": [
24 | "df = pd.read_csv(\"./Data/E0_13.csv\")\n",
25 | "df_14 = pd.read_csv(\"./Data/E0_14.csv\")"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 4,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [
35 | {
36 | "data": {
37 | "text/plain": [
38 | "Index([u'Div', u'Date', u'HomeTeam', u'AwayTeam', u'FTHG', u'FTAG', u'FTR',\n",
39 | " u'HTHG', u'HTAG', u'HTR', u'Referee', u'HS', u'AS', u'HST', u'AST',\n",
40 | " u'HF', u'AF', u'HC', u'AC', u'HY', u'AY', u'HR', u'AR', u'B365H',\n",
41 | " u'B365D', u'B365A', u'BWH', u'BWD', u'BWA', u'IWH', u'IWD', u'IWA',\n",
42 | " u'LBH', u'LBD', u'LBA', u'PSH', u'PSD', u'PSA', u'WHH', u'WHD', u'WHA',\n",
43 | " u'SJH', u'SJD', u'SJA', u'VCH', u'VCD', u'VCA', u'Bb1X2', u'BbMxH',\n",
44 | " u'BbAvH', u'BbMxD', u'BbAvD', u'BbMxA', u'BbAvA', u'BbOU', u'BbMx>2.5',\n",
45 | " u'BbAv>2.5', u'BbMx<2.5', u'BbAv<2.5', u'BbAH', u'BbAHh', u'BbMxAHH',\n",
46 | " u'BbAvAHH', u'BbMxAHA', u'BbAvAHA', u'PSCH', u'PSCD', u'PSCA'],\n",
47 | " dtype='object')"
48 | ]
49 | },
50 | "execution_count": 4,
51 | "metadata": {},
52 | "output_type": "execute_result"
53 | }
54 | ],
55 | "source": [
56 | "df.columns"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": 5,
62 | "metadata": {
63 | "collapsed": false
64 | },
65 | "outputs": [],
66 | "source": [
67 | "res_13 = df.ix[:,:23]\n",
68 | "res_13 = res_13.drop(['Div','Date','Referee'],axis=1)\n",
69 | "res_14 = df_14.ix[:,:23]\n",
70 | "res_14 = res_14.drop(['Div','Date','Referee'],axis=1)\n",
71 | "table_features = df.ix[:,:7]\n",
72 | "table_features = table_features.drop(['FTHG','FTAG','Div','Date'],axis=1)\n",
73 | "bet_13 = df.ix[:,23:]\n"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 6,
79 | "metadata": {
80 | "collapsed": false,
81 | "scrolled": true
82 | },
83 | "outputs": [],
84 | "source": [
85 | "res_13.head()\n",
86 | "feature_table = df.ix[:,:23]"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 7,
92 | "metadata": {
93 | "collapsed": false
94 | },
95 | "outputs": [],
96 | "source": [
97 | "#Team, Home Goals Score, Away Goals Score, Attack Strength, Home Goals Conceded, Away Goals Conceded, Defensive Strength\n",
98 | "table_13 = pd.DataFrame(columns=('Team','HGS','AGS','HAS','AAS','HGC','AGC','HDS','ADS'))"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 8,
104 | "metadata": {
105 | "collapsed": false
106 | },
107 | "outputs": [
108 | {
109 | "name": "stdout",
110 | "output_type": "stream",
111 | "text": [
112 | "Average number of goals at home 1.57368421053\n",
113 | "Average number of goals away 1.19473684211\n",
114 | "Average number of goals conceded at home 1.57368421053\n",
115 | "Average number of goals conceded away 1.19473684211\n"
116 | ]
117 | }
118 | ],
119 | "source": [
120 | "avg_home_scored_13 = res_13.FTHG.sum() / 380.0\n",
121 | "avg_away_scored_13 = res_13.FTAG.sum() / 380.0\n",
122 | "avg_home_conceded_13 = avg_away_scored_13\n",
123 | "avg_away_conceded_13 = avg_home_scored_13\n",
124 | "print \"Average number of goals at home\",avg_home_scored_13\n",
125 | "print \"Average number of goals away\", avg_away_scored_13\n",
126 | "print \"Average number of goals conceded at home\",avg_away_conceded_13\n",
127 | "print \"Average number of goals conceded away\",avg_home_conceded_13\n"
128 | ]
129 | },
130 | {
131 | "cell_type": "code",
132 | "execution_count": 9,
133 | "metadata": {
134 | "collapsed": false
135 | },
136 | "outputs": [],
137 | "source": [
138 | "res_home = res_13.groupby('HomeTeam')\n",
139 | "res_away = res_13.groupby('AwayTeam')"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 10,
145 | "metadata": {
146 | "collapsed": false
147 | },
148 | "outputs": [
149 | {
150 | "data": {
151 | "text/plain": [
152 | "('Arsenal',\n",
153 | " HomeTeam AwayTeam FTHG FTAG FTR HTHG HTAG HTR HS AS HST \\\n",
154 | " 0 Arsenal Aston Villa 1 3 A 1 1 D 16 9 4 \n",
155 | " 27 Arsenal Tottenham 1 0 H 1 0 H 12 14 5 \n",
156 | " 46 Arsenal Stoke 3 1 H 2 1 H 16 9 8 \n",
157 | " 70 Arsenal Norwich 4 1 H 1 0 H 20 12 11 \n",
158 | " 90 Arsenal Liverpool 2 0 H 1 0 H 12 12 7 \n",
159 | " 110 Arsenal Southampton 2 0 H 1 0 H 9 10 4 \n",
160 | " 131 Arsenal Hull 2 0 H 1 0 H 20 7 7 \n",
161 | " 147 Arsenal Everton 1 1 D 0 0 D 11 12 5 \n",
162 | " 169 Arsenal Chelsea 0 0 D 0 0 D 7 13 2 \n",
163 | " 190 Arsenal Cardiff 2 0 H 0 0 D 28 8 6 \n",
164 | " 210 Arsenal Fulham 2 0 H 0 0 D 22 8 8 \n",
165 | " 237 Arsenal Crystal Palace 2 0 H 0 0 D 11 10 6 \n",
166 | " 254 Arsenal Man United 0 0 D 0 0 D 17 6 5 \n",
167 | " 258 Arsenal Sunderland 4 1 H 3 0 H 12 7 9 \n",
168 | " 301 Arsenal Swansea 2 2 D 0 1 A 13 8 4 \n",
169 | " 306 Arsenal Man City 1 1 D 0 1 A 10 15 3 \n",
170 | " 334 Arsenal West Ham 3 1 H 1 1 D 14 12 8 \n",
171 | " 356 Arsenal Newcastle 3 0 H 2 0 H 20 8 8 \n",
172 | " 364 Arsenal West Brom 1 0 H 1 0 H 15 11 4 \n",
173 | " \n",
174 | " AST HF AF HC AC HY AY HR AR \n",
175 | " 0 4 15 18 4 3 4 5 1 0 \n",
176 | " 27 4 15 14 3 6 2 2 0 0 \n",
177 | " 46 3 8 15 6 7 0 2 0 0 \n",
178 | " 70 6 8 7 10 1 0 0 0 0 \n",
179 | " 90 4 11 7 3 5 2 1 0 0 \n",
180 | " 110 4 10 14 5 6 0 3 0 0 \n",
181 | " 131 2 9 6 11 1 0 0 0 0 \n",
182 | " 147 4 13 11 3 2 0 4 0 0 \n",
183 | " 169 4 7 11 8 6 2 1 0 0 \n",
184 | " 190 2 6 11 12 2 1 2 0 0 \n",
185 | " 210 2 5 7 4 4 0 0 0 0 \n",
186 | " 237 2 9 14 6 5 1 2 0 0 \n",
187 | " 254 2 10 14 5 5 1 2 0 0 \n",
188 | " 258 3 10 11 7 3 0 1 0 0 \n",
189 | " 301 2 11 12 7 0 0 1 0 0 \n",
190 | " 306 4 8 11 6 6 1 4 0 0 \n",
191 | " 334 2 14 12 4 3 2 2 0 0 \n",
192 | " 356 3 9 8 14 0 3 2 0 0 \n",
193 | " 364 1 9 6 9 10 2 2 0 0 )"
194 | ]
195 | },
196 | "execution_count": 10,
197 | "metadata": {},
198 | "output_type": "execute_result"
199 | }
200 | ],
201 | "source": [
202 | "list(res_home)[0]"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": 11,
208 | "metadata": {
209 | "collapsed": false
210 | },
211 | "outputs": [
212 | {
213 | "data": {
214 | "text/html": [
215 | "\n",
216 | "
\n",
217 | " \n",
218 | " \n",
219 | " | \n",
220 | " Team | \n",
221 | " HGS | \n",
222 | " AGS | \n",
223 | " HAS | \n",
224 | " AAS | \n",
225 | " HGC | \n",
226 | " AGC | \n",
227 | " HDS | \n",
228 | " ADS | \n",
229 | "
\n",
230 | " \n",
231 | " \n",
232 | " \n",
233 | " | 0 | \n",
234 | " Arsenal | \n",
235 | " 36 | \n",
236 | " 32 | \n",
237 | " NaN | \n",
238 | " NaN | \n",
239 | " 11 | \n",
240 | " 30 | \n",
241 | " NaN | \n",
242 | " NaN | \n",
243 | "
\n",
244 | " \n",
245 | " | 1 | \n",
246 | " Aston Villa | \n",
247 | " 22 | \n",
248 | " 17 | \n",
249 | " NaN | \n",
250 | " NaN | \n",
251 | " 29 | \n",
252 | " 32 | \n",
253 | " NaN | \n",
254 | " NaN | \n",
255 | "
\n",
256 | " \n",
257 | " | 2 | \n",
258 | " Cardiff | \n",
259 | " 20 | \n",
260 | " 12 | \n",
261 | " NaN | \n",
262 | " NaN | \n",
263 | " 35 | \n",
264 | " 39 | \n",
265 | " NaN | \n",
266 | " NaN | \n",
267 | "
\n",
268 | " \n",
269 | " | 3 | \n",
270 | " Chelsea | \n",
271 | " 43 | \n",
272 | " 28 | \n",
273 | " NaN | \n",
274 | " NaN | \n",
275 | " 11 | \n",
276 | " 16 | \n",
277 | " NaN | \n",
278 | " NaN | \n",
279 | "
\n",
280 | " \n",
281 | " | 4 | \n",
282 | " Crystal Palace | \n",
283 | " 18 | \n",
284 | " 15 | \n",
285 | " NaN | \n",
286 | " NaN | \n",
287 | " 23 | \n",
288 | " 25 | \n",
289 | " NaN | \n",
290 | " NaN | \n",
291 | "
\n",
292 | " \n",
293 | "
\n",
294 | "
"
295 | ],
296 | "text/plain": [
297 | " Team HGS AGS HAS AAS HGC AGC HDS ADS\n",
298 | "0 Arsenal 36 32 NaN NaN 11 30 NaN NaN\n",
299 | "1 Aston Villa 22 17 NaN NaN 29 32 NaN NaN\n",
300 | "2 Cardiff 20 12 NaN NaN 35 39 NaN NaN\n",
301 | "3 Chelsea 43 28 NaN NaN 11 16 NaN NaN\n",
302 | "4 Crystal Palace 18 15 NaN NaN 23 25 NaN NaN"
303 | ]
304 | },
305 | "execution_count": 11,
306 | "metadata": {},
307 | "output_type": "execute_result"
308 | }
309 | ],
310 | "source": [
311 | "table_13.Team = res_home.HomeTeam.all().values\n",
312 | "table_13.HGS = res_home.FTHG.sum().values\n",
313 | "table_13.HGC = res_home.FTAG.sum().values\n",
314 | "table_13.AGS = res_away.FTAG.sum().values\n",
315 | "table_13.AGC = res_away.FTHG.sum().values\n",
316 | "table_13.head()"
317 | ]
318 | },
319 | {
320 | "cell_type": "code",
321 | "execution_count": 12,
322 | "metadata": {
323 | "collapsed": false
324 | },
325 | "outputs": [
326 | {
327 | "data": {
328 | "text/html": [
329 | "\n",
330 | "
\n",
331 | " \n",
332 | " \n",
333 | " | \n",
334 | " Team | \n",
335 | " HGS | \n",
336 | " AGS | \n",
337 | " HAS | \n",
338 | " AAS | \n",
339 | " HGC | \n",
340 | " AGC | \n",
341 | " HDS | \n",
342 | " ADS | \n",
343 | "
\n",
344 | " \n",
345 | " \n",
346 | " \n",
347 | " | 0 | \n",
348 | " Arsenal | \n",
349 | " 36 | \n",
350 | " 32 | \n",
351 | " 1.204013 | \n",
352 | " 1.409692 | \n",
353 | " 11 | \n",
354 | " 30 | \n",
355 | " 0.484581 | \n",
356 | " 1.003344 | \n",
357 | "
\n",
358 | " \n",
359 | " | 1 | \n",
360 | " Aston Villa | \n",
361 | " 22 | \n",
362 | " 17 | \n",
363 | " 0.735786 | \n",
364 | " 0.748899 | \n",
365 | " 29 | \n",
366 | " 32 | \n",
367 | " 1.277533 | \n",
368 | " 1.070234 | \n",
369 | "
\n",
370 | " \n",
371 | " | 2 | \n",
372 | " Cardiff | \n",
373 | " 20 | \n",
374 | " 12 | \n",
375 | " 0.668896 | \n",
376 | " 0.528634 | \n",
377 | " 35 | \n",
378 | " 39 | \n",
379 | " 1.541850 | \n",
380 | " 1.304348 | \n",
381 | "
\n",
382 | " \n",
383 | " | 3 | \n",
384 | " Chelsea | \n",
385 | " 43 | \n",
386 | " 28 | \n",
387 | " 1.438127 | \n",
388 | " 1.233480 | \n",
389 | " 11 | \n",
390 | " 16 | \n",
391 | " 0.484581 | \n",
392 | " 0.535117 | \n",
393 | "
\n",
394 | " \n",
395 | " | 4 | \n",
396 | " Crystal Palace | \n",
397 | " 18 | \n",
398 | " 15 | \n",
399 | " 0.602007 | \n",
400 | " 0.660793 | \n",
401 | " 23 | \n",
402 | " 25 | \n",
403 | " 1.013216 | \n",
404 | " 0.836120 | \n",
405 | "
\n",
406 | " \n",
407 | "
\n",
408 | "
"
409 | ],
410 | "text/plain": [
411 | " Team HGS AGS HAS AAS HGC AGC HDS ADS\n",
412 | "0 Arsenal 36 32 1.204013 1.409692 11 30 0.484581 1.003344\n",
413 | "1 Aston Villa 22 17 0.735786 0.748899 29 32 1.277533 1.070234\n",
414 | "2 Cardiff 20 12 0.668896 0.528634 35 39 1.541850 1.304348\n",
415 | "3 Chelsea 43 28 1.438127 1.233480 11 16 0.484581 0.535117\n",
416 | "4 Crystal Palace 18 15 0.602007 0.660793 23 25 1.013216 0.836120"
417 | ]
418 | },
419 | "execution_count": 12,
420 | "metadata": {},
421 | "output_type": "execute_result"
422 | }
423 | ],
424 | "source": [
425 | "table_13.HAS = (table_13.HGS / 19.0) / avg_home_scored_13\n",
426 | "table_13.AAS = (table_13.AGS / 19.0) / avg_away_scored_13\n",
427 | "table_13.HDS = (table_13.HGC / 19.0) / avg_home_conceded_13\n",
428 | "table_13.ADS = (table_13.AGC / 19.0) / avg_away_conceded_13\n",
429 | "table_13.head()"
430 | ]
431 | },
432 | {
433 | "cell_type": "code",
434 | "execution_count": 15,
435 | "metadata": {
436 | "collapsed": false
437 | },
438 | "outputs": [],
439 | "source": [
440 | "feature_table = feature_table[['HomeTeam','AwayTeam','FTR','HST','AST']]\n",
441 | "f_HAS = []\n",
442 | "f_HDS = []\n",
443 | "f_AAS = []\n",
444 | "f_ADS = []\n",
445 | "for index,row in feature_table.iterrows():\n",
446 | " f_HAS.append(table_13[table_13['Team'] == row['HomeTeam']]['HAS'].values[0])\n",
447 | " f_HDS.append(table_13[table_13['Team'] == row['HomeTeam']]['HDS'].values[0])\n",
448 | " f_AAS.append(table_13[table_13['Team'] == row['HomeTeam']]['AAS'].values[0])\n",
449 | " f_ADS.append(table_13[table_13['Team'] == row['HomeTeam']]['ADS'].values[0])\n",
450 | " \n",
451 | "feature_table['HAS'] = f_HAS\n",
452 | "feature_table['HDS'] = f_HDS\n",
453 | "feature_table['AAS'] = f_AAS\n",
454 | "feature_table['ADS'] = f_ADS"
455 | ]
456 | },
457 | {
458 | "cell_type": "code",
459 | "execution_count": 16,
460 | "metadata": {
461 | "collapsed": false
462 | },
463 | "outputs": [
464 | {
465 | "data": {
466 | "text/html": [
467 | "\n",
468 | "
\n",
469 | " \n",
470 | " \n",
471 | " | \n",
472 | " HomeTeam | \n",
473 | " AwayTeam | \n",
474 | " FTR | \n",
475 | " HST | \n",
476 | " AST | \n",
477 | " HAS | \n",
478 | " HDS | \n",
479 | " AAS | \n",
480 | " ADS | \n",
481 | "
\n",
482 | " \n",
483 | " \n",
484 | " \n",
485 | " | 0 | \n",
486 | " Arsenal | \n",
487 | " Aston Villa | \n",
488 | " A | \n",
489 | " 4 | \n",
490 | " 4 | \n",
491 | " 1.204013 | \n",
492 | " 0.484581 | \n",
493 | " 1.409692 | \n",
494 | " 1.003344 | \n",
495 | "
\n",
496 | " \n",
497 | " | 1 | \n",
498 | " Liverpool | \n",
499 | " Stoke | \n",
500 | " H | \n",
501 | " 11 | \n",
502 | " 4 | \n",
503 | " 1.772575 | \n",
504 | " 0.792952 | \n",
505 | " 2.114537 | \n",
506 | " 1.070234 | \n",
507 | "
\n",
508 | " \n",
509 | " | 2 | \n",
510 | " Norwich | \n",
511 | " Everton | \n",
512 | " D | \n",
513 | " 2 | \n",
514 | " 6 | \n",
515 | " 0.568562 | \n",
516 | " 0.792952 | \n",
517 | " 0.484581 | \n",
518 | " 1.471572 | \n",
519 | "
\n",
520 | " \n",
521 | " | 3 | \n",
522 | " Sunderland | \n",
523 | " Fulham | \n",
524 | " A | \n",
525 | " 3 | \n",
526 | " 1 | \n",
527 | " 0.702341 | \n",
528 | " 1.189427 | \n",
529 | " 0.881057 | \n",
530 | " 1.103679 | \n",
531 | "
\n",
532 | " \n",
533 | " | 4 | \n",
534 | " Swansea | \n",
535 | " Man United | \n",
536 | " A | \n",
537 | " 6 | \n",
538 | " 7 | \n",
539 | " 1.103679 | \n",
540 | " 1.145374 | \n",
541 | " 0.925110 | \n",
542 | " 0.936455 | \n",
543 | "
\n",
544 | " \n",
545 | "
\n",
546 | "
"
547 | ],
548 | "text/plain": [
549 | " HomeTeam AwayTeam FTR HST AST HAS HDS AAS \\\n",
550 | "0 Arsenal Aston Villa A 4 4 1.204013 0.484581 1.409692 \n",
551 | "1 Liverpool Stoke H 11 4 1.772575 0.792952 2.114537 \n",
552 | "2 Norwich Everton D 2 6 0.568562 0.792952 0.484581 \n",
553 | "3 Sunderland Fulham A 3 1 0.702341 1.189427 0.881057 \n",
554 | "4 Swansea Man United A 6 7 1.103679 1.145374 0.925110 \n",
555 | "\n",
556 | " ADS \n",
557 | "0 1.003344 \n",
558 | "1 1.070234 \n",
559 | "2 1.471572 \n",
560 | "3 1.103679 \n",
561 | "4 0.936455 "
562 | ]
563 | },
564 | "execution_count": 16,
565 | "metadata": {},
566 | "output_type": "execute_result"
567 | }
568 | ],
569 | "source": [
570 | "feature_table.head()"
571 | ]
572 | },
573 | {
574 | "cell_type": "code",
575 | "execution_count": 17,
576 | "metadata": {
577 | "collapsed": true
578 | },
579 | "outputs": [],
580 | "source": [
581 | "def transformResult(row):\n",
582 | " if(row.FTR == 'H'):\n",
583 | " return 1\n",
584 | " elif(row.FTR == 'A'):\n",
585 | " return -1\n",
586 | " else:\n",
587 | " return 0"
588 | ]
589 | },
590 | {
591 | "cell_type": "code",
592 | "execution_count": 18,
593 | "metadata": {
594 | "collapsed": false
595 | },
596 | "outputs": [],
597 | "source": [
598 | "feature_table[\"Result\"] = feature_table.apply(lambda row: transformResult(row),axis=1)"
599 | ]
600 | },
601 | {
602 | "cell_type": "code",
603 | "execution_count": 19,
604 | "metadata": {
605 | "collapsed": false
606 | },
607 | "outputs": [
608 | {
609 | "data": {
610 | "text/html": [
611 | "\n",
612 | "
\n",
613 | " \n",
614 | " \n",
615 | " | \n",
616 | " HomeTeam | \n",
617 | " AwayTeam | \n",
618 | " FTR | \n",
619 | " HST | \n",
620 | " AST | \n",
621 | " HAS | \n",
622 | " HDS | \n",
623 | " AAS | \n",
624 | " ADS | \n",
625 | " Result | \n",
626 | "
\n",
627 | " \n",
628 | " \n",
629 | " \n",
630 | " | 0 | \n",
631 | " Arsenal | \n",
632 | " Aston Villa | \n",
633 | " A | \n",
634 | " 4 | \n",
635 | " 4 | \n",
636 | " 1.204013 | \n",
637 | " 0.484581 | \n",
638 | " 1.409692 | \n",
639 | " 1.003344 | \n",
640 | " -1 | \n",
641 | "
\n",
642 | " \n",
643 | " | 1 | \n",
644 | " Liverpool | \n",
645 | " Stoke | \n",
646 | " H | \n",
647 | " 11 | \n",
648 | " 4 | \n",
649 | " 1.772575 | \n",
650 | " 0.792952 | \n",
651 | " 2.114537 | \n",
652 | " 1.070234 | \n",
653 | " 1 | \n",
654 | "
\n",
655 | " \n",
656 | " | 2 | \n",
657 | " Norwich | \n",
658 | " Everton | \n",
659 | " D | \n",
660 | " 2 | \n",
661 | " 6 | \n",
662 | " 0.568562 | \n",
663 | " 0.792952 | \n",
664 | " 0.484581 | \n",
665 | " 1.471572 | \n",
666 | " 0 | \n",
667 | "
\n",
668 | " \n",
669 | " | 3 | \n",
670 | " Sunderland | \n",
671 | " Fulham | \n",
672 | " A | \n",
673 | " 3 | \n",
674 | " 1 | \n",
675 | " 0.702341 | \n",
676 | " 1.189427 | \n",
677 | " 0.881057 | \n",
678 | " 1.103679 | \n",
679 | " -1 | \n",
680 | "
\n",
681 | " \n",
682 | " | 4 | \n",
683 | " Swansea | \n",
684 | " Man United | \n",
685 | " A | \n",
686 | " 6 | \n",
687 | " 7 | \n",
688 | " 1.103679 | \n",
689 | " 1.145374 | \n",
690 | " 0.925110 | \n",
691 | " 0.936455 | \n",
692 | " -1 | \n",
693 | "
\n",
694 | " \n",
695 | "
\n",
696 | "
"
697 | ],
698 | "text/plain": [
699 | " HomeTeam AwayTeam FTR HST AST HAS HDS AAS \\\n",
700 | "0 Arsenal Aston Villa A 4 4 1.204013 0.484581 1.409692 \n",
701 | "1 Liverpool Stoke H 11 4 1.772575 0.792952 2.114537 \n",
702 | "2 Norwich Everton D 2 6 0.568562 0.792952 0.484581 \n",
703 | "3 Sunderland Fulham A 3 1 0.702341 1.189427 0.881057 \n",
704 | "4 Swansea Man United A 6 7 1.103679 1.145374 0.925110 \n",
705 | "\n",
706 | " ADS Result \n",
707 | "0 1.003344 -1 \n",
708 | "1 1.070234 1 \n",
709 | "2 1.471572 0 \n",
710 | "3 1.103679 -1 \n",
711 | "4 0.936455 -1 "
712 | ]
713 | },
714 | "execution_count": 19,
715 | "metadata": {},
716 | "output_type": "execute_result"
717 | }
718 | ],
719 | "source": [
720 | "feature_table.head()"
721 | ]
722 | },
723 | {
724 | "cell_type": "code",
725 | "execution_count": 21,
726 | "metadata": {
727 | "collapsed": true
728 | },
729 | "outputs": [],
730 | "source": [
731 | "X_train = feature_table[['HST','AST','HAS','HDS','AAS','ADS']]\n",
732 | "y_train = feature_table['Result']"
733 | ]
734 | },
735 | {
736 | "cell_type": "code",
737 | "execution_count": 27,
738 | "metadata": {
739 | "collapsed": false
740 | },
741 | "outputs": [],
742 | "source": [
743 | "from sklearn.tree import DecisionTreeClassifier\n",
744 | "from sklearn.naive_bayes import MultinomialNB\n",
745 | "from xgboost import XGBClassifier\n",
746 | "from sklearn.metrics import accuracy_score\n",
747 | "from sklearn.model_selection import cross_val_score\n"
748 | ]
749 | },
750 | {
751 | "cell_type": "code",
752 | "execution_count": 43,
753 | "metadata": {
754 | "collapsed": false
755 | },
756 | "outputs": [
757 | {
758 | "data": {
759 | "text/plain": [
760 | "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n",
761 | " max_features=None, max_leaf_nodes=None,\n",
762 | " min_impurity_split=1e-07, min_samples_leaf=1,\n",
763 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n",
764 | " presort=False, random_state=None, splitter='best')"
765 | ]
766 | },
767 | "execution_count": 43,
768 | "metadata": {},
769 | "output_type": "execute_result"
770 | }
771 | ],
772 | "source": [
773 | "clf = DecisionTreeClassifier()\n",
774 | "clf2 = MultinomialNB()\n",
775 | "clf3 = XGBClassifier()\n",
776 | "clf.fit(X_train,y_train)"
777 | ]
778 | },
779 | {
780 | "cell_type": "code",
781 | "execution_count": 45,
782 | "metadata": {
783 | "collapsed": false
784 | },
785 | "outputs": [],
786 | "source": [
787 | "# y_pred = clf3.predict(X_train)\n",
788 | "accuracy_score(y_pred,y_train)\n",
789 | "scores = cross_val_score(clf2, X_train, y_train, cv=10)\n"
790 | ]
791 | },
792 | {
793 | "cell_type": "code",
794 | "execution_count": 46,
795 | "metadata": {
796 | "collapsed": false
797 | },
798 | "outputs": [
799 | {
800 | "name": "stdout",
801 | "output_type": "stream",
802 | "text": [
803 | "[ 0.56410256 0.64102564 0.56410256 0.57894737 0.65789474 0.65789474\n",
804 | " 0.65789474 0.65789474 0.54054054 0.75 ]\n",
805 | "0.627029762556\n"
806 | ]
807 | }
808 | ],
809 | "source": [
810 | "print scores\n",
811 | "print scores.mean()"
812 | ]
813 | }
814 | ],
815 | "metadata": {
816 | "kernelspec": {
817 | "display_name": "Python [Root]",
818 | "language": "python",
819 | "name": "Python [Root]"
820 | },
821 | "language_info": {
822 | "codemirror_mode": {
823 | "name": "ipython",
824 | "version": 2
825 | },
826 | "file_extension": ".py",
827 | "mimetype": "text/x-python",
828 | "name": "python",
829 | "nbconvert_exporter": "python",
830 | "pygments_lexer": "ipython2",
831 | "version": "2.7.12"
832 | }
833 | },
834 | "nbformat": 4,
835 | "nbformat_minor": 0
836 | }
837 |
--------------------------------------------------------------------------------