├── Machine Learning ├── naive_bayes │ ├── vsm.png │ ├── terms.png │ ├── terms2.png │ └── callibration.png ├── clustering │ └── WineKMC.xlsx ├── logistic_regression │ ├── images │ │ ├── bias.png │ │ ├── data.png │ │ ├── knn1.png │ │ ├── knn2.png │ │ ├── linreg.png │ │ ├── linsep.png │ │ ├── pcanim.gif │ │ ├── reshape.jpg │ │ ├── sklearn2.jpg │ │ ├── train-cv2.png │ │ ├── train-cv3.png │ │ ├── onelinesplit.png │ │ ├── sklearntrans.jpg │ │ ├── train-test.png │ │ ├── train-validate-test.png │ │ ├── complexity-error-plot.png │ │ ├── complexity-error-reg.png │ │ ├── train-validate-test3.png │ │ └── train-validate-test-cont.png │ └── .gitignore └── linear_regression │ ├── images │ ├── shuttle.png │ ├── cs109gitflow3.png │ └── conditionalmean.png │ └── .gitignore ├── Capstone Project ├── Final Report │ ├── Report.pdf │ └── Slide Deck.pdf ├── Capstone Project Proposal.pdf ├── Data │ └── notes.txt ├── Notebooks │ └── Classification_Baseline.ipynb └── .ipynb_checkpoints │ ├── Classification_Baseline-checkpoint.ipynb │ └── Classification_Adding_Shots-checkpoint.ipynb ├── Data Wrangling ├── data_wrangling_json │ ├── .DS_Store │ ├── data │ │ ├── .DS_Store │ │ └── world_bank_projects_less.json │ └── .ipynb_checkpoints │ │ └── sliderule_dsi_xml_exercise-checkpoint.ipynb └── data_wrangling_xml │ └── data_wrangling_xml │ ├── .DS_Store │ ├── sliderule_dsi_xml_exercise.ipynb │ └── .ipynb_checkpoints │ └── sliderule_dsi_xml_exercise-checkpoint.ipynb ├── Inferential Statistics ├── statistics project 1 │ ├── .DS_Store │ ├── data │ │ ├── .DS_Store │ │ └── human_body_temperature.csv │ └── .ipynb_checkpoints │ │ ├── sliderule_dsi_inferential_statistics_exercise_1-checkpoint.ipynb │ │ └── sliderule_dsi_inferential_statistics_exercise_2-checkpoint.ipynb ├── statistics project 2 │ ├── .DS_Store │ ├── data │ │ ├── .DS_Store │ │ └── us_job_market_discrimination.dta │ ├── .ipynb_checkpoints │ │ ├── sliderule_dsi_inferential_statistics_exercise_1-checkpoint.ipynb │ │ └── sliderule_dsi_inferential_statistics_exercise_2-checkpoint.ipynb │ └── sliderule_dsi_inferential_statistics_exercise_2.ipynb └── statistics project 3 │ ├── .DS_Store │ ├── data │ └── .DS_Store │ └── .ipynb_checkpoints │ ├── sliderule_dsi_inferential_statistics_exercise_1-checkpoint.ipynb │ └── sliderule_dsi_inferential_statistics_exercise_2-checkpoint.ipynb └── README.md /Machine Learning/naive_bayes/vsm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/naive_bayes/vsm.png -------------------------------------------------------------------------------- /Capstone Project/Final Report/Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Capstone Project/Final Report/Report.pdf -------------------------------------------------------------------------------- /Machine Learning/clustering/WineKMC.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/clustering/WineKMC.xlsx -------------------------------------------------------------------------------- /Machine Learning/naive_bayes/terms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/naive_bayes/terms.png -------------------------------------------------------------------------------- 
/Machine Learning/naive_bayes/terms2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/naive_bayes/terms2.png -------------------------------------------------------------------------------- /Capstone Project/Final Report/Slide Deck.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Capstone Project/Final Report/Slide Deck.pdf -------------------------------------------------------------------------------- /Data Wrangling/data_wrangling_json/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Data Wrangling/data_wrangling_json/.DS_Store -------------------------------------------------------------------------------- /Machine Learning/naive_bayes/callibration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/naive_bayes/callibration.png -------------------------------------------------------------------------------- /Capstone Project/Capstone Project Proposal.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Capstone Project/Capstone Project Proposal.pdf -------------------------------------------------------------------------------- /Data Wrangling/data_wrangling_json/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Data Wrangling/data_wrangling_json/data/.DS_Store -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/bias.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/bias.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/data.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/knn1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/knn1.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/knn2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/knn2.png -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 1/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Inferential Statistics/statistics project 1/.DS_Store -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 
2/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Inferential Statistics/statistics project 2/.DS_Store -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 3/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Inferential Statistics/statistics project 3/.DS_Store -------------------------------------------------------------------------------- /Machine Learning/linear_regression/images/shuttle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/linear_regression/images/shuttle.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/linreg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/linreg.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/linsep.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/linsep.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/pcanim.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/pcanim.gif -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/reshape.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/reshape.jpg -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/sklearn2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/sklearn2.jpg -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/train-cv2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/train-cv2.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/train-cv3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/train-cv3.png -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 1/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Inferential Statistics/statistics project 1/data/.DS_Store 
-------------------------------------------------------------------------------- /Inferential Statistics/statistics project 2/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Inferential Statistics/statistics project 2/data/.DS_Store -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 3/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Inferential Statistics/statistics project 3/data/.DS_Store -------------------------------------------------------------------------------- /Machine Learning/linear_regression/images/cs109gitflow3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/linear_regression/images/cs109gitflow3.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/onelinesplit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/onelinesplit.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/sklearntrans.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/sklearntrans.jpg -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/train-test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/train-test.png -------------------------------------------------------------------------------- /Data Wrangling/data_wrangling_xml/data_wrangling_xml/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Data Wrangling/data_wrangling_xml/data_wrangling_xml/.DS_Store -------------------------------------------------------------------------------- /Machine Learning/linear_regression/images/conditionalmean.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/linear_regression/images/conditionalmean.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/train-validate-test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/train-validate-test.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/complexity-error-plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/complexity-error-plot.png -------------------------------------------------------------------------------- /Machine 
Learning/logistic_regression/images/complexity-error-reg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/complexity-error-reg.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/train-validate-test3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/train-validate-test3.png -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/images/train-validate-test-cont.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Machine Learning/logistic_regression/images/train-validate-test-cont.png -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 2/data/us_job_market_discrimination.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsibi/Springboard/HEAD/Inferential Statistics/statistics project 2/data/us_job_market_discrimination.dta -------------------------------------------------------------------------------- /Machine Learning/logistic_regression/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | #Ipython 60 | .ipynb_checkpoints/ 61 | # Created by .ignore support plugin (hsz.mobi) 62 | ### OSX template 63 | .DS_Store 64 | .AppleDouble 65 | .LSOverride 66 | 67 | # Icon must end with two \r 68 | Icon 69 | 70 | # Thumbnails 71 | ._* 72 | 73 | # Files that might appear in the root of a volume 74 | .DocumentRevisions-V100 75 | .fseventsd 76 | .Spotlight-V100 77 | .TemporaryItems 78 | .Trashes 79 | .VolumeIcon.icns 80 | 81 | # Directories potentially created on remote AFP share 82 | .AppleDB 83 | .AppleDesktop 84 | Network Trash Folder 85 | Temporary Items 86 | .apdisk 87 | 88 | #Temporary data 89 | tempdata/ 90 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Springboard 2 | 3 | Projects done as a part of Springboard's Data Science Intensive curriculum. 
4 | 5 | ### Capstone Project: Football, Goals and Machine Learning 6 | An attempt to model the highly unpredictable English Premier League and predict the results of each match. 7 | 8 | ### Data Story 9 | Do home teams really have an advantage in football? Is this advantage shrinking in the English Premier League? How predictable are football leagues anyway? Data to the rescue! 10 | 11 | ### Data Wrangling 12 | Practice cleaning up messy data using pandas - XML, JSON, raw text and working with databases. 13 | 14 | ### Inferential Statistics 15 | Useful inferential statistics for drawing conclusions and predicting outcomes. 16 | Contains three miniprojects: 17 | * Human Body Temperature - hypothesis testing, confidence intervals, and statistical significance 18 | * Examining Racial Discrimination - does race have a significant impact on the rate of callbacks? 19 | * Reducing Hospital Readmissions - statistical analysis to reduce readmissions to hospitals. 20 | 21 | ### Machine Learning 22 | Exploring various machine learning models, their advantages and limitations. 23 | Contains the following miniprojects: 24 | * Boston House Pricing - predicting housing prices in Boston using linear regression 25 | * Heights and Weights - using logistic regression to classify gender 26 | * Predicting Movie Ratings - using the Naive Bayes algorithm to predict movie ratings from their reviews 27 | * Customer Segmentation - applying k-means clustering and associated evaluation metrics to partitioning problems 28 | -------------------------------------------------------------------------------- /Machine Learning/linear_regression/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | #Ipython 60 | .ipynb_checkpoints/ 61 | # Created by .ignore support plugin (hsz.mobi) 62 | ### OSX template 63 | .DS_Store 64 | .AppleDouble 65 | .LSOverride 66 | 67 | # Icon must end with two \r 68 | Icon 69 | 70 | # Thumbnails 71 | ._* 72 | 73 | # Files that might appear in the root of a volume 74 | .DocumentRevisions-V100 75 | .fseventsd 76 | .Spotlight-V100 77 | .TemporaryItems 78 | .Trashes 79 | .VolumeIcon.icns 80 | 81 | # Directories potentially created on remote AFP share 82 | .AppleDB 83 | .AppleDesktop 84 | Network Trash Folder 85 | Temporary Items 86 | .apdisk 87 | 88 | #Temporary data 89 | hw1/tempdata/ 90 | hw1/.ipynb_checkpoints/ 91 | 92 | -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 1/data/human_body_temperature.csv: -------------------------------------------------------------------------------- 1 | temperature,gender,heart_rate 2 | 99.3,F,68.0 3 | 98.4,F,81.0 4 | 97.8,M,73.0 5 | 99.2,F,66.0 6 | 98.0,F,73.0 7 | 99.2,M,83.0 8 | 98.0,M,71.0 9 | 98.8,M,78.0 10 | 98.4,F,84.0 11 | 98.6,F,86.0 12 | 98.8,F,89.0 13 | 96.7,F,62.0 14 | 98.2,M,72.0 15 | 98.7,F,79.0 16 | 97.8,F,77.0 17 | 98.8,F,83.0 18 | 98.3,F,79.0 19 | 98.2,M,64.0 20 | 97.2,F,68.0 21 | 99.4,M,70.0 22 | 98.3,F,78.0 23 | 98.2,M,71.0 24 | 98.6,M,70.0 25 | 98.4,M,68.0 26 | 97.8,M,65.0 27 | 98.0,F,87.0 28 | 97.8,F,62.0 29 | 98.2,F,69.0 30 | 98.4,F,73.0 31 | 98.1,M,67.0 32 | 98.3,M,86.0 33 | 97.6,F,61.0 34 | 98.5,M,71.0 35 | 98.6,M,82.0 36 | 99.3,M,63.0 37 | 99.5,M,75.0 38 | 99.1,M,71.0 39 | 98.3,M,72.0 40 | 97.9,F,79.0 41 | 96.4,F,69.0 42 | 98.4,F,79.0 43 | 98.4,M,82.0 44 | 96.9,M,74.0 45 | 97.2,M,64.0 46 | 99.0,F,79.0 47 | 97.9,F,69.0 48 | 97.4,M,72.0 49 | 97.4,M,68.0 50 | 97.9,M,76.0 51 | 97.1,M,82.0 52 | 98.9,F,76.0 53 | 98.3,F,80.0 54 | 98.5,F,83.0 55 | 98.6,M,78.0 56 | 98.2,F,73.0 57 | 98.6,F,82.0 58 | 98.8,F,70.0 59 | 98.2,M,66.0 60 | 98.2,F,65.0 61 | 97.6,M,73.0 62 | 99.1,F,80.0 63 | 98.4,M,84.0 64 | 98.2,F,57.0 65 | 98.6,M,83.0 66 | 98.7,F,65.0 67 | 97.4,M,70.0 68 | 97.4,F,57.0 69 | 98.6,M,77.0 70 | 98.7,F,82.0 71 | 98.9,M,80.0 72 | 98.1,F,81.0 73 | 97.7,F,61.0 74 | 98.0,M,78.0 75 | 98.8,M,81.0 76 | 99.0,M,75.0 77 | 98.8,M,78.0 78 | 98.0,F,76.0 79 | 98.4,M,70.0 80 | 97.4,M,78.0 81 | 97.6,M,74.0 82 | 98.8,F,73.0 83 | 98.0,M,67.0 84 | 97.5,M,70.0 85 | 99.2,F,77.0 86 | 98.6,F,85.0 87 | 97.1,M,75.0 88 | 98.6,F,77.0 89 | 98.0,M,78.0 90 | 98.7,M,73.0 91 | 98.1,M,73.0 92 | 97.8,M,74.0 93 | 100.0,F,78.0 94 | 98.8,F,84.0 95 | 97.1,M,73.0 96 | 97.8,M,58.0 97 | 96.8,F,75.0 98 | 99.9,F,79.0 99 | 98.7,F,64.0 100 | 98.8,F,64.0 101 | 98.0,M,74.0 102 | 99.0,M,81.0 103 | 98.5,M,68.0 104 | 98.0,F,78.0 105 | 99.4,F,77.0 106 | 97.6,M,69.0 107 | 96.7,M,71.0 108 | 97.0,M,80.0 109 | 98.6,M,66.0 110 | 98.7,F,72.0 111 | 97.3,M,69.0 112 | 98.8,F,69.0 113 | 98.0,F,89.0 114 | 98.2,F,64.0 115 | 99.1,F,74.0 116 | 99.0,M,79.0 117 | 98.0,M,64.0 118 | 100.8,F,77.0 119 | 97.8,F,71.0 120 | 98.7,M,78.0 121 | 98.4,F,74.0 122 | 97.7,F,84.0 123 | 97.9,F,68.0 124 | 99.0,F,81.0 125 | 97.2,F,66.0 126 | 97.5,M,75.0 127 | 96.3,M,70.0 128 | 
97.7,M,77.0 129 | 98.2,F,73.0 130 | 97.9,M,72.0 131 | 98.7,F,59.0 132 | -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 1/.ipynb_checkpoints/sliderule_dsi_inferential_statistics_exercise_1-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## What is the true normal human body temperature? \n", 8 | "\n", 9 | "#### Background\n", 10 | "\n", 11 | "The mean normal body temperature was held to be 37$^{\\circ}$C or 98.6$^{\\circ}$F for more than 120 years since it was first conceptualized and reported by Carl Wunderlich in a famous 1868 book. In 1992, this value was revised to 36.8$^{\\circ}$C or 98.2$^{\\circ}$F. \n", 12 | "\n", 13 | "#### Exercise\n", 14 | "In this exercise, you will analyze a dataset of human body temperatures and employ the concepts of hypothesis testing, confidence intervals, and statistical significance to answer the following questions:\n", 15 | "\n", 16 | "1. Is the distribution of body temperatures normal? \n", 17 | "2. Is the true population mean really 98.6 degrees F?\n", 18 | "3. At what temperature should we consider someone's temperature to be \"abnormal\"?\n", 19 | "4. Is there a significant difference between males and females in normal temperature?\n", 20 | "\n", 21 | "\n", 22 | "\n", 23 | "#### Resources\n", 24 | "\n", 25 | "+ Information and data sources: http://www.amstat.org/publications/jse/datasets/normtemp.txt, http://www.amstat.org/publications/jse/jse_data_archive.htm\n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 58, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "import pandas as pd" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 62, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "df = pd.read_csv('data/human_body_temperature.csv')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "source": [ 56 | "# Exercise\n", 57 | "\n", 58 | "Answer the following questions in this notebook and submit to your Github account. \n", 59 | "\n", 60 | "1. Is the distribution of body temperatures normal? \n", 61 | " - Remember that this is a condition for the CLT, and hence the statistical tests we are using, to apply. \n", 62 | "2. Is the true population mean really 98.6 degrees F?\n", 63 | " - Bring out the one sample hypothesis test! In this situation, is it approriate to apply a z-test or a t-test? How will the result be different?\n", 64 | "3. At what temperature should we consider someone's temperature to be \"abnormal\"?\n", 65 | " - Start by computing the margin of error and confidence interval.\n", 66 | "4. Is there a significant difference between males and females in normal temperature?\n", 67 | " - Set up and solve for a two sample hypothesis testing." 
68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": true 75 | }, 76 | "outputs": [], 77 | "source": [] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 2", 83 | "language": "python", 84 | "name": "python2" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 2 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython2", 96 | "version": "2.7.9" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 0 101 | } 102 | -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 2/.ipynb_checkpoints/sliderule_dsi_inferential_statistics_exercise_1-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## What is the true normal human body temperature? \n", 8 | "\n", 9 | "#### Background\n", 10 | "\n", 11 | "The mean normal body temperature was held to be 37$^{\\circ}$C or 98.6$^{\\circ}$F for more than 120 years since it was first conceptualized and reported by Carl Wunderlich in a famous 1868 book. In 1992, this value was revised to 36.8$^{\\circ}$C or 98.2$^{\\circ}$F. \n", 12 | "\n", 13 | "#### Exercise\n", 14 | "In this exercise, you will analyze a dataset of human body temperatures and employ the concepts of hypothesis testing, confidence intervals, and statistical significance to answer the following questions:\n", 15 | "\n", 16 | "1. Is the distribution of body temperatures normal? \n", 17 | "2. Is the true population mean really 98.6 degrees F?\n", 18 | "3. At what temperature should we consider someone's temperature to be \"abnormal\"?\n", 19 | "4. Is there a significant difference between males and females in normal temperature?\n", 20 | "\n", 21 | "\n", 22 | "\n", 23 | "#### Resources\n", 24 | "\n", 25 | "+ Information and data sources: http://www.amstat.org/publications/jse/datasets/normtemp.txt, http://www.amstat.org/publications/jse/jse_data_archive.htm\n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 58, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "import pandas as pd" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 62, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "df = pd.read_csv('data/human_body_temperature.csv')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "source": [ 56 | "# Exercise\n", 57 | "\n", 58 | "Answer the following questions in this notebook and submit to your Github account. \n", 59 | "\n", 60 | "1. Is the distribution of body temperatures normal? \n", 61 | " - Remember that this is a condition for the CLT, and hence the statistical tests we are using, to apply. \n", 62 | "2. Is the true population mean really 98.6 degrees F?\n", 63 | " - Bring out the one sample hypothesis test! In this situation, is it approriate to apply a z-test or a t-test? How will the result be different?\n", 64 | "3. At what temperature should we consider someone's temperature to be \"abnormal\"?\n", 65 | " - Start by computing the margin of error and confidence interval.\n", 66 | "4. 
Is there a significant difference between males and females in normal temperature?\n", 67 | " - Set up and solve for a two sample hypothesis testing." 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": true 75 | }, 76 | "outputs": [], 77 | "source": [] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 2", 83 | "language": "python", 84 | "name": "python2" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 2 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython2", 96 | "version": "2.7.9" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 0 101 | } 102 | -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 3/.ipynb_checkpoints/sliderule_dsi_inferential_statistics_exercise_1-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## What is the true normal human body temperature? \n", 8 | "\n", 9 | "#### Background\n", 10 | "\n", 11 | "The mean normal body temperature was held to be 37$^{\\circ}$C or 98.6$^{\\circ}$F for more than 120 years since it was first conceptualized and reported by Carl Wunderlich in a famous 1868 book. In 1992, this value was revised to 36.8$^{\\circ}$C or 98.2$^{\\circ}$F. \n", 12 | "\n", 13 | "#### Exercise\n", 14 | "In this exercise, you will analyze a dataset of human body temperatures and employ the concepts of hypothesis testing, confidence intervals, and statistical significance to answer the following questions:\n", 15 | "\n", 16 | "1. Is the distribution of body temperatures normal? \n", 17 | "2. Is the true population mean really 98.6 degrees F?\n", 18 | "3. At what temperature should we consider someone's temperature to be \"abnormal\"?\n", 19 | "4. Is there a significant difference between males and females in normal temperature?\n", 20 | "\n", 21 | "\n", 22 | "\n", 23 | "#### Resources\n", 24 | "\n", 25 | "+ Information and data sources: http://www.amstat.org/publications/jse/datasets/normtemp.txt, http://www.amstat.org/publications/jse/jse_data_archive.htm\n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 58, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "import pandas as pd" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 62, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "df = pd.read_csv('data/human_body_temperature.csv')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "source": [ 56 | "# Exercise\n", 57 | "\n", 58 | "Answer the following questions in this notebook and submit to your Github account. \n", 59 | "\n", 60 | "1. Is the distribution of body temperatures normal? \n", 61 | " - Remember that this is a condition for the CLT, and hence the statistical tests we are using, to apply. \n", 62 | "2. Is the true population mean really 98.6 degrees F?\n", 63 | " - Bring out the one sample hypothesis test! In this situation, is it approriate to apply a z-test or a t-test? How will the result be different?\n", 64 | "3. 
At what temperature should we consider someone's temperature to be \"abnormal\"?\n", 65 | " - Start by computing the margin of error and confidence interval.\n", 66 | "4. Is there a significant difference between males and females in normal temperature?\n", 67 | " - Set up and solve for a two sample hypothesis testing." 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": true 75 | }, 76 | "outputs": [], 77 | "source": [] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 2", 83 | "language": "python", 84 | "name": "python2" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 2 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython2", 96 | "version": "2.7.9" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 0 101 | } 102 | -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 1/.ipynb_checkpoints/sliderule_dsi_inferential_statistics_exercise_2-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "### Examining racial discrimination in the US job market\n", 9 | "\n", 10 | "#### Background\n", 11 | "Racial discrimination continues to be pervasive in cultures throughout the world. Researchers examined the level of racial discrimination in the United States labor market by randomly assigning identical résumés black-sounding or white-sounding names and observing the impact on requests for interviews from employers.\n", 12 | "\n", 13 | "#### Data\n", 14 | "In the dataset provided, each row represents a resume. The 'race' column has two values, 'b' and 'w', indicating black-sounding and white-sounding. 
The column 'call' has two values, 1 and 0, indicating whether the resume received a call from employers or not.\n", 15 | "\n", 16 | "Note that the 'b' and 'w' values in race are assigned randomly to the resumes.\n", 17 | "\n", 18 | "#### Exercise\n", 19 | "Perform a statistical analysis to establish whether race has a significant impact on the rate of callbacks for resumes.\n", 20 | "\n", 21 | "\n", 22 | "#### Resources\n", 23 | "+ Experiment information and data source: http://www.povertyactionlab.org/evaluation/discrimination-job-market-united-states\n", 24 | "+ Scipy statistical methods: http://docs.scipy.org/doc/scipy/reference/stats.html " 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "****" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import pandas as pd\n", 43 | "import numpy as np\n", 44 | "from scipy import stats" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "data = pd.io.stata.read_stata('data/us_job_market_discrimination.dta')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "157.0" 69 | ] 70 | }, 71 | "execution_count": 4, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "# number of callbacks for balck-sounding names\n", 78 | "sum(data[data.race=='b'].call)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": { 84 | "collapsed": true 85 | }, 86 | "source": [ 87 | "****\n", 88 | "\n", 89 | "# Exercise\n", 90 | "\n", 91 | " 1. What test is appropriate for this problem? Does CLT apply?\n", 92 | " 2. What are the null and alternate hypotheses?\n", 93 | " 3. Compute margin of error, confidence interval, and p-value.\n", 94 | " 4. 
Discuss statistical significance.\n", 95 | " \n", 96 | "You can include written notes in notebook cells using Markdown: \n", 97 | " - In the control panel at the top, choose Cell > Cell Type > Markdown\n", 98 | " - Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 99 | " \n", 100 | "****" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [] 111 | } 112 | ], 113 | "metadata": { 114 | "kernelspec": { 115 | "display_name": "Python 2", 116 | "language": "python", 117 | "name": "python2" 118 | }, 119 | "language_info": { 120 | "codemirror_mode": { 121 | "name": "ipython", 122 | "version": 2 123 | }, 124 | "file_extension": ".py", 125 | "mimetype": "text/x-python", 126 | "name": "python", 127 | "nbconvert_exporter": "python", 128 | "pygments_lexer": "ipython2", 129 | "version": "2.7.9" 130 | } 131 | }, 132 | "nbformat": 4, 133 | "nbformat_minor": 0 134 | } 135 | -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 3/.ipynb_checkpoints/sliderule_dsi_inferential_statistics_exercise_2-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "### Examining racial discrimination in the US job market\n", 9 | "\n", 10 | "#### Background\n", 11 | "Racial discrimination continues to be pervasive in cultures throughout the world. Researchers examined the level of racial discrimination in the United States labor market by randomly assigning identical résumés black-sounding or white-sounding names and observing the impact on requests for interviews from employers.\n", 12 | "\n", 13 | "#### Data\n", 14 | "In the dataset provided, each row represents a resume. The 'race' column has two values, 'b' and 'w', indicating black-sounding and white-sounding. 
The column 'call' has two values, 1 and 0, indicating whether the resume received a call from employers or not.\n", 15 | "\n", 16 | "Note that the 'b' and 'w' values in race are assigned randomly to the resumes.\n", 17 | "\n", 18 | "#### Exercise\n", 19 | "Perform a statistical analysis to establish whether race has a significant impact on the rate of callbacks for resumes.\n", 20 | "\n", 21 | "\n", 22 | "#### Resources\n", 23 | "+ Experiment information and data source: http://www.povertyactionlab.org/evaluation/discrimination-job-market-united-states\n", 24 | "+ Scipy statistical methods: http://docs.scipy.org/doc/scipy/reference/stats.html " 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "****" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import pandas as pd\n", 43 | "import numpy as np\n", 44 | "from scipy import stats" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "data = pd.io.stata.read_stata('data/us_job_market_discrimination.dta')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "157.0" 69 | ] 70 | }, 71 | "execution_count": 4, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "# number of callbacks for balck-sounding names\n", 78 | "sum(data[data.race=='b'].call)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": { 84 | "collapsed": true 85 | }, 86 | "source": [ 87 | "****\n", 88 | "\n", 89 | "# Exercise\n", 90 | "\n", 91 | " 1. What test is appropriate for this problem? Does CLT apply?\n", 92 | " 2. What are the null and alternate hypotheses?\n", 93 | " 3. Compute margin of error, confidence interval, and p-value.\n", 94 | " 4. 
Discuss statistical significance.\n", 95 | " \n", 96 | "You can include written notes in notebook cells using Markdown: \n", 97 | " - In the control panel at the top, choose Cell > Cell Type > Markdown\n", 98 | " - Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 99 | " \n", 100 | "****" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [] 111 | } 112 | ], 113 | "metadata": { 114 | "kernelspec": { 115 | "display_name": "Python 2", 116 | "language": "python", 117 | "name": "python2" 118 | }, 119 | "language_info": { 120 | "codemirror_mode": { 121 | "name": "ipython", 122 | "version": 2 123 | }, 124 | "file_extension": ".py", 125 | "mimetype": "text/x-python", 126 | "name": "python", 127 | "nbconvert_exporter": "python", 128 | "pygments_lexer": "ipython2", 129 | "version": "2.7.9" 130 | } 131 | }, 132 | "nbformat": 4, 133 | "nbformat_minor": 0 134 | } 135 | -------------------------------------------------------------------------------- /Data Wrangling/data_wrangling_json/.ipynb_checkpoints/sliderule_dsi_xml_exercise-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# XML example and exercise\n", 8 | "****\n", 9 | "+ study examples of accessing nodes in XML tree structure \n", 10 | "+ work on exercise to be completed and submitted\n", 11 | "****\n", 12 | "+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html\n", 13 | "+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial\n", 14 | "****" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "from xml.etree import ElementTree as ET" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## XML example\n", 33 | "\n", 34 | "+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": { 41 | "collapsed": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "document_tree = ET.parse( './data/mondial_database_less.xml' )" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": { 52 | "collapsed": false 53 | }, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "Albania\n", 60 | "Greece\n", 61 | "Macedonia\n", 62 | "Serbia\n", 63 | "Montenegro\n", 64 | "Kosovo\n", 65 | "Andorra\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "# print names of all countries\n", 71 | "for child in document_tree.getroot():\n", 72 | " print child.find('name').text" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [ 82 | { 83 | "name": "stdout", 84 | "output_type": "stream", 85 | "text": [ 86 | "* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë\n", 87 | "* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes\n", 88 | "* Macedonia: Skopje, Kumanovo\n", 89 | "* Serbia: Beograd, Novi Sad, Niš\n", 90 | "* 
Montenegro: Podgorica\n", 91 | "* Kosovo: Prishtine\n", 92 | "* Andorra: Andorra la Vella\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "# print names of all countries and their cities\n", 98 | "for element in document_tree.iterfind('country'):\n", 99 | " print '* ' + element.find('name').text + ':',\n", 100 | " capitals_string = ''\n", 101 | " for subelement in element.getiterator('city'):\n", 102 | " capitals_string += subelement.find('name').text + ', '\n", 103 | " print capitals_string[:-2]" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "****\n", 111 | "## XML exercise\n", 112 | "\n", 113 | "Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find\n", 114 | "\n", 115 | "1. 10 countries with the lowest infant mortality rates\n", 116 | "2. 10 cities with the largest population\n", 117 | "3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)\n", 118 | "4. name and country of a) longest river, b) largest lake and c) airport at highest elevation" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 6, 124 | "metadata": { 125 | "collapsed": true 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "document = ET.parse( './data/mondial_database.xml' )" 130 | ] 131 | } 132 | ], 133 | "metadata": { 134 | "kernelspec": { 135 | "display_name": "Python 2", 136 | "language": "python", 137 | "name": "python2" 138 | }, 139 | "language_info": { 140 | "codemirror_mode": { 141 | "name": "ipython", 142 | "version": 2 143 | }, 144 | "file_extension": ".py", 145 | "mimetype": "text/x-python", 146 | "name": "python", 147 | "nbconvert_exporter": "python", 148 | "pygments_lexer": "ipython2", 149 | "version": "2.7.9" 150 | } 151 | }, 152 | "nbformat": 4, 153 | "nbformat_minor": 0 154 | } 155 | -------------------------------------------------------------------------------- /Capstone Project/Data/notes.txt: -------------------------------------------------------------------------------- 1 | Notes for Football Data 2 | 3 | All data is in csv format, ready for use within standard spreadsheet applications. Please note that some abbreviations are no longer in use (in particular odds from specific bookmakers no longer used) and refer to data collected in earlier seasons. 
For a current list of what bookmakers are included in the dataset please visit http://www.football-data.co.uk/matches.php 4 | 5 | Key to results data: 6 | 7 | Div = League Division 8 | Date = Match Date (dd/mm/yy) 9 | HomeTeam = Home Team 10 | AwayTeam = Away Team 11 | FTHG = Full Time Home Team Goals 12 | FTAG = Full Time Away Team Goals 13 | FTR = Full Time Result (H=Home Win, D=Draw, A=Away Win) 14 | HTHG = Half Time Home Team Goals 15 | HTAG = Half Time Away Team Goals 16 | HTR = Half Time Result (H=Home Win, D=Draw, A=Away Win) 17 | 18 | Match Statistics (where available) 19 | Attendance = Crowd Attendance 20 | Referee = Match Referee 21 | HS = Home Team Shots 22 | AS = Away Team Shots 23 | HST = Home Team Shots on Target 24 | AST = Away Team Shots on Target 25 | HHW = Home Team Hit Woodwork 26 | AHW = Away Team Hit Woodwork 27 | HC = Home Team Corners 28 | AC = Away Team Corners 29 | HF = Home Team Fouls Committed 30 | AF = Away Team Fouls Committed 31 | HO = Home Team Offsides 32 | AO = Away Team Offsides 33 | HY = Home Team Yellow Cards 34 | AY = Away Team Yellow Cards 35 | HR = Home Team Red Cards 36 | AR = Away Team Red Cards 37 | HBP = Home Team Bookings Points (10 = yellow, 25 = red) 38 | ABP = Away Team Bookings Points (10 = yellow, 25 = red) 39 | 40 | Key to 1X2 (match) betting odds data: 41 | 42 | B365H = Bet365 home win odds 43 | B365D = Bet365 draw odds 44 | B365A = Bet365 away win odds 45 | BSH = Blue Square home win odds 46 | BSD = Blue Square draw odds 47 | BSA = Blue Square away win odds 48 | BWH = Bet&Win home win odds 49 | BWD = Bet&Win draw odds 50 | BWA = Bet&Win away win odds 51 | GBH = Gamebookers home win odds 52 | GBD = Gamebookers draw odds 53 | GBA = Gamebookers away win odds 54 | IWH = Interwetten home win odds 55 | IWD = Interwetten draw odds 56 | IWA = Interwetten away win odds 57 | LBH = Ladbrokes home win odds 58 | LBD = Ladbrokes draw odds 59 | LBA = Ladbrokes away win odds 60 | PSH = Pinnacle home win odds 61 | PSD = Pinnacle draw odds 62 | PSA = Pinnacle away win odds 63 | SOH = Sporting Odds home win odds 64 | SOD = Sporting Odds draw odds 65 | SOA = Sporting Odds away win odds 66 | SBH = Sportingbet home win odds 67 | SBD = Sportingbet draw odds 68 | SBA = Sportingbet away win odds 69 | SJH = Stan James home win odds 70 | SJD = Stan James draw odds 71 | SJA = Stan James away win odds 72 | SYH = Stanleybet home win odds 73 | SYD = Stanleybet draw odds 74 | SYA = Stanleybet away win odds 75 | VCH = VC Bet home win odds 76 | VCD = VC Bet draw odds 77 | VCA = VC Bet away win odds 78 | WHH = William Hill home win odds 79 | WHD = William Hill draw odds 80 | WHA = William Hill away win odds 81 | 82 | Bb1X2 = Number of BetBrain bookmakers used to calculate match odds averages and maximums 83 | BbMxH = Betbrain maximum home win odds 84 | BbAvH = Betbrain average home win odds 85 | BbMxD = Betbrain maximum draw odds 86 | BbAvD = Betbrain average draw win odds 87 | BbMxA = Betbrain maximum away win odds 88 | BbAvA = Betbrain average away win odds 89 | 90 | 91 | 92 | Key to total goals betting odds: 93 | 94 | BbOU = Number of BetBrain bookmakers used to calculate over/under 2.5 goals (total goals) averages and maximums 95 | BbMx>2.5 = Betbrain maximum over 2.5 goals 96 | BbAv>2.5 = Betbrain average over 2.5 goals 97 | BbMx<2.5 = Betbrain maximum under 2.5 goals 98 | BbAv<2.5 = Betbrain average under 2.5 goals 99 | 100 | GB>2.5 = Gamebookers over 2.5 goals 101 | GB<2.5 = Gamebookers under 2.5 goals 102 | B365>2.5 = Bet365 over 2.5 goals 103 | B365<2.5 = Bet365 
under 2.5 goals 104 | 105 | 106 | Key to Asian handicap betting odds: 107 | 108 | BbAH = Number of BetBrain bookmakers used to calculate Asian handicap averages and maximums 109 | BbAHh = Betbrain size of handicap (home team) 110 | BbMxAHH = Betbrain maximum Asian handicap home team odds 111 | BbAvAHH = Betbrain average Asian handicap home team odds 112 | BbMxAHA = Betbrain maximum Asian handicap away team odds 113 | BbAvAHA = Betbrain average Asian handicap away team odds 114 | 115 | GBAHH = Gamebookers Asian handicap home team odds 116 | GBAHA = Gamebookers Asian handicap away team odds 117 | GBAH = Gamebookers size of handicap (home team) 118 | LBAHH = Ladbrokes Asian handicap home team odds 119 | LBAHA = Ladbrokes Asian handicap away team odds 120 | LBAH = Ladbrokes size of handicap (home team) 121 | B365AHH = Bet365 Asian handicap home team odds 122 | B365AHA = Bet365 Asian handicap away team odds 123 | B365AH = Bet365 size of handicap (home team) 124 | 125 | 126 | Closing odds (last odds before match starts) 127 | 128 | PSCH = Pinnacle closing home win odds 129 | PSCD = Pinnacle closing draw odds 130 | PSCA = Pinnacle closing away win odds 131 | 132 | Football-Data would like to acknowledge the following sources which have been utilised in the compilation of Football-Data's results and odds files. 133 | 134 | Historical results: 135 | International Soccer Server - http://sunsite.tut.fi/rec/riku/soccer.html 136 | European Football - http://www.eurofootball.be/ 137 | RSSSF Archive - http://www.rsssf.com/ 138 | 139 | Current results (full time, half time) 140 | TBWSport - http://www.tbwsport.com 141 | Livescore - http://www.livescore.com 142 | 143 | Match statistics 144 | Sportinglife, ESPN Soccer, Bundesliga.de, Gazzetta.it and Football.fr 145 | 146 | Bookmakers betting odds 147 | Betbrain - http://www.betbrain.com 148 | Betbase - http://www.betbase.info 149 | 150 | Betting odds for weekend games are collected Friday afternoons, and on Tuesday afternoons for midweek games. 151 | 152 | Additional match statistics (corners, shots, bookings, referee etc.) for the 2000/01 and 2001/02 seasons for the English, Scottish and German leagues were provided by Sports.com (now under new ownership and no longer available).
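As a minimal illustration of how the column key above can be used, the sketch below loads one season of results and looks at the home-advantage question raised in the README. The filename `E0.csv` is an assumed example download from football-data.co.uk, not a file shipped in this repository.

```python
import pandas as pd

# Load one season of match results (assumed local download from football-data.co.uk).
matches = pd.read_csv('E0.csv')

# FTR is the full-time result: H = home win, D = draw, A = away win (see key above).
result_share = matches['FTR'].value_counts(normalize=True)

print("Share of home wins:", round(result_share.get('H', 0.0), 3))
print("Share of draws:    ", round(result_share.get('D', 0.0), 3))
print("Share of away wins:", round(result_share.get('A', 0.0), 3))

# Average goal margin from the home side's point of view (FTHG - FTAG).
print("Mean home goal margin:", (matches['FTHG'] - matches['FTAG']).mean())
```

If home advantage exists, the share of 'H' results and the mean home goal margin should both sit clearly above the corresponding away-side figures.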
153 | 154 | -------------------------------------------------------------------------------- /Data Wrangling/data_wrangling_json/data/world_bank_projects_less.json: -------------------------------------------------------------------------------- 1 | [{ "_id" : { "$oid" : "52b213b38594d8a2be17c780" }, "approvalfy" : 1999, "board_approval_month" : "November", "boardapprovaldate" : "2013-11-12T00:00:00Z", "borrower" : "FEDERAL DEMOCRATIC REPUBLIC OF ETHIOPIA", "closingdate" : "2018-07-07T00:00:00Z", "country_namecode" : "Federal Democratic Republic of Ethiopia!$!ET", "countrycode" : "ET", "countryname" : "Federal Democratic Republic of Ethiopia", "countryshortname" : "Ethiopia", "docty" : "Project Information Document,Indigenous Peoples Plan,Project Information Document", "envassesmentcategorycode" : "C", "grantamt" : 0, "ibrdcommamt" : 0, "id" : "P129828", "idacommamt" : 130000000, "impagency" : "MINISTRY OF EDUCATION", "lendinginstr" : "Investment Project Financing", "lendinginstrtype" : "IN", "lendprojectcost" : 550000000, "majorsector_percent" : [ { "Name" : "Education", "Percent" : 46 }, { "Name" : "Education", "Percent" : 26 }, { "Name" : "Public Administration, Law, and Justice", "Percent" : 16 }, { "Name" : "Education", "Percent" : 12 } ], "mjsector_namecode" : [ { "name" : "Education", "code" : "EX" }, { "name" : "Education", "code" : "EX" }, { "name" : "Public Administration, Law, and Justice", "code" : "BX" }, { "name" : "Education", "code" : "EX" } ], "mjtheme" : [ "Human development" ], "mjtheme_namecode" : [ { "name" : "Human development", "code" : "8" }, { "name" : "", "code" : "11" } ], "mjthemecode" : "8,11", "prodline" : "PE", "prodlinetext" : "IBRD/IDA", "productlinetype" : "L", "project_abstract" : { "cdata" : "The development objective of the Second Phase of General Education Quality Improvement Project for Ethiopia is to improve learning conditions in primary and secondary schools and strengthen institutions at different levels of educational administration. The project has six components. The first component is curriculum, textbooks, assessment, examinations, and inspection. This component will support improvement of learning conditions in grades KG-12 by providing increased access to teaching and learning materials and through improvements to the curriculum by assessing the strengths and weaknesses of the current curriculum. This component has following four sub-components: (i) curriculum reform and implementation; (ii) teaching and learning materials; (iii) assessment and examinations; and (iv) inspection. The second component is teacher development program (TDP). This component will support improvements in learning conditions in both primary and secondary schools by advancing the quality of teaching in general education through: (a) enhancing the training of pre-service teachers in teacher education institutions; and (b) improving the quality of in-service teacher training. This component has following three sub-components: (i) pre-service teacher training; (ii) in-service teacher training; and (iii) licensing and relicensing of teachers and school leaders. The third component is school improvement plan. This component will support the strengthening of school planning in order to improve learning outcomes, and to partly fund the school improvement plans through school grants. It has following two sub-components: (i) school improvement plan; and (ii) school grants. 
The fourth component is management and capacity building, including education management information systems (EMIS). This component will support management and capacity building aspect of the project. This component has following three sub-components: (i) capacity building for education planning and management; (ii) capacity building for school planning and management; and (iii) EMIS. The fifth component is improving the quality of learning and teaching in secondary schools and universities through the use of information and communications technology (ICT). It has following five sub-components: (i) national policy and institution for ICT in general education; (ii) national ICT infrastructure improvement plan for general education; (iii) develop an integrated monitoring, evaluation, and learning system specifically for the ICT component; (iv) teacher professional development in the use of ICT; and (v) provision of limited number of e-Braille display readers with the possibility to scale up to all secondary education schools based on the successful implementation and usage of the readers. The sixth component is program coordination, monitoring and evaluation, and communication. It will support institutional strengthening by developing capacities in all aspects of program coordination, monitoring and evaluation; a new sub-component on communications will support information sharing for better management and accountability. It has following three sub-components: (i) program coordination; (ii) monitoring and evaluation (M and E); and (iii) communication." }, "project_name" : "Ethiopia General Education Quality Improvement Project II", "projectdocs" : [ { "DocTypeDesc" : "Project Information Document (PID), Vol.", "DocType" : "PID", "EntityID" : "090224b081e545fb_1_0", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=090224b081e545fb_1_0", "DocDate" : "28-AUG-2013" }, { "DocTypeDesc" : "Indigenous Peoples Plan (IP), Vol.1 of 1", "DocType" : "IP", "EntityID" : "000442464_20130920111729", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000442464_20130920111729", "DocDate" : "01-JUL-2013" }, { "DocTypeDesc" : "Project Information Document (PID), Vol.", "DocType" : "PID", "EntityID" : "090224b0817b19e2_1_0", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=090224b0817b19e2_1_0", "DocDate" : "22-NOV-2012" } ], "projectfinancialtype" : "IDA", "projectstatusdisplay" : "Active", "regionname" : "Africa", "sector" : [ { "Name" : "Primary education" }, { "Name" : "Secondary education" }, { "Name" : "Public administration- Other social services" }, { "Name" : "Tertiary education" } ], "sector1" : { "Name" : "Primary education", "Percent" : 46 }, "sector2" : { "Name" : "Secondary education", "Percent" : 26 }, "sector3" : { "Name" : "Public administration- Other social services", "Percent" : 16 }, "sector4" : { "Name" : "Tertiary education", "Percent" : 12 }, "sector_namecode" : [ { "name" : "Primary education", "code" : "EP" }, { "name" : "Secondary education", "code" : "ES" }, { "name" : "Public administration- Other social services", "code" : "BS" }, { "name" : "Tertiary education", "code" : "ET" } ], "sectorcode" : "ET,BS,ES,EP", "source" : "IBRD", "status" : "Active", "supplementprojectflg" : "N", "theme1" : { "Name" : "Education for all", "Percent" : 100 }, "theme_namecode" : [ { "name" : "Education for all", "code" : "65" } ], "themecode" : "65", "totalamt" : 130000000, "totalcommamt" : 130000000, "url" : 
"http://www.worldbank.org/projects/P129828/ethiopia-general-education-quality-improvement-project-ii?lang=en" }, 2 | { "_id" : { "$oid" : "52b213b38594d8a2be17c781" }, "approvalfy" : 2015, "board_approval_month" : "November", "boardapprovaldate" : "2013-11-04T00:00:00Z", "borrower" : "GOVERNMENT OF TUNISIA", "country_namecode" : "Republic of Tunisia!$!TN", "countrycode" : "TN", "countryname" : "Republic of Tunisia", "countryshortname" : "Tunisia", "docty" : "Project Information Document,Integrated Safeguards Data Sheet,Integrated Safeguards Data Sheet,Project Information Document,Integrated Safeguards Data Sheet,Project Information Document", "envassesmentcategorycode" : "C", "grantamt" : 4700000, "ibrdcommamt" : 0, "id" : "P144674", "idacommamt" : 0, "impagency" : "MINISTRY OF FINANCE", "lendinginstr" : "Specific Investment Loan", "lendinginstrtype" : "IN", "lendprojectcost" : 5700000, "majorsector_percent" : [ { "Name" : "Public Administration, Law, and Justice", "Percent" : 70 }, { "Name" : "Public Administration, Law, and Justice", "Percent" : 30 } ], "mjsector_namecode" : [ { "name" : "Public Administration, Law, and Justice", "code" : "BX" }, { "name" : "Public Administration, Law, and Justice", "code" : "BX" } ], "mjtheme" : [ "Economic management", "Social protection and risk management" ], "mjtheme_namecode" : [ { "name" : "Economic management", "code" : "1" }, { "name" : "Social protection and risk management", "code" : "6" } ], "mjthemecode" : "1,6", "prodline" : "RE", "prodlinetext" : "Recipient Executed Activities", "productlinetype" : "L", "project_name" : "TN: DTF Social Protection Reforms Support", "projectdocs" : [ { "DocTypeDesc" : "Project Information Document (PID), Vol.1 of 1", "DocType" : "PID", "EntityID" : "000333037_20131024115616", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000333037_20131024115616", "DocDate" : "29-MAR-2013" }, { "DocTypeDesc" : "Integrated Safeguards Data Sheet (ISDS), Vol.1 of 1", "DocType" : "ISDS", "EntityID" : "000356161_20131024151611", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000356161_20131024151611", "DocDate" : "29-MAR-2013" }, { "DocTypeDesc" : "Integrated Safeguards Data Sheet (ISDS), Vol.1 of 1", "DocType" : "ISDS", "EntityID" : "000442464_20131031112136", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000442464_20131031112136", "DocDate" : "29-MAR-2013" }, { "DocTypeDesc" : "Project Information Document (PID), Vol.1 of 1", "DocType" : "PID", "EntityID" : "000333037_20131031105716", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000333037_20131031105716", "DocDate" : "29-MAR-2013" }, { "DocTypeDesc" : "Integrated Safeguards Data Sheet (ISDS), Vol.1 of 1", "DocType" : "ISDS", "EntityID" : "000356161_20130305113209", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000356161_20130305113209", "DocDate" : "16-JAN-2013" }, { "DocTypeDesc" : "Project Information Document (PID), Vol.1 of 1", "DocType" : "PID", "EntityID" : "000356161_20130305113716", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000356161_20130305113716", "DocDate" : "16-JAN-2013" } ], "projectfinancialtype" : "OTHER", "projectstatusdisplay" : "Active", "regionname" : "Middle East and North Africa", "sector" : [ { "Name" : "Public administration- Other social services" }, { "Name" : "General public administration sector" } ], "sector1" : { "Name" : "Public administration- Other 
social services", "Percent" : 70 }, "sector2" : { "Name" : "General public administration sector", "Percent" : 30 }, "sector_namecode" : [ { "name" : "Public administration- Other social services", "code" : "BS" }, { "name" : "General public administration sector", "code" : "BZ" } ], "sectorcode" : "BZ,BS", "source" : "IBRD", "status" : "Active", "supplementprojectflg" : "N", "theme1" : { "Name" : "Other economic management", "Percent" : 30 }, "theme_namecode" : [ { "name" : "Other economic management", "code" : "24" }, { "name" : "Social safety nets", "code" : "54" } ], "themecode" : "54,24", "totalamt" : 0, "totalcommamt" : 4700000, "url" : "http://www.worldbank.org/projects/P144674?lang=en" } 3 | ] 4 | -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 2/sliderule_dsi_inferential_statistics_exercise_2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "### Examining racial discrimination in the US job market\n", 9 | "\n", 10 | "#### Background\n", 11 | "Racial discrimination continues to be pervasive in cultures throughout the world. Researchers examined the level of racial discrimination in the United States labor market by randomly assigning identical résumés black-sounding or white-sounding names and observing the impact on requests for interviews from employers.\n", 12 | "\n", 13 | "#### Data\n", 14 | "In the dataset provided, each row represents a resume. The 'race' column has two values, 'b' and 'w', indicating black-sounding and white-sounding. The column 'call' has two values, 1 and 0, indicating whether the resume received a call from employers or not.\n", 15 | "\n", 16 | "Note that the 'b' and 'w' values in race are assigned randomly to the resumes.\n", 17 | "\n", 18 | "#### Exercise\n", 19 | "You will perform a statistical analysis to establish whether race has a significant impact on the rate of callbacks for resumes.\n", 20 | "\n", 21 | "Answer the following questions **in this notebook below and submit to your Github account**. \n", 22 | "\n", 23 | " 1. What test is appropriate for this problem? Does CLT apply?\n", 24 | " 2. What are the null and alternate hypotheses?\n", 25 | " 3. Compute margin of error, confidence interval, and p-value.\n", 26 | " 4. 
Discuss statistical significance.\n", 27 | "\n", 28 | "You can include written notes in notebook cells using Markdown: \n", 29 | " - In the control panel at the top, choose Cell > Cell Type > Markdown\n", 30 | " - Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 31 | "\n", 32 | "\n", 33 | "#### Resources\n", 34 | "+ Experiment information and data source: http://www.povertyactionlab.org/evaluation/discrimination-job-market-united-states\n", 35 | "+ Scipy statistical methods: http://docs.scipy.org/doc/scipy/reference/stats.html \n", 36 | "+ Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 37 | "\n", 38 | "****" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 42, 44 | "metadata": { 45 | "collapsed": true 46 | }, 47 | "outputs": [], 48 | "source": [ 49 | "import pandas as pd\n", 50 | "import numpy as np\n", 51 | "from scipy import stats\n", 52 | "import math" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": { 59 | "collapsed": false 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "data = pd.io.stata.read_stata('data/us_job_market_discrimination.dta')" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 19, 69 | "metadata": { 70 | "collapsed": false 71 | }, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "157.0" 77 | ] 78 | }, 79 | "execution_count": 19, 80 | "metadata": {}, 81 | "output_type": "execute_result" 82 | } 83 | ], 84 | "source": [ 85 | "# number of callbacks for black-sounding names\n", 86 | "sum(data[data.race=='b'].call)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 10, 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [ 96 | { 97 | "data": { 98 | "text/html": [ 99 | "
\n", 100 | "\n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | "
idadeducationofjobsyearsexphonorsvolunteermilitaryempholesoccupspecific...compreqorgreqmanuftranscombankrealtradebusserviceothservicemissindownership
0b1426000117...1.00.01.00.00.00.00.00.00.0
1b13360110316...1.00.01.00.00.00.00.00.00.0
\n", 178 | "

2 rows × 65 columns

\n", 179 | "
" 180 | ], 181 | "text/plain": [ 182 | " id ad education ofjobs yearsexp honors volunteer military empholes \\\n", 183 | "0 b 1 4 2 6 0 0 0 1 \n", 184 | "1 b 1 3 3 6 0 1 1 0 \n", 185 | "\n", 186 | " occupspecific ... compreq orgreq manuf transcom bankreal trade \\\n", 187 | "0 17 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n", 188 | "1 316 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n", 189 | "\n", 190 | " busservice othservice missind ownership \n", 191 | "0 0.0 0.0 0.0 \n", 192 | "1 0.0 0.0 0.0 \n", 193 | "\n", 194 | "[2 rows x 65 columns]" 195 | ] 196 | }, 197 | "execution_count": 10, 198 | "metadata": {}, 199 | "output_type": "execute_result" 200 | } 201 | ], 202 | "source": [ 203 | "data.columns\n", 204 | "data.head(2)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "#### What test is appropriate for this problem? Does CLT apply?\n", 212 | "\n", 213 | "Let's being by looking the number of observations where race = b and race = w. We can also check the number of such observations that have received a call back and the ones that didn't. Using these factors, we will be able to create a contingency table. \n", 214 | "\n", 215 | "Hence, the problem boils down to comparison of two proportions. \n", 216 | "If certain conditions are satisfied, we can also perform the Fischer's Exact Test using the contingency table.\n" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 73, 222 | "metadata": { 223 | "collapsed": false 224 | }, 225 | "outputs": [ 226 | { 227 | "name": "stdout", 228 | "output_type": "stream", 229 | "text": [ 230 | "Number of observations where race is b : 2435\n", 231 | "Number of observations where race is w : 2435\n" 232 | ] 233 | } 234 | ], 235 | "source": [ 236 | "data_b = data[data.race=='b']\n", 237 | "data_w = data[data.race=='w']\n", 238 | "num_b = len(data_b)\n", 239 | "num_w = len(data_w)\n", 240 | "print \"Number of observations where race is b : \",num_b\n", 241 | "print \"Number of observations where race is w : \",num_w" 242 | ] 243 | }, 244 | { 245 | "cell_type": "markdown", 246 | "metadata": {}, 247 | "source": [ 248 | "Since there can be only two states for the 'call' variable, we can arbitrarily assign getting a call back as \"success\" and not getting a call back as a failure. " 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 74, 254 | "metadata": { 255 | "collapsed": false 256 | }, 257 | "outputs": [ 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | "157 235\n" 263 | ] 264 | } 265 | ], 266 | "source": [ 267 | "b_success = len(data_b[data_b.call == 1])\n", 268 | "w_success = len(data_w[data_w.call == 1])\n", 269 | "print b_success, w_success" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "Let's also calculate the proportion $\\hat{p}_b$ of black sounding names getting a callback and the proportion $\\hat{p}_w$ white sounding names getting a call back." 
277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 75, 282 | "metadata": { 283 | "collapsed": false 284 | }, 285 | "outputs": [ 286 | { 287 | "name": "stdout", 288 | "output_type": "stream", 289 | "text": [ 290 | "Proportion of black sounding names getting a callback : 0.064476386037\n", 291 | "Proportion of white sounding names getting a callback : 0.0965092402464\n" 292 | ] 293 | } 294 | ], 295 | "source": [ 296 | "p_b = 1.0 * b_success/num_b\n", 297 | "p_w = 1.0 * w_success/num_w\n", 298 | "print \"Proportion of black sounding names getting a callback : \",p_b\n", 299 | "print \"Proportion of white sounding names getting a callback : \",p_w" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": {}, 305 | "source": [ 306 | "Also, to make sure the samples are big enough such that we can use a normal distribution to model difference between \n", 307 | "proportions, we need to check if $n*p$ and $n*(1-p)$ are greater than 10. This is a conclusion from the Central Limit Theorem. " 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 76, 313 | "metadata": { 314 | "collapsed": false 315 | }, 316 | "outputs": [ 317 | { 318 | "name": "stdout", 319 | "output_type": "stream", 320 | "text": [ 321 | "157.0\n", 322 | "2278.0\n", 323 | "---\n", 324 | "235.0\n", 325 | "2200.0\n" 326 | ] 327 | } 328 | ], 329 | "source": [ 330 | "print num_b * p_b\n", 331 | "print num_b * (1-p_b)\n", 332 | "print \"---\"\n", 333 | "print num_b * p_w\n", 334 | "print num_b * (1-p_w)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "markdown", 339 | "metadata": {}, 340 | "source": [ 341 | "Since, all the values are above 10, we can use the normal distribution to model differences between proportions." 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": {}, 347 | "source": [ 348 | "Now that we have all the required data, we formulate the null and alternate hypotheses.\n", 349 | "\n", 350 | "$H_0\\:is \\: p_b = p_w\\\\\n", 351 | "H_A \\:is \\: p_b \\neq p_w$\n", 352 | "\n", 353 | "The Standard Error for the sample statistic is given by\n", 354 | "$\\sqrt{\\frac{\\hat{p}_b(1-\\hat{p}_b)}{n_b} + \\frac{\\hat{p}_w(1-\\hat{p}_w)}{n_w}} $\n", 355 | "\n", 356 | "We can use the z-statistic to place a confidence interval on this sample statistic.Hence, the margin of error is \n", 357 | "$Z_{\\alpha/2} * SE$. For a 95% confidence interval, the z-value is 1.96. 
\n", 358 | "\n", 359 | "The confidence interval, subsequently, is $\\hat{p}_b - \\hat{p}_w \\pm {Z_{\\alpha/2} * SE}$" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 77, 365 | "metadata": { 366 | "collapsed": false 367 | }, 368 | "outputs": [ 369 | { 370 | "name": "stdout", 371 | "output_type": "stream", 372 | "text": [ 373 | "Margin of error = 0.0152554063499\n" 374 | ] 375 | } 376 | ], 377 | "source": [ 378 | "z = 1.96\n", 379 | "margin = z * math.sqrt( ( p_w*(1-p_w) / num_b) + (p_b*(1-p_b)/num_w) )\n", 380 | "\n", 381 | "print \"Margin of error = \", margin" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 78, 387 | "metadata": { 388 | "collapsed": false 389 | }, 390 | "outputs": [ 391 | { 392 | "name": "stdout", 393 | "output_type": "stream", 394 | "text": [ 395 | "The confidence interval is given by : 0.00213225776367 to 0.0619334506552\n" 396 | ] 397 | } 398 | ], 399 | "source": [ 400 | "print \"The confidence interval is given by :\", p_w-p_b-z*margin,\"to\", p_w-p_b+z*margin" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "metadata": {}, 406 | "source": [ 407 | "0 is not in this confidence interval. Nevertheless, let's go ahead and calculating the p-value." 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 79, 413 | "metadata": { 414 | "collapsed": false 415 | }, 416 | "outputs": [ 417 | { 418 | "data": { 419 | "text/plain": [ 420 | "(-4.1084121524343464, 3.9838868375850767e-05)" 421 | ] 422 | }, 423 | "execution_count": 79, 424 | "metadata": {}, 425 | "output_type": "execute_result" 426 | } 427 | ], 428 | "source": [ 429 | "from statsmodels.stats.proportion import proportions_ztest as pz\n", 430 | "pz(np.array([b_success,w_success]),np.array([num_b,num_w]),value=0)" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "The second value is the p-value and it is much lesser than 0.05. Hence, we can reject the null hypothesis. \n", 438 | "\n", 439 | "#### Significance of the result\n", 440 | "What does it practically mean to reject the null hypothesis? Our null hypothesis was that the proportion of black sounding names getting a call back is equal to the number of white sounding names getting a call back. After analysis, we have decided to reject it. This means that, in reality, there is a significant difference in the number of call backs ; white sounding names getting more call backs." 
441 | ] 442 | } 443 | ], 444 | "metadata": { 445 | "kernelspec": { 446 | "display_name": "Python [Root]", 447 | "language": "python", 448 | "name": "Python [Root]" 449 | }, 450 | "language_info": { 451 | "codemirror_mode": { 452 | "name": "ipython", 453 | "version": 2 454 | }, 455 | "file_extension": ".py", 456 | "mimetype": "text/x-python", 457 | "name": "python", 458 | "nbconvert_exporter": "python", 459 | "pygments_lexer": "ipython2", 460 | "version": "2.7.12" 461 | } 462 | }, 463 | "nbformat": 4, 464 | "nbformat_minor": 0 465 | } 466 | -------------------------------------------------------------------------------- /Inferential Statistics/statistics project 2/.ipynb_checkpoints/sliderule_dsi_inferential_statistics_exercise_2-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "### Examining racial discrimination in the US job market\n", 9 | "\n", 10 | "#### Background\n", 11 | "Racial discrimination continues to be pervasive in cultures throughout the world. Researchers examined the level of racial discrimination in the United States labor market by randomly assigning identical résumés black-sounding or white-sounding names and observing the impact on requests for interviews from employers.\n", 12 | "\n", 13 | "#### Data\n", 14 | "In the dataset provided, each row represents a resume. The 'race' column has two values, 'b' and 'w', indicating black-sounding and white-sounding. The column 'call' has two values, 1 and 0, indicating whether the resume received a call from employers or not.\n", 15 | "\n", 16 | "Note that the 'b' and 'w' values in race are assigned randomly to the resumes.\n", 17 | "\n", 18 | "#### Exercise\n", 19 | "You will perform a statistical analysis to establish whether race has a significant impact on the rate of callbacks for resumes.\n", 20 | "\n", 21 | "Answer the following questions **in this notebook below and submit to your Github account**. \n", 22 | "\n", 23 | " 1. What test is appropriate for this problem? Does CLT apply?\n", 24 | " 2. What are the null and alternate hypotheses?\n", 25 | " 3. Compute margin of error, confidence interval, and p-value.\n", 26 | " 4. 
441 | ] 442 | } 443 | ], 444 | "metadata": { 445 | "kernelspec": { 446 | "display_name": "Python [Root]", 447 | "language": "python", 448 | "name": "Python [Root]" 449 | }, 450 | "language_info": { 451 | "codemirror_mode": { 452 | "name": "ipython", 453 | "version": 2 454 | }, 455 | "file_extension": ".py", 456 | "mimetype": "text/x-python", 457 | "name": "python", 458 | "nbconvert_exporter": "python", 459 | "pygments_lexer": "ipython2", 460 | "version": "2.7.12" 461 | } 462 | }, 463 | "nbformat": 4, 464 | "nbformat_minor": 0 465 | } 466 | -------------------------------------------------------------------------------- /Capstone Project/Notebooks/Classification_Baseline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Creating a baseline for classification\n", 8 | "\n", 9 | "\n", 10 | "Notebook attempting to predict the result (Home win, away win, draw) of any fixture given the teams that are playing it based on their performance in the previous season. We use multiclass classification to predict the results of the matches. More feature engineering on the data might lead us to better results." 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 12, 16 | "metadata": { 17 | "collapsed": true 18 | }, 19 | "outputs": [], 20 | "source": [ 21 | "import pandas as pd\n", 22 | "import numpy as np\n", 23 | "import scipy.stats as scipy\n", 24 | "import random" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "### Load the data" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 15, 37 | "metadata": { 38 | "collapsed": false 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "df = pd.read_csv(\"./Data/England/E0_13.csv\")\n", 43 | "df_14 = pd.read_csv(\"./Data/England/E0_14.csv\")" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 16, 49 | "metadata": { 50 | "collapsed": false 51 | }, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/plain": [ 56 | "Index([u'Div', u'Date', u'HomeTeam', u'AwayTeam', u'FTHG', u'FTAG', u'FTR',\n", 57 | " u'HTHG', u'HTAG', u'HTR', u'Referee', u'HS', u'AS', u'HST', u'AST',\n", 58 | " u'HF', u'AF', u'HC', u'AC', u'HY', u'AY', u'HR', u'AR', u'B365H',\n", 59 | " u'B365D', u'B365A', u'BWH', u'BWD', u'BWA', u'IWH', u'IWD', u'IWA',\n", 60 | " u'LBH', u'LBD', u'LBA', u'PSH', u'PSD', u'PSA', u'WHH', u'WHD', u'WHA',\n", 61 | " u'SJH', u'SJD', u'SJA', u'VCH', u'VCD', u'VCA', u'Bb1X2', u'BbMxH',\n", 62 | " u'BbAvH', u'BbMxD', u'BbAvD', u'BbMxA', u'BbAvA', u'BbOU', u'BbMx>2.5',\n", 63 | " u'BbAv>2.5', u'BbMx<2.5', u'BbAv<2.5', u'BbAH', u'BbAHh', u'BbMxAHH',\n", 64 | " u'BbAvAHH', u'BbMxAHA', u'BbAvAHA', u'PSCH', u'PSCD', u'PSCA'],\n", 65 | " dtype='object')" 66 | ] 67 | }, 68 | "execution_count": 16, 69 | "metadata": {}, 70 | "output_type": "execute_result" 71 | } 72 | ], 73 | "source": [ 74 | "df.columns" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "### Cleaning\n", 82 | "\n", 83 | "We do not need information about division, data, referee and the betting odds from various companies for this method. 
" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 65, 89 | "metadata": { 90 | "collapsed": false 91 | }, 92 | "outputs": [], 93 | "source": [ 94 | "res_13 = df.ix[:,:23]\n", 95 | "res_13 = res_13.drop(['Div','Date','Referee'],axis=1)\n", 96 | "res_14 = df_14.ix[:,:23]\n", 97 | "res_14 = res_14.drop(['Div','Date','Referee'],axis=1)\n", 98 | "table_features = df.ix[:,:7]\n", 99 | "table_features = table_features.drop(['FTHG','FTAG','Div','Date'],axis=1)\n", 100 | "bet_13 = df.ix[:,23:]" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 19, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [ 111 | "from math import log" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 20, 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "entropy = -((0.32 * log(0.32,3)) + (0.20 * log(0.20,3)) + (0.47 * log(0.47,3)))" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 21, 128 | "metadata": { 129 | "collapsed": false 130 | }, 131 | "outputs": [ 132 | { 133 | "data": { 134 | "text/plain": [ 135 | "0.947893245378005" 136 | ] 137 | }, 138 | "execution_count": 21, 139 | "metadata": {}, 140 | "output_type": "execute_result" 141 | } 142 | ], 143 | "source": [ 144 | "entropy" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 22, 150 | "metadata": { 151 | "collapsed": false, 152 | "scrolled": true 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "res_13.head()\n", 157 | "feature_table = df.ix[:,:23]" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 23, 163 | "metadata": { 164 | "collapsed": false 165 | }, 166 | "outputs": [], 167 | "source": [ 168 | "#Team, Home Goals Score, Away Goals Score, Attack Strength, Home Goals Conceded, Away Goals Conceded, Defensive Strength\n", 169 | "table_13 = pd.DataFrame(columns=('Team','HGS','AGS','HAS','AAS','HGC','AGC','HDS','ADS'))" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 24, 175 | "metadata": { 176 | "collapsed": false 177 | }, 178 | "outputs": [ 179 | { 180 | "name": "stdout", 181 | "output_type": "stream", 182 | "text": [ 183 | "Average number of goals at home 1.57368421053\n", 184 | "Average number of goals away 1.19473684211\n", 185 | "Average number of goals conceded at home 1.57368421053\n", 186 | "Average number of goals conceded away 1.19473684211\n" 187 | ] 188 | } 189 | ], 190 | "source": [ 191 | "avg_home_scored_13 = res_13.FTHG.sum() / 380.0\n", 192 | "avg_away_scored_13 = res_13.FTAG.sum() / 380.0\n", 193 | "avg_home_conceded_13 = avg_away_scored_13\n", 194 | "avg_away_conceded_13 = avg_home_scored_13\n", 195 | "print \"Average number of goals at home\",avg_home_scored_13\n", 196 | "print \"Average number of goals away\", avg_away_scored_13\n", 197 | "print \"Average number of goals conceded at home\",avg_away_conceded_13\n", 198 | "print \"Average number of goals conceded away\",avg_home_conceded_13\n" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 25, 204 | "metadata": { 205 | "collapsed": false 206 | }, 207 | "outputs": [], 208 | "source": [ 209 | "res_home = res_13.groupby('HomeTeam')\n", 210 | "res_away = res_13.groupby('AwayTeam')" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 26, 216 | "metadata": { 217 | "collapsed": false 218 | }, 219 | "outputs": [ 220 | { 221 | "data": { 222 | "text/html": [ 223 | "
\n", 224 | "\n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | "
TeamHGSAGSHASAASHGCAGCHDSADS
0Arsenal3632NaNNaN1130NaNNaN
1Aston Villa2217NaNNaN2932NaNNaN
2Cardiff2012NaNNaN3539NaNNaN
3Chelsea4328NaNNaN1116NaNNaN
4Crystal Palace1815NaNNaN2325NaNNaN
\n", 302 | "
" 303 | ], 304 | "text/plain": [ 305 | " Team HGS AGS HAS AAS HGC AGC HDS ADS\n", 306 | "0 Arsenal 36 32 NaN NaN 11 30 NaN NaN\n", 307 | "1 Aston Villa 22 17 NaN NaN 29 32 NaN NaN\n", 308 | "2 Cardiff 20 12 NaN NaN 35 39 NaN NaN\n", 309 | "3 Chelsea 43 28 NaN NaN 11 16 NaN NaN\n", 310 | "4 Crystal Palace 18 15 NaN NaN 23 25 NaN NaN" 311 | ] 312 | }, 313 | "execution_count": 26, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "table_13.Team = res_home.HomeTeam.all().values\n", 320 | "table_13.HGS = res_home.FTHG.sum().values\n", 321 | "table_13.HGC = res_home.FTAG.sum().values\n", 322 | "table_13.AGS = res_away.FTAG.sum().values\n", 323 | "table_13.AGC = res_away.FTHG.sum().values\n", 324 | "table_13.head()" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 27, 330 | "metadata": { 331 | "collapsed": false 332 | }, 333 | "outputs": [ 334 | { 335 | "data": { 336 | "text/html": [ 337 | "
\n", 338 | "\n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | "
TeamHGSAGSHASAASHGCAGCHDSADS
0Arsenal36321.2040131.40969211300.4845811.003344
1Aston Villa22170.7357860.74889929321.2775331.070234
2Cardiff20120.6688960.52863435391.5418501.304348
3Chelsea43281.4381271.23348011160.4845810.535117
4Crystal Palace18150.6020070.66079323251.0132160.836120
\n", 416 | "
" 417 | ], 418 | "text/plain": [ 419 | " Team HGS AGS HAS AAS HGC AGC HDS ADS\n", 420 | "0 Arsenal 36 32 1.204013 1.409692 11 30 0.484581 1.003344\n", 421 | "1 Aston Villa 22 17 0.735786 0.748899 29 32 1.277533 1.070234\n", 422 | "2 Cardiff 20 12 0.668896 0.528634 35 39 1.541850 1.304348\n", 423 | "3 Chelsea 43 28 1.438127 1.233480 11 16 0.484581 0.535117\n", 424 | "4 Crystal Palace 18 15 0.602007 0.660793 23 25 1.013216 0.836120" 425 | ] 426 | }, 427 | "execution_count": 27, 428 | "metadata": {}, 429 | "output_type": "execute_result" 430 | } 431 | ], 432 | "source": [ 433 | "table_13.HAS = (table_13.HGS / 19.0) / avg_home_scored_13\n", 434 | "table_13.AAS = (table_13.AGS / 19.0) / avg_away_scored_13\n", 435 | "table_13.HDS = (table_13.HGC / 19.0) / avg_home_conceded_13\n", 436 | "table_13.ADS = (table_13.AGC / 19.0) / avg_away_conceded_13\n", 437 | "table_13.head()" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 28, 443 | "metadata": { 444 | "collapsed": false 445 | }, 446 | "outputs": [], 447 | "source": [ 448 | "feature_table = feature_table[['HomeTeam','AwayTeam','FTR']]\n", 449 | "f_HAS = []\n", 450 | "f_HDS = []\n", 451 | "f_AAS = []\n", 452 | "f_ADS = []\n", 453 | "for index,row in feature_table.iterrows():\n", 454 | " f_HAS.append(table_13[table_13['Team'] == row['HomeTeam']]['HAS'].values[0])\n", 455 | " f_HDS.append(table_13[table_13['Team'] == row['HomeTeam']]['HDS'].values[0])\n", 456 | " f_AAS.append(table_13[table_13['Team'] == row['HomeTeam']]['AAS'].values[0])\n", 457 | " f_ADS.append(table_13[table_13['Team'] == row['HomeTeam']]['ADS'].values[0])\n", 458 | " \n", 459 | "feature_table['HAS'] = f_HAS\n", 460 | "feature_table['HDS'] = f_HDS\n", 461 | "feature_table['AAS'] = f_AAS\n", 462 | "feature_table['ADS'] = f_ADS" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 29, 468 | "metadata": { 469 | "collapsed": false 470 | }, 471 | "outputs": [ 472 | { 473 | "data": { 474 | "text/html": [ 475 | "
\n", 476 | "\n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | "
HomeTeamAwayTeamFTRHASHDSAASADS
0ArsenalAston VillaA1.2040130.4845811.4096921.003344
1LiverpoolStokeH1.7725750.7929522.1145371.070234
2NorwichEvertonD0.5685620.7929520.4845811.471572
3SunderlandFulhamA0.7023411.1894270.8810571.103679
4SwanseaMan UnitedA1.1036791.1453740.9251100.936455
\n", 542 | "
" 543 | ], 544 | "text/plain": [ 545 | " HomeTeam AwayTeam FTR HAS HDS AAS ADS\n", 546 | "0 Arsenal Aston Villa A 1.204013 0.484581 1.409692 1.003344\n", 547 | "1 Liverpool Stoke H 1.772575 0.792952 2.114537 1.070234\n", 548 | "2 Norwich Everton D 0.568562 0.792952 0.484581 1.471572\n", 549 | "3 Sunderland Fulham A 0.702341 1.189427 0.881057 1.103679\n", 550 | "4 Swansea Man United A 1.103679 1.145374 0.925110 0.936455" 551 | ] 552 | }, 553 | "execution_count": 29, 554 | "metadata": {}, 555 | "output_type": "execute_result" 556 | } 557 | ], 558 | "source": [ 559 | "feature_table.head()" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 30, 565 | "metadata": { 566 | "collapsed": true 567 | }, 568 | "outputs": [], 569 | "source": [ 570 | "def transformResult(row):\n", 571 | " if(row.FTR == 'H'):\n", 572 | " return 1\n", 573 | " elif(row.FTR == 'A'):\n", 574 | " return -1\n", 575 | " else:\n", 576 | " return 0" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": 31, 582 | "metadata": { 583 | "collapsed": false 584 | }, 585 | "outputs": [], 586 | "source": [ 587 | "feature_table[\"Result\"] = feature_table.apply(lambda row: transformResult(row),axis=1)" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 32, 593 | "metadata": { 594 | "collapsed": false 595 | }, 596 | "outputs": [ 597 | { 598 | "data": { 599 | "text/html": [ 600 | "
\n", 601 | "\n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | "
HomeTeamAwayTeamFTRHASHDSAASADSResult
0ArsenalAston VillaA1.2040130.4845811.4096921.003344-1
1LiverpoolStokeH1.7725750.7929522.1145371.0702341
2NorwichEvertonD0.5685620.7929520.4845811.4715720
3SunderlandFulhamA0.7023411.1894270.8810571.103679-1
4SwanseaMan UnitedA1.1036791.1453740.9251100.936455-1
\n", 673 | "
" 674 | ], 675 | "text/plain": [ 676 | " HomeTeam AwayTeam FTR HAS HDS AAS ADS Result\n", 677 | "0 Arsenal Aston Villa A 1.204013 0.484581 1.409692 1.003344 -1\n", 678 | "1 Liverpool Stoke H 1.772575 0.792952 2.114537 1.070234 1\n", 679 | "2 Norwich Everton D 0.568562 0.792952 0.484581 1.471572 0\n", 680 | "3 Sunderland Fulham A 0.702341 1.189427 0.881057 1.103679 -1\n", 681 | "4 Swansea Man United A 1.103679 1.145374 0.925110 0.936455 -1" 682 | ] 683 | }, 684 | "execution_count": 32, 685 | "metadata": {}, 686 | "output_type": "execute_result" 687 | } 688 | ], 689 | "source": [ 690 | "feature_table.head()" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": 33, 696 | "metadata": { 697 | "collapsed": true 698 | }, 699 | "outputs": [], 700 | "source": [ 701 | "X_train = feature_table[['HAS','HDS','AAS','ADS']]\n", 702 | "y_train = feature_table['Result']" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": 34, 708 | "metadata": { 709 | "collapsed": false 710 | }, 711 | "outputs": [], 712 | "source": [ 713 | "from sklearn.tree import DecisionTreeClassifier\n", 714 | "from sklearn.naive_bayes import MultinomialNB\n", 715 | "from xgboost import XGBClassifier\n", 716 | "from sklearn.neighbors import KNeighborsClassifier\n", 717 | "from sklearn.multiclass import OneVsRestClassifier\n", 718 | "\n", 719 | "from sklearn.linear_model import LogisticRegression\n", 720 | "from sklearn.metrics import accuracy_score" 721 | ] 722 | }, 723 | { 724 | "cell_type": "markdown", 725 | "metadata": {}, 726 | "source": [ 727 | "## Randomized Model as Benchmark" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": 41, 733 | "metadata": { 734 | "collapsed": false 735 | }, 736 | "outputs": [], 737 | "source": [ 738 | "outcome_list = [-1,0,1]\n", 739 | "y_pred = []\n", 740 | "for i in xrange(1,381):\n", 741 | " y_pred.append(random.choice(outcome_list))\n", 742 | " " 743 | ] 744 | }, 745 | { 746 | "cell_type": "code", 747 | "execution_count": 42, 748 | "metadata": { 749 | "collapsed": false 750 | }, 751 | "outputs": [ 752 | { 753 | "data": { 754 | "text/plain": [ 755 | "0.32631578947368423" 756 | ] 757 | }, 758 | "execution_count": 42, 759 | "metadata": {}, 760 | "output_type": "execute_result" 761 | } 762 | ], 763 | "source": [ 764 | "accuracy_score(y_train,y_pred)" 765 | ] 766 | }, 767 | { 768 | "cell_type": "markdown", 769 | "metadata": {}, 770 | "source": [ 771 | "With just a random model, we get an accuracy of 33% on an average which is expected since there are three outcomes to any fixture. 
" 772 | ] 773 | }, 774 | { 775 | "cell_type": "markdown", 776 | "metadata": {}, 777 | "source": [ 778 | "### Classifiers" 779 | ] 780 | }, 781 | { 782 | "cell_type": "code", 783 | "execution_count": 87, 784 | "metadata": { 785 | "collapsed": false 786 | }, 787 | "outputs": [ 788 | { 789 | "data": { 790 | "text/plain": [ 791 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", 792 | " metric_params=None, n_jobs=1, n_neighbors=15, p=2,\n", 793 | " weights='uniform')" 794 | ] 795 | }, 796 | "execution_count": 87, 797 | "metadata": {}, 798 | "output_type": "execute_result" 799 | } 800 | ], 801 | "source": [ 802 | "clf1 = DecisionTreeClassifier()\n", 803 | "clf2 = XGBClassifier()\n", 804 | "clf3 = KNeighborsClassifier(n_neighbors=15)\n", 805 | "clf3.fit(X_train,y_train)" 806 | ] 807 | }, 808 | { 809 | "cell_type": "code", 810 | "execution_count": 88, 811 | "metadata": { 812 | "collapsed": false 813 | }, 814 | "outputs": [ 815 | { 816 | "data": { 817 | "text/plain": [ 818 | "0.54736842105263162" 819 | ] 820 | }, 821 | "execution_count": 88, 822 | "metadata": {}, 823 | "output_type": "execute_result" 824 | } 825 | ], 826 | "source": [ 827 | "y_pred = clf3.predict(X_train)\n", 828 | "accuracy_score(y_pred,y_train)" 829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": null, 834 | "metadata": { 835 | "collapsed": true 836 | }, 837 | "outputs": [], 838 | "source": [] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": null, 843 | "metadata": { 844 | "collapsed": true 845 | }, 846 | "outputs": [], 847 | "source": [] 848 | } 849 | ], 850 | "metadata": { 851 | "kernelspec": { 852 | "display_name": "Python [Root]", 853 | "language": "python", 854 | "name": "Python [Root]" 855 | }, 856 | "language_info": { 857 | "codemirror_mode": { 858 | "name": "ipython", 859 | "version": 2 860 | }, 861 | "file_extension": ".py", 862 | "mimetype": "text/x-python", 863 | "name": "python", 864 | "nbconvert_exporter": "python", 865 | "pygments_lexer": "ipython2", 866 | "version": "2.7.12" 867 | } 868 | }, 869 | "nbformat": 4, 870 | "nbformat_minor": 0 871 | } 872 | -------------------------------------------------------------------------------- /Capstone Project/.ipynb_checkpoints/Classification_Baseline-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Creating a baseline for classification\n", 8 | "\n", 9 | "\n", 10 | "Notebook attempting to predict the result (Home win, away win, draw) of any fixture given the teams that are playing it based on their performance in the previous season. We use multiclass classification to predict the results of the matches. More feature engineering on the data might lead us to better results." 
TeamHGSAGSHASAASHGCAGCHDSADS
0Arsenal3632NaNNaN1130NaNNaN
1Aston Villa2217NaNNaN2932NaNNaN
2Cardiff2012NaNNaN3539NaNNaN
3Chelsea4328NaNNaN1116NaNNaN
4Crystal Palace1815NaNNaN2325NaNNaN
\n", 302 | "
" 303 | ], 304 | "text/plain": [ 305 | " Team HGS AGS HAS AAS HGC AGC HDS ADS\n", 306 | "0 Arsenal 36 32 NaN NaN 11 30 NaN NaN\n", 307 | "1 Aston Villa 22 17 NaN NaN 29 32 NaN NaN\n", 308 | "2 Cardiff 20 12 NaN NaN 35 39 NaN NaN\n", 309 | "3 Chelsea 43 28 NaN NaN 11 16 NaN NaN\n", 310 | "4 Crystal Palace 18 15 NaN NaN 23 25 NaN NaN" 311 | ] 312 | }, 313 | "execution_count": 26, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "table_13.Team = res_home.HomeTeam.all().values\n", 320 | "table_13.HGS = res_home.FTHG.sum().values\n", 321 | "table_13.HGC = res_home.FTAG.sum().values\n", 322 | "table_13.AGS = res_away.FTAG.sum().values\n", 323 | "table_13.AGC = res_away.FTHG.sum().values\n", 324 | "table_13.head()" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 27, 330 | "metadata": { 331 | "collapsed": false 332 | }, 333 | "outputs": [ 334 | { 335 | "data": { 336 | "text/html": [ 337 | "
\n", 338 | "\n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | "
TeamHGSAGSHASAASHGCAGCHDSADS
0Arsenal36321.2040131.40969211300.4845811.003344
1Aston Villa22170.7357860.74889929321.2775331.070234
2Cardiff20120.6688960.52863435391.5418501.304348
3Chelsea43281.4381271.23348011160.4845810.535117
4Crystal Palace18150.6020070.66079323251.0132160.836120
\n", 416 | "
" 417 | ], 418 | "text/plain": [ 419 | " Team HGS AGS HAS AAS HGC AGC HDS ADS\n", 420 | "0 Arsenal 36 32 1.204013 1.409692 11 30 0.484581 1.003344\n", 421 | "1 Aston Villa 22 17 0.735786 0.748899 29 32 1.277533 1.070234\n", 422 | "2 Cardiff 20 12 0.668896 0.528634 35 39 1.541850 1.304348\n", 423 | "3 Chelsea 43 28 1.438127 1.233480 11 16 0.484581 0.535117\n", 424 | "4 Crystal Palace 18 15 0.602007 0.660793 23 25 1.013216 0.836120" 425 | ] 426 | }, 427 | "execution_count": 27, 428 | "metadata": {}, 429 | "output_type": "execute_result" 430 | } 431 | ], 432 | "source": [ 433 | "table_13.HAS = (table_13.HGS / 19.0) / avg_home_scored_13\n", 434 | "table_13.AAS = (table_13.AGS / 19.0) / avg_away_scored_13\n", 435 | "table_13.HDS = (table_13.HGC / 19.0) / avg_home_conceded_13\n", 436 | "table_13.ADS = (table_13.AGC / 19.0) / avg_away_conceded_13\n", 437 | "table_13.head()" 438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": 28, 443 | "metadata": { 444 | "collapsed": false 445 | }, 446 | "outputs": [], 447 | "source": [ 448 | "feature_table = feature_table[['HomeTeam','AwayTeam','FTR']]\n", 449 | "f_HAS = []\n", 450 | "f_HDS = []\n", 451 | "f_AAS = []\n", 452 | "f_ADS = []\n", 453 | "for index,row in feature_table.iterrows():\n", 454 | " f_HAS.append(table_13[table_13['Team'] == row['HomeTeam']]['HAS'].values[0])\n", 455 | " f_HDS.append(table_13[table_13['Team'] == row['HomeTeam']]['HDS'].values[0])\n", 456 | " f_AAS.append(table_13[table_13['Team'] == row['HomeTeam']]['AAS'].values[0])\n", 457 | " f_ADS.append(table_13[table_13['Team'] == row['HomeTeam']]['ADS'].values[0])\n", 458 | " \n", 459 | "feature_table['HAS'] = f_HAS\n", 460 | "feature_table['HDS'] = f_HDS\n", 461 | "feature_table['AAS'] = f_AAS\n", 462 | "feature_table['ADS'] = f_ADS" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": 29, 468 | "metadata": { 469 | "collapsed": false 470 | }, 471 | "outputs": [ 472 | { 473 | "data": { 474 | "text/html": [ 475 | "
\n", 476 | "\n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | "
HomeTeamAwayTeamFTRHASHDSAASADS
0ArsenalAston VillaA1.2040130.4845811.4096921.003344
1LiverpoolStokeH1.7725750.7929522.1145371.070234
2NorwichEvertonD0.5685620.7929520.4845811.471572
3SunderlandFulhamA0.7023411.1894270.8810571.103679
4SwanseaMan UnitedA1.1036791.1453740.9251100.936455
\n", 542 | "
" 543 | ], 544 | "text/plain": [ 545 | " HomeTeam AwayTeam FTR HAS HDS AAS ADS\n", 546 | "0 Arsenal Aston Villa A 1.204013 0.484581 1.409692 1.003344\n", 547 | "1 Liverpool Stoke H 1.772575 0.792952 2.114537 1.070234\n", 548 | "2 Norwich Everton D 0.568562 0.792952 0.484581 1.471572\n", 549 | "3 Sunderland Fulham A 0.702341 1.189427 0.881057 1.103679\n", 550 | "4 Swansea Man United A 1.103679 1.145374 0.925110 0.936455" 551 | ] 552 | }, 553 | "execution_count": 29, 554 | "metadata": {}, 555 | "output_type": "execute_result" 556 | } 557 | ], 558 | "source": [ 559 | "feature_table.head()" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 30, 565 | "metadata": { 566 | "collapsed": true 567 | }, 568 | "outputs": [], 569 | "source": [ 570 | "def transformResult(row):\n", 571 | " if(row.FTR == 'H'):\n", 572 | " return 1\n", 573 | " elif(row.FTR == 'A'):\n", 574 | " return -1\n", 575 | " else:\n", 576 | " return 0" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": 31, 582 | "metadata": { 583 | "collapsed": false 584 | }, 585 | "outputs": [], 586 | "source": [ 587 | "feature_table[\"Result\"] = feature_table.apply(lambda row: transformResult(row),axis=1)" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 32, 593 | "metadata": { 594 | "collapsed": false 595 | }, 596 | "outputs": [ 597 | { 598 | "data": { 599 | "text/html": [ 600 | "
\n", 601 | "\n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | "
HomeTeamAwayTeamFTRHASHDSAASADSResult
0ArsenalAston VillaA1.2040130.4845811.4096921.003344-1
1LiverpoolStokeH1.7725750.7929522.1145371.0702341
2NorwichEvertonD0.5685620.7929520.4845811.4715720
3SunderlandFulhamA0.7023411.1894270.8810571.103679-1
4SwanseaMan UnitedA1.1036791.1453740.9251100.936455-1
\n", 673 | "
" 674 | ], 675 | "text/plain": [ 676 | " HomeTeam AwayTeam FTR HAS HDS AAS ADS Result\n", 677 | "0 Arsenal Aston Villa A 1.204013 0.484581 1.409692 1.003344 -1\n", 678 | "1 Liverpool Stoke H 1.772575 0.792952 2.114537 1.070234 1\n", 679 | "2 Norwich Everton D 0.568562 0.792952 0.484581 1.471572 0\n", 680 | "3 Sunderland Fulham A 0.702341 1.189427 0.881057 1.103679 -1\n", 681 | "4 Swansea Man United A 1.103679 1.145374 0.925110 0.936455 -1" 682 | ] 683 | }, 684 | "execution_count": 32, 685 | "metadata": {}, 686 | "output_type": "execute_result" 687 | } 688 | ], 689 | "source": [ 690 | "feature_table.head()" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": 33, 696 | "metadata": { 697 | "collapsed": true 698 | }, 699 | "outputs": [], 700 | "source": [ 701 | "X_train = feature_table[['HAS','HDS','AAS','ADS']]\n", 702 | "y_train = feature_table['Result']" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": null, 708 | "metadata": { 709 | "collapsed": true 710 | }, 711 | "outputs": [], 712 | "source": [] 713 | }, 714 | { 715 | "cell_type": "code", 716 | "execution_count": 34, 717 | "metadata": { 718 | "collapsed": false 719 | }, 720 | "outputs": [], 721 | "source": [ 722 | "from sklearn.tree import DecisionTreeClassifier\n", 723 | "from sklearn.naive_bayes import MultinomialNB\n", 724 | "from xgboost import XGBClassifier\n", 725 | "from sklearn.neighbors import KNeighborsClassifier\n", 726 | "from sklearn.multiclass import OneVsRestClassifier\n", 727 | "\n", 728 | "from sklearn.linear_model import LogisticRegression\n", 729 | "from sklearn.metrics import accuracy_score" 730 | ] 731 | }, 732 | { 733 | "cell_type": "markdown", 734 | "metadata": {}, 735 | "source": [ 736 | "## Randomized Model as Benchmark" 737 | ] 738 | }, 739 | { 740 | "cell_type": "code", 741 | "execution_count": 41, 742 | "metadata": { 743 | "collapsed": false 744 | }, 745 | "outputs": [], 746 | "source": [ 747 | "outcome_list = [-1,0,1]\n", 748 | "y_pred = []\n", 749 | "for i in xrange(1,381):\n", 750 | " y_pred.append(random.choice(outcome_list))\n", 751 | " " 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": 42, 757 | "metadata": { 758 | "collapsed": false 759 | }, 760 | "outputs": [ 761 | { 762 | "data": { 763 | "text/plain": [ 764 | "0.32631578947368423" 765 | ] 766 | }, 767 | "execution_count": 42, 768 | "metadata": {}, 769 | "output_type": "execute_result" 770 | } 771 | ], 772 | "source": [ 773 | "accuracy_score(y_train,y_pred)" 774 | ] 775 | }, 776 | { 777 | "cell_type": "markdown", 778 | "metadata": {}, 779 | "source": [ 780 | "With just a random model, we get an accuracy of 33% on an average which is expected since there are three outcomes to any fixture. 
" 781 | ] 782 | }, 783 | { 784 | "cell_type": "markdown", 785 | "metadata": {}, 786 | "source": [ 787 | "### Classifiers" 788 | ] 789 | }, 790 | { 791 | "cell_type": "code", 792 | "execution_count": 87, 793 | "metadata": { 794 | "collapsed": false 795 | }, 796 | "outputs": [ 797 | { 798 | "data": { 799 | "text/plain": [ 800 | "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", 801 | " metric_params=None, n_jobs=1, n_neighbors=15, p=2,\n", 802 | " weights='uniform')" 803 | ] 804 | }, 805 | "execution_count": 87, 806 | "metadata": {}, 807 | "output_type": "execute_result" 808 | } 809 | ], 810 | "source": [ 811 | "clf1 = DecisionTreeClassifier()\n", 812 | "clf2 = XGBClassifier()\n", 813 | "clf3 = KNeighborsClassifier(n_neighbors=15)\n", 814 | "clf3.fit(X_train,y_train)" 815 | ] 816 | }, 817 | { 818 | "cell_type": "code", 819 | "execution_count": 88, 820 | "metadata": { 821 | "collapsed": false 822 | }, 823 | "outputs": [ 824 | { 825 | "data": { 826 | "text/plain": [ 827 | "0.54736842105263162" 828 | ] 829 | }, 830 | "execution_count": 88, 831 | "metadata": {}, 832 | "output_type": "execute_result" 833 | } 834 | ], 835 | "source": [ 836 | "y_pred = clf3.predict(X_train)\n", 837 | "accuracy_score(y_pred,y_train)" 838 | ] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": null, 843 | "metadata": { 844 | "collapsed": true 845 | }, 846 | "outputs": [], 847 | "source": [] 848 | }, 849 | { 850 | "cell_type": "code", 851 | "execution_count": null, 852 | "metadata": { 853 | "collapsed": true 854 | }, 855 | "outputs": [], 856 | "source": [] 857 | } 858 | ], 859 | "metadata": { 860 | "kernelspec": { 861 | "display_name": "Python [Root]", 862 | "language": "python", 863 | "name": "Python [Root]" 864 | }, 865 | "language_info": { 866 | "codemirror_mode": { 867 | "name": "ipython", 868 | "version": 2 869 | }, 870 | "file_extension": ".py", 871 | "mimetype": "text/x-python", 872 | "name": "python", 873 | "nbconvert_exporter": "python", 874 | "pygments_lexer": "ipython2", 875 | "version": "2.7.12" 876 | } 877 | }, 878 | "nbformat": 4, 879 | "nbformat_minor": 0 880 | } 881 | -------------------------------------------------------------------------------- /Data Wrangling/data_wrangling_xml/data_wrangling_xml/sliderule_dsi_xml_exercise.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# XML example and exercise\n", 8 | "****\n", 9 | "+ study examples of accessing nodes in XML tree structure \n", 10 | "+ work on exercise to be completed and submitted\n", 11 | "****\n", 12 | "+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html\n", 13 | "+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial\n", 14 | "****" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "from xml.etree import ElementTree as ET" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## XML example\n", 33 | "\n", 34 | "+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": { 41 | "collapsed": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "document_tree = ET.parse( './data/mondial_database_less.xml' )" 46 | ] 47 | }, 48 
| { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": { 52 | "collapsed": false 53 | }, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "Albania\n", 60 | "Greece\n", 61 | "Macedonia\n", 62 | "Serbia\n", 63 | "Montenegro\n", 64 | "Kosovo\n", 65 | "Andorra\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "# print names of all countries\n", 71 | "for child in document_tree.getroot():\n", 72 | " print child.find('name').text" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [ 82 | { 83 | "name": "stdout", 84 | "output_type": "stream", 85 | "text": [ 86 | "* Albania: Tirana, Shkodër, Durrës, Vlorë, Elbasan, Korçë\n", 87 | "* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes\n", 88 | "* Macedonia: Skopje, Kumanovo\n", 89 | "* Serbia: Beograd, Novi Sad, Niš\n", 90 | "* Montenegro: Podgorica\n", 91 | "* Kosovo: Prishtine\n", 92 | "* Andorra: Andorra la Vella\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "# print names of all countries and their cities\n", 98 | "for element in document_tree.iterfind('country'):\n", 99 | " print '* ' + element.find('name').text + ':',\n", 100 | " capitals_string = ''\n", 101 | " for subelement in element.getiterator('city'):\n", 102 | " capitals_string += subelement.find('name').text + ', '\n", 103 | " print capitals_string[:-2]" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "****\n", 111 | "## XML exercise\n", 112 | "\n", 113 | "Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find\n", 114 | "\n", 115 | "1. 10 countries with the lowest infant mortality rates\n", 116 | "2. 10 cities with the largest population\n", 117 | "3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)\n", 118 | "4. name and country of a) longest river, b) largest lake and c) airport at highest elevation" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 6, 124 | "metadata": { 125 | "collapsed": false 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "document = ET.parse( './data/mondial_database.xml' )" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 7, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "#1. 10 countries with the lowest infant mortality rates\n", 141 | "country_im = []\n", 142 | "for country in document.iterfind('country'):\n", 143 | " if country.find('infant_mortality') is not None:\n", 144 | " country_im.append([country.find('name').text,country.find('infant_mortality').text])\n", 145 | " " 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 8, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "import pandas as pd\n", 157 | "im = pd.DataFrame(country_im)\n", 158 | "im.columns = [\"country\",\"infant_moratality\"]" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 9, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [ 168 | { 169 | "data": { 170 | "text/html": [ 171 | "
\n", 172 | "\n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | "
countryinfant_moratality
36Monaco1.81
90Japan2.13
109Bermuda2.48
34Norway2.48
98Singapore2.53
35Sweden2.60
8Czech Republic2.63
72Hong Kong2.73
73Macao3.13
39Iceland3.15
\n", 233 | "
" 234 | ], 235 | "text/plain": [ 236 | " country infant_moratality\n", 237 | "36 Monaco 1.81\n", 238 | "90 Japan 2.13\n", 239 | "109 Bermuda 2.48\n", 240 | "34 Norway 2.48\n", 241 | "98 Singapore 2.53\n", 242 | "35 Sweden 2.60\n", 243 | "8 Czech Republic 2.63\n", 244 | "72 Hong Kong 2.73\n", 245 | "73 Macao 3.13\n", 246 | "39 Iceland 3.15" 247 | ] 248 | }, 249 | "execution_count": 9, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "im.infant_moratality = im.infant_moratality.astype(float)\n", 256 | "im = im.sort_values(by='infant_moratality')\n", 257 | "im.head(10)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 10, 263 | "metadata": { 264 | "collapsed": true 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "#2. 10 cities with the largest population\n", 269 | "populations = []\n", 270 | "for country in document.iterfind('country'):\n", 271 | " if country.find('population') is not None:\n", 272 | " populations.append([country.find('name').text,country.find('population').text])\n", 273 | " " 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 11, 279 | "metadata": { 280 | "collapsed": false 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "pop = pd.DataFrame(populations)\n", 285 | "pop.columns = [\"country\",\"population\"]" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 12, 291 | "metadata": { 292 | "collapsed": false 293 | }, 294 | "outputs": [ 295 | { 296 | "data": { 297 | "text/html": [ 298 | "
\n", 299 | "\n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | "
countrypopulation
166Pitcairn68.0
83Cocos Islands628.0
41Holy See840.0
121Cayman Islands933.0
138Sint Maarten1497.0
170Tokelau1570.0
39Gibraltar1816.0
186Falkland Islands2043.0
159Nauru2066.0
52Svalbard2116.0
\n", 360 | "
" 361 | ], 362 | "text/plain": [ 363 | " country population\n", 364 | "166 Pitcairn 68.0\n", 365 | "83 Cocos Islands 628.0\n", 366 | "41 Holy See 840.0\n", 367 | "121 Cayman Islands 933.0\n", 368 | "138 Sint Maarten 1497.0\n", 369 | "170 Tokelau 1570.0\n", 370 | "39 Gibraltar 1816.0\n", 371 | "186 Falkland Islands 2043.0\n", 372 | "159 Nauru 2066.0\n", 373 | "52 Svalbard 2116.0" 374 | ] 375 | }, 376 | "execution_count": 12, 377 | "metadata": {}, 378 | "output_type": "execute_result" 379 | } 380 | ], 381 | "source": [ 382 | "pop.population = pop.population.astype(float)\n", 383 | "pop = pop.sort_values(by = \"population\")\n", 384 | "pop.head(10)" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 13, 390 | "metadata": { 391 | "collapsed": false 392 | }, 393 | "outputs": [ 394 | { 395 | "data": { 396 | "text/html": [ 397 | "
\n", 398 | "\n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | "
countriescountry_percentagecountry_popethnicitypopulationyear
0Albania95.02800138Albanian26601312011
1Albania3.02800138Greek840042011
2Greece93.010816286Greek100591452011
3Macedonia64.22059794Macedonian13223872011
4Macedonia25.22059794Albanian5190682011
5Macedonia3.92059794Turkish803312011
6Macedonia2.72059794Gypsy556142011
7Macedonia1.82059794Serb370762011
8Serbia82.97120666Serb59030322011
9Serbia0.97120666Montenegrin640852011
\n", 503 | "
" 504 | ], 505 | "text/plain": [ 506 | " countries country_percentage country_pop ethnicity population year\n", 507 | "0 Albania 95.0 2800138 Albanian 2660131 2011\n", 508 | "1 Albania 3.0 2800138 Greek 84004 2011\n", 509 | "2 Greece 93.0 10816286 Greek 10059145 2011\n", 510 | "3 Macedonia 64.2 2059794 Macedonian 1322387 2011\n", 511 | "4 Macedonia 25.2 2059794 Albanian 519068 2011\n", 512 | "5 Macedonia 3.9 2059794 Turkish 80331 2011\n", 513 | "6 Macedonia 2.7 2059794 Gypsy 55614 2011\n", 514 | "7 Macedonia 1.8 2059794 Serb 37076 2011\n", 515 | "8 Serbia 82.9 7120666 Serb 5903032 2011\n", 516 | "9 Serbia 0.9 7120666 Montenegrin 64085 2011" 517 | ] 518 | }, 519 | "execution_count": 13, 520 | "metadata": {}, 521 | "output_type": "execute_result" 522 | } 523 | ], 524 | "source": [ 525 | "#3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)\n", 526 | "data = [] \n", 527 | "\n", 528 | "for country in document.findall('country'):\n", 529 | " for node in list(country):\n", 530 | " if node.tag == 'name':\n", 531 | " co = node.text\n", 532 | " elif node.tag == 'population':\n", 533 | " # the last listed population statistic is used\n", 534 | " pop = int(node.text)\n", 535 | " #meas = node.attrib['measured'] --leads to an error, potentially unpopulated at times\n", 536 | " yr = int(node.attrib['year'])\n", 537 | " elif node.tag == 'ethnicgroup':\n", 538 | " eth = node.text\n", 539 | " perc = float(node.attrib['percentage'])\n", 540 | " epop = int(pop * perc / 100.)\n", 541 | " \n", 542 | " data.append({'countries':co, 'country_pop':pop, 'year':yr,\n", 543 | " 'ethnicity':eth, 'country_percentage':perc, 'population':epop})\n", 544 | " \n", 545 | "df = pd.DataFrame(data)\n", 546 | "df.head(10)" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": 15, 552 | "metadata": { 553 | "collapsed": false 554 | }, 555 | "outputs": [ 556 | { 557 | "data": { 558 | "text/html": [ 559 | "
\n", 560 | "\n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | "
ethnicitypopulation
0Han Chinese1245058800
1Indo-Aryan871815583
2European494872201
3African318325104
4Dravidian302713744
5Mestizo157734349
6Bengali146776916
7Russian131856989
8Japanese126534212
9Malay121993548
\n", 621 | "
" 622 | ], 623 | "text/plain": [ 624 | " ethnicity population\n", 625 | "0 Han Chinese 1245058800\n", 626 | "1 Indo-Aryan 871815583\n", 627 | "2 European 494872201\n", 628 | "3 African 318325104\n", 629 | "4 Dravidian 302713744\n", 630 | "5 Mestizo 157734349\n", 631 | "6 Bengali 146776916\n", 632 | "7 Russian 131856989\n", 633 | "8 Japanese 126534212\n", 634 | "9 Malay 121993548" 635 | ] 636 | }, 637 | "execution_count": 15, 638 | "metadata": {}, 639 | "output_type": "execute_result" 640 | } 641 | ], 642 | "source": [ 643 | "df.groupby('ethnicity').population.sum().sort_values(ascending=False).head(10).reset_index()" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": 14, 649 | "metadata": { 650 | "collapsed": true 651 | }, 652 | "outputs": [], 653 | "source": [ 654 | "#4. name and country of a) longest river " 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 17, 660 | "metadata": { 661 | "collapsed": false 662 | }, 663 | "outputs": [ 664 | { 665 | "data": { 666 | "text/html": [ 667 | "
\n", 668 | "\n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | "
countrylengthname
161CO6448Amazonas
\n", 686 | "
" 687 | ], 688 | "text/plain": [ 689 | " country length name\n", 690 | "161 CO 6448 Amazonas" 691 | ] 692 | }, 693 | "execution_count": 17, 694 | "metadata": {}, 695 | "output_type": "execute_result" 696 | } 697 | ], 698 | "source": [ 699 | "rivers_list=[]\n", 700 | "rivers_df = pd.DataFrame()\n", 701 | "for rivers in document.iterfind('river'):\n", 702 | " try:\n", 703 | " rivers_list.append({'name':rivers.find('name').text, 'length':int(rivers.find('length').text), 'country':rivers.find('located').attrib['country']})\n", 704 | " except:\n", 705 | " next\n", 706 | "rivers_df = pd.DataFrame(rivers_list)\n", 707 | "rivers_df.sort_values(by = 'length', ascending=False).head(1)" 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": null, 713 | "metadata": { 714 | "collapsed": true 715 | }, 716 | "outputs": [], 717 | "source": [ 718 | "#b) largest lake" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": 19, 724 | "metadata": { 725 | "collapsed": false 726 | }, 727 | "outputs": [ 728 | { 729 | "data": { 730 | "text/html": [ 731 | "
\n", 732 | "\n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | "
areacountryname
42386400RCaspian Sea
\n", 750 | "
" 751 | ], 752 | "text/plain": [ 753 | " area country name\n", 754 | "42 386400 R Caspian Sea" 755 | ] 756 | }, 757 | "execution_count": 19, 758 | "metadata": {}, 759 | "output_type": "execute_result" 760 | } 761 | ], 762 | "source": [ 763 | "lake_list=[]\n", 764 | "lake_df = pd.DataFrame()\n", 765 | "for lakes in document.iterfind('lake'):\n", 766 | " try:\n", 767 | " lake_list.append({'name':lakes.find('name').text, 'area':int(lakes.find('area').text), 'country':lakes.find('located').attrib['country']})\n", 768 | " except:\n", 769 | " next\n", 770 | "lakes_df = pd.DataFrame(lake_list)\n", 771 | "lakes_df.sort_values(by = 'area', ascending=False).head(1)" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": null, 777 | "metadata": { 778 | "collapsed": true 779 | }, 780 | "outputs": [], 781 | "source": [ 782 | "#c) airport at highest elevation" 783 | ] 784 | }, 785 | { 786 | "cell_type": "code", 787 | "execution_count": 20, 788 | "metadata": { 789 | "collapsed": false 790 | }, 791 | "outputs": [ 792 | { 793 | "name": "stderr", 794 | "output_type": "stream", 795 | "text": [ 796 | "/home/sibi/acad/prog_tools/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:9: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)\n" 797 | ] 798 | }, 799 | { 800 | "data": { 801 | "text/html": [ 802 | "
\n", 803 | "\n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | "
countryelevationname
80BOL4063El Alto Intl
\n", 821 | "
" 822 | ], 823 | "text/plain": [ 824 | " country elevation name\n", 825 | "80 BOL 4063 El Alto Intl" 826 | ] 827 | }, 828 | "execution_count": 20, 829 | "metadata": {}, 830 | "output_type": "execute_result" 831 | } 832 | ], 833 | "source": [ 834 | "ap_list=[]\n", 835 | "ap_df = pd.DataFrame()\n", 836 | "for ap in document.iterfind('airport'):\n", 837 | " try:\n", 838 | " ap_list.append({'name':ap.find('name').text, 'elevation':int(ap.find('elevation').text), 'country':ap.attrib['country']})\n", 839 | " except:\n", 840 | " next\n", 841 | "ap_df = pd.DataFrame(ap_list)\n", 842 | "ap_df.sort('elevation', ascending=False).head(1)\n" 843 | ] 844 | }, 845 | { 846 | "cell_type": "code", 847 | "execution_count": null, 848 | "metadata": { 849 | "collapsed": true 850 | }, 851 | "outputs": [], 852 | "source": [] 853 | } 854 | ], 855 | "metadata": { 856 | "kernelspec": { 857 | "display_name": "Python [Root]", 858 | "language": "python", 859 | "name": "Python [Root]" 860 | }, 861 | "language_info": { 862 | "codemirror_mode": { 863 | "name": "ipython", 864 | "version": 2 865 | }, 866 | "file_extension": ".py", 867 | "mimetype": "text/x-python", 868 | "name": "python", 869 | "nbconvert_exporter": "python", 870 | "pygments_lexer": "ipython2", 871 | "version": "2.7.12" 872 | } 873 | }, 874 | "nbformat": 4, 875 | "nbformat_minor": 0 876 | } 877 | -------------------------------------------------------------------------------- /Data Wrangling/data_wrangling_xml/data_wrangling_xml/.ipynb_checkpoints/sliderule_dsi_xml_exercise-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# XML example and exercise\n", 8 | "****\n", 9 | "+ study examples of accessing nodes in XML tree structure \n", 10 | "+ work on exercise to be completed and submitted\n", 11 | "****\n", 12 | "+ reference: https://docs.python.org/2.7/library/xml.etree.elementtree.html\n", 13 | "+ data source: http://www.dbis.informatik.uni-goettingen.de/Mondial\n", 14 | "****" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": { 21 | "collapsed": true 22 | }, 23 | "outputs": [], 24 | "source": [ 25 | "from xml.etree import ElementTree as ET" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## XML example\n", 33 | "\n", 34 | "+ for details about tree traversal and iterators, see https://docs.python.org/2.7/library/xml.etree.elementtree.html" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": { 41 | "collapsed": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "document_tree = ET.parse( './data/mondial_database_less.xml' )" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": { 52 | "collapsed": false 53 | }, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "Albania\n", 60 | "Greece\n", 61 | "Macedonia\n", 62 | "Serbia\n", 63 | "Montenegro\n", 64 | "Kosovo\n", 65 | "Andorra\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "# print names of all countries\n", 71 | "for child in document_tree.getroot():\n", 72 | " print child.find('name').text" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 5, 78 | "metadata": { 79 | "collapsed": false 80 | }, 81 | "outputs": [ 82 | { 83 | "name": "stdout", 84 | "output_type": "stream", 85 | "text": [ 86 | "* Albania: Tirana, Shkodër, Durrës, 
Vlorë, Elbasan, Korçë\n", 87 | "* Greece: Komotini, Kavala, Athina, Peiraias, Peristeri, Acharnes, Patra, Kozani, Kerkyra, Ioannina, Thessaloniki, Iraklio, Chania, Ermoupoli, Rhodes, Tripoli, Lamia, Chalkida, Larissa, Volos, Mytilini, Karyes\n", 88 | "* Macedonia: Skopje, Kumanovo\n", 89 | "* Serbia: Beograd, Novi Sad, Niš\n", 90 | "* Montenegro: Podgorica\n", 91 | "* Kosovo: Prishtine\n", 92 | "* Andorra: Andorra la Vella\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "# print names of all countries and their cities\n", 98 | "for element in document_tree.iterfind('country'):\n", 99 | " print '* ' + element.find('name').text + ':',\n", 100 | " capitals_string = ''\n", 101 | " for subelement in element.getiterator('city'):\n", 102 | " capitals_string += subelement.find('name').text + ', '\n", 103 | " print capitals_string[:-2]" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "****\n", 111 | "## XML exercise\n", 112 | "\n", 113 | "Using data in 'data/mondial_database.xml', the examples above, and refering to https://docs.python.org/2.7/library/xml.etree.elementtree.html, find\n", 114 | "\n", 115 | "1. 10 countries with the lowest infant mortality rates\n", 116 | "2. 10 cities with the largest population\n", 117 | "3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)\n", 118 | "4. name and country of a) longest river, b) largest lake and c) airport at highest elevation" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 6, 124 | "metadata": { 125 | "collapsed": false 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "document = ET.parse( './data/mondial_database.xml' )" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 7, 135 | "metadata": { 136 | "collapsed": false 137 | }, 138 | "outputs": [], 139 | "source": [ 140 | "#1. 10 countries with the lowest infant mortality rates\n", 141 | "country_im = []\n", 142 | "for country in document.iterfind('country'):\n", 143 | " if country.find('infant_mortality') is not None:\n", 144 | " country_im.append([country.find('name').text,country.find('infant_mortality').text])\n", 145 | " " 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 8, 151 | "metadata": { 152 | "collapsed": false 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "import pandas as pd\n", 157 | "im = pd.DataFrame(country_im)\n", 158 | "im.columns = [\"country\",\"infant_moratality\"]" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 9, 164 | "metadata": { 165 | "collapsed": false 166 | }, 167 | "outputs": [ 168 | { 169 | "data": { 170 | "text/html": [ 171 | "
\n", 172 | "\n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | "
countryinfant_moratality
36Monaco1.81
90Japan2.13
109Bermuda2.48
34Norway2.48
98Singapore2.53
35Sweden2.60
8Czech Republic2.63
72Hong Kong2.73
73Macao3.13
39Iceland3.15
\n", 233 | "
" 234 | ], 235 | "text/plain": [ 236 | " country infant_moratality\n", 237 | "36 Monaco 1.81\n", 238 | "90 Japan 2.13\n", 239 | "109 Bermuda 2.48\n", 240 | "34 Norway 2.48\n", 241 | "98 Singapore 2.53\n", 242 | "35 Sweden 2.60\n", 243 | "8 Czech Republic 2.63\n", 244 | "72 Hong Kong 2.73\n", 245 | "73 Macao 3.13\n", 246 | "39 Iceland 3.15" 247 | ] 248 | }, 249 | "execution_count": 9, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "im.infant_moratality = im.infant_moratality.astype(float)\n", 256 | "im = im.sort_values(by='infant_moratality')\n", 257 | "im.head(10)" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 10, 263 | "metadata": { 264 | "collapsed": true 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "#2. 10 cities with the largest population\n", 269 | "populations = []\n", 270 | "for country in document.iterfind('country'):\n", 271 | " if country.find('population') is not None:\n", 272 | " populations.append([country.find('name').text,country.find('population').text])\n", 273 | " " 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 11, 279 | "metadata": { 280 | "collapsed": false 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "pop = pd.DataFrame(populations)\n", 285 | "pop.columns = [\"country\",\"population\"]" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 12, 291 | "metadata": { 292 | "collapsed": false 293 | }, 294 | "outputs": [ 295 | { 296 | "data": { 297 | "text/html": [ 298 | "
\n", 299 | "\n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | "
countrypopulation
166Pitcairn68.0
83Cocos Islands628.0
41Holy See840.0
121Cayman Islands933.0
138Sint Maarten1497.0
170Tokelau1570.0
39Gibraltar1816.0
186Falkland Islands2043.0
159Nauru2066.0
52Svalbard2116.0
\n", 360 | "
" 361 | ], 362 | "text/plain": [ 363 | " country population\n", 364 | "166 Pitcairn 68.0\n", 365 | "83 Cocos Islands 628.0\n", 366 | "41 Holy See 840.0\n", 367 | "121 Cayman Islands 933.0\n", 368 | "138 Sint Maarten 1497.0\n", 369 | "170 Tokelau 1570.0\n", 370 | "39 Gibraltar 1816.0\n", 371 | "186 Falkland Islands 2043.0\n", 372 | "159 Nauru 2066.0\n", 373 | "52 Svalbard 2116.0" 374 | ] 375 | }, 376 | "execution_count": 12, 377 | "metadata": {}, 378 | "output_type": "execute_result" 379 | } 380 | ], 381 | "source": [ 382 | "pop.population = pop.population.astype(float)\n", 383 | "pop = pop.sort_values(by = \"population\")\n", 384 | "pop.head(10)" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 13, 390 | "metadata": { 391 | "collapsed": false 392 | }, 393 | "outputs": [ 394 | { 395 | "data": { 396 | "text/html": [ 397 | "
\n", 398 | "\n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | "
countriescountry_percentagecountry_popethnicitypopulationyear
0Albania95.02800138Albanian26601312011
1Albania3.02800138Greek840042011
2Greece93.010816286Greek100591452011
3Macedonia64.22059794Macedonian13223872011
4Macedonia25.22059794Albanian5190682011
5Macedonia3.92059794Turkish803312011
6Macedonia2.72059794Gypsy556142011
7Macedonia1.82059794Serb370762011
8Serbia82.97120666Serb59030322011
9Serbia0.97120666Montenegrin640852011
\n", 503 | "
" 504 | ], 505 | "text/plain": [ 506 | " countries country_percentage country_pop ethnicity population year\n", 507 | "0 Albania 95.0 2800138 Albanian 2660131 2011\n", 508 | "1 Albania 3.0 2800138 Greek 84004 2011\n", 509 | "2 Greece 93.0 10816286 Greek 10059145 2011\n", 510 | "3 Macedonia 64.2 2059794 Macedonian 1322387 2011\n", 511 | "4 Macedonia 25.2 2059794 Albanian 519068 2011\n", 512 | "5 Macedonia 3.9 2059794 Turkish 80331 2011\n", 513 | "6 Macedonia 2.7 2059794 Gypsy 55614 2011\n", 514 | "7 Macedonia 1.8 2059794 Serb 37076 2011\n", 515 | "8 Serbia 82.9 7120666 Serb 5903032 2011\n", 516 | "9 Serbia 0.9 7120666 Montenegrin 64085 2011" 517 | ] 518 | }, 519 | "execution_count": 13, 520 | "metadata": {}, 521 | "output_type": "execute_result" 522 | } 523 | ], 524 | "source": [ 525 | "#3. 10 ethnic groups with the largest overall populations (sum of best/latest estimates over all countries)\n", 526 | "data = [] \n", 527 | "\n", 528 | "for country in document.findall('country'):\n", 529 | " for node in list(country):\n", 530 | " if node.tag == 'name':\n", 531 | " co = node.text\n", 532 | " elif node.tag == 'population':\n", 533 | " # the last listed population statistic is used\n", 534 | " pop = int(node.text)\n", 535 | " #meas = node.attrib['measured'] --leads to an error, potentially unpopulated at times\n", 536 | " yr = int(node.attrib['year'])\n", 537 | " elif node.tag == 'ethnicgroup':\n", 538 | " eth = node.text\n", 539 | " perc = float(node.attrib['percentage'])\n", 540 | " epop = int(pop * perc / 100.)\n", 541 | " \n", 542 | " data.append({'countries':co, 'country_pop':pop, 'year':yr,\n", 543 | " 'ethnicity':eth, 'country_percentage':perc, 'population':epop})\n", 544 | " \n", 545 | "df = pd.DataFrame(data)\n", 546 | "df.head(10)" 547 | ] 548 | }, 549 | { 550 | "cell_type": "code", 551 | "execution_count": 15, 552 | "metadata": { 553 | "collapsed": false 554 | }, 555 | "outputs": [ 556 | { 557 | "data": { 558 | "text/html": [ 559 | "
\n", 560 | "\n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | "
ethnicitypopulation
0Han Chinese1245058800
1Indo-Aryan871815583
2European494872201
3African318325104
4Dravidian302713744
5Mestizo157734349
6Bengali146776916
7Russian131856989
8Japanese126534212
9Malay121993548
\n", 621 | "
" 622 | ], 623 | "text/plain": [ 624 | " ethnicity population\n", 625 | "0 Han Chinese 1245058800\n", 626 | "1 Indo-Aryan 871815583\n", 627 | "2 European 494872201\n", 628 | "3 African 318325104\n", 629 | "4 Dravidian 302713744\n", 630 | "5 Mestizo 157734349\n", 631 | "6 Bengali 146776916\n", 632 | "7 Russian 131856989\n", 633 | "8 Japanese 126534212\n", 634 | "9 Malay 121993548" 635 | ] 636 | }, 637 | "execution_count": 15, 638 | "metadata": {}, 639 | "output_type": "execute_result" 640 | } 641 | ], 642 | "source": [ 643 | "df.groupby('ethnicity').population.sum().sort_values(ascending=False).head(10).reset_index()" 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": 14, 649 | "metadata": { 650 | "collapsed": true 651 | }, 652 | "outputs": [], 653 | "source": [ 654 | "#4. name and country of a) longest river " 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 17, 660 | "metadata": { 661 | "collapsed": false 662 | }, 663 | "outputs": [ 664 | { 665 | "data": { 666 | "text/html": [ 667 | "
\n", 668 | "\n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | "
countrylengthname
161CO6448Amazonas
\n", 686 | "
" 687 | ], 688 | "text/plain": [ 689 | " country length name\n", 690 | "161 CO 6448 Amazonas" 691 | ] 692 | }, 693 | "execution_count": 17, 694 | "metadata": {}, 695 | "output_type": "execute_result" 696 | } 697 | ], 698 | "source": [ 699 | "rivers_list=[]\n", 700 | "rivers_df = pd.DataFrame()\n", 701 | "for rivers in document.iterfind('river'):\n", 702 | " try:\n", 703 | " rivers_list.append({'name':rivers.find('name').text, 'length':int(rivers.find('length').text), 'country':rivers.find('located').attrib['country']})\n", 704 | " except:\n", 705 | " next\n", 706 | "rivers_df = pd.DataFrame(rivers_list)\n", 707 | "rivers_df.sort_values(by = 'length', ascending=False).head(1)" 708 | ] 709 | }, 710 | { 711 | "cell_type": "code", 712 | "execution_count": null, 713 | "metadata": { 714 | "collapsed": true 715 | }, 716 | "outputs": [], 717 | "source": [ 718 | "#b) largest lake" 719 | ] 720 | }, 721 | { 722 | "cell_type": "code", 723 | "execution_count": 19, 724 | "metadata": { 725 | "collapsed": false 726 | }, 727 | "outputs": [ 728 | { 729 | "data": { 730 | "text/html": [ 731 | "
\n", 732 | "\n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | "
areacountryname
42386400RCaspian Sea
\n", 750 | "
" 751 | ], 752 | "text/plain": [ 753 | " area country name\n", 754 | "42 386400 R Caspian Sea" 755 | ] 756 | }, 757 | "execution_count": 19, 758 | "metadata": {}, 759 | "output_type": "execute_result" 760 | } 761 | ], 762 | "source": [ 763 | "lake_list=[]\n", 764 | "lake_df = pd.DataFrame()\n", 765 | "for lakes in document.iterfind('lake'):\n", 766 | " try:\n", 767 | " lake_list.append({'name':lakes.find('name').text, 'area':int(lakes.find('area').text), 'country':lakes.find('located').attrib['country']})\n", 768 | " except:\n", 769 | " next\n", 770 | "lakes_df = pd.DataFrame(lake_list)\n", 771 | "lakes_df.sort_values(by = 'area', ascending=False).head(1)" 772 | ] 773 | }, 774 | { 775 | "cell_type": "code", 776 | "execution_count": null, 777 | "metadata": { 778 | "collapsed": true 779 | }, 780 | "outputs": [], 781 | "source": [ 782 | "#c) airport at highest elevation" 783 | ] 784 | }, 785 | { 786 | "cell_type": "code", 787 | "execution_count": 20, 788 | "metadata": { 789 | "collapsed": false 790 | }, 791 | "outputs": [ 792 | { 793 | "name": "stderr", 794 | "output_type": "stream", 795 | "text": [ 796 | "/home/sibi/acad/prog_tools/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py:9: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)\n" 797 | ] 798 | }, 799 | { 800 | "data": { 801 | "text/html": [ 802 | "
\n", 803 | "\n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | "
countryelevationname
80BOL4063El Alto Intl
\n", 821 | "
" 822 | ], 823 | "text/plain": [ 824 | " country elevation name\n", 825 | "80 BOL 4063 El Alto Intl" 826 | ] 827 | }, 828 | "execution_count": 20, 829 | "metadata": {}, 830 | "output_type": "execute_result" 831 | } 832 | ], 833 | "source": [ 834 | "ap_list=[]\n", 835 | "ap_df = pd.DataFrame()\n", 836 | "for ap in document.iterfind('airport'):\n", 837 | " try:\n", 838 | " ap_list.append({'name':ap.find('name').text, 'elevation':int(ap.find('elevation').text), 'country':ap.attrib['country']})\n", 839 | " except:\n", 840 | " next\n", 841 | "ap_df = pd.DataFrame(ap_list)\n", 842 | "ap_df.sort('elevation', ascending=False).head(1)\n" 843 | ] 844 | }, 845 | { 846 | "cell_type": "code", 847 | "execution_count": null, 848 | "metadata": { 849 | "collapsed": true 850 | }, 851 | "outputs": [], 852 | "source": [] 853 | } 854 | ], 855 | "metadata": { 856 | "kernelspec": { 857 | "display_name": "Python [Root]", 858 | "language": "python", 859 | "name": "Python [Root]" 860 | }, 861 | "language_info": { 862 | "codemirror_mode": { 863 | "name": "ipython", 864 | "version": 2 865 | }, 866 | "file_extension": ".py", 867 | "mimetype": "text/x-python", 868 | "name": "python", 869 | "nbconvert_exporter": "python", 870 | "pygments_lexer": "ipython2", 871 | "version": "2.7.12" 872 | } 873 | }, 874 | "nbformat": 4, 875 | "nbformat_minor": 0 876 | } 877 | -------------------------------------------------------------------------------- /Capstone Project/.ipynb_checkpoints/Classification_Adding_Shots-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import pandas as pd\n", 12 | "import numpy as np\n", 13 | "import scipy.stats as scipy" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 3, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "df = pd.read_csv(\"./Data/E0_13.csv\")\n", 25 | "df_14 = pd.read_csv(\"./Data/E0_14.csv\")" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 4, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [ 35 | { 36 | "data": { 37 | "text/plain": [ 38 | "Index([u'Div', u'Date', u'HomeTeam', u'AwayTeam', u'FTHG', u'FTAG', u'FTR',\n", 39 | " u'HTHG', u'HTAG', u'HTR', u'Referee', u'HS', u'AS', u'HST', u'AST',\n", 40 | " u'HF', u'AF', u'HC', u'AC', u'HY', u'AY', u'HR', u'AR', u'B365H',\n", 41 | " u'B365D', u'B365A', u'BWH', u'BWD', u'BWA', u'IWH', u'IWD', u'IWA',\n", 42 | " u'LBH', u'LBD', u'LBA', u'PSH', u'PSD', u'PSA', u'WHH', u'WHD', u'WHA',\n", 43 | " u'SJH', u'SJD', u'SJA', u'VCH', u'VCD', u'VCA', u'Bb1X2', u'BbMxH',\n", 44 | " u'BbAvH', u'BbMxD', u'BbAvD', u'BbMxA', u'BbAvA', u'BbOU', u'BbMx>2.5',\n", 45 | " u'BbAv>2.5', u'BbMx<2.5', u'BbAv<2.5', u'BbAH', u'BbAHh', u'BbMxAHH',\n", 46 | " u'BbAvAHH', u'BbMxAHA', u'BbAvAHA', u'PSCH', u'PSCD', u'PSCA'],\n", 47 | " dtype='object')" 48 | ] 49 | }, 50 | "execution_count": 4, 51 | "metadata": {}, 52 | "output_type": "execute_result" 53 | } 54 | ], 55 | "source": [ 56 | "df.columns" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 5, 62 | "metadata": { 63 | "collapsed": false 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "res_13 = df.ix[:,:23]\n", 68 | "res_13 = res_13.drop(['Div','Date','Referee'],axis=1)\n", 69 | "res_14 = df_14.ix[:,:23]\n", 70 | "res_14 = res_14.drop(['Div','Date','Referee'],axis=1)\n", 71 
| "table_features = df.ix[:,:7]\n", 72 | "table_features = table_features.drop(['FTHG','FTAG','Div','Date'],axis=1)\n", 73 | "bet_13 = df.ix[:,23:]\n" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 6, 79 | "metadata": { 80 | "collapsed": false, 81 | "scrolled": true 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "res_13.head()\n", 86 | "feature_table = df.ix[:,:23]" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 7, 92 | "metadata": { 93 | "collapsed": false 94 | }, 95 | "outputs": [], 96 | "source": [ 97 | "#Team, Home Goals Score, Away Goals Score, Attack Strength, Home Goals Conceded, Away Goals Conceded, Defensive Strength\n", 98 | "table_13 = pd.DataFrame(columns=('Team','HGS','AGS','HAS','AAS','HGC','AGC','HDS','ADS'))" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 8, 104 | "metadata": { 105 | "collapsed": false 106 | }, 107 | "outputs": [ 108 | { 109 | "name": "stdout", 110 | "output_type": "stream", 111 | "text": [ 112 | "Average number of goals at home 1.57368421053\n", 113 | "Average number of goals away 1.19473684211\n", 114 | "Average number of goals conceded at home 1.57368421053\n", 115 | "Average number of goals conceded away 1.19473684211\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "avg_home_scored_13 = res_13.FTHG.sum() / 380.0\n", 121 | "avg_away_scored_13 = res_13.FTAG.sum() / 380.0\n", 122 | "avg_home_conceded_13 = avg_away_scored_13\n", 123 | "avg_away_conceded_13 = avg_home_scored_13\n", 124 | "print \"Average number of goals at home\",avg_home_scored_13\n", 125 | "print \"Average number of goals away\", avg_away_scored_13\n", 126 | "print \"Average number of goals conceded at home\",avg_away_conceded_13\n", 127 | "print \"Average number of goals conceded away\",avg_home_conceded_13\n" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 9, 133 | "metadata": { 134 | "collapsed": false 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "res_home = res_13.groupby('HomeTeam')\n", 139 | "res_away = res_13.groupby('AwayTeam')" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 10, 145 | "metadata": { 146 | "collapsed": false 147 | }, 148 | "outputs": [ 149 | { 150 | "data": { 151 | "text/plain": [ 152 | "('Arsenal',\n", 153 | " HomeTeam AwayTeam FTHG FTAG FTR HTHG HTAG HTR HS AS HST \\\n", 154 | " 0 Arsenal Aston Villa 1 3 A 1 1 D 16 9 4 \n", 155 | " 27 Arsenal Tottenham 1 0 H 1 0 H 12 14 5 \n", 156 | " 46 Arsenal Stoke 3 1 H 2 1 H 16 9 8 \n", 157 | " 70 Arsenal Norwich 4 1 H 1 0 H 20 12 11 \n", 158 | " 90 Arsenal Liverpool 2 0 H 1 0 H 12 12 7 \n", 159 | " 110 Arsenal Southampton 2 0 H 1 0 H 9 10 4 \n", 160 | " 131 Arsenal Hull 2 0 H 1 0 H 20 7 7 \n", 161 | " 147 Arsenal Everton 1 1 D 0 0 D 11 12 5 \n", 162 | " 169 Arsenal Chelsea 0 0 D 0 0 D 7 13 2 \n", 163 | " 190 Arsenal Cardiff 2 0 H 0 0 D 28 8 6 \n", 164 | " 210 Arsenal Fulham 2 0 H 0 0 D 22 8 8 \n", 165 | " 237 Arsenal Crystal Palace 2 0 H 0 0 D 11 10 6 \n", 166 | " 254 Arsenal Man United 0 0 D 0 0 D 17 6 5 \n", 167 | " 258 Arsenal Sunderland 4 1 H 3 0 H 12 7 9 \n", 168 | " 301 Arsenal Swansea 2 2 D 0 1 A 13 8 4 \n", 169 | " 306 Arsenal Man City 1 1 D 0 1 A 10 15 3 \n", 170 | " 334 Arsenal West Ham 3 1 H 1 1 D 14 12 8 \n", 171 | " 356 Arsenal Newcastle 3 0 H 2 0 H 20 8 8 \n", 172 | " 364 Arsenal West Brom 1 0 H 1 0 H 15 11 4 \n", 173 | " \n", 174 | " AST HF AF HC AC HY AY HR AR \n", 175 | " 0 4 15 18 4 3 4 5 1 0 \n", 176 | " 27 4 15 14 3 6 2 2 0 0 \n", 177 | " 46 3 8 15 6 7 0 
2 0 0 \n", 178 | " 70 6 8 7 10 1 0 0 0 0 \n", 179 | " 90 4 11 7 3 5 2 1 0 0 \n", 180 | " 110 4 10 14 5 6 0 3 0 0 \n", 181 | " 131 2 9 6 11 1 0 0 0 0 \n", 182 | " 147 4 13 11 3 2 0 4 0 0 \n", 183 | " 169 4 7 11 8 6 2 1 0 0 \n", 184 | " 190 2 6 11 12 2 1 2 0 0 \n", 185 | " 210 2 5 7 4 4 0 0 0 0 \n", 186 | " 237 2 9 14 6 5 1 2 0 0 \n", 187 | " 254 2 10 14 5 5 1 2 0 0 \n", 188 | " 258 3 10 11 7 3 0 1 0 0 \n", 189 | " 301 2 11 12 7 0 0 1 0 0 \n", 190 | " 306 4 8 11 6 6 1 4 0 0 \n", 191 | " 334 2 14 12 4 3 2 2 0 0 \n", 192 | " 356 3 9 8 14 0 3 2 0 0 \n", 193 | " 364 1 9 6 9 10 2 2 0 0 )" 194 | ] 195 | }, 196 | "execution_count": 10, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | } 200 | ], 201 | "source": [ 202 | "list(res_home)[0]" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 11, 208 | "metadata": { 209 | "collapsed": false 210 | }, 211 | "outputs": [ 212 | { 213 | "data": { 214 | "text/html": [ 215 | "
\n", 216 | "\n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | "
TeamHGSAGSHASAASHGCAGCHDSADS
0Arsenal3632NaNNaN1130NaNNaN
1Aston Villa2217NaNNaN2932NaNNaN
2Cardiff2012NaNNaN3539NaNNaN
3Chelsea4328NaNNaN1116NaNNaN
4Crystal Palace1815NaNNaN2325NaNNaN
\n", 294 | "
" 295 | ], 296 | "text/plain": [ 297 | " Team HGS AGS HAS AAS HGC AGC HDS ADS\n", 298 | "0 Arsenal 36 32 NaN NaN 11 30 NaN NaN\n", 299 | "1 Aston Villa 22 17 NaN NaN 29 32 NaN NaN\n", 300 | "2 Cardiff 20 12 NaN NaN 35 39 NaN NaN\n", 301 | "3 Chelsea 43 28 NaN NaN 11 16 NaN NaN\n", 302 | "4 Crystal Palace 18 15 NaN NaN 23 25 NaN NaN" 303 | ] 304 | }, 305 | "execution_count": 11, 306 | "metadata": {}, 307 | "output_type": "execute_result" 308 | } 309 | ], 310 | "source": [ 311 | "table_13.Team = res_home.HomeTeam.all().values\n", 312 | "table_13.HGS = res_home.FTHG.sum().values\n", 313 | "table_13.HGC = res_home.FTAG.sum().values\n", 314 | "table_13.AGS = res_away.FTAG.sum().values\n", 315 | "table_13.AGC = res_away.FTHG.sum().values\n", 316 | "table_13.head()" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 12, 322 | "metadata": { 323 | "collapsed": false 324 | }, 325 | "outputs": [ 326 | { 327 | "data": { 328 | "text/html": [ 329 | "
\n", 330 | "\n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | "
TeamHGSAGSHASAASHGCAGCHDSADS
0Arsenal36321.2040131.40969211300.4845811.003344
1Aston Villa22170.7357860.74889929321.2775331.070234
2Cardiff20120.6688960.52863435391.5418501.304348
3Chelsea43281.4381271.23348011160.4845810.535117
4Crystal Palace18150.6020070.66079323251.0132160.836120
\n", 408 | "
" 409 | ], 410 | "text/plain": [ 411 | " Team HGS AGS HAS AAS HGC AGC HDS ADS\n", 412 | "0 Arsenal 36 32 1.204013 1.409692 11 30 0.484581 1.003344\n", 413 | "1 Aston Villa 22 17 0.735786 0.748899 29 32 1.277533 1.070234\n", 414 | "2 Cardiff 20 12 0.668896 0.528634 35 39 1.541850 1.304348\n", 415 | "3 Chelsea 43 28 1.438127 1.233480 11 16 0.484581 0.535117\n", 416 | "4 Crystal Palace 18 15 0.602007 0.660793 23 25 1.013216 0.836120" 417 | ] 418 | }, 419 | "execution_count": 12, 420 | "metadata": {}, 421 | "output_type": "execute_result" 422 | } 423 | ], 424 | "source": [ 425 | "table_13.HAS = (table_13.HGS / 19.0) / avg_home_scored_13\n", 426 | "table_13.AAS = (table_13.AGS / 19.0) / avg_away_scored_13\n", 427 | "table_13.HDS = (table_13.HGC / 19.0) / avg_home_conceded_13\n", 428 | "table_13.ADS = (table_13.AGC / 19.0) / avg_away_conceded_13\n", 429 | "table_13.head()" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": 15, 435 | "metadata": { 436 | "collapsed": false 437 | }, 438 | "outputs": [], 439 | "source": [ 440 | "feature_table = feature_table[['HomeTeam','AwayTeam','FTR','HST','AST']]\n", 441 | "f_HAS = []\n", 442 | "f_HDS = []\n", 443 | "f_AAS = []\n", 444 | "f_ADS = []\n", 445 | "for index,row in feature_table.iterrows():\n", 446 | " f_HAS.append(table_13[table_13['Team'] == row['HomeTeam']]['HAS'].values[0])\n", 447 | " f_HDS.append(table_13[table_13['Team'] == row['HomeTeam']]['HDS'].values[0])\n", 448 | " f_AAS.append(table_13[table_13['Team'] == row['HomeTeam']]['AAS'].values[0])\n", 449 | " f_ADS.append(table_13[table_13['Team'] == row['HomeTeam']]['ADS'].values[0])\n", 450 | " \n", 451 | "feature_table['HAS'] = f_HAS\n", 452 | "feature_table['HDS'] = f_HDS\n", 453 | "feature_table['AAS'] = f_AAS\n", 454 | "feature_table['ADS'] = f_ADS" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 16, 460 | "metadata": { 461 | "collapsed": false 462 | }, 463 | "outputs": [ 464 | { 465 | "data": { 466 | "text/html": [ 467 | "
\n", 468 | "\n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | "
HomeTeamAwayTeamFTRHSTASTHASHDSAASADS
0ArsenalAston VillaA441.2040130.4845811.4096921.003344
1LiverpoolStokeH1141.7725750.7929522.1145371.070234
2NorwichEvertonD260.5685620.7929520.4845811.471572
3SunderlandFulhamA310.7023411.1894270.8810571.103679
4SwanseaMan UnitedA671.1036791.1453740.9251100.936455
\n", 546 | "
" 547 | ], 548 | "text/plain": [ 549 | " HomeTeam AwayTeam FTR HST AST HAS HDS AAS \\\n", 550 | "0 Arsenal Aston Villa A 4 4 1.204013 0.484581 1.409692 \n", 551 | "1 Liverpool Stoke H 11 4 1.772575 0.792952 2.114537 \n", 552 | "2 Norwich Everton D 2 6 0.568562 0.792952 0.484581 \n", 553 | "3 Sunderland Fulham A 3 1 0.702341 1.189427 0.881057 \n", 554 | "4 Swansea Man United A 6 7 1.103679 1.145374 0.925110 \n", 555 | "\n", 556 | " ADS \n", 557 | "0 1.003344 \n", 558 | "1 1.070234 \n", 559 | "2 1.471572 \n", 560 | "3 1.103679 \n", 561 | "4 0.936455 " 562 | ] 563 | }, 564 | "execution_count": 16, 565 | "metadata": {}, 566 | "output_type": "execute_result" 567 | } 568 | ], 569 | "source": [ 570 | "feature_table.head()" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 17, 576 | "metadata": { 577 | "collapsed": true 578 | }, 579 | "outputs": [], 580 | "source": [ 581 | "def transformResult(row):\n", 582 | " if(row.FTR == 'H'):\n", 583 | " return 1\n", 584 | " elif(row.FTR == 'A'):\n", 585 | " return -1\n", 586 | " else:\n", 587 | " return 0" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": 18, 593 | "metadata": { 594 | "collapsed": false 595 | }, 596 | "outputs": [], 597 | "source": [ 598 | "feature_table[\"Result\"] = feature_table.apply(lambda row: transformResult(row),axis=1)" 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": 19, 604 | "metadata": { 605 | "collapsed": false 606 | }, 607 | "outputs": [ 608 | { 609 | "data": { 610 | "text/html": [ 611 | "
\n", 612 | "\n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | "
HomeTeamAwayTeamFTRHSTASTHASHDSAASADSResult
0ArsenalAston VillaA441.2040130.4845811.4096921.003344-1
1LiverpoolStokeH1141.7725750.7929522.1145371.0702341
2NorwichEvertonD260.5685620.7929520.4845811.4715720
3SunderlandFulhamA310.7023411.1894270.8810571.103679-1
4SwanseaMan UnitedA671.1036791.1453740.9251100.936455-1
\n", 696 | "
" 697 | ], 698 | "text/plain": [ 699 | " HomeTeam AwayTeam FTR HST AST HAS HDS AAS \\\n", 700 | "0 Arsenal Aston Villa A 4 4 1.204013 0.484581 1.409692 \n", 701 | "1 Liverpool Stoke H 11 4 1.772575 0.792952 2.114537 \n", 702 | "2 Norwich Everton D 2 6 0.568562 0.792952 0.484581 \n", 703 | "3 Sunderland Fulham A 3 1 0.702341 1.189427 0.881057 \n", 704 | "4 Swansea Man United A 6 7 1.103679 1.145374 0.925110 \n", 705 | "\n", 706 | " ADS Result \n", 707 | "0 1.003344 -1 \n", 708 | "1 1.070234 1 \n", 709 | "2 1.471572 0 \n", 710 | "3 1.103679 -1 \n", 711 | "4 0.936455 -1 " 712 | ] 713 | }, 714 | "execution_count": 19, 715 | "metadata": {}, 716 | "output_type": "execute_result" 717 | } 718 | ], 719 | "source": [ 720 | "feature_table.head()" 721 | ] 722 | }, 723 | { 724 | "cell_type": "code", 725 | "execution_count": 21, 726 | "metadata": { 727 | "collapsed": true 728 | }, 729 | "outputs": [], 730 | "source": [ 731 | "X_train = feature_table[['HST','AST','HAS','HDS','AAS','ADS']]\n", 732 | "y_train = feature_table['Result']" 733 | ] 734 | }, 735 | { 736 | "cell_type": "code", 737 | "execution_count": 27, 738 | "metadata": { 739 | "collapsed": false 740 | }, 741 | "outputs": [], 742 | "source": [ 743 | "from sklearn.tree import DecisionTreeClassifier\n", 744 | "from sklearn.naive_bayes import MultinomialNB\n", 745 | "from xgboost import XGBClassifier\n", 746 | "from sklearn.metrics import accuracy_score\n", 747 | "from sklearn.model_selection import cross_val_score\n" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": 43, 753 | "metadata": { 754 | "collapsed": false 755 | }, 756 | "outputs": [ 757 | { 758 | "data": { 759 | "text/plain": [ 760 | "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n", 761 | " max_features=None, max_leaf_nodes=None,\n", 762 | " min_impurity_split=1e-07, min_samples_leaf=1,\n", 763 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", 764 | " presort=False, random_state=None, splitter='best')" 765 | ] 766 | }, 767 | "execution_count": 43, 768 | "metadata": {}, 769 | "output_type": "execute_result" 770 | } 771 | ], 772 | "source": [ 773 | "clf = DecisionTreeClassifier()\n", 774 | "clf2 = MultinomialNB()\n", 775 | "clf3 = XGBClassifier()\n", 776 | "clf.fit(X_train,y_train)" 777 | ] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "execution_count": 45, 782 | "metadata": { 783 | "collapsed": false 784 | }, 785 | "outputs": [], 786 | "source": [ 787 | "# y_pred = clf3.predict(X_train)\n", 788 | "accuracy_score(y_pred,y_train)\n", 789 | "scores = cross_val_score(clf2, X_train, y_train, cv=10)\n" 790 | ] 791 | }, 792 | { 793 | "cell_type": "code", 794 | "execution_count": 46, 795 | "metadata": { 796 | "collapsed": false 797 | }, 798 | "outputs": [ 799 | { 800 | "name": "stdout", 801 | "output_type": "stream", 802 | "text": [ 803 | "[ 0.56410256 0.64102564 0.56410256 0.57894737 0.65789474 0.65789474\n", 804 | " 0.65789474 0.65789474 0.54054054 0.75 ]\n", 805 | "0.627029762556\n" 806 | ] 807 | } 808 | ], 809 | "source": [ 810 | "print scores\n", 811 | "print scores.mean()" 812 | ] 813 | } 814 | ], 815 | "metadata": { 816 | "kernelspec": { 817 | "display_name": "Python [Root]", 818 | "language": "python", 819 | "name": "Python [Root]" 820 | }, 821 | "language_info": { 822 | "codemirror_mode": { 823 | "name": "ipython", 824 | "version": 2 825 | }, 826 | "file_extension": ".py", 827 | "mimetype": "text/x-python", 828 | "name": "python", 829 | "nbconvert_exporter": "python", 830 | "pygments_lexer": "ipython2", 831 | 
"version": "2.7.12" 832 | } 833 | }, 834 | "nbformat": 4, 835 | "nbformat_minor": 0 836 | } 837 | --------------------------------------------------------------------------------