├── 0.jpg ├── Additional Resources ├── 7.1 Storytelling Resources │ ├── 03-EDA.pdf │ ├── 06-StoryTelling.pdf │ ├── choosing-a-good-chart-09.pdf │ └── dummy ├── Apache Spark.pdf ├── List of Resources ├── Statistics and Machine Learning in Python.pdf ├── causal_inference_tutorial.pptx └── profile_pic_jpeg.jpg ├── DataCamp_Notes ├── Advanced NLP with Spacy.txt ├── Analyzing Police Activity with pandas.txt ├── Chapter 10 - Machine Learning │ ├── Sec 10.1 - Linear - Logistic Regression │ │ └── XX. Supervised Learning with Scikit-Learn │ └── YYY. Machine Learning with the Experts: School Budgets ├── Chapter 10 │ ├── Sec 10.1 - Linear - Logistic Regression │ │ ├── ch1_SupervisedLearning.pdf │ │ ├── ch2_IntroToRegression.pdf │ │ ├── ch3_FineTuningModel.pdf │ │ └── ch4_PreProcessingData.pdf │ └── Sec 10.5 - Introduction to Unsupervised Learning │ │ └── Notes │ │ ├── ZZ. Unsupervised Learning in Python │ │ ├── ch1_slides.pdf │ │ ├── ch2_slides.pdf │ │ ├── ch3_slides.pdf │ │ └── ch4_slides.pdf ├── Chapter 12 │ └── Sec 12.2 Fundamentals of NLP │ │ └── Notes │ │ ├── ZXY. Building Chatbots in Python │ │ └── ZZ. NLP in Python ├── Chapter 16 Data Science at Scale │ └── Class: Introduction to PySpark │ │ └── Intro to PySpark.txt ├── Chapter 4 │ ├── Sec 4.1 - Python for Data Science │ │ ├── 2. intermed python for data science │ │ ├── 3. data types for data science │ │ ├── 4. python datascience toolbox part 1 │ │ └── 5. Introduction to Data Visualization with Python │ └── Sec 4.2 Coding Practices │ │ └── Writing Functions in Python.txt ├── Chapter 5 │ ├── Sec 5.1 - Data Wrangling with Pandas │ │ ├── 6. pandas Foundations │ │ ├── 6. pandas Foundations - Slides │ │ │ ├── ch1_slides.pdf │ │ │ ├── ch2_slides.pdf │ │ │ ├── ch3_slides.pdf │ │ │ ├── ch4_slides.pdf │ │ │ └── txt │ │ ├── 7. Manipulating DataFrames with Pandas │ │ ├── 8. Merging DataFrames with pandas │ │ └── 9. Cleaning Data in Python │ └── Sec 5.2 - Working with Data in Files │ │ ├── 10. Python Data Science Toolbox (Part 2) │ │ ├── 11. Importing Data in Python (Part 1) │ │ └── 12. 
Importing Data in Python (Part 2) ├── Chapter 8 - Inferential Statistics │ ├── Customer Analytics & AB Testing in Python.txt │ ├── Experimental Design in Python.txt │ ├── Preparing for Statistics Interview Questions in Python.txt │ └── Sec 8.2 - Inferential Statistics Using Python │ │ ├── Statistical Thinking in Python (Part 1) │ │ └── Statistical Thinking in Python (Part 2) ├── Creating Robust Python Workflows.txt ├── Designing Machine Learning Workflows in Python ├── Dimensionality Reduction in Python.txt ├── Feature Engineering for MachineLearning in Python.txt ├── Interactive Data Visualization with Bokeh.txt ├── Intermediate SQL.txt ├── Introduction to Data Visualization with Seaborn.txt ├── Introduction to Databases in Python ├── Introduction to Git for Data Science.txt ├── Introduction to Tensorflow ├── Introduction to Time Series Analysis in Python.txt ├── Linear Classifiers in Python.txt ├── Machine Learning for Marketing in Python.txt ├── Object-Oriented Programming in Python.txt ├── Optimizing Python Code with pandas.txt ├── Software Engineering for Data Scientists in Python.txt ├── Visualizing Time Series Data in Python.txt ├── Working with Dates and Times in Python.txt └── Writing Efficient Python Code.txt ├── MBazeley_Resume_Aug2020_v2.docx ├── MBazeley_Resume_Aug2020_v2.pdf ├── README.md ├── _config.yml ├── about.md ├── capstone1 ├── BazeleyMikiko_Capstone1_Springboard_V2.pdf ├── BazeleyMikiko_Capstone1_Springboard_V2.pptx ├── Capstone Project 1_ Final Report (V2).pdf ├── Capstone1-FinalVersion-BazeleyMikiko-Springboard.ipynb ├── Capstone1-InDepthAnalysis.ipynb ├── IntroCall Scoring - Data Storytelling.ipynb ├── Project Documents │ ├── Apply Data Storytelling.pdf │ ├── Apply Inferential Statistics.pdf │ ├── BazeleyMikiko_Capstone1_Springboard_2019March.pdf │ ├── BazeleyMikiko_Capstone1_Springboard_2019March.pptx │ ├── Capstone 1 - In-Depth Analysis - MBazeley.pdf │ ├── Capstone Project 1_ Data Storytelling - Google Docs.pdf │ ├── Capstone Project 1_ Exploratory Data Analysis.pdf │ ├── Capstone Project 1_ Milestone Report.pdf │ ├── Data Story Rubric_ Capstone 1.pdf │ ├── Data Wrangling Rubric_ Capstone 1.pdf │ ├── Milestone Report Rubric_ Capstone 1 .pdf │ ├── Project Capstone Project 1 Data Wrangling.pdf │ ├── Project Capstone Project 1 Milestone Report.pdf │ ├── _ DSC Capstone Project 1 Rubric_ Inferential Statistics.pdf │ └── dumy.txt ├── Project_ Capstone Project 1_ Project Proposal.docx ├── Springboard Project Capstone 1 - Project Ideas - Google Docs.pdf └── WalkMe - IntroCall Scoring - Exploratory Data Analysis.ipynb ├── capstone2 ├── Capstone 2 - Kickstarter - Final Submission.ipynb ├── Capstone 2_ Final Summary.pdf ├── Capstone 2_ Milestone 1 Report.pdf ├── Capstone 2_ Milestone 2 Report.pdf ├── Capstone2_Slides.pdf ├── Capstone2_Slides.pptx └── dummy ├── mini-projects ├── Ch 10 Machine Learning │ ├── Ch 10.1 Linear - Logistic Regression │ │ ├── BazeleyMikiko_MiniProject_LinearRegression_2018Dec29.ipynb │ │ ├── DSC Mini-Project_Linear Regression Rubric.pdf │ │ └── images │ │ │ ├── conditionalmean.png │ │ │ ├── cs109gitflow3.png │ │ │ ├── dummy.txt │ │ │ └── shuttle.png │ ├── Ch 10.1 Logistic Regression │ │ ├── BazeleyM_MiniProject_LogisticRegression_2018Dec30.ipynb │ │ ├── data │ │ │ ├── 01_heights_weights_genders.csv │ │ │ └── dummy │ │ ├── dummy │ │ └── images │ │ │ ├── .DS_Store │ │ │ ├── bias.png │ │ │ ├── complexity-error-plot.png │ │ │ ├── complexity-error-reg.png │ │ │ ├── data.png │ │ │ ├── dummy │ │ │ ├── knn1.png │ │ │ ├── knn2.png │ │ │ ├── linreg.png │ │ │ 
├── linsep.png │ │ │ ├── onelinesplit.png │ │ │ ├── pcanim.gif │ │ │ ├── reshape.jpg │ │ │ ├── sklearn2.jpg │ │ │ ├── sklearntrans.jpg │ │ │ ├── train-cv2.png │ │ │ ├── train-cv3.png │ │ │ ├── train-test.png │ │ │ ├── train-validate-test-cont.png │ │ │ ├── train-validate-test.png │ │ │ └── train-validate-test3.png │ ├── Ch 10.3 Bayesian Methods and Text Data │ │ ├── MiniProject_NaiveBayes_BazeleyMikiko_2019Jan6.ipynb │ │ ├── callibration.png │ │ ├── critics.csv │ │ ├── terms.png │ │ ├── terms2.png │ │ └── vsm.png │ └── Ch 10.5 Introduction to Unsupervised Learning │ │ ├── .ipynb_checkpoints │ │ └── dummy │ │ ├── BazeleyM_MiniProject_Clustering_2019Jan27.ipynb │ │ ├── WineKMC.xlsx │ │ └── dummy ├── Ch 16 Spark and PySpark │ ├── Springboard - BazeleyMikiko - 2019Feb13 - Spark DF, SQL, ML Exercise.html │ ├── Springboard - BazeleyMikiko - 2019Feb13 - Spark DF, SQL, ML Exercise.ipynb │ └── dummy.txt ├── Ch 5 - Data Wrangling │ ├── 5.2 Working with Data │ │ └── JSON Exervise │ │ │ ├── BazeleyM_JSON_Exercise.ipynb │ │ │ ├── data │ │ │ ├── dummy │ │ │ ├── world_bank_projects.json │ │ │ └── world_bank_projects_less.json │ │ │ └── world_bank_data.csv │ ├── 5.3 SQL Practice │ │ ├── BazeleyMikiko_Sec5_3_sql_project_2018Nov24.sql │ │ ├── DSC Mini-Project_ SQL Rubric.docx │ │ ├── Database Info │ │ │ ├── bookings_table_constraints.PNG │ │ │ ├── bookings_table_data.PNG │ │ │ ├── countryClub_bookings.csv │ │ │ ├── countryClub_facilities.csv │ │ │ ├── countryClub_members.csv │ │ │ ├── country_club_db.PNG │ │ │ ├── countryclub.db │ │ │ ├── facilities_table_constraints.PNG │ │ │ ├── facilities_table_data.PNG │ │ │ ├── members_table_constraints.PNG │ │ │ └── members_table_data.PNG │ │ └── Project Info │ │ │ ├── Answers │ │ │ ├── Intro │ │ │ ├── Project Requirements & Questions │ │ │ ├── Resources │ │ │ └── dummy │ └── 5.4 API │ │ ├── BazeleyMikiko_MiniProject_DataWrangling_API.ipynb │ │ └── dummy.txt ├── Ch 8 Inferential Statistics │ └── Ch 8.3 Exploratory Data Analysis │ │ ├── EDA_HumanTemp │ │ ├── EDA_human_temperature_BazeleyMikiko-checkpoint.ipynb │ │ ├── MiniProject_EDA_HumanBodyTemp_BazeleyM_2019Jan01-checkpoint.ipynb │ │ ├── MiniProject_EDA_HumanBodyTemp_BazeleyM_2019Jan01.ipynb │ │ ├── dummy.tx │ │ └── human_body_temperature.csv │ │ ├── Racial Discrimination │ │ ├── BazeleyMikiko_MiniProject_RacialDiscrimination-checkpoint.ipynb │ │ ├── BazeleyMikiko_MiniProject_RacialDiscrimination.ipynb │ │ ├── dummy.txt │ │ ├── sliderule_dsi_inferential_statistics_exercise_1_solutions-checkpoint.ipynb │ │ ├── sliderule_dsi_inferential_statistics_exercise_2_solutions-checkpoint.ipynb │ │ └── us_job_market_discrimination.dta │ │ ├── Reduce Hospital Readmissions │ │ ├── BazeleyMikiko_MiniProject_ReduceHospitalReadmissions-checkpoint.ipynb │ │ ├── BazeleyMikiko_MiniProject_ReduceHospitalReadmissions.ipynb │ │ ├── cms_hospital_readmissions.csv │ │ ├── dummy.txt │ │ ├── sliderule_dsi_inferential_statistics_exercise_1-checkpoint.ipynb │ │ └── sliderule_dsi_inferential_statistics_exercise_2-checkpoint.ipynb │ │ └── dummy.txt └── dummy └── small_logo.png /0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/0.jpg -------------------------------------------------------------------------------- /Additional Resources/7.1 Storytelling Resources/03-EDA.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/Additional Resources/7.1 Storytelling Resources/03-EDA.pdf -------------------------------------------------------------------------------- /Additional Resources/7.1 Storytelling Resources/06-StoryTelling.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/Additional Resources/7.1 Storytelling Resources/06-StoryTelling.pdf -------------------------------------------------------------------------------- /Additional Resources/7.1 Storytelling Resources/choosing-a-good-chart-09.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/Additional Resources/7.1 Storytelling Resources/choosing-a-good-chart-09.pdf -------------------------------------------------------------------------------- /Additional Resources/7.1 Storytelling Resources/dummy: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Additional Resources/Apache Spark.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/Additional Resources/Apache Spark.pdf -------------------------------------------------------------------------------- /Additional Resources/List of Resources: -------------------------------------------------------------------------------- 1 | Chapter 7 - Data Storytelling 2 | Sec 7.1 - Storytelling Resources 3 | - Choosing the Right Chart: https://extremepresentation.typepad.com/files/choosing-a-good-chart-09.pdf 4 | - Visualizing Economics: 5 | - https://www.kickstarter.com/projects/visualizingeconomics/visualizingeconomics-an-infographic-zine 6 | - http://visualizingeconomics.com/book/ 7 | - MTA Data/Report: http://cs109hubway.github.io/classp/ 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /Additional Resources/Statistics and Machine Learning in Python.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/Additional Resources/Statistics and Machine Learning in Python.pdf -------------------------------------------------------------------------------- /Additional Resources/causal_inference_tutorial.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/Additional Resources/causal_inference_tutorial.pptx -------------------------------------------------------------------------------- /Additional Resources/profile_pic_jpeg.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/Additional Resources/profile_pic_jpeg.jpg -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 10/Sec 
10.1 - Linear - Logistic Regression/ch1_SupervisedLearning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/DataCamp_Notes/Chapter 10/Sec 10.1 - Linear - Logistic Regression/ch1_SupervisedLearning.pdf -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 10/Sec 10.1 - Linear - Logistic Regression/ch2_IntroToRegression.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/DataCamp_Notes/Chapter 10/Sec 10.1 - Linear - Logistic Regression/ch2_IntroToRegression.pdf -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 10/Sec 10.1 - Linear - Logistic Regression/ch3_FineTuningModel.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/DataCamp_Notes/Chapter 10/Sec 10.1 - Linear - Logistic Regression/ch3_FineTuningModel.pdf -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 10/Sec 10.1 - Linear - Logistic Regression/ch4_PreProcessingData.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/DataCamp_Notes/Chapter 10/Sec 10.1 - Linear - Logistic Regression/ch4_PreProcessingData.pdf -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 10/Sec 10.5 - Introduction to Unsupervised Learning/Notes/ch1_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/DataCamp_Notes/Chapter 10/Sec 10.5 - Introduction to Unsupervised Learning/Notes/ch1_slides.pdf -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 10/Sec 10.5 - Introduction to Unsupervised Learning/Notes/ch2_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/DataCamp_Notes/Chapter 10/Sec 10.5 - Introduction to Unsupervised Learning/Notes/ch2_slides.pdf -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 10/Sec 10.5 - Introduction to Unsupervised Learning/Notes/ch3_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/DataCamp_Notes/Chapter 10/Sec 10.5 - Introduction to Unsupervised Learning/Notes/ch3_slides.pdf -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 10/Sec 10.5 - Introduction to Unsupervised Learning/Notes/ch4_slides.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/DataCamp_Notes/Chapter 10/Sec 10.5 - Introduction to Unsupervised Learning/Notes/ch4_slides.pdf -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 5/Sec 5.1 - Data Wrangling with Pandas/6. pandas Foundations - Slides/ch1_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/DataCamp_Notes/Chapter 5/Sec 5.1 - Data Wrangling with Pandas/6. pandas Foundations - Slides/ch1_slides.pdf -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 5/Sec 5.1 - Data Wrangling with Pandas/6. pandas Foundations - Slides/ch2_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/DataCamp_Notes/Chapter 5/Sec 5.1 - Data Wrangling with Pandas/6. pandas Foundations - Slides/ch2_slides.pdf -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 5/Sec 5.1 - Data Wrangling with Pandas/6. pandas Foundations - Slides/ch3_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/DataCamp_Notes/Chapter 5/Sec 5.1 - Data Wrangling with Pandas/6. pandas Foundations - Slides/ch3_slides.pdf -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 5/Sec 5.1 - Data Wrangling with Pandas/6. pandas Foundations - Slides/ch4_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/DataCamp_Notes/Chapter 5/Sec 5.1 - Data Wrangling with Pandas/6. pandas Foundations - Slides/ch4_slides.pdf -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 5/Sec 5.1 - Data Wrangling with Pandas/6. pandas Foundations - Slides/txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /DataCamp_Notes/Dimensionality Reduction in Python.txt: -------------------------------------------------------------------------------- 1 | Dimensionality Reduction in Python 2 | 3 | Course Description 4 | High-dimensional datasets can be overwhelming and leave you not knowing where to start. Typically, you’d visually explore a new dataset first, but when you have too many dimensions the classical approaches will seem insufficient. Fortunately, there are visualization techniques designed specifically for high dimensional data and you’ll be introduced to these in this course. After exploring the data, you’ll often find that many features hold little information because they don’t show any variance or because they are duplicates of other features. You’ll learn how to detect these features and drop them from the dataset so that you can focus on the informative ones. 
In a next step, you might want to build a model on these features, and it may turn out that some don’t have any effect on the thing you’re trying to predict. You’ll learn how to detect and drop these irrelevant features too, in order to reduce dimensionality and thus complexity. Finally, you’ll learn how feature extraction techniques can reduce dimensionality for you through the calculation of uncorrelated principal components. 5 | 6 | <=========================================================================================================================================> 7 | 1 8 | Exploring high dimensional data 9 | FREE 10 | 0% 11 | You'll be introduced to the concept of dimensionality reduction and will learn when an why this is important. You'll learn the difference between feature selection and feature extraction and will apply both techniques for data exploration. The chapter ends with a lesson on t-SNE, a powerful feature extraction technique that will allow you to visualize a high-dimensional dataset. 12 | 13 | <----------------------------------------------------------------------------------------------------------------------------------------> 14 | 15 | Fitting t-SNE to the ANSUR data 16 | t-SNE is a great technique for visual exploration of high dimensional datasets. In this exercise, you'll apply it to the ANSUR dataset. You'll remove non-numeric columns from the pre-loaded dataset df and fit TSNE to his numeric dataset. 17 | 18 | Instructions 19 | 100 XP 20 | Drop the non-numeric columns from the dataset. 21 | Create a TSNE model with learning rate 50. 22 | Fit and transform the model on the numeric dataset. 23 | 24 | Take Hint (-30 XP) 25 | 26 | 27 | # Non-numerical columns in the dataset 28 | non_numeric = ['Branch', 'Gender', 'Component'] 29 | 30 | # Drop the non-numerical columns from df 31 | df_numeric = df.drop(non_numeric, axis=1) 32 | 33 | # Create a t-SNE model with learning rate 50 34 | m = TSNE(learning_rate=50) 35 | 36 | # Fit and transform the t-SNE model on the numeric dataset 37 | tsne_features = m.fit_transform(df_numeric) 38 | print(tsne_features.shape) 39 | 40 | +100 XP 41 | Good job! t-SNE reduced the more than 90 features in the dataset to just 2 which you can now plot. 42 | 43 | 44 | <----------------------------------------------------------------------------------------------------------------------------------------> 45 | 46 | t-SNE visualisation of dimensionality 47 | Time to look at the results of your hard work. In this exercise, you will visualize the output of t-SNE dimensionality reduction on the combined male and female Ansur dataset. You'll create 3 scatterplots of the 2 t-SNE features ('x' and 'y') which were added to the dataset df. In each scatterplot you'll color the points according to a different categorical variable. 48 | 49 | seaborn has already been imported as sns and matplotlib.pyplot as plt. 50 | 51 | Instructions 1/3 52 | 35 XP 53 | 1 54 | Use seaborn's sns.scatterplot to create the plot. 55 | Color the points by 'Component'. 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | <=========================================================================================================================================> 64 | 65 | 2 66 | Feature selection I, selecting for feature information 67 | 0% 68 | In this first out of two chapters on feature selection, you'll learn about the curse of dimensionality and how dimensionality reduction can help you overcome it. 
You'll be introduced to a number of techniques to detect and remove features that bring little added value to the dataset. Either because they have little variance, too many missing values, or because they are strongly correlated to other features. 69 | 70 | 71 | <----------------------------------------------------------------------------------------------------------------------------------------> 72 | 73 | Train - test split 74 | In this chapter, you will keep working with the ANSUR dataset. Before you can build a model on your dataset, you should first decide on which feature you want to predict. In this case, you're trying to predict gender. 75 | 76 | You need to extract the column holding this feature from the dataset and then split the data into a training and test set. The training set will be used to train the model and the test set will be used to check its performance on unseen data. 77 | 78 | ansur_df has been pre-loaded for you. 79 | 80 | Instructions 81 | 100 XP 82 | Import the train_test_split function from sklearn.model_selection. 83 | Assign the 'Gender' column to y. 84 | Remove the 'Gender' column from the dataframe and assign the result to X. 85 | Set the test size to 30% to perform a 70% train and 30% test data split. 86 | 87 | 88 | # Import train_test_split() 89 | from sklearn.model_selection import train_test_split 90 | 91 | # Select the Gender column as the feature to be predicted (y) 92 | y = ansur_df['Gender'] 93 | 94 | # Remove the Gender column to create the training data 95 | X = ansur_df.drop('Gender', axis=1) 96 | 97 | # Perform a 70% train and 30% test data split 98 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 99 | 100 | print("{} rows in test set vs. {} in training set. {} Features.".format(X_test.shape[0], X_train.shape[0], X_test.shape[1])) 101 | 102 | <----------------------------------------------------------------------------------------------------------------------------------------> 103 | 104 | Fitting and testing the model 105 | In the previous exercise, you split the dataset into X_train, X_test, y_train, and y_test. These datasets have been pre-loaded for you. You'll now create a support vector machine classifier model (SVC()) and fit that to the training data. You'll then calculate the accuracy on both the test and training set to detect overfitting. 106 | 107 | Instructions 108 | 100 XP 109 | Import SVC from sklearn.svm and accuracy_score from sklearn.metrics 110 | Create an instance of the Support Vector Classification class (SVC()). 111 | Fit the model to the training data. 112 | Calculate accuracy scores on both train and test data. 113 | 114 | 115 | # Import SVC from sklearn.svm and accuracy_score from sklearn.metrics 116 | from sklearn.svm import SVC 117 | from sklearn.metrics import accuracy_score 118 | 119 | # Create an instance of the Support Vector Classification class 120 | svc = SVC() 121 | 122 | # Fit the model to the training data 123 | svc.fit(X_train, y_train) 124 | 125 | # Calculate accuracy scores on both train and test data 126 | accuracy_train = accuracy_score(y_train, svc.predict(X_train)) 127 | accuracy_test = accuracy_score(y_test, svc.predict(X_test)) 128 | 129 | print("{0:.1%} accuracy on test set vs. {1:.1%} on training set".format(accuracy_test, accuracy_train)) 130 | 131 | 132 | output: 133 | 49.7% accuracy on test set vs. 
100.0% on training set 134 | 135 | <----------------------------------------------------------------------------------------------------------------------------------------> 136 | 137 | Accuracy after dimensionality reduction 138 | You'll reduce the overfit with the help of dimensionality reduction. In this case, you'll apply a rather drastic form of dimensionality reduction by only selecting a single column that has some good information to distinguish between genders. You'll repeat the train-test split, model fit and prediction steps to compare the accuracy on test vs. training data. 139 | 140 | All relevant packages and y have been pre-loaded. 141 | 142 | Instructions 143 | 100 XP 144 | Select just the neck circumference ('neckcircumferencebase') column from ansur_df. 145 | Split the data, instantiate a classifier and fit the data. This has been done for you. 146 | Once again calculate the accuracy scores on both training and test set. 147 | 148 | # Assign just the 'neckcircumferencebase' column from ansur_df to X 149 | X = ansur_df[['neckcircumferencebase']] 150 | 151 | # Split the data, instantiate a classifier and fit the data 152 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 153 | svc = SVC() 154 | svc.fit(X_train, y_train) 155 | 156 | # Calculate accuracy scores on both train and test data 157 | accuracy_train = accuracy_score(y_train, svc.predict(X_train)) 158 | accuracy_test = accuracy_score(y_test, svc.predict(X_test)) 159 | 160 | print("{0:.1%} accuracy on test set vs. {1:.1%} on training set".format(accuracy_test, accuracy_train)) 161 | 162 | 163 | output: 164 | 93.3% accuracy on test set vs. 94.9% on training set 165 | 166 | 167 | +100 XP 168 | Wow, what just happened!? On the full dataset the model is rubbish but with a single feature we can make good predictions? This is an example of the curse of dimensionality! The model badly overfits when we feed it too many features. It overlooks that neck circumference by itself is pretty different for males and females. 169 | 170 | <----------------------------------------------------------------------------------------------------------------------------------------> 171 | 172 | Features with low variance 173 | In the previous exercise you established that 0.001 is a good threshold to filter out low variance features in head_df after normalization. Now use the VarianceThreshold feature selector to remove these features. 174 | 175 | Instructions 176 | 100 XP 177 | Create the variance threshold selector with a threshold of 0.001. 178 | Normalize the head_df dataframe by dividing it by its mean values and fit the selector. 179 | Create a boolean mask from the selector using .get_support(). 180 | Create a reduced dataframe by passing the mask to the .loc[] method. 181 | 182 | Take Hint (-30 XP) 183 | 184 | from sklearn.feature_selection import VarianceThreshold 185 | 186 | # Create a VarianceThreshold feature selector 187 | sel = VarianceThreshold(threshold=0.001) 188 | 189 | # Fit the selector to normalized head_df 190 | sel.fit(head_df / head_df.mean()) 191 | 192 | # Create a boolean mask 193 | mask = sel.get_support() 194 | 195 | # Apply the mask to create a reduced dataframe 196 | reduced_df = head_df.loc[:, mask] 197 | 198 | print("Dimensionality reduced from {} to {}.".format(head_df.shape[1], reduced_df.shape[1])) 199 | 200 | output: 201 | Dimensionality reduced from 6 to 4. 
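
Side note (not part of the DataCamp exercise): before committing to reduced_df, it can be worth checking exactly which columns the selector discarded. A minimal sketch, reusing the head_df and mask objects defined above:

# Columns retained by the variance threshold selector
kept_cols = head_df.columns[mask]

# Columns dropped because their mean-normalized variance fell below 0.001
dropped_cols = head_df.columns[~mask]

print("Kept:", list(kept_cols))
print("Dropped:", list(dropped_cols))
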
202 | <----------------------------------------------------------------------------------------------------------------------------------------> 203 | 204 | Visualizing the correlation matrix 205 | Reading the correlation matrix of ansur_df in its raw, numeric format doesn't allow us to get a quick overview. Let's improve this by removing redundant values and visualizing the matrix using seaborn. 206 | 207 | Seaborn has been pre-loaded as sns, matplotlib.pyplot as plt, NumPy as np and pandas as pd. 208 | 209 | Instructions 1/4 210 | 100 XP 211 | 1 212 | 2 213 | 3 214 | 4 215 | Create the correlation matrix. 216 | Visualize it using Seaborn's heatmap function. 217 | 218 | Take Hint (-30 XP) 219 | 220 | # Create the correlation matrix 221 | corr = ansur_df.corr() 222 | 223 | # Draw the heatmap 224 | sns.heatmap(corr, cmap=cmap, center=0, linewidths=1, annot=True, fmt=".2f") 225 | plt.show() 226 | 227 | 228 | 229 | Instructions 2/4 230 | 0 XP 231 | 2 232 | 3 233 | 4 234 | Create a boolean mask for the upper triangle of the plot. 235 | 236 | # Create the correlation matrix 237 | corr = ansur_df.corr() 238 | 239 | # Generate a mask for the upper triangle 240 | mask = np.triu(np.ones_like(corr, dtype=bool)) 241 | 242 | 243 | 244 | Instructions 3/4 245 | 0 XP 246 | 3 247 | 4 248 | Add the mask to the heatmap. 249 | 250 | # Create the correlation matrix 251 | corr = ansur_df.corr() 252 | 253 | # Generate a mask for the upper triangle 254 | mask = np.triu(np.ones_like(corr, dtype=bool)) 255 | 256 | # Add the mask to the heatmap 257 | sns.heatmap(corr, mask=mask, cmap=cmap, center=0, linewidths=1, annot=True, fmt=".2f") 258 | plt.show() 259 | 260 | 261 | <----------------------------------------------------------------------------------------------------------------------------------------> 262 | 263 | Filtering out highly correlated features 264 | You're going to automate the removal of highly correlated features in the numeric ANSUR dataset. You'll calculate the correlation matrix and filter out columns that have a correlation coefficient of more than 0.95 or less than -0.95. 265 | 266 | Since each correlation coefficient occurs twice in the matrix (correlation of A to B equals correlation of B to A) you'll want to ignore half of the correlation matrix so that only one of the two correlated features is removed. Use a mask trick for this purpose. 267 | 268 | Instructions 269 | 100 XP 270 | Calculate the correlation matrix of ansur_df and take the absolute value of this matrix. 271 | Create a boolean mask with True values in the upper right triangle and apply it to the correlation matrix. 272 | Set the correlation coefficient threshold to 0.95. 273 | Drop all the columns listed in to_drop from the dataframe. 
274 | 275 | Take Hint (-30 XP) 276 | 277 | # Calculate the correlation matrix and take the absolute value 278 | corr_matrix = ansur_df.corr().abs() 279 | 280 | # Create a True/False mask and apply it 281 | mask = np.triu(np.ones_like(corr_matrix, dtype=bool)) 282 | tri_df = corr_matrix.mask(mask) 283 | 284 | # List column names of highly correlated features (r > 0.95) 285 | to_drop = [c for c in tri_df.columns if any(tri_df[c] > 0.95)] 286 | 287 | # Drop the features in the to_drop list 288 | reduced_df = ansur_df.drop(to_drop, axis=1) 289 | 290 | print("The reduced dataframe has {} columns.".format(reduced_df.shape[1])) 291 | 292 | 293 | 294 | 295 | <=========================================================================================================================================> 296 | 297 | 3 298 | Feature selection II, selecting for model accuracy 299 | 0% 300 | In this second chapter on feature selection, you'll learn how to let models help you find the most important features in a dataset for predicting a particular target feature. In the final lesson of this chapter, you'll combine the advice of multiple, different, models to decide on which features are worth keeping. 301 | 302 | 303 | 304 | <----------------------------------------------------------------------------------------------------------------------------------------> 305 | 306 | Building a diabetes classifier 307 | You'll be using the Pima Indians diabetes dataset to predict whether a person has diabetes using logistic regression. There are 8 features and one target in this dataset. The data has been split into a training and test set and pre-loaded for you as X_train, y_train, X_test, and y_test. 308 | 309 | A StandardScaler() instance has been predefined as scaler and a LogisticRegression() one as lr. 310 | 311 | Instructions 312 | 100 XP 313 | Fit the scaler on the training features and transform these features in one go. 314 | Fit the logistic regression model on the scaled training data. 315 | Scale the test features. 316 | Predict diabetes presence on the scaled test set. 317 | 318 | 319 | # Fit the scaler on the training features and transform these in one go 320 | X_train_std = scaler.fit_transform(X_train) 321 | 322 | # Fit the logistic regression model on the scaled training data 323 | lr.fit(X_train_std, y_train) 324 | 325 | # Scale the test features 326 | X_test_std = scaler.transform(X_test) 327 | 328 | # Predict diabetes presence on the scaled test set 329 | y_pred = lr.predict(X_test_std) 330 | 331 | # Prints accuracy metrics and feature coefficients 332 | print("{0:.1%} accuracy on test set.".format(accuracy_score(y_test, y_pred))) 333 | print(dict(zip(X.columns, abs(lr.coef_[0]).round(2)))) 334 | 335 | 336 | output: 337 | 79.6% accuracy on test set. 338 | {'diastolic': 0.03, 'family': 0.34, 'bmi': 0.38, 'glucose': 1.23, 'insulin': 0.19, 'age': 0.34, 'triceps': 0.24, 'pregnant': 0.04} 339 | 340 | +100 XP 341 | Great! We get almost 80% accuracy on the test set. Take a look at the differences in model coefficients for the different features. 342 | <----------------------------------------------------------------------------------------------------------------------------------------> 343 | 344 | Manual Recursive Feature Elimination 345 | Now that we've created a diabetes classifier, let's see if we can reduce the number of features without hurting the model accuracy too much. 346 | 347 | On the second line of code the features are selected from the original dataframe. Adjust this selection. 
348 | 349 | A StandardScaler() instance has been predefined as scaler and a LogisticRegression() one as lr. 350 | 351 | All necessary functions and packages have been pre-loaded too. 352 | 353 | Instructions 1/3 354 | 50 XP 355 | 1 356 | First, run the given code, then remove the feature with the lowest model coefficient from X. 357 | 358 | Take Hint (-15 XP) 359 | 360 | 361 | # Remove the feature with the lowest model coefficient 362 | X = diabetes_df[['pregnant', 'glucose', 'diastolic', 'triceps', 'insulin', 'bmi', 'family', 'age']] 363 | 364 | # Performs a 25-75% train test split 365 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0) 366 | 367 | # Scales features and fits the logistic regression model 368 | lr.fit(scaler.fit_transform(X_train), y_train) 369 | 370 | # Calculates the accuracy on the test set and prints coefficients 371 | acc = accuracy_score(y_test, lr.predict(scaler.transform(X_test))) 372 | print("{0:.1%} accuracy on test set.".format(acc)) 373 | print(dict(zip(X.columns, abs(lr.coef_[0]).round(2)))) 374 | 375 | 376 | 377 | 378 | output: 379 | 79.6% accuracy on test set. 380 | {'diastolic': 0.03, 'family': 0.34, 'bmi': 0.38, 'glucose': 1.23, 'insulin': 0.19, 'age': 0.34, 'triceps': 0.24, 'pregnant': 0.04} 381 | 382 | 383 | 384 | # Remove the 2 features with the lowest model coefficients 385 | X = diabetes_df[['pregnant', 'glucose', 'triceps', 'insulin', 'bmi', 'family', 'age']] 386 | 387 | # Performs a 25-75% train test split 388 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0) 389 | 390 | # Scales features and fits the logistic regression model 391 | lr.fit(scaler.fit_transform(X_train), y_train) 392 | 393 | # Calculates the accuracy on the test set and prints coefficients 394 | acc = accuracy_score(y_test, lr.predict(scaler.transform(X_test))) 395 | print("{0:.1%} accuracy on test set.".format(acc)) 396 | print(dict(zip(X.columns, abs(lr.coef_[0]).round(2)))) 397 | 398 | 399 | 400 | 401 | Run the code and remove 2 more features with the lowest model coefficients. 402 | 403 | Take Hint (-15 XP) 404 | 405 | 406 | 407 | 408 | <----------------------------------------------------------------------------------------------------------------------------------------> 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | <----------------------------------------------------------------------------------------------------------------------------------------> 418 | 419 | 420 | <=========================================================================================================================================> 421 | 422 | 4 423 | Feature extraction 424 | 0% 425 | This chapter is a deep-dive on the most frequently used dimensionality reduction algorithm, Principal Component Analysis (PCA). You'll build intuition on how and why this algorithm is so powerful and will apply it both for data exploration and data pre-processing in a modeling pipeline. You'll end with a cool image compression use case. 426 | 427 | 428 | 429 | -------------------------------------------------------------------------------- /DataCamp_Notes/Feature Engineering for MachineLearning in Python.txt: -------------------------------------------------------------------------------- 1 | Course Description 2 | Every day you read about the amazing breakthroughs in how the newest applications of machine learning are changing the world. 
Often this reporting glosses over the fact that a huge amount of data munging and feature engineering must be done before any of these fancy models can be used. In this course, you will learn how to do just that. You will work with Stack Overflow Developers survey, and historic US presidential inauguration addresses, to understand how best to preprocess and engineer features from categorical, continuous, and unstructured data. This course will give you hands-on experience on how to prepare any data for your own machine learning models. 3 | 4 | <==================================================================================================================================> 5 | 6 | 1 7 | Creating Features 8 | FREE 9 | 0% 10 | In this chapter, you will explore what feature engineering is and how to get started with applying it to real-world data. You will load, explore and visualize a survey response dataset, and in doing so you will learn about its underlying data types and why they have an influence on how you should engineer your features. Using the pandas package you will create new features from both categorical and continuous columns. 11 | 12 | _______________________________________________________________________________________________________________________ 13 | 14 | One-hot encoding and dummy variables 15 | To use categorical variables in a machine learning model, you first need to represent them in a quantitative way. The two most common approaches are to one-hot encode the variables using or to use dummy variables. In this exercise, you will create both types of encoding, and compare the created column sets. We will continue using the same DataFrame from previous lesson loaded as so_survey_df and focusing on its Country column. 16 | 17 | Instructions 1/2 18 | 50 XP 19 | 1 20 | One-hot encode the Country column, adding "OH" as a prefix for each column. 21 | 22 | Take Hint (-15 XP) 23 | 2 24 | Create dummy variables for the Country column, adding "DM" as a prefix for each column. 25 | 26 | 27 | # Convert the Country column to a one hot encoded Data Frame 28 | one_hot_encoded = pd.get_dummies(so_survey_df, columns=['Country'], prefix='OH') 29 | 30 | # Print the columns names 31 | print(one_hot_encoded.columns) 32 | 33 | # Create dummy variables for the Country column 34 | dummy = pd.get_dummies(so_survey_df, columns=['Country'], drop_first=True, prefix='DM') 35 | 36 | # Print the columns names 37 | print(dummy.columns) 38 | 39 | +100 XP 40 | Great job! Did you notice that the column for France was missing when you created dummy variables? Now you can choose to use one-hot encoding or dummy variables where appropriate. 41 | 42 | _______________________________________________________________________________________________________________________ 43 | 44 | Dealing with uncommon categories 45 | Some features can have many different categories but a very uneven distribution of their occurrences. Take for example Data Science's favorite languages to code in, some common choices are Python, R, and Julia, but there can be individuals with bespoke choices, like FORTRAN, C etc. In these cases, you may not want to create a feature for each value, but only the more common occurrences. 46 | 47 | Instructions 1/3 48 | 35 XP 49 | 1 50 | 2 51 | 3 52 | Extract the Country column of so_survey_df as a series and assign it to countries. 53 | Find the counts of each category in the newly created countries series. 
54 | 55 | 56 | # Create a series out of the Country column 57 | countries = so_survey_df['Country'] 58 | 59 | # Get the counts of each category 60 | country_counts = countries.value_counts() 61 | 62 | # Print the count values for each category 63 | print(country_counts) 64 | 65 | 66 | Create a mask for values occurring less than 10 times in country_counts. 67 | Print the first 5 rows of the mask. 68 | 69 | # Create a series out of the Country column 70 | countries = so_survey_df['Country'] 71 | 72 | # Get the counts of each category 73 | country_counts = countries.value_counts() 74 | 75 | # Create a mask for only categories that occur less than 10 times 76 | mask = countries.isin(country_counts[country_counts<10].index) 77 | 78 | # Print the top 5 rows in the mask series 79 | print(mask.head()) 80 | 81 | 82 | 83 | Instructions 3/3 84 | 30 XP 85 | 3 86 | Label values occurring less than the mask cutoff as 'Other'. 87 | Print the new category counts in countries. 88 | 89 | 90 | # Create a series out of the Country column 91 | countries = so_survey_df['Country'] 92 | 93 | # Get the counts of each category 94 | country_counts = countries.value_counts() 95 | 96 | # Create a mask for only categories that occur less than 10 times 97 | mask = countries.isin(country_counts[country_counts < 10].index) 98 | 99 | # Label all other categories as Other 100 | countries[mask] = 'Other' 101 | 102 | # Print the updated category counts 103 | print(countries.value_counts()) 104 | 105 | _______________________________________________________________________________________________________________________ 106 | 107 | Binarizing columns 108 | While numeric values can often be used without any feature engineering, there will be cases when some form of manipulation can be useful. For example on some occasions, you might not care about the magnitude of a value but only care about its direction, or if it exists at all. In these situations, you will want to binarize a column. In the so_survey_df data, you have a large number of survey respondents that are working voluntarily (without pay). You will create a new column titled Paid_Job indicating whether each person is paid (their salary is greater than zero). 109 | 110 | Instructions 111 | 100 XP 112 | Create a new column called Paid_Job filled with zeros. 113 | Replace all the Paid_Job values with a 1 where the corresponding ConvertedSalary is greater than 0. 114 | 115 | # Create the Paid_Job column filled with zeros 116 | so_survey_df['Paid_Job'] = 0 117 | 118 | # Replace all the Paid_Job values where ConvertedSalary is > 0 119 | so_survey_df.loc[so_survey_df['ConvertedSalary']>0, 'Paid_Job'] = 1 120 | 121 | # Print the first five rows of the columns 122 | print(so_survey_df[['Paid_Job', 'ConvertedSalary']].head()) 123 | 124 | 125 | 126 | output: 127 | Paid_Job ConvertedSalary 128 | 0 0 0.0 129 | 1 1 70841.0 130 | 2 0 0.0 131 | 3 1 21426.0 132 | 4 1 41671.0 133 | 134 | 135 | +100 XP 136 | Good work, binarizing columns can also be useful for your target variables. 137 | _______________________________________________________________________________________________________________________ 138 | 139 | 140 | Binning values 141 | For many continuous values you will care less about the exact value of a numeric column, but instead care about the bucket it falls into. This can be useful when plotting values, or simplifying your machine learning models. It is mostly used on continuous variables where accuracy is not the biggest concern e.g. age, height, wages. 
142 | 143 | Bins are created using pd.cut(df['column_name'], bins) where bins can be an integer specifying the number of evenly spaced bins, or a list of bin boundaries. 144 | 145 | Instructions 1/2 146 | 50 XP 147 | 1 148 | Bin the ConvertedSalary column values into 5 equal bins, in a new column called equal_binned. 149 | 150 | 151 | Bin the ConvertedSalary column using the boundaries in the list bins and label the bins using labels. 152 | 153 | 154 | # Bin the continuous variable ConvertedSalary into 5 bins 155 | so_survey_df['equal_binned'] = pd.cut(so_survey_df['ConvertedSalary'], bins=5) 156 | 157 | # Print the first 5 rows of the equal_binned column 158 | print(so_survey_df[['equal_binned', 'ConvertedSalary']].head()) 159 | 160 | 161 | output: 162 | equal_binned ConvertedSalary 163 | 0 (-2000.0, 400000.0] 0.0 164 | 1 (-2000.0, 400000.0] 70841.0 165 | 2 (-2000.0, 400000.0] 0.0 166 | 3 (-2000.0, 400000.0] 21426.0 167 | 4 (-2000.0, 400000.0] 41671.0 168 | 169 | 170 | 171 | 172 | # Import numpy 173 | import numpy as np 174 | 175 | # Specify the boundaries of the bins 176 | bins = [-np.inf, 10000, 50000, 100000, 150000, np.inf] 177 | 178 | # Bin labels 179 | labels = ['Very low', 'Low', 'Medium', 'High', 'Very high'] 180 | 181 | # Bin the continuous variable ConvertedSalary using these boundaries 182 | so_survey_df['boundary_binned'] = pd.cut(so_survey_df['ConvertedSalary'], 183 | bins=bins, labels=labels) 184 | 185 | # Print the first 5 rows of the boundary_binned column 186 | print(so_survey_df[['boundary_binned', 'ConvertedSalary']].head()) 187 | 188 | 189 | 190 | output: 191 | boundary_binned ConvertedSalary 192 | 0 Very low 0.0 193 | 1 Medium 70841.0 194 | 2 Very low 0.0 195 | 3 Low 21426.0 196 | 4 Low 41671.0 197 | 198 | 199 | +100 XP 200 | Correct, now you can bin columns with equal spacing and predefined boundaries. 201 | 202 | 203 | 204 | 205 | <==================================================================================================================================> 206 | 207 | 208 | VIEW CHAPTER DETAILS 209 | 2 210 | Dealing with Messy Data 211 | 0% 212 | This chapter introduces you to the reality of messy and incomplete data. You will learn how to find where your data has missing values and explore multiple approaches on how to deal with them. You will also use string manipulation techniques to deal with unwanted characters in your dataset. 213 | 214 | 215 | <==================================================================================================================================> 216 | VIEW CHAPTER DETAILS 217 | 3 218 | Conforming to Statistical Assumptions 219 | 0% 220 | In this chapter, you will focus on analyzing the underlying distribution of your data and whether it will impact your machine learning pipeline. You will learn how to deal with skewed data and situations where outliers may be negatively impacting your analysis. 221 | 222 | 223 | <==================================================================================================================================> 224 | VIEW CHAPTER DETAILS 225 | 4 226 | Dealing with Text Data 227 | 0% 228 | Finally, in this chapter, you will work with unstructured text data, understanding ways in which you can engineer columnar features out of a text corpus. You will compare how different approaches may impact how much context is being extracted from a text, and how to balance the need for context, without too many features being created. 
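
The detailed notes for this chapter aren't captured here, but as a rough sketch of what "columnar features out of a text corpus" means in practice, a bag-of-words approach with scikit-learn's CountVectorizer looks roughly like this. The toy corpus, the column cap, and the variable names below are made up for illustration (the course itself uses the inauguration addresses mentioned in the description); get_feature_names_out assumes scikit-learn 1.0 or newer.

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Toy stand-in for a corpus of speeches
corpus = ["my fellow citizens of the united states",
          "fellow citizens of the senate and of the house",
          "the people of the united states"]

# Cap the vocabulary so we don't create one column per rare word
vectorizer = CountVectorizer(max_features=5, stop_words='english')
counts = vectorizer.fit_transform(corpus)

# One row per document, one column per retained term
text_features = pd.DataFrame(counts.toarray(),
                             columns=vectorizer.get_feature_names_out())
print(text_features)

Raising or lowering max_features is one way to balance how much context is kept against how many new columns get created.
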
229 | 230 | VIEW CHAPTER DETAILS 231 | -------------------------------------------------------------------------------- /DataCamp_Notes/Interactive Data Visualization with Bokeh.txt: -------------------------------------------------------------------------------- 1 | Interactive Data 2 | Visualization with 3 | Bokeh 4 | 5 | What is Bokeh? 6 | ● Interactive visualization, controls, and tools 7 | ● Versatile and high-level graphics 8 | ● High-level statistical charts 9 | ● Streaming, dynamic, large data 10 | ● For the browser, with or without a server 11 | ● No JavaScript 12 | 13 | 14 | 15 | 16 | What you will learn 17 | ● Basic plo!ing with bokeh.plotting 18 | ● Layouts, interactions, and annotations 19 | ● Statistical charting with bokeh.charts 20 | ● Interactive data applications in the browser 21 | ● Case Study: A Gapminder explorer 22 | 23 | 24 | 25 | 26 | <=================================================================================================================================> 27 | 28 | Plo!ing with 29 | Glyphs 30 | 31 | 32 | What are Glyphs 33 | ● Visual shapes 34 | ● circles, squares, triangles 35 | ● rectangles, lines, wedges 36 | ● With properties a!ached to data 37 | ● coordinates (x,y) 38 | ● size, color, transparency 39 | 40 | 41 | 42 | Typical usage 43 | In [1]: from bokeh.io import output_file, show 44 | In [2]: from bokeh.plotting import figure 45 | In [3]: plot = figure(plot_width=400, tools='pan,box_zoom') 46 | In [4]: plot.circle([1,2,3,4,5], [8,6,5,2,3]) 47 | In [5]: output_file('circle.html') 48 | In [6]: show(plot) 49 | 50 | 51 | 52 | Glyph properties 53 | ● Lists, arrays, sequences of values 54 | ● Single fixed values 55 | In [1]: plot = figure() 56 | In [2]: plot.circle(x=10, y=[2,5,8,12], size=[10,20,30,40]) 57 | 58 | 59 | 60 | 61 | Markers ● asterisk() ● circle() ● circle_cross() ● circle_x() ● cross() ● diamond() ● diamond_cross() ● inverted_triangle() ● square() ● square_cross() ● square_x() ● triangle() ● x() 62 | 63 | 64 | 65 | 66 | <=================================================================================================================================> 67 | 68 | 69 | Additional Glyphs 70 | 71 | 72 | Lines 73 | In [1]: from bokeh.io import output_file, show 74 | In [2]: from bokeh.plotting import figure 75 | In [3]: x = [1,2,3,4,5] 76 | In [4]: y = [8,6,5,2,3] 77 | In [5]: plot = figure() 78 | In [6]: plot.line(x, y, line_width=3) 79 | In [7]: output_file('line.html') 80 | In [8]: show(plot) 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | <=================================================================================================================================> 89 | 90 | <=================================================================================================================================> 91 | 92 | <=================================================================================================================================> 93 | 94 | <=================================================================================================================================> 95 | 96 | <=================================================================================================================================> 97 | 98 | <=================================================================================================================================> 99 | 100 | <=================================================================================================================================> 101 | 102 | 
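
(The remaining sections of these notes were left blank. As a small extension of the line example above — my own addition, not from the course slides — multiple glyphs can be layered on the same figure; this assumes the same pre-3.0 Bokeh API used in the examples above.)

from bokeh.io import output_file, show
from bokeh.plotting import figure

x = [1, 2, 3, 4, 5]
y = [8, 6, 5, 2, 3]

plot = figure(plot_width=400, tools='pan,box_zoom')

# Draw the line first, then overlay circle markers at the same points
plot.line(x, y, line_width=2)
plot.circle(x, y, size=10, fill_color='white')

output_file('line_and_circle.html')
show(plot)
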
<=================================================================================================================================> 103 | 104 | <=================================================================================================================================> 105 | 106 | <=================================================================================================================================> 107 | 108 | <=================================================================================================================================> 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /DataCamp_Notes/Intermediate SQL.txt: -------------------------------------------------------------------------------- 1 | Intermediate SQL 2 | Course Description 3 | So you've learned how to aggregate and join data from tables in your database—now what? How do you manipulate, transform, and make the most sense of your data? This intermediate-level course will teach you several key functions necessary to wrangle, filter, and categorize information in a relational database, expand your SQL toolkit, and answer complex questions. You will learn the robust use of CASE statements, subqueries, and window functions—all while discovering some interesting facts about soccer using the European Soccer Database. 4 | 5 | ____________________________________________________________________________________________________________________________________ 6 | 7 | Topics covered: 8 | CASE statements 9 | Simple subqueries 10 | Correlated subqueries 11 | Window functions 12 | 13 | ------------------------------------------------------------------------------------------------------------ 14 | 15 | CASE statements 16 | Contains a WHEN , THEN , and ELSE statement, finished with END 17 | 18 | Example: 19 | CASE WHEN x = 1 THEN 'a' 20 | WHEN x = 2 THEN 'b' 21 | ELSE 'c' END AS new_column 22 | 23 | 24 | ------------------------------------------------------------------------------------------------------------ 25 | 26 | CASE WHEN ... AND then some 27 | Add multiple logical conditions to your WHEN clause! 28 | 29 | Example: 30 | SELECT date, hometeam_id, awayteam_id, 31 | CASE WHEN hometeam_id = 8455 AND home_goal > away_goal 32 | THEN 'Chelsea home win!' 33 | WHEN awayteam_id = 8455 AND home_goal < away_goal 34 | THEN 'Chelsea away win!' 35 | ELSE 'Loss or tie :(' END AS outcome 36 | FROM match 37 | WHERE hometeam_id = 8455 OR awayteam_id = 8455; 38 | 39 | ------------------------------------------------------------------------------------------------------------ 40 | 41 | Filtering your CASE statement 42 | Let's generate a list of matches won by Italy's Bologna team! There are quite a few additional teams in the two tables, so a key part of generating a usable query will be using your CASE statement as a filter in the WHERE clause. 43 | 44 | CASE statements allow you to categorize data that you're interested in -- and exclude data you're not interested in. In order to do this, you can use a CASE statement as a filter in the WHERE statement to remove output you don't want to see. 45 | 46 | Here is how you might set that up: 47 | 48 | SELECT * 49 | FROM table 50 | WHERE 51 | CASE WHEN a > 5 THEN 'Keep' 52 | WHEN a <= 5 THEN 'Exclude' END = 'Keep'; 53 | In essence, you can use the CASE statement as a filtering column like any other column in your database. The only difference is that you don't alias the statement in WHERE. 
54 | 55 | Instructions 3/3 56 | 0 XP 57 | 3 58 | Select the home_goal and away_goal for each match. 59 | Use the CASE statement in the WHERE clause to filter all NULL values generated by the statement in the previous step. 60 | 61 | 62 | -- Select the season, date, home_goal, and away_goal columns 63 | SELECT 64 | season, 65 | date, 66 | home_goal, 67 | away_goal 68 | FROM matches_italy 69 | WHERE 70 | -- Exclude games not won by Bologna 71 | CASE WHEN hometeam_id = 9857 AND home_goal > away_goal THEN 'Bologna Win' 72 | WHEN awayteam_id = 9857 AND away_goal > home_goal THEN 'Bologna Win' 73 | END IS NOT NULL; 74 | 75 | ------------------------------------------------------------------------------------------------------------ 76 | 77 | In CASE you need to aggregate 78 | CASE statements are great for 79 | Categorizing data 80 | Filtering data 81 | Aggregating data 82 | 83 | ------------------------------------------------------------------------------------------------------------ 84 | 85 | Calculating percent with CASE and AVG 86 | CASE statements will return any value you specify in your THEN clause. This is an incredibly powerful tool for robust calculations and data manipulation when used in conjunction with an aggregate statement. One key task you can perform is using CASE inside an AVG function to calculate a percentage of information in your database. 87 | 88 | Here's an example of how you set that up: 89 | 90 | AVG(CASE WHEN condition_is_met THEN 1 91 | WHEN condition_is_not_met THEN 0 END) 92 | With this approach, it's important to accurately specify which records count as 0, otherwise your calculations may not be correct! 93 | 94 | Your task is to examine the number of wins, losses, and ties in each country. The matches table is filtered to include all matches from the 2013/2014 and 2014/2015 seasons. 95 | 96 | Instructions 1/3 97 | 35 XP 98 | 1 99 | 2 100 | 3 101 | Create 3 CASE statements to COUNT the total number of home team wins, away team wins, and ties. This will allow you to examine the total number of records. You will convert this to an AVG in the next step. 102 | 103 | 104 | SELECT 105 | c.name AS country, 106 | -- Count the home wins, away wins, and ties in each country 107 | count(case when m.home_goal > m.away_goal THEN m.id 108 | END) AS home_wins, 109 | count(case when m.home_goal < m.away_goal THEN m.id 110 | END) AS away_wins, 111 | count(case when m.home_goal = m.away_goal THEN m.id 112 | END) AS ties 113 | FROM country AS c 114 | LEFT JOIN matches AS m 115 | ON c.id = m.country_id 116 | GROUP BY country; 117 | 118 | 119 | 120 | Instructions 2/3 121 | 35 XP 122 | 2 123 | 3 124 | Calculate the percentage of matches tied using a CASE statement inside AVG. 125 | Fill in the logical operators for each statement. Alias your columns as ties_2013_2014 and ties_2014_2015, respectively. 
126 | 127 | SELECT 128 | c.name AS country, 129 | -- Calculate the percentage of tied games in each season 130 | avg(case when m.season='2013/2014' AND m.home_goal = m.away_goal THEN 1 131 | WHEN m.season='2013/2014' AND m.home_goal != m.away_goal THEN 0 132 | END) AS ties_2013_2014, 133 | avg(case when m.season='2014/2015' and m.home_goal = m.away_goal then 1 134 | WHEN m.season='2014/2015' and m.home_goal != m.away_goal then 0 135 | end) AS ties_2014_2015 136 | FROM country AS c 137 | LEFT JOIN matches AS m 138 | ON c.id = m.country_id 139 | GROUP BY country; 140 | 141 | 142 | 143 | 144 | 145 | 146 | Instructions 3/3 147 | 30 XP 148 | 3 149 | The previous "ties" columns returned values with 14 decimal points, which is not easy to interpret. Use the ROUND function to round to 2 decimal points. 150 | 151 | SELECT 152 | c.name AS country, 153 | -- Round the percentage of tied games to 2 decimal points 154 | round(avg(CASE WHEN m.season='2013/2014' AND m.home_goal = m.away_goal THEN 1 155 | WHEN m.season='2013/2014' AND m.home_goal != m.away_goal THEN 0 156 | END),2) AS pct_ties_2013_2014, 157 | round(avg(CASE WHEN m.season='2014/2015' AND m.home_goal = m.away_goal THEN 1 158 | WHEN m.season='2014/2015' AND m.home_goal != m.away_goal THEN 0 159 | END),2) AS pct_ties_2014_2015 160 | FROM country AS c 161 | LEFT JOIN matches AS m 162 | ON c.id = m.country_id 163 | GROUP BY country; 164 | 165 | ------------------------------------------------------------------------------------------------------------ 166 | 167 | 168 | -------------------------------------------------------------------------------- /DataCamp_Notes/Introduction to Git for Data Science.txt: -------------------------------------------------------------------------------- 1 | Interactive Course 2 | Introduction to Git for Data Science 3 | Introduction to Git for Data Science 4 | 5 | 4 hours 6 | 0 Videos 7 | 46 Exercises 8 | 65,206 Participants 9 | 3,650 XP 10 | 11 | Greg Wilson 12 | Greg Wilson 13 | 14 | Co-founder of Software Carpentry 15 | 16 | Dr. Greg Wilson has worked for 30 years in both industry and academia, and is the author or editor of several books on computing and two for children. He is best known as the co-founder of Software Carpentry, a non-profit organization that teaches basic computing skills to researchers. 17 | Collaborator(s) 18 | 19 | Filip Schouwenaars 20 | 21 | Filip Schouwenaars 22 | 23 | Course Description 24 | 25 | Version control is one of the power tools of programming. It allows you to keep track of what you did when, undo any changes you decide you don't want, and collaborate at scale with other people. This course will introduce you to Git, a modern version control tool that is very popular with data scientists and software developers, and show you how to use it to get more done in less time and with less pain. 26 | 27 | <============================================================================================================================> 28 | 29 | 1 30 | Basic workflow 31 | 100% 32 | 33 | This chapter explains what version control is and why you should use it, and introduces the most common steps in a common Git workflow. 34 | View Chapter Details 35 | 36 | 37 | 38 | 39 | 40 | <============================================================================================================================> 41 | 2 42 | Repositories 43 | 9% 44 | 45 | This chapter digs a little deeper into how Git stores information and how you can explore a repository's history. 
46 | View Chapter Details 47 | ______________________________________________________________________________________________________________________________ 48 | 49 | 50 | 51 | 52 | 53 | 54 | <============================================================================================================================> 55 | 3 56 | Undo 57 | 0% 58 | 59 | Since Git saves all the changes you've made to your files, you can use it to undo those changes. This chapter shows you several ways to do that. 60 | View Chapter Details 61 | 62 | 63 | 64 | 65 | <============================================================================================================================> 66 | 4 67 | Working with branches 68 | 0% 69 | 70 | Branching is one of Git's most powerful features, since it allows you to work on several things at once. This chapter shows you how to create and manage branches. 71 | View Chapter Details 72 | 73 | 74 | 75 | 76 | 77 | <============================================================================================================================> 78 | 5 79 | Collaborating 80 | 0% 81 | 82 | This chapter showcases how Git allows you to share changes between repositories to collaborate at scale. 83 | View Chapter Details 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /DataCamp_Notes/Linear Classifiers in Python.txt: -------------------------------------------------------------------------------- 1 | Linear Classifiers in Python 2 | 3 | Course Description 4 | In this course you'll learn all about using linear classifiers, specifically logistic regression and support vector machines, with scikit-learn. Once you've learned how to apply these methods, you'll dive into the ideas behind them and find out what really makes them tick. At the end of this course you'll know how to train, test, and tune these linear classifiers in Python. You'll also have a conceptual foundation for understanding many other machine learning algorithms. 5 | 6 | -------------------------------------------------------------------------------------------------------------------------------------------------------- 7 | 8 | Applying logistic regression and SVM 9 | FREE 10 | 100% 11 | In this chapter you will learn the basics of applying logistic regression and support vector machines (SVMs) to classification problems. You'll use the scikit-learn library to fit classification models to real data. 12 | 13 | -------------------------------------------------------------------------------------------------------------------------------------------------------- 14 | 15 | 16 | Loss functions 17 | 0% 18 | In this chapter you will discover the conceptual framework behind logistic regression and SVMs. This will let you delve deeper into the inner workings of these models. 19 | 20 | -------------------------------------------------------------------------------------------------------------------------------------------------------- 21 | 22 | Logistic regression 23 | 0% 24 | In this chapter you will delve into the details of logistic regression. You'll learn all about regularization and how to interpret model output. 25 | 26 | -------------------------------------------------------------------------------------------------------------------------------------------------------- 27 | 28 | 4 29 | Support Vector Machines 30 | 0% 31 | In this chapter you will learn all about the details of support vector machines. 
You'll learn about tuning hyperparameters for these models and using kernels to fit non-linear decision boundaries. 32 | 33 | 34 | -------------------------------------------------------------------------------- /DataCamp_Notes/Visualizing Time Series Data in Python.txt: -------------------------------------------------------------------------------- 1 | Visualizing Time Series Data in Python 2 | 3 | 4 | Course Description 5 | Time series data is omnipresent in the field of Data Science. Whether it is analyzing business trends, forecasting company revenue or exploring customer behavior, every data scientist is likely to encounter time series data at some point during their work. To get you started on working with time series data, this course will provide practical knowledge on visualizing time series data using Python. 6 | <=====================================================================================================================================> 7 | 8 | 1 9 | Introduction 10 | FREE 11 | 12 | 13 | 0% 14 | You will learn how to leverage basic plotting tools in Python, and how to annotate and personalize your time series plots. By the end of this chapter, you will be able to take any static dataset and produce compelling plots of your data. 15 | 16 | ----------------------------------------------------------------------------------------------------------------- 17 | 18 | 19 | Load your time series data 20 | The most common way to import time series data in Python is by using the pandas library. You can use the read_csv() function from pandas to read the contents of a file into a DataFrame. This can be achieved using the following command: 21 | 22 | df = pd.read_csv("name_of_your_file.csv") 23 | Once your data is loaded into Python, you can display the first rows of your DataFrame by calling the .head(n=5) method, where n=5 indicates that you want to print the first five rows of your DataFrame. 24 | 25 | In this exercise, you will read in a time series dataset that contains the number of "great" inventions and scientific discoveries from 1860 to 1959, and display its first five rows. 26 | 27 | Instructions 28 | 100 XP 29 | Import the pandas library using the pd alias. 30 | Read in the time series data from the csv file located at url_discoveries into a DataFrame called discoveries. 31 | Print the first 5 lines of the DataFrame using the .head() method. 32 | 33 | # Import pandas 34 | import pandas as pd 35 | 36 | # Read in the file content in a DataFrame called discoveries 37 | discoveries = pd.read_csv(url_discoveries) 38 | 39 | # Display the first five lines of the DataFrame 40 | print(discoveries.head()) 41 | 42 | ----------------------------------------------------------------------------------------------------------------- 43 | 44 | Test whether your data is of the correct type 45 | When working with time series data in pandas, any date information should be formatted as a datetime64 type. Therefore, it is important to check that the columns containing the date information are of the correct type. You can check the type of each column in a DataFrame by using the .dtypes attribute. Fortunately, if your date columns come as strings, epochs, etc... you can use the to_datetime() function to convert them to the appropriate datetime64 type: 46 | 47 | df['date_column'] = pd.to_datetime(df['date_column']) 48 | In this exercise, you will learn how to check the data type of the columns in your time series data and convert a date column to the appropriate datetime type.
49 | 50 | Instructions 1/3 51 | 30 XP 52 | 1 53 | 2 54 | 3 55 | Print out the data type of the column in the discoveries object. 56 | 57 | # Print the data type of each column in discoveries 58 | print(discoveries.dtypes) 59 | 60 | # Convert the date column to a datestamp type 61 | discoveries['date'] = pd.to_datetime(discoveries['date']) 62 | 63 | # Print the data type of each column in discoveries, again 64 | print(discoveries.dtypes) 65 | ----------------------------------------------------------------------------------------------------------------- 66 | 67 | Your first plot! 68 | Let's take everything you have learned so far and plot your first time series plot. You will set the groundwork by producing a time series plot of your data and labeling the axes of your plot, as this makes the plot more readable and interpretable for the intended audience. 69 | 70 | matplotlib is the most widely used plotting library in Python, and would be the most appropriate tool for this job. Fortunately for us, the pandas library has implemented a .plot() method on Series and DataFrame objects that is a wrapper around matplotlib.pyplot.plot(), which makes it easier to produce plots. 71 | 72 | Instructions 73 | 100 XP 74 | Set the 'date' column as the index of your DataFrame. 75 | Using the discoveries DataFrame, plot the time series in your DataFrame using a "blue" line plot and assign it to ax. 76 | Specify the x-axis label on your plot: 'Date'. 77 | Specify the y-axis label on your plot: 'Number of great discoveries'. 78 | 79 | 80 | # Set the date column as the index of your DataFrame discoveries 81 | discoveries = discoveries.set_index('date') 82 | 83 | # Plot the time series in your DataFrame 84 | ax = discoveries.plot(color='blue') 85 | 86 | # Specify the x-axis label in your plot 87 | ax.set_xlabel('Date') 88 | 89 | # Specify the y-axis label in your plot 90 | ax.set_ylabel('Number of great discoveries') 91 | 92 | # Show plot 93 | plt.show() 94 | 95 | ----------------------------------------------------------------------------------------------------------------- 96 | 97 | Specify plot styles 98 | The matplotlib library also comes with a number of built-in stylesheets that allow you to customize the appearance of your plots. To use a particular style sheet for your plots, you can use the command plt.style.use(your_stylesheet) where your_stylesheet is the name of the style sheet. 99 | 100 | In order to see the list of available style sheets that can be used, you can use the command print(plt.style.available). For the rest of this course, we will use the awesome fivethirtyeight style sheet. 101 | 102 | Instructions 1/2 103 | 50 XP 104 | 1 105 | 2 106 | Import matplotlib.pyplot using its usual alias plt. 107 | Use the fivethirtyeight style sheet to plot a line plot of the discoveries data. 108 | 109 | # Import the matplotlib.pyplot sub-module 110 | import matplotlib.pyplot as plt 111 | 112 | # Use the fivethirtyeight style 113 | plt.style.use('fivethirtyeight') 114 | 115 | # Plot the time series 116 | ax1 = discoveries.plot() 117 | ax1.set_title('FiveThirtyEight Style') 118 | plt.show() 119 | 120 | 121 | Use the ggplot style sheet to plot a line plot of the discoveries data. 122 | Set the title of your second plot as 'ggplot Style'. 
123 | 124 | # Import the matplotlib.pyplot sub-module 125 | import matplotlib.pyplot as plt 126 | 127 | # Use the ggplot style 128 | plt.style.use('ggplot') 129 | ax2 = discoveries.plot() 130 | 131 | # Set the title 132 | ax2.set_title('ggplot Style') 133 | plt.show() 134 | ----------------------------------------------------------------------------------------------------------------- 135 | 136 | 137 | Display and label plots 138 | As you saw earlier, if the index of a pandas DataFrame consists of dates, then pandas will automatically format the x-axis in a human-readable way. In addition the .plot() method allows you to specify various other parameters to tailor your time series plot (color of the lines, width of the lines and figure size). 139 | 140 | You may have noticed the use of the notation ax = df.plot(...) and wondered about the purpose of the ax object. This is because the plot function returns a matplotlib AxesSubplot object, and it is common practice to assign this returned object to a variable called ax. Doing so also allows you to include additional notations and specifications to your plot such as axis labels. 141 | 142 | Instructions 143 | 100 XP 144 | Display a line chart of the discoveries DataFrame. 145 | 146 | Specify the color of the line as 'blue'. 147 | Width of the line as 2. 148 | The dimensions of your plot to be of length 8 and width 3. 149 | Specify the fontsize of 6. 150 | 151 | 152 | # Plot a line chart of the discoveries DataFrame using the specified arguments 153 | ax = discoveries.plot(color='blue', figsize=(8, 3), linewidth=2, fontsize=6) 154 | 155 | # Specify the title in your plot 156 | ax.set_title('Number of great inventions and scientific discoveries from 1860 to 1959', fontsize=8) 157 | 158 | # Show plot 159 | plt.show() 160 | ----------------------------------------------------------------------------------------------------------------- 161 | Subset time series data 162 | When plotting time series data, you may occasionally want to visualize only a subset of the data. The pandas library provides powerful indexing and subsetting methods that allow you to extract specific portions of a DataFrame. For example, you can subset all the data between 1950 and 1960 in the discoveries DataFrame by specifying the following date range: 163 | 164 | subset_data = discoveries['1950-01-01':'1960-01-01'] 165 | Note: Subsetting your data this way is only possible if the index of your DataFrame contains dates of the datetime type. Failing that, the pandas library will return an error message. 166 | 167 | Instructions 1/2 168 | 50 XP 169 | 1 170 | 2 171 | Use discoveries to create a new DataFrame discoveries_subset_1 that contains all the data between January 1, 1945 and January 1, 1950. 172 | Plot the time series of discoveries_subset_1 using a "blue" line plot. 173 | 174 | 175 | 176 | 177 | 178 | <=====================================================================================================================================> 179 | 180 | 2 181 | Summary Statistics and Diagnostics 182 | 0% 183 | In this chapter, you will gain a deeper understanding of your time series data by computing summary statistics and plotting aggregated views of your data. 
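The "Subset time series data" exercise above stops at the instructions; a minimal solution sketch (not from the original notes, and assuming the 'date' column has already been set as the index and matplotlib.pyplot imported as plt, as in the earlier exercises) might look like:

# Select the data between January 1, 1945 and January 1, 1950
discoveries_subset_1 = discoveries['1945-01-01':'1950-01-01']

# Plot the subset as a blue line plot
ax = discoveries_subset_1.plot(color='blue')
plt.show()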
184 | 185 | <=====================================================================================================================================> 186 | 187 | 188 | 3 189 | Seasonality, Trend and Noise 190 | 0% 191 | You will go beyond summary statistics by learning about autocorrelation and partial autocorrelation plots. You will also learn how to automatically detect seasonality, trend and noise in your time series data. 192 | 193 | <=====================================================================================================================================> 194 | 195 | 4 196 | Work with Multiple Time Series 197 | 0% 198 | In the field of Data Science, it is common to be involved in projects where multiple time series need to be studied simultaneously. In this chapter, we will show you how to plot multiple time series at once, and how to discover and describe relationships between multiple time series. 199 | 200 | <=====================================================================================================================================> 201 | 202 | 203 | 5 204 | Case Study 205 | 0% 206 | This chapter will give you a chance to practice all the concepts covered in the course. You will visualize the unemployment rate in the US from 2000 to 2010. 207 | 208 | -------------------------------------------------------------------------------- /MBazeley_Resume_Aug2020_v2.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/MBazeley_Resume_Aug2020_v2.docx -------------------------------------------------------------------------------- /MBazeley_Resume_Aug2020_v2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/MBazeley_Resume_Aug2020_v2.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![alt text]( 2 | https://github.com/MMBazel/springboard-program/blob/master/0.jpg 3 | ) 4 | 5 | 6 | # Springboard Data Science Career Track 7 | 8 | Hi! 9 | 10 | My name is Mikiko Bazeley and this is my repo for the Springboard Data Science Track. 11 | 12 | From Oct 2018 to April 2019 I completed a number of projects, including two capstones, as part of the DS track. 13 | 14 | All of the documentation, code, and notes can be found here, as well as links to other resources I found helpful for successfully completing the program. 15 | 16 | For questions or comments, please feel free to reach out on [LinkedIn](https://www.linkedin.com/in/mikikobazeley/). 17 | 18 | If you find my repo useful, let me know OR ☕ consider buying me a coffee! https://www.buymeacoffee.com/mmbazel ☕. 19 | 20 | Regards, 21 | Mikiko 22 | 23 | ![alt text]( 24 | https://github.com/MMBazel/springboard-program/blob/master/Additional%20Resources/profile_pic_jpeg.jpg?raw=true 25 | ) 26 | 27 | 28 | -------------------------------------------------------------------------------------------------------------------------------- 29 | # Project List by Unit of Study 30 | 31 | For a comprehensive list of the projects and corresponding skills needed, please see the list below. 32 | 33 | ## 1.
The Python Data Science Stack 34 | Topics covered: 35 | * Python 36 | * Matplotlib, Seaborn—visualization tools in Python 37 | * Writing clear, elegant, readable code 38 | in Python using the PEP8 standard 39 | 40 | ## 2. Data Wrangling 41 | Topics covered: 42 | * Deep dive into Pandas for data wrangling 43 | * Data in files: Work with a variety of file formats from plain text (.txt) to more structured and nested formats files like csv and JSON 44 | * Data in databases: Get an overview of relational and NoSQL databases and practice data querying with SQL 45 | * APIs: Collect data from the internet using Application Programming Interfaces (APIs) 46 | 47 | Projects: 48 | * =====> [Mini Project: SQL Practice](https://github.com/MMBazel/springboard-program/tree/master/mini-projects/Ch%205%20-%20Data%20Wrangling/5.3%20SQL%20Practice) 49 | 50 | 51 | 52 | ## 3. Data Story 53 | 54 | ## 4. Statistical Inference 55 | Topics covered: 56 | * Theory of inferential statistics 57 | * Statistical significance 58 | * Parameter estimation 59 | * Hypothesis testing 60 | * Correlation and regression 61 | * Exploratory data analysis 62 | * A/B testing 63 | 64 | 65 | ## 5. Machine Learning 66 | Topics covered: 67 | * Scikit-learn 68 | * Supervised and unsupervised learning 69 | * Top machine learning techniques: 70 | * Linear and logistic regression 71 | * naive bayes 72 | * support vector machines 73 | * decision trees 74 | * clustering 75 | * Ensemble learning with random forests and gradient boosting 76 | * Best practices 77 | * Evaluating and tuning machine learning systems 78 | 79 | 80 | 81 | ## 6. Capstone Project 1: Building a Data Product 82 | 83 | * =====> My Capstone Project: [Predicting Qualifieds from First Call](https://github.com/MMBazel/springboard-program/tree/master/capstone1) 84 | 85 | 86 | ## 7. The Natural Language Processing (NLP) Track 87 | 88 | Topics covered: 89 | * How to work with text and natural language data 90 | * NLP in Python, using common libraries such as NLTK and spaCy 91 | * Basics of Deep Learning in NLP using word2vec and TensorFlow 92 | * Data Science at Scale using Spark 93 | * Software Engineering for Data Scientists 94 | 95 | 96 | ## 8. 
Second Capstone Project: NLP 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-minimal -------------------------------------------------------------------------------- /about.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: This will be used as the title-tag of the page head 3 | --- 4 | 5 | hello 6 | ===== 7 | 8 | **You are here!** 9 | 10 | -------------------------------------------------------------------------------- /capstone1/BazeleyMikiko_Capstone1_Springboard_V2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/BazeleyMikiko_Capstone1_Springboard_V2.pdf -------------------------------------------------------------------------------- /capstone1/BazeleyMikiko_Capstone1_Springboard_V2.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/BazeleyMikiko_Capstone1_Springboard_V2.pptx -------------------------------------------------------------------------------- /capstone1/Capstone Project 1_ Final Report (V2).pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Capstone Project 1_ Final Report (V2).pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/Apply Data Storytelling.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/Apply Data Storytelling.pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/Apply Inferential Statistics.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/Apply Inferential Statistics.pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/BazeleyMikiko_Capstone1_Springboard_2019March.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/BazeleyMikiko_Capstone1_Springboard_2019March.pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/BazeleyMikiko_Capstone1_Springboard_2019March.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/BazeleyMikiko_Capstone1_Springboard_2019March.pptx -------------------------------------------------------------------------------- /capstone1/Project Documents/Capstone 1 - In-Depth 
Analysis - MBazeley.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/Capstone 1 - In-Depth Analysis - MBazeley.pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/Capstone Project 1_ Data Storytelling - Google Docs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/Capstone Project 1_ Data Storytelling - Google Docs.pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/Capstone Project 1_ Exploratory Data Analysis.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/Capstone Project 1_ Exploratory Data Analysis.pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/Capstone Project 1_ Milestone Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/Capstone Project 1_ Milestone Report.pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/Data Story Rubric_ Capstone 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/Data Story Rubric_ Capstone 1.pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/Data Wrangling Rubric_ Capstone 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/Data Wrangling Rubric_ Capstone 1.pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/Milestone Report Rubric_ Capstone 1 .pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/Milestone Report Rubric_ Capstone 1 .pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/Project Capstone Project 1 Data Wrangling.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/Project Capstone Project 1 Data Wrangling.pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/Project Capstone Project 1 Milestone Report.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/Project Capstone Project 1 Milestone Report.pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/_ DSC Capstone Project 1 Rubric_ Inferential Statistics.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/_ DSC Capstone Project 1 Rubric_ Inferential Statistics.pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/dumy.txt: -------------------------------------------------------------------------------- 1 | 1 2 | -------------------------------------------------------------------------------- /capstone1/Project_ Capstone Project 1_ Project Proposal.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project_ Capstone Project 1_ Project Proposal.docx -------------------------------------------------------------------------------- /capstone1/Springboard Project Capstone 1 - Project Ideas - Google Docs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Springboard Project Capstone 1 - Project Ideas - Google Docs.pdf -------------------------------------------------------------------------------- /capstone2/Capstone 2_ Final Summary.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone2/Capstone 2_ Final Summary.pdf -------------------------------------------------------------------------------- /capstone2/Capstone 2_ Milestone 1 Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone2/Capstone 2_ Milestone 1 Report.pdf -------------------------------------------------------------------------------- /capstone2/Capstone 2_ Milestone 2 Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone2/Capstone 2_ Milestone 2 Report.pdf -------------------------------------------------------------------------------- /capstone2/Capstone2_Slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone2/Capstone2_Slides.pdf -------------------------------------------------------------------------------- /capstone2/Capstone2_Slides.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone2/Capstone2_Slides.pptx 
-------------------------------------------------------------------------------- /capstone2/dummy: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Linear - Logistic Regression/DSC Mini-Project_Linear Regression Rubric.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Linear - Logistic Regression/DSC Mini-Project_Linear Regression Rubric.pdf -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Linear - Logistic Regression/images/conditionalmean.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Linear - Logistic Regression/images/conditionalmean.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Linear - Logistic Regression/images/cs109gitflow3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Linear - Logistic Regression/images/cs109gitflow3.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Linear - Logistic Regression/images/dummy.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Linear - Logistic Regression/images/shuttle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Linear - Logistic Regression/images/shuttle.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/data/dummy: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/dummy: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/.DS_Store -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/bias.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/bias.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/complexity-error-plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/complexity-error-plot.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/complexity-error-reg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/complexity-error-reg.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/data.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/dummy: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/knn1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/knn1.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/knn2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/knn2.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/linreg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/linreg.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/linsep.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/linsep.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/onelinesplit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/onelinesplit.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/pcanim.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/pcanim.gif -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/reshape.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/reshape.jpg -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/sklearn2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/sklearn2.jpg -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/sklearntrans.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/sklearntrans.jpg -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/train-cv2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/train-cv2.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/train-cv3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/train-cv3.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic 
Regression/images/train-test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/train-test.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/train-validate-test-cont.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/train-validate-test-cont.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/train-validate-test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/train-validate-test.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/train-validate-test3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/train-validate-test3.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.3 Bayesian Methods and Text Data/callibration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.3 Bayesian Methods and Text Data/callibration.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.3 Bayesian Methods and Text Data/terms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.3 Bayesian Methods and Text Data/terms.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.3 Bayesian Methods and Text Data/terms2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.3 Bayesian Methods and Text Data/terms2.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.3 Bayesian Methods and Text Data/vsm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 
10.3 Bayesian Methods and Text Data/vsm.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.5 Introduction to Unsupervised Learning/.ipynb_checkpoints/dummy: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.5 Introduction to Unsupervised Learning/WineKMC.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.5 Introduction to Unsupervised Learning/WineKMC.xlsx -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.5 Introduction to Unsupervised Learning/dummy: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 16 Spark and PySpark/dummy.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.2 Working with Data/JSON Exervise/data/dummy: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.2 Working with Data/JSON Exervise/data/world_bank_projects_less.json: -------------------------------------------------------------------------------- 1 | [{ "_id" : { "$oid" : "52b213b38594d8a2be17c780" }, "approvalfy" : 1999, "board_approval_month" : "November", "boardapprovaldate" : "2013-11-12T00:00:00Z", "borrower" : "FEDERAL DEMOCRATIC REPUBLIC OF ETHIOPIA", "closingdate" : "2018-07-07T00:00:00Z", "country_namecode" : "Federal Democratic Republic of Ethiopia!$!ET", "countrycode" : "ET", "countryname" : "Federal Democratic Republic of Ethiopia", "countryshortname" : "Ethiopia", "docty" : "Project Information Document,Indigenous Peoples Plan,Project Information Document", "envassesmentcategorycode" : "C", "grantamt" : 0, "ibrdcommamt" : 0, "id" : "P129828", "idacommamt" : 130000000, "impagency" : "MINISTRY OF EDUCATION", "lendinginstr" : "Investment Project Financing", "lendinginstrtype" : "IN", "lendprojectcost" : 550000000, "majorsector_percent" : [ { "Name" : "Education", "Percent" : 46 }, { "Name" : "Education", "Percent" : 26 }, { "Name" : "Public Administration, Law, and Justice", "Percent" : 16 }, { "Name" : "Education", "Percent" : 12 } ], "mjsector_namecode" : [ { "name" : "Education", "code" : "EX" }, { "name" : "Education", "code" : "EX" }, { "name" : "Public Administration, Law, and Justice", "code" : "BX" }, { "name" : "Education", "code" : "EX" } ], "mjtheme" : [ "Human development" ], "mjtheme_namecode" : [ { "name" : "Human development", "code" : "8" }, { "name" : "", "code" : "11" } ], "mjthemecode" : "8,11", "prodline" : "PE", "prodlinetext" : "IBRD/IDA", "productlinetype" : "L", "project_abstract" : { "cdata" : "The development objective of the Second Phase of General Education Quality Improvement Project for Ethiopia is to improve learning conditions in primary and secondary schools and strengthen institutions at different levels of 
educational administration. The project has six components. The first component is curriculum, textbooks, assessment, examinations, and inspection. This component will support improvement of learning conditions in grades KG-12 by providing increased access to teaching and learning materials and through improvements to the curriculum by assessing the strengths and weaknesses of the current curriculum. This component has following four sub-components: (i) curriculum reform and implementation; (ii) teaching and learning materials; (iii) assessment and examinations; and (iv) inspection. The second component is teacher development program (TDP). This component will support improvements in learning conditions in both primary and secondary schools by advancing the quality of teaching in general education through: (a) enhancing the training of pre-service teachers in teacher education institutions; and (b) improving the quality of in-service teacher training. This component has following three sub-components: (i) pre-service teacher training; (ii) in-service teacher training; and (iii) licensing and relicensing of teachers and school leaders. The third component is school improvement plan. This component will support the strengthening of school planning in order to improve learning outcomes, and to partly fund the school improvement plans through school grants. It has following two sub-components: (i) school improvement plan; and (ii) school grants. The fourth component is management and capacity building, including education management information systems (EMIS). This component will support management and capacity building aspect of the project. This component has following three sub-components: (i) capacity building for education planning and management; (ii) capacity building for school planning and management; and (iii) EMIS. The fifth component is improving the quality of learning and teaching in secondary schools and universities through the use of information and communications technology (ICT). It has following five sub-components: (i) national policy and institution for ICT in general education; (ii) national ICT infrastructure improvement plan for general education; (iii) develop an integrated monitoring, evaluation, and learning system specifically for the ICT component; (iv) teacher professional development in the use of ICT; and (v) provision of limited number of e-Braille display readers with the possibility to scale up to all secondary education schools based on the successful implementation and usage of the readers. The sixth component is program coordination, monitoring and evaluation, and communication. It will support institutional strengthening by developing capacities in all aspects of program coordination, monitoring and evaluation; a new sub-component on communications will support information sharing for better management and accountability. It has following three sub-components: (i) program coordination; (ii) monitoring and evaluation (M and E); and (iii) communication." 
}, "project_name" : "Ethiopia General Education Quality Improvement Project II", "projectdocs" : [ { "DocTypeDesc" : "Project Information Document (PID), Vol.", "DocType" : "PID", "EntityID" : "090224b081e545fb_1_0", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=090224b081e545fb_1_0", "DocDate" : "28-AUG-2013" }, { "DocTypeDesc" : "Indigenous Peoples Plan (IP), Vol.1 of 1", "DocType" : "IP", "EntityID" : "000442464_20130920111729", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000442464_20130920111729", "DocDate" : "01-JUL-2013" }, { "DocTypeDesc" : "Project Information Document (PID), Vol.", "DocType" : "PID", "EntityID" : "090224b0817b19e2_1_0", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=090224b0817b19e2_1_0", "DocDate" : "22-NOV-2012" } ], "projectfinancialtype" : "IDA", "projectstatusdisplay" : "Active", "regionname" : "Africa", "sector" : [ { "Name" : "Primary education" }, { "Name" : "Secondary education" }, { "Name" : "Public administration- Other social services" }, { "Name" : "Tertiary education" } ], "sector1" : { "Name" : "Primary education", "Percent" : 46 }, "sector2" : { "Name" : "Secondary education", "Percent" : 26 }, "sector3" : { "Name" : "Public administration- Other social services", "Percent" : 16 }, "sector4" : { "Name" : "Tertiary education", "Percent" : 12 }, "sector_namecode" : [ { "name" : "Primary education", "code" : "EP" }, { "name" : "Secondary education", "code" : "ES" }, { "name" : "Public administration- Other social services", "code" : "BS" }, { "name" : "Tertiary education", "code" : "ET" } ], "sectorcode" : "ET,BS,ES,EP", "source" : "IBRD", "status" : "Active", "supplementprojectflg" : "N", "theme1" : { "Name" : "Education for all", "Percent" : 100 }, "theme_namecode" : [ { "name" : "Education for all", "code" : "65" } ], "themecode" : "65", "totalamt" : 130000000, "totalcommamt" : 130000000, "url" : "http://www.worldbank.org/projects/P129828/ethiopia-general-education-quality-improvement-project-ii?lang=en" }, 2 | { "_id" : { "$oid" : "52b213b38594d8a2be17c781" }, "approvalfy" : 2015, "board_approval_month" : "November", "boardapprovaldate" : "2013-11-04T00:00:00Z", "borrower" : "GOVERNMENT OF TUNISIA", "country_namecode" : "Republic of Tunisia!$!TN", "countrycode" : "TN", "countryname" : "Republic of Tunisia", "countryshortname" : "Tunisia", "docty" : "Project Information Document,Integrated Safeguards Data Sheet,Integrated Safeguards Data Sheet,Project Information Document,Integrated Safeguards Data Sheet,Project Information Document", "envassesmentcategorycode" : "C", "grantamt" : 4700000, "ibrdcommamt" : 0, "id" : "P144674", "idacommamt" : 0, "impagency" : "MINISTRY OF FINANCE", "lendinginstr" : "Specific Investment Loan", "lendinginstrtype" : "IN", "lendprojectcost" : 5700000, "majorsector_percent" : [ { "Name" : "Public Administration, Law, and Justice", "Percent" : 70 }, { "Name" : "Public Administration, Law, and Justice", "Percent" : 30 } ], "mjsector_namecode" : [ { "name" : "Public Administration, Law, and Justice", "code" : "BX" }, { "name" : "Public Administration, Law, and Justice", "code" : "BX" } ], "mjtheme" : [ "Economic management", "Social protection and risk management" ], "mjtheme_namecode" : [ { "name" : "Economic management", "code" : "1" }, { "name" : "Social protection and risk management", "code" : "6" } ], "mjthemecode" : "1,6", "prodline" : "RE", "prodlinetext" : "Recipient Executed Activities", "productlinetype" : "L", 
"project_name" : "TN: DTF Social Protection Reforms Support", "projectdocs" : [ { "DocTypeDesc" : "Project Information Document (PID), Vol.1 of 1", "DocType" : "PID", "EntityID" : "000333037_20131024115616", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000333037_20131024115616", "DocDate" : "29-MAR-2013" }, { "DocTypeDesc" : "Integrated Safeguards Data Sheet (ISDS), Vol.1 of 1", "DocType" : "ISDS", "EntityID" : "000356161_20131024151611", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000356161_20131024151611", "DocDate" : "29-MAR-2013" }, { "DocTypeDesc" : "Integrated Safeguards Data Sheet (ISDS), Vol.1 of 1", "DocType" : "ISDS", "EntityID" : "000442464_20131031112136", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000442464_20131031112136", "DocDate" : "29-MAR-2013" }, { "DocTypeDesc" : "Project Information Document (PID), Vol.1 of 1", "DocType" : "PID", "EntityID" : "000333037_20131031105716", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000333037_20131031105716", "DocDate" : "29-MAR-2013" }, { "DocTypeDesc" : "Integrated Safeguards Data Sheet (ISDS), Vol.1 of 1", "DocType" : "ISDS", "EntityID" : "000356161_20130305113209", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000356161_20130305113209", "DocDate" : "16-JAN-2013" }, { "DocTypeDesc" : "Project Information Document (PID), Vol.1 of 1", "DocType" : "PID", "EntityID" : "000356161_20130305113716", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000356161_20130305113716", "DocDate" : "16-JAN-2013" } ], "projectfinancialtype" : "OTHER", "projectstatusdisplay" : "Active", "regionname" : "Middle East and North Africa", "sector" : [ { "Name" : "Public administration- Other social services" }, { "Name" : "General public administration sector" } ], "sector1" : { "Name" : "Public administration- Other social services", "Percent" : 70 }, "sector2" : { "Name" : "General public administration sector", "Percent" : 30 }, "sector_namecode" : [ { "name" : "Public administration- Other social services", "code" : "BS" }, { "name" : "General public administration sector", "code" : "BZ" } ], "sectorcode" : "BZ,BS", "source" : "IBRD", "status" : "Active", "supplementprojectflg" : "N", "theme1" : { "Name" : "Other economic management", "Percent" : 30 }, "theme_namecode" : [ { "name" : "Other economic management", "code" : "24" }, { "name" : "Social safety nets", "code" : "54" } ], "themecode" : "54,24", "totalamt" : 0, "totalcommamt" : 4700000, "url" : "http://www.worldbank.org/projects/P144674?lang=en" } 3 | ] 4 | -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/BazeleyMikiko_Sec5_3_sql_project_2018Nov24.sql: -------------------------------------------------------------------------------- 1 | -- Student: Mikiko Bazeley 2 | -- Date: 11/24/2018 3 | -- Sec 5.3 - SQL Mini Project 4 | 5 | 6 | 7 | 8 | /* Q1: Some of the facilities charge a fee to members, but some do not. 9 | Please list the names of the facilities that do. */ 10 | 11 | select f.name 12 | from facilities f 13 | where f.membercost > 0; 14 | 15 | /* Q2: How many facilities do not charge a fee to members? 
*/ 16 | 17 | select count(*) 18 | from facilities f 19 | where f.membercost = 0; 20 | 21 | 22 | /* Q3: How can you produce a list of facilities that charge a fee to members, 23 | where the fee is less than 20% of the facility's monthly maintenance cost? 24 | Return the facid, facility name, member cost, and monthly maintenance of the 25 | facilities in question. */ 26 | 27 | select f.facid, f.name, f.membercost, f.monthlymaintenance 28 | from facilities f 29 | where f.membercost < (0.2*f.monthlymaintenance); 30 | 31 | 32 | /* Q4: How can you retrieve the details of facilities with ID 1 and 5? 33 | Write the query without using the OR operator. */ 34 | 35 | select * 36 | from facilities f 37 | where f.facid in (1,5); 38 | 39 | /* Q5: How can you produce a list of facilities, with each labelled as 40 | 'cheap' or 'expensive', depending on if their monthly maintenance cost is 41 | more than $100? Return the name and monthly maintenance of the facilities 42 | in question. */ 43 | 44 | select f.name, 45 | f.monthlymaintenance, 46 | case when (f.monthlymaintenance>100) then 'expensive' else 'cheap' end label 47 | from facilities f; 48 | 49 | /* Q6: You'd like to get the first and last name of the last member(s) 50 | who signed up. Do not use the LIMIT clause for your solution. */ 51 | 52 | select m.firstname, m.surname 53 | from members m 54 | where m.joindate = (select max(joindate) from members); 55 | 56 | /* Q7: How can you produce a list of all members who have used a tennis court? 57 | Include in your output the name of the court, and the name of the member 58 | formatted as a single column. Ensure no duplicate data, and order by 59 | the member name. */ 60 | 61 | select f2.name, m2.firstname ||' ' || m2.surname as fullName 62 | from 63 | (select b1.facid, b1.memid 64 | from bookings b1 65 | where b1.facid in 66 | (select distinct f1.facid 67 | from facilities f1 68 | where f1.name like ('Tennis Court%') 69 | ) 70 | group by b1.facid, b1.memid) b2 71 | left join facilities f2 on f2.facid = b2.facid 72 | left join members m2 on m2.memid = b2.memid 73 | order by (m2.firstname || m2.surname) desc ; 74 | 75 | /* Q8: How can you produce a list of bookings on the day of 2012-09-14 which 76 | will cost the member (or guest) more than $30? Remember that guests have 77 | different costs to members (the listed costs are per half-hour 'slot'), and 78 | the guest user's ID is always 0. Include in your output the name of the 79 | facility, the name of the member formatted as a single column, and the cost. 80 | Order by descending cost, and do not use any subqueries. */ 81 | 82 | select f.name, 83 | (m.firstname ||' '|| m.surname) as fullname, 84 | case when b.memid = 0 then b.slots * f.guestcost else b.slots * f.membercost end as cost 85 | -- , 86 | -- b.memid, 87 | -- b.slots, 88 | -- f.guestcost, 89 | -- f.membercost 90 | from bookings b 91 | left join facilities f on b.facid=f.facid 92 | left join members m on b.memid = m.memid 93 | where date(b.starttime) = date('2012-09-14') 94 | and (cost>30) 95 | order by cost desc; 96 | 97 | /* Q9: This time, produce the same result as in Q8, but using a subquery.
*/ 98 | 99 | select t.name, t.fullname,t.cost 100 | from 101 | (select f.name, 102 | (m.firstname ||' '|| m.surname) as fullname, 103 | case when b.memid = 0 then b.slots * f.guestcost else b.slots * f.membercost end as cost, 104 | date(b.starttime) as date 105 | from bookings b 106 | left join facilities f on b.facid=f.facid 107 | left join members m on b.memid = m.memid) t 108 | where t.date = date('2012-09-14') 109 | and (t.cost>30) 110 | order by t.cost desc; 111 | 112 | 113 | /* Q10: Produce a list of facilities with a total revenue less than 1000. 114 | The output of facility name and total revenue, sorted by revenue. Remember 115 | that there's a different cost for guests and members! */ 116 | 117 | select 118 | f.name, 119 | sum(case when b.memid = 0 then b.slots * f.guestcost else b.slots * f.membercost end) as revenue 120 | from bookings b 121 | left join facilities f on b.facid=f.facid 122 | group by f.name 123 | having revenue < 1000 124 | order by revenue desc; -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/DSC Mini-Project_ SQL Rubric.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/DSC Mini-Project_ SQL Rubric.docx -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/bookings_table_constraints.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/bookings_table_constraints.PNG -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/bookings_table_data.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/bookings_table_data.PNG -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/countryClub_facilities.csv: -------------------------------------------------------------------------------- 1 | facid,name,membercost,guestcost,initialoutlay,monthlymaintenance 2 | 0,Tennis Court 1,5,25,10000,200 3 | 1,Tennis Court 2,5,25,8000,200 4 | 2,Badminton Court,0,15.5,4000,50 5 | 3,Table Tennis,0,5,320,10 6 | 4,Massage Room 1,9.9,80,4000,3000 7 | 5,Massage Room 2,9.9,80,4000,3000 8 | 6,Squash Court,3.5,17.5,5000,80 9 | 7,Snooker Table,0,5,450,15 10 | 8,Pool Table,0,5,400,15 11 | -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/countryClub_members.csv: -------------------------------------------------------------------------------- 1 | memid,surname,firstname,address,zipcode,telephone,recommendedby,joindate 2 | 0,GUEST,GUEST,GUEST,0,(000) 000-0000,,7/1/2012 0:00 3 | 1,Smith,Darren,"8 Bloomsbury Close, Boston",4321,555-555-5555,,7/2/2012 12:02 4 | 2,Smith,Tracy,"8 Bloomsbury Close, New 
York",4321,555-555-5555,,7/2/2012 12:08 5 | 3,Rownam,Tim,"23 Highway Way, Boston",23423,(844) 693-0723,,7/3/2012 9:32 6 | 4,Joplette,Janice,"20 Crossing Road, New York",234,(833) 942-4710,1,7/3/2012 10:25 7 | 5,Butters,Gerald,"1065 Huntingdon Avenue, Boston",56754,(844) 078-4130,1,7/9/2012 10:44 8 | 6,Tracy,Burton,"3 Tunisia Drive, Boston",45678,(822) 354-9973,,7/15/2012 8:52 9 | 7,Dare,Nancy,"6 Hunting Lodge Way, Boston",10383,(833) 776-4001,4,7/25/2012 8:59 10 | 8,Boothe,Tim,"3 Bloomsbury Close, Reading, 00234",234,(811) 433-2547,3,7/25/2012 16:02 11 | 9,Stibbons,Ponder,"5 Dragons Way, Winchester",87630,(833) 160-3900,6,7/25/2012 17:09 12 | 10,Owen,Charles,"52 Cheshire Grove, Winchester, 28563",28563,(855) 542-5251,1,8/3/2012 19:42 13 | 11,Jones,David,"976 Gnats Close, Reading",33862,(844) 536-8036,4,8/6/2012 16:32 14 | 12,Baker,Anne,"55 Powdery Street, Boston",80743,844-076-5141,9,8/10/2012 14:23 15 | 13,Farrell,Jemima,"103 Firth Avenue, North Reading",57392,(855) 016-0163,,8/10/2012 14:28 16 | 14,Smith,Jack,"252 Binkington Way, Boston",69302,(822) 163-3254,1,8/10/2012 16:22 17 | 15,Bader,Florence,"264 Ursula Drive, Westford",84923,(833) 499-3527,9,8/10/2012 17:52 18 | 16,Baker,Timothy,"329 James Street, Reading",58393,833-941-0824,13,8/15/2012 10:34 19 | 17,Pinker,David,"5 Impreza Road, Boston",65332,811 409-6734,13,8/16/2012 11:32 20 | 20,Genting,Matthew,"4 Nunnington Place, Wingfield, Boston",52365,(811) 972-1377,5,8/19/2012 14:55 21 | 21,Mackenzie,Anna,"64 Perkington Lane, Reading",64577,(822) 661-2898,1,8/26/2012 9:32 22 | 22,Coplin,Joan,"85 Bard Street, Bloomington, Boston",43533,(822) 499-2232,16,8/29/2012 8:32 23 | 24,Sarwin,Ramnaresh,"12 Bullington Lane, Boston",65464,(822) 413-1470,15,9/1/2012 8:44 24 | 26,Jones,Douglas,"976 Gnats Close, Reading",11986,844 536-8036,11,9/2/2012 18:43 25 | 27,Rumney,Henrietta,"3 Burkington Plaza, Boston",78533,(822) 989-8876,20,9/5/2012 8:42 26 | 28,Farrell,David,"437 Granite Farm Road, Westford",43532,(855) 755-9876,,9/15/2012 8:22 27 | 29,Worthington-Smyth,Henry,"55 Jagbi Way, North Reading",97676,(855) 894-3758,2,9/17/2012 12:27 28 | 30,Purview,Millicent,"641 Drudgery Close, Burnington, Boston",34232,(855) 941-9786,2,9/18/2012 19:04 29 | 33,Tupperware,Hyacinth,"33 Cheerful Plaza, Drake Road, Westford",68666,(822) 665-5327,,9/18/2012 19:32 30 | 35,Hunt,John,"5 Bullington Lane, Boston",54333,(899) 720-6978,30,9/19/2012 11:32 31 | 36,Crumpet,Erica,"Crimson Road, North Reading",75655,(811) 732-4816,2,9/22/2012 8:36 32 | 37,Smith,Darren,"3 Funktown, Denzington, Boston",66796,(822) 577-3541,,9/26/2012 18:08 33 | -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/country_club_db.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/country_club_db.PNG -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/countryclub.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/countryclub.db 
-------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/facilities_table_constraints.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/facilities_table_constraints.PNG -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/facilities_table_data.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/facilities_table_data.PNG -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/members_table_constraints.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/members_table_constraints.PNG -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/members_table_data.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/members_table_data.PNG -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Project Info/Answers: -------------------------------------------------------------------------------- 1 | /* Welcome to the SQL mini project. For this project, you will use 2 | Springboard' online SQL platform, which you can log into through the 3 | following link: 4 | 5 | https://sql.springboard.com/ 6 | Username: student 7 | Password: learn_sql@springboard 8 | 9 | The data you need is in the "country_club" database. This database 10 | contains 3 tables: 11 | i) the "Bookings" table, 12 | ii) the "Facilities" table, and 13 | iii) the "Members" table. 14 | 15 | Note that, if you need to, you can also download these tables locally. 16 | 17 | In the mini project, you'll be asked a series of questions. You can 18 | solve them using the platform, but for the final deliverable, 19 | paste the code for each solution into this script, and upload it 20 | to your GitHub. 21 | 22 | Before starting with the questions, feel free to take your time, 23 | exploring the data, and getting acquainted with the 3 tables. */ 24 | 25 | 26 | 27 | /* Q1: Some of the facilities charge a fee to members, but some do not. 28 | Please list the names of the facilities that do. */ 29 | 30 | select f.name 31 | from facilities f 32 | where f.membercost > 0; 33 | 34 | 35 | /* Q2: How many facilities do not charge a fee to members? 
*/ 36 | 37 | select count(*) 38 | from facilities f 39 | where f.membercost = 0; 40 | 41 | 42 | /* Q3: How can you produce a list of facilities that charge a fee to members, 43 | where the fee is less than 20% of the facility's monthly maintenance cost? 44 | Return the facid, facility name, member cost, and monthly maintenance of the 45 | facilities in question. */ 46 | 47 | select f.facid, f.name, f.membercost, f.monthlymaintenance 48 | from facilities f 49 | where f.membercost < (0.2*f.monthlymaintenance); 50 | 51 |
52 | /* Q4: How can you retrieve the details of facilities with ID 1 and 5? 53 | Write the query without using the OR operator. */ 54 | 55 | select * 56 | from facilities f 57 | where f.facid in (1,5); 58 | 59 | /* Q5: How can you produce a list of facilities, with each labelled as 60 | 'cheap' or 'expensive', depending on if their monthly maintenance cost is 61 | more than $100? Return the name and monthly maintenance of the facilities 62 | in question. */ 63 | 64 | select f.name, 65 | f.monthlymaintenance, 66 | case when (f.monthlymaintenance>100) then 'expensive' else 'cheap' end label 67 | from facilities f; 68 | 69 | /* Q6: You'd like to get the first and last name of the last member(s) 70 | who signed up. Do not use the LIMIT clause for your solution. */ 71 | 72 | select m.firstname, m.surname 73 | from members m 74 | where m.joindate = (select max(joindate) from members); 75 |
76 | /* Q7: How can you produce a list of all members who have used a tennis court? 77 | Include in your output the name of the court, and the name of the member 78 | formatted as a single column. Ensure no duplicate data, and order by 79 | the member name. */ 80 | 81 | select f2.name, m2.firstname ||' ' || m2.surname as fullName 82 | from 83 | (select b1.facid, b1.memid 84 | from bookings b1 85 | where b1.facid in 86 | (select distinct f1.facid 87 | from facilities f1 88 | where f1.name like ('Tennis Court%') 89 | ) 90 | group by b1.facid, b1.memid) b2 91 | left join facilities f2 on f2.facid = b2.facid 92 | left join members m2 on m2.memid = b2.memid 93 | order by (m2.firstname || m2.surname) desc; 94 |
95 | /* Q8: How can you produce a list of bookings on the day of 2012-09-14 which 96 | will cost the member (or guest) more than $30? Remember that guests have 97 | different costs to members (the listed costs are per half-hour 'slot'), and 98 | the guest user's ID is always 0. Include in your output the name of the 99 | facility, the name of the member formatted as a single column, and the cost. 100 | Order by descending cost, and do not use any subqueries. */ 101 | 102 | select f.name, 103 | (m.firstname ||' '|| m.surname) as fullname, 104 | case when b.memid = 0 then b.slots * f.guestcost else b.slots * f.membercost end as cost 105 | -- , 106 | -- b.memid, 107 | -- b.slots, 108 | -- f.guestcost, 109 | -- f.membercost 110 | from bookings b 111 | left join facilities f on b.facid=f.facid 112 | left join members m on b.memid = m.memid 113 | where date(b.starttime) = date('2012-09-14') 114 | and (cost>30) 115 | order by cost desc; 116 |
117 | /* Q9: This time, produce the same result as in Q8, but using a subquery. */ 118 | 119 | select t.name, t.fullname,t.cost 120 | from 121 | (select f.name, 122 | (m.firstname ||' '|| m.surname) as fullname, 123 | case when b.memid = 0 then b.slots * f.guestcost else b.slots * f.membercost end as cost, 124 | date(b.starttime) as date 125 | from bookings b 126 | left join facilities f on b.facid=f.facid 127 | left join members m on b.memid = m.memid) t 128 | where t.date = date('2012-09-14') 129 | and (t.cost>30) 130 | order by t.cost desc; 131 | 132 |
133 | /* Q10: Produce a list of facilities with a total revenue less than 1000. 134 | The output of facility name and total revenue, sorted by revenue. Remember 135 | that there's a different cost for guests and members! */ 136 | 137 | select 138 | f.name, 139 | sum(case when b.memid = 0 then b.slots * f.guestcost else b.slots * f.membercost end) as revenue 140 | from bookings b 141 | left join facilities f on b.facid=f.facid 142 | group by f.name 143 | having revenue < 1000 144 | order by revenue desc; 145 | -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Project Info/Intro: -------------------------------------------------------------------------------- 1 | Project: SQL Practice 2 | 3 - 5 Hours 3 | 4 | Steps: 5 | 6 | Download the SQL file and follow the instructions to log into the provided SQL platform. 7 | Fill in your answers to the questions in the SQL file. 8 | Add your SQL file to a GitHub repository and submit a link to it. 9 | 10 | Your project will be evaluated using this rubric (https://docs.google.com/document/d/1xR6CGuZLrzAAA2z_s_vhuOwYykq0cg2uDhsn2H3czo4/edit). 11 | 12 | Download project file(s).
13 | 14 | Learning Objective 15 | Work with SQL-based databases 16 | Learn and write basic SQL queries up to basic aggregations and joins 17 | Comment on SQL code 18 | 19 | Criteria 20 | Meets Expectations 21 | Completion 22 | The code runs successfully. 23 | 24 | 25 | Process and understanding 26 | The submission shows that the correct solutions to all of the 10 problems have been produced. 27 | The submission demonstrates an understanding of the various types of joins, aggregations, filters, and subqueries. 28 | 29 | 30 | Presentation 31 | The project is delivered as a .sql file (as stated in the instructions), and uploaded to GitHub. 32 | 33 | 34 | Excellence: Student downloads the raw data files, sets up a local database, loads in the data, and uses a Jupyter notebook to set up a connection and query the data. 35 | -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Project Info/Project Requirements & Questions: -------------------------------------------------------------------------------- 1 | /* Welcome to the SQL mini project. For this project, you will use 2 | Springboard' online SQL platform, which you can log into through the 3 | following link: 4 | 5 | 6 | The data you need is in the "country_club" database. This database 7 | contains 3 tables: 8 | i) the "Bookings" table, 9 | ii) the "Facilities" table, and 10 | iii) the "Members" table. 11 | 12 | Note that, if you need to, you can also download these tables locally. 13 | 14 | In the mini project, you'll be asked a series of questions. You can 15 | solve them using the platform, but for the final deliverable, 16 | paste the code for each solution into this script, and upload it 17 | to your GitHub. 18 | 19 | Before starting with the questions, feel free to take your time, 20 | exploring the data, and getting acquainted with the 3 tables. */ 21 | 22 | 23 | 24 | /* Q1: Some of the facilities charge a fee to members, but some do not. 25 | Please list the names of the facilities that do. */ 26 | 27 | 28 | /* Q2: How many facilities do not charge a fee to members? */ 29 | 30 | 31 | /* Q3: How can you produce a list of facilities that charge a fee to members, 32 | where the fee is less than 20% of the facility's monthly maintenance cost? 33 | Return the facid, facility name, member cost, and monthly maintenance of the 34 | facilities in question. */ 35 | 36 | 37 | /* Q4: How can you retrieve the details of facilities with ID 1 and 5? 38 | Write the query without using the OR operator. */ 39 | 40 | 41 | /* Q5: How can you produce a list of facilities, with each labelled as 42 | 'cheap' or 'expensive', depending on if their monthly maintenance cost is 43 | more than $100? Return the name and monthly maintenance of the facilities 44 | in question. */ 45 | 46 | 47 | /* Q6: You'd like to get the first and last name of the last member(s) 48 | who signed up. Do not use the LIMIT clause for your solution. */ 49 | 50 | 51 | /* Q7: How can you produce a list of all members who have used a tennis court? 52 | Include in your output the name of the court, and the name of the member 53 | formatted as a single column. Ensure no duplicate data, and order by 54 | the member name. */ 55 | 56 | 57 | /* Q8: How can you produce a list of bookings on the day of 2012-09-14 which 58 | will cost the member (or guest) more than $30? Remember that guests have 59 | different costs to members (the listed costs are per half-hour 'slot'), and 60 | the guest user's ID is always 0. 
Include in your output the name of the 61 | facility, the name of the member formatted as a single column, and the cost. 62 | Order by descending cost, and do not use any subqueries. */ 63 | 64 | 65 | /* Q9: This time, produce the same result as in Q8, but using a subquery. */ 66 | 67 | 68 | /* Q10: Produce a list of facilities with a total revenue less than 1000. 69 | The output of facility name and total revenue, sorted by revenue. Remember 70 | that there's a different cost for guests and members! */ 71 | -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Project Info/Resources: -------------------------------------------------------------------------------- 1 | http://www.sqlitetutorial.net/download-install-sqlite/ 2 | 3 | http://www.sqlitetutorial.net/sqlite-commands/ 4 | 5 | http://www.sqlitetutorial.net/sqlite-import-csv/ 6 | 7 | 8 | http://www.sqlitetutorial.net/sqlite-python/creating-database/ 9 | 10 | 11 | http://www.sqlitetutorial.net/sqlite-import-csv/ 12 | -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Project Info/dummy: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.4 API/dummy.txt: -------------------------------------------------------------------------------- 1 | . 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/EDA_HumanTemp/dummy.tx: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/EDA_HumanTemp/human_body_temperature.csv: -------------------------------------------------------------------------------- 1 | temperature,gender,heart_rate 2 | 99.3,F,68.0 3 | 98.4,F,81.0 4 | 97.8,M,73.0 5 | 99.2,F,66.0 6 | 98.0,F,73.0 7 | 99.2,M,83.0 8 | 98.0,M,71.0 9 | 98.8,M,78.0 10 | 98.4,F,84.0 11 | 98.6,F,86.0 12 | 98.8,F,89.0 13 | 96.7,F,62.0 14 | 98.2,M,72.0 15 | 98.7,F,79.0 16 | 97.8,F,77.0 17 | 98.8,F,83.0 18 | 98.3,F,79.0 19 | 98.2,M,64.0 20 | 97.2,F,68.0 21 | 99.4,M,70.0 22 | 98.3,F,78.0 23 | 98.2,M,71.0 24 | 98.6,M,70.0 25 | 98.4,M,68.0 26 | 97.8,M,65.0 27 | 98.0,F,87.0 28 | 97.8,F,62.0 29 | 98.2,F,69.0 30 | 98.4,F,73.0 31 | 98.1,M,67.0 32 | 98.3,M,86.0 33 | 97.6,F,61.0 34 | 98.5,M,71.0 35 | 98.6,M,82.0 36 | 99.3,M,63.0 37 | 99.5,M,75.0 38 | 99.1,M,71.0 39 | 98.3,M,72.0 40 | 97.9,F,79.0 41 | 96.4,F,69.0 42 | 98.4,F,79.0 43 | 98.4,M,82.0 44 | 96.9,M,74.0 45 | 97.2,M,64.0 46 | 99.0,F,79.0 47 | 97.9,F,69.0 48 | 97.4,M,72.0 49 | 97.4,M,68.0 50 | 97.9,M,76.0 51 | 97.1,M,82.0 52 | 98.9,F,76.0 53 | 98.3,F,80.0 54 | 98.5,F,83.0 55 | 98.6,M,78.0 56 | 98.2,F,73.0 57 | 98.6,F,82.0 58 | 98.8,F,70.0 59 | 98.2,M,66.0 60 | 98.2,F,65.0 61 | 97.6,M,73.0 62 | 99.1,F,80.0 63 | 98.4,M,84.0 64 | 98.2,F,57.0 65 | 98.6,M,83.0 66 | 98.7,F,65.0 67 | 97.4,M,70.0 68 | 97.4,F,57.0 69 | 98.6,M,77.0 70 | 98.7,F,82.0 71 | 98.9,M,80.0 72 | 98.1,F,81.0 73 | 97.7,F,61.0 74 | 98.0,M,78.0 75 | 98.8,M,81.0 76 | 99.0,M,75.0 77 | 98.8,M,78.0 78 | 98.0,F,76.0 79 | 98.4,M,70.0 80 | 97.4,M,78.0 81 | 97.6,M,74.0 82 | 98.8,F,73.0 83 | 98.0,M,67.0 84 | 97.5,M,70.0 85 | 99.2,F,77.0 86 | 
98.6,F,85.0 87 | 97.1,M,75.0 88 | 98.6,F,77.0 89 | 98.0,M,78.0 90 | 98.7,M,73.0 91 | 98.1,M,73.0 92 | 97.8,M,74.0 93 | 100.0,F,78.0 94 | 98.8,F,84.0 95 | 97.1,M,73.0 96 | 97.8,M,58.0 97 | 96.8,F,75.0 98 | 99.9,F,79.0 99 | 98.7,F,64.0 100 | 98.8,F,64.0 101 | 98.0,M,74.0 102 | 99.0,M,81.0 103 | 98.5,M,68.0 104 | 98.0,F,78.0 105 | 99.4,F,77.0 106 | 97.6,M,69.0 107 | 96.7,M,71.0 108 | 97.0,M,80.0 109 | 98.6,M,66.0 110 | 98.7,F,72.0 111 | 97.3,M,69.0 112 | 98.8,F,69.0 113 | 98.0,F,89.0 114 | 98.2,F,64.0 115 | 99.1,F,74.0 116 | 99.0,M,79.0 117 | 98.0,M,64.0 118 | 100.8,F,77.0 119 | 97.8,F,71.0 120 | 98.7,M,78.0 121 | 98.4,F,74.0 122 | 97.7,F,84.0 123 | 97.9,F,68.0 124 | 99.0,F,81.0 125 | 97.2,F,66.0 126 | 97.5,M,75.0 127 | 96.3,M,70.0 128 | 97.7,M,77.0 129 | 98.2,F,73.0 130 | 97.9,M,72.0 131 | 98.7,F,59.0 132 | -------------------------------------------------------------------------------- /mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/Racial Discrimination/BazeleyMikiko_MiniProject_RacialDiscrimination-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Examining Racial Discrimination in the US Job Market\n", 8 | "\n", 9 | "### Background\n", 10 | "Racial discrimination continues to be pervasive in cultures throughout the world. Researchers examined the level of racial discrimination in the United States labor market by randomly assigning identical résumés to black-sounding or white-sounding names and observing the impact on requests for interviews from employers.\n", 11 | "\n", 12 | "### Data\n", 13 | "In the dataset provided, each row represents a resume. The 'race' column has two values, 'b' and 'w', indicating black-sounding and white-sounding. The column 'call' has two values, 1 and 0, indicating whether the resume received a call from employers or not.\n", 14 | "\n", 15 | "Note that the 'b' and 'w' values in race are assigned randomly to the resumes when presented to the employer." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "### Exercises\n", 23 | "You will perform a statistical analysis to establish whether race has a significant impact on the rate of callbacks for resumes.\n", 24 | "\n", 25 | "Answer the following questions **in this notebook below and submit to your Github account**. \n", 26 | "\n", 27 | " 1. What test is appropriate for this problem? Does CLT apply?\n", 28 | " 2. What are the null and alternate hypotheses?\n", 29 | " 3. Compute margin of error, confidence interval, and p-value. Try using both the bootstrapping and the frequentist statistical approaches.\n", 30 | " 4. Write a story describing the statistical significance in the context or the original problem.\n", 31 | " 5. Does your analysis mean that race/name is the most important factor in callback success? Why or why not? 
If not, how would you amend your analysis?\n", 32 | "\n", 33 | "You can include written notes in notebook cells using Markdown: \n", 34 | " - In the control panel at the top, choose Cell > Cell Type > Markdown\n", 35 | " - Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 36 | "\n", 37 | "#### Resources\n", 38 | "+ Experiment information and data source: http://www.povertyactionlab.org/evaluation/discrimination-job-market-united-states\n", 39 | "+ Scipy statistical methods: http://docs.scipy.org/doc/scipy/reference/stats.html \n", 40 | "+ Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 41 | "+ Formulas for the Bernoulli distribution: https://en.wikipedia.org/wiki/Bernoulli_distribution" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 1, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "import pandas as pd\n", 51 | "import numpy as np\n", 52 | "from scipy import stats" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "data = pd.io.stata.read_stata('data/us_job_market_discrimination.dta')" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 3, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "235.0\n", 74 | "157.0\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "# number of callbacks for black-sounding names\n", 80 | "print(sum(data[data.race=='w'].call))\n", 81 | "\n", 82 | "print(sum(data[data.race=='b'].call))" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 4, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/html": [ 93 | "
\n", 94 | "\n", 107 | "\n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | "
idadeducationofjobsyearsexphonorsvolunteermilitaryempholesoccupspecific...compreqorgreqmanuftranscombankrealtradebusserviceothservicemissindownership
0b1426000117...1.00.01.00.00.00.00.00.00.0
1b13360110316...1.00.01.00.00.00.00.00.00.0
2b1416000019...1.00.01.00.00.00.00.00.00.0
3b13460101313...1.00.01.00.00.00.00.00.00.0
4b133220000313...1.01.00.00.00.00.00.01.00.0Nonprofit
\n", 257 | "

5 rows × 65 columns

\n", 258 | "
" 259 | ], 260 | "text/plain": [ 261 | " id ad education ofjobs yearsexp honors volunteer military empholes \\\n", 262 | "0 b 1 4 2 6 0 0 0 1 \n", 263 | "1 b 1 3 3 6 0 1 1 0 \n", 264 | "2 b 1 4 1 6 0 0 0 0 \n", 265 | "3 b 1 3 4 6 0 1 0 1 \n", 266 | "4 b 1 3 3 22 0 0 0 0 \n", 267 | "\n", 268 | " occupspecific ... compreq orgreq manuf transcom bankreal trade \\\n", 269 | "0 17 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n", 270 | "1 316 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n", 271 | "2 19 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n", 272 | "3 313 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n", 273 | "4 313 ... 1.0 1.0 0.0 0.0 0.0 0.0 \n", 274 | "\n", 275 | " busservice othservice missind ownership \n", 276 | "0 0.0 0.0 0.0 \n", 277 | "1 0.0 0.0 0.0 \n", 278 | "2 0.0 0.0 0.0 \n", 279 | "3 0.0 0.0 0.0 \n", 280 | "4 0.0 1.0 0.0 Nonprofit \n", 281 | "\n", 282 | "[5 rows x 65 columns]" 283 | ] 284 | }, 285 | "execution_count": 4, 286 | "metadata": {}, 287 | "output_type": "execute_result" 288 | } 289 | ], 290 | "source": [ 291 | "data.head()" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": { 297 | "collapsed": true 298 | }, 299 | "source": [ 300 | "## 1. What test is appropriate for this problem? Does CLT apply?\n", 301 | "\n", 302 | "The problem we are trying to solve is whether there is a statistically significant difference between callbacks for white sounding names and black sounding names. \n", 303 | "\n", 304 | "The CLT states that given a sufficiently large sample size from a population with a finite level of variance, the mean of all samples from the same population will be approximatelt equal to the mean of the population. Specifically, as the sample sizes get larger, the distributio of means caculated from repeated sampling will approach normality. \n", 305 | "\n", 306 | "Another way to interpret the Cnetral Limit Theorem is that if we repeatedly take independent random samples of size n from any population, then when n is large, the distribution of the sample means will approach a normal distribution. \n", 307 | "\n", 308 | "We can see that we have more than 300+ samples that were randomly assigned b/w names, so we can assume that the based on sample size and independence of draw that CLT would apply. \n", 309 | "\n", 310 | "Given CLT applies, we should also be able to use a 2 sampel t-test. Given we have a large sample size but don't know the population standard deviation, this seems to be the most appropriate test.\n", 311 | "\n", 312 | "\n", 313 | "## 2. What are the null and alternate hypotheses?\n", 314 | "\n", 315 | "\n", 316 | "\n", 317 | "Ho: (Proportion of W called back) == (Proportion of B called back)\n", 318 | "\n", 319 | "\n", 320 | "Ha: (Proportion of W called back) =/= (Proportion of B called back)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 5, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "w = data[data.race=='w']\n", 330 | "b = data[data.race=='b']" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 6, 336 | "metadata": {}, 337 | "outputs": [ 338 | { 339 | "name": "stdout", 340 | "output_type": "stream", 341 | "text": [ 342 | "Z score: 4.108412152434346\n", 343 | "P-value: 3.983886837585077e-05\n" 344 | ] 345 | } 346 | ], 347 | "source": [ 348 | "# Your solution to Q3 here\n", 349 | "# Compute margin of error, confidence interval, and p-value. 
\n", 350 | "# Try using both the bootstrapping and the frequentist statistical approaches.\n", 351 | "\n", 352 | "\n", 353 | "#Frequentist Approach\n", 354 | "\n", 355 | "n_w = len(w)\n", 356 | "n_b = len(b)\n", 357 | "\n", 358 | "n_w_c = np.sum(w.call)\n", 359 | "n_b_c = np.sum(b.call)\n", 360 | "\n", 361 | "prop_w = n_w_c / n_w\n", 362 | "prop_b = n_b_c / n_b\n", 363 | "\n", 364 | "prop_diff = prop_w - prop_b\n", 365 | "phat = (n_w_c + n_b_c) / (n_w + n_b)\n", 366 | "\n", 367 | "z = prop_diff / np.sqrt(phat * (1 - phat) * ((1 / n_w) + (1 / n_b)))\n", 368 | "pval = stats.norm.cdf(-z) * 2\n", 369 | "\n", 370 | "\n", 371 | "print(\"Z score: {}\".format(z))\n", 372 | "print(\"P-value: {}\".format(pval))" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "metadata": {}, 378 | "source": [ 379 | "The p-value is fairly low using the frquentist approach, so we may want to validate using a hacker stats approach." 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 7, 385 | "metadata": {}, 386 | "outputs": [ 387 | { 388 | "name": "stdout", 389 | "output_type": "stream", 390 | "text": [ 391 | "p = 0.0\n" 392 | ] 393 | } 394 | ], 395 | "source": [ 396 | "# Hacker stats approach\n", 397 | "\n", 398 | "\n", 399 | "permutation_replicates = np.empty(100000)\n", 400 | "\n", 401 | "white = data[data.race == 'w'].call.values\n", 402 | "black = data[data.race == 'b'].call.values\n", 403 | "diff_of_means = np.mean(white) - np.mean(black)\n", 404 | "\n", 405 | "for i in range(len(permutation_replicates)):\n", 406 | " permutation_samples = np.random.permutation(np.concatenate((white, black)))\n", 407 | " \n", 408 | " white_perm = permutation_samples[:len(white)]\n", 409 | " black_perm = permutation_samples[len(white):]\n", 410 | " \n", 411 | " permutation_replicates[i] = np.abs(np.mean(white_perm) - np.mean(black_perm))\n", 412 | "\n", 413 | "p = np.sum(permutation_replicates > diff_of_means) / len(permutation_replicates)\n", 414 | "\n", 415 | "\n", 416 | "print('p =', p)" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "P-value here is alos fairly low." 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 8, 429 | "metadata": {}, 430 | "outputs": [ 431 | { 432 | "name": "stdout", 433 | "output_type": "stream", 434 | "text": [ 435 | "Margin of Error: 0.015281912310894095\n", 436 | "Confidence interval: [0.01675094 0.04731477]\n" 437 | ] 438 | } 439 | ], 440 | "source": [ 441 | "moe = 1.96 * np.sqrt(phat * (1 - phat) * ((1 / n_w) + (1 / n_b)))\n", 442 | "\n", 443 | "ci = prop_diff + np.array([-1, 1]) * moe\n", 444 | "\n", 445 | "\n", 446 | "print(\"Margin of Error: {}\".format(moe))\n", 447 | "print(\"Confidence interval: {}\".format(ci))" 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": { 453 | "collapsed": true 454 | }, 455 | "source": [ 456 | "## 4. Write a story describing the statistical significance in the context or the original problem.\n", 457 | "\n", 458 | "\n", 459 | "\n", 460 | "P value is low still so we reject the null hypothesis that black and white sounding names have the same callback rates.\n" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | "## 5. Does your analysis mean that race/name is the most important factor in callback success? Why or why not? If not, how would you amend your analysis?\n", 468 | "\n", 469 | "We'd need to understand the impact of the other variables present. 
Even though names were randomly distributed, there could be other variables that might suffer from systemic bias or confound the results. \n", 470 | "\n", 471 | "Example: Gender, Education, Geography, etc. Another interesting question: among the resumes assigned 'b' names that did get callbacks, were there significant differences from their counterparts that did not get callbacks? " 472 | ] 473 | } 474 | ], 475 | "metadata": { 476 | "kernelspec": { 477 | "display_name": "Python 3", 478 | "language": "python", 479 | "name": "python3" 480 | }, 481 | "language_info": { 482 | "codemirror_mode": { 483 | "name": "ipython", 484 | "version": 3 485 | }, 486 | "file_extension": ".py", 487 | "mimetype": "text/x-python", 488 | "name": "python", 489 | "nbconvert_exporter": "python", 490 | "pygments_lexer": "ipython3", 491 | "version": "3.7.0" 492 | } 493 | }, 494 | "nbformat": 4, 495 | "nbformat_minor": 1 496 | } 497 | -------------------------------------------------------------------------------- /mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/Racial Discrimination/BazeleyMikiko_MiniProject_RacialDiscrimination.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Examining Racial Discrimination in the US Job Market\n", 8 | "\n", 9 | "### Background\n", 10 | "Racial discrimination continues to be pervasive in cultures throughout the world. Researchers examined the level of racial discrimination in the United States labor market by randomly assigning identical résumés to black-sounding or white-sounding names and observing the impact on requests for interviews from employers.\n", 11 | "\n", 12 | "### Data\n", 13 | "In the dataset provided, each row represents a resume. The 'race' column has two values, 'b' and 'w', indicating black-sounding and white-sounding. The column 'call' has two values, 1 and 0, indicating whether the resume received a call from employers or not.\n", 14 | "\n", 15 | "Note that the 'b' and 'w' values in race are assigned randomly to the resumes when presented to the employer." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "### Exercises\n", 23 | "You will perform a statistical analysis to establish whether race has a significant impact on the rate of callbacks for resumes.\n", 24 | "\n", 25 | "Answer the following questions **in this notebook below and submit to your Github account**. \n", 26 | "\n", 27 | " 1. What test is appropriate for this problem? Does CLT apply?\n", 28 | " 2. What are the null and alternate hypotheses?\n", 29 | " 3. Compute margin of error, confidence interval, and p-value. Try using both the bootstrapping and the frequentist statistical approaches.\n", 30 | " 4. Write a story describing the statistical significance in the context or the original problem.\n", 31 | " 5. Does your analysis mean that race/name is the most important factor in callback success? Why or why not?
If not, how would you amend your analysis?\n", 32 | "\n", 33 | "You can include written notes in notebook cells using Markdown: \n", 34 | " - In the control panel at the top, choose Cell > Cell Type > Markdown\n", 35 | " - Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 36 | "\n", 37 | "#### Resources\n", 38 | "+ Experiment information and data source: http://www.povertyactionlab.org/evaluation/discrimination-job-market-united-states\n", 39 | "+ Scipy statistical methods: http://docs.scipy.org/doc/scipy/reference/stats.html \n", 40 | "+ Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 41 | "+ Formulas for the Bernoulli distribution: https://en.wikipedia.org/wiki/Bernoulli_distribution" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 1, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "import pandas as pd\n", 51 | "import numpy as np\n", 52 | "from scipy import stats" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "data = pd.io.stata.read_stata('data/us_job_market_discrimination.dta')" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 3, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "235.0\n", 74 | "157.0\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "# number of callbacks for black-sounding names\n", 80 | "print(sum(data[data.race=='w'].call))\n", 81 | "\n", 82 | "print(sum(data[data.race=='b'].call))" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 4, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/html": [ 93 | "
\n", 94 | "\n", 107 | "\n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | "
idadeducationofjobsyearsexphonorsvolunteermilitaryempholesoccupspecific...compreqorgreqmanuftranscombankrealtradebusserviceothservicemissindownership
0b1426000117...1.00.01.00.00.00.00.00.00.0
1b13360110316...1.00.01.00.00.00.00.00.00.0
2b1416000019...1.00.01.00.00.00.00.00.00.0
3b13460101313...1.00.01.00.00.00.00.00.00.0
4b133220000313...1.01.00.00.00.00.00.01.00.0Nonprofit
\n", 257 | "

5 rows × 65 columns

\n", 258 | "
" 259 | ], 260 | "text/plain": [ 261 | " id ad education ofjobs yearsexp honors volunteer military empholes \\\n", 262 | "0 b 1 4 2 6 0 0 0 1 \n", 263 | "1 b 1 3 3 6 0 1 1 0 \n", 264 | "2 b 1 4 1 6 0 0 0 0 \n", 265 | "3 b 1 3 4 6 0 1 0 1 \n", 266 | "4 b 1 3 3 22 0 0 0 0 \n", 267 | "\n", 268 | " occupspecific ... compreq orgreq manuf transcom bankreal trade \\\n", 269 | "0 17 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n", 270 | "1 316 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n", 271 | "2 19 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n", 272 | "3 313 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n", 273 | "4 313 ... 1.0 1.0 0.0 0.0 0.0 0.0 \n", 274 | "\n", 275 | " busservice othservice missind ownership \n", 276 | "0 0.0 0.0 0.0 \n", 277 | "1 0.0 0.0 0.0 \n", 278 | "2 0.0 0.0 0.0 \n", 279 | "3 0.0 0.0 0.0 \n", 280 | "4 0.0 1.0 0.0 Nonprofit \n", 281 | "\n", 282 | "[5 rows x 65 columns]" 283 | ] 284 | }, 285 | "execution_count": 4, 286 | "metadata": {}, 287 | "output_type": "execute_result" 288 | } 289 | ], 290 | "source": [ 291 | "data.head()" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": { 297 | "collapsed": true 298 | }, 299 | "source": [ 300 | "## 1. What test is appropriate for this problem? Does CLT apply?\n", 301 | "\n", 302 | "The problem we are trying to solve is whether there is a statistically significant difference between callbacks for white sounding names and black sounding names. \n", 303 | "\n", 304 | "The CLT states that given a sufficiently large sample size from a population with a finite level of variance, the mean of all samples from the same population will be approximatelt equal to the mean of the population. Specifically, as the sample sizes get larger, the distributio of means caculated from repeated sampling will approach normality. \n", 305 | "\n", 306 | "Another way to interpret the Cnetral Limit Theorem is that if we repeatedly take independent random samples of size n from any population, then when n is large, the distribution of the sample means will approach a normal distribution. \n", 307 | "\n", 308 | "We can see that we have more than 300+ samples that were randomly assigned b/w names, so we can assume that the based on sample size and independence of draw that CLT would apply. \n", 309 | "\n", 310 | "Given CLT applies, we should also be able to use a 2 sampel t-test. Given we have a large sample size but don't know the population standard deviation, this seems to be the most appropriate test.\n", 311 | "\n", 312 | "\n", 313 | "## 2. What are the null and alternate hypotheses?\n", 314 | "\n", 315 | "\n", 316 | "\n", 317 | "Ho: (Proportion of W called back) == (Proportion of B called back)\n", 318 | "\n", 319 | "\n", 320 | "Ha: (Proportion of W called back) =/= (Proportion of B called back)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 5, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "w = data[data.race=='w']\n", 330 | "b = data[data.race=='b']" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 6, 336 | "metadata": {}, 337 | "outputs": [ 338 | { 339 | "name": "stdout", 340 | "output_type": "stream", 341 | "text": [ 342 | "Z score: 4.108412152434346\n", 343 | "P-value: 3.983886837585077e-05\n" 344 | ] 345 | } 346 | ], 347 | "source": [ 348 | "# Your solution to Q3 here\n", 349 | "# Compute margin of error, confidence interval, and p-value. 
\n", 350 | "# Try using both the bootstrapping and the frequentist statistical approaches.\n", 351 | "\n", 352 | "\n", 353 | "#Frequentist Approach\n", 354 | "\n", 355 | "n_w = len(w)\n", 356 | "n_b = len(b)\n", 357 | "\n", 358 | "n_w_c = np.sum(w.call)\n", 359 | "n_b_c = np.sum(b.call)\n", 360 | "\n", 361 | "prop_w = n_w_c / n_w\n", 362 | "prop_b = n_b_c / n_b\n", 363 | "\n", 364 | "prop_diff = prop_w - prop_b\n", 365 | "phat = (n_w_c + n_b_c) / (n_w + n_b)\n", 366 | "\n", 367 | "z = prop_diff / np.sqrt(phat * (1 - phat) * ((1 / n_w) + (1 / n_b)))\n", 368 | "pval = stats.norm.cdf(-z) * 2\n", 369 | "\n", 370 | "\n", 371 | "print(\"Z score: {}\".format(z))\n", 372 | "print(\"P-value: {}\".format(pval))" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "metadata": {}, 378 | "source": [ 379 | "The p-value is fairly low using the frquentist approach, so we may want to validate using a hacker stats approach." 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 7, 385 | "metadata": {}, 386 | "outputs": [ 387 | { 388 | "name": "stdout", 389 | "output_type": "stream", 390 | "text": [ 391 | "p = 0.0\n" 392 | ] 393 | } 394 | ], 395 | "source": [ 396 | "# Hacker stats approach\n", 397 | "\n", 398 | "\n", 399 | "permutation_replicates = np.empty(100000)\n", 400 | "\n", 401 | "white = data[data.race == 'w'].call.values\n", 402 | "black = data[data.race == 'b'].call.values\n", 403 | "diff_of_means = np.mean(white) - np.mean(black)\n", 404 | "\n", 405 | "for i in range(len(permutation_replicates)):\n", 406 | " permutation_samples = np.random.permutation(np.concatenate((white, black)))\n", 407 | " \n", 408 | " white_perm = permutation_samples[:len(white)]\n", 409 | " black_perm = permutation_samples[len(white):]\n", 410 | " \n", 411 | " permutation_replicates[i] = np.abs(np.mean(white_perm) - np.mean(black_perm))\n", 412 | "\n", 413 | "p = np.sum(permutation_replicates > diff_of_means) / len(permutation_replicates)\n", 414 | "\n", 415 | "\n", 416 | "print('p =', p)" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "P-value here is alos fairly low." 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 8, 429 | "metadata": {}, 430 | "outputs": [ 431 | { 432 | "name": "stdout", 433 | "output_type": "stream", 434 | "text": [ 435 | "Margin of Error: 0.015281912310894095\n", 436 | "Confidence interval: [0.01675094 0.04731477]\n" 437 | ] 438 | } 439 | ], 440 | "source": [ 441 | "moe = 1.96 * np.sqrt(phat * (1 - phat) * ((1 / n_w) + (1 / n_b)))\n", 442 | "\n", 443 | "ci = prop_diff + np.array([-1, 1]) * moe\n", 444 | "\n", 445 | "\n", 446 | "print(\"Margin of Error: {}\".format(moe))\n", 447 | "print(\"Confidence interval: {}\".format(ci))" 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": { 453 | "collapsed": true 454 | }, 455 | "source": [ 456 | "## 4. Write a story describing the statistical significance in the context or the original problem.\n", 457 | "\n", 458 | "\n", 459 | "\n", 460 | "P value is low still so we reject the null hypothesis that black and white sounding names have the same callback rates.\n" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | "## 5. Does your analysis mean that race/name is the most important factor in callback success? Why or why not? If not, how would you amend your analysis?\n", 468 | "\n", 469 | "We'd need to understand the impact of the other variables present. 
Even though names were randomly assigned, there could be other variables that suffer from systemic bias or confound the results. \n", 470 | "\n", 471 | "Examples: gender, education, geography, etc. Another interesting question: among the resumes assigned the b label that did receive callbacks, were there significant differences from their counterparts that did not get callbacks? " 472 | ] 473 | } 474 | ], 475 | "metadata": { 476 | "kernelspec": { 477 | "display_name": "Python 3", 478 | "language": "python", 479 | "name": "python3" 480 | }, 481 | "language_info": { 482 | "codemirror_mode": { 483 | "name": "ipython", 484 | "version": 3 485 | }, 486 | "file_extension": ".py", 487 | "mimetype": "text/x-python", 488 | "name": "python", 489 | "nbconvert_exporter": "python", 490 | "pygments_lexer": "ipython3", 491 | "version": "3.7.0" 492 | } 493 | }, 494 | "nbformat": 4, 495 | "nbformat_minor": 1 496 | } 497 | -------------------------------------------------------------------------------- /mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/Racial Discrimination/dummy.txt: -------------------------------------------------------------------------------- 1 | text 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/Racial Discrimination/sliderule_dsi_inferential_statistics_exercise_1_solutions-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# What is the True Normal Human Body Temperature? \n", 8 | "\n", 9 | "#### Background\n", 10 | "\n", 11 | "The mean normal body temperature was held to be 37$^{\\circ}$C or 98.6$^{\\circ}$F for more than 120 years since it was first conceptualized and reported by Carl Wunderlich in a famous 1868 book. But, is this value statistically correct?" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "
\n", 19 | "

Exercises

\n", 20 | "\n", 21 | "

In this exercise, you will analyze a dataset of human body temperatures and employ the concepts of hypothesis testing, confidence intervals, and statistical significance.

\n", 22 | "\n", 23 | "

Answer the following questions in this notebook below and submit to your Github account.

\n", 24 | "\n", 25 | "
    \n", 26 | "
  1. Is the distribution of body temperatures normal? \n", 27 | "
      \n", 28 | "
    • Although this is not a requirement for the Central Limit Theorem to hold (read the introduction on Wikipedia's page about the CLT carefully: https://en.wikipedia.org/wiki/Central_limit_theorem), it gives us some peace of mind that the population may also be normally distributed if we assume that this sample is representative of the population.\n", 29 | "
    • Think about the way you're going to check for the normality of the distribution. Graphical methods are usually used first, but there are also other ways: https://en.wikipedia.org/wiki/Normality_test\n", 30 | "
    \n", 31 | "
  2. Is the sample size large? Are the observations independent?\n", 32 | "
      \n", 33 | "
    • Remember that this is a condition for the Central Limit Theorem, and hence the statistical tests we are using, to apply.\n", 34 | "
    \n", 35 | "
  3. Is the true population mean really 98.6 degrees F?\n", 36 | "
      \n", 37 | "
    • First, try a bootstrap hypothesis test.\n", 38 | "
    • Now, let's try frequentist statistical testing. Would you use a one-sample or two-sample test? Why?\n", 39 | "
    • In this situation, is it appropriate to use the $t$ or $z$ statistic? \n", 40 | "
    • Now try using the other test. How is the result different? Why?\n", 40 | "
    \n", 42 | "
  4. Draw a small sample of size 10 from the data and repeat both frequentist tests. \n", 43 | "
      \n", 44 | "
    • Which one is the correct one to use? \n", 45 | "
    • What do you notice? What does this tell you about the difference in application of the $t$ and $z$ statistic?\n", 46 | "
    \n", 47 | "
  5. At what temperature should we consider someone's temperature to be \"abnormal\"?\n", 48 | "
      \n", 49 | "
    • As in the previous example, try calculating everything using the bootstrap approach, as well as the frequentist approach.\n", 49 | "
    • Start by computing the margin of error and confidence interval. When calculating the confidence interval, keep in mind that you should use the appropriate formula for one draw, and not N draws.\n", 51 | "
    \n", 52 | "
  6. Is there a significant difference between males and females in normal temperature?\n", 53 | "
      \n", 54 | "
    • What testing approach did you use and why?\n", 55 | "
    • Write a story with your conclusion in the context of the original problem.\n", 56 | "
    \n", 57 | "
\n", 58 | "\n", 59 | "You can include written notes in notebook cells using Markdown: \n", 60 | " - In the control panel at the top, choose Cell > Cell Type > Markdown\n", 61 | " - Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 62 | "\n", 63 | "#### Resources\n", 64 | "\n", 65 | "+ Information and data sources: http://www.amstat.org/publications/jse/datasets/normtemp.txt, http://www.amstat.org/publications/jse/jse_data_archive.htm\n", 66 | "+ Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 67 | "\n", 68 | "****" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "import pandas as pd\n", 80 | "import matplotlib.pyplot as plt\n", 81 | "%matplotlib inline\n", 82 | "\n", 83 | "df = pd.read_csv('data/human_body_temperature.csv')" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "
\n", 91 | "

SOLUTION: Is the distribution of body temperatures normal?
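One small prerequisite before running the solution cells below: they call np.mean, np.std, and np.random, but the setup cell above only imports pandas and matplotlib. Adding the NumPy import first keeps everything runnable:

```python
# The plotting and bootstrap cells in this solution use NumPy, which was not
# imported in the setup cell above.
import numpy as np
```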

\n", 92 | "
" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "# First, a histogram\n", 104 | "%matplotlib inline\n", 105 | "plt.hist(df['temperature'])\n", 106 | "plt.xlabel('Temperature')\n", 107 | "plt.ylabel('Frequency')\n", 108 | "plt.title('Histogram of Body Temperature')\n", 109 | "plt.ylim(0, 40) # Add some buffer space at the top so the bar doesn't get cut off." 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "collapsed": true 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "# Next, a quantile plot.\n", 121 | "import statsmodels.api as sm\n", 122 | "mean = np.mean(df['temperature'])\n", 123 | "sd = np.std(df['temperature'])\n", 124 | "z = (df['temperature'] - mean) / sd\n", 125 | "sm.qqplot(z, line='45')" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "collapsed": true 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "# Finally, a normal distribution test. Not recommended!! Use only when you're not sure.\n", 137 | "import scipy.stats as stats\n", 138 | "stats.mstats.normaltest(df['temperature'])" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "
\n", 146 | "

SOLUTION

\n", 147 | "\n", 148 | "

The histogram looks *very roughly* normally distributed. There is an implied bell shape, though there are some values above the mode that occur much less frequently than we would expect under a normal distribution. The shape is not so deviant as to call it some other distribution.

\n", 149 | "\n", 150 | "

A quantile plot can help. The quantile plot computes percentiles for our data and also the percentiles for a normal distribution (mean 0, sd 1). If the quantiles/percentiles for both distributions match, we expect to see a more or less straight line of data points. Note that the quantile plot does pretty much follow a straight line, so this helps us conclude that the distribution is likely normal. Note that there are three outliers on the \"high\" end and two on the \"low\" end that cause deviations in the tail, but this is pretty typical.
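To make the mechanics of that comparison concrete, here is a minimal hand-rolled quantile plot. This is an illustrative sketch only, assuming the `df` loaded above and standard imports; it is not part of the original solution:

```python
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

temps = np.sort(df['temperature'].values)            # empirical quantiles, sorted
z_scores = (temps - temps.mean()) / temps.std()      # standardize to mean 0, sd 1

# Theoretical standard-normal quantiles at the same plotting positions
positions = (np.arange(1, len(temps) + 1) - 0.5) / len(temps)
theoretical = stats.norm.ppf(positions)

plt.scatter(theoretical, z_scores)
plt.plot([-3, 3], [-3, 3])   # 45-degree reference line
plt.xlabel('Theoretical quantiles')
plt.ylabel('Sample quantiles (standardized)')
plt.title('Hand-rolled normal Q-Q plot')
```

If the points hug the reference line, the sample quantiles track the normal quantiles, which is exactly what `sm.qqplot` checks above.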

\n", 151 | "\n", 152 | "

Suppose we really aren't sure, or the two plots suggest different conclusions. We could confirm with a statistical significance test, though this should not be your first method of attack. The p-value from the normality test is 0.25, which is well above the usual cutoff of 0.05. The null hypothesis of the test is that the distribution is normal; since we fail to reject it, we conclude that the distribution is probably normal.

\n", 153 | "
" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "
\n", 161 | "

SOLUTION: Is the sample size large? Are the observations independent?

\n", 162 | "
" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": { 169 | "collapsed": true 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "n = len(df['temperature'])\n", 174 | "n" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "
\n", 182 | "

The sample size is 130. Literature typically suggests a lower limit of 30 observations in a sample for CLT to hold. In terms of CLT, the sample is large enough.

\n", 183 | "\n", 184 | "

We must assume that the observations are independent. One person's body temperature should not have any effect on another person's body temperature, so under common sense conditions, the observations are independent. Note that this condition could be violated if the researcher lacked common sense and performed this study by stuffing all of the participants shoulder to shoulder in a very hot and confined room.

\n", 185 | "\n", 186 | "

Note that temperatures may depend on age, gender, or health status, but that is a separate issue and does not change our conclusion that one person's temperature does not influence another's.

\n", 187 | "
" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "
\n", 195 | "

SOLUTION: Is the true population mean really 98.6 degrees F?

\n", 196 | "
" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "
\n", 204 | "

We will now perform a bootstrap hypothesis test with the following:

\n", 205 | "\n", 206 | "

$H_0$: The mean of the sample and the true mean of 98.6 are the same. $\\mu=\\mu_0$

\n", 207 | "\n", 208 | "

$H_A$: The means are different. $\\mu\\neq\\mu_0$
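One caveat about the next cell: it resamples from an array named `temperature` and uses `np`, neither of which is defined earlier in this notebook. A minimal setup consistent with the data loaded above (my assumption, not part of the original solution) would be:

```python
import numpy as np

# Plain array of observed temperatures for the bootstrap cell below
temperature = df['temperature'].values
```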

\n", 209 | "\n", 210 | "
" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "collapsed": true 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "# Calculates p value using 100,000 boostrap replicates\n", 222 | "bootstrap_replicates = np.empty(100000)\n", 223 | "\n", 224 | "size = len(bootstrap_replicates)\n", 225 | "\n", 226 | "for i in range(size):\n", 227 | " bootstrap_sample = np.random.choice(temperature, size=len(temperature))\n", 228 | " bootstrap_replicates[i] = np.mean(bootstrap_sample)\n", 229 | "\n", 230 | "p = np.sum(bootstrap_replicates >= 98.6) / len(bootstrap_replicates)\n", 231 | "print('p =', p)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "
\n", 239 | "

We are testing only if the true population mean temperature is 98.6. We are treating everyone as being in the same group, with one mean. We use a **one-sample** test. The population standard deviation is not given, so we assume it is not known. We do however know the sample standard deviation from the data and we know that the sample size is large enough for CLT to apply, so we can use a $z$-test.
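As an optional cross-check on the hand-computed statistic below, statsmodels provides a one-sample z-test. This is a sketch that assumes statsmodels is available (it is already imported above for the quantile plot) and should closely match the manual calculation:

```python
from statsmodels.stats.weightstats import ztest

# Two-sided one-sample z-test of H0: mean temperature = 98.6
z_stat, p_value = ztest(df['temperature'], value=98.6)
print('z =', z_stat, 'p =', p_value)
```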

\n", 240 | "
" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": { 247 | "collapsed": true 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "z = (mean - 98.6)/(sd / np.sqrt(n))\n", 252 | "z" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "
\n", 260 | "Since the question does not ask if the true mean is greater than, or less than 98.6 as the alternative hypothesis, we use a two-tailed test. We have to regions where we reject the null hypothesis: if $z < -1.96$ or if $z > 1.96$, assuming $\\alpha = 0.05$. Since -5.48 < -1.96, we reject the null hypothesis: the true population mean temperature is NOT 98.6.\n", 261 | "\n", 262 | "

We can also use a p-value:

\n", 263 | "
" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": { 270 | "collapsed": true 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "stats.norm.cdf(z) * 2\n", 275 | "# NOTE: Since CDF gives us $P(Z \\le z)$ and this is a two-tailed test, we multiply the result by 2" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "
\n", 283 | "

Since the p-value is *way* below 0.05, we reject the null hypothesis. The population mean is not 98.6.

\n", 284 | "\n", 285 | "

The $z$-test was the \"correct\" test to use in this case. But what if we used a $t$-test instead? The degrees of freedom is $n - 1 = 129$.

\n", 286 | "
" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": { 293 | "collapsed": true 294 | }, 295 | "outputs": [], 296 | "source": [ 297 | "t = (mean - 98.6)/(sd / np.sqrt(n))" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "
\n", 305 | "We find the critical value of $t$ and when $\\vert t \\vert > \\vert t^* \\vert$ we reject the null hypothesis.\n", 306 | "
" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": { 313 | "collapsed": true 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "t_critical = stats.t.ppf(0.05 / 2, n - 1)\n", 318 | "t_critical" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "
\n", 326 | "

Note that the critical value of $t$ is $\\pm 1.979$. This is pretty close to the $\\pm 1.96$ we used for the $z$-test. *As the sample size gets larger, the student's $t$ distribution converges to the normal distribution.* So in theory, even if your sample size is large you could use the $t$-test, but the pesky degrees of freedom step is likely why people do not. If we use a sample of size, say, 1000, the critical values are close to identical.
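That last claim is easy to verify directly with scipy (illustrative only):

```python
import scipy.stats as stats

# Two-sided 5% critical values: standard normal vs t with 999 degrees of freedom
print(stats.norm.ppf(0.975))        # ~1.960
print(stats.t.ppf(0.975, df=999))   # ~1.962
```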

\n", 327 | "\n", 328 | "

So, to answer the question, the result is NOT different! The only case where it would be different is if the $t$ statistic were between -1.96 and -1.979 which would be pretty rare.

" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "
\n", 336 | "

SOLUTION: At what temperature should we consider someone's temperature to be \"abnormal\"?

\n", 337 | "\n", 338 | "

We compute the confidence interval using $z^* = \\pm 1.96$.

\n", 339 | "\n", 340 | "

The margin of error is

\n", 341 | "\n", 342 | "$$MOE = z^* \\frac{\\sigma}{\\sqrt{n}}$$\n", 343 | "
" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": { 350 | "collapsed": true 351 | }, 352 | "outputs": [], 353 | "source": [ 354 | "sd = df['temperature'].std()\n", 355 | "n = len(df['temperature'])\n", 356 | "moe = 1.96 * sd / np.sqrt(n)\n", 357 | "moe" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": { 364 | "collapsed": true 365 | }, 366 | "outputs": [], 367 | "source": [ 368 | "mean = df['temperature'].mean()\n", 369 | "ci = mean + np.array([-1, 1]) * moe\n", 370 | "ci" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "
At the 95% confidence level, we consider a temperature abnormal if it is below 98.1 degrees or above 98.38 degrees. Since the hypothesized mean of 98.6 is not in the confidence interval, we again reject the null hypothesis -- the true population mean is not 98.6 degrees.
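The same interval can be obtained in one call from scipy as a cross-check, assuming the `mean`, `sd`, and `n` computed in the cells above:

```python
import numpy as np
import scipy.stats as stats

# 95% confidence interval for the mean under the normal approximation
print(stats.norm.interval(0.95, loc=mean, scale=sd / np.sqrt(n)))
```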
" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "
\n", 385 | "We can also use the bootstrap approach.\n", 386 | "
" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": { 393 | "collapsed": true 394 | }, 395 | "outputs": [], 396 | "source": [ 397 | "# Define bootstrap functions:\n", 398 | "\n", 399 | "def replicate(data, function):\n", 400 | " \"\"\"Return replicate of a resampled data array.\"\"\"\n", 401 | " \n", 402 | " # Create the resampled array and return the statistic of interest:\n", 403 | " return function(np.random.choice(data, size=len(data)))\n", 404 | "\n", 405 | "\n", 406 | "def draw_replicates(data, function, size=1):\n", 407 | " \"\"\"Draw bootstrap replicates.\"\"\"\n", 408 | "\n", 409 | " # Initialize array of replicates:\n", 410 | " replicates = np.empty(size)\n", 411 | "\n", 412 | " # Generate replicates:\n", 413 | " for i in range(size):\n", 414 | " replicates[i] = replicate(data, function)\n", 415 | "\n", 416 | " return replicates" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": null, 422 | "metadata": { 423 | "collapsed": true 424 | }, 425 | "outputs": [], 426 | "source": [ 427 | "# Seed the random number generator:\n", 428 | "np.random.seed(15)\n", 429 | "\n", 430 | "# Draw bootstrap replicates of temperatures:\n", 431 | "replicates = draw_replicates(df.temperature, np.mean, 10000)\n", 432 | "\n", 433 | "# Compute the 99.9% confidence interval:\n", 434 | "CI = np.percentile(replicates, [0.05, 99.95])\n", 435 | "print('99.9% Confidence Interval:', CI)" 436 | ] 437 | }, 438 | { 439 | "cell_type": "markdown", 440 | "metadata": {}, 441 | "source": [ 442 | "
\n", 443 | "\n", 444 | "

SOLUTION: Is there a significant difference between males and females in normal temperature?

\n", 445 | "\n", 446 | "

We use a two-sample test. Since the number of males is greater than 30 and the number of females is greater than 30, we use a two-sample z-test. Since the question just asks if there is a *difference* and doesn't specify a direction, we use a two-tailed test.

\n", 447 | "\n", 448 | "$$z = \\frac{(\\bar{x}_M - \\bar{x}_F) - 0}{\\sqrt{\\frac{\\sigma_M^2}{n_M} + \\frac{\\sigma_F^2}{n_F}}}$$" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": { 455 | "collapsed": true 456 | }, 457 | "outputs": [], 458 | "source": [ 459 | "males = df.gender == 'M'\n", 460 | "diff_means = df.temperature[males].mean() - df.temperature[~males].mean()\n", 461 | "sd_male = df.temperature[males].std()\n", 462 | "sd_female = df.temperature[~males].std()\n", 463 | "n_male = np.sum(males)\n", 464 | "n_female = len(df.temperature) - n_male\n", 465 | "\n", 466 | "z = diff_means / np.sqrt(((sd_male ** 2)/ n_male) + ((sd_female ** 2)/ n_female))\n", 467 | "z" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "metadata": { 474 | "collapsed": true 475 | }, 476 | "outputs": [], 477 | "source": [ 478 | "pval = stats.norm.cdf(z) * 2\n", 479 | "pval" 480 | ] 481 | }, 482 | { 483 | "cell_type": "markdown", 484 | "metadata": {}, 485 | "source": [ 486 | "
\n", 487 | "

Since the p-value of 0.022 < 0.05, we reject the null hypothesis that the mean body temperature for men and women is the same. The difference in mean body temperature between men and women is statistically significant.

\n", 488 | "

" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": { 495 | "collapsed": true 496 | }, 497 | "outputs": [], 498 | "source": [ 499 | "diff_means + np.array([-1, 1]) * 1.96 * np.sqrt(((sd_male ** 2)/ n_male) + ((sd_female ** 2)/ n_female))" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": {}, 505 | "source": [ 506 | "
Since the null hypothesized 0 is not in the confidence interval, we reject the null hypothesis with the same conclusion as the hypothesis test.
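For an optional sanity check on the hand-rolled z statistic, Welch's two-sample t-test from scipy should tell the same story at this sample size (a sketch assuming the `df` and `males` mask defined above):

```python
import scipy.stats as stats

# Welch's t-test (unequal variances) comparing male and female temperatures
t_stat, p_value = stats.ttest_ind(df.temperature[males], df.temperature[~males], equal_var=False)
print('t =', t_stat, 'p =', p_value)
```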
" 507 | ] 508 | }, 509 | { 510 | "cell_type": "markdown", 511 | "metadata": { 512 | "collapsed": true 513 | }, 514 | "source": [ 515 | "
Now let's try the hacker stats approach.
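A caveat before the permutation cell below: it refers to `male_temperature`, `female_temperature`, and `male_and_female_diff`, none of which are defined earlier in this notebook. A minimal setup consistent with the rest of the analysis (my assumption) would be:

```python
import numpy as np

male_temperature = df.temperature[df.gender == 'M'].values
female_temperature = df.temperature[df.gender == 'F'].values

# Observed absolute difference in group means; the permutation replicates below
# are also taken in absolute value, so this makes the test two-sided.
male_and_female_diff = np.abs(np.mean(male_temperature) - np.mean(female_temperature))
```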
" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": null, 521 | "metadata": { 522 | "collapsed": true 523 | }, 524 | "outputs": [], 525 | "source": [ 526 | "permutation_replicates = np.empty(100000)\n", 527 | "\n", 528 | "size = len(permutation_replicates)\n", 529 | "\n", 530 | "for i in range(size):\n", 531 | " combined_perm_temperatures = np.random.permutation(np.concatenate((male_temperature, female_temperature)))\n", 532 | "\n", 533 | " male_permutation = combined_perm_temperatures[:len(male_temperature)]\n", 534 | " female_permutation = combined_perm_temperatures[len(male_temperature):]\n", 535 | "\n", 536 | " permutation_replicates[i] = np.abs(np.mean(male_permutation) - np.mean(female_permutation))\n", 537 | " \n", 538 | "p_val = np.sum(permutation_replicates >= male_and_female_diff) / len(permutation_replicates)\n", 539 | "\n", 540 | "print('p =', p_val)" 541 | ] 542 | } 543 | ], 544 | "metadata": { 545 | "kernelspec": { 546 | "display_name": "Python 2", 547 | "language": "python", 548 | "name": "python2" 549 | }, 550 | "language_info": { 551 | "codemirror_mode": { 552 | "name": "ipython", 553 | "version": 2 554 | }, 555 | "file_extension": ".py", 556 | "mimetype": "text/x-python", 557 | "name": "python", 558 | "nbconvert_exporter": "python", 559 | "pygments_lexer": "ipython2", 560 | "version": "2.7.13" 561 | } 562 | }, 563 | "nbformat": 4, 564 | "nbformat_minor": 1 565 | } 566 | -------------------------------------------------------------------------------- /mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/Racial Discrimination/sliderule_dsi_inferential_statistics_exercise_2_solutions-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Examining Racial Discrimination in the US Job Market\n", 8 | "\n", 9 | "### Background\n", 10 | "Racial discrimination continues to be pervasive in cultures throughout the world. Researchers examined the level of racial discrimination in the United States labor market by randomly assigning identical résumés to black-sounding or white-sounding names and observing the impact on requests for interviews from employers.\n", 11 | "\n", 12 | "### Data\n", 13 | "In the dataset provided, each row represents a resume. The 'race' column has two values, 'b' and 'w', indicating black-sounding and white-sounding. The column 'call' has two values, 1 and 0, indicating whether the resume received a call from employers or not.\n", 14 | "\n", 15 | "Note that the 'b' and 'w' values in race are assigned randomly to the resumes when presented to the employer." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "
\n", 23 | "### Exercises\n", 24 | "You will perform a statistical analysis to establish whether race has a significant impact on the rate of callbacks for resumes.\n", 25 | "\n", 26 | "Answer the following questions **in this notebook below and submit to your Github account**. \n", 27 | "\n", 28 | " 1. What test is appropriate for this problem? Does CLT apply?\n", 29 | " 2. What are the null and alternate hypotheses?\n", 30 | " 3. Compute margin of error, confidence interval, and p-value. Try using both the bootstrapping and the frequentist statistical approaches.\n", 31 | " 4. Write a story describing the statistical significance in the context or the original problem.\n", 32 | " 5. Does your analysis mean that race/name is the most important factor in callback success? Why or why not? If not, how would you amend your analysis?\n", 33 | "\n", 34 | "You can include written notes in notebook cells using Markdown: \n", 35 | " - In the control panel at the top, choose Cell > Cell Type > Markdown\n", 36 | " - Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 37 | "\n", 38 | "\n", 39 | "#### Resources\n", 40 | "+ Experiment information and data source: http://www.povertyactionlab.org/evaluation/discrimination-job-market-united-states\n", 41 | "+ Scipy statistical methods: http://docs.scipy.org/doc/scipy/reference/stats.html \n", 42 | "+ Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 43 | "+ Formulas for the Bernoulli distribution: https://en.wikipedia.org/wiki/Bernoulli_distribution\n", 44 | "
\n", 45 | "****" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "collapsed": true 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "import pandas as pd\n", 57 | "import numpy as np\n", 58 | "from scipy import stats" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": { 65 | "collapsed": true 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "data = pd.io.stata.read_stata('data/us_job_market_discrimination.dta')" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "collapsed": true 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "# number of callbacks for black-sounding names\n", 81 | "sum(data[data.race=='w'].call)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "data.head()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": { 98 | "collapsed": true 99 | }, 100 | "source": [ 101 | "
\n", 102 | "

SOLUTION: What test is appropriate for this problem?

\n", 103 | "\n", 104 | "

This is an interesting question, and it can be argued that there are two possible correct answers (such is life). First note the wording of the question: does race have a *significant impact* on callbacks? This usually suggests a $\chi^2$-test, but the $\chi^2$-test requires *frequencies* rather than percentages. Interestingly, one of the code snippets above uses frequencies, so we will look at that in a bit.

\n", 105 | "\n", 106 | "For now, a two-sample $z$-test:\n", 107 | "\n", 108 | "$$z = \\frac{\\left( \\hat{p}_W - \\hat{p}_B \\right) - 0}{\\sqrt{\\hat{p} (1 - \\hat{p)} \\left( \\frac{1}{n_W} + \\frac{1}{n_B}\\right)}}$$\n", 109 | "\n", 110 | "where\n", 111 | "\n", 112 | "$$\\hat{p} = \\frac{y_W + y_B}{n_W + n_B}$$\n", 113 | "\n", 114 | "The null and alternate hypotheses:\n", 115 | "\n", 116 | "$$H_0: p_B = p_W$$\n", 117 | "$$H_A: p_B \\ne p_W$$\n", 118 | "\n", 119 | "CLT applies because we assume that the samples are representative of the population. The observations in each sample are assumed to be independent since the sample was drawn randomly." 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": true 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "w = data[data.race=='w']\n", 131 | "b = data[data.race=='b']\n", 132 | "\n", 133 | "n_w = len(w)\n", 134 | "n_b = len(b)\n", 135 | "\n", 136 | "prop_w = np.sum(w.call) / len(w)\n", 137 | "prop_b = np.sum(b.call) / len(b)\n", 138 | "\n", 139 | "prop_diff = prop_w - prop_b\n", 140 | "phat = (np.sum(w.call) + np.sum(b.call)) / (len(w) + len(b))\n", 141 | "\n", 142 | "z = prop_diff / np.sqrt(phat * (1 - phat) * ((1 / n_w) + (1 / n_b)))\n", 143 | "pval = stats.norm.cdf(-z) * 2\n", 144 | "print(\"Z score: {}\".format(z))\n", 145 | "print(\"P-value: {}\".format(pval))" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": { 152 | "collapsed": true 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "# Hacker stats approach\n", 157 | "permutation_replicates = np.empty(100000)\n", 158 | "\n", 159 | "white = data[data.race == 'w'].call.values\n", 160 | "black = data[data.race == 'b'].call.values\n", 161 | "diff_of_means = np.mean(white) - np.mean(black)\n", 162 | "\n", 163 | "for i in range(len(permutation_replicates)):\n", 164 | " permutation_samples = np.random.permutation(np.concatenate((white, black)))\n", 165 | " \n", 166 | " white_perm = permutation_samples[:len(white)]\n", 167 | " black_perm = permutation_samples[len(white):]\n", 168 | " \n", 169 | " permutation_replicates[i] = np.abs(np.mean(white_perm) - np.mean(black_perm))\n", 170 | "\n", 171 | "p = np.sum(permutation_replicates > diff_of_means) / len(permutation_replicates)\n", 172 | "print('p =', p)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "moe = 1.96 * np.sqrt(phat * (1 - phat) * ((1 / n_w) + (1 / n_b)))\n", 184 | "ci = prop_diff + np.array([-1, 1]) * moe\n", 185 | "print(\"Margin of Error: {}\".format(moe))\n", 186 | "print(\"Confidence interval: {}\".format(ci))" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "
\n", 194 | "

The p-value is practically 0, so we reject the null hypothesis that white-sounding and black-sounding names have the same callback rate. They do not.

\n", 195 | "\n", 196 | "

Since 0 is not in the confidence interval, we reject the null hypothesis with the same conclusion.

\n", 197 | "\n", 198 | "

SOLUTION: Does your analysis mean that race/name is the most important factor in callback success? Why or why not? If not, how would you amend your analysis?

\n", 199 | "\n", 200 | "

No! While our test did show that there is a difference in callback rate based on race alone, there are other variables that may also contribute to, or interact with, race to explain the difference. In the original research paper, the researchers cited geography/city as a confounding variable. Additionally, we could look at education and experience levels. But, in our very narrow example, we have shown that there is a significant difference in callback rates between resumes with white-sounding names and those with black-sounding names.

\n", 201 | "\n", 202 | "

The $\\chi^2$ approach:

\n", 203 | "
" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": { 210 | "collapsed": true 211 | }, 212 | "outputs": [], 213 | "source": [ 214 | "cont_table = pd.crosstab(index=data.call, columns=data.race)\n", 215 | "chi2, pval, _, _ = stats.chi2_contingency(cont_table)\n", 216 | "print(\"Chi-squared test statistic: {}\".format(chi2))\n", 217 | "print(\"p-value: {}\".format(pval))" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "
\n", 225 | "The chi-squared test yields a similar result. We reject the null hypothesis that race and callback rate are independent. The margin of error and confidence interval calculations are a bit more complicated because the chi-squared distribution is not always symmetric, depending on the number of degrees of freedom.\n", 226 | "
" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": { 233 | "collapsed": true 234 | }, 235 | "outputs": [], 236 | "source": [] 237 | } 238 | ], 239 | "metadata": { 240 | "kernelspec": { 241 | "display_name": "Python 2", 242 | "language": "python", 243 | "name": "python2" 244 | }, 245 | "language_info": { 246 | "codemirror_mode": { 247 | "name": "ipython", 248 | "version": 2 249 | }, 250 | "file_extension": ".py", 251 | "mimetype": "text/x-python", 252 | "name": "python", 253 | "nbconvert_exporter": "python", 254 | "pygments_lexer": "ipython2", 255 | "version": "2.7.10" 256 | } 257 | }, 258 | "nbformat": 4, 259 | "nbformat_minor": 1 260 | } 261 | -------------------------------------------------------------------------------- /mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/Racial Discrimination/us_job_market_discrimination.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/Racial Discrimination/us_job_market_discrimination.dta -------------------------------------------------------------------------------- /mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/Reduce Hospital Readmissions/dummy.txt: -------------------------------------------------------------------------------- 1 | . 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/Reduce Hospital Readmissions/sliderule_dsi_inferential_statistics_exercise_1-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## What is the true normal human body temperature? \n", 8 | "\n", 9 | "#### Background\n", 10 | "\n", 11 | "The mean normal body temperature was held to be 37$^{\\circ}$C or 98.6$^{\\circ}$F for more than 120 years since it was first conceptualized and reported by Carl Wunderlich in a famous 1868 book. In 1992, this value was revised to 36.8$^{\\circ}$C or 98.2$^{\\circ}$F. \n", 12 | "\n", 13 | "#### Exercise\n", 14 | "In this exercise, you will analyze a dataset of human body temperatures and employ the concepts of hypothesis testing, confidence intervals, and statistical significance to answer the following questions:\n", 15 | "\n", 16 | "1. Is the distribution of body temperatures normal? \n", 17 | "2. Is the true population mean really 98.6 degrees F?\n", 18 | "3. At what temperature should we consider someone's temperature to be \"abnormal\"?\n", 19 | "4. 
Is there a significant difference between males and females in normal temperature?\n", 20 | "\n", 21 | "\n", 22 | "\n", 23 | "#### Resources\n", 24 | "\n", 25 | "+ Information and data sources: http://www.amstat.org/publications/jse/datasets/normtemp.txt, http://www.amstat.org/publications/jse/jse_data_archive.htm\n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 58, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "import pandas as pd" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 62, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "df = pd.read_csv('data/human_body_temperature.csv')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "source": [ 56 | "# Exercise\n", 57 | "\n", 58 | "Answer the following questions in this notebook and submit to your Github account. \n", 59 | "\n", 60 | "1. Is the distribution of body temperatures normal? \n", 61 | " - Remember that this is a condition for the CLT, and hence the statistical tests we are using, to apply. \n", 62 | "2. Is the true population mean really 98.6 degrees F?\n", 63 | " - Bring out the one sample hypothesis test! In this situation, is it approriate to apply a z-test or a t-test? How will the result be different?\n", 64 | "3. At what temperature should we consider someone's temperature to be \"abnormal\"?\n", 65 | " - Start by computing the margin of error and confidence interval.\n", 66 | "4. Is there a significant difference between males and females in normal temperature?\n", 67 | " - Set up and solve for a two sample hypothesis testing." 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": true 75 | }, 76 | "outputs": [], 77 | "source": [] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 2", 83 | "language": "python", 84 | "name": "python2" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 2 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython2", 96 | "version": "2.7.9" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 0 101 | } 102 | -------------------------------------------------------------------------------- /mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/Reduce Hospital Readmissions/sliderule_dsi_inferential_statistics_exercise_2-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "### Examining racial discrimination in the US job market\n", 9 | "\n", 10 | "#### Background\n", 11 | "Racial discrimination continues to be pervasive in cultures throughout the world. Researchers examined the level of racial discrimination in the United States labor market by randomly assigning identical résumés black-sounding or white-sounding names and observing the impact on requests for interviews from employers.\n", 12 | "\n", 13 | "#### Data\n", 14 | "In the dataset provided, each row represents a resume. The 'race' column has two values, 'b' and 'w', indicating black-sounding and white-sounding. 
The column 'call' has two values, 1 and 0, indicating whether the resume received a call from employers or not.\n", 15 | "\n", 16 | "Note that the 'b' and 'w' values in race are assigned randomly to the resumes.\n", 17 | "\n", 18 | "#### Exercise\n", 19 | "Perform a statistical analysis to establish whether race has a significant impact on the rate of callbacks for resumes.\n", 20 | "\n", 21 | "\n", 22 | "#### Resources\n", 23 | "+ Experiment information and data source: http://www.povertyactionlab.org/evaluation/discrimination-job-market-united-states\n", 24 | "+ Scipy statistical methods: http://docs.scipy.org/doc/scipy/reference/stats.html " 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "****" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import pandas as pd\n", 43 | "import numpy as np\n", 44 | "from scipy import stats" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "data = pd.io.stata.read_stata('data/us_job_market_discrimination.dta')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "157.0" 69 | ] 70 | }, 71 | "execution_count": 4, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "# number of callbacks for balck-sounding names\n", 78 | "sum(data[data.race=='b'].call)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": { 84 | "collapsed": true 85 | }, 86 | "source": [ 87 | "****\n", 88 | "\n", 89 | "# Exercise\n", 90 | "\n", 91 | " 1. What test is appropriate for this problem? Does CLT apply?\n", 92 | " 2. What are the null and alternate hypotheses?\n", 93 | " 3. Compute margin of error, confidence interval, and p-value.\n", 94 | " 4. 
Discuss statistical significance.\n", 95 | " \n", 96 | "You can include written notes in notebook cells using Markdown: \n", 97 | " - In the control panel at the top, choose Cell > Cell Type > Markdown\n", 98 | " - Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 99 | " \n", 100 | "****" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [] 111 | } 112 | ], 113 | "metadata": { 114 | "kernelspec": { 115 | "display_name": "Python 2", 116 | "language": "python", 117 | "name": "python2" 118 | }, 119 | "language_info": { 120 | "codemirror_mode": { 121 | "name": "ipython", 122 | "version": 2 123 | }, 124 | "file_extension": ".py", 125 | "mimetype": "text/x-python", 126 | "name": "python", 127 | "nbconvert_exporter": "python", 128 | "pygments_lexer": "ipython2", 129 | "version": "2.7.9" 130 | } 131 | }, 132 | "nbformat": 4, 133 | "nbformat_minor": 0 134 | } 135 | -------------------------------------------------------------------------------- /mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/dummy.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-projects/dummy: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /small_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/small_logo.png --------------------------------------------------------------------------------