├── 0.jpg ├── Additional Resources ├── 7.1 Storytelling Resources │ ├── 03-EDA.pdf │ ├── 06-StoryTelling.pdf │ ├── choosing-a-good-chart-09.pdf │ └── dummy ├── Apache Spark.pdf ├── List of Resources ├── Statistics and Machine Learning in Python.pdf ├── causal_inference_tutorial.pptx └── profile_pic_jpeg.jpg ├── DataCamp_Notes ├── Advanced NLP with Spacy.txt ├── Analyzing Police Activity with pandas.txt ├── Chapter 10 - Machine Learning │ ├── Sec 10.1 - Linear - Logistic Regression │ │ └── XX. Supervised Learning with Scikit-Learn │ └── YYY. Machine Learning with the Experts: School Budgets ├── Chapter 10 │ ├── Sec 10.1 - Linear - Logistic Regression │ │ ├── ch1_SupervisedLearning.pdf │ │ ├── ch2_IntroToRegression.pdf │ │ ├── ch3_FineTuningModel.pdf │ │ └── ch4_PreProcessingData.pdf │ └── Sec 10.5 - Introduction to Unsupervised Learning │ │ └── Notes │ │ ├── ZZ. Unsupervised Learning in Python │ │ ├── ch1_slides.pdf │ │ ├── ch2_slides.pdf │ │ ├── ch3_slides.pdf │ │ └── ch4_slides.pdf ├── Chapter 12 │ └── Sec 12.2 Fundamentals of NLP │ │ └── Notes │ │ ├── ZXY. Building Chatbots in Python │ │ └── ZZ. NLP in Python ├── Chapter 16 Data Science at Scale │ └── Class: Introduction to PySpark │ │ └── Intro to PySpark.txt ├── Chapter 4 │ ├── Sec 4.1 - Python for Data Science │ │ ├── 2. intermed python for data science │ │ ├── 3. data types for data science │ │ ├── 4. python datascience toolbox part 1 │ │ └── 5. Introduction to Data Visualization with Python │ └── Sec 4.2 Coding Practices │ │ └── Writing Functions in Python.txt ├── Chapter 5 │ ├── Sec 5.1 - Data Wrangling with Pandas │ │ ├── 6. pandas Foundations │ │ ├── 6. pandas Foundations - Slides │ │ │ ├── ch1_slides.pdf │ │ │ ├── ch2_slides.pdf │ │ │ ├── ch3_slides.pdf │ │ │ ├── ch4_slides.pdf │ │ │ └── txt │ │ ├── 7. Manipulating DataFrames with Pandas │ │ ├── 8. Merging DataFrames with pandas │ │ └── 9. Cleaning Data in Python │ └── Sec 5.2 - Working with Data in Files │ │ ├── 10. Python Data Science Toolbox (Part 2) │ │ ├── 11. Importing Data in Python (Part 1) │ │ └── 12. 
Importing Data in Python (Part 2) ├── Chapter 8 - Inferential Statistics │ ├── Customer Analytics & AB Testing in Python.txt │ ├── Experimental Design in Python.txt │ ├── Preparing for Statistics Interview Questions in Python.txt │ └── Sec 8.2 - Inferential Statistics Using Python │ │ ├── Statistical Thinking in Python (Part 1) │ │ └── Statistical Thinking in Python (Part 2) ├── Creating Robust Python Workflows.txt ├── Designing Machine Learning Workflows in Python ├── Dimensionality Reduction in Python.txt ├── Feature Engineering for MachineLearning in Python.txt ├── Interactive Data Visualization with Bokeh.txt ├── Intermediate SQL.txt ├── Introduction to Data Visualization with Seaborn.txt ├── Introduction to Databases in Python ├── Introduction to Git for Data Science.txt ├── Introduction to Tensorflow ├── Introduction to Time Series Analysis in Python.txt ├── Linear Classifiers in Python.txt ├── Machine Learning for Marketing in Python.txt ├── Object-Oriented Programming in Python.txt ├── Optimizing Python Code with pandas.txt ├── Software Engineering for Data Scientists in Python.txt ├── Visualizing Time Series Data in Python.txt ├── Working with Dates and Times in Python.txt └── Writing Efficient Python Code.txt ├── MBazeley_Resume_Aug2020_v2.docx ├── MBazeley_Resume_Aug2020_v2.pdf ├── README.md ├── _config.yml ├── about.md ├── capstone1 ├── BazeleyMikiko_Capstone1_Springboard_V2.pdf ├── BazeleyMikiko_Capstone1_Springboard_V2.pptx ├── Capstone Project 1_ Final Report (V2).pdf ├── Capstone1-FinalVersion-BazeleyMikiko-Springboard.ipynb ├── Capstone1-InDepthAnalysis.ipynb ├── IntroCall Scoring - Data Storytelling.ipynb ├── Project Documents │ ├── Apply Data Storytelling.pdf │ ├── Apply Inferential Statistics.pdf │ ├── BazeleyMikiko_Capstone1_Springboard_2019March.pdf │ ├── BazeleyMikiko_Capstone1_Springboard_2019March.pptx │ ├── Capstone 1 - In-Depth Analysis - MBazeley.pdf │ ├── Capstone Project 1_ Data Storytelling - Google Docs.pdf │ ├── Capstone Project 1_ Exploratory Data Analysis.pdf │ ├── Capstone Project 1_ Milestone Report.pdf │ ├── Data Story Rubric_ Capstone 1.pdf │ ├── Data Wrangling Rubric_ Capstone 1.pdf │ ├── Milestone Report Rubric_ Capstone 1 .pdf │ ├── Project Capstone Project 1 Data Wrangling.pdf │ ├── Project Capstone Project 1 Milestone Report.pdf │ ├── _ DSC Capstone Project 1 Rubric_ Inferential Statistics.pdf │ └── dumy.txt ├── Project_ Capstone Project 1_ Project Proposal.docx ├── Springboard Project Capstone 1 - Project Ideas - Google Docs.pdf └── WalkMe - IntroCall Scoring - Exploratory Data Analysis.ipynb ├── capstone2 ├── Capstone 2 - Kickstarter - Final Submission.ipynb ├── Capstone 2_ Final Summary.pdf ├── Capstone 2_ Milestone 1 Report.pdf ├── Capstone 2_ Milestone 2 Report.pdf ├── Capstone2_Slides.pdf ├── Capstone2_Slides.pptx └── dummy ├── mini-projects ├── Ch 10 Machine Learning │ ├── Ch 10.1 Linear - Logistic Regression │ │ ├── BazeleyMikiko_MiniProject_LinearRegression_2018Dec29.ipynb │ │ ├── DSC Mini-Project_Linear Regression Rubric.pdf │ │ └── images │ │ │ ├── conditionalmean.png │ │ │ ├── cs109gitflow3.png │ │ │ ├── dummy.txt │ │ │ └── shuttle.png │ ├── Ch 10.1 Logistic Regression │ │ ├── BazeleyM_MiniProject_LogisticRegression_2018Dec30.ipynb │ │ ├── data │ │ │ ├── 01_heights_weights_genders.csv │ │ │ └── dummy │ │ ├── dummy │ │ └── images │ │ │ ├── .DS_Store │ │ │ ├── bias.png │ │ │ ├── complexity-error-plot.png │ │ │ ├── complexity-error-reg.png │ │ │ ├── data.png │ │ │ ├── dummy │ │ │ ├── knn1.png │ │ │ ├── knn2.png │ │ │ ├── linreg.png │ │ │ 
├── linsep.png │ │ │ ├── onelinesplit.png │ │ │ ├── pcanim.gif │ │ │ ├── reshape.jpg │ │ │ ├── sklearn2.jpg │ │ │ ├── sklearntrans.jpg │ │ │ ├── train-cv2.png │ │ │ ├── train-cv3.png │ │ │ ├── train-test.png │ │ │ ├── train-validate-test-cont.png │ │ │ ├── train-validate-test.png │ │ │ └── train-validate-test3.png │ ├── Ch 10.3 Bayesian Methods and Text Data │ │ ├── MiniProject_NaiveBayes_BazeleyMikiko_2019Jan6.ipynb │ │ ├── callibration.png │ │ ├── critics.csv │ │ ├── terms.png │ │ ├── terms2.png │ │ └── vsm.png │ └── Ch 10.5 Introduction to Unsupervised Learning │ │ ├── .ipynb_checkpoints │ │ └── dummy │ │ ├── BazeleyM_MiniProject_Clustering_2019Jan27.ipynb │ │ ├── WineKMC.xlsx │ │ └── dummy ├── Ch 16 Spark and PySpark │ ├── Springboard - BazeleyMikiko - 2019Feb13 - Spark DF, SQL, ML Exercise.html │ ├── Springboard - BazeleyMikiko - 2019Feb13 - Spark DF, SQL, ML Exercise.ipynb │ └── dummy.txt ├── Ch 5 - Data Wrangling │ ├── 5.2 Working with Data │ │ └── JSON Exervise │ │ │ ├── BazeleyM_JSON_Exercise.ipynb │ │ │ ├── data │ │ │ ├── dummy │ │ │ ├── world_bank_projects.json │ │ │ └── world_bank_projects_less.json │ │ │ └── world_bank_data.csv │ ├── 5.3 SQL Practice │ │ ├── BazeleyMikiko_Sec5_3_sql_project_2018Nov24.sql │ │ ├── DSC Mini-Project_ SQL Rubric.docx │ │ ├── Database Info │ │ │ ├── bookings_table_constraints.PNG │ │ │ ├── bookings_table_data.PNG │ │ │ ├── countryClub_bookings.csv │ │ │ ├── countryClub_facilities.csv │ │ │ ├── countryClub_members.csv │ │ │ ├── country_club_db.PNG │ │ │ ├── countryclub.db │ │ │ ├── facilities_table_constraints.PNG │ │ │ ├── facilities_table_data.PNG │ │ │ ├── members_table_constraints.PNG │ │ │ └── members_table_data.PNG │ │ └── Project Info │ │ │ ├── Answers │ │ │ ├── Intro │ │ │ ├── Project Requirements & Questions │ │ │ ├── Resources │ │ │ └── dummy │ └── 5.4 API │ │ ├── BazeleyMikiko_MiniProject_DataWrangling_API.ipynb │ │ └── dummy.txt ├── Ch 8 Inferential Statistics │ └── Ch 8.3 Exploratory Data Analysis │ │ ├── EDA_HumanTemp │ │ ├── EDA_human_temperature_BazeleyMikiko-checkpoint.ipynb │ │ ├── MiniProject_EDA_HumanBodyTemp_BazeleyM_2019Jan01-checkpoint.ipynb │ │ ├── MiniProject_EDA_HumanBodyTemp_BazeleyM_2019Jan01.ipynb │ │ ├── dummy.tx │ │ └── human_body_temperature.csv │ │ ├── Racial Discrimination │ │ ├── BazeleyMikiko_MiniProject_RacialDiscrimination-checkpoint.ipynb │ │ ├── BazeleyMikiko_MiniProject_RacialDiscrimination.ipynb │ │ ├── dummy.txt │ │ ├── sliderule_dsi_inferential_statistics_exercise_1_solutions-checkpoint.ipynb │ │ ├── sliderule_dsi_inferential_statistics_exercise_2_solutions-checkpoint.ipynb │ │ └── us_job_market_discrimination.dta │ │ ├── Reduce Hospital Readmissions │ │ ├── BazeleyMikiko_MiniProject_ReduceHospitalReadmissions-checkpoint.ipynb │ │ ├── BazeleyMikiko_MiniProject_ReduceHospitalReadmissions.ipynb │ │ ├── cms_hospital_readmissions.csv │ │ ├── dummy.txt │ │ ├── sliderule_dsi_inferential_statistics_exercise_1-checkpoint.ipynb │ │ └── sliderule_dsi_inferential_statistics_exercise_2-checkpoint.ipynb │ │ └── dummy.txt └── dummy └── small_logo.png /0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/0.jpg -------------------------------------------------------------------------------- /Additional Resources/7.1 Storytelling Resources/03-EDA.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/Additional Resources/7.1 Storytelling Resources/03-EDA.pdf -------------------------------------------------------------------------------- /Additional Resources/7.1 Storytelling Resources/06-StoryTelling.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/Additional Resources/7.1 Storytelling Resources/06-StoryTelling.pdf -------------------------------------------------------------------------------- /Additional Resources/7.1 Storytelling Resources/choosing-a-good-chart-09.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/Additional Resources/7.1 Storytelling Resources/choosing-a-good-chart-09.pdf -------------------------------------------------------------------------------- /Additional Resources/7.1 Storytelling Resources/dummy: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Additional Resources/Apache Spark.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/Additional Resources/Apache Spark.pdf -------------------------------------------------------------------------------- /Additional Resources/List of Resources: -------------------------------------------------------------------------------- 1 | Chapter 7 - Data Storytelling 2 | Sec 7.1 - Storytelling Resources 3 | - Choosing the Right Chart: https://extremepresentation.typepad.com/files/choosing-a-good-chart-09.pdf 4 | - Visualizing Economics: 5 | - https://www.kickstarter.com/projects/visualizingeconomics/visualizingeconomics-an-infographic-zine 6 | - http://visualizingeconomics.com/book/ 7 | - MTA Data/Report: http://cs109hubway.github.io/classp/ 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /Additional Resources/Statistics and Machine Learning in Python.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/Additional Resources/Statistics and Machine Learning in Python.pdf -------------------------------------------------------------------------------- /Additional Resources/causal_inference_tutorial.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/Additional Resources/causal_inference_tutorial.pptx -------------------------------------------------------------------------------- /Additional Resources/profile_pic_jpeg.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/Additional Resources/profile_pic_jpeg.jpg -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 10/Sec 
10.1 - Linear - Logistic Regression/ch1_SupervisedLearning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/DataCamp_Notes/Chapter 10/Sec 10.1 - Linear - Logistic Regression/ch1_SupervisedLearning.pdf -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 10/Sec 10.1 - Linear - Logistic Regression/ch2_IntroToRegression.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/DataCamp_Notes/Chapter 10/Sec 10.1 - Linear - Logistic Regression/ch2_IntroToRegression.pdf -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 10/Sec 10.1 - Linear - Logistic Regression/ch3_FineTuningModel.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/DataCamp_Notes/Chapter 10/Sec 10.1 - Linear - Logistic Regression/ch3_FineTuningModel.pdf -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 10/Sec 10.1 - Linear - Logistic Regression/ch4_PreProcessingData.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/DataCamp_Notes/Chapter 10/Sec 10.1 - Linear - Logistic Regression/ch4_PreProcessingData.pdf -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 10/Sec 10.5 - Introduction to Unsupervised Learning/Notes/ch1_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/DataCamp_Notes/Chapter 10/Sec 10.5 - Introduction to Unsupervised Learning/Notes/ch1_slides.pdf -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 10/Sec 10.5 - Introduction to Unsupervised Learning/Notes/ch2_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/DataCamp_Notes/Chapter 10/Sec 10.5 - Introduction to Unsupervised Learning/Notes/ch2_slides.pdf -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 10/Sec 10.5 - Introduction to Unsupervised Learning/Notes/ch3_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/DataCamp_Notes/Chapter 10/Sec 10.5 - Introduction to Unsupervised Learning/Notes/ch3_slides.pdf -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 10/Sec 10.5 - Introduction to Unsupervised Learning/Notes/ch4_slides.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/DataCamp_Notes/Chapter 10/Sec 10.5 - Introduction to Unsupervised Learning/Notes/ch4_slides.pdf -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 5/Sec 5.1 - Data Wrangling with Pandas/6. pandas Foundations - Slides/ch1_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/DataCamp_Notes/Chapter 5/Sec 5.1 - Data Wrangling with Pandas/6. pandas Foundations - Slides/ch1_slides.pdf -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 5/Sec 5.1 - Data Wrangling with Pandas/6. pandas Foundations - Slides/ch2_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/DataCamp_Notes/Chapter 5/Sec 5.1 - Data Wrangling with Pandas/6. pandas Foundations - Slides/ch2_slides.pdf -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 5/Sec 5.1 - Data Wrangling with Pandas/6. pandas Foundations - Slides/ch3_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/DataCamp_Notes/Chapter 5/Sec 5.1 - Data Wrangling with Pandas/6. pandas Foundations - Slides/ch3_slides.pdf -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 5/Sec 5.1 - Data Wrangling with Pandas/6. pandas Foundations - Slides/ch4_slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/DataCamp_Notes/Chapter 5/Sec 5.1 - Data Wrangling with Pandas/6. pandas Foundations - Slides/ch4_slides.pdf -------------------------------------------------------------------------------- /DataCamp_Notes/Chapter 5/Sec 5.1 - Data Wrangling with Pandas/6. pandas Foundations - Slides/txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /DataCamp_Notes/Dimensionality Reduction in Python.txt: -------------------------------------------------------------------------------- 1 | Dimensionality Reduction in Python 2 | 3 | Course Description 4 | High-dimensional datasets can be overwhelming and leave you not knowing where to start. Typically, you’d visually explore a new dataset first, but when you have too many dimensions the classical approaches will seem insufficient. Fortunately, there are visualization techniques designed specifically for high dimensional data and you’ll be introduced to these in this course. After exploring the data, you’ll often find that many features hold little information because they don’t show any variance or because they are duplicates of other features. You’ll learn how to detect these features and drop them from the dataset so that you can focus on the informative ones. 
In a next step, you might want to build a model on these features, and it may turn out that some don’t have any effect on the thing you’re trying to predict. You’ll learn how to detect and drop these irrelevant features too, in order to reduce dimensionality and thus complexity. Finally, you’ll learn how feature extraction techniques can reduce dimensionality for you through the calculation of uncorrelated principal components. 5 | 6 | <=========================================================================================================================================> 7 | 1 8 | Exploring high dimensional data 9 | FREE 10 | 0% 11 | You'll be introduced to the concept of dimensionality reduction and will learn when an why this is important. You'll learn the difference between feature selection and feature extraction and will apply both techniques for data exploration. The chapter ends with a lesson on t-SNE, a powerful feature extraction technique that will allow you to visualize a high-dimensional dataset. 12 | 13 | <----------------------------------------------------------------------------------------------------------------------------------------> 14 | 15 | Fitting t-SNE to the ANSUR data 16 | t-SNE is a great technique for visual exploration of high dimensional datasets. In this exercise, you'll apply it to the ANSUR dataset. You'll remove non-numeric columns from the pre-loaded dataset df and fit TSNE to his numeric dataset. 17 | 18 | Instructions 19 | 100 XP 20 | Drop the non-numeric columns from the dataset. 21 | Create a TSNE model with learning rate 50. 22 | Fit and transform the model on the numeric dataset. 23 | 24 | Take Hint (-30 XP) 25 | 26 | 27 | # Non-numerical columns in the dataset 28 | non_numeric = ['Branch', 'Gender', 'Component'] 29 | 30 | # Drop the non-numerical columns from df 31 | df_numeric = df.drop(non_numeric, axis=1) 32 | 33 | # Create a t-SNE model with learning rate 50 34 | m = TSNE(learning_rate=50) 35 | 36 | # Fit and transform the t-SNE model on the numeric dataset 37 | tsne_features = m.fit_transform(df_numeric) 38 | print(tsne_features.shape) 39 | 40 | +100 XP 41 | Good job! t-SNE reduced the more than 90 features in the dataset to just 2 which you can now plot. 42 | 43 | 44 | <----------------------------------------------------------------------------------------------------------------------------------------> 45 | 46 | t-SNE visualisation of dimensionality 47 | Time to look at the results of your hard work. In this exercise, you will visualize the output of t-SNE dimensionality reduction on the combined male and female Ansur dataset. You'll create 3 scatterplots of the 2 t-SNE features ('x' and 'y') which were added to the dataset df. In each scatterplot you'll color the points according to a different categorical variable. 48 | 49 | seaborn has already been imported as sns and matplotlib.pyplot as plt. 50 | 51 | Instructions 1/3 52 | 35 XP 53 | 1 54 | Use seaborn's sns.scatterplot to create the plot. 55 | Color the points by 'Component'. 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | <=========================================================================================================================================> 64 | 65 | 2 66 | Feature selection I, selecting for feature information 67 | 0% 68 | In this first out of two chapters on feature selection, you'll learn about the curse of dimensionality and how dimensionality reduction can help you overcome it. 
You'll be introduced to a number of techniques to detect and remove features that bring little added value to the dataset. Either because they have little variance, too many missing values, or because they are strongly correlated to other features. 69 | 70 | 71 | <----------------------------------------------------------------------------------------------------------------------------------------> 72 | 73 | Train - test split 74 | In this chapter, you will keep working with the ANSUR dataset. Before you can build a model on your dataset, you should first decide on which feature you want to predict. In this case, you're trying to predict gender. 75 | 76 | You need to extract the column holding this feature from the dataset and then split the data into a training and test set. The training set will be used to train the model and the test set will be used to check its performance on unseen data. 77 | 78 | ansur_df has been pre-loaded for you. 79 | 80 | Instructions 81 | 100 XP 82 | Import the train_test_split function from sklearn.model_selection. 83 | Assign the 'Gender' column to y. 84 | Remove the 'Gender' column from the dataframe and assign the result to X. 85 | Set the test size to 30% to perform a 70% train and 30% test data split. 86 | 87 | 88 | # Import train_test_split() 89 | from sklearn.model_selection import train_test_split 90 | 91 | # Select the Gender column as the feature to be predicted (y) 92 | y = ansur_df['Gender'] 93 | 94 | # Remove the Gender column to create the training data 95 | X = ansur_df.drop('Gender', axis=1) 96 | 97 | # Perform a 70% train and 30% test data split 98 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 99 | 100 | print("{} rows in test set vs. {} in training set. {} Features.".format(X_test.shape[0], X_train.shape[0], X_test.shape[1])) 101 | 102 | <----------------------------------------------------------------------------------------------------------------------------------------> 103 | 104 | Fitting and testing the model 105 | In the previous exercise, you split the dataset into X_train, X_test, y_train, and y_test. These datasets have been pre-loaded for you. You'll now create a support vector machine classifier model (SVC()) and fit that to the training data. You'll then calculate the accuracy on both the test and training set to detect overfitting. 106 | 107 | Instructions 108 | 100 XP 109 | Import SVC from sklearn.svm and accuracy_score from sklearn.metrics 110 | Create an instance of the Support Vector Classification class (SVC()). 111 | Fit the model to the training data. 112 | Calculate accuracy scores on both train and test data. 113 | 114 | 115 | # Import SVC from sklearn.svm and accuracy_score from sklearn.metrics 116 | from sklearn.svm import SVC 117 | from sklearn.metrics import accuracy_score 118 | 119 | # Create an instance of the Support Vector Classification class 120 | svc = SVC() 121 | 122 | # Fit the model to the training data 123 | svc.fit(X_train, y_train) 124 | 125 | # Calculate accuracy scores on both train and test data 126 | accuracy_train = accuracy_score(y_train, svc.predict(X_train)) 127 | accuracy_test = accuracy_score(y_test, svc.predict(X_test)) 128 | 129 | print("{0:.1%} accuracy on test set vs. {1:.1%} on training set".format(accuracy_test, accuracy_train)) 130 | 131 | 132 | output: 133 | 49.7% accuracy on test set vs. 
100.0% on training set 134 | 135 | <----------------------------------------------------------------------------------------------------------------------------------------> 136 | 137 | Accuracy after dimensionality reduction 138 | You'll reduce the overfit with the help of dimensionality reduction. In this case, you'll apply a rather drastic form of dimensionality reduction by only selecting a single column that has some good information to distinguish between genders. You'll repeat the train-test split, model fit and prediction steps to compare the accuracy on test vs. training data. 139 | 140 | All relevant packages and y have been pre-loaded. 141 | 142 | Instructions 143 | 100 XP 144 | Select just the neck circumference ('neckcircumferencebase') column from ansur_df. 145 | Split the data, instantiate a classifier and fit the data. This has been done for you. 146 | Once again calculate the accuracy scores on both training and test set. 147 | 148 | # Assign just the 'neckcircumferencebase' column from ansur_df to X 149 | X = ansur_df[['neckcircumferencebase']] 150 | 151 | # Split the data, instantiate a classifier and fit the data 152 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 153 | svc = SVC() 154 | svc.fit(X_train, y_train) 155 | 156 | # Calculate accuracy scores on both train and test data 157 | accuracy_train = accuracy_score(y_train, svc.predict(X_train)) 158 | accuracy_test = accuracy_score(y_test, svc.predict(X_test)) 159 | 160 | print("{0:.1%} accuracy on test set vs. {1:.1%} on training set".format(accuracy_test, accuracy_train)) 161 | 162 | 163 | output: 164 | 93.3% accuracy on test set vs. 94.9% on training set 165 | 166 | 167 | +100 XP 168 | Wow, what just happened!? On the full dataset the model is rubbish but with a single feature we can make good predictions? This is an example of the curse of dimensionality! The model badly overfits when we feed it too many features. It overlooks that neck circumference by itself is pretty different for males and females. 169 | 170 | <----------------------------------------------------------------------------------------------------------------------------------------> 171 | 172 | Features with low variance 173 | In the previous exercise you established that 0.001 is a good threshold to filter out low variance features in head_df after normalization. Now use the VarianceThreshold feature selector to remove these features. 174 | 175 | Instructions 176 | 100 XP 177 | Create the variance threshold selector with a threshold of 0.001. 178 | Normalize the head_df dataframe by dividing it by its mean values and fit the selector. 179 | Create a boolean mask from the selector using .get_support(). 180 | Create a reduced dataframe by passing the mask to the .loc[] method. 181 | 182 | Take Hint (-30 XP) 183 | 184 | from sklearn.feature_selection import VarianceThreshold 185 | 186 | # Create a VarianceThreshold feature selector 187 | sel = VarianceThreshold(threshold=0.001) 188 | 189 | # Fit the selector to normalized head_df 190 | sel.fit(head_df / head_df.mean()) 191 | 192 | # Create a boolean mask 193 | mask = sel.get_support() 194 | 195 | # Apply the mask to create a reduced dataframe 196 | reduced_df = head_df.loc[:, mask] 197 | 198 | print("Dimensionality reduced from {} to {}.".format(head_df.shape[1], reduced_df.shape[1])) 199 | 200 | output: 201 | Dimensionality reduced from 6 to 4. 
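
Side note (not part of the DataCamp exercise): before committing to reduced_df, it can be worth checking exactly which columns the selector discarded. A minimal sketch, reusing the head_df and mask objects defined above:

# Columns retained by the variance threshold selector
kept_cols = head_df.columns[mask]

# Columns dropped because their mean-normalized variance fell below 0.001
dropped_cols = head_df.columns[~mask]

print("Kept:", list(kept_cols))
print("Dropped:", list(dropped_cols))
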
202 | <----------------------------------------------------------------------------------------------------------------------------------------> 203 | 204 | Visualizing the correlation matrix 205 | Reading the correlation matrix of ansur_df in its raw, numeric format doesn't allow us to get a quick overview. Let's improve this by removing redundant values and visualizing the matrix using seaborn. 206 | 207 | Seaborn has been pre-loaded as sns, matplotlib.pyplot as plt, NumPy as np and pandas as pd. 208 | 209 | Instructions 1/4 210 | 100 XP 211 | 1 212 | 2 213 | 3 214 | 4 215 | Create the correlation matrix. 216 | Visualize it using Seaborn's heatmap function. 217 | 218 | Take Hint (-30 XP) 219 | 220 | # Create the correlation matrix 221 | corr = ansur_df.corr() 222 | 223 | # Draw the heatmap 224 | sns.heatmap(corr, cmap=cmap, center=0, linewidths=1, annot=True, fmt=".2f") 225 | plt.show() 226 | 227 | 228 | 229 | Instructions 2/4 230 | 0 XP 231 | 2 232 | 3 233 | 4 234 | Create a boolean mask for the upper triangle of the plot. 235 | 236 | # Create the correlation matrix 237 | corr = ansur_df.corr() 238 | 239 | # Generate a mask for the upper triangle 240 | mask = np.triu(np.ones_like(corr, dtype=bool)) 241 | 242 | 243 | 244 | Instructions 3/4 245 | 0 XP 246 | 3 247 | 4 248 | Add the mask to the heatmap. 249 | 250 | # Create the correlation matrix 251 | corr = ansur_df.corr() 252 | 253 | # Generate a mask for the upper triangle 254 | mask = np.triu(np.ones_like(corr, dtype=bool)) 255 | 256 | # Add the mask to the heatmap 257 | sns.heatmap(corr, mask=mask, cmap=cmap, center=0, linewidths=1, annot=True, fmt=".2f") 258 | plt.show() 259 | 260 | 261 | <----------------------------------------------------------------------------------------------------------------------------------------> 262 | 263 | Filtering out highly correlated features 264 | You're going to automate the removal of highly correlated features in the numeric ANSUR dataset. You'll calculate the correlation matrix and filter out columns that have a correlation coefficient of more than 0.95 or less than -0.95. 265 | 266 | Since each correlation coefficient occurs twice in the matrix (correlation of A to B equals correlation of B to A) you'll want to ignore half of the correlation matrix so that only one of the two correlated features is removed. Use a mask trick for this purpose. 267 | 268 | Instructions 269 | 100 XP 270 | Calculate the correlation matrix of ansur_df and take the absolute value of this matrix. 271 | Create a boolean mask with True values in the upper right triangle and apply it to the correlation matrix. 272 | Set the correlation coefficient threshold to 0.95. 273 | Drop all the columns listed in to_drop from the dataframe. 
274 | 275 | Take Hint (-30 XP) 276 | 277 | # Calculate the correlation matrix and take the absolute value 278 | corr_matrix = ansur_df.corr().abs() 279 | 280 | # Create a True/False mask and apply it 281 | mask = np.triu(np.ones_like(corr_matrix, dtype=bool)) 282 | tri_df = corr_matrix.mask(mask) 283 | 284 | # List column names of highly correlated features (r > 0.95) 285 | to_drop = [c for c in tri_df.columns if any(tri_df[c] > 0.95)] 286 | 287 | # Drop the features in the to_drop list 288 | reduced_df = ansur_df.drop(to_drop, axis=1) 289 | 290 | print("The reduced dataframe has {} columns.".format(reduced_df.shape[1])) 291 | 292 | 293 | 294 | 295 | <=========================================================================================================================================> 296 | 297 | 3 298 | Feature selection II, selecting for model accuracy 299 | 0% 300 | In this second chapter on feature selection, you'll learn how to let models help you find the most important features in a dataset for predicting a particular target feature. In the final lesson of this chapter, you'll combine the advice of multiple, different, models to decide on which features are worth keeping. 301 | 302 | 303 | 304 | <----------------------------------------------------------------------------------------------------------------------------------------> 305 | 306 | Building a diabetes classifier 307 | You'll be using the Pima Indians diabetes dataset to predict whether a person has diabetes using logistic regression. There are 8 features and one target in this dataset. The data has been split into a training and test set and pre-loaded for you as X_train, y_train, X_test, and y_test. 308 | 309 | A StandardScaler() instance has been predefined as scaler and a LogisticRegression() one as lr. 310 | 311 | Instructions 312 | 100 XP 313 | Fit the scaler on the training features and transform these features in one go. 314 | Fit the logistic regression model on the scaled training data. 315 | Scale the test features. 316 | Predict diabetes presence on the scaled test set. 317 | 318 | 319 | # Fit the scaler on the training features and transform these in one go 320 | X_train_std = scaler.fit_transform(X_train) 321 | 322 | # Fit the logistic regression model on the scaled training data 323 | lr.fit(X_train_std, y_train) 324 | 325 | # Scale the test features 326 | X_test_std = scaler.transform(X_test) 327 | 328 | # Predict diabetes presence on the scaled test set 329 | y_pred = lr.predict(X_test_std) 330 | 331 | # Prints accuracy metrics and feature coefficients 332 | print("{0:.1%} accuracy on test set.".format(accuracy_score(y_test, y_pred))) 333 | print(dict(zip(X.columns, abs(lr.coef_[0]).round(2)))) 334 | 335 | 336 | output: 337 | 79.6% accuracy on test set. 338 | {'diastolic': 0.03, 'family': 0.34, 'bmi': 0.38, 'glucose': 1.23, 'insulin': 0.19, 'age': 0.34, 'triceps': 0.24, 'pregnant': 0.04} 339 | 340 | +100 XP 341 | Great! We get almost 80% accuracy on the test set. Take a look at the differences in model coefficients for the different features. 342 | <----------------------------------------------------------------------------------------------------------------------------------------> 343 | 344 | Manual Recursive Feature Elimination 345 | Now that we've created a diabetes classifier, let's see if we can reduce the number of features without hurting the model accuracy too much. 346 | 347 | On the second line of code the features are selected from the original dataframe. Adjust this selection. 
348 | 349 | A StandardScaler() instance has been predefined as scaler and a LogisticRegression() one as lr. 350 | 351 | All necessary functions and packages have been pre-loaded too. 352 | 353 | Instructions 1/3 354 | 50 XP 355 | 1 356 | First, run the given code, then remove the feature with the lowest model coefficient from X. 357 | 358 | Take Hint (-15 XP) 359 | 360 | 361 | # Remove the feature with the lowest model coefficient 362 | X = diabetes_df[['pregnant', 'glucose', 'diastolic', 'triceps', 'insulin', 'bmi', 'family', 'age']] 363 | 364 | # Performs a 25-75% train test split 365 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0) 366 | 367 | # Scales features and fits the logistic regression model 368 | lr.fit(scaler.fit_transform(X_train), y_train) 369 | 370 | # Calculates the accuracy on the test set and prints coefficients 371 | acc = accuracy_score(y_test, lr.predict(scaler.transform(X_test))) 372 | print("{0:.1%} accuracy on test set.".format(acc)) 373 | print(dict(zip(X.columns, abs(lr.coef_[0]).round(2)))) 374 | 375 | 376 | 377 | 378 | output: 379 | 79.6% accuracy on test set. 380 | {'diastolic': 0.03, 'family': 0.34, 'bmi': 0.38, 'glucose': 1.23, 'insulin': 0.19, 'age': 0.34, 'triceps': 0.24, 'pregnant': 0.04} 381 | 382 | 383 | 384 | # Remove the 2 features with the lowest model coefficients 385 | X = diabetes_df[['pregnant', 'glucose', 'triceps', 'insulin', 'bmi', 'family', 'age']] 386 | 387 | # Performs a 25-75% train test split 388 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0) 389 | 390 | # Scales features and fits the logistic regression model 391 | lr.fit(scaler.fit_transform(X_train), y_train) 392 | 393 | # Calculates the accuracy on the test set and prints coefficients 394 | acc = accuracy_score(y_test, lr.predict(scaler.transform(X_test))) 395 | print("{0:.1%} accuracy on test set.".format(acc)) 396 | print(dict(zip(X.columns, abs(lr.coef_[0]).round(2)))) 397 | 398 | 399 | 400 | 401 | Run the code and remove 2 more features with the lowest model coefficients. 402 | 403 | Take Hint (-15 XP) 404 | 405 | 406 | 407 | 408 | <----------------------------------------------------------------------------------------------------------------------------------------> 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | <----------------------------------------------------------------------------------------------------------------------------------------> 418 | 419 | 420 | <=========================================================================================================================================> 421 | 422 | 4 423 | Feature extraction 424 | 0% 425 | This chapter is a deep-dive on the most frequently used dimensionality reduction algorithm, Principal Component Analysis (PCA). You'll build intuition on how and why this algorithm is so powerful and will apply it both for data exploration and data pre-processing in a modeling pipeline. You'll end with a cool image compression use case. 426 | 427 | 428 | 429 | -------------------------------------------------------------------------------- /DataCamp_Notes/Feature Engineering for MachineLearning in Python.txt: -------------------------------------------------------------------------------- 1 | Course Description 2 | Every day you read about the amazing breakthroughs in how the newest applications of machine learning are changing the world. 
Often this reporting glosses over the fact that a huge amount of data munging and feature engineering must be done before any of these fancy models can be used. In this course, you will learn how to do just that. You will work with Stack Overflow Developers survey, and historic US presidential inauguration addresses, to understand how best to preprocess and engineer features from categorical, continuous, and unstructured data. This course will give you hands-on experience on how to prepare any data for your own machine learning models. 3 | 4 | <==================================================================================================================================> 5 | 6 | 1 7 | Creating Features 8 | FREE 9 | 0% 10 | In this chapter, you will explore what feature engineering is and how to get started with applying it to real-world data. You will load, explore and visualize a survey response dataset, and in doing so you will learn about its underlying data types and why they have an influence on how you should engineer your features. Using the pandas package you will create new features from both categorical and continuous columns. 11 | 12 | _______________________________________________________________________________________________________________________ 13 | 14 | One-hot encoding and dummy variables 15 | To use categorical variables in a machine learning model, you first need to represent them in a quantitative way. The two most common approaches are to one-hot encode the variables using or to use dummy variables. In this exercise, you will create both types of encoding, and compare the created column sets. We will continue using the same DataFrame from previous lesson loaded as so_survey_df and focusing on its Country column. 16 | 17 | Instructions 1/2 18 | 50 XP 19 | 1 20 | One-hot encode the Country column, adding "OH" as a prefix for each column. 21 | 22 | Take Hint (-15 XP) 23 | 2 24 | Create dummy variables for the Country column, adding "DM" as a prefix for each column. 25 | 26 | 27 | # Convert the Country column to a one hot encoded Data Frame 28 | one_hot_encoded = pd.get_dummies(so_survey_df, columns=['Country'], prefix='OH') 29 | 30 | # Print the columns names 31 | print(one_hot_encoded.columns) 32 | 33 | # Create dummy variables for the Country column 34 | dummy = pd.get_dummies(so_survey_df, columns=['Country'], drop_first=True, prefix='DM') 35 | 36 | # Print the columns names 37 | print(dummy.columns) 38 | 39 | +100 XP 40 | Great job! Did you notice that the column for France was missing when you created dummy variables? Now you can choose to use one-hot encoding or dummy variables where appropriate. 41 | 42 | _______________________________________________________________________________________________________________________ 43 | 44 | Dealing with uncommon categories 45 | Some features can have many different categories but a very uneven distribution of their occurrences. Take for example Data Science's favorite languages to code in, some common choices are Python, R, and Julia, but there can be individuals with bespoke choices, like FORTRAN, C etc. In these cases, you may not want to create a feature for each value, but only the more common occurrences. 46 | 47 | Instructions 1/3 48 | 35 XP 49 | 1 50 | 2 51 | 3 52 | Extract the Country column of so_survey_df as a series and assign it to countries. 53 | Find the counts of each category in the newly created countries series. 
54 | 55 | 56 | # Create a series out of the Country column 57 | countries = so_survey_df['Country'] 58 | 59 | # Get the counts of each category 60 | country_counts = countries.value_counts() 61 | 62 | # Print the count values for each category 63 | print(country_counts) 64 | 65 | 66 | Create a mask for values occurring less than 10 times in country_counts. 67 | Print the first 5 rows of the mask. 68 | 69 | # Create a series out of the Country column 70 | countries = so_survey_df['Country'] 71 | 72 | # Get the counts of each category 73 | country_counts = countries.value_counts() 74 | 75 | # Create a mask for only categories that occur less than 10 times 76 | mask = countries.isin(country_counts[country_counts<10].index) 77 | 78 | # Print the top 5 rows in the mask series 79 | print(mask.head()) 80 | 81 | 82 | 83 | Instructions 3/3 84 | 30 XP 85 | 3 86 | Label values occurring less than the mask cutoff as 'Other'. 87 | Print the new category counts in countries. 88 | 89 | 90 | # Create a series out of the Country column 91 | countries = so_survey_df['Country'] 92 | 93 | # Get the counts of each category 94 | country_counts = countries.value_counts() 95 | 96 | # Create a mask for only categories that occur less than 10 times 97 | mask = countries.isin(country_counts[country_counts < 10].index) 98 | 99 | # Label all other categories as Other 100 | countries[mask] = 'Other' 101 | 102 | # Print the updated category counts 103 | print(countries.value_counts()) 104 | 105 | _______________________________________________________________________________________________________________________ 106 | 107 | Binarizing columns 108 | While numeric values can often be used without any feature engineering, there will be cases when some form of manipulation can be useful. For example on some occasions, you might not care about the magnitude of a value but only care about its direction, or if it exists at all. In these situations, you will want to binarize a column. In the so_survey_df data, you have a large number of survey respondents that are working voluntarily (without pay). You will create a new column titled Paid_Job indicating whether each person is paid (their salary is greater than zero). 109 | 110 | Instructions 111 | 100 XP 112 | Create a new column called Paid_Job filled with zeros. 113 | Replace all the Paid_Job values with a 1 where the corresponding ConvertedSalary is greater than 0. 114 | 115 | # Create the Paid_Job column filled with zeros 116 | so_survey_df['Paid_Job'] = 0 117 | 118 | # Replace all the Paid_Job values where ConvertedSalary is > 0 119 | so_survey_df.loc[so_survey_df['ConvertedSalary']>0, 'Paid_Job'] = 1 120 | 121 | # Print the first five rows of the columns 122 | print(so_survey_df[['Paid_Job', 'ConvertedSalary']].head()) 123 | 124 | 125 | 126 | output: 127 | Paid_Job ConvertedSalary 128 | 0 0 0.0 129 | 1 1 70841.0 130 | 2 0 0.0 131 | 3 1 21426.0 132 | 4 1 41671.0 133 | 134 | 135 | +100 XP 136 | Good work, binarizing columns can also be useful for your target variables. 137 | _______________________________________________________________________________________________________________________ 138 | 139 | 140 | Binning values 141 | For many continuous values you will care less about the exact value of a numeric column, but instead care about the bucket it falls into. This can be useful when plotting values, or simplifying your machine learning models. It is mostly used on continuous variables where accuracy is not the biggest concern e.g. age, height, wages. 
142 | 143 | Bins are created using pd.cut(df['column_name'], bins) where bins can be an integer specifying the number of evenly spaced bins, or a list of bin boundaries. 144 | 145 | Instructions 1/2 146 | 50 XP 147 | 1 148 | Bin the ConvertedSalary column values into 5 equal bins, in a new column called equal_binned. 149 | 150 | 151 | Bin the ConvertedSalary column using the boundaries in the list bins and label the bins using labels. 152 | 153 | 154 | # Bin the continuous variable ConvertedSalary into 5 bins 155 | so_survey_df['equal_binned'] = pd.cut(so_survey_df['ConvertedSalary'], bins=5) 156 | 157 | # Print the first 5 rows of the equal_binned column 158 | print(so_survey_df[['equal_binned', 'ConvertedSalary']].head()) 159 | 160 | 161 | output: 162 | equal_binned ConvertedSalary 163 | 0 (-2000.0, 400000.0] 0.0 164 | 1 (-2000.0, 400000.0] 70841.0 165 | 2 (-2000.0, 400000.0] 0.0 166 | 3 (-2000.0, 400000.0] 21426.0 167 | 4 (-2000.0, 400000.0] 41671.0 168 | 169 | 170 | 171 | 172 | # Import numpy 173 | import numpy as np 174 | 175 | # Specify the boundaries of the bins 176 | bins = [-np.inf, 10000, 50000, 100000, 150000, np.inf] 177 | 178 | # Bin labels 179 | labels = ['Very low', 'Low', 'Medium', 'High', 'Very high'] 180 | 181 | # Bin the continuous variable ConvertedSalary using these boundaries 182 | so_survey_df['boundary_binned'] = pd.cut(so_survey_df['ConvertedSalary'], 183 | bins=bins, labels=labels) 184 | 185 | # Print the first 5 rows of the boundary_binned column 186 | print(so_survey_df[['boundary_binned', 'ConvertedSalary']].head()) 187 | 188 | 189 | 190 | output: 191 | boundary_binned ConvertedSalary 192 | 0 Very low 0.0 193 | 1 Medium 70841.0 194 | 2 Very low 0.0 195 | 3 Low 21426.0 196 | 4 Low 41671.0 197 | 198 | 199 | +100 XP 200 | Correct, now you can bin columns with equal spacing and predefined boundaries. 201 | 202 | 203 | 204 | 205 | <==================================================================================================================================> 206 | 207 | 208 | VIEW CHAPTER DETAILS 209 | 2 210 | Dealing with Messy Data 211 | 0% 212 | This chapter introduces you to the reality of messy and incomplete data. You will learn how to find where your data has missing values and explore multiple approaches on how to deal with them. You will also use string manipulation techniques to deal with unwanted characters in your dataset. 213 | 214 | 215 | <==================================================================================================================================> 216 | VIEW CHAPTER DETAILS 217 | 3 218 | Conforming to Statistical Assumptions 219 | 0% 220 | In this chapter, you will focus on analyzing the underlying distribution of your data and whether it will impact your machine learning pipeline. You will learn how to deal with skewed data and situations where outliers may be negatively impacting your analysis. 221 | 222 | 223 | <==================================================================================================================================> 224 | VIEW CHAPTER DETAILS 225 | 4 226 | Dealing with Text Data 227 | 0% 228 | Finally, in this chapter, you will work with unstructured text data, understanding ways in which you can engineer columnar features out of a text corpus. You will compare how different approaches may impact how much context is being extracted from a text, and how to balance the need for context, without too many features being created. 
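
The detailed notes for this chapter aren't captured here, but as a rough sketch of what "columnar features out of a text corpus" means in practice, a bag-of-words approach with scikit-learn's CountVectorizer looks roughly like this. The toy corpus, the column cap, and the variable names below are made up for illustration (the course itself uses the inauguration addresses mentioned in the description); get_feature_names_out assumes scikit-learn 1.0 or newer.

from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Toy stand-in for a corpus of speeches
corpus = ["my fellow citizens of the united states",
          "fellow citizens of the senate and of the house",
          "the people of the united states"]

# Cap the vocabulary so we don't create one column per rare word
vectorizer = CountVectorizer(max_features=5, stop_words='english')
counts = vectorizer.fit_transform(corpus)

# One row per document, one column per retained term
text_features = pd.DataFrame(counts.toarray(),
                             columns=vectorizer.get_feature_names_out())
print(text_features)

Raising or lowering max_features is one way to balance how much context is kept against how many new columns get created.
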
229 | 230 | VIEW CHAPTER DETAILS 231 | -------------------------------------------------------------------------------- /DataCamp_Notes/Interactive Data Visualization with Bokeh.txt: -------------------------------------------------------------------------------- 1 | Interactive Data 2 | Visualization with 3 | Bokeh 4 | 5 | What is Bokeh? 6 | ● Interactive visualization, controls, and tools 7 | ● Versatile and high-level graphics 8 | ● High-level statistical charts 9 | ● Streaming, dynamic, large data 10 | ● For the browser, with or without a server 11 | ● No JavaScript 12 | 13 | 14 | 15 | 16 | What you will learn 17 | ● Basic plo!ing with bokeh.plotting 18 | ● Layouts, interactions, and annotations 19 | ● Statistical charting with bokeh.charts 20 | ● Interactive data applications in the browser 21 | ● Case Study: A Gapminder explorer 22 | 23 | 24 | 25 | 26 | <=================================================================================================================================> 27 | 28 | Plo!ing with 29 | Glyphs 30 | 31 | 32 | What are Glyphs 33 | ● Visual shapes 34 | ● circles, squares, triangles 35 | ● rectangles, lines, wedges 36 | ● With properties a!ached to data 37 | ● coordinates (x,y) 38 | ● size, color, transparency 39 | 40 | 41 | 42 | Typical usage 43 | In [1]: from bokeh.io import output_file, show 44 | In [2]: from bokeh.plotting import figure 45 | In [3]: plot = figure(plot_width=400, tools='pan,box_zoom') 46 | In [4]: plot.circle([1,2,3,4,5], [8,6,5,2,3]) 47 | In [5]: output_file('circle.html') 48 | In [6]: show(plot) 49 | 50 | 51 | 52 | Glyph properties 53 | ● Lists, arrays, sequences of values 54 | ● Single fixed values 55 | In [1]: plot = figure() 56 | In [2]: plot.circle(x=10, y=[2,5,8,12], size=[10,20,30,40]) 57 | 58 | 59 | 60 | 61 | Markers ● asterisk() ● circle() ● circle_cross() ● circle_x() ● cross() ● diamond() ● diamond_cross() ● inverted_triangle() ● square() ● square_cross() ● square_x() ● triangle() ● x() 62 | 63 | 64 | 65 | 66 | <=================================================================================================================================> 67 | 68 | 69 | Additional Glyphs 70 | 71 | 72 | Lines 73 | In [1]: from bokeh.io import output_file, show 74 | In [2]: from bokeh.plotting import figure 75 | In [3]: x = [1,2,3,4,5] 76 | In [4]: y = [8,6,5,2,3] 77 | In [5]: plot = figure() 78 | In [6]: plot.line(x, y, line_width=3) 79 | In [7]: output_file('line.html') 80 | In [8]: show(plot) 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | <=================================================================================================================================> 89 | 90 | <=================================================================================================================================> 91 | 92 | <=================================================================================================================================> 93 | 94 | <=================================================================================================================================> 95 | 96 | <=================================================================================================================================> 97 | 98 | <=================================================================================================================================> 99 | 100 | <=================================================================================================================================> 101 | 102 | 
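
(The remaining sections of these notes were left blank. As a small extension of the line example above — my own addition, not from the course slides — multiple glyphs can be layered on the same figure; this assumes the same pre-3.0 Bokeh API used in the examples above.)

from bokeh.io import output_file, show
from bokeh.plotting import figure

x = [1, 2, 3, 4, 5]
y = [8, 6, 5, 2, 3]

plot = figure(plot_width=400, tools='pan,box_zoom')

# Draw the line first, then overlay circle markers at the same points
plot.line(x, y, line_width=2)
plot.circle(x, y, size=10, fill_color='white')

output_file('line_and_circle.html')
show(plot)
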
<=================================================================================================================================> 103 | 104 | <=================================================================================================================================> 105 | 106 | <=================================================================================================================================> 107 | 108 | <=================================================================================================================================> 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /DataCamp_Notes/Intermediate SQL.txt: -------------------------------------------------------------------------------- 1 | Intermediate SQL 2 | Course Description 3 | So you've learned how to aggregate and join data from tables in your database—now what? How do you manipulate, transform, and make the most sense of your data? This intermediate-level course will teach you several key functions necessary to wrangle, filter, and categorize information in a relational database, expand your SQL toolkit, and answer complex questions. You will learn the robust use of CASE statements, subqueries, and window functions—all while discovering some interesting facts about soccer using the European Soccer Database. 4 | 5 | ____________________________________________________________________________________________________________________________________ 6 | 7 | Topics covered: 8 | CASE statements 9 | Simple subqueries 10 | Correlated subqueries 11 | Window functions 12 | 13 | ------------------------------------------------------------------------------------------------------------ 14 | 15 | CASE statements 16 | Contains a WHEN , THEN , and ELSE statement, finished with END 17 | 18 | Example: 19 | CASE WHEN x = 1 THEN 'a' 20 | WHEN x = 2 THEN 'b' 21 | ELSE 'c' END AS new_column 22 | 23 | 24 | ------------------------------------------------------------------------------------------------------------ 25 | 26 | CASE WHEN ... AND then some 27 | Add multiple logical conditions to your WHEN clause! 28 | 29 | Example: 30 | SELECT date, hometeam_id, awayteam_id, 31 | CASE WHEN hometeam_id = 8455 AND home_goal > away_goal 32 | THEN 'Chelsea home win!' 33 | WHEN awayteam_id = 8455 AND home_goal < away_goal 34 | THEN 'Chelsea away win!' 35 | ELSE 'Loss or tie :(' END AS outcome 36 | FROM match 37 | WHERE hometeam_id = 8455 OR awayteam_id = 8455; 38 | 39 | ------------------------------------------------------------------------------------------------------------ 40 | 41 | Filtering your CASE statement 42 | Let's generate a list of matches won by Italy's Bologna team! There are quite a few additional teams in the two tables, so a key part of generating a usable query will be using your CASE statement as a filter in the WHERE clause. 43 | 44 | CASE statements allow you to categorize data that you're interested in -- and exclude data you're not interested in. In order to do this, you can use a CASE statement as a filter in the WHERE statement to remove output you don't want to see. 45 | 46 | Here is how you might set that up: 47 | 48 | SELECT * 49 | FROM table 50 | WHERE 51 | CASE WHEN a > 5 THEN 'Keep' 52 | WHEN a <= 5 THEN 'Exclude' END = 'Keep'; 53 | In essence, you can use the CASE statement as a filtering column like any other column in your database. The only difference is that you don't alias the statement in WHERE. 
54 | 55 | Instructions 3/3 56 | 0 XP 57 | 3 58 | Select the home_goal and away_goal for each match. 59 | Use the CASE statement in the WHERE clause to filter all NULL values generated by the statement in the previous step. 60 | 61 | 62 | -- Select the season, date, home_goal, and away_goal columns 63 | SELECT 64 | season, 65 | date, 66 | home_goal, 67 | away_goal 68 | FROM matches_italy 69 | WHERE 70 | -- Exclude games not won by Bologna 71 | CASE WHEN hometeam_id = 9857 AND home_goal > away_goal THEN 'Bologna Win' 72 | WHEN awayteam_id = 9857 AND away_goal > home_goal THEN 'Bologna Win' 73 | END IS NOT NULL; 74 | 75 | ------------------------------------------------------------------------------------------------------------ 76 | 77 | In CASE you need to aggregate 78 | CASE statements are great for 79 | Categorizing data 80 | Filtering data 81 | Aggregating data 82 | 83 | ------------------------------------------------------------------------------------------------------------ 84 | 85 | Calculating percent with CASE and AVG 86 | CASE statements will return any value you specify in your THEN clause. This is an incredibly powerful tool for robust calculations and data manipulation when used in conjunction with an aggregate statement. One key task you can perform is using CASE inside an AVG function to calculate a percentage of information in your database. 87 | 88 | Here's an example of how you set that up: 89 | 90 | AVG(CASE WHEN condition_is_met THEN 1 91 | WHEN condition_is_not_met THEN 0 END) 92 | With this approach, it's important to accurately specify which records count as 0, otherwise your calculations may not be correct! 93 | 94 | Your task is to examine the number of wins, losses, and ties in each country. The matches table is filtered to include all matches from the 2013/2014 and 2014/2015 seasons. 95 | 96 | Instructions 1/3 97 | 35 XP 98 | 1 99 | 2 100 | 3 101 | Create 3 CASE statements to COUNT the total number of home team wins, away team wins, and ties. This will allow you to examine the total number of records. You will convert this to an AVG in the next step. 102 | 103 | 104 | SELECT 105 | c.name AS country, 106 | -- Count the home wins, away wins, and ties in each country 107 | count(case when m.home_goal > m.away_goal THEN m.id 108 | END) AS home_wins, 109 | count(case when m.home_goal < m.away_goal THEN m.id 110 | END) AS away_wins, 111 | count(case when m.home_goal = m.away_goal THEN m.id 112 | END) AS ties 113 | FROM country AS c 114 | LEFT JOIN matches AS m 115 | ON c.id = m.country_id 116 | GROUP BY country; 117 | 118 | 119 | 120 | Instructions 2/3 121 | 35 XP 122 | 2 123 | 3 124 | Calculate the percentage of matches tied using a CASE statement inside AVG. 125 | Fill in the logical operators for each statement. Alias your columns as ties_2013_2014 and ties_2014_2015, respectively. 
126 | 127 | SELECT 128 | c.name AS country, 129 | -- Calculate the percentage of tied games in each season 130 | avg(case when m.season='2013/2014' AND m.home_goal = m.away_goal THEN 1 131 | WHEN m.season='2013/2014' AND m.home_goal != m.away_goal THEN 0 132 | END) AS ties_2013_2014, 133 | avg(case when m.season='2014/2015' and m.home_goal = m.away_goal then 1 134 | WHEN m.season='2014/2015' and m.home_goal != m.away_goal then 0 135 | end) AS ties_2014_2015 136 | FROM country AS c 137 | LEFT JOIN matches AS m 138 | ON c.id = m.country_id 139 | GROUP BY country; 140 | 141 | 142 | 143 | 144 | 145 | 146 | Instructions 3/3 147 | 30 XP 148 | 3 149 | The previous "ties" columns returned values with 14 decimal points, which is not easy to interpret. Use the ROUND function to round to 2 decimal points. 150 | 151 | SELECT 152 | c.name AS country, 153 | -- Round the percentage of tied games to 2 decimal points 154 | round(avg(CASE WHEN m.season='2013/2014' AND m.home_goal = m.away_goal THEN 1 155 | WHEN m.season='2013/2014' AND m.home_goal != m.away_goal THEN 0 156 | END),2) AS pct_ties_2013_2014, 157 | round(avg(CASE WHEN m.season='2014/2015' AND m.home_goal = m.away_goal THEN 1 158 | WHEN m.season='2014/2015' AND m.home_goal != m.away_goal THEN 0 159 | END),2) AS pct_ties_2014_2015 160 | FROM country AS c 161 | LEFT JOIN matches AS m 162 | ON c.id = m.country_id 163 | GROUP BY country; 164 | 165 | ------------------------------------------------------------------------------------------------------------ 166 | 167 | 168 | -------------------------------------------------------------------------------- /DataCamp_Notes/Introduction to Git for Data Science.txt: -------------------------------------------------------------------------------- 1 | Interactive Course 2 | Introduction to Git for Data Science 3 | Introduction to Git for Data Science 4 | 5 | 4 hours 6 | 0 Videos 7 | 46 Exercises 8 | 65,206 Participants 9 | 3,650 XP 10 | 11 | Greg Wilson 12 | Greg Wilson 13 | 14 | Co-founder of Software Carpentry 15 | 16 | Dr. Greg Wilson has worked for 30 years in both industry and academia, and is the author or editor of several books on computing and two for children. He is best known as the co-founder of Software Carpentry, a non-profit organization that teaches basic computing skills to researchers. 17 | Collaborator(s) 18 | 19 | Filip Schouwenaars 20 | 21 | Filip Schouwenaars 22 | 23 | Course Description 24 | 25 | Version control is one of the power tools of programming. It allows you to keep track of what you did when, undo any changes you decide you don't want, and collaborate at scale with other people. This course will introduce you to Git, a modern version control tool that is very popular with data scientists and software developers, and show you how to use it to get more done in less time and with less pain. 26 | 27 | <============================================================================================================================> 28 | 29 | 1 30 | Basic workflow 31 | 100% 32 | 33 | This chapter explains what version control is and why you should use it, and introduces the most common steps in a common Git workflow. 34 | View Chapter Details 35 | 36 | 37 | 38 | 39 | 40 | <============================================================================================================================> 41 | 2 42 | Repositories 43 | 9% 44 | 45 | This chapter digs a little deeper into how Git stores information and how you can explore a repository's history. 
46 | View Chapter Details 47 | ______________________________________________________________________________________________________________________________ 48 | 49 | 50 | 51 | 52 | 53 | 54 | <============================================================================================================================> 55 | 3 56 | Undo 57 | 0% 58 | 59 | Since Git saves all the changes you've made to your files, you can use it to undo those changes. This chapter shows you several ways to do that. 60 | View Chapter Details 61 | 62 | 63 | 64 | 65 | <============================================================================================================================> 66 | 4 67 | Working with branches 68 | 0% 69 | 70 | Branching is one of Git's most powerful features, since it allows you to work on several things at once. This chapter shows you how to create and manage branches. 71 | View Chapter Details 72 | 73 | 74 | 75 | 76 | 77 | <============================================================================================================================> 78 | 5 79 | Collaborating 80 | 0% 81 | 82 | This chapter showcases how Git allows you to share changes between repositories to collaborate at scale. 83 | View Chapter Details 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /DataCamp_Notes/Linear Classifiers in Python.txt: -------------------------------------------------------------------------------- 1 | Linear Classifiers in Python 2 | 3 | Course Description 4 | In this course you'll learn all about using linear classifiers, specifically logistic regression and support vector machines, with scikit-learn. Once you've learned how to apply these methods, you'll dive into the ideas behind them and find out what really makes them tick. At the end of this course you'll know how to train, test, and tune these linear classifiers in Python. You'll also have a conceptual foundation for understanding many other machine learning algorithms. 5 | 6 | -------------------------------------------------------------------------------------------------------------------------------------------------------- 7 | 8 | Applying logistic regression and SVM 9 | FREE 10 | 100% 11 | In this chapter you will learn the basics of applying logistic regression and support vector machines (SVMs) to classification problems. You'll use the scikit-learn library to fit classification models to real data. 12 | 13 | -------------------------------------------------------------------------------------------------------------------------------------------------------- 14 | 15 | 16 | Loss functions 17 | 0% 18 | In this chapter you will discover the conceptual framework behind logistic regression and SVMs. This will let you delve deeper into the inner workings of these models. 19 | 20 | -------------------------------------------------------------------------------------------------------------------------------------------------------- 21 | 22 | Logistic regression 23 | 0% 24 | In this chapter you will delve into the details of logistic regression. You'll learn all about regularization and how to interpret model output. 25 | 26 | -------------------------------------------------------------------------------------------------------------------------------------------------------- 27 | 28 | 4 29 | Support Vector Machines 30 | 0% 31 | In this chapter you will learn all about the details of support vector machines. 
You'll learn about tuning hyperparameters for these models and using kernels to fit non-linear decision boundaries. 32 | 33 | 34 | -------------------------------------------------------------------------------- /DataCamp_Notes/Visualizing Time Series Data in Python.txt: -------------------------------------------------------------------------------- 1 | Visualizing Time Series Data in Python 2 | 3 | 4 | Course Description 5 | Time series data is omnipresent in the field of Data Science. Whether it is analyzing business trends, forecasting company revenue or exploring customer behavior, every data scientist is likely to encounter time series data at some point during their work. To get you started on working with time series data, this course will provide practical knowledge on visualizing time series data using Python. 6 | <=====================================================================================================================================> 7 | 8 | 1 9 | Introduction 10 | FREE 11 | 12 | 13 | 0% 14 | You will learn how to leverage basic plotting tools in Python, and how to annotate and personalize your time series plots. By the end of this chapter, you will be able to take any static dataset and produce compelling plots of your data. 15 | 16 | ----------------------------------------------------------------------------------------------------------------- 17 | 18 | 19 | Load your time series data 20 | The most common way to import time series data in Python is by using the pandas library. You can use the read_csv() function from pandas to read the contents of a file into a DataFrame. This can be achieved using the following command: 21 | 22 | df = pd.read_csv("name_of_your_file.csv") 23 | Once your data is loaded into Python, you can display the first rows of your DataFrame by calling the .head(n=5) method, where n=5 indicates that you want to print the first five rows of your DataFrame. 24 | 25 | In this exercise, you will read in a time series dataset that contains the number of "great" inventions and scientific discoveries from 1860 to 1959, and display its first five rows. 26 | 27 | Instructions 28 | 100 XP 29 | Import the pandas library using the pd alias. 30 | Read in the time series data from the csv file located at url_discoveries into a DataFrame called discoveries. 31 | Print the first 5 lines of the DataFrame using the .head() method. 32 | 33 | # Import pandas 34 | import pandas as pd 35 | 36 | # Read in the file content in a DataFrame called discoveries 37 | discoveries = pd.read_csv(url_discoveries) 38 | 39 | # Display the first five lines of the DataFrame 40 | print(discoveries.head()) 41 | 42 | ----------------------------------------------------------------------------------------------------------------- 43 | 44 | Test whether your data is of the correct type 45 | When working with time series data in pandas, any date information should be formatted as a datetime64 type. Therefore, it is important to check that the columns containing the date information are of the correct type. You can check the type of each column in a DataFrame by using the .dtypes attribute. Fortunately, if your date columns come as strings, epochs, etc... you can use the to_datetime() function to convert them to the appropriate datetime64 type: 46 | 47 | df['date_column'] = pd.to_datetime(df['date_column']) 48 | In this exercise, you will learn how to check the data type of the columns in your time series data and convert a date column to the appropriate datetime type.
49 | 50 | Instructions 1/3 51 | 30 XP 52 | 1 53 | 2 54 | 3 55 | Print out the data type of the column in the discoveries object. 56 | 57 | # Print the data type of each column in discoveries 58 | print(discoveries.dtypes) 59 | 60 | # Convert the date column to a datestamp type 61 | discoveries['date'] = pd.to_datetime(discoveries['date']) 62 | 63 | # Print the data type of each column in discoveries, again 64 | print(discoveries.dtypes) 65 | ----------------------------------------------------------------------------------------------------------------- 66 | 67 | Your first plot! 68 | Let's take everything you have learned so far and plot your first time series plot. You will set the groundwork by producing a time series plot of your data and labeling the axes of your plot, as this makes the plot more readable and interpretable for the intended audience. 69 | 70 | matplotlib is the most widely used plotting library in Python, and would be the most appropriate tool for this job. Fortunately for us, the pandas library has implemented a .plot() method on Series and DataFrame objects that is a wrapper around matplotlib.pyplot.plot(), which makes it easier to produce plots. 71 | 72 | Instructions 73 | 100 XP 74 | Set the 'date' column as the index of your DataFrame. 75 | Using the discoveries DataFrame, plot the time series in your DataFrame using a "blue" line plot and assign it to ax. 76 | Specify the x-axis label on your plot: 'Date'. 77 | Specify the y-axis label on your plot: 'Number of great discoveries'. 78 | 79 | 80 | # Set the date column as the index of your DataFrame discoveries 81 | discoveries = discoveries.set_index('date') 82 | 83 | # Plot the time series in your DataFrame 84 | ax = discoveries.plot(color='blue') 85 | 86 | # Specify the x-axis label in your plot 87 | ax.set_xlabel('Date') 88 | 89 | # Specify the y-axis label in your plot 90 | ax.set_ylabel('Number of great discoveries') 91 | 92 | # Show plot 93 | plt.show() 94 | 95 | ----------------------------------------------------------------------------------------------------------------- 96 | 97 | Specify plot styles 98 | The matplotlib library also comes with a number of built-in stylesheets that allow you to customize the appearance of your plots. To use a particular style sheet for your plots, you can use the command plt.style.use(your_stylesheet) where your_stylesheet is the name of the style sheet. 99 | 100 | In order to see the list of available style sheets that can be used, you can use the command print(plt.style.available). For the rest of this course, we will use the awesome fivethirtyeight style sheet. 101 | 102 | Instructions 1/2 103 | 50 XP 104 | 1 105 | 2 106 | Import matplotlib.pyplot using its usual alias plt. 107 | Use the fivethirtyeight style sheet to plot a line plot of the discoveries data. 108 | 109 | # Import the matplotlib.pyplot sub-module 110 | import matplotlib.pyplot as plt 111 | 112 | # Use the fivethirtyeight style 113 | plt.style.use('fivethirtyeight') 114 | 115 | # Plot the time series 116 | ax1 = discoveries.plot() 117 | ax1.set_title('FiveThirtyEight Style') 118 | plt.show() 119 | 120 | 121 | Use the ggplot style sheet to plot a line plot of the discoveries data. 122 | Set the title of your second plot as 'ggplot Style'. 
123 | 124 | # Import the matplotlib.pyplot sub-module 125 | import matplotlib.pyplot as plt 126 | 127 | # Use the ggplot style 128 | plt.style.use('ggplot') 129 | ax2 = discoveries.plot() 130 | 131 | # Set the title 132 | ax2.set_title('ggplot Style') 133 | plt.show() 134 | ----------------------------------------------------------------------------------------------------------------- 135 | 136 | 137 | Display and label plots 138 | As you saw earlier, if the index of a pandas DataFrame consists of dates, then pandas will automatically format the x-axis in a human-readable way. In addition the .plot() method allows you to specify various other parameters to tailor your time series plot (color of the lines, width of the lines and figure size). 139 | 140 | You may have noticed the use of the notation ax = df.plot(...) and wondered about the purpose of the ax object. This is because the plot function returns a matplotlib AxesSubplot object, and it is common practice to assign this returned object to a variable called ax. Doing so also allows you to include additional notations and specifications to your plot such as axis labels. 141 | 142 | Instructions 143 | 100 XP 144 | Display a line chart of the discoveries DataFrame. 145 | 146 | Specify the color of the line as 'blue'. 147 | Width of the line as 2. 148 | The dimensions of your plot to be of length 8 and width 3. 149 | Specify the fontsize of 6. 150 | 151 | 152 | # Plot a line chart of the discoveries DataFrame using the specified arguments 153 | ax = discoveries.plot(color='blue', figsize=(8, 3), linewidth=2, fontsize=6) 154 | 155 | # Specify the title in your plot 156 | ax.set_title('Number of great inventions and scientific discoveries from 1860 to 1959', fontsize=8) 157 | 158 | # Show plot 159 | plt.show() 160 | ----------------------------------------------------------------------------------------------------------------- 161 | Subset time series data 162 | When plotting time series data, you may occasionally want to visualize only a subset of the data. The pandas library provides powerful indexing and subsetting methods that allow you to extract specific portions of a DataFrame. For example, you can subset all the data between 1950 and 1960 in the discoveries DataFrame by specifying the following date range: 163 | 164 | subset_data = discoveries['1950-01-01':'1960-01-01'] 165 | Note: Subsetting your data this way is only possible if the index of your DataFrame contains dates of the datetime type. Failing that, the pandas library will return an error message. 166 | 167 | Instructions 1/2 168 | 50 XP 169 | 1 170 | 2 171 | Use discoveries to create a new DataFrame discoveries_subset_1 that contains all the data between January 1, 1945 and January 1, 1950. 172 | Plot the time series of discoveries_subset_1 using a "blue" line plot. 173 | 174 | 175 | 176 | 177 | 178 | <=====================================================================================================================================> 179 | 180 | 2 181 | Summary Statistics and Diagnostics 182 | 0% 183 | In this chapter, you will gain a deeper understanding of your time series data by computing summary statistics and plotting aggregated views of your data. 
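The "Subset time series data" exercise above stops at the instructions; a minimal solution sketch (not from the original notes, and assuming the 'date' column has already been set as the index and matplotlib.pyplot imported as plt, as in the earlier exercises) might look like:

# Select the data between January 1, 1945 and January 1, 1950
discoveries_subset_1 = discoveries['1945-01-01':'1950-01-01']

# Plot the subset as a blue line plot
ax = discoveries_subset_1.plot(color='blue')
plt.show()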
184 | 185 | <=====================================================================================================================================> 186 | 187 | 188 | 3 189 | Seasonality, Trend and Noise 190 | 0% 191 | You will go beyond summary statistics by learning about autocorrelation and partial autocorrelation plots. You will also learn how to automatically detect seasonality, trend and noise in your time series data. 192 | 193 | <=====================================================================================================================================> 194 | 195 | 4 196 | Work with Multiple Time Series 197 | 0% 198 | In the field of Data Science, it is common to be involved in projects where multiple time series need to be studied simultaneously. In this chapter, we will show you how to plot multiple time series at once, and how to discover and describe relationships between multiple time series. 199 | 200 | <=====================================================================================================================================> 201 | 202 | 203 | 5 204 | Case Study 205 | 0% 206 | This chapter will give you a chance to practice all the concepts covered in the course. You will visualize the unemployment rate in the US from 2000 to 2010. 207 | 208 | -------------------------------------------------------------------------------- /MBazeley_Resume_Aug2020_v2.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/MBazeley_Resume_Aug2020_v2.docx -------------------------------------------------------------------------------- /MBazeley_Resume_Aug2020_v2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/MBazeley_Resume_Aug2020_v2.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![alt text]( 2 | https://github.com/MMBazel/springboard-program/blob/master/0.jpg 3 | ) 4 | 5 | 6 | # Springboard Data Science Career Track 7 | 8 | Hi! 9 | 10 | My name is Mikiko Bazeley and this is my repo for the Springboard Data Science Track. 11 | 12 | From Oct 2018 to April 2019 I completed a number of projects, including two capstones, as part of the DS track. 13 | 14 | All of the documentation, code, and notes can be found here, as well as links to other resources I found helpful for successfully completing the program. 15 | 16 | For questions or comments, please feel free to reach out on [LinkedIn](https://www.linkedin.com/in/mikikobazeley/). 17 | 18 | If you find my repo useful, let me know OR ☕ consider buying me a coffee! https://www.buymeacoffee.com/mmbazel ☕. 19 | 20 | Regards, 21 | Mikiko 22 | 23 | ![alt text]( 24 | https://github.com/MMBazel/springboard-program/blob/master/Additional%20Resources/profile_pic_jpeg.jpg?raw=true 25 | ) 26 | 27 | 28 | -------------------------------------------------------------------------------------------------------------------------------- 29 | # Project List by Unit of Study 30 | 31 | For a comprehensive list of the projects and corresponding skills needed, please see the list below. 32 | 33 | ## 1.
The Python Data Science Stack 34 | Topics covered: 35 | * Python 36 | * Matplotlib, Seaborn—visualization tools in Python 37 | * Writing clear, elegant, readable code 38 | in Python using the PEP8 standard 39 | 40 | ## 2. Data Wrangling 41 | Topics covered: 42 | * Deep dive into Pandas for data wrangling 43 | * Data in files: Work with a variety of file formats from plain text (.txt) to more structured and nested formats files like csv and JSON 44 | * Data in databases: Get an overview of relational and NoSQL databases and practice data querying with SQL 45 | * APIs: Collect data from the internet using Application Programming Interfaces (APIs) 46 | 47 | Projects: 48 | * =====> [Mini Project: SQL Practice](https://github.com/MMBazel/springboard-program/tree/master/mini-projects/Ch%205%20-%20Data%20Wrangling/5.3%20SQL%20Practice) 49 | 50 | 51 | 52 | ## 3. Data Story 53 | 54 | ## 4. Statistical Inference 55 | Topics covered: 56 | * Theory of inferential statistics 57 | * Statistical significance 58 | * Parameter estimation 59 | * Hypothesis testing 60 | * Correlation and regression 61 | * Exploratory data analysis 62 | * A/B testing 63 | 64 | 65 | ## 5. Machine Learning 66 | Topics covered: 67 | * Scikit-learn 68 | * Supervised and unsupervised learning 69 | * Top machine learning techniques: 70 | * Linear and logistic regression 71 | * naive bayes 72 | * support vector machines 73 | * decision trees 74 | * clustering 75 | * Ensemble learning with random forests and gradient boosting 76 | * Best practices 77 | * Evaluating and tuning machine learning systems 78 | 79 | 80 | 81 | ## 6. Capstone Project 1: Building a Data Product 82 | 83 | * =====> My Capstone Project: [Predicting Qualifieds from First Call](https://github.com/MMBazel/springboard-program/tree/master/capstone1) 84 | 85 | 86 | ## 7. The Natural Language Processing (NLP) Track 87 | 88 | Topics covered: 89 | * How to work with text and natural language data 90 | * NLP in Python, using common libraries such as NLTK and spaCy 91 | * Basics of Deep Learning in NLP using word2vec and TensorFlow 92 | * Data Science at Scale using Spark 93 | * Software Engineering for Data Scientists 94 | 95 | 96 | ## 8. 
Second Capstone Project: NLP 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-minimal -------------------------------------------------------------------------------- /about.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: This will be used as the title-tag of the page head 3 | --- 4 | 5 | hello 6 | ===== 7 | 8 | **You are here!** 9 | 10 | -------------------------------------------------------------------------------- /capstone1/BazeleyMikiko_Capstone1_Springboard_V2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/BazeleyMikiko_Capstone1_Springboard_V2.pdf -------------------------------------------------------------------------------- /capstone1/BazeleyMikiko_Capstone1_Springboard_V2.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/BazeleyMikiko_Capstone1_Springboard_V2.pptx -------------------------------------------------------------------------------- /capstone1/Capstone Project 1_ Final Report (V2).pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Capstone Project 1_ Final Report (V2).pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/Apply Data Storytelling.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/Apply Data Storytelling.pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/Apply Inferential Statistics.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/Apply Inferential Statistics.pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/BazeleyMikiko_Capstone1_Springboard_2019March.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/BazeleyMikiko_Capstone1_Springboard_2019March.pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/BazeleyMikiko_Capstone1_Springboard_2019March.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/BazeleyMikiko_Capstone1_Springboard_2019March.pptx -------------------------------------------------------------------------------- /capstone1/Project Documents/Capstone 1 - In-Depth 
Analysis - MBazeley.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/Capstone 1 - In-Depth Analysis - MBazeley.pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/Capstone Project 1_ Data Storytelling - Google Docs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/Capstone Project 1_ Data Storytelling - Google Docs.pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/Capstone Project 1_ Exploratory Data Analysis.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/Capstone Project 1_ Exploratory Data Analysis.pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/Capstone Project 1_ Milestone Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/Capstone Project 1_ Milestone Report.pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/Data Story Rubric_ Capstone 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/Data Story Rubric_ Capstone 1.pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/Data Wrangling Rubric_ Capstone 1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/Data Wrangling Rubric_ Capstone 1.pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/Milestone Report Rubric_ Capstone 1 .pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/Milestone Report Rubric_ Capstone 1 .pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/Project Capstone Project 1 Data Wrangling.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/Project Capstone Project 1 Data Wrangling.pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/Project Capstone Project 1 Milestone Report.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/Project Capstone Project 1 Milestone Report.pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/_ DSC Capstone Project 1 Rubric_ Inferential Statistics.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project Documents/_ DSC Capstone Project 1 Rubric_ Inferential Statistics.pdf -------------------------------------------------------------------------------- /capstone1/Project Documents/dumy.txt: -------------------------------------------------------------------------------- 1 | 1 2 | -------------------------------------------------------------------------------- /capstone1/Project_ Capstone Project 1_ Project Proposal.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Project_ Capstone Project 1_ Project Proposal.docx -------------------------------------------------------------------------------- /capstone1/Springboard Project Capstone 1 - Project Ideas - Google Docs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone1/Springboard Project Capstone 1 - Project Ideas - Google Docs.pdf -------------------------------------------------------------------------------- /capstone2/Capstone 2_ Final Summary.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone2/Capstone 2_ Final Summary.pdf -------------------------------------------------------------------------------- /capstone2/Capstone 2_ Milestone 1 Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone2/Capstone 2_ Milestone 1 Report.pdf -------------------------------------------------------------------------------- /capstone2/Capstone 2_ Milestone 2 Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone2/Capstone 2_ Milestone 2 Report.pdf -------------------------------------------------------------------------------- /capstone2/Capstone2_Slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone2/Capstone2_Slides.pdf -------------------------------------------------------------------------------- /capstone2/Capstone2_Slides.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/capstone2/Capstone2_Slides.pptx 
-------------------------------------------------------------------------------- /capstone2/dummy: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Linear - Logistic Regression/DSC Mini-Project_Linear Regression Rubric.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Linear - Logistic Regression/DSC Mini-Project_Linear Regression Rubric.pdf -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Linear - Logistic Regression/images/conditionalmean.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Linear - Logistic Regression/images/conditionalmean.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Linear - Logistic Regression/images/cs109gitflow3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Linear - Logistic Regression/images/cs109gitflow3.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Linear - Logistic Regression/images/dummy.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Linear - Logistic Regression/images/shuttle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Linear - Logistic Regression/images/shuttle.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/data/dummy: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/dummy: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/.DS_Store -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/bias.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/bias.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/complexity-error-plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/complexity-error-plot.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/complexity-error-reg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/complexity-error-reg.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/data.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/dummy: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/knn1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/knn1.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/knn2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/knn2.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/linreg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/linreg.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/linsep.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/linsep.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/onelinesplit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/onelinesplit.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/pcanim.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/pcanim.gif -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/reshape.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/reshape.jpg -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/sklearn2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/sklearn2.jpg -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/sklearntrans.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/sklearntrans.jpg -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/train-cv2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/train-cv2.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/train-cv3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/train-cv3.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic 
Regression/images/train-test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/train-test.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/train-validate-test-cont.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/train-validate-test-cont.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/train-validate-test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/train-validate-test.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/train-validate-test3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.1 Logistic Regression/images/train-validate-test3.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.3 Bayesian Methods and Text Data/callibration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.3 Bayesian Methods and Text Data/callibration.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.3 Bayesian Methods and Text Data/terms.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.3 Bayesian Methods and Text Data/terms.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.3 Bayesian Methods and Text Data/terms2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.3 Bayesian Methods and Text Data/terms2.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.3 Bayesian Methods and Text Data/vsm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 
10.3 Bayesian Methods and Text Data/vsm.png -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.5 Introduction to Unsupervised Learning/.ipynb_checkpoints/dummy: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.5 Introduction to Unsupervised Learning/WineKMC.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 10 Machine Learning/Ch 10.5 Introduction to Unsupervised Learning/WineKMC.xlsx -------------------------------------------------------------------------------- /mini-projects/Ch 10 Machine Learning/Ch 10.5 Introduction to Unsupervised Learning/dummy: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 16 Spark and PySpark/dummy.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.2 Working with Data/JSON Exervise/data/dummy: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.2 Working with Data/JSON Exervise/data/world_bank_projects_less.json: -------------------------------------------------------------------------------- 1 | [{ "_id" : { "$oid" : "52b213b38594d8a2be17c780" }, "approvalfy" : 1999, "board_approval_month" : "November", "boardapprovaldate" : "2013-11-12T00:00:00Z", "borrower" : "FEDERAL DEMOCRATIC REPUBLIC OF ETHIOPIA", "closingdate" : "2018-07-07T00:00:00Z", "country_namecode" : "Federal Democratic Republic of Ethiopia!$!ET", "countrycode" : "ET", "countryname" : "Federal Democratic Republic of Ethiopia", "countryshortname" : "Ethiopia", "docty" : "Project Information Document,Indigenous Peoples Plan,Project Information Document", "envassesmentcategorycode" : "C", "grantamt" : 0, "ibrdcommamt" : 0, "id" : "P129828", "idacommamt" : 130000000, "impagency" : "MINISTRY OF EDUCATION", "lendinginstr" : "Investment Project Financing", "lendinginstrtype" : "IN", "lendprojectcost" : 550000000, "majorsector_percent" : [ { "Name" : "Education", "Percent" : 46 }, { "Name" : "Education", "Percent" : 26 }, { "Name" : "Public Administration, Law, and Justice", "Percent" : 16 }, { "Name" : "Education", "Percent" : 12 } ], "mjsector_namecode" : [ { "name" : "Education", "code" : "EX" }, { "name" : "Education", "code" : "EX" }, { "name" : "Public Administration, Law, and Justice", "code" : "BX" }, { "name" : "Education", "code" : "EX" } ], "mjtheme" : [ "Human development" ], "mjtheme_namecode" : [ { "name" : "Human development", "code" : "8" }, { "name" : "", "code" : "11" } ], "mjthemecode" : "8,11", "prodline" : "PE", "prodlinetext" : "IBRD/IDA", "productlinetype" : "L", "project_abstract" : { "cdata" : "The development objective of the Second Phase of General Education Quality Improvement Project for Ethiopia is to improve learning conditions in primary and secondary schools and strengthen institutions at different levels of 
educational administration. The project has six components. The first component is curriculum, textbooks, assessment, examinations, and inspection. This component will support improvement of learning conditions in grades KG-12 by providing increased access to teaching and learning materials and through improvements to the curriculum by assessing the strengths and weaknesses of the current curriculum. This component has following four sub-components: (i) curriculum reform and implementation; (ii) teaching and learning materials; (iii) assessment and examinations; and (iv) inspection. The second component is teacher development program (TDP). This component will support improvements in learning conditions in both primary and secondary schools by advancing the quality of teaching in general education through: (a) enhancing the training of pre-service teachers in teacher education institutions; and (b) improving the quality of in-service teacher training. This component has following three sub-components: (i) pre-service teacher training; (ii) in-service teacher training; and (iii) licensing and relicensing of teachers and school leaders. The third component is school improvement plan. This component will support the strengthening of school planning in order to improve learning outcomes, and to partly fund the school improvement plans through school grants. It has following two sub-components: (i) school improvement plan; and (ii) school grants. The fourth component is management and capacity building, including education management information systems (EMIS). This component will support management and capacity building aspect of the project. This component has following three sub-components: (i) capacity building for education planning and management; (ii) capacity building for school planning and management; and (iii) EMIS. The fifth component is improving the quality of learning and teaching in secondary schools and universities through the use of information and communications technology (ICT). It has following five sub-components: (i) national policy and institution for ICT in general education; (ii) national ICT infrastructure improvement plan for general education; (iii) develop an integrated monitoring, evaluation, and learning system specifically for the ICT component; (iv) teacher professional development in the use of ICT; and (v) provision of limited number of e-Braille display readers with the possibility to scale up to all secondary education schools based on the successful implementation and usage of the readers. The sixth component is program coordination, monitoring and evaluation, and communication. It will support institutional strengthening by developing capacities in all aspects of program coordination, monitoring and evaluation; a new sub-component on communications will support information sharing for better management and accountability. It has following three sub-components: (i) program coordination; (ii) monitoring and evaluation (M and E); and (iii) communication." 
}, "project_name" : "Ethiopia General Education Quality Improvement Project II", "projectdocs" : [ { "DocTypeDesc" : "Project Information Document (PID), Vol.", "DocType" : "PID", "EntityID" : "090224b081e545fb_1_0", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=090224b081e545fb_1_0", "DocDate" : "28-AUG-2013" }, { "DocTypeDesc" : "Indigenous Peoples Plan (IP), Vol.1 of 1", "DocType" : "IP", "EntityID" : "000442464_20130920111729", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000442464_20130920111729", "DocDate" : "01-JUL-2013" }, { "DocTypeDesc" : "Project Information Document (PID), Vol.", "DocType" : "PID", "EntityID" : "090224b0817b19e2_1_0", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=090224b0817b19e2_1_0", "DocDate" : "22-NOV-2012" } ], "projectfinancialtype" : "IDA", "projectstatusdisplay" : "Active", "regionname" : "Africa", "sector" : [ { "Name" : "Primary education" }, { "Name" : "Secondary education" }, { "Name" : "Public administration- Other social services" }, { "Name" : "Tertiary education" } ], "sector1" : { "Name" : "Primary education", "Percent" : 46 }, "sector2" : { "Name" : "Secondary education", "Percent" : 26 }, "sector3" : { "Name" : "Public administration- Other social services", "Percent" : 16 }, "sector4" : { "Name" : "Tertiary education", "Percent" : 12 }, "sector_namecode" : [ { "name" : "Primary education", "code" : "EP" }, { "name" : "Secondary education", "code" : "ES" }, { "name" : "Public administration- Other social services", "code" : "BS" }, { "name" : "Tertiary education", "code" : "ET" } ], "sectorcode" : "ET,BS,ES,EP", "source" : "IBRD", "status" : "Active", "supplementprojectflg" : "N", "theme1" : { "Name" : "Education for all", "Percent" : 100 }, "theme_namecode" : [ { "name" : "Education for all", "code" : "65" } ], "themecode" : "65", "totalamt" : 130000000, "totalcommamt" : 130000000, "url" : "http://www.worldbank.org/projects/P129828/ethiopia-general-education-quality-improvement-project-ii?lang=en" }, 2 | { "_id" : { "$oid" : "52b213b38594d8a2be17c781" }, "approvalfy" : 2015, "board_approval_month" : "November", "boardapprovaldate" : "2013-11-04T00:00:00Z", "borrower" : "GOVERNMENT OF TUNISIA", "country_namecode" : "Republic of Tunisia!$!TN", "countrycode" : "TN", "countryname" : "Republic of Tunisia", "countryshortname" : "Tunisia", "docty" : "Project Information Document,Integrated Safeguards Data Sheet,Integrated Safeguards Data Sheet,Project Information Document,Integrated Safeguards Data Sheet,Project Information Document", "envassesmentcategorycode" : "C", "grantamt" : 4700000, "ibrdcommamt" : 0, "id" : "P144674", "idacommamt" : 0, "impagency" : "MINISTRY OF FINANCE", "lendinginstr" : "Specific Investment Loan", "lendinginstrtype" : "IN", "lendprojectcost" : 5700000, "majorsector_percent" : [ { "Name" : "Public Administration, Law, and Justice", "Percent" : 70 }, { "Name" : "Public Administration, Law, and Justice", "Percent" : 30 } ], "mjsector_namecode" : [ { "name" : "Public Administration, Law, and Justice", "code" : "BX" }, { "name" : "Public Administration, Law, and Justice", "code" : "BX" } ], "mjtheme" : [ "Economic management", "Social protection and risk management" ], "mjtheme_namecode" : [ { "name" : "Economic management", "code" : "1" }, { "name" : "Social protection and risk management", "code" : "6" } ], "mjthemecode" : "1,6", "prodline" : "RE", "prodlinetext" : "Recipient Executed Activities", "productlinetype" : "L", 
"project_name" : "TN: DTF Social Protection Reforms Support", "projectdocs" : [ { "DocTypeDesc" : "Project Information Document (PID), Vol.1 of 1", "DocType" : "PID", "EntityID" : "000333037_20131024115616", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000333037_20131024115616", "DocDate" : "29-MAR-2013" }, { "DocTypeDesc" : "Integrated Safeguards Data Sheet (ISDS), Vol.1 of 1", "DocType" : "ISDS", "EntityID" : "000356161_20131024151611", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000356161_20131024151611", "DocDate" : "29-MAR-2013" }, { "DocTypeDesc" : "Integrated Safeguards Data Sheet (ISDS), Vol.1 of 1", "DocType" : "ISDS", "EntityID" : "000442464_20131031112136", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000442464_20131031112136", "DocDate" : "29-MAR-2013" }, { "DocTypeDesc" : "Project Information Document (PID), Vol.1 of 1", "DocType" : "PID", "EntityID" : "000333037_20131031105716", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000333037_20131031105716", "DocDate" : "29-MAR-2013" }, { "DocTypeDesc" : "Integrated Safeguards Data Sheet (ISDS), Vol.1 of 1", "DocType" : "ISDS", "EntityID" : "000356161_20130305113209", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000356161_20130305113209", "DocDate" : "16-JAN-2013" }, { "DocTypeDesc" : "Project Information Document (PID), Vol.1 of 1", "DocType" : "PID", "EntityID" : "000356161_20130305113716", "DocURL" : "http://www-wds.worldbank.org/servlet/WDSServlet?pcont=details&eid=000356161_20130305113716", "DocDate" : "16-JAN-2013" } ], "projectfinancialtype" : "OTHER", "projectstatusdisplay" : "Active", "regionname" : "Middle East and North Africa", "sector" : [ { "Name" : "Public administration- Other social services" }, { "Name" : "General public administration sector" } ], "sector1" : { "Name" : "Public administration- Other social services", "Percent" : 70 }, "sector2" : { "Name" : "General public administration sector", "Percent" : 30 }, "sector_namecode" : [ { "name" : "Public administration- Other social services", "code" : "BS" }, { "name" : "General public administration sector", "code" : "BZ" } ], "sectorcode" : "BZ,BS", "source" : "IBRD", "status" : "Active", "supplementprojectflg" : "N", "theme1" : { "Name" : "Other economic management", "Percent" : 30 }, "theme_namecode" : [ { "name" : "Other economic management", "code" : "24" }, { "name" : "Social safety nets", "code" : "54" } ], "themecode" : "54,24", "totalamt" : 0, "totalcommamt" : 4700000, "url" : "http://www.worldbank.org/projects/P144674?lang=en" } 3 | ] 4 | -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/BazeleyMikiko_Sec5_3_sql_project_2018Nov24.sql: -------------------------------------------------------------------------------- 1 | -- Student: Mikiko Bazeley 2 | -- Date: 11/24/2018 3 | -- Sec 5.3 - SQL Mini Project 4 | 5 | 6 | 7 | 8 | /* Q1: Some of the facilities charge a fee to members, but some do not. 9 | Please list the names of the facilities that do. */ 10 | 11 | select f.name 12 | from facilities f 13 | where f.membercost > 0; 14 | 15 | /* Q2: How many facilities do not charge a fee to members? 
*/ 16 | 17 | select count(*) 18 | from facilities f 19 | where f.membercost = 0; 20 | 21 | 22 | /* Q3: How can you produce a list of facilities that charge a fee to members, 23 | where the fee is less than 20% of the facility's monthly maintenance cost? 24 | Return the facid, facility name, member cost, and monthly maintenance of the 25 | facilities in question. */ 26 | 27 | select f.facid, f.name, f.membercost, f.monthlymaintenance 28 | from facilities f 29 | where f.membercost < (0.2*f.monthlymaintenance); 30 | 31 | 32 | /* Q4: How can you retrieve the details of facilities with ID 1 and 5? 33 | Write the query without using the OR operator. */ 34 | 35 | select * 36 | from facilities f 37 | where f.facid in (1,5); 38 | 39 | /* Q5: How can you produce a list of facilities, with each labelled as 40 | 'cheap' or 'expensive', depending on if their monthly maintenance cost is 41 | more than $100? Return the name and monthly maintenance of the facilities 42 | in question. */ 43 | 44 | select f.name, 45 | f.monthlymaintenance, 46 | case when (f.monthlymaintenance>100) then 'expensive' else 'cheap' end label 47 | from facilities f; 48 | 49 | /* Q6: You'd like to get the first and last name of the last member(s) 50 | who signed up. Do not use the LIMIT clause for your solution. */ 51 | 52 | select m.firstname, m.surname 53 | from members m 54 | where m.joindate = (select max(joindate) from members); 55 | 56 | /* Q7: How can you produce a list of all members who have used a tennis court? 57 | Include in your output the name of the court, and the name of the member 58 | formatted as a single column. Ensure no duplicate data, and order by 59 | the member name. */ 60 | 61 | select f2.name, m2.firstname ||' ' || m2.surname as fullName 62 | from 63 | (select b1.facid, b1.memid 64 | from bookings b1 65 | where b1.facid in 66 | (select distinct f1.facid 67 | from facilities f1 68 | where f1.name like ('Tennis Court%') 69 | ) 70 | group by b1.facid, b1.memid) b2 71 | left join facilities f2 on f2.facid = b2.facid 72 | left join members m2 on m2.memid = b2.memid 73 | order by (m2.firstname || m2.surname) desc ; 74 | 75 | /* Q8: How can you produce a list of bookings on the day of 2012-09-14 which 76 | will cost the member (or guest) more than $30? Remember that guests have 77 | different costs to members (the listed costs are per half-hour 'slot'), and 78 | the guest user's ID is always 0. Include in your output the name of the 79 | facility, the name of the member formatted as a single column, and the cost. 80 | Order by descending cost, and do not use any subqueries. */ 81 | 82 | select f.name, 83 | (m.firstname ||' '|| m.surname) as fullname, 84 | case when b.memid = 0 then b.slots * f.guestcost else b.slots * f.membercost end as cost 85 | -- , 86 | -- b.memid, 87 | -- b.slots, 88 | -- f.guestcost, 89 | -- f.membercost 90 | from bookings b 91 | left join facilities f on b.facid=f.facid 92 | left join members m on b.memid = m.memid 93 | where date(b.starttime) = date('2012-09-14') 94 | and (cost>30) 95 | order by cost desc; 96 | 97 | /* Q9: This time, produce the same result as in Q8, but using a subquery.
*/ 98 | 99 | select t.name, t.fullname,t.cost 100 | from 101 | (select f.name, 102 | (m.firstname ||' '|| m.surname) as fullname, 103 | case when b.memid = 0 then b.slots * f.guestcost else b.slots * f.membercost end as cost, 104 | date(b.starttime) as date 105 | from bookings b 106 | left join facilities f on b.facid=f.facid 107 | left join members m on b.memid = m.memid) t 108 | where t.date = date('2012-09-14') 109 | and (t.cost>30) 110 | order by t.cost desc; 111 | 112 | 113 | /* Q10: Produce a list of facilities with a total revenue less than 1000. 114 | The output of facility name and total revenue, sorted by revenue. Remember 115 | that there's a different cost for guests and members! */ 116 | 117 | select 118 | f.name, 119 | sum(case when b.memid = 0 then b.slots * f.guestcost else b.slots * f.membercost end) as revenue 120 | from bookings b 121 | left join facilities f on b.facid=f.facid 122 | group by f.name 123 | having revenue < 1000 124 | order by revenue desc; -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/DSC Mini-Project_ SQL Rubric.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/DSC Mini-Project_ SQL Rubric.docx -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/bookings_table_constraints.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/bookings_table_constraints.PNG -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/bookings_table_data.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/bookings_table_data.PNG -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/countryClub_facilities.csv: -------------------------------------------------------------------------------- 1 | facid,name,membercost,guestcost,initialoutlay,monthlymaintenance 2 | 0,Tennis Court 1,5,25,10000,200 3 | 1,Tennis Court 2,5,25,8000,200 4 | 2,Badminton Court,0,15.5,4000,50 5 | 3,Table Tennis,0,5,320,10 6 | 4,Massage Room 1,9.9,80,4000,3000 7 | 5,Massage Room 2,9.9,80,4000,3000 8 | 6,Squash Court,3.5,17.5,5000,80 9 | 7,Snooker Table,0,5,450,15 10 | 8,Pool Table,0,5,400,15 11 | -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/countryClub_members.csv: -------------------------------------------------------------------------------- 1 | memid,surname,firstname,address,zipcode,telephone,recommendedby,joindate 2 | 0,GUEST,GUEST,GUEST,0,(000) 000-0000,,7/1/2012 0:00 3 | 1,Smith,Darren,"8 Bloomsbury Close, Boston",4321,555-555-5555,,7/2/2012 12:02 4 | 2,Smith,Tracy,"8 Bloomsbury Close, New 
York",4321,555-555-5555,,7/2/2012 12:08 5 | 3,Rownam,Tim,"23 Highway Way, Boston",23423,(844) 693-0723,,7/3/2012 9:32 6 | 4,Joplette,Janice,"20 Crossing Road, New York",234,(833) 942-4710,1,7/3/2012 10:25 7 | 5,Butters,Gerald,"1065 Huntingdon Avenue, Boston",56754,(844) 078-4130,1,7/9/2012 10:44 8 | 6,Tracy,Burton,"3 Tunisia Drive, Boston",45678,(822) 354-9973,,7/15/2012 8:52 9 | 7,Dare,Nancy,"6 Hunting Lodge Way, Boston",10383,(833) 776-4001,4,7/25/2012 8:59 10 | 8,Boothe,Tim,"3 Bloomsbury Close, Reading, 00234",234,(811) 433-2547,3,7/25/2012 16:02 11 | 9,Stibbons,Ponder,"5 Dragons Way, Winchester",87630,(833) 160-3900,6,7/25/2012 17:09 12 | 10,Owen,Charles,"52 Cheshire Grove, Winchester, 28563",28563,(855) 542-5251,1,8/3/2012 19:42 13 | 11,Jones,David,"976 Gnats Close, Reading",33862,(844) 536-8036,4,8/6/2012 16:32 14 | 12,Baker,Anne,"55 Powdery Street, Boston",80743,844-076-5141,9,8/10/2012 14:23 15 | 13,Farrell,Jemima,"103 Firth Avenue, North Reading",57392,(855) 016-0163,,8/10/2012 14:28 16 | 14,Smith,Jack,"252 Binkington Way, Boston",69302,(822) 163-3254,1,8/10/2012 16:22 17 | 15,Bader,Florence,"264 Ursula Drive, Westford",84923,(833) 499-3527,9,8/10/2012 17:52 18 | 16,Baker,Timothy,"329 James Street, Reading",58393,833-941-0824,13,8/15/2012 10:34 19 | 17,Pinker,David,"5 Impreza Road, Boston",65332,811 409-6734,13,8/16/2012 11:32 20 | 20,Genting,Matthew,"4 Nunnington Place, Wingfield, Boston",52365,(811) 972-1377,5,8/19/2012 14:55 21 | 21,Mackenzie,Anna,"64 Perkington Lane, Reading",64577,(822) 661-2898,1,8/26/2012 9:32 22 | 22,Coplin,Joan,"85 Bard Street, Bloomington, Boston",43533,(822) 499-2232,16,8/29/2012 8:32 23 | 24,Sarwin,Ramnaresh,"12 Bullington Lane, Boston",65464,(822) 413-1470,15,9/1/2012 8:44 24 | 26,Jones,Douglas,"976 Gnats Close, Reading",11986,844 536-8036,11,9/2/2012 18:43 25 | 27,Rumney,Henrietta,"3 Burkington Plaza, Boston",78533,(822) 989-8876,20,9/5/2012 8:42 26 | 28,Farrell,David,"437 Granite Farm Road, Westford",43532,(855) 755-9876,,9/15/2012 8:22 27 | 29,Worthington-Smyth,Henry,"55 Jagbi Way, North Reading",97676,(855) 894-3758,2,9/17/2012 12:27 28 | 30,Purview,Millicent,"641 Drudgery Close, Burnington, Boston",34232,(855) 941-9786,2,9/18/2012 19:04 29 | 33,Tupperware,Hyacinth,"33 Cheerful Plaza, Drake Road, Westford",68666,(822) 665-5327,,9/18/2012 19:32 30 | 35,Hunt,John,"5 Bullington Lane, Boston",54333,(899) 720-6978,30,9/19/2012 11:32 31 | 36,Crumpet,Erica,"Crimson Road, North Reading",75655,(811) 732-4816,2,9/22/2012 8:36 32 | 37,Smith,Darren,"3 Funktown, Denzington, Boston",66796,(822) 577-3541,,9/26/2012 18:08 33 | -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/country_club_db.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/country_club_db.PNG -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/countryclub.db: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/countryclub.db 
-------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/facilities_table_constraints.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/facilities_table_constraints.PNG -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/facilities_table_data.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/facilities_table_data.PNG -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/members_table_constraints.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/members_table_constraints.PNG -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/members_table_data.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Database Info/members_table_data.PNG -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Project Info/Answers: -------------------------------------------------------------------------------- 1 | /* Welcome to the SQL mini project. For this project, you will use 2 | Springboard' online SQL platform, which you can log into through the 3 | following link: 4 | 5 | https://sql.springboard.com/ 6 | Username: student 7 | Password: learn_sql@springboard 8 | 9 | The data you need is in the "country_club" database. This database 10 | contains 3 tables: 11 | i) the "Bookings" table, 12 | ii) the "Facilities" table, and 13 | iii) the "Members" table. 14 | 15 | Note that, if you need to, you can also download these tables locally. 16 | 17 | In the mini project, you'll be asked a series of questions. You can 18 | solve them using the platform, but for the final deliverable, 19 | paste the code for each solution into this script, and upload it 20 | to your GitHub. 21 | 22 | Before starting with the questions, feel free to take your time, 23 | exploring the data, and getting acquainted with the 3 tables. */ 24 | 25 | 26 | 27 | /* Q1: Some of the facilities charge a fee to members, but some do not. 28 | Please list the names of the facilities that do. */ 29 | 30 | select f.name 31 | from facilities f 32 | where f.membercost > 0; 33 | 34 | 35 | /* Q2: How many facilities do not charge a fee to members? 
*/ 36 | 37 | select count(*) 38 | from facilities f 39 | where f.membercost = 0; 40 | 41 | 42 | /* Q3: How can you produce a list of facilities that charge a fee to members, 43 | where the fee is less than 20% of the facility's monthly maintenance cost? 44 | Return the facid, facility name, member cost, and monthly maintenance of the 45 | facilities in question. */ 46 | 47 | select f.facid, f.name, f.membercost, f.monthlymaintenance 48 | from facilities f 49 | where f.membercost < (0.2*f.monthlymaintenance); 50 | 51 |
52 | /* Q4: How can you retrieve the details of facilities with ID 1 and 5? 53 | Write the query without using the OR operator. */ 54 | 55 | select * 56 | from facilities f 57 | where f.facid in (1,5); 58 | 59 | /* Q5: How can you produce a list of facilities, with each labelled as 60 | 'cheap' or 'expensive', depending on if their monthly maintenance cost is 61 | more than $100? Return the name and monthly maintenance of the facilities 62 | in question. */ 63 | 64 | select f.name, 65 | f.monthlymaintenance, 66 | case when (f.monthlymaintenance>100) then 'expensive' else 'cheap' end label 67 | from facilities f; 68 | 69 | /* Q6: You'd like to get the first and last name of the last member(s) 70 | who signed up. Do not use the LIMIT clause for your solution. */ 71 | 72 | select m.firstname, m.surname 73 | from members m 74 | where m.joindate = (select max(joindate) from members); 75 |
76 | /* Q7: How can you produce a list of all members who have used a tennis court? 77 | Include in your output the name of the court, and the name of the member 78 | formatted as a single column. Ensure no duplicate data, and order by 79 | the member name. */ 80 | 81 | select f2.name, m2.firstname ||' ' || m2.surname as fullName 82 | from 83 | (select b1.facid, b1.memid 84 | from bookings b1 85 | where b1.facid in 86 | (select distinct f1.facid 87 | from facilities f1 88 | where f1.name like ('Tennis Court%') 89 | ) 90 | group by b1.facid, b1.memid) b2 91 | left join facilities f2 on f2.facid = b2.facid 92 | left join members m2 on m2.memid = b2.memid 93 | order by (m2.firstname || m2.surname) desc; 94 |
95 | /* Q8: How can you produce a list of bookings on the day of 2012-09-14 which 96 | will cost the member (or guest) more than $30? Remember that guests have 97 | different costs to members (the listed costs are per half-hour 'slot'), and 98 | the guest user's ID is always 0. Include in your output the name of the 99 | facility, the name of the member formatted as a single column, and the cost. 100 | Order by descending cost, and do not use any subqueries. */ 101 | 102 | select f.name, 103 | (m.firstname ||' '|| m.surname) as fullname, 104 | case when b.memid = 0 then b.slots * f.guestcost else b.slots * f.membercost end as cost 105 | -- , 106 | -- b.memid, 107 | -- b.slots, 108 | -- f.guestcost, 109 | -- f.membercost 110 | from bookings b 111 | left join facilities f on b.facid=f.facid 112 | left join members m on b.memid = m.memid 113 | where date(b.starttime) = date('2012-09-14') 114 | and (cost>30) 115 | order by cost desc; 116 |
117 | /* Q9: This time, produce the same result as in Q8, but using a subquery. */ 118 | 119 | select t.name, t.fullname,t.cost 120 | from 121 | (select f.name, 122 | (m.firstname ||' '|| m.surname) as fullname, 123 | case when b.memid = 0 then b.slots * f.guestcost else b.slots * f.membercost end as cost, 124 | date(b.starttime) as date 125 | from bookings b 126 | left join facilities f on b.facid=f.facid 127 | left join members m on b.memid = m.memid) t 128 | where t.date = date('2012-09-14') 129 | and (t.cost>30) 130 | order by t.cost desc; 131 | 132 |
133 | /* Q10: Produce a list of facilities with a total revenue less than 1000. 134 | The output of facility name and total revenue, sorted by revenue. Remember 135 | that there's a different cost for guests and members! */ 136 | 137 | select 138 | f.name, 139 | sum(case when b.memid = 0 then b.slots * f.guestcost else b.slots * f.membercost end) as revenue 140 | from bookings b 141 | left join facilities f on b.facid=f.facid 142 | group by f.name 143 | having revenue < 1000 144 | order by revenue desc; 145 | -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Project Info/Intro: -------------------------------------------------------------------------------- 1 | Project: SQL Practice 2 | 3 - 5 Hours 3 | 4 | Steps: 5 | 6 | Download the SQL file and follow the instructions to log into the provided SQL platform. 7 | Fill in your answers to the questions in the SQL file. 8 | Add your SQL file to a GitHub repository and submit a link to it. 9 | 10 | Your project will be evaluated using this rubric (https://docs.google.com/document/d/1xR6CGuZLrzAAA2z_s_vhuOwYykq0cg2uDhsn2H3czo4/edit). 11 | 12 | Download project file(s).
13 | 14 | Learning Objective 15 | Work with SQL-based databases 16 | Learn and write basic SQL queries up to basic aggregations and joins 17 | Comment on SQL code 18 | 19 | Criteria 20 | Meets Expectations 21 | Completion 22 | The code runs successfully. 23 | 24 | 25 | Process and understanding 26 | The submission shows that the correct solutions to all of the 10 problems have been produced. 27 | The submission demonstrates an understanding of the various types of joins, aggregations, filters, and subqueries. 28 | 29 | 30 | Presentation 31 | The project is delivered as a .sql file (as stated in the instructions), and uploaded to GitHub. 32 | 33 | 34 | Excellence: Student downloads the raw data files, sets up a local database, loads in the data, and uses a Jupyter notebook to set up a connection and query the data. 35 | -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Project Info/Project Requirements & Questions: -------------------------------------------------------------------------------- 1 | /* Welcome to the SQL mini project. For this project, you will use 2 | Springboard' online SQL platform, which you can log into through the 3 | following link: 4 | 5 | 6 | The data you need is in the "country_club" database. This database 7 | contains 3 tables: 8 | i) the "Bookings" table, 9 | ii) the "Facilities" table, and 10 | iii) the "Members" table. 11 | 12 | Note that, if you need to, you can also download these tables locally. 13 | 14 | In the mini project, you'll be asked a series of questions. You can 15 | solve them using the platform, but for the final deliverable, 16 | paste the code for each solution into this script, and upload it 17 | to your GitHub. 18 | 19 | Before starting with the questions, feel free to take your time, 20 | exploring the data, and getting acquainted with the 3 tables. */ 21 | 22 | 23 | 24 | /* Q1: Some of the facilities charge a fee to members, but some do not. 25 | Please list the names of the facilities that do. */ 26 | 27 | 28 | /* Q2: How many facilities do not charge a fee to members? */ 29 | 30 | 31 | /* Q3: How can you produce a list of facilities that charge a fee to members, 32 | where the fee is less than 20% of the facility's monthly maintenance cost? 33 | Return the facid, facility name, member cost, and monthly maintenance of the 34 | facilities in question. */ 35 | 36 | 37 | /* Q4: How can you retrieve the details of facilities with ID 1 and 5? 38 | Write the query without using the OR operator. */ 39 | 40 | 41 | /* Q5: How can you produce a list of facilities, with each labelled as 42 | 'cheap' or 'expensive', depending on if their monthly maintenance cost is 43 | more than $100? Return the name and monthly maintenance of the facilities 44 | in question. */ 45 | 46 | 47 | /* Q6: You'd like to get the first and last name of the last member(s) 48 | who signed up. Do not use the LIMIT clause for your solution. */ 49 | 50 | 51 | /* Q7: How can you produce a list of all members who have used a tennis court? 52 | Include in your output the name of the court, and the name of the member 53 | formatted as a single column. Ensure no duplicate data, and order by 54 | the member name. */ 55 | 56 | 57 | /* Q8: How can you produce a list of bookings on the day of 2012-09-14 which 58 | will cost the member (or guest) more than $30? Remember that guests have 59 | different costs to members (the listed costs are per half-hour 'slot'), and 60 | the guest user's ID is always 0. 
Include in your output the name of the 61 | facility, the name of the member formatted as a single column, and the cost. 62 | Order by descending cost, and do not use any subqueries. */ 63 | 64 | 65 | /* Q9: This time, produce the same result as in Q8, but using a subquery. */ 66 | 67 | 68 | /* Q10: Produce a list of facilities with a total revenue less than 1000. 69 | The output of facility name and total revenue, sorted by revenue. Remember 70 | that there's a different cost for guests and members! */ 71 | -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Project Info/Resources: -------------------------------------------------------------------------------- 1 | http://www.sqlitetutorial.net/download-install-sqlite/ 2 | 3 | http://www.sqlitetutorial.net/sqlite-commands/ 4 | 5 | http://www.sqlitetutorial.net/sqlite-import-csv/ 6 | 7 | 8 | http://www.sqlitetutorial.net/sqlite-python/creating-database/ 9 | 10 | 11 | http://www.sqlitetutorial.net/sqlite-import-csv/ 12 | -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.3 SQL Practice/Project Info/dummy: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 5 - Data Wrangling/5.4 API/dummy.txt: -------------------------------------------------------------------------------- 1 | . 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/EDA_HumanTemp/dummy.tx: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/EDA_HumanTemp/human_body_temperature.csv: -------------------------------------------------------------------------------- 1 | temperature,gender,heart_rate 2 | 99.3,F,68.0 3 | 98.4,F,81.0 4 | 97.8,M,73.0 5 | 99.2,F,66.0 6 | 98.0,F,73.0 7 | 99.2,M,83.0 8 | 98.0,M,71.0 9 | 98.8,M,78.0 10 | 98.4,F,84.0 11 | 98.6,F,86.0 12 | 98.8,F,89.0 13 | 96.7,F,62.0 14 | 98.2,M,72.0 15 | 98.7,F,79.0 16 | 97.8,F,77.0 17 | 98.8,F,83.0 18 | 98.3,F,79.0 19 | 98.2,M,64.0 20 | 97.2,F,68.0 21 | 99.4,M,70.0 22 | 98.3,F,78.0 23 | 98.2,M,71.0 24 | 98.6,M,70.0 25 | 98.4,M,68.0 26 | 97.8,M,65.0 27 | 98.0,F,87.0 28 | 97.8,F,62.0 29 | 98.2,F,69.0 30 | 98.4,F,73.0 31 | 98.1,M,67.0 32 | 98.3,M,86.0 33 | 97.6,F,61.0 34 | 98.5,M,71.0 35 | 98.6,M,82.0 36 | 99.3,M,63.0 37 | 99.5,M,75.0 38 | 99.1,M,71.0 39 | 98.3,M,72.0 40 | 97.9,F,79.0 41 | 96.4,F,69.0 42 | 98.4,F,79.0 43 | 98.4,M,82.0 44 | 96.9,M,74.0 45 | 97.2,M,64.0 46 | 99.0,F,79.0 47 | 97.9,F,69.0 48 | 97.4,M,72.0 49 | 97.4,M,68.0 50 | 97.9,M,76.0 51 | 97.1,M,82.0 52 | 98.9,F,76.0 53 | 98.3,F,80.0 54 | 98.5,F,83.0 55 | 98.6,M,78.0 56 | 98.2,F,73.0 57 | 98.6,F,82.0 58 | 98.8,F,70.0 59 | 98.2,M,66.0 60 | 98.2,F,65.0 61 | 97.6,M,73.0 62 | 99.1,F,80.0 63 | 98.4,M,84.0 64 | 98.2,F,57.0 65 | 98.6,M,83.0 66 | 98.7,F,65.0 67 | 97.4,M,70.0 68 | 97.4,F,57.0 69 | 98.6,M,77.0 70 | 98.7,F,82.0 71 | 98.9,M,80.0 72 | 98.1,F,81.0 73 | 97.7,F,61.0 74 | 98.0,M,78.0 75 | 98.8,M,81.0 76 | 99.0,M,75.0 77 | 98.8,M,78.0 78 | 98.0,F,76.0 79 | 98.4,M,70.0 80 | 97.4,M,78.0 81 | 97.6,M,74.0 82 | 98.8,F,73.0 83 | 98.0,M,67.0 84 | 97.5,M,70.0 85 | 99.2,F,77.0 86 | 
98.6,F,85.0 87 | 97.1,M,75.0 88 | 98.6,F,77.0 89 | 98.0,M,78.0 90 | 98.7,M,73.0 91 | 98.1,M,73.0 92 | 97.8,M,74.0 93 | 100.0,F,78.0 94 | 98.8,F,84.0 95 | 97.1,M,73.0 96 | 97.8,M,58.0 97 | 96.8,F,75.0 98 | 99.9,F,79.0 99 | 98.7,F,64.0 100 | 98.8,F,64.0 101 | 98.0,M,74.0 102 | 99.0,M,81.0 103 | 98.5,M,68.0 104 | 98.0,F,78.0 105 | 99.4,F,77.0 106 | 97.6,M,69.0 107 | 96.7,M,71.0 108 | 97.0,M,80.0 109 | 98.6,M,66.0 110 | 98.7,F,72.0 111 | 97.3,M,69.0 112 | 98.8,F,69.0 113 | 98.0,F,89.0 114 | 98.2,F,64.0 115 | 99.1,F,74.0 116 | 99.0,M,79.0 117 | 98.0,M,64.0 118 | 100.8,F,77.0 119 | 97.8,F,71.0 120 | 98.7,M,78.0 121 | 98.4,F,74.0 122 | 97.7,F,84.0 123 | 97.9,F,68.0 124 | 99.0,F,81.0 125 | 97.2,F,66.0 126 | 97.5,M,75.0 127 | 96.3,M,70.0 128 | 97.7,M,77.0 129 | 98.2,F,73.0 130 | 97.9,M,72.0 131 | 98.7,F,59.0 132 | -------------------------------------------------------------------------------- /mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/Racial Discrimination/BazeleyMikiko_MiniProject_RacialDiscrimination-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Examining Racial Discrimination in the US Job Market\n", 8 | "\n", 9 | "### Background\n", 10 | "Racial discrimination continues to be pervasive in cultures throughout the world. Researchers examined the level of racial discrimination in the United States labor market by randomly assigning identical résumés to black-sounding or white-sounding names and observing the impact on requests for interviews from employers.\n", 11 | "\n", 12 | "### Data\n", 13 | "In the dataset provided, each row represents a resume. The 'race' column has two values, 'b' and 'w', indicating black-sounding and white-sounding. The column 'call' has two values, 1 and 0, indicating whether the resume received a call from employers or not.\n", 14 | "\n", 15 | "Note that the 'b' and 'w' values in race are assigned randomly to the resumes when presented to the employer." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "### Exercises\n", 23 | "You will perform a statistical analysis to establish whether race has a significant impact on the rate of callbacks for resumes.\n", 24 | "\n", 25 | "Answer the following questions **in this notebook below and submit to your Github account**. \n", 26 | "\n", 27 | " 1. What test is appropriate for this problem? Does CLT apply?\n", 28 | " 2. What are the null and alternate hypotheses?\n", 29 | " 3. Compute margin of error, confidence interval, and p-value. Try using both the bootstrapping and the frequentist statistical approaches.\n", 30 | " 4. Write a story describing the statistical significance in the context or the original problem.\n", 31 | " 5. Does your analysis mean that race/name is the most important factor in callback success? Why or why not? 
If not, how would you amend your analysis?\n", 32 | "\n", 33 | "You can include written notes in notebook cells using Markdown: \n", 34 | " - In the control panel at the top, choose Cell > Cell Type > Markdown\n", 35 | " - Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 36 | "\n", 37 | "#### Resources\n", 38 | "+ Experiment information and data source: http://www.povertyactionlab.org/evaluation/discrimination-job-market-united-states\n", 39 | "+ Scipy statistical methods: http://docs.scipy.org/doc/scipy/reference/stats.html \n", 40 | "+ Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 41 | "+ Formulas for the Bernoulli distribution: https://en.wikipedia.org/wiki/Bernoulli_distribution" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 1, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "import pandas as pd\n", 51 | "import numpy as np\n", 52 | "from scipy import stats" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "data = pd.io.stata.read_stata('data/us_job_market_discrimination.dta')" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 3, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "235.0\n", 74 | "157.0\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "# number of callbacks for black-sounding names\n", 80 | "print(sum(data[data.race=='w'].call))\n", 81 | "\n", 82 | "print(sum(data[data.race=='b'].call))" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 4, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/html": [ 93 | "
\n", 94 | "\n", 107 | "\n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | "
idadeducationofjobsyearsexphonorsvolunteermilitaryempholesoccupspecific...compreqorgreqmanuftranscombankrealtradebusserviceothservicemissindownership
0b1426000117...1.00.01.00.00.00.00.00.00.0
1b13360110316...1.00.01.00.00.00.00.00.00.0
2b1416000019...1.00.01.00.00.00.00.00.00.0
3b13460101313...1.00.01.00.00.00.00.00.00.0
4b133220000313...1.01.00.00.00.00.00.01.00.0Nonprofit
\n", 257 | "

5 rows × 65 columns

\n", 258 | "
" 259 | ], 260 | "text/plain": [ 261 | " id ad education ofjobs yearsexp honors volunteer military empholes \\\n", 262 | "0 b 1 4 2 6 0 0 0 1 \n", 263 | "1 b 1 3 3 6 0 1 1 0 \n", 264 | "2 b 1 4 1 6 0 0 0 0 \n", 265 | "3 b 1 3 4 6 0 1 0 1 \n", 266 | "4 b 1 3 3 22 0 0 0 0 \n", 267 | "\n", 268 | " occupspecific ... compreq orgreq manuf transcom bankreal trade \\\n", 269 | "0 17 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n", 270 | "1 316 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n", 271 | "2 19 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n", 272 | "3 313 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n", 273 | "4 313 ... 1.0 1.0 0.0 0.0 0.0 0.0 \n", 274 | "\n", 275 | " busservice othservice missind ownership \n", 276 | "0 0.0 0.0 0.0 \n", 277 | "1 0.0 0.0 0.0 \n", 278 | "2 0.0 0.0 0.0 \n", 279 | "3 0.0 0.0 0.0 \n", 280 | "4 0.0 1.0 0.0 Nonprofit \n", 281 | "\n", 282 | "[5 rows x 65 columns]" 283 | ] 284 | }, 285 | "execution_count": 4, 286 | "metadata": {}, 287 | "output_type": "execute_result" 288 | } 289 | ], 290 | "source": [ 291 | "data.head()" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": { 297 | "collapsed": true 298 | }, 299 | "source": [ 300 | "## 1. What test is appropriate for this problem? Does CLT apply?\n", 301 | "\n", 302 | "The problem we are trying to solve is whether there is a statistically significant difference between callbacks for white sounding names and black sounding names. \n", 303 | "\n", 304 | "The CLT states that given a sufficiently large sample size from a population with a finite level of variance, the mean of all samples from the same population will be approximatelt equal to the mean of the population. Specifically, as the sample sizes get larger, the distributio of means caculated from repeated sampling will approach normality. \n", 305 | "\n", 306 | "Another way to interpret the Cnetral Limit Theorem is that if we repeatedly take independent random samples of size n from any population, then when n is large, the distribution of the sample means will approach a normal distribution. \n", 307 | "\n", 308 | "We can see that we have more than 300+ samples that were randomly assigned b/w names, so we can assume that the based on sample size and independence of draw that CLT would apply. \n", 309 | "\n", 310 | "Given CLT applies, we should also be able to use a 2 sampel t-test. Given we have a large sample size but don't know the population standard deviation, this seems to be the most appropriate test.\n", 311 | "\n", 312 | "\n", 313 | "## 2. What are the null and alternate hypotheses?\n", 314 | "\n", 315 | "\n", 316 | "\n", 317 | "Ho: (Proportion of W called back) == (Proportion of B called back)\n", 318 | "\n", 319 | "\n", 320 | "Ha: (Proportion of W called back) =/= (Proportion of B called back)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 5, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "w = data[data.race=='w']\n", 330 | "b = data[data.race=='b']" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 6, 336 | "metadata": {}, 337 | "outputs": [ 338 | { 339 | "name": "stdout", 340 | "output_type": "stream", 341 | "text": [ 342 | "Z score: 4.108412152434346\n", 343 | "P-value: 3.983886837585077e-05\n" 344 | ] 345 | } 346 | ], 347 | "source": [ 348 | "# Your solution to Q3 here\n", 349 | "# Compute margin of error, confidence interval, and p-value. 
\n", 350 | "# Try using both the bootstrapping and the frequentist statistical approaches.\n", 351 | "\n", 352 | "\n", 353 | "#Frequentist Approach\n", 354 | "\n", 355 | "n_w = len(w)\n", 356 | "n_b = len(b)\n", 357 | "\n", 358 | "n_w_c = np.sum(w.call)\n", 359 | "n_b_c = np.sum(b.call)\n", 360 | "\n", 361 | "prop_w = n_w_c / n_w\n", 362 | "prop_b = n_b_c / n_b\n", 363 | "\n", 364 | "prop_diff = prop_w - prop_b\n", 365 | "phat = (n_w_c + n_b_c) / (n_w + n_b)\n", 366 | "\n", 367 | "z = prop_diff / np.sqrt(phat * (1 - phat) * ((1 / n_w) + (1 / n_b)))\n", 368 | "pval = stats.norm.cdf(-z) * 2\n", 369 | "\n", 370 | "\n", 371 | "print(\"Z score: {}\".format(z))\n", 372 | "print(\"P-value: {}\".format(pval))" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "metadata": {}, 378 | "source": [ 379 | "The p-value is fairly low using the frquentist approach, so we may want to validate using a hacker stats approach." 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 7, 385 | "metadata": {}, 386 | "outputs": [ 387 | { 388 | "name": "stdout", 389 | "output_type": "stream", 390 | "text": [ 391 | "p = 0.0\n" 392 | ] 393 | } 394 | ], 395 | "source": [ 396 | "# Hacker stats approach\n", 397 | "\n", 398 | "\n", 399 | "permutation_replicates = np.empty(100000)\n", 400 | "\n", 401 | "white = data[data.race == 'w'].call.values\n", 402 | "black = data[data.race == 'b'].call.values\n", 403 | "diff_of_means = np.mean(white) - np.mean(black)\n", 404 | "\n", 405 | "for i in range(len(permutation_replicates)):\n", 406 | " permutation_samples = np.random.permutation(np.concatenate((white, black)))\n", 407 | " \n", 408 | " white_perm = permutation_samples[:len(white)]\n", 409 | " black_perm = permutation_samples[len(white):]\n", 410 | " \n", 411 | " permutation_replicates[i] = np.abs(np.mean(white_perm) - np.mean(black_perm))\n", 412 | "\n", 413 | "p = np.sum(permutation_replicates > diff_of_means) / len(permutation_replicates)\n", 414 | "\n", 415 | "\n", 416 | "print('p =', p)" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "P-value here is alos fairly low." 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 8, 429 | "metadata": {}, 430 | "outputs": [ 431 | { 432 | "name": "stdout", 433 | "output_type": "stream", 434 | "text": [ 435 | "Margin of Error: 0.015281912310894095\n", 436 | "Confidence interval: [0.01675094 0.04731477]\n" 437 | ] 438 | } 439 | ], 440 | "source": [ 441 | "moe = 1.96 * np.sqrt(phat * (1 - phat) * ((1 / n_w) + (1 / n_b)))\n", 442 | "\n", 443 | "ci = prop_diff + np.array([-1, 1]) * moe\n", 444 | "\n", 445 | "\n", 446 | "print(\"Margin of Error: {}\".format(moe))\n", 447 | "print(\"Confidence interval: {}\".format(ci))" 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": { 453 | "collapsed": true 454 | }, 455 | "source": [ 456 | "## 4. Write a story describing the statistical significance in the context or the original problem.\n", 457 | "\n", 458 | "\n", 459 | "\n", 460 | "P value is low still so we reject the null hypothesis that black and white sounding names have the same callback rates.\n" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | "## 5. Does your analysis mean that race/name is the most important factor in callback success? Why or why not? If not, how would you amend your analysis?\n", 468 | "\n", 469 | "We'd need to understand the impact of the other variables present. 
Even though names were randomly distributed, there could be other variables that might suffer from systemic bias or confound the results. \n", 470 | "\n", 471 | "Example: Gender, Education, Geography, etc. Another interesting question: among the resumes assigned 'b' names that did get callbacks, were there significant differences from their counterparts that did not get callbacks? " 472 | ] 473 | } 474 | ], 475 | "metadata": { 476 | "kernelspec": { 477 | "display_name": "Python 3", 478 | "language": "python", 479 | "name": "python3" 480 | }, 481 | "language_info": { 482 | "codemirror_mode": { 483 | "name": "ipython", 484 | "version": 3 485 | }, 486 | "file_extension": ".py", 487 | "mimetype": "text/x-python", 488 | "name": "python", 489 | "nbconvert_exporter": "python", 490 | "pygments_lexer": "ipython3", 491 | "version": "3.7.0" 492 | } 493 | }, 494 | "nbformat": 4, 495 | "nbformat_minor": 1 496 | } 497 | -------------------------------------------------------------------------------- /mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/Racial Discrimination/BazeleyMikiko_MiniProject_RacialDiscrimination.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Examining Racial Discrimination in the US Job Market\n", 8 | "\n", 9 | "### Background\n", 10 | "Racial discrimination continues to be pervasive in cultures throughout the world. Researchers examined the level of racial discrimination in the United States labor market by randomly assigning identical résumés to black-sounding or white-sounding names and observing the impact on requests for interviews from employers.\n", 11 | "\n", 12 | "### Data\n", 13 | "In the dataset provided, each row represents a resume. The 'race' column has two values, 'b' and 'w', indicating black-sounding and white-sounding. The column 'call' has two values, 1 and 0, indicating whether the resume received a call from employers or not.\n", 14 | "\n", 15 | "Note that the 'b' and 'w' values in race are assigned randomly to the resumes when presented to the employer." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "### Exercises\n", 23 | "You will perform a statistical analysis to establish whether race has a significant impact on the rate of callbacks for resumes.\n", 24 | "\n", 25 | "Answer the following questions **in this notebook below and submit to your Github account**. \n", 26 | "\n", 27 | " 1. What test is appropriate for this problem? Does CLT apply?\n", 28 | " 2. What are the null and alternate hypotheses?\n", 29 | " 3. Compute margin of error, confidence interval, and p-value. Try using both the bootstrapping and the frequentist statistical approaches.\n", 30 | " 4. Write a story describing the statistical significance in the context or the original problem.\n", 31 | " 5. Does your analysis mean that race/name is the most important factor in callback success? Why or why not?
If not, how would you amend your analysis?\n", 32 | "\n", 33 | "You can include written notes in notebook cells using Markdown: \n", 34 | " - In the control panel at the top, choose Cell > Cell Type > Markdown\n", 35 | " - Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 36 | "\n", 37 | "#### Resources\n", 38 | "+ Experiment information and data source: http://www.povertyactionlab.org/evaluation/discrimination-job-market-united-states\n", 39 | "+ Scipy statistical methods: http://docs.scipy.org/doc/scipy/reference/stats.html \n", 40 | "+ Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 41 | "+ Formulas for the Bernoulli distribution: https://en.wikipedia.org/wiki/Bernoulli_distribution" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 1, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "import pandas as pd\n", 51 | "import numpy as np\n", 52 | "from scipy import stats" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "data = pd.io.stata.read_stata('data/us_job_market_discrimination.dta')" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 3, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "235.0\n", 74 | "157.0\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "# number of callbacks for black-sounding names\n", 80 | "print(sum(data[data.race=='w'].call))\n", 81 | "\n", 82 | "print(sum(data[data.race=='b'].call))" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 4, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/html": [ 93 | "
\n", 94 | "\n", 107 | "\n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | "
idadeducationofjobsyearsexphonorsvolunteermilitaryempholesoccupspecific...compreqorgreqmanuftranscombankrealtradebusserviceothservicemissindownership
0b1426000117...1.00.01.00.00.00.00.00.00.0
1b13360110316...1.00.01.00.00.00.00.00.00.0
2b1416000019...1.00.01.00.00.00.00.00.00.0
3b13460101313...1.00.01.00.00.00.00.00.00.0
4b133220000313...1.01.00.00.00.00.00.01.00.0Nonprofit
\n", 257 | "

5 rows × 65 columns

\n", 258 | "
" 259 | ], 260 | "text/plain": [ 261 | " id ad education ofjobs yearsexp honors volunteer military empholes \\\n", 262 | "0 b 1 4 2 6 0 0 0 1 \n", 263 | "1 b 1 3 3 6 0 1 1 0 \n", 264 | "2 b 1 4 1 6 0 0 0 0 \n", 265 | "3 b 1 3 4 6 0 1 0 1 \n", 266 | "4 b 1 3 3 22 0 0 0 0 \n", 267 | "\n", 268 | " occupspecific ... compreq orgreq manuf transcom bankreal trade \\\n", 269 | "0 17 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n", 270 | "1 316 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n", 271 | "2 19 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n", 272 | "3 313 ... 1.0 0.0 1.0 0.0 0.0 0.0 \n", 273 | "4 313 ... 1.0 1.0 0.0 0.0 0.0 0.0 \n", 274 | "\n", 275 | " busservice othservice missind ownership \n", 276 | "0 0.0 0.0 0.0 \n", 277 | "1 0.0 0.0 0.0 \n", 278 | "2 0.0 0.0 0.0 \n", 279 | "3 0.0 0.0 0.0 \n", 280 | "4 0.0 1.0 0.0 Nonprofit \n", 281 | "\n", 282 | "[5 rows x 65 columns]" 283 | ] 284 | }, 285 | "execution_count": 4, 286 | "metadata": {}, 287 | "output_type": "execute_result" 288 | } 289 | ], 290 | "source": [ 291 | "data.head()" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": { 297 | "collapsed": true 298 | }, 299 | "source": [ 300 | "## 1. What test is appropriate for this problem? Does CLT apply?\n", 301 | "\n", 302 | "The problem we are trying to solve is whether there is a statistically significant difference between callbacks for white sounding names and black sounding names. \n", 303 | "\n", 304 | "The CLT states that given a sufficiently large sample size from a population with a finite level of variance, the mean of all samples from the same population will be approximatelt equal to the mean of the population. Specifically, as the sample sizes get larger, the distributio of means caculated from repeated sampling will approach normality. \n", 305 | "\n", 306 | "Another way to interpret the Cnetral Limit Theorem is that if we repeatedly take independent random samples of size n from any population, then when n is large, the distribution of the sample means will approach a normal distribution. \n", 307 | "\n", 308 | "We can see that we have more than 300+ samples that were randomly assigned b/w names, so we can assume that the based on sample size and independence of draw that CLT would apply. \n", 309 | "\n", 310 | "Given CLT applies, we should also be able to use a 2 sampel t-test. Given we have a large sample size but don't know the population standard deviation, this seems to be the most appropriate test.\n", 311 | "\n", 312 | "\n", 313 | "## 2. What are the null and alternate hypotheses?\n", 314 | "\n", 315 | "\n", 316 | "\n", 317 | "Ho: (Proportion of W called back) == (Proportion of B called back)\n", 318 | "\n", 319 | "\n", 320 | "Ha: (Proportion of W called back) =/= (Proportion of B called back)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": 5, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "w = data[data.race=='w']\n", 330 | "b = data[data.race=='b']" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 6, 336 | "metadata": {}, 337 | "outputs": [ 338 | { 339 | "name": "stdout", 340 | "output_type": "stream", 341 | "text": [ 342 | "Z score: 4.108412152434346\n", 343 | "P-value: 3.983886837585077e-05\n" 344 | ] 345 | } 346 | ], 347 | "source": [ 348 | "# Your solution to Q3 here\n", 349 | "# Compute margin of error, confidence interval, and p-value. 
\n", 350 | "# Try using both the bootstrapping and the frequentist statistical approaches.\n", 351 | "\n", 352 | "\n", 353 | "#Frequentist Approach\n", 354 | "\n", 355 | "n_w = len(w)\n", 356 | "n_b = len(b)\n", 357 | "\n", 358 | "n_w_c = np.sum(w.call)\n", 359 | "n_b_c = np.sum(b.call)\n", 360 | "\n", 361 | "prop_w = n_w_c / n_w\n", 362 | "prop_b = n_b_c / n_b\n", 363 | "\n", 364 | "prop_diff = prop_w - prop_b\n", 365 | "phat = (n_w_c + n_b_c) / (n_w + n_b)\n", 366 | "\n", 367 | "z = prop_diff / np.sqrt(phat * (1 - phat) * ((1 / n_w) + (1 / n_b)))\n", 368 | "pval = stats.norm.cdf(-z) * 2\n", 369 | "\n", 370 | "\n", 371 | "print(\"Z score: {}\".format(z))\n", 372 | "print(\"P-value: {}\".format(pval))" 373 | ] 374 | }, 375 | { 376 | "cell_type": "markdown", 377 | "metadata": {}, 378 | "source": [ 379 | "The p-value is fairly low using the frquentist approach, so we may want to validate using a hacker stats approach." 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 7, 385 | "metadata": {}, 386 | "outputs": [ 387 | { 388 | "name": "stdout", 389 | "output_type": "stream", 390 | "text": [ 391 | "p = 0.0\n" 392 | ] 393 | } 394 | ], 395 | "source": [ 396 | "# Hacker stats approach\n", 397 | "\n", 398 | "\n", 399 | "permutation_replicates = np.empty(100000)\n", 400 | "\n", 401 | "white = data[data.race == 'w'].call.values\n", 402 | "black = data[data.race == 'b'].call.values\n", 403 | "diff_of_means = np.mean(white) - np.mean(black)\n", 404 | "\n", 405 | "for i in range(len(permutation_replicates)):\n", 406 | " permutation_samples = np.random.permutation(np.concatenate((white, black)))\n", 407 | " \n", 408 | " white_perm = permutation_samples[:len(white)]\n", 409 | " black_perm = permutation_samples[len(white):]\n", 410 | " \n", 411 | " permutation_replicates[i] = np.abs(np.mean(white_perm) - np.mean(black_perm))\n", 412 | "\n", 413 | "p = np.sum(permutation_replicates > diff_of_means) / len(permutation_replicates)\n", 414 | "\n", 415 | "\n", 416 | "print('p =', p)" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "P-value here is alos fairly low." 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 8, 429 | "metadata": {}, 430 | "outputs": [ 431 | { 432 | "name": "stdout", 433 | "output_type": "stream", 434 | "text": [ 435 | "Margin of Error: 0.015281912310894095\n", 436 | "Confidence interval: [0.01675094 0.04731477]\n" 437 | ] 438 | } 439 | ], 440 | "source": [ 441 | "moe = 1.96 * np.sqrt(phat * (1 - phat) * ((1 / n_w) + (1 / n_b)))\n", 442 | "\n", 443 | "ci = prop_diff + np.array([-1, 1]) * moe\n", 444 | "\n", 445 | "\n", 446 | "print(\"Margin of Error: {}\".format(moe))\n", 447 | "print(\"Confidence interval: {}\".format(ci))" 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": { 453 | "collapsed": true 454 | }, 455 | "source": [ 456 | "## 4. Write a story describing the statistical significance in the context or the original problem.\n", 457 | "\n", 458 | "\n", 459 | "\n", 460 | "P value is low still so we reject the null hypothesis that black and white sounding names have the same callback rates.\n" 461 | ] 462 | }, 463 | { 464 | "cell_type": "markdown", 465 | "metadata": {}, 466 | "source": [ 467 | "## 5. Does your analysis mean that race/name is the most important factor in callback success? Why or why not? If not, how would you amend your analysis?\n", 468 | "\n", 469 | "We'd need to understand the impact of the other variables present. 
Even though names were randomly assigned, there could be other variables that suffer from systemic bias or confound the results. \n", 470 | "\n", 471 | "Examples: gender, education, geography, etc. Another interesting question: among the resumes assigned the b label that did receive callbacks, were there significant differences from their counterparts that did not get callbacks? " 472 | ] 473 | } 474 | ], 475 | "metadata": { 476 | "kernelspec": { 477 | "display_name": "Python 3", 478 | "language": "python", 479 | "name": "python3" 480 | }, 481 | "language_info": { 482 | "codemirror_mode": { 483 | "name": "ipython", 484 | "version": 3 485 | }, 486 | "file_extension": ".py", 487 | "mimetype": "text/x-python", 488 | "name": "python", 489 | "nbconvert_exporter": "python", 490 | "pygments_lexer": "ipython3", 491 | "version": "3.7.0" 492 | } 493 | }, 494 | "nbformat": 4, 495 | "nbformat_minor": 1 496 | } 497 | -------------------------------------------------------------------------------- /mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/Racial Discrimination/dummy.txt: -------------------------------------------------------------------------------- 1 | text 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/Racial Discrimination/sliderule_dsi_inferential_statistics_exercise_1_solutions-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# What is the True Normal Human Body Temperature? \n", 8 | "\n", 9 | "#### Background\n", 10 | "\n", 11 | "The mean normal body temperature was held to be 37$^{\\circ}$C or 98.6$^{\\circ}$F for more than 120 years since it was first conceptualized and reported by Carl Wunderlich in a famous 1868 book. But, is this value statistically correct?" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "
\n", 19 | "

Exercises

\n", 20 | "\n", 21 | "

In this exercise, you will analyze a dataset of human body temperatures and employ the concepts of hypothesis testing, confidence intervals, and statistical significance.

\n", 22 | "\n", 23 | "

Answer the following questions in this notebook below and submit to your Github account.

\n", 24 | "\n", 25 | "
    \n", 26 | "
  1. Is the distribution of body temperatures normal? \n", 27 | "
      \n", 28 | "
    • Although this is not a requirement for the Central Limit Theorem to hold (read the introduction on Wikipedia's page about the CLT carefully: https://en.wikipedia.org/wiki/Central_limit_theorem), it gives us some peace of mind that the population may also be normally distributed if we assume that this sample is representative of the population.\n", 29 | "
    • Think about the way you're going to check for the normality of the distribution. Graphical methods are usually used first, but there are also other ways: https://en.wikipedia.org/wiki/Normality_test\n", 30 | "
    \n", 31 | "
  2. Is the sample size large? Are the observations independent?\n", 32 | "
      \n", 33 | "
    • Remember that this is a condition for the Central Limit Theorem, and hence the statistical tests we are using, to apply.\n", 34 | "
    \n", 35 | "
  3. Is the true population mean really 98.6 degrees F?\n", 36 | "
      \n", 37 | "
    • First, try a bootstrap hypothesis test.\n", 38 | "
    • Now, let's try frequentist statistical testing. Would you use a one-sample or two-sample test? Why?\n", 39 | "
    • In this situation, is it appropriate to use the $t$ or $z$ statistic? \n", 40 | "
    • Now try using the other test. How is the result different? Why?\n", 40 | "
    \n", 42 | "
  4. Draw a small sample of size 10 from the data and repeat both frequentist tests. \n", 43 | "
      \n", 44 | "
    • Which one is the correct one to use? \n", 45 | "
    • What do you notice? What does this tell you about the difference in application of the $t$ and $z$ statistic?\n", 46 | "
    \n", 47 | "
  5. At what temperature should we consider someone's temperature to be \"abnormal\"?\n", 48 | "
      \n", 49 | "
    • As in the previous example, try calculating everything using the bootstrap approach, as well as the frequentist approach.\n", 49 | "
    • Start by computing the margin of error and confidence interval. When calculating the confidence interval, keep in mind that you should use the appropriate formula for one draw, and not N draws.\n", 51 | "
    \n", 52 | "
  6. Is there a significant difference between males and females in normal temperature?\n", 53 | "
      \n", 54 | "
    • What testing approach did you use and why?\n", 55 | "
    • Write a story with your conclusion in the context of the original problem.\n", 56 | "
    \n", 57 | "
\n", 58 | "\n", 59 | "You can include written notes in notebook cells using Markdown: \n", 60 | " - In the control panel at the top, choose Cell > Cell Type > Markdown\n", 61 | " - Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 62 | "\n", 63 | "#### Resources\n", 64 | "\n", 65 | "+ Information and data sources: http://www.amstat.org/publications/jse/datasets/normtemp.txt, http://www.amstat.org/publications/jse/jse_data_archive.htm\n", 66 | "+ Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 67 | "\n", 68 | "****" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": { 75 | "collapsed": true 76 | }, 77 | "outputs": [], 78 | "source": [ 79 | "import pandas as pd\n", 80 | "import matplotlib.pyplot as plt\n", 81 | "%matplotlib inline\n", 82 | "\n", 83 | "df = pd.read_csv('data/human_body_temperature.csv')" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": {}, 89 | "source": [ 90 | "
\n", 91 | "

SOLUTION: Is the distribution of body temperatures normal?
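One small prerequisite before running the solution cells below: they call np.mean, np.std, and np.random, but the setup cell above only imports pandas and matplotlib. Adding the NumPy import first keeps everything runnable:

```python
# The plotting and bootstrap cells in this solution use NumPy, which was not
# imported in the setup cell above.
import numpy as np
```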

\n", 92 | "
" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": { 99 | "collapsed": true 100 | }, 101 | "outputs": [], 102 | "source": [ 103 | "# First, a histogram\n", 104 | "%matplotlib inline\n", 105 | "plt.hist(df['temperature'])\n", 106 | "plt.xlabel('Temperature')\n", 107 | "plt.ylabel('Frequency')\n", 108 | "plt.title('Histogram of Body Temperature')\n", 109 | "plt.ylim(0, 40) # Add some buffer space at the top so the bar doesn't get cut off." 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "collapsed": true 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "# Next, a quantile plot.\n", 121 | "import statsmodels.api as sm\n", 122 | "mean = np.mean(df['temperature'])\n", 123 | "sd = np.std(df['temperature'])\n", 124 | "z = (df['temperature'] - mean) / sd\n", 125 | "sm.qqplot(z, line='45')" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": { 132 | "collapsed": true 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "# Finally, a normal distribution test. Not recommended!! Use only when you're not sure.\n", 137 | "import scipy.stats as stats\n", 138 | "stats.mstats.normaltest(df['temperature'])" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "
\n", 146 | "

SOLUTION

\n", 147 | "\n", 148 | "

The histogram looks *very roughly* normally distributed. There is an implied bell shape, though there are some values above the mode that occur much less frequently than we would expect under a normal distribution. The shape is not so deviant as to call it some other distribution.

\n", 149 | "\n", 150 | "

A quantile plot can help. The quantile plot computes percentiles for our data and also the percentiles for a normal distribution (mean 0, sd 1). If the quantiles/percentiles for both distributions match, we expect to see a more or less straight line of data points. Note that the quantile plot does pretty much follow a straight line, so this helps us conclude that the distribution is likely normal. Note that there are three outliers on the \"high\" end and two on the \"low\" end that cause deviations in the tail, but this is pretty typical.
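To make the mechanics of that comparison concrete, here is a minimal hand-rolled quantile plot. This is an illustrative sketch only, assuming the `df` loaded above and standard imports; it is not part of the original solution:

```python
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

temps = np.sort(df['temperature'].values)            # empirical quantiles, sorted
z_scores = (temps - temps.mean()) / temps.std()      # standardize to mean 0, sd 1

# Theoretical standard-normal quantiles at the same plotting positions
positions = (np.arange(1, len(temps) + 1) - 0.5) / len(temps)
theoretical = stats.norm.ppf(positions)

plt.scatter(theoretical, z_scores)
plt.plot([-3, 3], [-3, 3])   # 45-degree reference line
plt.xlabel('Theoretical quantiles')
plt.ylabel('Sample quantiles (standardized)')
plt.title('Hand-rolled normal Q-Q plot')
```

If the points hug the reference line, the sample quantiles track the normal quantiles, which is exactly what `sm.qqplot` checks above.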

\n", 151 | "\n", 152 | "

Suppose we really aren't sure, or the two plots suggest different conclusions. We could confirm with a statistical significance test, though this should not be your first method of attack. The p-value from the normality test is 0.25, which is well above the usual cutoff of 0.05. The null hypothesis of the test is that the distribution is normal; since we fail to reject it, we conclude that the distribution is probably normal.

\n", 153 | "
" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": {}, 159 | "source": [ 160 | "
\n", 161 | "

SOLUTION: Is the sample size large? Are the observations independent?

\n", 162 | "
" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": { 169 | "collapsed": true 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "n = len(df['temperature'])\n", 174 | "n" 175 | ] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": {}, 180 | "source": [ 181 | "
\n", 182 | "

The sample size is 130. Literature typically suggests a lower limit of 30 observations in a sample for CLT to hold. In terms of CLT, the sample is large enough.

\n", 183 | "\n", 184 | "

We must assume that the observations are independent. One person's body temperature should not have any effect on another person's body temperature, so under common sense conditions, the observations are independent. Note that this condition could be violated if the researcher lacked common sense and performed this study by stuffing all of the participants shoulder to shoulder in a very hot and confined room.

\n", 185 | "\n", 186 | "

Note that temperatures may depend on age, gender, or health status, but that is a separate issue and does not change our conclusion that one person's temperature does not influence another's.

\n", 187 | "
" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "
\n", 195 | "

SOLUTION: Is the true population mean really 98.6 degrees F?

\n", 196 | "
" 197 | ] 198 | }, 199 | { 200 | "cell_type": "markdown", 201 | "metadata": {}, 202 | "source": [ 203 | "
\n", 204 | "

We will now perform a bootstrap hypothesis test with the following:

\n", 205 | "\n", 206 | "

$H_0$: The mean of the sample and the true mean of 98.6 are the same. $\\mu=\\mu_0$

\n", 207 | "\n", 208 | "

$H_A$: The means are different. $\\mu\\neq\\mu_0$
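One caveat about the next cell: it resamples from an array named `temperature` and uses `np`, neither of which is defined earlier in this notebook. A minimal setup consistent with the data loaded above (my assumption, not part of the original solution) would be:

```python
import numpy as np

# Plain array of observed temperatures for the bootstrap cell below
temperature = df['temperature'].values
```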

\n", 209 | "\n", 210 | "
" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "collapsed": true 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "# Calculates p value using 100,000 boostrap replicates\n", 222 | "bootstrap_replicates = np.empty(100000)\n", 223 | "\n", 224 | "size = len(bootstrap_replicates)\n", 225 | "\n", 226 | "for i in range(size):\n", 227 | " bootstrap_sample = np.random.choice(temperature, size=len(temperature))\n", 228 | " bootstrap_replicates[i] = np.mean(bootstrap_sample)\n", 229 | "\n", 230 | "p = np.sum(bootstrap_replicates >= 98.6) / len(bootstrap_replicates)\n", 231 | "print('p =', p)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "
\n", 239 | "

We are testing only if the true population mean temperature is 98.6. We are treating everyone as being in the same group, with one mean. We use a **one-sample** test. The population standard deviation is not given, so we assume it is not known. We do however know the sample standard deviation from the data and we know that the sample size is large enough for CLT to apply, so we can use a $z$-test.
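As an optional cross-check on the hand-computed statistic below, statsmodels provides a one-sample z-test. This is a sketch that assumes statsmodels is available (it is already imported above for the quantile plot) and should closely match the manual calculation:

```python
from statsmodels.stats.weightstats import ztest

# Two-sided one-sample z-test of H0: mean temperature = 98.6
z_stat, p_value = ztest(df['temperature'], value=98.6)
print('z =', z_stat, 'p =', p_value)
```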

\n", 240 | "
" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": { 247 | "collapsed": true 248 | }, 249 | "outputs": [], 250 | "source": [ 251 | "z = (mean - 98.6)/(sd / np.sqrt(n))\n", 252 | "z" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "
\n", 260 | "Since the question does not ask if the true mean is greater than, or less than 98.6 as the alternative hypothesis, we use a two-tailed test. We have to regions where we reject the null hypothesis: if $z < -1.96$ or if $z > 1.96$, assuming $\\alpha = 0.05$. Since -5.48 < -1.96, we reject the null hypothesis: the true population mean temperature is NOT 98.6.\n", 261 | "\n", 262 | "

We can also use a p-value:

\n", 263 | "
" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "metadata": { 270 | "collapsed": true 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "stats.norm.cdf(z) * 2\n", 275 | "# NOTE: Since CDF gives us $P(Z \\le z)$ and this is a two-tailed test, we multiply the result by 2" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": {}, 281 | "source": [ 282 | "
\n", 283 | "

Since the p-value is *way* below 0.05, we reject the null hypothesis. The population mean is not 98.6.

\n", 284 | "\n", 285 | "

The $z$-test was the \"correct\" test to use in this case. But what if we used a $t$-test instead? The degrees of freedom is $n - 1 = 129$.

\n", 286 | "
" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": null, 292 | "metadata": { 293 | "collapsed": true 294 | }, 295 | "outputs": [], 296 | "source": [ 297 | "t = (mean - 98.6)/(sd / np.sqrt(n))" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "
\n", 305 | "We find the critical value of $t$ and when $\\vert t \\vert > \\vert t^* \\vert$ we reject the null hypothesis.\n", 306 | "
" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": { 313 | "collapsed": true 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "t_critical = stats.t.ppf(0.05 / 2, n - 1)\n", 318 | "t_critical" 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "
\n", 326 | "

Note that the critical value of $t$ is $\\pm 1.979$. This is pretty close to the $\\pm 1.96$ we used for the $z$-test. *As the sample size gets larger, the student's $t$ distribution converges to the normal distribution.* So in theory, even if your sample size is large you could use the $t$-test, but the pesky degrees of freedom step is likely why people do not. If we use a sample of size, say, 1000, the critical values are close to identical.
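That last claim is easy to verify directly with scipy (illustrative only):

```python
import scipy.stats as stats

# Two-sided 5% critical values: standard normal vs t with 999 degrees of freedom
print(stats.norm.ppf(0.975))        # ~1.960
print(stats.t.ppf(0.975, df=999))   # ~1.962
```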

\n", 327 | "\n", 328 | "

So, to answer the question, the result is NOT different! The only case where it would be different is if the $t$ statistic were between -1.96 and -1.979 which would be pretty rare.

" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": {}, 334 | "source": [ 335 | "
\n", 336 | "

SOLUTION: At what temperature should we consider someone's temperature to be \"abnormal\"?

\n", 337 | "\n", 338 | "

We compute the confidence interval using $z^* = \\pm 1.96$.

\n", 339 | "\n", 340 | "

The margin of error is

\n", 341 | "\n", 342 | "$$MOE = z^* \\frac{\\sigma}{\\sqrt{n}}$$\n", 343 | "
" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": { 350 | "collapsed": true 351 | }, 352 | "outputs": [], 353 | "source": [ 354 | "sd = df['temperature'].std()\n", 355 | "n = len(df['temperature'])\n", 356 | "moe = 1.96 * sd / np.sqrt(n)\n", 357 | "moe" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": null, 363 | "metadata": { 364 | "collapsed": true 365 | }, 366 | "outputs": [], 367 | "source": [ 368 | "mean = df['temperature'].mean()\n", 369 | "ci = mean + np.array([-1, 1]) * moe\n", 370 | "ci" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": {}, 376 | "source": [ 377 | "
At the 95% confidence level, we consider a temperature abnormal if it is below 98.1 degrees or above 98.38 degrees. Since the hypothesized mean of 98.6 is not in the confidence interval, we again reject the null hypothesis -- the true population mean is not 98.6 degrees.
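The same interval can be obtained in one call from scipy as a cross-check, assuming the `mean`, `sd`, and `n` computed in the cells above:

```python
import numpy as np
import scipy.stats as stats

# 95% confidence interval for the mean under the normal approximation
print(stats.norm.interval(0.95, loc=mean, scale=sd / np.sqrt(n)))
```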
" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "
\n", 385 | "We can also use the bootstrap approach.\n", 386 | "
" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": { 393 | "collapsed": true 394 | }, 395 | "outputs": [], 396 | "source": [ 397 | "# Define bootstrap functions:\n", 398 | "\n", 399 | "def replicate(data, function):\n", 400 | " \"\"\"Return replicate of a resampled data array.\"\"\"\n", 401 | " \n", 402 | " # Create the resampled array and return the statistic of interest:\n", 403 | " return function(np.random.choice(data, size=len(data)))\n", 404 | "\n", 405 | "\n", 406 | "def draw_replicates(data, function, size=1):\n", 407 | " \"\"\"Draw bootstrap replicates.\"\"\"\n", 408 | "\n", 409 | " # Initialize array of replicates:\n", 410 | " replicates = np.empty(size)\n", 411 | "\n", 412 | " # Generate replicates:\n", 413 | " for i in range(size):\n", 414 | " replicates[i] = replicate(data, function)\n", 415 | "\n", 416 | " return replicates" 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": null, 422 | "metadata": { 423 | "collapsed": true 424 | }, 425 | "outputs": [], 426 | "source": [ 427 | "# Seed the random number generator:\n", 428 | "np.random.seed(15)\n", 429 | "\n", 430 | "# Draw bootstrap replicates of temperatures:\n", 431 | "replicates = draw_replicates(df.temperature, np.mean, 10000)\n", 432 | "\n", 433 | "# Compute the 99.9% confidence interval:\n", 434 | "CI = np.percentile(replicates, [0.05, 99.95])\n", 435 | "print('99.9% Confidence Interval:', CI)" 436 | ] 437 | }, 438 | { 439 | "cell_type": "markdown", 440 | "metadata": {}, 441 | "source": [ 442 | "
\n", 443 | "\n", 444 | "

SOLUTION: Is there a significant difference between males and females in normal temperature?

\n", 445 | "\n", 446 | "

We use a two-sample test. Since the number of males is greater than 30 and the number of females is greater than 30, we use a two-sample z-test. Since the question just asks if there is a *difference* and doesn't specify a direction, we use a two-tailed test.

\n", 447 | "\n", 448 | "$$z = \\frac{(\\bar{x}_M - \\bar{x}_F) - 0}{\\sqrt{\\frac{\\sigma_M^2}{n_M} + \\frac{\\sigma_F^2}{n_F}}}$$" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": { 455 | "collapsed": true 456 | }, 457 | "outputs": [], 458 | "source": [ 459 | "males = df.gender == 'M'\n", 460 | "diff_means = df.temperature[males].mean() - df.temperature[~males].mean()\n", 461 | "sd_male = df.temperature[males].std()\n", 462 | "sd_female = df.temperature[~males].std()\n", 463 | "n_male = np.sum(males)\n", 464 | "n_female = len(df.temperature) - n_male\n", 465 | "\n", 466 | "z = diff_means / np.sqrt(((sd_male ** 2)/ n_male) + ((sd_female ** 2)/ n_female))\n", 467 | "z" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "metadata": { 474 | "collapsed": true 475 | }, 476 | "outputs": [], 477 | "source": [ 478 | "pval = stats.norm.cdf(z) * 2\n", 479 | "pval" 480 | ] 481 | }, 482 | { 483 | "cell_type": "markdown", 484 | "metadata": {}, 485 | "source": [ 486 | "
\n", 487 | "

Since the p-value of 0.022 < 0.05, we reject the null hypothesis that the mean body temperature for men and women is the same. The difference in mean body temperature between men and women is statistically significant.

\n", 488 | "

" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": { 495 | "collapsed": true 496 | }, 497 | "outputs": [], 498 | "source": [ 499 | "diff_means + np.array([-1, 1]) * 1.96 * np.sqrt(((sd_male ** 2)/ n_male) + ((sd_female ** 2)/ n_female))" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": {}, 505 | "source": [ 506 | "
Since the null hypothesized 0 is not in the confidence interval, we reject the null hypothesis with the same conclusion as the hypothesis test.
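For an optional sanity check on the hand-rolled z statistic, Welch's two-sample t-test from scipy should tell the same story at this sample size (a sketch assuming the `df` and `males` mask defined above):

```python
import scipy.stats as stats

# Welch's t-test (unequal variances) comparing male and female temperatures
t_stat, p_value = stats.ttest_ind(df.temperature[males], df.temperature[~males], equal_var=False)
print('t =', t_stat, 'p =', p_value)
```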
" 507 | ] 508 | }, 509 | { 510 | "cell_type": "markdown", 511 | "metadata": { 512 | "collapsed": true 513 | }, 514 | "source": [ 515 | "
Now let's try the hacker stats approach.
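A caveat before the permutation cell below: it refers to `male_temperature`, `female_temperature`, and `male_and_female_diff`, none of which are defined earlier in this notebook. A minimal setup consistent with the rest of the analysis (my assumption) would be:

```python
import numpy as np

male_temperature = df.temperature[df.gender == 'M'].values
female_temperature = df.temperature[df.gender == 'F'].values

# Observed absolute difference in group means; the permutation replicates below
# are also taken in absolute value, so this makes the test two-sided.
male_and_female_diff = np.abs(np.mean(male_temperature) - np.mean(female_temperature))
```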
" 516 | ] 517 | }, 518 | { 519 | "cell_type": "code", 520 | "execution_count": null, 521 | "metadata": { 522 | "collapsed": true 523 | }, 524 | "outputs": [], 525 | "source": [ 526 | "permutation_replicates = np.empty(100000)\n", 527 | "\n", 528 | "size = len(permutation_replicates)\n", 529 | "\n", 530 | "for i in range(size):\n", 531 | " combined_perm_temperatures = np.random.permutation(np.concatenate((male_temperature, female_temperature)))\n", 532 | "\n", 533 | " male_permutation = combined_perm_temperatures[:len(male_temperature)]\n", 534 | " female_permutation = combined_perm_temperatures[len(male_temperature):]\n", 535 | "\n", 536 | " permutation_replicates[i] = np.abs(np.mean(male_permutation) - np.mean(female_permutation))\n", 537 | " \n", 538 | "p_val = np.sum(permutation_replicates >= male_and_female_diff) / len(permutation_replicates)\n", 539 | "\n", 540 | "print('p =', p_val)" 541 | ] 542 | } 543 | ], 544 | "metadata": { 545 | "kernelspec": { 546 | "display_name": "Python 2", 547 | "language": "python", 548 | "name": "python2" 549 | }, 550 | "language_info": { 551 | "codemirror_mode": { 552 | "name": "ipython", 553 | "version": 2 554 | }, 555 | "file_extension": ".py", 556 | "mimetype": "text/x-python", 557 | "name": "python", 558 | "nbconvert_exporter": "python", 559 | "pygments_lexer": "ipython2", 560 | "version": "2.7.13" 561 | } 562 | }, 563 | "nbformat": 4, 564 | "nbformat_minor": 1 565 | } 566 | -------------------------------------------------------------------------------- /mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/Racial Discrimination/sliderule_dsi_inferential_statistics_exercise_2_solutions-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Examining Racial Discrimination in the US Job Market\n", 8 | "\n", 9 | "### Background\n", 10 | "Racial discrimination continues to be pervasive in cultures throughout the world. Researchers examined the level of racial discrimination in the United States labor market by randomly assigning identical résumés to black-sounding or white-sounding names and observing the impact on requests for interviews from employers.\n", 11 | "\n", 12 | "### Data\n", 13 | "In the dataset provided, each row represents a resume. The 'race' column has two values, 'b' and 'w', indicating black-sounding and white-sounding. The column 'call' has two values, 1 and 0, indicating whether the resume received a call from employers or not.\n", 14 | "\n", 15 | "Note that the 'b' and 'w' values in race are assigned randomly to the resumes when presented to the employer." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "
\n", 23 | "### Exercises\n", 24 | "You will perform a statistical analysis to establish whether race has a significant impact on the rate of callbacks for resumes.\n", 25 | "\n", 26 | "Answer the following questions **in this notebook below and submit to your Github account**. \n", 27 | "\n", 28 | " 1. What test is appropriate for this problem? Does CLT apply?\n", 29 | " 2. What are the null and alternate hypotheses?\n", 30 | " 3. Compute margin of error, confidence interval, and p-value. Try using both the bootstrapping and the frequentist statistical approaches.\n", 31 | " 4. Write a story describing the statistical significance in the context or the original problem.\n", 32 | " 5. Does your analysis mean that race/name is the most important factor in callback success? Why or why not? If not, how would you amend your analysis?\n", 33 | "\n", 34 | "You can include written notes in notebook cells using Markdown: \n", 35 | " - In the control panel at the top, choose Cell > Cell Type > Markdown\n", 36 | " - Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 37 | "\n", 38 | "\n", 39 | "#### Resources\n", 40 | "+ Experiment information and data source: http://www.povertyactionlab.org/evaluation/discrimination-job-market-united-states\n", 41 | "+ Scipy statistical methods: http://docs.scipy.org/doc/scipy/reference/stats.html \n", 42 | "+ Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 43 | "+ Formulas for the Bernoulli distribution: https://en.wikipedia.org/wiki/Bernoulli_distribution\n", 44 | "
\n", 45 | "****" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": { 52 | "collapsed": true 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "import pandas as pd\n", 57 | "import numpy as np\n", 58 | "from scipy import stats" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": { 65 | "collapsed": true 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "data = pd.io.stata.read_stata('data/us_job_market_discrimination.dta')" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": { 76 | "collapsed": true 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "# number of callbacks for black-sounding names\n", 81 | "sum(data[data.race=='w'].call)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "collapsed": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "data.head()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": { 98 | "collapsed": true 99 | }, 100 | "source": [ 101 | "
\n", 102 | "

SOLUTION: What test is appropriate for this problem?

\n", 103 | "\n", 104 | "

This is an interesting question, and it can be argued that there are two possible correct answers (such is life). First note the wording of the question: does race have a *significant impact* on callbacks? This usually suggests a $\chi^2$-test, but the $\chi^2$-test requires *frequencies* rather than percentages. Interestingly, one of the code snippets above uses frequencies, so we will look at that in a bit.

\n", 105 | "\n", 106 | "For now, a two-sample $z$-test:\n", 107 | "\n", 108 | "$$z = \\frac{\\left( \\hat{p}_W - \\hat{p}_B \\right) - 0}{\\sqrt{\\hat{p} (1 - \\hat{p)} \\left( \\frac{1}{n_W} + \\frac{1}{n_B}\\right)}}$$\n", 109 | "\n", 110 | "where\n", 111 | "\n", 112 | "$$\\hat{p} = \\frac{y_W + y_B}{n_W + n_B}$$\n", 113 | "\n", 114 | "The null and alternate hypotheses:\n", 115 | "\n", 116 | "$$H_0: p_B = p_W$$\n", 117 | "$$H_A: p_B \\ne p_W$$\n", 118 | "\n", 119 | "CLT applies because we assume that the samples are representative of the population. The observations in each sample are assumed to be independent since the sample was drawn randomly." 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "collapsed": true 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "w = data[data.race=='w']\n", 131 | "b = data[data.race=='b']\n", 132 | "\n", 133 | "n_w = len(w)\n", 134 | "n_b = len(b)\n", 135 | "\n", 136 | "prop_w = np.sum(w.call) / len(w)\n", 137 | "prop_b = np.sum(b.call) / len(b)\n", 138 | "\n", 139 | "prop_diff = prop_w - prop_b\n", 140 | "phat = (np.sum(w.call) + np.sum(b.call)) / (len(w) + len(b))\n", 141 | "\n", 142 | "z = prop_diff / np.sqrt(phat * (1 - phat) * ((1 / n_w) + (1 / n_b)))\n", 143 | "pval = stats.norm.cdf(-z) * 2\n", 144 | "print(\"Z score: {}\".format(z))\n", 145 | "print(\"P-value: {}\".format(pval))" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": { 152 | "collapsed": true 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "# Hacker stats approach\n", 157 | "permutation_replicates = np.empty(100000)\n", 158 | "\n", 159 | "white = data[data.race == 'w'].call.values\n", 160 | "black = data[data.race == 'b'].call.values\n", 161 | "diff_of_means = np.mean(white) - np.mean(black)\n", 162 | "\n", 163 | "for i in range(len(permutation_replicates)):\n", 164 | " permutation_samples = np.random.permutation(np.concatenate((white, black)))\n", 165 | " \n", 166 | " white_perm = permutation_samples[:len(white)]\n", 167 | " black_perm = permutation_samples[len(white):]\n", 168 | " \n", 169 | " permutation_replicates[i] = np.abs(np.mean(white_perm) - np.mean(black_perm))\n", 170 | "\n", 171 | "p = np.sum(permutation_replicates > diff_of_means) / len(permutation_replicates)\n", 172 | "print('p =', p)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": { 179 | "collapsed": true 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "moe = 1.96 * np.sqrt(phat * (1 - phat) * ((1 / n_w) + (1 / n_b)))\n", 184 | "ci = prop_diff + np.array([-1, 1]) * moe\n", 185 | "print(\"Margin of Error: {}\".format(moe))\n", 186 | "print(\"Confidence interval: {}\".format(ci))" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": {}, 192 | "source": [ 193 | "
\n", 194 | "

The p-value is practically 0, so we reject the null hypothesis that white-sounding and black-sounding names have the same callback rate. They do not.

\n", 195 | "\n", 196 | "

Since 0 is not in the confidence interval, we reject the null hypothesis with the same conclusion.

\n", 197 | "\n", 198 | "

SOLUTION: Does your analysis mean that race/name is the most important factor in callback success? Why or why not? If not, how would you amend your analysis?

\n", 199 | "\n", 200 | "

No! While our test did show that there is a difference in callback rate based on race alone, there are other variables that may also contribute to, or interact with, race to explain the difference. In the original research paper, the researchers cited geography/city as a confounding variable. Additionally, we could look at education and experience levels. But, in our very narrow example, we have shown that there is a significant difference in callback rates between resumes with white-sounding names and those with black-sounding names.

\n", 201 | "\n", 202 | "

The $\\chi^2$ approach:

\n", 203 | "
" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": { 210 | "collapsed": true 211 | }, 212 | "outputs": [], 213 | "source": [ 214 | "cont_table = pd.crosstab(index=data.call, columns=data.race)\n", 215 | "chi2, pval, _, _ = stats.chi2_contingency(cont_table)\n", 216 | "print(\"Chi-squared test statistic: {}\".format(chi2))\n", 217 | "print(\"p-value: {}\".format(pval))" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "
\n", 225 | "The chi-squared test yields a similar result. We reject the null hypothesis that race and callback rate are independent. The margin of error and confidence interval calculations are a bit more complicated because the chi-squared distribution is not always symmetric, depending on the number of degrees of freedom.\n", 226 | "
" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": { 233 | "collapsed": true 234 | }, 235 | "outputs": [], 236 | "source": [] 237 | } 238 | ], 239 | "metadata": { 240 | "kernelspec": { 241 | "display_name": "Python 2", 242 | "language": "python", 243 | "name": "python2" 244 | }, 245 | "language_info": { 246 | "codemirror_mode": { 247 | "name": "ipython", 248 | "version": 2 249 | }, 250 | "file_extension": ".py", 251 | "mimetype": "text/x-python", 252 | "name": "python", 253 | "nbconvert_exporter": "python", 254 | "pygments_lexer": "ipython2", 255 | "version": "2.7.10" 256 | } 257 | }, 258 | "nbformat": 4, 259 | "nbformat_minor": 1 260 | } 261 | -------------------------------------------------------------------------------- /mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/Racial Discrimination/us_job_market_discrimination.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/Racial Discrimination/us_job_market_discrimination.dta -------------------------------------------------------------------------------- /mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/Reduce Hospital Readmissions/dummy.txt: -------------------------------------------------------------------------------- 1 | . 2 | -------------------------------------------------------------------------------- /mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/Reduce Hospital Readmissions/sliderule_dsi_inferential_statistics_exercise_1-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## What is the true normal human body temperature? \n", 8 | "\n", 9 | "#### Background\n", 10 | "\n", 11 | "The mean normal body temperature was held to be 37$^{\\circ}$C or 98.6$^{\\circ}$F for more than 120 years since it was first conceptualized and reported by Carl Wunderlich in a famous 1868 book. In 1992, this value was revised to 36.8$^{\\circ}$C or 98.2$^{\\circ}$F. \n", 12 | "\n", 13 | "#### Exercise\n", 14 | "In this exercise, you will analyze a dataset of human body temperatures and employ the concepts of hypothesis testing, confidence intervals, and statistical significance to answer the following questions:\n", 15 | "\n", 16 | "1. Is the distribution of body temperatures normal? \n", 17 | "2. Is the true population mean really 98.6 degrees F?\n", 18 | "3. At what temperature should we consider someone's temperature to be \"abnormal\"?\n", 19 | "4. 
Is there a significant difference between males and females in normal temperature?\n", 20 | "\n", 21 | "\n", 22 | "\n", 23 | "#### Resources\n", 24 | "\n", 25 | "+ Information and data sources: http://www.amstat.org/publications/jse/datasets/normtemp.txt, http://www.amstat.org/publications/jse/jse_data_archive.htm\n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 58, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "import pandas as pd" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 62, 42 | "metadata": { 43 | "collapsed": false 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "df = pd.read_csv('data/human_body_temperature.csv')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": { 53 | "collapsed": true 54 | }, 55 | "source": [ 56 | "# Exercise\n", 57 | "\n", 58 | "Answer the following questions in this notebook and submit to your Github account. \n", 59 | "\n", 60 | "1. Is the distribution of body temperatures normal? \n", 61 | " - Remember that this is a condition for the CLT, and hence the statistical tests we are using, to apply. \n", 62 | "2. Is the true population mean really 98.6 degrees F?\n", 63 | " - Bring out the one sample hypothesis test! In this situation, is it approriate to apply a z-test or a t-test? How will the result be different?\n", 64 | "3. At what temperature should we consider someone's temperature to be \"abnormal\"?\n", 65 | " - Start by computing the margin of error and confidence interval.\n", 66 | "4. Is there a significant difference between males and females in normal temperature?\n", 67 | " - Set up and solve for a two sample hypothesis testing." 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": { 74 | "collapsed": true 75 | }, 76 | "outputs": [], 77 | "source": [] 78 | } 79 | ], 80 | "metadata": { 81 | "kernelspec": { 82 | "display_name": "Python 2", 83 | "language": "python", 84 | "name": "python2" 85 | }, 86 | "language_info": { 87 | "codemirror_mode": { 88 | "name": "ipython", 89 | "version": 2 90 | }, 91 | "file_extension": ".py", 92 | "mimetype": "text/x-python", 93 | "name": "python", 94 | "nbconvert_exporter": "python", 95 | "pygments_lexer": "ipython2", 96 | "version": "2.7.9" 97 | } 98 | }, 99 | "nbformat": 4, 100 | "nbformat_minor": 0 101 | } 102 | -------------------------------------------------------------------------------- /mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/Reduce Hospital Readmissions/sliderule_dsi_inferential_statistics_exercise_2-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "\n", 8 | "### Examining racial discrimination in the US job market\n", 9 | "\n", 10 | "#### Background\n", 11 | "Racial discrimination continues to be pervasive in cultures throughout the world. Researchers examined the level of racial discrimination in the United States labor market by randomly assigning identical résumés black-sounding or white-sounding names and observing the impact on requests for interviews from employers.\n", 12 | "\n", 13 | "#### Data\n", 14 | "In the dataset provided, each row represents a resume. The 'race' column has two values, 'b' and 'w', indicating black-sounding and white-sounding. 
The column 'call' has two values, 1 and 0, indicating whether the resume received a call from employers or not.\n", 15 | "\n", 16 | "Note that the 'b' and 'w' values in race are assigned randomly to the resumes.\n", 17 | "\n", 18 | "#### Exercise\n", 19 | "Perform a statistical analysis to establish whether race has a significant impact on the rate of callbacks for resumes.\n", 20 | "\n", 21 | "\n", 22 | "#### Resources\n", 23 | "+ Experiment information and data source: http://www.povertyactionlab.org/evaluation/discrimination-job-market-united-states\n", 24 | "+ Scipy statistical methods: http://docs.scipy.org/doc/scipy/reference/stats.html " 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "****" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 2, 37 | "metadata": { 38 | "collapsed": true 39 | }, 40 | "outputs": [], 41 | "source": [ 42 | "import pandas as pd\n", 43 | "import numpy as np\n", 44 | "from scipy import stats" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": { 51 | "collapsed": false 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "data = pd.io.stata.read_stata('data/us_job_market_discrimination.dta')" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": { 62 | "collapsed": false 63 | }, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "157.0" 69 | ] 70 | }, 71 | "execution_count": 4, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "# number of callbacks for balck-sounding names\n", 78 | "sum(data[data.race=='b'].call)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": { 84 | "collapsed": true 85 | }, 86 | "source": [ 87 | "****\n", 88 | "\n", 89 | "# Exercise\n", 90 | "\n", 91 | " 1. What test is appropriate for this problem? Does CLT apply?\n", 92 | " 2. What are the null and alternate hypotheses?\n", 93 | " 3. Compute margin of error, confidence interval, and p-value.\n", 94 | " 4. 
Discuss statistical significance.\n", 95 | " \n", 96 | "You can include written notes in notebook cells using Markdown: \n", 97 | " - In the control panel at the top, choose Cell > Cell Type > Markdown\n", 98 | " - Markdown syntax: http://nestacms.com/docs/creating-content/markdown-cheat-sheet\n", 99 | " \n", 100 | "****" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "metadata": { 107 | "collapsed": true 108 | }, 109 | "outputs": [], 110 | "source": [] 111 | } 112 | ], 113 | "metadata": { 114 | "kernelspec": { 115 | "display_name": "Python 2", 116 | "language": "python", 117 | "name": "python2" 118 | }, 119 | "language_info": { 120 | "codemirror_mode": { 121 | "name": "ipython", 122 | "version": 2 123 | }, 124 | "file_extension": ".py", 125 | "mimetype": "text/x-python", 126 | "name": "python", 127 | "nbconvert_exporter": "python", 128 | "pygments_lexer": "ipython2", 129 | "version": "2.7.9" 130 | } 131 | }, 132 | "nbformat": 4, 133 | "nbformat_minor": 0 134 | } 135 | -------------------------------------------------------------------------------- /mini-projects/Ch 8 Inferential Statistics/Ch 8.3 Exploratory Data Analysis/dummy.txt: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /mini-projects/dummy: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /small_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MMBazel/Springboard-DataScienceTrack-Student/56fa17dc1197c3925842908c9f10a23f6ae6226f/small_logo.png --------------------------------------------------------------------------------