├── .gitignore ├── img │   ├── author_image.png │   └── shield_image.png ├── requirements.sh ├── README.md ├── course.yml ├── Python_Code__For_Decision_Trees ├── chapter3.md ├── chapter1.md └── chapter2.md /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /img/author_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datacamp/community-courses-kaggle-python-tutorial-on-machine-learning/HEAD/img/author_image.png -------------------------------------------------------------------------------- /img/shield_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datacamp/community-courses-kaggle-python-tutorial-on-machine-learning/HEAD/img/shield_image.png -------------------------------------------------------------------------------- /requirements.sh: -------------------------------------------------------------------------------- 1 | pip3 install pandas==0.19.1 2 | pip3 install numpy==1.11.0 3 | pip3 install scipy==0.18.1 4 | pip3 install scikit-learn==0.18.1 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kaggle Python Tutorial on Machine Learning 2 | 3 | Source files for the Kaggle Python Tutorial on Machine Learning course on DataCamp. 4 | 5 | You can access the course here: https://www.datacamp.com/courses/1015 6 | 7 | -------------------------------------------------------------------------------- /course.yml: -------------------------------------------------------------------------------- 1 | title : Kaggle Python Tutorial on Machine Learning 2 | author_field : The DataCamp Team 3 | description : Always wanted to compete in a Kaggle competition but not sure you have the right skill set? This interactive machine learning tutorial by Kaggle and DataCamp offers the solution. Step by step, you will learn through fun coding exercises how to predict the survival rate for Kaggle's Titanic competition using machine learning techniques. Upload your results and see your ranking go up!

New to Python? Give our Introduction to Python for Data Science course a try. 4 | author_bio : DataCamp is a young team of data analytics enthusiasts that provides affordable interactive data science and statistics education to the world. We do not believe in an educational framework that centers on passively reading books, or on watching YouTube videos that put the focus on the instructor rather than the learner. We provide courses for both the novice and the experienced data scientist, and even allow passionate users to freely use the learning platform to create their own interactive courses. 5 | university : DataCamp 6 | difficulty_level : 2 7 | time_needed : 1 hour 8 | programming_language : python 9 | from : "python-base-prod:20" 10 | -------------------------------------------------------------------------------- /Python_Code__For_Decision_Trees: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import sklearn as sk 4 | from sklearn import tree 5 | import matplotlib.pyplot as plt 6 | 7 | # Load the train and test datasets to create two DataFrames 8 | train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv" 9 | train = pd.read_csv(train_url) 10 | 11 | test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv" 12 | test = pd.read_csv(test_url) 13 | 14 | 15 | #### Convert variables and clean the data 16 | train.loc[train["Sex"] == "male", "Sex"] = 0 17 | train.loc[train["Sex"] == "female", "Sex"] = 1 18 | 19 | train["Embarked"] = train["Embarked"].fillna("S") 20 | 21 | train.loc[train["Embarked"] == "S", "Embarked"] = 0 22 | train.loc[train["Embarked"] == "C", "Embarked"] = 1 23 | train.loc[train["Embarked"] == "Q", "Embarked"] = 2 24 | 25 | train["Age"] = train["Age"].fillna(train["Age"].median()) 26 | 27 | ## Build the first tree 28 | target = np.array(train.Survived).transpose() 29 | features_one = np.array([train.Pclass, train.Sex, train.Age, train.Fare]).transpose() 30 | 31 | my_tree_one = tree.DecisionTreeClassifier() 32 | my_tree_one = my_tree_one.fit(features_one, target) 33 | 34 | #### Second tree 35 | 36 | features_two = np.array([train.Pclass, train.Age, train.Sex, train.Fare, train.SibSp, train.Parch, train.Embarked]).transpose() 37 | 38 | my_tree_two = tree.DecisionTreeClassifier() 39 | my_tree_two = my_tree_two.fit(features_two, target) 40 | 41 | #### Third tree 42 | # Control overfitting 43 | my_tree_three = tree.DecisionTreeClassifier(max_depth = 10, min_samples_split = 5) 44 | my_tree_three = my_tree_three.fit(features_two, target) 45 | 46 | 47 | ### Evaluate the models 48 | from sklearn.metrics import confusion_matrix 49 | 50 | pred_vec_three = my_tree_three.predict(features_two) 51 | pred_vec_two = my_tree_two.predict(features_two) 52 | pred_vec_one = my_tree_one.predict(features_one) 53 | 54 | def pred_eval(pred_vec, target): 55 | cm = confusion_matrix(target, pred_vec) # confusion_matrix expects (y_true, y_pred) 56 | true_negative = cm[0][0] # survived (1) is the positive class 57 | true_positive = cm[1][1] 58 | false_positive = cm[0][1] 59 | false_negative = cm[1][0] 60 | positive = true_positive + false_negative 61 | negative = true_negative + false_positive 62 | sensitivity = true_positive/positive # proportion of survivals correctly classified (want to maximize) 63 | specificity = true_negative/negative # proportion of deaths correctly classified (want to maximize) 64 | ppv = true_positive/(true_positive + false_positive) 65 | npv = true_negative/(true_negative + false_negative) 66 | fnr = false_negative/positive 
#accordingly minimize 1 - sensitivity 67 | fpr = false_positive/negative #accordingly minimize 1 - specificity 68 | 69 | metrics = np.array([cm, sensitivity, specificity, ppv, npv, fnr, fpr]) 70 | return(metrics) 71 | 72 | my_tree_one.score(features_one, target) 73 | my_tree_two.score(features_two, target) 74 | my_tree_three.score(features_two, target) 75 | 76 | #### Graphing the Tree 77 | 78 | 79 | #from sklearn.externals.six import StringIO 80 | #import pydot 81 | #dot_data = StringIO() 82 | #tree.export_graphviz(my_tree_one, out_file = dot_data) 83 | #graph = pydot.graph_from_dot_data(dot_data.getvalue()) 84 | #graph.write_pdf("tree.pdf") 85 | 86 | #from sklearn.externals.six import StringIO 87 | #with open("tree.dot", 'w') as f: 88 | # f = tree.export_graphviz(my_tree_two, out_file=f) 89 | 90 | #from IPython.display import Image 91 | #dot_data = StringIO() 92 | #tree.export_graphviz(my_tree_two, out_file=dot_data, filled=True, rounded=True, special_characters=True) 93 | #graph = pydot.graph_from_dot_data(dot_data.getvalue()) 94 | 95 | 96 | 97 | #### Useful Attributes 98 | my_tree_one.feature_importances_ 99 | my_tree_one.tree_ 100 | my_tree_one.n_classes_ 101 | my_tree_one.n_features_ 102 | my_tree_one.classes_ 103 | 104 | 105 | 106 | #### Clean the test data. 107 | test.loc[test["Sex"] == "male", "Sex"] = 0 108 | test.loc[test["Sex"] == "female", "Sex"] = 1 109 | 110 | test["Embarked"] = test["Embarked"].fillna("S") 111 | 112 | test.loc[test["Embarked"] == "S", "Embarked"] = 0 113 | test.loc[test["Embarked"] == "C", "Embarked"] = 1 114 | test.loc[test["Embarked"] == "Q", "Embarked"] = 2 115 | 116 | test["Age"] = test["Age"].fillna(test["Age"].median()) 117 | 118 | test.Fare[152] = test.Fare.median() 119 | 120 | 121 | #### Prediction 122 | 123 | test_features_one = np.array([test.Pclass, test.Sex, test.Age, test.Fare]).transpose() # must match the features my_tree_one was trained on 124 | pred_one = my_tree_one.predict(test_features_one) 125 | 126 | 127 | test_features_two = np.array([test.Pclass, test.Age, test.Sex, test.Fare, test.SibSp, test.Parch, test.Embarked]).transpose() 128 | pred_two = my_tree_two.predict(test_features_two) 129 | 130 | pred_three = my_tree_three.predict(test_features_two) 131 | 132 | 133 | #### Feature Engineering 134 | 135 | 136 | #### https://plot.ly/matplotlib/bar-charts/ 137 | 138 | cm1 = pred_eval(pred_vec_one, target); y1 = cm1[1:5] # sensitivity, specificity, ppv, npv for each tree 139 | cm2 = pred_eval(pred_vec_two, target); y2 = cm2[1:5] 140 | cm3 = pred_eval(pred_vec_three, target); y3 = cm3[1:5] 141 | N = len(y1) 142 | x = range(N) 143 | plt.bar(x, y2, color="red") 144 | plt.bar(x, y3, color="green") 145 | plt.bar(x, y1, color="blue") 146 | 147 | g1 = cm1[5:7] # fnr and fpr for each tree 148 | g2 = cm2[5:7] 149 | g3 = cm3[5:7] 150 | M = len(g1) 151 | h = range(M) 152 | plt.bar(h, g1, color="blue") 153 | plt.bar(h, g3, color="green") 154 | plt.bar(h, g2, color="red") 155 | 156 | 157 | #### Building a Random Forest 158 | 159 | from sklearn.ensemble import RandomForestClassifier 160 | 161 | 162 | features_forest = np.array([train.Pclass, train.Age, train.Sex, train.Fare, train.SibSp, train.Parch, train.Embarked]).transpose() 163 | 164 | forest = RandomForestClassifier(max_depth = 10, n_estimators=100, min_samples_split=2) 165 | my_forest = forest.fit(features_forest, target) 166 | my_forest.score(features_forest, target) 167 | 168 | # Evaluate the forest 169 | pred_vec_forest = my_forest.predict(features_forest) 170 | pred_eval(pred_vec_forest, target) 171 | 172 | # Predict using the forest 173 | pred_forest = my_forest.predict(test_features_two) 174 | 175 | 176 | 177 | -------------------------------------------------------------------------------- /chapter3.md: 
-------------------------------------------------------------------------------- 1 | --- 2 | title : Improving your predictions through Random Forests 3 | description : "What techniques can you use to improve your predictions even more? One possible way is by making use of the machine learning method Random Forest. After all, a forest is just a collection of trees..." 4 | 5 | 6 | --- type:NormalExercise lang:python xp:100 skills:2 key:05b740fff1 7 | ## A Random Forest analysis in Python 8 | A detailed study of Random Forests would take this tutorial a bit too far. However, since it's an often-used machine learning technique, gaining a general understanding of it in Python won't hurt. 9 | 10 | In layman's terms, the Random Forest technique handles the overfitting problem you faced with decision trees. It grows multiple (very deep) classification trees using the training set. At the time of prediction, each tree is used to come up with a prediction and every outcome is counted as a vote. For example, if you have trained 3 trees, with 2 saying a passenger in the test set will survive and 1 saying he will not, the passenger will be classified as a survivor. This approach of overtraining trees, but having the majority's vote count as the actual classification decision, avoids overfitting. 11 | 12 | Building a random forest in Python looks almost the same as building a decision tree, so we can jump right to it. There are two key differences, however. First, a different class is used. Second, a new argument is necessary. Also, we need to import the necessary library from scikit-learn. 13 | 14 | - Use the `RandomForestClassifier()` class instead of the `DecisionTreeClassifier()` class. 15 | - `n_estimators` needs to be set when using the `RandomForestClassifier()` class. This argument allows you to set the number of trees you wish to plant and average over. 16 | 17 | The latest training and testing data are preloaded for you. 18 | 19 | 20 | *** =instructions 21 | - Build the random forest with `n_estimators` set to `100`. 22 | - Fit your random forest model with inputs `features_forest` and `target`. 23 | - Compute the classifier predictions on the selected test set features. 24 | 25 | 26 | *** =hint 27 | 28 | - When computing the predictions you can use the `.predict()` method just like you did with decision trees! 29 | - To compute the score use the `.score()` method with the correct arguments. Consult your previous work from CH2 if you don't recall the syntax. 
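If you want to try the fit/predict cycle outside the exercise first, here is a minimal, self-contained sketch. The toy arrays below are invented purely for illustration; the exercise itself uses the preloaded `features_forest` and `target`:

```
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Toy data: six observations, two features, binary target (illustrative only)
X = np.array([[1, 0], [2, 1], [3, 0], [1, 1], [2, 0], [3, 1]])
y = np.array([0, 1, 0, 1, 0, 1])

# n_estimators sets how many trees are grown and voted over
toy_forest = RandomForestClassifier(n_estimators = 10, random_state = 1)
toy_forest = toy_forest.fit(X, y)
print(toy_forest.predict(X))
```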
30 | 31 | *** =pre_exercise_code 32 | ```{python} 33 | import pandas as pd 34 | import numpy as np 35 | import sklearn as sk 36 | from sklearn import tree 37 | from sklearn.ensemble import RandomForestClassifier 38 | 39 | 40 | train = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv") 41 | test = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv") 42 | 43 | train["Sex"][train["Sex"] == "male"] = 0 44 | train["Sex"][train["Sex"] == "female"] = 1 45 | 46 | train["Embarked"] = train["Embarked"].fillna("S") 47 | 48 | train["Embarked"][train["Embarked"] == "S"] = 0 49 | train["Embarked"][train["Embarked"] == "C"] = 1 50 | train["Embarked"][train["Embarked"] == "Q"] = 2 51 | 52 | train["Age"] = train["Age"].fillna(train["Age"].median()) 53 | 54 | target = train["Survived"].values 55 | 56 | features_two = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values 57 | my_tree_two = tree.DecisionTreeClassifier(max_depth = 10, min_samples_split = 5, random_state = 1) 58 | my_tree_two = my_tree_two.fit(features_two, target) 59 | 60 | 61 | test["Sex"][test["Sex"] == "male"] = 0 62 | test["Sex"][test["Sex"] == "female"] = 1 63 | 64 | test["Embarked"] = test["Embarked"].fillna("S") 65 | 66 | test["Embarked"][test["Embarked"] == "S"] = 0 67 | test["Embarked"][test["Embarked"] == "C"] = 1 68 | test["Embarked"][test["Embarked"] == "Q"] = 2 69 | 70 | test["Age"] = test["Age"].fillna(test["Age"].median()) 71 | 72 | test.Fare[152] = test.Fare.median() 73 | 74 | ``` 75 | 76 | *** =sample_code 77 | ```{python} 78 | 79 | # Import the `RandomForestClassifier` 80 | from sklearn.ensemble import RandomForestClassifier 81 | 82 | # We want the Pclass, Age, Sex, Fare, SibSp, Parch, and Embarked variables 83 | features_forest = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values 84 | 85 | # Building and fitting my_forest 86 | forest = RandomForestClassifier(max_depth = 10, min_samples_split=2, n_estimators = ___, random_state = 1) 87 | my_forest = forest.fit(___, ___) 88 | 89 | # Print the score of the fitted random forest 90 | print(my_forest.score(features_forest, target)) 91 | 92 | # Compute predictions on our test set features, then print the length of the prediction vector 93 | test_features = test[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values 94 | pred_forest = my_forest.predict(___) 95 | print(len(pred_forest)) 96 | 97 | ``` 98 | 99 | *** =solution 100 | ```{python} 101 | 102 | # Import the `RandomForestClassifier` 103 | from sklearn.ensemble import RandomForestClassifier 104 | 105 | # We want the Pclass, Age, Sex, Fare, SibSp, Parch, and Embarked variables 106 | features_forest = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values 107 | 108 | # Building and fitting my_forest 109 | forest = RandomForestClassifier(max_depth = 10, min_samples_split=2, n_estimators = 100, random_state = 1) 110 | my_forest = forest.fit(features_forest, target) 111 | 112 | # Print the score of the fitted random forest 113 | print(my_forest.score(features_forest, target)) 114 | 115 | # Compute predictions on our test set features, then print the length of the prediction vector 116 | test_features = test[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values 117 | pred_forest = my_forest.predict(test_features) 118 | print(len(pred_forest)) 119 | 120 | ``` 121 | 122 | *** =sct 123 | 124 | ```{python} 125 | test_function("sklearn.ensemble.RandomForestClassifier", args=None, 
126 | incorrect_msg = "Don't forget to import `RandomForestClassifier` and use it to initialize your random forest.") 127 | test_object("features_forest", 128 | incorrect_msg = "Make sure to select the specified features in the right order. These should come from the train dataset!") 129 | test_function("print",1, args=None, 130 | incorrect_msg = "It looks like the score wasn't computed exactly right. Make sure to use `features_forest` and `target` as arguments.") 131 | test_object("test_features", 132 | incorrect_msg = "Make sure to select the specified features in the right order. These should come from the test dataset!") 133 | test_function("print",2, args=None, 134 | incorrect_msg = "It seems that there is an incorrect number of predictions in pred_forest. Make sure to use `test_features` when computing the predictions.") 135 | ``` 136 | 137 | --- type:NormalExercise lang:python xp:100 skills:2 key:146563d1e8 138 | ## Interpreting and Comparing 139 | 140 | Remember how we looked at the `.feature_importances_` attribute for the decision trees? Well, you can request the same attribute from your random forest as well and interpret the relevance of the included variables. 141 | You might also want to compare the models in some quick and easy way. For this, we can use the `.score()` method. The `.score()` method takes the features data and the target vector and computes the mean accuracy of your model. You can apply this method to both the forest and the individual trees. Remember, this measure should be high but not extreme, because that would be a sign of overfitting. 142 | 143 | For this exercise, you have `my_forest` and `my_tree_two` available to you. The features and target arrays are also ready for use. 144 | 145 | *** =instructions 146 | - Explore the feature importance for both models. 147 | - Compare the mean accuracy score of the two models. 148 | 149 | *** =hint 150 | 151 | - Make sure that you are applying the commands to `my_forest` and are using the correct arguments. 152 | - Don't forget that `target` and `features_forest` are preloaded for you! 
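As a self-contained reminder of the pattern, here is a toy sketch — the tiny arrays below are invented for illustration and are not the preloaded objects:

```
from sklearn.tree import DecisionTreeClassifier

# Hypothetical toy data, for illustration only
X = [[0, 1], [1, 1], [0, 0], [1, 0]]
y = [0, 1, 0, 1]

clf = DecisionTreeClassifier(random_state = 1).fit(X, y)
print(clf.feature_importances_)  # one value per feature; they sum to 1
print(clf.score(X, y))           # mean accuracy on the data you pass in
```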
153 | 154 | *** =pre_exercise_code 155 | ```{python} 156 | import random 157 | random.seed(1) 158 | 159 | import pandas as pd 160 | import numpy as np 161 | import sklearn as sk 162 | from sklearn import tree 163 | from sklearn.ensemble import RandomForestClassifier 164 | 165 | train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv" 166 | train = pd.read_csv(train_url) 167 | test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv" 168 | test = pd.read_csv(test_url) 169 | 170 | train["Sex"][train["Sex"] == "male"] = 0 171 | train["Sex"][train["Sex"] == "female"] = 1 172 | train["Embarked"] = train["Embarked"].fillna("S") 173 | train["Embarked"][train["Embarked"] == "S"] = 0 174 | train["Embarked"][train["Embarked"] == "C"] = 1 175 | train["Embarked"][train["Embarked"] == "Q"] = 2 176 | train["Age"] = train["Age"].fillna(train["Age"].median()) 177 | 178 | target = train["Survived"].values 179 | 180 | features_two = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values 181 | my_tree_two = tree.DecisionTreeClassifier(max_depth = 10, min_samples_split = 5, random_state = 1) 182 | my_tree_two = my_tree_two.fit(features_two, target) 183 | 184 | features_forest = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values 185 | forest = RandomForestClassifier(max_depth = 10, min_samples_split=2, n_estimators=100, random_state = 1) 186 | my_forest = forest.fit(features_forest, target) 187 | 188 | ``` 189 | 190 | 191 | *** =sample_code 192 | ```{python} 193 | #Request and print the `.feature_importances_` attribute 194 | print(my_tree_two.feature_importances_) 195 | print() 196 | 197 | #Compute and print the mean accuracy score for both models 198 | print(my_tree_two.score(features_two, target)) 199 | print() 200 | ``` 201 | 202 | *** =solution 203 | ```{python} 204 | #Request and print the `.feature_importances_` attribute 205 | print(my_tree_two.feature_importances_) 206 | print(my_forest.feature_importances_) 207 | 208 | #Compute and print the mean accuracy score for both models 209 | print(my_tree_two.score(features_two, target)) 210 | print(my_forest.score(features_forest, target)) 211 | ``` 212 | 213 | *** =sct 214 | 215 | ```{python} 216 | test_function("print", 1, args=None, 217 | incorrect_msg = "You don't need to edit the given code. Instead, leave it as-is and use it as a hint for your solution.") 218 | test_function("print", 2, args=None, 219 | incorrect_msg = "Use the given code as a hint on how to complete the task. Your solution should look the same, except with `my_forest` as the object of investigation!") 220 | test_function("print", 3, args=None, 221 | incorrect_msg = "You don't need to edit the given code. Instead, leave it as-is and use it as a hint for your solution.") 222 | test_function("print", 4, args=None, 223 | incorrect_msg = "Use the given code as a hint on how to complete the task. Your solution should look the same, except with `my_forest` as the object of investigation!") 224 | ``` 225 | 226 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2 key:db79e9fe21 227 | ## Conclude and Submit 228 | 229 | Based on your findings in the previous exercise, determine which feature was most important, and for which model. 230 | After this final exercise, you will be able to submit your random forest model to Kaggle! Use `my_forest`, `my_tree_two`, and `feature_importances_` to answer the question. 231 | 232 | *** =hint 233 | 234 | - By significance, we simply mean the magnitude of the values. 
For each feature, you should see a decimal. The largest value indicates the greatest significance for the respective feature. 235 | 236 | *** =pre_exercise_code 237 | 238 | ```{python} 239 | import pandas as pd 240 | import numpy as np 241 | import sklearn as sk 242 | from sklearn import tree 243 | from sklearn.ensemble import RandomForestClassifier 244 | 245 | train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv" 246 | train = pd.read_csv(train_url) 247 | test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv" 248 | test = pd.read_csv(test_url) 249 | 250 | train["Sex"][train["Sex"] == "male"] = 0 251 | train["Sex"][train["Sex"] == "female"] = 1 252 | train["Embarked"] = train["Embarked"].fillna("S") 253 | train["Embarked"][train["Embarked"] == "S"] = 0 254 | train["Embarked"][train["Embarked"] == "C"] = 1 255 | train["Embarked"][train["Embarked"] == "Q"] = 2 256 | train["Age"] = train["Age"].fillna(train["Age"].median()) 257 | 258 | target = train["Survived"].values 259 | 260 | features_two = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values 261 | my_tree_two = tree.DecisionTreeClassifier(max_depth = 10, min_samples_split = 5) 262 | my_tree_two = my_tree_two.fit(features_two, target) 263 | 264 | features_forest = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values 265 | forest = RandomForestClassifier(max_depth = 10, min_samples_split=2, n_estimators=100) 266 | my_forest = forest.fit(features_forest, target) 267 | 268 | ``` 269 | 270 | *** =instructions 271 | - `The most important feature was "Age", but it was more significant for "my_tree_two"` 272 | - `The most important feature was "Sex", but it was more significant for "my_tree_two"` 273 | - `The most important feature was "Sex", but it was more significant for "my_forest"` 274 | - `The most important feature was "Age", but it was more significant for "my_forest"` 275 | 276 | *** =sct 277 | 278 | ```{python} 279 | 280 | msg1 = "Wrong choice. Check the hint for some help." 281 | msg2 = "Wonderful! You are now at the end of this tutorial and ready to start improving the results yourself." 282 | msg3 = msg1 283 | msg4 = msg1 284 | test_mc(correct = 2, msgs = [msg1, msg2, msg3, msg4]) 285 | 286 | success_msg("Congrats on completing the course! Now that you have created your first random forest and used it for prediction, take a look at how well it does in the Kaggle competition. [Download your csv file](https://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/my_solution_forest.csv). Having learned about decision trees and random forests, you can begin participating in some other Kaggle competitions as well. Good luck and have fun!") 287 | 288 | ``` 289 | -------------------------------------------------------------------------------- /chapter1.md: -------------------------------------------------------------------------------- 1 | --- 2 | title : Getting Started with Python 3 | description : In this chapter, we will go through the essential steps that you will need to take before beginning to build predictive models. 4 | 5 | --- type:NormalExercise lang:python xp:100 skills:2 key:49f71e27bd 6 | 7 | ## How it works 8 | Welcome to our Kaggle Machine Learning Tutorial. In this tutorial, you will explore how to tackle the Kaggle Titanic competition using Python and Machine Learning. 
In case you're new to Python, it's recommended that you first take our free [Introduction to Python for Data Science Tutorial](https://www.datacamp.com/courses/intro-to-python-for-data-science). Furthermore, while not required, familiarity with machine learning techniques is a plus, so you can get the most out of this tutorial. 9 | 10 | In the editor on the right, you should type Python code to solve the exercises. When you hit the 'Submit Answer' button, every line of code is interpreted and executed by Python, and you get a message indicating whether or not your code was correct. The output of your Python code is shown in the console in the lower right corner. Python makes use of the `#` sign to add comments; these lines are not run as Python code, so they will not influence your result. 11 | 12 | You can also execute Python commands straight in the console. This is a good way to experiment with Python code, as your submission is not checked for correctness. 13 | 14 | 15 | *** =instructions 16 | - In the editor to the right, you see some Python code and annotations. This is what a typical exercise will look like. 17 | - To complete the exercise and see how the interactive environment works, add the code to compute `y` and hit the `Submit Answer` button. Don't forget to print the result. 18 | 19 | 20 | *** =hint 21 | 22 | Just add a line of Python code that calculates the product of 6 and 9, just like the example in the sample code! 23 | 24 | *** =pre_exercise_code 25 | ```{python} 26 | # no pre_exercise_code 27 | ``` 28 | 29 | *** =sample_code 30 | ```{python} 31 | #Compute x = 4 * 3 and print the result 32 | x = 4 * 3 33 | print(x) 34 | 35 | #Compute y = 6 * 9 and print the result 36 | ``` 37 | 38 | *** =solution 39 | ```{python} 40 | #Compute x = 4 * 3 and print the result 41 | x = 4 * 3 42 | print(x) 43 | 44 | #Compute y = 6 * 9 and print the result 45 | y = 6 * 9 46 | print(y) 47 | ``` 48 | 49 | *** =sct 50 | ```{python} 51 | 52 | msg = "Don't forget to assign the correct value to y" 53 | test_object("y", 54 | undefined_msg = msg, 55 | incorrect_msg = msg) 56 | 57 | msg = "Print out the resulting object, `y`!" 58 | test_function("print",2, 59 | not_called_msg = msg, 60 | incorrect_msg = msg, 61 | args=None) 62 | 63 | success_msg("Awesome! See how the console shows the result of the Python code you submitted? Now that you're familiar with the interface, let's get down to business!") 64 | ``` 65 | 66 | --- type:NormalExercise lang:python xp:100 skills:2 key:18a0d06d73 67 | ## Get the Data with Pandas 68 | When the Titanic sank, 1502 of the 2224 passengers and crew were killed. One of the main reasons for this high level of casualties was the lack of lifeboats on this self-proclaimed "unsinkable" ship. 69 | 70 | Those who have seen the movie know that some individuals were more likely to survive the sinking (lucky Rose) than others (poor Jack). In this course, you will learn how to apply machine learning techniques to predict a passenger's chance of surviving using Python. 71 | 72 | Let's start by loading the training and testing sets into your Python environment. You will use the training set to build your model, and the test set to validate it. The data is stored on the web as `csv` files; their URLs are already available as character strings in the sample code. You can load this data with the `read_csv()` method from the Pandas library. 73 | 74 | *** =instructions 75 | - First, import the Pandas library as `pd`. 76 | - Load the test data similarly to how the train data is loaded. 
77 | - Inspect the first couple of rows of the loaded dataframes using the `.head()` method with the code provided. 78 | 79 | *** =hint 80 | - You can load in the training set with `train = pd.read_csv(train_url)`. 81 | - To print a variable to the console, use the print function on a new line. 82 | 83 | *** =pre_exercise_code 84 | ```{python} 85 | 86 | ``` 87 | 88 | *** =sample_code 89 | ```{python} 90 | # Import the Pandas library 91 | 92 | # Load the train and test datasets to create two DataFrames 93 | train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv" 94 | train = pd.read_csv(train_url) 95 | 96 | test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv" 97 | 98 | #Print the `head` of the train and test dataframes 99 | print(train.head()) 100 | print(test.head()) 101 | ``` 102 | *** =solution 103 | ```{python} 104 | # Import the Pandas library 105 | import pandas as pd 106 | 107 | # Load the train and test datasets to create two DataFrames 108 | train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv" 109 | train = pd.read_csv(train_url) 110 | 111 | test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv" 112 | test = pd.read_csv(test_url) 113 | 114 | #Print the `head` of the train and test dataframes 115 | print(train.head()) 116 | print(test.head()) 117 | ``` 118 | 119 | *** =sct 120 | 121 | ```{python} 122 | msg = "Have you correctly imported the `pandas` package? Use the alias `pd`." 123 | test_import("pandas", not_imported_msg = msg, incorrect_as_msg = msg) 124 | 125 | msg = "Do not touch the code that specifies the URLs of the training and test set csvs." 126 | test_object("train_url", undefined_msg = msg, incorrect_msg = msg) 127 | test_object("test_url", undefined_msg = msg, incorrect_msg = msg) 128 | 129 | msg = "Make sure you are using the `read_csv()` function correctly" 130 | test_function("pandas.read_csv", 1, 131 | args=None, 132 | not_called_msg = msg, 133 | incorrect_msg = msg,) 134 | test_function("pandas.read_csv", 2, 135 | args=None, 136 | not_called_msg = msg, 137 | incorrect_msg = msg) 138 | 139 | #msg = "Don't forget to print the first few rows of `train` with the `.head()` method" 140 | #test_function("print", 1, not_called_msg = msg, incorrect_msg = msg) 141 | 142 | #msg = "Don't forget to print the first few rows of `test` with the `.head()` method" 143 | #test_function("print", 2, not_called_msg = msg, incorrect_msg = msg) 144 | 145 | success_msg("Well done! Now that your data is loaded in, let's see if you can understand it.") 146 | ``` 147 | 148 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2 key:7f5cb08579 149 | ## Understanding your data 150 | 151 | Before starting with the actual analysis, it's important to understand the structure of your data. Both `test` and `train` are DataFrame objects, the way pandas represents datasets. You can easily explore a DataFrame using the `.describe()` method. `.describe()` summarizes the columns/features of the DataFrame, including the count of observations, mean, max, and so on. Another useful trick is to look at the dimensions of the DataFrame. This is done by requesting the `.shape` attribute of your DataFrame object. (e.g., `your_data.shape`) 152 | 153 | The training and test sets are already available in the workspace, as `train` and `test`. Apply the `.describe()` method and print the `.shape` attribute of the training set. Which of the following statements is correct? 
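Since `train` and `test` are preloaded, you can, for instance, run the following two lines in the console (shown as a sketch; the output itself is left for you to inspect):

```
train.describe()  # summary statistics per numeric column, including the count
train.shape       # (number of rows, number of columns)
```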
154 | 155 | *** =instructions 156 | - The training set has 891 observations and 12 variables, count for Age is 714. 157 | - The training set has 418 observations and 11 variables, count for Age is 891. 158 | - The testing set has 891 observations and 11 variables, count for Age is 891. 159 | - The testing set has 418 observations and 12 variables, count for Age is 714. 160 | 161 | *** =hint 162 | To see the description of the `test` variable, try `test.describe()`. 163 | 164 | *** =pre_exercise_code 165 | ```{python} 166 | import pandas as pd 167 | train = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv") 168 | test = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv") 169 | ``` 170 | 171 | *** =sct 172 | 173 | ```{python} 174 | 175 | msg1 = "Great job!" 176 | msg2 = "Wrong, try again. Maybe have a look at the hint." 177 | msg3 = "Not so good... Maybe have a look at the hint." 178 | msg4 = "Incorrect. Maybe have a look at the hint." 179 | test_mc(correct = 1, msgs = [msg1, msg2, msg3, msg4]) 180 | 181 | success_msg("Well done! Now move on and explore some of the features in more detail.") 182 | 183 | ``` 184 | 185 | --- type:NormalExercise lang:python xp:100 skills:1 key:1eeaaeb294 186 | ## Rose vs Jack, or Female vs Male 187 | 188 | How many people in your training set survived the Titanic disaster? To see this, you can use the `value_counts()` method in combination with standard bracket notation to select a single column of a DataFrame: 189 | 190 | ``` 191 | # absolute numbers 192 | train["Survived"].value_counts() 193 | 194 | # percentages 195 | train["Survived"].value_counts(normalize = True) 196 | ``` 197 | 198 | If you run these commands in the console, you'll see that 549 individuals died (62%) and 342 survived (38%). A simple way to predict heuristically could be: "majority wins". This would mean that you will predict every unseen observation to not survive. 199 | 200 | To dive in a little deeper, we can perform similar counts and percentage calculations on subsets of the Survived column. For example, maybe gender could play a role as well? You can explore this using the `.value_counts()` method for a two-way comparison on the number of males and females that survived, with this syntax: 201 | 202 | ``` 203 | train["Survived"][train["Sex"] == 'male'].value_counts() 204 | train["Survived"][train["Sex"] == 'female'].value_counts() 205 | ``` 206 | 207 | To get proportions, you can again pass in the argument `normalize = True` to the `.value_counts()` method. 208 | 209 | *** =instructions 210 | - Calculate and print the survival rates in absolute numbers using the `value_counts()` method. 211 | - Calculate and print the survival rates as proportions by setting the `normalize` argument to `True`. 212 | - Repeat the same calculations but on subsets of survival based on Sex. 213 | 214 | *** =hint 215 | - The code for the first four tasks is already given in the assignment! 216 | - Think about the `normalize` argument, and don't forget to print. 
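As a side note, not required for the exercise: because `Survived` is coded 0/1, pandas can produce the same two-way comparison in a single line. A sketch, assuming the preloaded `train` DataFrame:

```
# The mean of a 0/1 column per group equals the survival proportion per sex
print(train.groupby("Sex")["Survived"].mean())
```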
217 | 218 | *** =pre_exercise_code 219 | ```{python} 220 | import pandas as pd 221 | train = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv") 222 | test = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv") 223 | ``` 224 | 225 | *** =sample_code 226 | ```{python} 227 | 228 | # Passengers that survived vs passengers that passed away 229 | print() 230 | 231 | # As proportions 232 | print() 233 | 234 | # Males that survived vs males that passed away 235 | print() 236 | 237 | # Females that survived vs Females that passed away 238 | print() 239 | 240 | # Normalized male survival 241 | print() 242 | 243 | # Normalized female survival 244 | print() 245 | 246 | ``` 247 | 248 | *** =solution 249 | ```{python} 250 | 251 | # Passengers that survived vs passengers that passed away 252 | print(train.Survived.value_counts()) 253 | 254 | # As proportions 255 | print(train["Survived"].value_counts(normalize = True)) 256 | 257 | # Males that survived vs males that passed away 258 | print(train["Survived"][train["Sex"] == 'male'].value_counts()) 259 | 260 | # Females that survived vs Females that passed away 261 | print(train["Survived"][train["Sex"] == 'female'].value_counts()) 262 | 263 | # Normalized male survival 264 | print(train["Survived"][train["Sex"] == 'male'].value_counts(normalize = True)) 265 | 266 | # Normalized female survival 267 | print(train["Survived"][train["Sex"] == 'female'].value_counts(normalize = True)) 268 | ``` 269 | 270 | *** =sct 271 | 272 | ```{python} 273 | msg = "Make sure you are using the `.value_counts()` method correctly." 274 | test_function("print", 1, 275 | not_called_msg= msg, 276 | incorrect_msg = msg) 277 | 278 | msg = "Don't forget to set `normalize = True` when using `.value_counts()`." 279 | test_function("print", 2, 280 | not_called_msg = msg, 281 | incorrect_msg = msg) 282 | 283 | msg = "Make sure you are partitioning by males." 284 | test_function("print", 3, 285 | not_called_msg = msg, 286 | incorrect_msg = msg) 287 | 288 | msg = "Make sure you are partitioning by females." 289 | test_function("print", 4, 290 | not_called_msg= msg, 291 | incorrect_msg = msg) 292 | 293 | msg = "Don't forget to set `normalize = True` when using `.value_counts()`." 294 | test_function("print", 5, 295 | not_called_msg = msg, 296 | incorrect_msg = msg) 297 | 298 | test_function("print", 6, 299 | not_called_msg = msg, 300 | incorrect_msg = msg) 301 | 302 | success_msg("Well done! It looks like it makes sense to predict that all females will survive, and all men will die.") 303 | 304 | ``` 305 | 306 | --- type:NormalExercise lang:python xp:100 skills:2 key:b8f71cf4de 307 | ## Does age play a role? 308 | 309 | Another variable that could influence survival is age, since it's probable that children were saved first. You can test this by creating a new column with a categorical variable `Child`. `Child` will take the value 1 in cases where age is less than 18, and a value of 0 in cases where age is greater than or equal to 18. 310 | 311 | To add this new variable, you need to do two things: (i) create a new column, and (ii) provide the values for each observation (i.e., row) based on the age of the passenger. 312 | 313 | Adding a new column with Pandas in Python is easy and can be done via the following syntax: 314 | 315 | ``` 316 | your_data["new_var"] = 0 317 | ``` 318 | 319 | This code would create a new column in the `train` DataFrame titled `new_var` with `0` for each observation. 
320 | 321 | To set the values based on the age of the passenger, you make use of a boolean test inside the square bracket operator. With the `[]`-operator you create a subset of rows and assign a value to a certain variable of that subset of observations. For example, 322 | 323 | ``` 324 | train["new_var"][train["Fare"] > 10] = 1 325 | ``` 326 | 327 | would give a value of `1` to the variable `new_var` for the subset of passengers whose fares are greater than 10. Remember that `new_var` has a value of `0` for all other values (including missing values). 328 | 329 | A new column called `Child` in the `train` data frame has been created for you that takes the value `NaN` for all observations. 330 | 331 | *** =instructions 332 | - Set the values of `Child` to `1` if the passenger's age is less than 18 years. 333 | - Then assign the value `0` to observations where the passenger's age is 18 years or greater in the new `Child` column. 334 | - Compare the normalized survival rates for those who are <18 and those who are older. Use code similar to what you had in the previous exercise. 335 | 336 | *** =hint 337 | Suppose you wanted to add a new column `clothes` to the `test` set, then give all males the value `"pants"` and the others `"skirt"`: 338 | 339 | ``` 340 | test["clothes"] = "skirt" 341 | 342 | test["clothes"][test["Sex"] == "male"] = "pants" 343 | ``` 344 | 345 | *** =pre_exercise_code 346 | 347 | ```{python} 348 | import pandas as pd 349 | train = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv") 350 | test = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv") 351 | ``` 352 | 353 | *** =sample_code 354 | 355 | ```{python} 356 | # Create the column Child and assign to 'NaN' 357 | train["Child"] = float('NaN') 358 | 359 | # Assign 1 to passengers under 18, 0 to those 18 or older. Print the new column. 360 | 361 | 362 | 363 | 364 | # Print normalized Survival Rates for passengers under 18 365 | print(train["Survived"][train["Child"] == 1].value_counts(normalize = True)) 366 | 367 | # Print normalized Survival Rates for passengers 18 or older 368 | 369 | 370 | ``` 371 | 372 | *** =solution 373 | 374 | ```{python} 375 | # Create the column Child and assign to 'NaN' 376 | train["Child"] = float('NaN') 377 | 378 | # Assign 1 to passengers under 18, 0 to those 18 or older. Print the new column. 379 | train["Child"][train["Age"] < 18] = 1 380 | train["Child"][train["Age"] >= 18] = 0 381 | print(train["Child"]) 382 | 383 | # Print normalized Survival Rates for passengers under 18 384 | print(train["Survived"][train["Child"] == 1].value_counts(normalize = True)) 385 | 386 | # Print normalized Survival Rates for passengers 18 or older 387 | print(train["Survived"][train["Child"] == 0].value_counts(normalize = True)) 388 | 389 | ``` 390 | 391 | *** =sct 392 | ```{python} 393 | msg = "Remember to print the new column `Child`. It should be equal to 1 when the passenger's age is under 18 and 0 if the passenger's age is 18 or greater." 394 | test_function("print", 2, 395 | not_called_msg = msg, 396 | incorrect_msg = msg) 397 | 398 | msg = "Compute the survival proportions for those 18 OR OLDER. Refer to the code provided for passengers under 18." 399 | test_function("print", 3, 400 | not_called_msg = msg, 401 | incorrect_msg = msg) 402 | 403 | success_msg("Well done! 
As you can see from the survival proportions, age does certainly seem to play a role.") 404 | ``` 405 | 406 | --- type:NormalExercise lang:python xp:100 skills:2 key:f02305d182 407 | ## First Prediction 408 | 409 | In one of the previous exercises, you discovered that in your training set, females had over a 50% chance of surviving and males had less than a 50% chance of surviving. Hence, you could use this information for your first prediction: all females in the test set survive and all males in the test set die. 410 | 411 | You use your test set for validating your predictions. You might have seen that contrary to the training set, the test set has no `Survived` column. You add such a column using your predicted values. Next, when uploading your results, Kaggle will use this variable (= your predictions) to score your performance. 412 | 413 | *** =instructions 414 | - Create a variable `test_one`, identical to the dataset `test`. 415 | - Add an additional column, `Survived`, that you initialize to zero. 416 | - Use vector subsetting like in the previous exercise to set the value of `Survived` to 1 for observations whose `Sex` equals `"female"`. 417 | - Print the `Survived` column of predictions from the `test_one` dataset. 418 | 419 | *** =hint 420 | - To create a new variable, `y`, that is a copy of `x`, you can use `y = x.copy()`. 421 | - To initialize a new column `a` in a dataframe `data` to zero, you can use `data['a'] = 0`. 422 | - Have another look at the previous exercise if you're struggling with the third instruction. 423 | 424 | *** =pre_exercise_code 425 | 426 | ```{python} 427 | import pandas as pd 428 | train = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv") 429 | test = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv") 430 | ``` 431 | 432 | *** =sample_code 433 | 434 | ```{python} 435 | # Create a copy of test: test_one 436 | 437 | 438 | # Initialize a Survived column to 0 439 | 440 | 441 | # Set Survived to 1 if Sex equals "female" and print the `Survived` column from `test_one` 442 | ``` 443 | 444 | *** =solution 445 | 446 | ```{python} 447 | # Create a copy of test: test_one 448 | test_one = test.copy() 449 | 450 | # Initialize a Survived column to 0 451 | test_one["Survived"] = 0 452 | 453 | # Set Survived to 1 if Sex equals "female" 454 | test_one["Survived"][test_one["Sex"] == "female"] = 1 455 | print(test_one.Survived) 456 | ``` 457 | 458 | *** =sct 459 | 460 | ```{python} 461 | 462 | test_function("print", 463 | not_called_msg = "Make sure to define the column `Survived` inside `test_one`", 464 | incorrect_msg = "Make sure you are assigning 1 to female and 0 to male passengers") 465 | 466 | success_msg("Well done! If you want, you can already submit these first predictions to Kaggle [by uploading this csv file](http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/ch1_ex4_solution/my_solution.csv). In the next chapter, you will learn how to make more advanced predictions and create your own .csv file from Python.") 467 | ``` 468 | 469 | -------------------------------------------------------------------------------- /chapter2.md: -------------------------------------------------------------------------------- 1 | --- 2 | title : Predicting with Decision Trees 3 | description : After making your first predictions in the previous chapter, it's time to bring you to the next level by using a fundamental concept in machine learning: decision trees. 
4 | 5 | 6 | --- type:NormalExercise lang:python xp:100 skills:2 key:98be5c3225 7 | ## Intro to decision trees 8 | 9 | In the previous chapter, you did all the slicing and dicing yourself to find subsets that have a higher chance of surviving. A decision tree automates this process for you and outputs a classification model or classifier. 10 | 11 | Conceptually, the decision tree algorithm starts with all the data at the root node and scans all the variables for the best one to split on. Once a variable is chosen, you do the split and go down one level (or one node) and repeat. The final nodes at the bottom of the decision tree are known as terminal nodes, and the majority vote of the observations in that node determines how to predict for new observations that end up in that terminal node. 12 | 13 | First, let's import the necessary libraries: 14 | 15 | *** =instructions 16 | - Import the `numpy` library as `np` 17 | - From `sklearn` import the `tree` 18 | 19 | 20 | *** =hint 21 | 22 | - Use the `import` and `as` special keywords when importing `numpy`. 23 | - You can use the `from sklearn import tree` command to import `tree`. 24 | 25 | *** =pre_exercise_code 26 | 27 | ```{python} 28 | import pandas as pd 29 | import numpy as np 30 | import sklearn as sk 31 | from sklearn import tree 32 | train = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv") 33 | test = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv") 34 | 35 | ``` 36 | 37 | *** =sample_code 38 | ```{python} 39 | # Import the Numpy library 40 | 41 | # Import 'tree' from scikit-learn library 42 | from sklearn 43 | ``` 44 | 45 | *** =solution 46 | ```{python} 47 | # Import the Numpy library 48 | import numpy as np 49 | 50 | # Import 'tree' from scikit-learn library 51 | from sklearn import tree 52 | ``` 53 | 54 | *** =sct 55 | 56 | ```{python} 57 | test_import("numpy", same_as = False) 58 | success_msg("OK, your package is loaded now. Time for the real deal.") 59 | ``` 60 | 61 | 62 | --- type:NormalExercise lang:python xp:100 skills:2 key:98092838ce 63 | ## Cleaning and Formatting your Data 64 | 65 | Before you can begin constructing your trees, you need to get your hands dirty and clean the data so that you can use all the features available to you. In the first chapter, we saw that the Age variable had some missing values. Missingness is a whole subject in and of itself, but we will use a simple imputation technique where we substitute each missing value with the median of all the present values. 66 | 67 | ``` 68 | train["Age"] = train["Age"].fillna(train["Age"].median()) 69 | ``` 70 | 71 | Another problem is that the Sex and Embarked variables are categorical but in a non-numeric format. Thus, we will need to assign each class a unique integer so that Python can handle the information. Embarked also has some missing values, which you should impute with the most common class of embarkation, which is `"S"`. 72 | 73 | 74 | *** =instructions 75 | - Assign the integer 1 to all females. 76 | - Impute missing values in `Embarked` with class `S`. Use the `.fillna()` method. 77 | - Replace each class of Embarked with a unique integer: `0` for `S`, `1` for `C`, and `2` for `Q`. 78 | - Print the `Sex` and `Embarked` columns. 79 | 80 | *** =hint 81 | - Use the standard bracket notation to select the appropriate rows and columns, and don't forget the `==` operator. 
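If the bracket-notation recoding feels repetitive, note that pandas also offers a `.map()` method that performs the same substitution in one step. This is only an illustrative aside, shown on a throwaway Series, since the exercise expects the bracket notation:

```
import pandas as pd

s = pd.Series(["male", "female", "male"])
print(s.map({"male": 0, "female": 1}))  # prints 0, 1, 0
```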
82 | 83 | *** =pre_exercise_code 84 | 85 | ```{python} 86 | import pandas as pd 87 | import numpy as np 88 | from sklearn import tree 89 | train = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv") 90 | test = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv") 91 | ``` 92 | 93 | *** =sample_code 94 | ```{python} 95 | # Convert the male and female groups to integer form 96 | train["Sex"][train["Sex"] == "male"] = 0 97 | 98 | # Impute the Embarked variable 99 | train["Embarked"] = 100 | 101 | # Convert the Embarked classes to integer form 102 | train["Embarked"][train["Embarked"] == "S"] = 0 103 | 104 | #Print the Sex and Embarked columns 105 | 106 | ``` 107 | 108 | *** =solution 109 | ```{python} 110 | # Convert the male and female groups to integer form 111 | train["Sex"][train["Sex"] == "male"] = 0 112 | train["Sex"][train["Sex"] == "female"] = 1 113 | 114 | # Impute the Embarked variable 115 | train["Embarked"] = train["Embarked"].fillna("S") 116 | 117 | # Convert the Embarked classes to integer form 118 | train["Embarked"][train["Embarked"] == "S"] = 0 119 | train["Embarked"][train["Embarked"] == "C"] = 1 120 | train["Embarked"][train["Embarked"] == "Q"] = 2 121 | 122 | # Print the Sex and Embarked columns 123 | print(train["Sex"]) 124 | print(train["Embarked"]) 125 | ``` 126 | *** =sct 127 | 128 | ```{python} 129 | 130 | msg = "It looks like you coded the `Sex` variable incorrectly. Make sure to use `0` for male and `1` for female." 131 | test_function("print", 1, 132 | args=None, 133 | not_called_msg = msg, 134 | incorrect_msg = msg,) 135 | 136 | msg = "It looks like you coded the `Embarked` variable incorrectly. Make sure to use `0` for `S`, `1` for `C`, and `2` for `Q`." 137 | test_function("print", 2, 138 | args=None, 139 | not_called_msg = msg, 140 | incorrect_msg = msg,) 141 | 142 | success_msg("Great! Now that the data is cleaned up a bit, you are ready to begin building your first decision tree.") 143 | ``` 144 | 145 | --- type:NormalExercise lang:python xp:100 skills:2 key:2b663996b1 146 | ## Creating your first decision tree 147 | 148 | You will use the `scikit-learn` and `numpy` libraries to build your first decision tree. `scikit-learn` can be used to create `tree` objects from the `DecisionTreeClassifier` class. The methods that we will use take `numpy` arrays as inputs and therefore we will need to create those from the `DataFrame` that we already have. We will need the following to build a decision tree: 149 | 150 | - `target`: A one-dimensional numpy array containing the target/response from the train data. (Survival in your case) 151 | - `features`: A multidimensional numpy array containing the features/predictors from the train data. (e.g., Sex, Age) 152 | 153 | Take a look at the sample code below to see what this would look like: 154 | 155 | ``` 156 | target = train["Survived"].values 157 | 158 | features = train[["Sex", "Age"]].values 159 | 160 | my_tree = tree.DecisionTreeClassifier() 161 | 162 | my_tree = my_tree.fit(features, target) 163 | 164 | ``` 165 | 166 | One way to quickly see the result of your decision tree is to look at the importance of the features that are included. This is done by requesting the `.feature_importances_` attribute of your tree object. Another quick metric is the mean accuracy that you can compute using the `.score()` method with `features_one` and `target` as arguments. 167 | 168 | Ok, time for you to build your first decision tree in Python! 
The train and testing data from chapter 1 are available in your workspace. 169 | 170 | *** =instructions 171 | - Build the `target` and `features_one` numpy arrays. The target will be based on the `Survived` column in `train`. The features 172 | array will be based on the variables Passenger Class, Sex, Age, and Passenger Fare. 173 | - Build a decision tree `my_tree_one` to predict survival using `features_one` and `target`. 174 | - Look at the importance of features in your tree and compute the score. 175 | 176 | *** =hint 177 | - Remember what the target column is in your data and assign it to `target`. 178 | - You can fit and compute the score for your decision tree by passing in the features and target objects you created. 179 | 180 | 181 | 182 | *** =pre_exercise_code 183 | ```{python} 184 | import pandas as pd 185 | import numpy as np 186 | import sklearn as sk 187 | from sklearn import tree 188 | train = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv") 189 | test = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv") 190 | 191 | train["Age"] = train["Age"].fillna(train["Age"].median()) 192 | train["Sex"][train["Sex"] == "male"] = 0 193 | train["Sex"][train["Sex"] == "female"] = 1 194 | 195 | ``` 196 | 197 | *** =sample_code 198 | ```{python} 199 | # Print the train data to see the available features 200 | print(train) 201 | 202 | # Create the target and features numpy arrays: target, features_one 203 | target = train[___].values 204 | features_one = train[["Pclass", "Sex", "Age", "Fare"]].values 205 | 206 | # Fit your first decision tree: my_tree_one 207 | my_tree_one = tree.DecisionTreeClassifier() 208 | my_tree_one = my_tree_one.fit(___, ___) 209 | 210 | # Look at the importance and score of the included features 211 | print(my_tree_one.feature_importances_) 212 | print(my_tree_one.score(___, ___)) 213 | ``` 214 | 215 | *** =solution 216 | 217 | ```{python} 218 | # Print the train data to see the available features 219 | print(train) 220 | 221 | # Create the target and features numpy arrays: target, features_one 222 | target = train["Survived"].values 223 | features_one = train[["Pclass", "Sex", "Age", "Fare"]].values 224 | 225 | # Fit your first decision tree: my_tree_one 226 | my_tree_one = tree.DecisionTreeClassifier() 227 | my_tree_one = my_tree_one.fit(features_one, target) 228 | 229 | # Look at the importance and score of the included features 230 | print(my_tree_one.feature_importances_) 231 | print(my_tree_one.score(features_one, target)) 232 | 233 | ``` 234 | 235 | 236 | *** =sct 237 | 238 | ```{python} 239 | msg = "`target` should be the `Survived` variable from the train dataset. Follow the code in the discussion for help." 240 | test_object("target", 241 | undefined_msg = msg, 242 | incorrect_msg = msg) 243 | 244 | msg = "Make sure that you are including the correct features in the stated order. Follow the code in the discussion for help." 245 | test_object("features_one", 246 | undefined_msg = msg, 247 | incorrect_msg = msg) 248 | 249 | msg = "It looks like the score was not computed correctly. Try re-submitting the code!" 250 | test_function("print",3, 251 | args=None, 252 | not_called_msg =msg, 253 | incorrect_msg = msg) 254 | 255 | success_msg("Well done! 
Time to investigate your decision tree a bit more.") 256 | ``` 257 | 258 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2 key:87b643ee96 259 | ## Interpreting your decision tree 260 | 261 | The `feature_importances_` attribute makes it simple to interpret the significance of the predictors you include. Based on your decision tree, what variable plays the most important role in determining whether or not a passenger survived? Your model (`my_tree_one`) is available in the console. 262 | 263 | *** =instructions 264 | - Passenger Class 265 | - Sex/Gender 266 | - Passenger Fare 267 | - Age 268 | 269 | *** =hint 270 | Have a close look at the `feature_importances_` attribute of your tree. What variable has the greatest coefficient? 271 | 272 | *** =pre_exercise_code 273 | 274 | ```{python} 275 | import pandas as pd 276 | import numpy as np 277 | import sklearn as sk 278 | from sklearn import tree 279 | train = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv") 280 | test = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv") 281 | 282 | train["Age"] = train["Age"].fillna(train["Age"].median()) 283 | train["Sex"][train["Sex"] == "male"] = 0 284 | train["Sex"][train["Sex"] == "female"] = 1 285 | 286 | target = train["Survived"].values 287 | features_one = train[["Pclass", "Sex", "Age", "Fare"]].values 288 | my_tree_one = tree.DecisionTreeClassifier(random_state = 1) 289 | my_tree_one = my_tree_one.fit(features_one, target) 290 | 291 | 292 | ``` 293 | 294 | *** =sct 295 | 296 | ```{python} 297 | msg1 = "Wrong choice. Check the hint for some help." 298 | msg3 = "Bellissimo! Time to make a prediction and submit it to Kaggle!" 299 | msg2 = msg1 300 | msg4 = msg1 301 | test_mc(correct = 3, msgs = [msg1, msg2, msg3, msg4]) 302 | 303 | success_msg("Looks like Passenger Fare has the most significance in determining survival based on your model. Now let's move on to making your first submission to Kaggle!") 304 | 305 | ``` 306 | 307 | 308 | --- type:NormalExercise lang:python xp:100 skills:2 key:4a70446ddd 309 | ## Predict and submit to Kaggle 310 | 311 | To send a submission to Kaggle, you need to predict the survival rates for the observations in the test set. In the last exercise of the previous chapter, we created simple predictions based on a single subset. Luckily, with our decision tree, we can make use of some simple functions to "generate" our answer without having to manually perform subsetting. 312 | 313 | First, you make use of the `.predict()` method. You call it on the fitted model (`my_tree_one`) and provide it with the feature values from the dataset for which predictions need to be made (`test`). To extract the features, we will need to create a numpy array in the same way as we did when training the model. However, we need to take care of a small but important problem first. There is a missing value in the Fare feature that needs to be imputed. 314 | 315 | Next, you need to make sure your output is in line with the submission requirements of Kaggle: a csv file with exactly 418 entries and two columns: `PassengerId` and `Survived`. Then use the code provided to make a new data frame using `DataFrame()`, and create a csv file using the `to_csv()` method from Pandas. 316 | 317 | *** =instructions 318 | - Impute the missing value for Fare in row 153 with the median of the column. 319 | - Make a prediction on the test set using the `.predict()` method and `my_tree_one`. Assign the result to `my_prediction`. 
--- type:NormalExercise lang:python xp:100 skills:2 key:4a70446ddd
## Predict and submit to Kaggle

To send a submission to Kaggle you need to predict the survival rates for the observations in the test set. In the last exercise of the previous chapter, we created simple predictions based on a single subset. Luckily, with our decision tree, we can make use of some simple functions to "generate" our answer without having to manually perform subsetting.

First, you make use of the `.predict()` method: you call it on your model (`my_tree_one`) and pass it the feature values from the dataset for which predictions need to be made (`test`). To extract those features we will need to create a numpy array in the same way as we did when training the model. However, we need to take care of a small but important problem first: there is a missing value in the Fare feature that needs to be imputed.

Next, you need to make sure your output is in line with the submission requirements of Kaggle: a csv file with exactly 418 entries and two columns: `PassengerId` and `Survived`. Then use the code provided to make a new data frame using `DataFrame()`, and create a csv file using the `to_csv()` method from pandas.

*** =instructions
- Impute the missing value for Fare in row 153 (index 152) with the median of the column.
- Make a prediction on the test set using the `.predict()` method and `my_tree_one`. Assign the result to `my_prediction`.
- Create a data frame `my_solution` containing the solution and the passenger ids from the test set. Make sure the solution is in line with the standards set forth by Kaggle by naming the column appropriately.

*** =hint
- When doing the imputation use the `Fare` feature and the `.median()` method.
- Make sure to select the Pclass, Sex, Age, and Fare features in this exact order. Don't change the skeleton of the solution!

*** =pre_exercise_code
```{python}
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn import tree
train = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv")
test = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv")

target = train["Survived"].values

train["Age"] = train["Age"].fillna(train["Age"].median())
train.loc[train["Sex"] == "male", "Sex"] = 0
train.loc[train["Sex"] == "female", "Sex"] = 1

features_one = train[["Pclass", "Sex", "Age", "Fare"]].values
my_tree_one = tree.DecisionTreeClassifier(random_state = 1)
my_tree_one = my_tree_one.fit(features_one, target)

test["Age"] = test["Age"].fillna(test["Age"].median())
test.loc[test["Sex"] == "male", "Sex"] = 0
test.loc[test["Sex"] == "female", "Sex"] = 1
```

*** =sample_code
```{python}
# Impute the missing value with the median
test.loc[152, "Fare"] = 

# Extract the features from the test set: Pclass, Sex, Age, and Fare
test_features = test[[___, ___, ___, ___]].values

# Make your prediction using the test set and print it
my_prediction = my_tree_one.predict(test_features)
print(my_prediction)

# Create a data frame with two columns: PassengerId & Survived. Survived contains your predictions
PassengerId = np.array(test["PassengerId"]).astype(int)
my_solution = pd.DataFrame(my_prediction, PassengerId, columns = ["Survived"])
print(my_solution)

# Check that your data frame has 418 entries
print(my_solution.shape)

# Write your solution to a csv file with the name my_solution_one.csv
my_solution.to_csv("my_solution_one.csv", index_label = ["PassengerId"])
```

*** =solution
```{python}
# Impute the missing value with the median
test.loc[152, "Fare"] = test["Fare"].median()

# Extract the features from the test set: Pclass, Sex, Age, and Fare
test_features = test[["Pclass", "Sex", "Age", "Fare"]].values

# Make your prediction using the test set and print it
my_prediction = my_tree_one.predict(test_features)
print(my_prediction)

# Create a data frame with two columns: PassengerId & Survived. Survived contains your predictions
PassengerId = np.array(test["PassengerId"]).astype(int)
my_solution = pd.DataFrame(my_prediction, PassengerId, columns = ["Survived"])
print(my_solution)

# Check that your data frame has 418 entries
print(my_solution.shape)

# Write your solution to a csv file with the name my_solution_one.csv
my_solution.to_csv("my_solution_one.csv", index_label = ["PassengerId"])
```

*** =sct
```{python}
test_object("test_features",
            incorrect_msg = "Make sure that you are selecting the correct variables from the `test` dataset.")
test_function("print", 3, args = None,
              incorrect_msg = "It looks like your solution doesn't have the correct number of entries. There should be exactly 418 rows!")

success_msg("Great! You just created your first decision tree. [Download your csv file](https://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/my_solution_one.csv), and submit the created csv to Kaggle to see the result of your effort.")
```
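Before uploading, it can be worth verifying that the file on disk really matches Kaggle's requirements. A small sanity check you could run locally (assuming `my_solution_one.csv` was written by the code above):

```{python}
# Sanity check the submission file written above: Kaggle expects exactly
# 418 rows and the two columns PassengerId and Survived.
check = pd.read_csv("my_solution_one.csv")
print(check.shape)           # should be (418, 2)
print(list(check.columns))   # should be ['PassengerId', 'Survived']
```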
--- type:NormalExercise lang:python xp:100 skills:2 key:fa5a95aab5
## Overfitting and how to control it

When you created your first decision tree, you left `max_depth` at its default of `None` and `min_samples_split` at its default of 2. This means that no limit was set on the depth of your tree. That's a good thing, right? Not so fast. We are likely overfitting. This means that while your model describes the training data extremely well, it doesn't generalize to new data, which is, after all, the point of prediction. Just look at the Kaggle submission results for the simple model based on Gender and the complex decision tree. Which one does better?

Maybe we can improve the overfit model by making it less complex? In `DecisionTreeClassifier`, the complexity of our model is controlled by two parameters:
- the `max_depth` parameter determines when the splitting up of the decision tree stops.
- the `min_samples_split` parameter sets the minimum number of observations a node must contain before it can be split. If a node holds fewer observations than this threshold (e.g. fewer than 10 passengers), no further splitting is done.

By limiting the complexity of your decision tree you will increase its generality and thus its usefulness for prediction, as the short sketch below illustrates!
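A minimal sketch of the effect (assuming `features_two` and `target` are built as in this exercise): an unconstrained tree can nearly memorize the training set, while a constrained one gives up some training accuracy in exchange for generality.

```{python}
# Sketch: compare training-set accuracy of an unconstrained tree with a
# constrained one (assumes features_two and target exist as in this exercise).
unconstrained = tree.DecisionTreeClassifier(random_state = 1)
unconstrained = unconstrained.fit(features_two, target)
print(unconstrained.score(features_two, target))  # close to 1.0: memorization

constrained = tree.DecisionTreeClassifier(max_depth = 10,
                                          min_samples_split = 5,
                                          random_state = 1)
constrained = constrained.fit(features_two, target)
print(constrained.score(features_two, target))    # lower, but generalizes better
```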
*** =instructions
- Include the Siblings/Spouses Aboard, Parents/Children Aboard, and Embarked features in a new set of features.
- Fit your second tree `my_tree_two` with the new features, and control for the model complexity by toggling the `max_depth` and `min_samples_split` arguments.

*** =hint
You can always use `train.describe()` in the console to check the names of the features.

*** =pre_exercise_code
```{python}
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn import tree
train = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv")
test = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv")

target = train["Survived"].values

train["Age"] = train["Age"].fillna(train["Age"].median())
train.loc[train["Sex"] == "male", "Sex"] = 0
train.loc[train["Sex"] == "female", "Sex"] = 1
train["Embarked"] = train["Embarked"].fillna("S")
train.loc[train["Embarked"] == "S", "Embarked"] = 0
train.loc[train["Embarked"] == "C", "Embarked"] = 1
train.loc[train["Embarked"] == "Q", "Embarked"] = 2
```

*** =sample_code
```{python}
# Create a new array with the added features: features_two
features_two = train[["Pclass", "Age", "Sex", "Fare", ___, ___, ___]].values

# Control overfitting by setting "max_depth" to 10 and "min_samples_split" to 5: my_tree_two
max_depth = 
min_samples_split = 
my_tree_two = tree.DecisionTreeClassifier(max_depth = ___, min_samples_split = ___, random_state = 1)
my_tree_two = 

# Print the score of the new decision tree

```

*** =solution
```{python}
# Create a new array with the added features: features_two
features_two = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values

# Control overfitting by setting "max_depth" to 10 and "min_samples_split" to 5: my_tree_two
max_depth = 10
min_samples_split = 5
my_tree_two = tree.DecisionTreeClassifier(max_depth = 10, min_samples_split = 5, random_state = 1)
my_tree_two = my_tree_two.fit(features_two, target)

# Print the score of the new decision tree
print(my_tree_two.score(features_two, target))
```

*** =sct
```{python}
test_object("features_two",
            incorrect_msg = "Make sure you are selecting the specified features from the train dataset.")
test_object("max_depth",
            incorrect_msg = "The `max_depth` argument should be set to 10!")
test_object("min_samples_split",
            incorrect_msg = "The `min_samples_split` argument should be set to 5!")
test_function("print", args = None,
              incorrect_msg = "It looks like the score wasn't computed quite right. Make sure that you are using `features_two` and `target` as your arguments.")

success_msg("Great! You just created your second and possibly improved decision tree. [Download your csv file](https://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/my_solution_two.csv), and submit your updated solution to Kaggle to see how, despite a lower `.score()`, you predict better.")
```
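A training-set score going down while the leaderboard score goes up is exactly the overfitting story from above. As an optional local aside (again assuming `features_two` and `target` exist), you can estimate out-of-sample accuracy yourself with a holdout split; `train_test_split` lives in `sklearn.model_selection` as of the scikit-learn 0.18 release pinned for this course:

```{python}
# Optional aside: estimate generalization with a holdout split instead of
# relying on the training-set score (assumes features_two and target exist).
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    features_two, target, test_size = 0.25, random_state = 1)

model = tree.DecisionTreeClassifier(max_depth = 10, min_samples_split = 5,
                                    random_state = 1)
model = model.fit(X_train, y_train)
print(model.score(X_valid, y_valid))  # accuracy on rows the tree never saw
```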
--- type:NormalExercise lang:python xp:100 skills:2 key:55678ebefb
## Feature engineering for our Titanic data set

Data science is an art that benefits from a human element. Enter feature engineering: creatively engineering your own features by combining the different existing variables.

While feature engineering is a discipline in itself, too broad to be covered here in detail, you will have a look at a simple example by creating your own new predictive attribute: `family_size`.

A valid assumption is that larger families need more time to get together on a sinking ship, and hence have a lower probability of surviving. Family size is determined by the variables `SibSp` and `Parch`, which indicate the number of family members a certain passenger is traveling with. So when doing feature engineering, you add a new variable `family_size`, which is the sum of `SibSp` and `Parch` plus one (the observation itself), to the test and train set, as the small sketch below previews.
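A toy illustration of that computation (a hypothetical three-passenger frame, not the course data):

```{python}
# Sketch of the family_size derivation on made-up data (not the Titanic set).
demo = pd.DataFrame({"SibSp": [1, 0, 3], "Parch": [0, 2, 1]})
demo["family_size"] = demo["SibSp"] + demo["Parch"] + 1  # +1 for the passenger
print(demo)
```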
*** =instructions
- Create a new train set `train_two` that differs from `train` only by having an extra column with your feature-engineered variable `family_size`.
- Add your feature-engineered variable `family_size`, in addition to `Pclass`, `Sex`, `Age`, `Fare`, `SibSp` and `Parch`, to `features_three`.
- Create a new decision tree as `my_tree_three` and fit the decision tree with your new feature set `features_three`. Then check out the score of the decision tree.

*** =hint
- Don't forget to add `1` when adding the column with the new feature.
- Add your newly defined feature to `features_three`.
- Remember how you fit the decision tree model in the last exercise.

*** =pre_exercise_code
```{python}
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn import tree
train = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv")
test = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv")

target = train["Survived"].values

train["Age"] = train["Age"].fillna(train["Age"].median())
train.loc[train["Sex"] == "male", "Sex"] = 0
train.loc[train["Sex"] == "female", "Sex"] = 1
train["Embarked"] = train["Embarked"].fillna("S")
train.loc[train["Embarked"] == "S", "Embarked"] = 0
train.loc[train["Embarked"] == "C", "Embarked"] = 1
train.loc[train["Embarked"] == "Q", "Embarked"] = 2
```

*** =sample_code
```{python}
# Create train_two with the newly defined feature
train_two = train.copy()
train_two["family_size"] = 

# Create a new feature set and add the new feature
features_three = train_two[["Pclass", "Sex", "Age", "Fare", "SibSp", "Parch", ___]].values

# Define the tree classifier, then fit the model
my_tree_three = tree.DecisionTreeClassifier()
my_tree_three = 

# Print the score of this decision tree
print(my_tree_three.score(features_three, target))
```

*** =solution
```{python}
# Create train_two with the newly defined feature
train_two = train.copy()
train_two["family_size"] = train["SibSp"] + train["Parch"] + 1

# Create a new feature set and add the new feature
features_three = train_two[["Pclass", "Sex", "Age", "Fare", "SibSp", "Parch", "family_size"]].values

# Define the tree classifier, then fit the model
my_tree_three = tree.DecisionTreeClassifier()
my_tree_three = my_tree_three.fit(features_three, target)

# Print the score of this decision tree
print(my_tree_three.score(features_three, target))
```

*** =sct
```{python}
test_object("features_three",
            incorrect_msg = "Be sure that you add `1` while defining `family_size`. Then include `family_size` in `features_three`.")
test_function("print", args = None,
              incorrect_msg = "It looks like the score wasn't computed quite right. Make sure that you are using `features_three` and `target` to fit your decision tree model.")

success_msg("Great! Notice that this time the newly created variable is included in the model. [Download your csv file](https://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/my_solution_three.csv), and submit the created csv to Kaggle to see the result of the updated model.")
```

--------------------------------------------------------------------------------