├── .gitignore ├── img │   ├── author_image.png │   └── shield_image.png ├── requirements.sh ├── README.md ├── course.yml ├── Python_Code__For_Decision_Trees ├── chapter3.md ├── chapter1.md └── chapter2.md /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /img/author_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datacamp/community-courses-kaggle-python-tutorial-on-machine-learning/HEAD/img/author_image.png -------------------------------------------------------------------------------- /img/shield_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/datacamp/community-courses-kaggle-python-tutorial-on-machine-learning/HEAD/img/shield_image.png -------------------------------------------------------------------------------- /requirements.sh: -------------------------------------------------------------------------------- 1 | pip3 install pandas==0.19.1 2 | pip3 install numpy==1.11.0 3 | pip3 install scipy==0.18.1 4 | pip3 install scikit-learn==0.18.1 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Kaggle Python Tutorial on Machine Learning 2 | 3 | Source files for the Kaggle Python Tutorial on Machine Learning course on DataCamp. 4 | 5 | You can access the course here: https://www.datacamp.com/courses/1015 6 | 7 | -------------------------------------------------------------------------------- /course.yml: -------------------------------------------------------------------------------- 1 | title : Kaggle Python Tutorial on Machine Learning 2 | author_field : The DataCamp Team 3 | description : Always wanted to compete in a Kaggle competition but not sure you have the right skill set? This interactive machine learning tutorial by Kaggle and DataCamp offers the solution. Step by step, you will learn through fun coding exercises how to predict the survival rate for Kaggle's Titanic competition using machine learning techniques. Upload your results and see your ranking go up!

New to Python? Give our Introduction to Python for Data Science course a try. 4 | author_bio : DataCamp is a young team of data analytics enthusiasts that provides affordable interactive data science and statistics education to the world. We do not believe in an educational framework that centers on passively reading books, or on watching YouTube videos that put the focus on the instructor rather than the learner. We provide courses for both the novice and the experienced data scientist, and even allow passionate users to freely use the learning platform to create their own interactive courses. 5 | university : DataCamp 6 | difficulty_level : 2 7 | time_needed : 1 hour 8 | programming_language : python 9 | from : "python-base-prod:20" 10 | -------------------------------------------------------------------------------- /Python_Code__For_Decision_Trees: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import sklearn as sk 4 | from sklearn import tree 5 | import matplotlib.pyplot as plt 6 | 7 | # Load the train and test datasets to create two DataFrames 8 | train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv" 9 | train = pd.read_csv(train_url) 10 | 11 | test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv" 12 | test = pd.read_csv(test_url) 13 | 14 | 15 | #### Convert variables and clean the data 16 | train.loc[train["Sex"] == "male", "Sex"] = 0 17 | train.loc[train["Sex"] == "female", "Sex"] = 1 18 | 19 | train["Embarked"] = train["Embarked"].fillna("S") 20 | 21 | train.loc[train["Embarked"] == "S", "Embarked"] = 0 22 | train.loc[train["Embarked"] == "C", "Embarked"] = 1 23 | train.loc[train["Embarked"] == "Q", "Embarked"] = 2 24 | 25 | train["Age"] = train["Age"].fillna(train["Age"].median()) 26 | 27 | ## Build the first tree 28 | target = np.array(train.Survived).transpose() 29 | features_one = np.array([train.Pclass, train.Sex, train.Age, train.Fare]).transpose() 30 | 31 | my_tree_one = tree.DecisionTreeClassifier() 32 | my_tree_one = my_tree_one.fit(features_one, target) 33 | 34 | #### Second tree 35 | 36 | features_two = np.array([train.Pclass, train.Age, train.Sex, train.Fare, train.SibSp, train.Parch, train.Embarked]).transpose() 37 | 38 | my_tree_two = tree.DecisionTreeClassifier() 39 | my_tree_two = my_tree_two.fit(features_two, target) 40 | 41 | #### Third tree 42 | # Control overfitting 43 | my_tree_three = tree.DecisionTreeClassifier(max_depth = 10, min_samples_split = 5) 44 | my_tree_three = my_tree_three.fit(features_two, target) 45 | 46 | 47 | ### Evaluate the models 48 | from sklearn.metrics import confusion_matrix 49 | 50 | pred_vec_three = my_tree_three.predict(features_two) 51 | pred_vec_two = my_tree_two.predict(features_two) 52 | pred_vec_one = my_tree_one.predict(features_one) 53 | 54 | def pred_eval(pred_vec, target): 55 | cm = confusion_matrix(target, pred_vec) # confusion_matrix expects (y_true, y_pred) 56 | true_negative = cm[0][0] # survived (1) is the positive class 57 | true_positive = cm[1][1] 58 | false_positive = cm[0][1] 59 | false_negative = cm[1][0] 60 | positive = true_positive + false_negative 61 | negative = true_negative + false_positive 62 | sensitivity = true_positive/positive # proportion of survivals correctly classified (want to maximize) 63 | specificity = true_negative/negative # proportion of deaths correctly classified (want to maximize) 64 | ppv = true_positive/(true_positive + false_positive) 65 | npv = true_negative/(true_negative + false_negative) 66 | fnr = false_negative/positive 
#accordingly minimize 1 - sensitivity 67 | fpr = false_positive/negative #accordingly minimize 1 - specificity 68 | 69 | metrics = np.array([cm, sensitivity, specificity, ppv, npv, fnr, fpr]) 70 | return(metrics) 71 | 72 | my_tree_one.score(features_one, target) 73 | my_tree_two.score(features_two, target) 74 | my_tree_three.score(features_two, target) 75 | 76 | #### Graphing the Tree 77 | 78 | 79 | #from sklearn.externals.six import StringIO 80 | #import pydot 81 | #dot_data = StringIO() 82 | #tree.export_graphviz(my_tree_one, out_file = dot_data) 83 | #graph = pydot.graph_from_dot_data(dot_data.getvalue()) 84 | #graph.write_pdf("tree.pdf") 85 | 86 | #from sklearn.externals.six import StringIO 87 | #with open("tree.dot", 'w') as f: 88 | # f = tree.export_graphviz(my_tree_two, out_file=f) 89 | 90 | #from IPython.display import Image 91 | #dot_data = StringIO() 92 | #tree.export_graphviz(my_tree_two, out_file=dot_data, filled=True, rounded=True, special_characters=True) 93 | #graph = pydot.graph_from_dot_data(dot_data.getvalue()) 94 | 95 | 96 | 97 | #### Useful Attributes 98 | my_tree_one.feature_importances_ 99 | my_tree_one.tree_ 100 | my_tree_one.n_classes_ 101 | my_tree_one.n_features_ 102 | my_tree_one.classes_ 103 | 104 | 105 | 106 | #### Clean the test data. 107 | test.loc[test["Sex"] == "male", "Sex"] = 0 108 | test.loc[test["Sex"] == "female", "Sex"] = 1 109 | 110 | test["Embarked"] = test["Embarked"].fillna("S") 111 | 112 | test.loc[test["Embarked"] == "S", "Embarked"] = 0 113 | test.loc[test["Embarked"] == "C", "Embarked"] = 1 114 | test.loc[test["Embarked"] == "Q", "Embarked"] = 2 115 | 116 | test["Age"] = test["Age"].fillna(test["Age"].median()) 117 | 118 | test.Fare[152] = test.Fare.median() 119 | 120 | 121 | #### Prediction 122 | 123 | test_features_one = np.array([test.Pclass, test.Sex, test.Age, test.Fare]).transpose() # must match the features my_tree_one was trained on 124 | pred_one = my_tree_one.predict(test_features_one) 125 | 126 | 127 | test_features_two = np.array([test.Pclass, test.Age, test.Sex, test.Fare, test.SibSp, test.Parch, test.Embarked]).transpose() 128 | pred_two = my_tree_two.predict(test_features_two) 129 | 130 | pred_three = my_tree_three.predict(test_features_two) 131 | 132 | 133 | #### Feature Engineering 134 | 135 | 136 | #### https://plot.ly/matplotlib/bar-charts/ 137 | 138 | cm1 = pred_eval(pred_vec_one, target); y1 = cm1[1:5] # sensitivity, specificity, ppv, npv for each tree 139 | cm2 = pred_eval(pred_vec_two, target); y2 = cm2[1:5] 140 | cm3 = pred_eval(pred_vec_three, target); y3 = cm3[1:5] 141 | N = len(y1) 142 | x = range(N) 143 | plt.bar(x, y2, color="red") 144 | plt.bar(x, y3, color="green") 145 | plt.bar(x, y1, color="blue") 146 | 147 | g1 = cm1[5:7] # fnr and fpr for each tree 148 | g2 = cm2[5:7] 149 | g3 = cm3[5:7] 150 | M = len(g1) 151 | h = range(M) 152 | plt.bar(h, g1, color="blue") 153 | plt.bar(h, g3, color="green") 154 | plt.bar(h, g2, color="red") 155 | 156 | 157 | #### Building a Random Forest 158 | 159 | from sklearn.ensemble import RandomForestClassifier 160 | 161 | 162 | features_forest = np.array([train.Pclass, train.Age, train.Sex, train.Fare, train.SibSp, train.Parch, train.Embarked]).transpose() 163 | 164 | forest = RandomForestClassifier(max_depth = 10, n_estimators=100, min_samples_split=2) 165 | my_forest = forest.fit(features_forest, target) 166 | my_forest.score(features_forest, target) 167 | 168 | # Evaluate the forest 169 | pred_vec_forest = my_forest.predict(features_forest) 170 | pred_eval(pred_vec_forest, target) 171 | 172 | # Predict using the forest 173 | pred_forest = my_forest.predict(test_features_two) 174 | 175 | 176 | 177 | -------------------------------------------------------------------------------- /chapter3.md: 
-------------------------------------------------------------------------------- 1 | --- 2 | title : Improving your predictions through Random Forests 3 | description : "What techniques can you use to improve your predictions even more? One possible way is by making use of the machine learning method Random Forest. After all, a forest is just a collection of trees..." 4 | 5 | 6 | --- type:NormalExercise lang:python xp:100 skills:2 key:05b740fff1 7 | ## A Random Forest analysis in Python 8 | A detailed study of Random Forests would take this tutorial a bit too far. However, since it's an often-used machine learning technique, gaining a general understanding of it in Python won't hurt. 9 | 10 | In layman's terms, the Random Forest technique handles the overfitting problem you faced with decision trees. It grows multiple (very deep) classification trees using the training set. At the time of prediction, each tree is used to come up with a prediction and every outcome is counted as a vote. For example, if you have trained 3 trees, with 2 saying a passenger in the test set will survive and 1 saying he will not, the passenger will be classified as a survivor. This approach of overtraining trees, but having the majority's vote count as the actual classification decision, avoids overfitting. 11 | 12 | Building a random forest in Python looks almost the same as building a decision tree, so we can jump right to it. There are two key differences, however. First, a different class is used. Second, a new argument is necessary. Also, we need to import the necessary library from scikit-learn. 13 | 14 | - Use the `RandomForestClassifier()` class instead of the `DecisionTreeClassifier()` class. 15 | - `n_estimators` needs to be set when using the `RandomForestClassifier()` class. This argument allows you to set the number of trees you wish to plant and average over. 16 | 17 | The latest training and testing data are preloaded for you. 18 | 19 | 20 | *** =instructions 21 | - Build the random forest with `n_estimators` set to `100`. 22 | - Fit your random forest model with inputs `features_forest` and `target`. 23 | - Compute the classifier predictions on the selected test set features. 24 | 25 | 26 | *** =hint 27 | 28 | - When computing the predictions you can use the `.predict()` method just like you did with decision trees! 29 | - To compute the score use the `.score()` method with the correct arguments. Consult your previous work from CH2 if you don't recall the syntax. 
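If you want to try the fit/predict cycle outside the exercise first, here is a minimal, self-contained sketch. The toy arrays below are invented purely for illustration; the exercise itself uses the preloaded `features_forest` and `target`:

```
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Toy data: six observations, two features, binary target (illustrative only)
X = np.array([[1, 0], [2, 1], [3, 0], [1, 1], [2, 0], [3, 1]])
y = np.array([0, 1, 0, 1, 0, 1])

# n_estimators sets how many trees are grown and voted over
toy_forest = RandomForestClassifier(n_estimators = 10, random_state = 1)
toy_forest = toy_forest.fit(X, y)
print(toy_forest.predict(X))
```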
30 | 31 | *** =pre_exercise_code 32 | ```{python} 33 | import pandas as pd 34 | import numpy as np 35 | import sklearn as sk 36 | from sklearn import tree 37 | from sklearn.ensemble import RandomForestClassifier 38 | 39 | 40 | train = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv") 41 | test = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv") 42 | 43 | train["Sex"][train["Sex"] == "male"] = 0 44 | train["Sex"][train["Sex"] == "female"] = 1 45 | 46 | train["Embarked"] = train["Embarked"].fillna("S") 47 | 48 | train["Embarked"][train["Embarked"] == "S"] = 0 49 | train["Embarked"][train["Embarked"] == "C"] = 1 50 | train["Embarked"][train["Embarked"] == "Q"] = 2 51 | 52 | train["Age"] = train["Age"].fillna(train["Age"].median()) 53 | 54 | target = train["Survived"].values 55 | 56 | features_two = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values 57 | my_tree_two = tree.DecisionTreeClassifier(max_depth = 10, min_samples_split = 5, random_state = 1) 58 | my_tree_two = my_tree_two.fit(features_two, target) 59 | 60 | 61 | test["Sex"][test["Sex"] == "male"] = 0 62 | test["Sex"][test["Sex"] == "female"] = 1 63 | 64 | test["Embarked"] = test["Embarked"].fillna("S") 65 | 66 | test["Embarked"][test["Embarked"] == "S"] = 0 67 | test["Embarked"][test["Embarked"] == "C"] = 1 68 | test["Embarked"][test["Embarked"] == "Q"] = 2 69 | 70 | test["Age"] = test["Age"].fillna(test["Age"].median()) 71 | 72 | test.Fare[152] = test.Fare.median() 73 | 74 | ``` 75 | 76 | *** =sample_code 77 | ```{python} 78 | 79 | # Import the `RandomForestClassifier` 80 | from sklearn.ensemble import RandomForestClassifier 81 | 82 | # We want the Pclass, Age, Sex, Fare, SibSp, Parch, and Embarked variables 83 | features_forest = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values 84 | 85 | # Building and fitting my_forest 86 | forest = RandomForestClassifier(max_depth = 10, min_samples_split=2, n_estimators = ___, random_state = 1) 87 | my_forest = forest.fit(___, ___) 88 | 89 | # Print the score of the fitted random forest 90 | print(my_forest.score(features_forest, target)) 91 | 92 | # Compute predictions on our test set features, then print the length of the prediction vector 93 | test_features = test[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values 94 | pred_forest = my_forest.predict(___) 95 | print(len(pred_forest)) 96 | 97 | ``` 98 | 99 | *** =solution 100 | ```{python} 101 | 102 | # Import the `RandomForestClassifier` 103 | from sklearn.ensemble import RandomForestClassifier 104 | 105 | # We want the Pclass, Age, Sex, Fare, SibSp, Parch, and Embarked variables 106 | features_forest = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values 107 | 108 | # Building and fitting my_forest 109 | forest = RandomForestClassifier(max_depth = 10, min_samples_split=2, n_estimators = 100, random_state = 1) 110 | my_forest = forest.fit(features_forest, target) 111 | 112 | # Print the score of the fitted random forest 113 | print(my_forest.score(features_forest, target)) 114 | 115 | # Compute predictions on our test set features, then print the length of the prediction vector 116 | test_features = test[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values 117 | pred_forest = my_forest.predict(test_features) 118 | print(len(pred_forest)) 119 | 120 | ``` 121 | 122 | *** =sct 123 | 124 | ```{python} 125 | test_function("sklearn.ensemble.RandomForestClassifier", args=None, 
126 | incorrect_msg = "Don't forget to import `RandomForestClassifier` and use it to initialize your random forest.") 127 | test_object("features_forest", 128 | incorrect_msg = "Make sure to select the specified features in the right order. These should come from the train dataset!") 129 | test_function("print",1, args=None, 130 | incorrect_msg = "It looks like the score wasn't computed exactly right. Make sure to use `features_forest` and `target` as arguments.") 131 | test_object("test_features", 132 | incorrect_msg = "Make sure to select the specified features in the right order. These should come from the test dataset!") 133 | test_function("print",2, args=None, 134 | incorrect_msg = "It seems that there is an incorrect number of predictions in pred_forest. Make sure to use `test_features` when computing the predictions.") 135 | ``` 136 | 137 | --- type:NormalExercise lang:python xp:100 skills:2 key:146563d1e8 138 | ## Interpreting and Comparing 139 | 140 | Remember how we looked at the `.feature_importances_` attribute for the decision trees? Well, you can request the same attribute from your random forest as well and interpret the relevance of the included variables. 141 | You might also want to compare the models in some quick and easy way. For this, we can use the `.score()` method. The `.score()` method takes the features data and the target vector and computes the mean accuracy of your model. You can apply this method to both the forest and the individual trees. Remember, this measure should be high but not extreme, because that would be a sign of overfitting. 142 | 143 | For this exercise, you have `my_forest` and `my_tree_two` available to you. The features and target arrays are also ready for use. 144 | 145 | *** =instructions 146 | - Explore the feature importance for both models. 147 | - Compare the mean accuracy score of the two models. 148 | 149 | *** =hint 150 | 151 | - Make sure that you are applying the commands to `my_forest` and are using the correct arguments. 152 | - Don't forget that `target` and `features_forest` are preloaded for you! 
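As a self-contained reminder of the pattern, here is a toy sketch — the tiny arrays below are invented for illustration and are not the preloaded objects:

```
from sklearn.tree import DecisionTreeClassifier

# Hypothetical toy data, for illustration only
X = [[0, 1], [1, 1], [0, 0], [1, 0]]
y = [0, 1, 0, 1]

clf = DecisionTreeClassifier(random_state = 1).fit(X, y)
print(clf.feature_importances_)  # one value per feature; they sum to 1
print(clf.score(X, y))           # mean accuracy on the data you pass in
```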
153 | 154 | *** =pre_exercise_code 155 | ```{python} 156 | import random 157 | random.seed(1) 158 | 159 | import pandas as pd 160 | import numpy as np 161 | import sklearn as sk 162 | from sklearn import tree 163 | from sklearn.ensemble import RandomForestClassifier 164 | 165 | train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv" 166 | train = pd.read_csv(train_url) 167 | test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv" 168 | test = pd.read_csv(test_url) 169 | 170 | train["Sex"][train["Sex"] == "male"] = 0 171 | train["Sex"][train["Sex"] == "female"] = 1 172 | train["Embarked"] = train["Embarked"].fillna("S") 173 | train["Embarked"][train["Embarked"] == "S"] = 0 174 | train["Embarked"][train["Embarked"] == "C"] = 1 175 | train["Embarked"][train["Embarked"] == "Q"] = 2 176 | train["Age"] = train["Age"].fillna(train["Age"].median()) 177 | 178 | target = train["Survived"].values 179 | 180 | features_two = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values 181 | my_tree_two = tree.DecisionTreeClassifier(max_depth = 10, min_samples_split = 5, random_state = 1) 182 | my_tree_two = my_tree_two.fit(features_two, target) 183 | 184 | features_forest = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values 185 | forest = RandomForestClassifier(max_depth = 10, min_samples_split=2, n_estimators=100, random_state = 1) 186 | my_forest = forest.fit(features_forest, target) 187 | 188 | ``` 189 | 190 | 191 | *** =sample_code 192 | ```{python} 193 | #Request and print the `.feature_importances_` attribute 194 | print(my_tree_two.feature_importances_) 195 | print() 196 | 197 | #Compute and print the mean accuracy score for both models 198 | print(my_tree_two.score(features_two, target)) 199 | print() 200 | ``` 201 | 202 | *** =solution 203 | ```{python} 204 | #Request and print the `.feature_importances_` attribute 205 | print(my_tree_two.feature_importances_) 206 | print(my_forest.feature_importances_) 207 | 208 | #Compute and print the mean accuracy score for both models 209 | print(my_tree_two.score(features_two, target)) 210 | print(my_forest.score(features_forest, target)) 211 | ``` 212 | 213 | *** =sct 214 | 215 | ```{python} 216 | test_function("print", 1, args=None, 217 | incorrect_msg = "You don't need to edit the given code. Instead, leave it as-is and use it as a hint for your solution.") 218 | test_function("print", 2, args=None, 219 | incorrect_msg = "Use the given code as a hint on how to complete the task. Your solution should look the same, except with `my_forest` as the object of investigation!") 220 | test_function("print", 3, args=None, 221 | incorrect_msg = "You don't need to edit the given code. Instead, leave it as-is and use it as a hint for your solution.") 222 | test_function("print", 4, args=None, 223 | incorrect_msg = "Use the given code as a hint on how to complete the task. Your solution should look the same, except with `my_forest` as the object of investigation!") 224 | ``` 225 | 226 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2 key:db79e9fe21 227 | ## Conclude and Submit 228 | 229 | Based on your findings in the previous exercise, determine which feature was most important, and for which model. 230 | After this final exercise, you will be able to submit your random forest model to Kaggle! Use `my_forest`, `my_tree_two`, and `feature_importances_` to answer the question. 231 | 232 | *** =hint 233 | 234 | - By significance, we simply mean the magnitude of the values. 
For each feature, you should see a decimal. The largest value indicates the greatest significance for the respective feature. 235 | 236 | *** =pre_exercise_code 237 | 238 | ```{python} 239 | import pandas as pd 240 | import numpy as np 241 | import sklearn as sk 242 | from sklearn import tree 243 | from sklearn.ensemble import RandomForestClassifier 244 | 245 | train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv" 246 | train = pd.read_csv(train_url) 247 | test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv" 248 | test = pd.read_csv(test_url) 249 | 250 | train["Sex"][train["Sex"] == "male"] = 0 251 | train["Sex"][train["Sex"] == "female"] = 1 252 | train["Embarked"] = train["Embarked"].fillna("S") 253 | train["Embarked"][train["Embarked"] == "S"] = 0 254 | train["Embarked"][train["Embarked"] == "C"] = 1 255 | train["Embarked"][train["Embarked"] == "Q"] = 2 256 | train["Age"] = train["Age"].fillna(train["Age"].median()) 257 | 258 | target = train["Survived"].values 259 | 260 | features_two = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values 261 | my_tree_two = tree.DecisionTreeClassifier(max_depth = 10, min_samples_split = 5) 262 | my_tree_two = my_tree_two.fit(features_two, target) 263 | 264 | features_forest = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values 265 | forest = RandomForestClassifier(max_depth = 10, min_samples_split=2, n_estimators=100) 266 | my_forest = forest.fit(features_forest, target) 267 | 268 | ``` 269 | 270 | *** =instructions 271 | - `The most important feature was "Age", but it was more significant for "my_tree_two"` 272 | - `The most important feature was "Sex", but it was more significant for "my_tree_two"` 273 | - `The most important feature was "Sex", but it was more significant for "my_forest"` 274 | - `The most important feature was "Age", but it was more significant for "my_forest"` 275 | 276 | *** =sct 277 | 278 | ```{python} 279 | 280 | msg1 = "Wrong choice. Check the hint for some help." 281 | msg2 = "Wonderful! You are now at the end of this tutorial and ready to start improving the results yourself." 282 | msg3 = msg1 283 | msg4 = msg1 284 | test_mc(correct = 2, msgs = [msg1, msg2, msg3, msg4]) 285 | 286 | success_msg("Congrats on completing the course! Now that you have created your first random forest and used it for prediction, take a look at how well it does in the Kaggle competition. [Download your csv file](https://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/my_solution_forest.csv). Having learned about decision trees and random forests, you can begin participating in some other Kaggle competitions as well. Good luck and have fun!") 287 | 288 | ``` 289 | -------------------------------------------------------------------------------- /chapter1.md: -------------------------------------------------------------------------------- 1 | --- 2 | title : Getting Started with Python 3 | description : In this chapter, we will go through the essential steps that you will need to take before beginning to build predictive models. 4 | 5 | --- type:NormalExercise lang:python xp:100 skills:2 key:49f71e27bd 6 | 7 | ## How it works 8 | Welcome to our Kaggle Machine Learning Tutorial. In this tutorial, you will explore how to tackle the Kaggle Titanic competition using Python and Machine Learning. 
In case you're new to Python, it's recommended that you first take our free [Introduction to Python for Data Science Tutorial](https://www.datacamp.com/courses/intro-to-python-for-data-science). Furthermore, while not required, familiarity with machine learning techniques is a plus, so you can get the most out of this tutorial. 9 | 10 | In the editor on the right, you should type Python code to solve the exercises. When you hit the 'Submit Answer' button, every line of code is interpreted and executed by Python, and you get a message indicating whether or not your code was correct. The output of your Python code is shown in the console in the lower right corner. Python makes use of the `#` sign to add comments; these lines are not run as Python code, so they will not influence your result. 11 | 12 | You can also execute Python commands straight in the console. This is a good way to experiment with Python code, as your submission is not checked for correctness. 13 | 14 | 15 | *** =instructions 16 | - In the editor to the right, you see some Python code and annotations. This is what a typical exercise will look like. 17 | - To complete the exercise and see how the interactive environment works, add the code to compute `y` and hit the `Submit Answer` button. Don't forget to print the result. 18 | 19 | 20 | *** =hint 21 | 22 | Just add a line of Python code that calculates the product of 6 and 9, just like the example in the sample code! 23 | 24 | *** =pre_exercise_code 25 | ```{python} 26 | # no pre_exercise_code 27 | ``` 28 | 29 | *** =sample_code 30 | ```{python} 31 | #Compute x = 4 * 3 and print the result 32 | x = 4 * 3 33 | print(x) 34 | 35 | #Compute y = 6 * 9 and print the result 36 | ``` 37 | 38 | *** =solution 39 | ```{python} 40 | #Compute x = 4 * 3 and print the result 41 | x = 4 * 3 42 | print(x) 43 | 44 | #Compute y = 6 * 9 and print the result 45 | y = 6 * 9 46 | print(y) 47 | ``` 48 | 49 | *** =sct 50 | ```{python} 51 | 52 | msg = "Don't forget to assign the correct value to y" 53 | test_object("y", 54 | undefined_msg = msg, 55 | incorrect_msg = msg) 56 | 57 | msg = "Print out the resulting object, `y`!" 58 | test_function("print",2, 59 | not_called_msg = msg, 60 | incorrect_msg = msg, 61 | args=None) 62 | 63 | success_msg("Awesome! See how the console shows the result of the Python code you submitted? Now that you're familiar with the interface, let's get down to business!") 64 | ``` 65 | 66 | --- type:NormalExercise lang:python xp:100 skills:2 key:18a0d06d73 67 | ## Get the Data with Pandas 68 | When the Titanic sank, 1502 of the 2224 passengers and crew were killed. One of the main reasons for this high level of casualties was the lack of lifeboats on this self-proclaimed "unsinkable" ship. 69 | 70 | Those who have seen the movie know that some individuals were more likely to survive the sinking (lucky Rose) than others (poor Jack). In this course, you will learn how to apply machine learning techniques to predict a passenger's chance of surviving using Python. 71 | 72 | Let's start by loading the training and testing sets into your Python environment. You will use the training set to build your model, and the test set to validate it. The data is stored on the web as `csv` files; their URLs are already available as character strings in the sample code. You can load this data with the `read_csv()` method from the Pandas library. 73 | 74 | *** =instructions 75 | - First, import the Pandas library as `pd`. 76 | - Load the test data similarly to how the train data is loaded. 
77 | - Inspect the first couple of rows of the loaded dataframes using the `.head()` method with the code provided. 78 | 79 | *** =hint 80 | - You can load in the training set with `train = pd.read_csv(train_url)`. 81 | - To print a variable to the console, use the print function on a new line. 82 | 83 | *** =pre_exercise_code 84 | ```{python} 85 | 86 | ``` 87 | 88 | *** =sample_code 89 | ```{python} 90 | # Import the Pandas library 91 | 92 | # Load the train and test datasets to create two DataFrames 93 | train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv" 94 | train = pd.read_csv(train_url) 95 | 96 | test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv" 97 | 98 | #Print the `head` of the train and test dataframes 99 | print(train.head()) 100 | print(test.head()) 101 | ``` 102 | *** =solution 103 | ```{python} 104 | # Import the Pandas library 105 | import pandas as pd 106 | 107 | # Load the train and test datasets to create two DataFrames 108 | train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv" 109 | train = pd.read_csv(train_url) 110 | 111 | test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv" 112 | test = pd.read_csv(test_url) 113 | 114 | #Print the `head` of the train and test dataframes 115 | print(train.head()) 116 | print(test.head()) 117 | ``` 118 | 119 | *** =sct 120 | 121 | ```{python} 122 | msg = "Have you correctly imported the `pandas` package? Use the alias `pd`." 123 | test_import("pandas", not_imported_msg = msg, incorrect_as_msg = msg) 124 | 125 | msg = "Do not touch the code that specifies the URLs of the training and test set csvs." 126 | test_object("train_url", undefined_msg = msg, incorrect_msg = msg) 127 | test_object("test_url", undefined_msg = msg, incorrect_msg = msg) 128 | 129 | msg = "Make sure you are using the `read_csv()` function correctly" 130 | test_function("pandas.read_csv", 1, 131 | args=None, 132 | not_called_msg = msg, 133 | incorrect_msg = msg,) 134 | test_function("pandas.read_csv", 2, 135 | args=None, 136 | not_called_msg = msg, 137 | incorrect_msg = msg) 138 | 139 | #msg = "Don't forget to print the first few rows of `train` with the `.head()` method" 140 | #test_function("print", 1, not_called_msg = msg, incorrect_msg = msg) 141 | 142 | #msg = "Don't forget to print the first few rows of `test` with the `.head()` method" 143 | #test_function("print", 2, not_called_msg = msg, incorrect_msg = msg) 144 | 145 | success_msg("Well done! Now that your data is loaded in, let's see if you can understand it.") 146 | ``` 147 | 148 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2 key:7f5cb08579 149 | ## Understanding your data 150 | 151 | Before starting with the actual analysis, it's important to understand the structure of your data. Both `test` and `train` are DataFrame objects, the way pandas represents datasets. You can easily explore a DataFrame using the `.describe()` method. `.describe()` summarizes the columns/features of the DataFrame, including the count of observations, mean, max, and so on. Another useful trick is to look at the dimensions of the DataFrame. This is done by requesting the `.shape` attribute of your DataFrame object. (e.g., `your_data.shape`) 152 | 153 | The training and test sets are already available in the workspace, as `train` and `test`. Apply the `.describe()` method and print the `.shape` attribute of the training set. Which of the following statements is correct? 
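Since `train` and `test` are preloaded, you can, for instance, run the following two lines in the console (shown as a sketch; the output itself is left for you to inspect):

```
train.describe()  # summary statistics per numeric column, including the count
train.shape       # (number of rows, number of columns)
```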
154 | 155 | *** =instructions 156 | - The training set has 891 observations and 12 variables, count for Age is 714. 157 | - The training set has 418 observations and 11 variables, count for Age is 891. 158 | - The testing set has 891 observations and 11 variables, count for Age is 891. 159 | - The testing set has 418 observations and 12 variables, count for Age is 714. 160 | 161 | *** =hint 162 | To see the description of the `test` variable, try `test.describe()`. 163 | 164 | *** =pre_exercise_code 165 | ```{python} 166 | import pandas as pd 167 | train = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv") 168 | test = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv") 169 | ``` 170 | 171 | *** =sct 172 | 173 | ```{python} 174 | 175 | msg1 = "Great job!" 176 | msg2 = "Wrong, try again. Maybe have a look at the hint." 177 | msg3 = "Not so good... Maybe have a look at the hint." 178 | msg4 = "Incorrect. Maybe have a look at the hint." 179 | test_mc(correct = 1, msgs = [msg1, msg2, msg3, msg4]) 180 | 181 | success_msg("Well done! Now move on and explore some of the features in more detail.") 182 | 183 | ``` 184 | 185 | --- type:NormalExercise lang:python xp:100 skills:1 key:1eeaaeb294 186 | ## Rose vs Jack, or Female vs Male 187 | 188 | How many people in your training set survived the Titanic disaster? To see this, you can use the `value_counts()` method in combination with standard bracket notation to select a single column of a DataFrame: 189 | 190 | ``` 191 | # absolute numbers 192 | train["Survived"].value_counts() 193 | 194 | # percentages 195 | train["Survived"].value_counts(normalize = True) 196 | ``` 197 | 198 | If you run these commands in the console, you'll see that 549 individuals died (62%) and 342 survived (38%). A simple way to predict heuristically could be: "majority wins". This would mean that you will predict every unseen observation to not survive. 199 | 200 | To dive in a little deeper, we can perform similar counts and percentage calculations on subsets of the Survived column. For example, maybe gender could play a role as well? You can explore this using the `.value_counts()` method for a two-way comparison on the number of males and females that survived, with this syntax: 201 | 202 | ``` 203 | train["Survived"][train["Sex"] == 'male'].value_counts() 204 | train["Survived"][train["Sex"] == 'female'].value_counts() 205 | ``` 206 | 207 | To get proportions, you can again pass in the argument `normalize = True` to the `.value_counts()` method. 208 | 209 | *** =instructions 210 | - Calculate and print the survival rates in absolute numbers using the `value_counts()` method. 211 | - Calculate and print the survival rates as proportions by setting the `normalize` argument to `True`. 212 | - Repeat the same calculations but on subsets of survival based on Sex. 213 | 214 | *** =hint 215 | - The code for the first four tasks is already given in the assignment! 216 | - Think about the `normalize` argument, and don't forget to print. 
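As a side note, not required for the exercise: because `Survived` is coded 0/1, pandas can produce the same two-way comparison in a single line. A sketch, assuming the preloaded `train` DataFrame:

```
# The mean of a 0/1 column per group equals the survival proportion per sex
print(train.groupby("Sex")["Survived"].mean())
```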
217 | 218 | *** =pre_exercise_code 219 | ```{python} 220 | import pandas as pd 221 | train = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv") 222 | test = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv") 223 | ``` 224 | 225 | *** =sample_code 226 | ```{python} 227 | 228 | # Passengers that survived vs passengers that passed away 229 | print() 230 | 231 | # As proportions 232 | print() 233 | 234 | # Males that survived vs males that passed away 235 | print() 236 | 237 | # Females that survived vs Females that passed away 238 | print() 239 | 240 | # Normalized male survival 241 | print() 242 | 243 | # Normalized female survival 244 | print() 245 | 246 | ``` 247 | 248 | *** =solution 249 | ```{python} 250 | 251 | # Passengers that survived vs passengers that passed away 252 | print(train.Survived.value_counts()) 253 | 254 | # As proportions 255 | print(train["Survived"].value_counts(normalize = True)) 256 | 257 | # Males that survived vs males that passed away 258 | print(train["Survived"][train["Sex"] == 'male'].value_counts()) 259 | 260 | # Females that survived vs Females that passed away 261 | print(train["Survived"][train["Sex"] == 'female'].value_counts()) 262 | 263 | # Normalized male survival 264 | print(train["Survived"][train["Sex"] == 'male'].value_counts(normalize = True)) 265 | 266 | # Normalized female survival 267 | print(train["Survived"][train["Sex"] == 'female'].value_counts(normalize = True)) 268 | ``` 269 | 270 | *** =sct 271 | 272 | ```{python} 273 | msg = "Make sure you are using the `.value_counts()` method correctly." 274 | test_function("print", 1, 275 | not_called_msg= msg, 276 | incorrect_msg = msg) 277 | 278 | msg = "Don't forget to set `normalize = True` when using `.value_counts()`." 279 | test_function("print", 2, 280 | not_called_msg = msg, 281 | incorrect_msg = msg) 282 | 283 | msg = "Make sure you are partitioning by males." 284 | test_function("print", 3, 285 | not_called_msg = msg, 286 | incorrect_msg = msg) 287 | 288 | msg = "Make sure you are partitioning by females." 289 | test_function("print", 4, 290 | not_called_msg= msg, 291 | incorrect_msg = msg) 292 | 293 | msg = "Don't forget to set `normalize = True` when using `.value_counts()`." 294 | test_function("print", 5, 295 | not_called_msg = msg, 296 | incorrect_msg = msg) 297 | 298 | test_function("print", 6, 299 | not_called_msg = msg, 300 | incorrect_msg = msg) 301 | 302 | success_msg("Well done! It looks like it makes sense to predict that all females will survive, and all men will die.") 303 | 304 | ``` 305 | 306 | --- type:NormalExercise lang:python xp:100 skills:2 key:b8f71cf4de 307 | ## Does age play a role? 308 | 309 | Another variable that could influence survival is age, since it's probable that children were saved first. You can test this by creating a new column with a categorical variable `Child`. `Child` will take the value 1 in cases where age is less than 18, and a value of 0 in cases where age is greater than or equal to 18. 310 | 311 | To add this new variable, you need to do two things: (i) create a new column, and (ii) provide the values for each observation (i.e., row) based on the age of the passenger. 312 | 313 | Adding a new column with Pandas in Python is easy and can be done via the following syntax: 314 | 315 | ``` 316 | your_data["new_var"] = 0 317 | ``` 318 | 319 | This code would create a new column in the `train` DataFrame titled `new_var` with `0` for each observation. 
320 | 321 | To set the values based on the age of the passenger, you make use of a boolean test inside the square bracket operator. With the `[]`-operator you create a subset of rows and assign a value to a certain variable of that subset of observations. For example, 322 | 323 | ``` 324 | train["new_var"][train["Fare"] > 10] = 1 325 | ``` 326 | 327 | would give a value of `1` to the variable `new_var` for the subset of passengers whose fares are greater than 10. Remember that `new_var` has a value of `0` for all other values (including missing values). 328 | 329 | A new column called `Child` in the `train` data frame has been created for you that takes the value `NaN` for all observations. 330 | 331 | *** =instructions 332 | - Set the values of `Child` to `1` if the passenger's age is less than 18 years. 333 | - Then assign the value `0` to observations where the passenger's age is 18 years or greater in the new `Child` column. 334 | - Compare the normalized survival rates for those who are <18 and those who are older. Use code similar to what you had in the previous exercise. 335 | 336 | *** =hint 337 | Suppose you wanted to add a new column `clothes` to the `test` set, then give all males the value `"pants"` and the others `"skirt"`: 338 | 339 | ``` 340 | test["clothes"] = "skirt" 341 | 342 | test["clothes"][test["Sex"] == "male"] = "pants" 343 | ``` 344 | 345 | *** =pre_exercise_code 346 | 347 | ```{python} 348 | import pandas as pd 349 | train = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv") 350 | test = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv") 351 | ``` 352 | 353 | *** =sample_code 354 | 355 | ```{python} 356 | # Create the column Child and assign to 'NaN' 357 | train["Child"] = float('NaN') 358 | 359 | # Assign 1 to passengers under 18, 0 to those 18 or older. Print the new column. 360 | 361 | 362 | 363 | 364 | # Print normalized Survival Rates for passengers under 18 365 | print(train["Survived"][train["Child"] == 1].value_counts(normalize = True)) 366 | 367 | # Print normalized Survival Rates for passengers 18 or older 368 | 369 | 370 | ``` 371 | 372 | *** =solution 373 | 374 | ```{python} 375 | # Create the column Child and assign to 'NaN' 376 | train["Child"] = float('NaN') 377 | 378 | # Assign 1 to passengers under 18, 0 to those 18 or older. Print the new column. 379 | train["Child"][train["Age"] < 18] = 1 380 | train["Child"][train["Age"] >= 18] = 0 381 | print(train["Child"]) 382 | 383 | # Print normalized Survival Rates for passengers under 18 384 | print(train["Survived"][train["Child"] == 1].value_counts(normalize = True)) 385 | 386 | # Print normalized Survival Rates for passengers 18 or older 387 | print(train["Survived"][train["Child"] == 0].value_counts(normalize = True)) 388 | 389 | ``` 390 | 391 | *** =sct 392 | ```{python} 393 | msg = "Remember to print the new column `Child`. It should be equal to 1 when the passenger's age is under 18 and 0 if the passenger's age is 18 or greater." 394 | test_function("print", 2, 395 | not_called_msg = msg, 396 | incorrect_msg = msg) 397 | 398 | msg = "Compute the survival proportions for those 18 OR OLDER. Refer to the code provided for passengers under 18." 399 | test_function("print", 3, 400 | not_called_msg = msg, 401 | incorrect_msg = msg) 402 | 403 | success_msg("Well done! 
As you can see from the survival proportions, age does certainly seem to play a role.") 404 | ``` 405 | 406 | --- type:NormalExercise lang:python xp:100 skills:2 key:f02305d182 407 | ## First Prediction 408 | 409 | In one of the previous exercises, you discovered that in your training set, females had over a 50% chance of surviving and males had less than a 50% chance of surviving. Hence, you could use this information for your first prediction: all females in the test set survive and all males in the test set die. 410 | 411 | You use your test set for validating your predictions. You might have seen that contrary to the training set, the test set has no `Survived` column. You add such a column using your predicted values. Next, when uploading your results, Kaggle will use this variable (= your predictions) to score your performance. 412 | 413 | *** =instructions 414 | - Create a variable `test_one`, identical to the dataset `test`. 415 | - Add an additional column, `Survived`, that you initialize to zero. 416 | - Use vector subsetting like in the previous exercise to set the value of `Survived` to 1 for observations whose `Sex` equals `"female"`. 417 | - Print the `Survived` column of predictions from the `test_one` dataset. 418 | 419 | *** =hint 420 | - To create a new variable, `y`, that is a copy of `x`, you can use `y = x.copy()`. 421 | - To initialize a new column `a` in a dataframe `data` to zero, you can use `data['a'] = 0`. 422 | - Have another look at the previous exercise if you're struggling with the third instruction. 423 | 424 | *** =pre_exercise_code 425 | 426 | ```{python} 427 | import pandas as pd 428 | train = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv") 429 | test = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv") 430 | ``` 431 | 432 | *** =sample_code 433 | 434 | ```{python} 435 | # Create a copy of test: test_one 436 | 437 | 438 | # Initialize a Survived column to 0 439 | 440 | 441 | # Set Survived to 1 if Sex equals "female" and print the `Survived` column from `test_one` 442 | ``` 443 | 444 | *** =solution 445 | 446 | ```{python} 447 | # Create a copy of test: test_one 448 | test_one = test.copy() 449 | 450 | # Initialize a Survived column to 0 451 | test_one["Survived"] = 0 452 | 453 | # Set Survived to 1 if Sex equals "female" 454 | test_one["Survived"][test_one["Sex"] == "female"] = 1 455 | print(test_one.Survived) 456 | ``` 457 | 458 | *** =sct 459 | 460 | ```{python} 461 | 462 | test_function("print", 463 | not_called_msg = "Make sure to define the column `Survived` inside `test_one`", 464 | incorrect_msg = "Make sure you are assigning 1 to female and 0 to male passengers") 465 | 466 | success_msg("Well done! If you want, you can already submit these first predictions to Kaggle [by uploading this csv file](http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/ch1_ex4_solution/my_solution.csv). In the next chapter, you will learn how to make more advanced predictions and create your own .csv file from Python.") 467 | ``` 468 | 469 | -------------------------------------------------------------------------------- /chapter2.md: -------------------------------------------------------------------------------- 1 | --- 2 | title : Predicting with Decision Trees 3 | description : After making your first predictions in the previous chapter, it's time to bring you to the next level by using a fundamental concept in machine learning: decision trees. 
4 | 5 | 6 | --- type:NormalExercise lang:python xp:100 skills:2 key:98be5c3225 7 | ## Intro to decision trees 8 | 9 | In the previous chapter, you did all the slicing and dicing yourself to find subsets that have a higher chance of surviving. A decision tree automates this process for you and outputs a classification model or classifier. 10 | 11 | Conceptually, the decision tree algorithm starts with all the data at the root node and scans all the variables for the best one to split on. Once a variable is chosen, you do the split and go down one level (or one node) and repeat. The final nodes at the bottom of the decision tree are known as terminal nodes, and the majority vote of the observations in that node determines how to predict for new observations that end up in that terminal node. 12 | 13 | First, let's import the necessary libraries: 14 | 15 | *** =instructions 16 | - Import the `numpy` library as `np` 17 | - From `sklearn` import the `tree` 18 | 19 | 20 | *** =hint 21 | 22 | - Use the `import` and `as` special keywords when importing `numpy`. 23 | - You can use the `from sklearn import tree` command to import `tree`. 24 | 25 | *** =pre_exercise_code 26 | 27 | ```{python} 28 | import pandas as pd 29 | import numpy as np 30 | import sklearn as sk 31 | from sklearn import tree 32 | train = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv") 33 | test = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv") 34 | 35 | ``` 36 | 37 | *** =sample_code 38 | ```{python} 39 | # Import the Numpy library 40 | 41 | # Import 'tree' from scikit-learn library 42 | from sklearn 43 | ``` 44 | 45 | *** =solution 46 | ```{python} 47 | # Import the Numpy library 48 | import numpy as np 49 | 50 | # Import 'tree' from scikit-learn library 51 | from sklearn import tree 52 | ``` 53 | 54 | *** =sct 55 | 56 | ```{python} 57 | test_import("numpy", same_as = False) 58 | success_msg("OK, your package is loaded now. Time for the real deal.") 59 | ``` 60 | 61 | 62 | --- type:NormalExercise lang:python xp:100 skills:2 key:98092838ce 63 | ## Cleaning and Formatting your Data 64 | 65 | Before you can begin constructing your trees, you need to get your hands dirty and clean the data so that you can use all the features available to you. In the first chapter, we saw that the Age variable had some missing values. Missingness is a whole subject in and of itself, but we will use a simple imputation technique where we substitute each missing value with the median of all the present values. 66 | 67 | ``` 68 | train["Age"] = train["Age"].fillna(train["Age"].median()) 69 | ``` 70 | 71 | Another problem is that the Sex and Embarked variables are categorical but in a non-numeric format. Thus, we will need to assign each class a unique integer so that Python can handle the information. Embarked also has some missing values, which you should impute with the most common class of embarkation, which is `"S"`. 72 | 73 | 74 | *** =instructions 75 | - Assign the integer 1 to all females. 76 | - Impute missing values in `Embarked` with class `S`. Use the `.fillna()` method. 77 | - Replace each class of Embarked with a unique integer: `0` for `S`, `1` for `C`, and `2` for `Q`. 78 | - Print the `Sex` and `Embarked` columns. 79 | 80 | *** =hint 81 | - Use the standard bracket notation to select the appropriate rows and columns, and don't forget the `==` operator. 
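If the bracket-notation recoding feels repetitive, note that pandas also offers a `.map()` method that performs the same substitution in one step. This is only an illustrative aside, shown on a throwaway Series, since the exercise expects the bracket notation:

```
import pandas as pd

s = pd.Series(["male", "female", "male"])
print(s.map({"male": 0, "female": 1}))  # prints 0, 1, 0
```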
82 | 83 | *** =pre_exercise_code 84 | 85 | ```{python} 86 | import pandas as pd 87 | import numpy as np 88 | from sklearn import tree 89 | train = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv") 90 | test = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv") 91 | ``` 92 | 93 | *** =sample_code 94 | ```{python} 95 | # Convert the male and female groups to integer form 96 | train["Sex"][train["Sex"] == "male"] = 0 97 | 98 | # Impute the Embarked variable 99 | train["Embarked"] = 100 | 101 | # Convert the Embarked classes to integer form 102 | train["Embarked"][train["Embarked"] == "S"] = 0 103 | 104 | #Print the Sex and Embarked columns 105 | 106 | ``` 107 | 108 | *** =solution 109 | ```{python} 110 | # Convert the male and female groups to integer form 111 | train["Sex"][train["Sex"] == "male"] = 0 112 | train["Sex"][train["Sex"] == "female"] = 1 113 | 114 | # Impute the Embarked variable 115 | train["Embarked"] = train["Embarked"].fillna("S") 116 | 117 | # Convert the Embarked classes to integer form 118 | train["Embarked"][train["Embarked"] == "S"] = 0 119 | train["Embarked"][train["Embarked"] == "C"] = 1 120 | train["Embarked"][train["Embarked"] == "Q"] = 2 121 | 122 | # Print the Sex and Embarked columns 123 | print(train["Sex"]) 124 | print(train["Embarked"]) 125 | ``` 126 | *** =sct 127 | 128 | ```{python} 129 | 130 | msg = "It looks like you coded the `Sex` variable incorrectly. Make sure to use `0` for male and `1` for female." 131 | test_function("print", 1, 132 | args=None, 133 | not_called_msg = msg, 134 | incorrect_msg = msg,) 135 | 136 | msg = "It looks like you coded the `Embarked` variable incorrectly. Make sure to use `0` for `S`, `1` for `C`, and `2` for `Q`." 137 | test_function("print", 2, 138 | args=None, 139 | not_called_msg = msg, 140 | incorrect_msg = msg,) 141 | 142 | success_msg("Great! Now that the data is cleaned up a bit, you are ready to begin building your first decision tree.") 143 | ``` 144 | 145 | --- type:NormalExercise lang:python xp:100 skills:2 key:2b663996b1 146 | ## Creating your first decision tree 147 | 148 | You will use the `scikit-learn` and `numpy` libraries to build your first decision tree. `scikit-learn` can be used to create `tree` objects from the `DecisionTreeClassifier` class. The methods that we will use take `numpy` arrays as inputs and therefore we will need to create those from the `DataFrame` that we already have. We will need the following to build a decision tree: 149 | 150 | - `target`: A one-dimensional numpy array containing the target/response from the train data. (Survival in your case) 151 | - `features`: A multidimensional numpy array containing the features/predictors from the train data. (e.g., Sex, Age) 152 | 153 | Take a look at the sample code below to see what this would look like: 154 | 155 | ``` 156 | target = train["Survived"].values 157 | 158 | features = train[["Sex", "Age"]].values 159 | 160 | my_tree = tree.DecisionTreeClassifier() 161 | 162 | my_tree = my_tree.fit(features, target) 163 | 164 | ``` 165 | 166 | One way to quickly see the result of your decision tree is to look at the importance of the features that are included. This is done by requesting the `.feature_importances_` attribute of your tree object. Another quick metric is the mean accuracy that you can compute using the `.score()` method with `features_one` and `target` as arguments. 167 | 168 | Ok, time for you to build your first decision tree in Python! 
The train and testing data from chapter 1 are available in your workspace. 169 | 170 | *** =instructions 171 | - Build the `target` and `features_one` numpy arrays. The target will be based on the `Survived` column in `train`. The features 172 | array will be based on the variables Passenger Class, Sex, Age, and Passenger Fare. 173 | - Build a decision tree `my_tree_one` to predict survival using `features_one` and `target`. 174 | - Look at the importance of features in your tree and compute the score. 175 | 176 | *** =hint 177 | - Remember what the target column is in your data and assign it to `target`. 178 | - You can fit and compute the score for your decision tree by passing in the features and target objects you created. 179 | 180 | 181 | 182 | *** =pre_exercise_code 183 | ```{python} 184 | import pandas as pd 185 | import numpy as np 186 | import sklearn as sk 187 | from sklearn import tree 188 | train = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv") 189 | test = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv") 190 | 191 | train["Age"] = train["Age"].fillna(train["Age"].median()) 192 | train["Sex"][train["Sex"] == "male"] = 0 193 | train["Sex"][train["Sex"] == "female"] = 1 194 | 195 | ``` 196 | 197 | *** =sample_code 198 | ```{python} 199 | # Print the train data to see the available features 200 | print(train) 201 | 202 | # Create the target and features numpy arrays: target, features_one 203 | target = train[___].values 204 | features_one = train[["Pclass", "Sex", "Age", "Fare"]].values 205 | 206 | # Fit your first decision tree: my_tree_one 207 | my_tree_one = tree.DecisionTreeClassifier() 208 | my_tree_one = my_tree_one.fit(___, ___) 209 | 210 | # Look at the importance and score of the included features 211 | print(my_tree_one.feature_importances_) 212 | print(my_tree_one.score(___, ___)) 213 | ``` 214 | 215 | *** =solution 216 | 217 | ```{python} 218 | # Print the train data to see the available features 219 | print(train) 220 | 221 | # Create the target and features numpy arrays: target, features_one 222 | target = train["Survived"].values 223 | features_one = train[["Pclass", "Sex", "Age", "Fare"]].values 224 | 225 | # Fit your first decision tree: my_tree_one 226 | my_tree_one = tree.DecisionTreeClassifier() 227 | my_tree_one = my_tree_one.fit(features_one, target) 228 | 229 | # Look at the importance and score of the included features 230 | print(my_tree_one.feature_importances_) 231 | print(my_tree_one.score(features_one, target)) 232 | 233 | ``` 234 | 235 | 236 | *** =sct 237 | 238 | ```{python} 239 | msg = "`target` should be the `Survived` variable from the train dataset. Follow the code in the discussion for help." 240 | test_object("target", 241 | undefined_msg = msg, 242 | incorrect_msg = msg) 243 | 244 | msg = "Make sure that you are including the correct features in the stated order. Follow the code in the discussion for help." 245 | test_object("features_one", 246 | undefined_msg = msg, 247 | incorrect_msg = msg) 248 | 249 | msg = "It looks like the score was not computed correctly. Try re-submitting the code!" 250 | test_function("print",3, 251 | args=None, 252 | not_called_msg =msg, 253 | incorrect_msg = msg) 254 | 255 | success_msg("Well done! 
Time to investigate your decision tree a bit more.") 256 | ``` 257 | 258 | --- type:MultipleChoiceExercise lang:python xp:50 skills:2 key:87b643ee96 259 | ## Interpreting your decision tree 260 | 261 | The `feature_importances_` attribute makes it simple to interpret the significance of the predictors you include. Based on your decision tree, what variable plays the most important role in determining whether or not a passenger survived? Your model (`my_tree_one`) is available in the console. 262 | 263 | *** =instructions 264 | - Passenger Class 265 | - Sex/Gender 266 | - Passenger Fare 267 | - Age 268 | 269 | *** =hint 270 | Have a close look at the `feature_importances_` attribute of your tree. What variable has the greatest coefficient? 271 | 272 | *** =pre_exercise_code 273 | 274 | ```{python} 275 | import pandas as pd 276 | import numpy as np 277 | import sklearn as sk 278 | from sklearn import tree 279 | train = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv") 280 | test = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv") 281 | 282 | train["Age"] = train["Age"].fillna(train["Age"].median()) 283 | train["Sex"][train["Sex"] == "male"] = 0 284 | train["Sex"][train["Sex"] == "female"] = 1 285 | 286 | target = train["Survived"].values 287 | features_one = train[["Pclass", "Sex", "Age", "Fare"]].values 288 | my_tree_one = tree.DecisionTreeClassifier(random_state = 1) 289 | my_tree_one = my_tree_one.fit(features_one, target) 290 | 291 | 292 | ``` 293 | 294 | *** =sct 295 | 296 | ```{python} 297 | msg1 = "Wrong choice. Check the hint for some help." 298 | msg3 = "Bellissimo! Time to make a prediction and submit it to Kaggle!" 299 | msg2 = msg1 300 | msg4 = msg1 301 | test_mc(correct = 3, msgs = [msg1, msg2, msg3, msg4]) 302 | 303 | success_msg("Looks like Passenger Fare has the most significance in determining survival based on your model. Now let's move on to making your first submission to Kaggle!") 304 | 305 | ``` 306 | 307 | 308 | --- type:NormalExercise lang:python xp:100 skills:2 key:4a70446ddd 309 | ## Predict and submit to Kaggle 310 | 311 | To send a submission to Kaggle, you need to predict the survival rates for the observations in the test set. In the last exercise of the previous chapter, we created simple predictions based on a single subset. Luckily, with our decision tree, we can make use of some simple functions to "generate" our answer without having to manually perform subsetting. 312 | 313 | First, you make use of the `.predict()` method. You call it on the fitted model (`my_tree_one`) and provide it with the feature values from the dataset for which predictions need to be made (`test`). To extract the features, we will need to create a numpy array in the same way as we did when training the model. However, we need to take care of a small but important problem first. There is a missing value in the Fare feature that needs to be imputed. 314 | 315 | Next, you need to make sure your output is in line with the submission requirements of Kaggle: a csv file with exactly 418 entries and two columns: `PassengerId` and `Survived`. Then use the code provided to make a new data frame using `DataFrame()`, and create a csv file using the `to_csv()` method from Pandas. 316 | 317 | *** =instructions 318 | - Impute the missing value for Fare in row 153 with the median of the column. 319 | - Make a prediction on the test set using the `.predict()` method and `my_tree_one`. Assign the result to `my_prediction`. 
--- type:NormalExercise lang:python xp:100 skills:2 key:4a70446ddd
## Predict and submit to Kaggle

To send a submission to Kaggle you need to predict the survival rates for the observations in the test set. In the last exercise of the previous chapter, we created simple predictions based on a single subset. Luckily, with our decision tree, we can make use of some simple functions to "generate" our answer without having to manually perform subsetting.

First, you make use of the `.predict()` method: you call it on your model (`my_tree_one`) and pass it the feature values from the dataset for which predictions need to be made (`test`). To extract those features we will need to create a numpy array in the same way as we did when training the model. However, we need to take care of a small but important problem first: there is a missing value in the Fare feature that needs to be imputed.

Next, you need to make sure your output is in line with the submission requirements of Kaggle: a csv file with exactly 418 entries and two columns: `PassengerId` and `Survived`. Then use the code provided to make a new data frame using `DataFrame()`, and create a csv file using the `to_csv()` method from pandas.

*** =instructions
- Impute the missing value for Fare in row 153 (index 152) with the median of the column.
- Make a prediction on the test set using the `.predict()` method and `my_tree_one`. Assign the result to `my_prediction`.
- Create a data frame `my_solution` containing the solution and the passenger ids from the test set. Make sure the solution is in line with the standards set forth by Kaggle by naming the column appropriately.

*** =hint
- When doing the imputation use the `Fare` feature and the `.median()` method.
- Make sure to select the Pclass, Sex, Age, and Fare features in this exact order. Don't change the skeleton of the solution!

*** =pre_exercise_code
```{python}
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn import tree
train = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv")
test = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv")

target = train["Survived"].values

train["Age"] = train["Age"].fillna(train["Age"].median())
train.loc[train["Sex"] == "male", "Sex"] = 0
train.loc[train["Sex"] == "female", "Sex"] = 1

features_one = train[["Pclass", "Sex", "Age", "Fare"]].values
my_tree_one = tree.DecisionTreeClassifier(random_state = 1)
my_tree_one = my_tree_one.fit(features_one, target)

test["Age"] = test["Age"].fillna(test["Age"].median())
test.loc[test["Sex"] == "male", "Sex"] = 0
test.loc[test["Sex"] == "female", "Sex"] = 1
```

*** =sample_code
```{python}
# Impute the missing value with the median
test.loc[152, "Fare"] = 

# Extract the features from the test set: Pclass, Sex, Age, and Fare
test_features = test[[___, ___, ___, ___]].values

# Make your prediction using the test set and print it
my_prediction = my_tree_one.predict(test_features)
print(my_prediction)

# Create a data frame with two columns: PassengerId & Survived. Survived contains your predictions
PassengerId = np.array(test["PassengerId"]).astype(int)
my_solution = pd.DataFrame(my_prediction, PassengerId, columns = ["Survived"])
print(my_solution)

# Check that your data frame has 418 entries
print(my_solution.shape)

# Write your solution to a csv file with the name my_solution_one.csv
my_solution.to_csv("my_solution_one.csv", index_label = ["PassengerId"])
```

*** =solution
```{python}
# Impute the missing value with the median
test.loc[152, "Fare"] = test["Fare"].median()

# Extract the features from the test set: Pclass, Sex, Age, and Fare
test_features = test[["Pclass", "Sex", "Age", "Fare"]].values

# Make your prediction using the test set and print it
my_prediction = my_tree_one.predict(test_features)
print(my_prediction)

# Create a data frame with two columns: PassengerId & Survived. Survived contains your predictions
PassengerId = np.array(test["PassengerId"]).astype(int)
my_solution = pd.DataFrame(my_prediction, PassengerId, columns = ["Survived"])
print(my_solution)

# Check that your data frame has 418 entries
print(my_solution.shape)

# Write your solution to a csv file with the name my_solution_one.csv
my_solution.to_csv("my_solution_one.csv", index_label = ["PassengerId"])
```

*** =sct
```{python}
test_object("test_features",
            incorrect_msg = "Make sure that you are selecting the correct variables from the `test` dataset.")
test_function("print", 3, args = None,
              incorrect_msg = "It looks like your solution doesn't have the correct number of entries. There should be exactly 418 rows!")

success_msg("Great! You just created your first decision tree. [Download your csv file](https://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/my_solution_one.csv), and submit the created csv to Kaggle to see the result of your effort.")
```
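Before uploading, it can be worth verifying that the file on disk really matches Kaggle's requirements. A small sanity check you could run locally (assuming `my_solution_one.csv` was written by the code above):

```{python}
# Sanity check the submission file written above: Kaggle expects exactly
# 418 rows and the two columns PassengerId and Survived.
check = pd.read_csv("my_solution_one.csv")
print(check.shape)           # should be (418, 2)
print(list(check.columns))   # should be ['PassengerId', 'Survived']
```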
--- type:NormalExercise lang:python xp:100 skills:2 key:fa5a95aab5
## Overfitting and how to control it

When you created your first decision tree, you left `max_depth` at its default of `None` and `min_samples_split` at its default of 2. This means that no limit was set on the depth of your tree. That's a good thing, right? Not so fast. We are likely overfitting. This means that while your model describes the training data extremely well, it doesn't generalize to new data, which is, after all, the point of prediction. Just look at the Kaggle submission results for the simple model based on Gender and the complex decision tree. Which one does better?

Maybe we can improve the overfit model by making it less complex? In `DecisionTreeClassifier`, the complexity of our model is controlled by two parameters:
- the `max_depth` parameter determines when the splitting up of the decision tree stops.
- the `min_samples_split` parameter sets the minimum number of observations a node must contain before it can be split. If a node holds fewer observations than this threshold (e.g. fewer than 10 passengers), no further splitting is done.

By limiting the complexity of your decision tree you will increase its generality and thus its usefulness for prediction, as the short sketch below illustrates!
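A minimal sketch of the effect (assuming `features_two` and `target` are built as in this exercise): an unconstrained tree can nearly memorize the training set, while a constrained one gives up some training accuracy in exchange for generality.

```{python}
# Sketch: compare training-set accuracy of an unconstrained tree with a
# constrained one (assumes features_two and target exist as in this exercise).
unconstrained = tree.DecisionTreeClassifier(random_state = 1)
unconstrained = unconstrained.fit(features_two, target)
print(unconstrained.score(features_two, target))  # close to 1.0: memorization

constrained = tree.DecisionTreeClassifier(max_depth = 10,
                                          min_samples_split = 5,
                                          random_state = 1)
constrained = constrained.fit(features_two, target)
print(constrained.score(features_two, target))    # lower, but generalizes better
```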
*** =instructions
- Include the Siblings/Spouses Aboard, Parents/Children Aboard, and Embarked features in a new set of features.
- Fit your second tree `my_tree_two` with the new features, and control for the model complexity by toggling the `max_depth` and `min_samples_split` arguments.

*** =hint
You can always use `train.describe()` in the console to check the names of the features.

*** =pre_exercise_code
```{python}
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn import tree
train = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv")
test = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv")

target = train["Survived"].values

train["Age"] = train["Age"].fillna(train["Age"].median())
train.loc[train["Sex"] == "male", "Sex"] = 0
train.loc[train["Sex"] == "female", "Sex"] = 1
train["Embarked"] = train["Embarked"].fillna("S")
train.loc[train["Embarked"] == "S", "Embarked"] = 0
train.loc[train["Embarked"] == "C", "Embarked"] = 1
train.loc[train["Embarked"] == "Q", "Embarked"] = 2
```

*** =sample_code
```{python}
# Create a new array with the added features: features_two
features_two = train[["Pclass", "Age", "Sex", "Fare", ___, ___, ___]].values

# Control overfitting by setting "max_depth" to 10 and "min_samples_split" to 5: my_tree_two
max_depth = 
min_samples_split = 
my_tree_two = tree.DecisionTreeClassifier(max_depth = ___, min_samples_split = ___, random_state = 1)
my_tree_two = 

# Print the score of the new decision tree

```

*** =solution
```{python}
# Create a new array with the added features: features_two
features_two = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values

# Control overfitting by setting "max_depth" to 10 and "min_samples_split" to 5: my_tree_two
max_depth = 10
min_samples_split = 5
my_tree_two = tree.DecisionTreeClassifier(max_depth = 10, min_samples_split = 5, random_state = 1)
my_tree_two = my_tree_two.fit(features_two, target)

# Print the score of the new decision tree
print(my_tree_two.score(features_two, target))
```

*** =sct
```{python}
test_object("features_two",
            incorrect_msg = "Make sure you are selecting the specified features from the train dataset.")
test_object("max_depth",
            incorrect_msg = "The `max_depth` argument should be set to 10!")
test_object("min_samples_split",
            incorrect_msg = "The `min_samples_split` argument should be set to 5!")
test_function("print", args = None,
              incorrect_msg = "It looks like the score wasn't computed quite right. Make sure that you are using `features_two` and `target` as your arguments.")

success_msg("Great! You just created your second and possibly improved decision tree. [Download your csv file](https://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/my_solution_two.csv), and submit your updated solution to Kaggle to see how, despite a lower `.score()`, you predict better.")
```
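A training-set score going down while the leaderboard score goes up is exactly the overfitting story from above. As an optional local aside (again assuming `features_two` and `target` exist), you can estimate out-of-sample accuracy yourself with a holdout split; `train_test_split` lives in `sklearn.model_selection` as of the scikit-learn 0.18 release pinned for this course:

```{python}
# Optional aside: estimate generalization with a holdout split instead of
# relying on the training-set score (assumes features_two and target exist).
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    features_two, target, test_size = 0.25, random_state = 1)

model = tree.DecisionTreeClassifier(max_depth = 10, min_samples_split = 5,
                                    random_state = 1)
model = model.fit(X_train, y_train)
print(model.score(X_valid, y_valid))  # accuracy on rows the tree never saw
```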
--- type:NormalExercise lang:python xp:100 skills:2 key:55678ebefb
## Feature engineering for our Titanic data set

Data science is an art that benefits from a human element. Enter feature engineering: creatively engineering your own features by combining the different existing variables.

While feature engineering is a discipline in itself, too broad to be covered here in detail, you will have a look at a simple example by creating your own new predictive attribute: `family_size`.

A valid assumption is that larger families need more time to get together on a sinking ship, and hence have a lower probability of surviving. Family size is determined by the variables `SibSp` and `Parch`, which indicate the number of family members a certain passenger is traveling with. So when doing feature engineering, you add a new variable `family_size`, which is the sum of `SibSp` and `Parch` plus one (the observation itself), to the test and train set, as the small sketch below previews.
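A toy illustration of that computation (a hypothetical three-passenger frame, not the course data):

```{python}
# Sketch of the family_size derivation on made-up data (not the Titanic set).
demo = pd.DataFrame({"SibSp": [1, 0, 3], "Parch": [0, 2, 1]})
demo["family_size"] = demo["SibSp"] + demo["Parch"] + 1  # +1 for the passenger
print(demo)
```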
*** =instructions
- Create a new train set `train_two` that differs from `train` only by having an extra column with your feature-engineered variable `family_size`.
- Add your feature-engineered variable `family_size`, in addition to `Pclass`, `Sex`, `Age`, `Fare`, `SibSp` and `Parch`, to `features_three`.
- Create a new decision tree as `my_tree_three` and fit the decision tree with your new feature set `features_three`. Then check out the score of the decision tree.

*** =hint
- Don't forget to add `1` when adding the column with the new feature.
- Add your newly defined feature to `features_three`.
- Remember how you fit the decision tree model in the last exercise.

*** =pre_exercise_code
```{python}
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn import tree
train = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv")
test = pd.read_csv("http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv")

target = train["Survived"].values

train["Age"] = train["Age"].fillna(train["Age"].median())
train.loc[train["Sex"] == "male", "Sex"] = 0
train.loc[train["Sex"] == "female", "Sex"] = 1
train["Embarked"] = train["Embarked"].fillna("S")
train.loc[train["Embarked"] == "S", "Embarked"] = 0
train.loc[train["Embarked"] == "C", "Embarked"] = 1
train.loc[train["Embarked"] == "Q", "Embarked"] = 2
```

*** =sample_code
```{python}
# Create train_two with the newly defined feature
train_two = train.copy()
train_two["family_size"] = 

# Create a new feature set and add the new feature
features_three = train_two[["Pclass", "Sex", "Age", "Fare", "SibSp", "Parch", ___]].values

# Define the tree classifier, then fit the model
my_tree_three = tree.DecisionTreeClassifier()
my_tree_three = 

# Print the score of this decision tree
print(my_tree_three.score(features_three, target))
```

*** =solution
```{python}
# Create train_two with the newly defined feature
train_two = train.copy()
train_two["family_size"] = train["SibSp"] + train["Parch"] + 1

# Create a new feature set and add the new feature
features_three = train_two[["Pclass", "Sex", "Age", "Fare", "SibSp", "Parch", "family_size"]].values

# Define the tree classifier, then fit the model
my_tree_three = tree.DecisionTreeClassifier()
my_tree_three = my_tree_three.fit(features_three, target)

# Print the score of this decision tree
print(my_tree_three.score(features_three, target))
```

*** =sct
```{python}
test_object("features_three",
            incorrect_msg = "Be sure that you add `1` while defining `family_size`. Then include `family_size` in `features_three`.")
test_function("print", args = None,
              incorrect_msg = "It looks like the score wasn't computed quite right. Make sure that you are using `features_three` and `target` to fit your decision tree model.")

success_msg("Great! Notice that this time the newly created variable is included in the model. [Download your csv file](https://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/my_solution_three.csv), and submit the created csv to Kaggle to see the result of the updated model.")
```

--------------------------------------------------------------------------------