├── visuals.py
└── README.md
/visuals.py:
--------------------------------------------------------------------------------
1 | ###########################################
2 | # Suppress matplotlib user warnings
3 | # Necessary for newer version of matplotlib
4 | import warnings
5 | warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib")
6 | #
7 | # Display inline matplotlib plots with IPython
8 | from IPython import get_ipython
9 | get_ipython().run_line_magic('matplotlib', 'inline')
10 | ###########################################
11 |
12 | import matplotlib.pyplot as pl
13 | import matplotlib.patches as mpatches
14 | import numpy as np
15 | import pandas as pd
16 | from time import time
17 | from sklearn.metrics import f1_score, accuracy_score
18 |
19 |
20 | def distribution(data, transformed = False):
21 | """
22 | Visualization code for displaying skewed distributions of features
23 | """
24 |
25 | # Create figure
26 | fig = pl.figure(figsize = (11,5));
27 |
28 | # Skewed feature plotting
29 | for i, feature in enumerate(['capital-gain','capital-loss']):
30 | ax = fig.add_subplot(1, 2, i+1)
31 | ax.hist(data[feature], bins = 25, color = '#00A0A0')
32 | ax.set_title("'%s' Feature Distribution"%(feature), fontsize = 14)
33 | ax.set_xlabel("Value")
34 | ax.set_ylabel("Number of Records")
35 | ax.set_ylim((0, 2000))
36 | ax.set_yticks([0, 500, 1000, 1500, 2000])
37 | ax.set_yticklabels([0, 500, 1000, 1500, ">2000"])
38 |
39 | # Plot aesthetics
40 | if transformed:
41 | fig.suptitle("Log-transformed Distributions of Continuous Census Data Features", \
42 | fontsize = 16, y = 1.03)
43 | else:
44 | fig.suptitle("Skewed Distributions of Continuous Census Data Features", \
45 | fontsize = 16, y = 1.03)
46 |
47 | fig.tight_layout()
48 | fig.show()
49 |
50 |
51 | def evaluate(results, accuracy, f1):
52 | """
53 | Visualization code to display results of various learners.
54 |
55 | inputs:
56 |       - results: a dictionary keyed by learner name; each value is a list of
57 |         dictionaries of the statistic results from 'train_predict()'
58 |       - accuracy: the accuracy score for the naive predictor
59 |       - f1: the F-score for the naive predictor
60 | """
61 |
62 | # Create figure
63 | fig, ax = pl.subplots(2, 3, figsize = (11,7))
64 |
65 | # Constants
66 | bar_width = 0.3
67 | colors = ['#A00000','#00A0A0','#00A000']
68 |
69 | # Super loop to plot four panels of data
70 | for k, learner in enumerate(results.keys()):
71 | for j, metric in enumerate(['train_time', 'acc_train', 'f_train', 'pred_time', 'acc_test', 'f_test']):
72 | for i in np.arange(3):
73 |
74 | # Creative plot code
75 | ax[j//3, j%3].bar(i+k*bar_width, results[learner][i][metric], width = bar_width, color = colors[k])
76 | ax[j//3, j%3].set_xticks([0.45, 1.45, 2.45])
77 | ax[j//3, j%3].set_xticklabels(["1%", "10%", "100%"])
78 | ax[j//3, j%3].set_xlabel("Training Set Size")
79 | ax[j//3, j%3].set_xlim((-0.1, 3.0))
80 |
81 | # Add unique y-labels
82 | ax[0, 0].set_ylabel("Time (in seconds)")
83 | ax[0, 1].set_ylabel("Accuracy Score")
84 | ax[0, 2].set_ylabel("F-score")
85 | ax[1, 0].set_ylabel("Time (in seconds)")
86 | ax[1, 1].set_ylabel("Accuracy Score")
87 | ax[1, 2].set_ylabel("F-score")
88 |
89 | # Add titles
90 | ax[0, 0].set_title("Model Training")
91 | ax[0, 1].set_title("Accuracy Score on Training Subset")
92 | ax[0, 2].set_title("F-score on Training Subset")
93 | ax[1, 0].set_title("Model Predicting")
94 | ax[1, 1].set_title("Accuracy Score on Testing Set")
95 | ax[1, 2].set_title("F-score on Testing Set")
96 |
97 | # Add horizontal lines for naive predictors
98 | ax[0, 1].axhline(y = accuracy, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
99 | ax[1, 1].axhline(y = accuracy, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
100 | ax[0, 2].axhline(y = f1, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
101 | ax[1, 2].axhline(y = f1, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
102 |
103 | # Set y-limits for score panels
104 | ax[0, 1].set_ylim((0, 1))
105 | ax[0, 2].set_ylim((0, 1))
106 | ax[1, 1].set_ylim((0, 1))
107 | ax[1, 2].set_ylim((0, 1))
108 |
109 | # Create patches for the legend
110 | patches = []
111 | for i, learner in enumerate(results.keys()):
112 | patches.append(mpatches.Patch(color = colors[i], label = learner))
113 | pl.legend(handles = patches, bbox_to_anchor = (-.80, 2.53), \
114 | loc = 'upper center', borderaxespad = 0., ncol = 3, fontsize = 'x-large')
115 |
116 | # Aesthetics
117 | pl.suptitle("Performance Metrics for Three Supervised Learning Models", fontsize = 16, y = 1.10)
118 | pl.tight_layout()
119 | pl.show()
120 |
121 |
122 | def feature_plot(importances, X_train, y_train):
123 |
124 | # Display the five most important features
125 | indices = np.argsort(importances)[::-1]
126 | columns = X_train.columns.values[indices[:5]]
127 | values = importances[indices][:5]
128 |
129 |     # Create the plot
130 | fig = pl.figure(figsize = (9,5))
131 | pl.title("Normalized Weights for First Five Most Predictive Features", fontsize = 16)
132 | pl.bar(np.arange(5), values, width = 0.6, align="center", color = '#00A000', \
133 | label = "Feature Weight")
134 | pl.bar(np.arange(5) - 0.3, np.cumsum(values), width = 0.2, align = "center", color = '#00A0A0', \
135 | label = "Cumulative Feature Weight")
136 | pl.xticks(np.arange(5), columns)
137 | pl.xlim((-0.5, 4.5))
138 | pl.ylabel("Weight", fontsize = 12)
139 | pl.xlabel("Feature", fontsize = 12)
140 |
141 | pl.legend(loc = 'upper center')
142 | pl.tight_layout()
143 | pl.show()
144 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Study-09-MachineLearning-[Supervised Learning]
2 | - Let's find the Decision Surface!
3 | ----------------------------------------------------------------------------------------------------------------------------------------
4 | ## (A1) Linear Regression (when 'y' follows Normal-Dist)
5 | - For numeric data point
6 | ##### Single value
7 | ```
8 | from sklearn.linear_model import LinearRegression
9 | model = LinearRegression()
10 | model.fit(df[ ['predictor'] ], df[ ['response'] ])
11 |
12 | model.predict([ [127], [248] ])
13 | ```
14 | array([[ 438.94308857, 127.14839521]])
15 | - (The reason for predicting on an array like [127] rather than just 127 is that the model may make predictions using multiple features.)
16 |
17 | ##### Multiple values
18 | - (The dataset consists of 13 features of 506 houses and their median value in $1000's. We fit a model on the 13 features to predict the value of the houses, i.e. 'x' has 506 rows of 13 feature values each.)
19 | ```
20 | from sklearn.datasets import load_boston
21 | boston_data = load_boston()
22 | x = boston_data['data']
23 | y = boston_data['target']
24 |
25 | from sklearn.linear_model import LinearRegression
26 | model = LinearRegression()
27 | model.fit(x, y)
28 |
29 | sample_house = [[2.29690000e-01, 0.00000000e+00, 1.05900000e+01, 0.00000000e+00, 4.89000000e-01,
30 | 6.32600000e+00, 5.25000000e+01, 4.35490000e+00, 4.00000000e+00, 2.77000000e+02,
31 | 1.86000000e+01, 3.94870000e+02, 1.09700000e+01]]
32 |
33 | model.predict(sample_house)
34 | ```
35 | array([ 23.68420569])
36 |
37 | ## (A2) Logistic Regression
38 | Linear models are the most useful applied statistical technique. However, they are not without their limitations. Additive response models don’t make much sense if the response is discrete, or strictly positive. Additive error models often don’t make sense, for example, if the outcome has to be positive. Transformations, such as taking a cube root of a count outcome, are often hard to interpret. In addition, there’s value in modeling the data on the scale that it was collected. Particularly interpretable transformations, natural logarithms in specific, aren’t applicable for negative or zero values.
39 |
40 | The generalized linear model is a family of models that includes linear models. By extending the family, it handles many of the issues with linear models, but at the expense of some complexity and the loss of some mathematical tidiness. A GLM involves three components.
41 | - An exponential family model for the response.
42 | - A systematic component via a linear predictor.
43 | - A link function that connects the means of the response to the linear predictor.
44 |
45 | The three most famous cases of GLMs are: linear models, binomial and binary regression and Poisson regression. We’ll go through the GLM model specification and likelihood for all three.
46 |
47 | **Typical Approach**
48 | - Fitting a logistic regression to a dataset where we would like to predict if a transaction is fraud or not.
49 |
50 |
51 | As we can see, there are two columns that need to be changed to dummy variables. Use 1 for weekday and for True, and 0 otherwise.
52 | ```
53 | df['weekday'] = pd.get_dummies(df['day'])['weekday']
54 | df[['not_fraud','fraud']] = pd.get_dummies(df['fraud'])
55 |
56 | df = df.drop('not_fraud', axis=1)
57 | df.head(2)
58 | ```
59 |
60 |
61 | What proportion of transactions are fraudulent? What proportion occur on a weekday? And how does the average 'duration' differ by fraud status?
62 | ```
63 | print(df['fraud'].mean())
64 | print(df['weekday'].mean())
65 | print(df.groupby('fraud').mean()['duration'])
66 | ```
67 |
68 |
69 | Fit a logistic regression model to predict whether a transaction is fraud, using both 'weekday' and 'duration'. Don't forget an intercept! Instead of 'OLS', we use 'Logit' from statsmodels.
70 | ```
71 | import statsmodels.api as sm
72 | df['intercept'] = 1
73 | log_model = sm.Logit(df['fraud'], df[['intercept', 'weekday', 'duration']])
74 | result = log_model.fit()
75 | result.summary()
76 | ```
77 |
78 |
79 | Coeff-interpret: we need to exponentiate our coefficients before interpreting them.
80 | ```
81 | # np.exp(result.params)
82 | np.exp(2.5465)
83 | np.exp(-1.4637), 100/23.14
84 | ```
85 | 12.762357271496972, (0.23137858821179411, 4.32152117545376)
86 |
87 | >On weekdays, the chance of fraud is 12.76 (e^2.5465) times more likely than on weekends...holding 'duration' constant.
88 |
89 | >For each min less spent on the transaction, the chance of fraud is 4.32 times more likely...holding the 'weekday' constant.
90 |
91 | *Note: When an ordinal variable is encoded as numbers, convert it to a categorical (string) variable first, then check the counts:
92 | ```
93 | df['columns'].astype(str).value_counts()
94 | ```
95 |
96 | **Diagnostics**
97 | ```
98 | import numpy as np
99 | import pandas as pd
100 | from sklearn.linear_model import LogisticRegression
101 | from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score
102 | from sklearn.model_selection import train_test_split
103 | ```
104 | - __Confusion Matrix__
105 |   - Recall: 'reality' (out of all the items that are **truly positive**): TP / (TP + FN)
106 |   - Precision: 'argued' (out of all the items **labeled as positive**): TP / (TP + FP)
107 |
108 |
109 | - Next, it is useful to split your data into training and testing data to assure your model can predict well not only on the data it was fit to, but also on data the model has never seen before. Showing that the model performs well on test data gives confidence that it will do well in future use cases. Let's pull off X and y first. Create your test set as 10% of the data, and use a random state of 0.
110 | ```
111 | X = df[['intercept', 'weekday', 'duration']]
112 | y = df['fraud']
113 |
114 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.10, random_state=0)
115 | ```
116 | The usual steps are:
117 | - Instantiate
118 | - Fit (on train)
119 | - Predict (on test)
120 | - Score (compare predict to test)
121 | ```
122 | log_model = LogisticRegression()
123 | log_model.fit(X_train, y_train)
124 | pred = log_model.predict(X_test)
125 |
126 | print(accuracy_score(y_test, pred))
127 | print(recall_score(y_test, pred))
128 | print(precision_score(y_test, pred))
129 | confusion_matrix(y_test, pred)
130 | ```
131 | ROC Curve: the ideal case is for the curve to shoot all the way to the upper left-hand corner.
132 | ```
133 | from ggplot import *
134 | from sklearn.metrics import roc_curve, auc
135 |
136 | preds = log_model.predict_proba(X_test)[:,1]
137 | fpr, tpr, _ = roc_curve(y_test, preds)
138 | 
139 | roc_df = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
140 | ggplot(roc_df, aes(x='fpr', y='tpr')) + geom_line() + geom_abline(linetype='dashed')
141 | ```
142 | --------------------------------------------------------------------------------------------------------------------------------------
143 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
144 | ## (B) DecisionTree
145 | - For continuous and categorical data
146 | - For **non-binary** classification
147 |
148 | > PREDICTION: based on the features, we can guess the apps that the future users would download.
149 |
150 | - Unlike an SVM, which uses a kernel trick, a **DecisionTree** uses a trick that lets a linear decision surface do non-linear decision making.
151 | - When making Decision Trees, we ask questions: on which features do we make our decisions? What is the threshold for classifying each question into a yes or no answer? By adding additional questions, we can better separate the Yes and No classes!
152 |
153 |
154 |
155 | ```
156 | import matplotlib.pyplot as plt
157 | import numpy as np
158 | import pylab as pl
159 | import sys
160 | from class_vis import prettyPicture, output_image
161 | from prep_terrain_data import makeTerrainData
162 |
163 | features_train, labels_train, features_test, labels_test = makeTerrainData()
164 | ```
165 | We build two DecisionTree classifiers: one with `min_samples_split=2`, and the other with `min_samples_split=50`. What's the difference in accuracy? And how do we prevent **overfitting**?
166 |
167 |
168 |
169 | Store your predictions in lists named 'pred_2' and 'pred_50'.
170 | ```
171 | from sklearn import tree
172 |
173 | clf_2 = tree.DecisionTreeClassifier(min_samples_split=2)
174 | clf_50 = tree.DecisionTreeClassifier(min_samples_split=50)
175 |
176 | X = features_train
177 | y = labels_train
178 |
179 | clf_2.fit(X, y)
180 | clf_50.fit(X, y)
181 |
182 | pred_2 = clf_2.predict(features_test)
183 | pred_50 = clf_50.predict(features_test)
184 | ```
185 | Accuracy? Which is more accurate, clf_2 or clf_50? Well, min_samples_split=2 splits too aggressively: it overfits and gives lower test accuracy.
186 | ```
187 | from sklearn.metrics import accuracy_score
188 |
189 | acc_min_samples_split_2 = accuracy_score(pred_2, labels_test)
190 | acc_min_samples_split_50 = accuracy_score(pred_50, labels_test)
191 |
192 | def submitAccuracies():
193 | return {"acc_min_samples_split_2":round(acc_min_samples_split_2, 3),
194 | "acc_min_samples_split_50":round(acc_min_samples_split_50, 3)}
195 | ```
196 | ### DecisionTree & Entropy
197 | - **Entropy:** a measure of **impurity** in a bunch of examples; think of it as the opposite of purity.
198 | - **Entropy** controls how a DecisionTree decides **where to split the data** to make the resulting subsets as pure as possible.
199 | - For two classes, **Entropy** = `- proportion_a * log2(proportion_a) - proportion_b * log2(proportion_b)`
200 |
201 |
202 |
203 | # Entropy measures `variance` in categorical variables.
204 |
205 | If we have a categorical variable with two values (a, b) and p(a)=0.5, p(b)=0.5, then our entropy is
206 | ```
207 | import math
208 |
209 | -0.5*math.log(0.5, 2) -0.5*math.log(0.5, 2)
210 | ```
211 | Which is 1, the highest possible entropy for two classes (a maximally impure split).
212 |
213 | Our DecisionTree picks the split with the maximum **Information Gain**:
214 | - First, calculate the "Parent's Entropy".
215 | - Second, look at the possible splits that **each column** gives, and calculate each "Child Entropy".
216 | - Third, calculate each column's "Information Gain" and pick the largest (see the sketch below).
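As a concrete illustration of the three steps above, here is a minimal sketch (the helper functions are hypothetical, not part of the original project) that computes the information gain of a candidate split:
```
import numpy as np

def entropy(labels):
    # Entropy (in bits) of a list/array of class labels.
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * np.log2(p))

def information_gain(parent, left_child, right_child):
    # Parent entropy minus the size-weighted average of the child entropies.
    n = len(parent)
    weighted_children = (len(left_child) / n) * entropy(left_child) \
                      + (len(right_child) / n) * entropy(right_child)
    return entropy(parent) - weighted_children

# Example: a perfectly mixed parent split into two pure children.
parent = ['a', 'a', 'b', 'b']
print(entropy(parent))                                    # 1.0
print(information_gain(parent, ['a', 'a'], ['b', 'b']))   # 1.0
```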
217 |
218 |
219 |
220 |
221 | ### Hyperparameters for Decision Trees
222 | - `max_depth`
223 |   - the largest length from the root to a leaf. A tree of maximum depth k can have at most 2^k leaves (the very end of each branch).
224 |   - Of course, too large a depth very often causes overfitting.
225 | - `min_samples_leaf`(the very end)
226 | - a minimum for the number of samples we allow on each individual leaf.
227 |   - This number can be specified as an integer or as a float. If it's an integer, it's the minimum number of samples in a leaf. If it's a float, it's the minimum percentage of samples per leaf. For example, 0.1, or 10%, means a split will not be allowed if it would leave fewer than 10% of the samples in one of the resulting leaves.
228 |   - Of course, too small a minimum number of samples per leaf results in overfitting.
229 | - `min_samples_split`
230 |   - This is the same idea as the minimum number of samples per leaf, but applied to **any split** of a node.
231 | - `max_features`
232 |   - Oftentimes we have too many features to build a tree efficiently. To speed things up, we limit the number of features considered at each split (see the sketch after this list).
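A minimal sketch of how these hyperparameters might be passed to scikit-learn's `DecisionTreeClassifier` (the particular values are arbitrary, chosen only for illustration; `features_train`/`labels_train` are reused from the `makeTerrainData()` snippet above):
```
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier(max_depth=5,           # limit tree depth to fight overfitting
                             min_samples_leaf=0.1,  # each leaf keeps at least 10% of the samples
                             min_samples_split=20,  # a node needs 20+ samples before it may split
                             max_features='sqrt')   # consider sqrt(n_features) features per split

clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
```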
233 |
234 |
235 | ## RandomForests >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
236 |
237 |
238 | What if we have very many columns? Watch out for overfitting! How do we solve this? (See the sketch after this list.)
239 | - 1) Pick some of the columns randomly
240 | - 2) Build a DecisionTree in those columns
241 | - 3) repeat
242 | - 4) Let the trees vote !
243 |   - When we have a new data point, let all the trees make a prediction and pick the answer that appears most often.
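A minimal sketch of this idea with scikit-learn's `RandomForestClassifier` (again reusing the terrain data from above; the hyperparameter values are only illustrative):
```
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# n_estimators = number of trees that get to vote;
# max_features controls how many randomly-picked columns each split may consider.
forest = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=0)
forest.fit(features_train, labels_train)

pred = forest.predict(features_test)
print(accuracy_score(labels_test, pred))
```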
244 |
245 | ## EnsembleMethods >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
246 | Take a bunch of models and join them together to get a better model
247 | - **Bagging**(Bootstrap Aggregating): Check each results and combine(average out or vote).
248 | - **Boosting**: A bit more elaborate than Bagging. Try hard to exploit the strength of each model, then combine.
249 | > Bagging
250 | - As our data is huge, we don't want to train many models on the same data, so we take random subsets of the data and train a weak learner (e.g. a one-node DecisionTree) on each of these subsets.
251 | - Impose each result over the data and vote (if two or more of them say blue, it's blue).
252 |
253 |
254 | > Adaboosting
255 | - First, fit our first weak learner to maximize accuracy (or, equivalently, minimize the number of errors); in the running example it can do no better than **3 errors**. Then make the weights of those misclassified points bigger (punish them).
256 | - Our second learner needs to fix the **mistakes** the first one made, correctly classifying those points at any expense, and then punish the points it misclassifies itself.
257 | - Our third learner needs to fix the **mistakes** the second one made, correctly classifying those points at any expense, and then punish the points it misclassifies itself... We could go on and on, but let's say three is enough and we combine these learners.
258 | - OVERALL
259 |
260 |
261 | - DETAIL
262 |   - Assign each point an initial weight of '1'. Fit the first learner to minimize the weighted error, then rescale the weights of the misclassified points by `correct_sum/incorrect_sum`, which makes the total weight of the correct points and the total weight of the incorrect points equal.
263 |   - Keep this going for each subsequent learner.
264 |   - Here, notice that `correct_sum/incorrect_sum` = `accuracy/(1-accuracy)`; we put it into `ln( )`, which gives the learner's final (voting) weight. See the numeric sketch below.
265 |   - For a new point, our concern is whether the sum of these final weights is `+` or `-`.
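A tiny numeric sketch of the weight bookkeeping described above (the counts are made up purely for illustration):
```
import math

# Suppose a weak learner classifies points carrying a total weight of 7 correctly
# and a total weight of 1 incorrectly (hypothetical numbers).
correct_sum, incorrect_sum = 7.0, 1.0
accuracy = correct_sum / (correct_sum + incorrect_sum)    # 0.875

# Misclassified points get their weights multiplied by correct_sum/incorrect_sum,
# so correct and incorrect points end up carrying the same total weight.
rescale_factor = correct_sum / incorrect_sum              # 7.0
print(rescale_factor, accuracy / (1 - accuracy))          # both are 7.0

# The learner's final (voting) weight is the natural log of that ratio.
learner_weight = math.log(accuracy / (1 - accuracy))      # ln(7) ~ 1.95
print(learner_weight)
```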
266 |
267 |
268 |
269 | ```
270 | from sklearn.ensemble import AdaBoostClassifier
271 | model = AdaBoostClassifier()
272 | model.fit(x_train, y_train)
273 | model.predict(x_test)
274 | ```
275 | When we define the model, we can specify the **hyperparameters**. In practice, the most common ones are:
276 | - `base_estimator`: The model (here, a DecisionTree) utilized for the weak learners.
277 | - `n_estimators`: The maximum number of weak learners used.
278 | ```
279 | from sklearn.tree import DecisionTreeClassifier
280 | model = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth=2), n_estimators = 4)
281 | ```
282 |
283 | > Decision Tree Regression
284 | - `Predicted output = average of the training examples in the leaf's subset` (see the sketch below)
285 | - It requires a splitting criterion other than classification entropy (for example, the variance of the values in a node)
286 | - We can even use **linear regression at the leaves**!!
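A minimal sketch of a regression tree in scikit-learn, where each leaf predicts the average of its training targets (the toy data is made up):
```
import numpy as np
from sklearn.tree import DecisionTreeRegressor

# Toy 1-D data: two clusters of x values with matching y values.
X = np.array([[1.0], [2.0], [3.0], [10.0], [11.0], [12.0]])
y = np.array([1.1, 1.9, 3.2, 10.2, 10.9, 12.1])

reg = DecisionTreeRegressor(max_depth=1)   # a single split -> two leaves
reg.fit(X, y)

# Each prediction is the mean of the training targets that fell into the same leaf.
print(reg.predict([[2.5], [11.0]]))        # ~[mean(1.1, 1.9, 3.2), mean(10.2, 10.9, 12.1)]
```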
287 |
288 |
289 | --------------------------------------------------------------------------------------------------------------------------------------
290 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
291 | ## (C) Naive Bayes
292 | - For categorical and continuous data
293 | - For continuous features, you build a probability distribution from the data and plug the new data point in.
294 | - `In a contingency table`, we have rows (features) and columns (event/non-event). First, we need to know whether the features are independent of each other. Rows and columns are not independent: they are related through the conditional probability `P(E|A)`. But when there are many features, `P(E|A,B,C,D,...)` becomes hard to estimate directly, and this is where **Naive_Bayes** comes into play: assuming the features are independent, multiplication gives the **joint probability** of all of them.
295 |
296 |
297 | > PREDICTION: when future emails come, we can combine these features to guess if they are spam or not.
298 |
299 |
300 | - Naive Bayes is an extension of the Bayes Theorem to more than one feature, with the assumption that each feature is independent of the others; hence 'Naive'.
301 | - Library: sklearn.naive_bayes (Gaussian)
302 | - Example: Compute the accuracy of your Naive Bayes classifier. Accuracy is defined as the number of test points that are classified correctly divided by the total number of test points.
303 | ```
304 | def NBAccuracy(features_train, labels_train, features_test, labels_test):
305 | from sklearn.naive_bayes import GaussianNB
306 | clf = GaussianNB() ### create classifier ###
307 | clf.fit(features_train, labels_train) ### fit the classifier on the training features and labels ###
308 | pred = clf.predict(features_test) ### use the trained classifier to predict labels for the test features ###
309 |
310 |     ### calculate and return the accuracy on the test data ###
311 |     accuracy = clf.score(features_test, labels_test)
312 | 
313 |     ### or, equivalently, use sklearn's accuracy_score ###
314 |     # from sklearn.metrics import accuracy_score
315 |     # accuracy = accuracy_score(labels_test, pred)
316 |     return(accuracy)
317 | ```
318 | It gives an accuracy of 88.4%, which means 88.4% of the test-set points are correctly labelled by our classifier 'clf'!
319 |
320 | > Bayes Rule:
321 |
322 |
323 | *Semantically, what Bayes' rule does is **incorporate** some evidence from a test into our **prior** to arrive at a **posterior**.
324 | - Prior: probability before running the test.
325 | - Test evidence: the new information the test provides.
326 | - Posterior: probability after incorporating the test evidence.
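A tiny worked example of Bayes' rule with made-up numbers (the 1% spam prior is purely illustrative):
```
# Hypothetical numbers, for illustration only.
p_spam = 0.01                 # prior: probability an email is spam
p_word_given_spam = 0.9       # test evidence: P('free' appears | spam)
p_word_given_ham = 0.05       # P('free' appears | not spam)

# Total probability of seeing the word at all (the normalizer).
p_word = p_word_given_spam * p_spam + p_word_given_ham * (1 - p_spam)

# Posterior: probability of spam after seeing the evidence.
p_spam_given_word = p_word_given_spam * p_spam / p_word
print(round(p_spam_given_word, 3))   # ~0.154
```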
327 |
328 |
329 | *Algorithm of Naive Bayes
330 |
331 |
332 | ### Ex) Text Forensic and Learning (ex. Whose email would it be ?)
333 |
334 |
335 | ### Ex) Multiple Evidences(test results)
336 |
337 |
338 | Spam detection is one of the major applications of Machine Learning on the web today. Pretty much all of the major email service providers have spam detection systems built in that automatically classify such mail as 'Junk Mail'.
339 |
340 | > What are spammy messages?
341 | Usually they have words like 'free', 'win', 'winner', 'cash', 'prize' and the like in them, as these texts are designed to catch your eye and in some sense tempt you to open them. Also, spam messages tend to have words written in all capitals and also tend to use a lot of exclamation marks. To the recipient, it is usually pretty straightforward to identify a spam text, and our objective here is to train a model to do that for us! Being able to identify spam messages is a **binary classification** problem, as messages are classified as either 'Spam' or 'Not Spam' and nothing else. This project has been broken down into the following steps (a minimal scikit-learn sketch follows the list):
342 | - Step 0: Introduction to the Naive Bayes Theorem
343 | - Step 1.1: Understanding our dataset
344 | - Step 1.2: Data Preprocessing
345 | - Step 2.1: Bag of Words(BoW)
346 | - Step 2.2: Implementing BoW from scratch
347 | - Step 2.3: Implementing Bag of Words in scikit-learn
348 | - Step 3.1: Training and testing sets
349 | - Step 3.2: Applying Bag of Words processing to our dataset.
350 | - Step 4.1: Bayes Theorem implementation from scratch
351 | - Step 4.2: Naive Bayes implementation from scratch
352 | - Step 5: Naive Bayes implementation using scikit-learn
353 | - Step 6: Evaluating our model
354 | - Step 7: Conclusion
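A minimal end-to-end sketch of steps 2.3 and 5 using `CountVectorizer` and `MultinomialNB` (the four example messages are made up):
```
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Made-up toy messages; 1 = spam, 0 = not spam.
messages = ['WINNER! Free cash prize!!',
            'Are we still on for lunch today?',
            'Claim your free prize now',
            'See you at the meeting tomorrow']
labels = [1, 0, 1, 0]

# Bag of Words: each message becomes a vector of word counts.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(messages)

# Multinomial Naive Bayes works directly on these count vectors.
clf = MultinomialNB()
clf.fit(X, labels)

print(clf.predict(vectorizer.transform(['free prize winner'])))   # likely [1]
print(clf.predict(vectorizer.transform(['lunch tomorrow?'])))     # likely [0]
```
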
355 | > Curse of Dimensionality
356 | - As the number of descriptive **features grows**, the number of potential **conditioning events grows**. Consequently, an exponential increase is required in the `size of the dataset` as each new descriptive feature is added to ensure that for any conditional probability there are enough instances in the training dataset matching the conditions so that the resulting probability is reasonable.
357 |
358 |
359 |
360 | --------------------------------------------------------------------------------------------------------------------------------------
361 | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
362 | ## (D) Support Vector Machine
363 | - For categorical data
364 | - For numeric data
365 | > PREDICTION: face detection, handwriting recognition, time series, stock value prediction
366 |
367 |
368 | SVM is a set of supervised learning methods used for
369 | - classification
370 | - regression
371 | - **outlier detection**
372 | SVMs don't work well with lots and lots of noise; when the classes are heavily overlapping, you have to count independent evidence instead.
373 |
374 | It is not a probabilistic model; i.e., it does not postulate a probability distribution and thus does not assume any randomness. It merely tries to draw a simple line(or plane or hyperplane in higher dimensions) to separate the data points into two parts. That's all. Note that the dataset contains labeled data.
375 |
376 | One difficulty was that oftentimes the classifier(the separating 'line' or 'hyperplane') cannot be defined linearly, which means it's not actually a straight line or plane that separates the two sets. It should rather be a wavy curve or surface. So what do we do? We lift the feature space to a higher or possibly an infinite dimensional space so that a linear classifier is possible. This is called the kernel trick. This is what the support vector machine does.
377 |
378 | Now applying this to a regression problem, linear regression could be described as an attempt to draw a line (or similarly plane or hyperplane in higher dimensions) that minimizes the error(or the loss function). Therefore, if we choose different loss functions, the regression line(or plane, hyperplane) changes. When the feature space seemingly isn't best served by a simple line or plane but rather calls for something wavy as seen in the classification problem, instead of approximating the wavy object, we again use the kernel trick to lift the feature space into a higher dimension. In this task, the output is a real value.
379 |
380 | ### In SVM, tuning the parameters can be a lot of work, but GridSearchCV, a great sklearn tool, can find a good parameter combination almost automatically (a minimal sketch follows).
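A minimal GridSearchCV sketch over `kernel`, `C`, and `gamma` (the grid values are illustrative, and the toy data is generated only so the sketch runs end to end):
```
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Toy data so the sketch is self-contained.
X, y = make_classification(n_samples=200, n_features=4, random_state=0)

# Illustrative grid; in practice, choose ranges that suit your problem.
param_grid = {'kernel': ['linear', 'rbf'],
              'C': [0.1, 1, 10],
              'gamma': ['scale', 0.1, 1]}

grid = GridSearchCV(SVC(), param_grid, cv=5)
grid.fit(X, y)

print(grid.best_params_)
print(grid.best_score_)
```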
381 |
382 | Naive Bayes is great for 'text'. It’s faster and generally gives better performance than an SVM. Of course, there are plenty of other problems where an SVM might work better. Knowing which one to try when you’re tackling a problem for the first time is part of the art of ML.
383 |
384 | Pros & Cons
385 | - > The advantages of support vector machines are:
386 | - Effective in cases where number of dimensions is greater than the number of samples.
387 | - Uses a subset of training points in the decision function called `support vectors`, so it is also memory efficient.
388 | - Versatile: different **Kernel functions** can be specified for the decision function(Common kernels are provided, but it is also possible to specify custom kernels).
389 | - Using a **kernel trick**, Linear DecisionSurf -> NonLinear DecisionSurf
390 |
391 | - > The disadvantages of support vector machines include:
392 |   - If the number of features is much greater than the number of samples, avoiding **over-fitting** by choosing the Kernel function and the **regularization term** carefully is crucial.
393 |   - SVMs do not directly provide probability estimates; these are calculated using an expensive **five-fold cross-validation**.
394 |
395 |
396 | The margin is the distance from the separating line to the nearest points. The separating line should be as robust as possible to classification errors, so we maximize the margin to maximize the robustness of the result: as much separation between the two classes as possible.
397 | - The perceptron algorithm is a trick in which we start with a random line and iterate, taking a step each time to slowly walk the line towards the misclassified points so we can classify them correctly. However, we can also see this as an algorithm that minimizes an error function.
398 |
399 |
400 | Error (Margin Error + Classification Error)
401 | - We punish a small margin (just like punishing model complexity in the L2 regularization of a linear model).
402 | - We want to minimize the total error (or error function)
403 |
404 |
405 |
406 | ```
407 | import matplotlib.pyplot as plt
408 | import numpy as np
409 | import pylab as pl
410 | import copy
411 |
412 | import sys
413 | from class_vis import prettyPicture
414 | from prep_terrain_data import makeTerrainData
415 |
416 | features_train, labels_train, features_test, labels_test = makeTerrainData()
417 | ```
418 | In sklearn.svm, `SVC()`, `NuSVC()`, `LinearSVC()` accept slightly different sets of parameters and have different mathematical formulations, but take as input two arrays:
419 | - an array **X** of size `[n_samples, n_features]`holding the training samples
420 | - an array **y** of class labels (strings or integers), size `[n_samples]`
421 | - Library: sklearn.svm
422 | - Example:
423 | ```
424 | from sklearn.svm import SVC
425 | clf = SVC(kernel="linear")      # pick one kernel; the alternatives below stay commented out
426 | # clf = SVC(kernel='poly', degree=4, C=0.1) #
427 | # clf = SVC(kernel='rbf', gamma= ) #
428 |
429 | X = features_train
430 | y = labels_train
431 | clf.fit(X, y)
432 |
433 | pred = clf.predict(features_test)
434 | ```
435 | Accuracy ?
436 | ```
437 | from sklearn.metrics import accuracy_score
438 | acc = accuracy_score(pred, labels_test)
439 |
440 | def submitAccuracy():
441 | return acc
442 | ```
443 | ## Non-Linear SVM
444 |
445 |
446 | Introducing a new feature 'Z' (a transformed X or Y) gives rise to a 'hyperplane.' Z is non-negative because it is a distance from the origin.
447 |
448 |
449 | Introducing the polynomial toolbox: select terms to create the decision surface.
450 |
451 |
452 | **Kernel Trick:** There are functions that take the low-dimensional 'input space' plus the added 'feature space' and map them to a very high-dimensional space: the kernel functions (linear, poly, rbf, sigmoid). The separation is made in that high-dimensional space, and then the solution is mapped back to the original space. This lets us separate the dataset even where the dividing line is non-linear.
453 |
454 |
455 | rbf (radial basis func) kernel:
456 | - hill & valley
457 |   - Find a height at which a plane intersects the mountain range and project every point down; the cut gives us the boundary. But how do we build the mountain range, and how do we put the red points in the highlands and the blue points in the lowlands?
458 |
459 | parameters (degree, C, Gamma)
460 | - **C:** The 'gamma' parameter actually has no effect on the 'linear' kernel for SVMs. The key parameter for 'linear kernel function' is "C". The C parameter **trades off misclassification of training examples against simplicity of the decision surface**. A low C makes the decision surface smooth, while a high C aims at classifying all training examples correctly by giving the model freedom to select more samples as support vectors - wiggling around individual data pt...
461 | - C is a constant that attaches itself to the classification error. If we have large C, then the error is mostly the classification error so we focus more on correctly classifying all the points than in finding a good margin. When C is small, the error is mostly a margin error.
462 |
463 |
464 | - **Gamma:** This parameter in **rbf** defines **how far the influence of a single data point reaches**, with low values (a wide mountain) meaning 'far' and high values (a pointy mountain) meaning 'close'. The gamma parameter can be seen as the inverse of the radius of influence of the samples the model selects as support vectors. A high gamma only pays attention to what is right in front of it (see the sketch below).
465 | - When gamma is very small, the model is too constrained and cannot capture the complexity or “shape” of the data. The region of influence of any selected support vector would include the whole training set. The resulting model will behave similarly to a linear model with a set of hyperplanes that separate the centers of high density of any pair of two classes. If gamma is too large, the radius of the area of influence of the support vectors only includes the support vector itself and no amount of regularization with C will be able to prevent overfitting.
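A small sketch of the gamma effect described above (toy data; the gamma values are arbitrary): a very large gamma tends to memorize the training set while doing worse on held-out data.
```
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Toy non-linear data.
X, y = make_moons(n_samples=300, noise=0.3, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

for gamma in [0.1, 1, 100]:                        # small to large (arbitrary values)
    clf = SVC(kernel='rbf', gamma=gamma).fit(X_train, y_train)
    print(gamma,
          round(clf.score(X_train, y_train), 3),   # training accuracy
          round(clf.score(X_test, y_test), 3))     # test accuracy
```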
466 |
467 |
468 | ## SV-Regression
469 | Support Vector Machines are a very specific class of algorithms, characterized by:
470 | - **usage of kernels**,
471 | - **absence of local minima**,
472 | - **sparseness of the solution**
473 | - **capacity control** obtained by acting on the margin, or on number of support vectors, etc.
474 |
475 | > Intuitively, as all regressors do, it tries to fit a line to the data by minimising a cost function. However, the interesting part about SVR is that you can deploy a non-linear kernel, giving a **non-linear regression**. The model is represented as a combination of the training points rather than as a function of the features and some weights.
476 |
477 | It can be applied not only to classification problems but also to the case of regression. Still it contains all the main features that characterize `maximum margin algorithm`:
478 | - A **non-linear function** is learned by a linear learning machine mapping into a high-dimensional, kernel-induced feature space.
479 | - The **capacity of the system** is controlled by parameters that do not depend on the dimensionality of feature space.
480 | - In the same way as with the classification approach, there is motivation to seek and optimize the generalization bounds given for regression. These rely on defining a loss function that ignores errors situated within a certain distance of the true value. This type of function is often called an epsilon-insensitive loss function. The (slack) variables measure the cost of the errors on the training points; these are zero for all points inside the band (margin).
481 |
482 | > One of the most important ideas in both the Support Vector Classification and Regression cases is that presenting the solution by means of a **small subset of training points gives enormous computational advantages**. Using the **epsilon-insensitive loss function**, we ensure the existence of a global minimum and, at the same time, the optimization of a reliable generalization bound.
483 |
484 | **Under the condition that:**
485 | - All examples are classified correctly...(for Classification)
486 | - The value y of all examples deviates less than **ϵ** from f(x)...(for Regression)
487 | - Support vectors: these are the **data points** closest to the margin; their distance to it is the smallest.
488 |
489 | __Classification__
490 | - **Goal:** Find a function `f(x)=wx+b` where `f(x)≥1` for **positive examples** and `f(x)≤−1` for **negative examples**.
491 | - **HOW:** `Maximise the margin`, which is nothing more than **minimising** the derivative of `f′` which is **`w`**. Kill the slope.
492 | - **Intuition:** `Maximising the margin` means that this will give us a unique solution to the problem of finding `f(x)=wx+b`. In SVM, we actually try to separate the classes as far as possible from the decision surface and, unlike logistic regression, we create a `safety boundary` (margin) on both sides of the decision surface (the difference between logistic regression and SVM classification lies in their loss functions).
493 |
494 | In simple regression we try to minimise the error rate while in SVR we try to **`fit the error within a certain threshold`**.
495 |
496 | __Regression__
497 | - **Goal:** Find a function `f(x)=wx+b` such that `f(x)` stays within a required **`ϵ`** deviation, i.e. `|y−f(x)|≤ϵ`, where **`ϵ`** is the half-width of the margin. We place a boundary at distance **`ϵ`** on each side of the decision surface such that the data points closest to the decision surface (the support vectors) lie within that boundary.
498 | - **HOW:** `Maximise the margin`, which again means **minimising** `f′(x)=w`, both for **regularisation** and to obtain a unique solution as the result of the convex optimisation problem. One can see how minimising `w` results in a more `general case`, because the extreme value `w=0` would mean no functional relation at all, which is the most general result one can obtain from the data.
499 | - **Intuition:** We want to fit a model to predict a quantity in the future. Therefore, **we want the data points (observations) to be as close as possible to the hyperplane, unlike in SVM classification**. SVM regression inherits from simple OLS regression, with the difference that we define an **`ϵ`** range on both sides of the hyperplane to make the regression function insensitive to errors within it (unlike SVM classification, where we define a boundary in order to be safe when making predictions). Eventually, SVM regression has a boundary just like SVM classification, but the regression boundary exists `to make the regression function insensitive w.r.t. the error`, while the classification boundary exists only to stay as far from the hyperplane as possible in order to distinguish between the classes in the future (that is why we call it a safety margin). A minimal SVR sketch follows.
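A minimal scikit-learn SVR sketch showing the `epsilon` (insensitive band) and `C` parameters (toy data, illustrative values):
```
import numpy as np
from sklearn.svm import SVR

# Toy 1-D regression data: y = sin(x) plus a little noise.
rng = np.random.RandomState(0)
X = np.sort(5 * rng.rand(80, 1), axis=0)
y = np.sin(X).ravel() + 0.1 * rng.randn(80)

# epsilon is the half-width of the insensitive band: errors inside it cost nothing.
reg = SVR(kernel='rbf', C=1.0, epsilon=0.1)
reg.fit(X, y)

print(reg.predict([[1.0], [3.0]]))
print(len(reg.support_))   # points strictly inside the band do not become support vectors
```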
500 |
501 |
502 | -------------------------------------------------------------------------------------------------------------------------------------
503 | -------------------------------------------------------------------------------------------------------------------------------------
504 | -------------------------------------------------------------------------------------------------------------------------------------
505 | ## (E) Perceptron Algorithm
506 | Why do we need NN? Because SO MANY FEATURESSSSS !!!
507 |
508 | ### `Each "NODE" in a NN is a linear model (classifier) over many features. Its weights (parameters) are updated by an optimization algorithm such as Gradient Descent that minimizes a cost function ("MSE" for the regression form, "CrossEntropy" for the logistic-regression form). This linear model then becomes the input of an activation function (step for discrete output, sigmoid for continuous output) that returns a classification or probability for each data point.`
509 |
510 |
511 |
512 | - For classification (Y/N)
513 | - For regression (numbers)
514 | - This is a two-model example. Here we simply go with a **linear** model rather than a quadratic one.
515 |
516 |
517 | - The model has 'input data-pt', 'weights', 'bias'
518 |
519 |
520 | > As the simplest Neural Network, a Perceptron is a combination of nodes (here, `Step_func(LinearModel)`, because it simply outputs 1 or 0).
521 |
522 | ## What does the perceptron look like?
523 |
524 |
525 | > A single node (other than the input nodes) is made out of
526 | - Multiple Linear Model
527 | - Activation function
528 | - the single output
529 |
530 | ### Two Algorithms
531 | - They are about the automatic process for model improvement **by adjusting parameters(W, b) of the Multiple Linear Model**.
532 | - Take your data
533 | - **Pick a random decision surface**
534 | - Calculate the error
535 | - **Minimize the Error-Function, and obtain a better surface**
536 |
537 |
538 | - **Perceptron Algorithm**:
539 |   - Focus on the misclassified points (they give the direction to move).
540 |   - Because of the Step function, y and y_hat take the values `0 / 1`.
541 |   - Thus, the difference between **y** and **y_hat** is `1`, `-1`, or `0`. We want it as close to `0` as possible.
542 | - **Gradient_Descent Algorithm**:
543 |   - Focus on the gradient of the Error-Function (every point contributes).
544 | - Because of the Sigmoid function, y and y_hat have the value of `probability`.
545 |   - Thus, the difference between **y** and **y_hat** is a value between `-1` and `1`. We want it as close to `0` as possible.
546 |   - Or, instead of `y - y_hat`, use the **derivative** of the cost function (such as **`MSE`**), because at the end of the day both have the goal of **minimization**!
547 |
548 |
549 | # We want to improve our model!
550 | ## 1> Perceptron Trick(with Step function)
551 | - Now that we know which points are misclassified, we want the line to move closer to them. How do we modify the equation of the line so that it comes closer to a particular point?
552 | - Here is an example. We need to repeat this until the point becomes well-classified (for the blue point, we need to repeat 10 times).
553 |
554 |
555 |
556 | > Example
557 |
558 |
559 | Recall that the perceptron step works as follows. For a **point** with coordinates (p,q), label y, and prediction given by `y_hat = step(w1*p + w2*q + b)`:
560 |
561 |
562 |
563 | ```
564 | import numpy as np
565 | np.random.seed(42)
566 |
567 | def stepFunction(t):
568 | if t >= 0:
569 | return 1
570 | return 0
571 |
572 | def prediction(X, W, b):
573 | return stepFunction((np.matmul(X,W)+b)[0])
574 | ```
575 | The function should receive as inputs the data **X**, the labels **y**, the weights **W** (as an array), and the bias **b**.
576 | Update W, b, according to the perceptron algorithm, and return W and b.
577 | ```
578 | def perceptronStep(X, y, W, b, learn_rate = 0.01):
579 | for i in range(len(X)):
580 |         y_hat = prediction(X[i],W,b)     ## predicted label for point i
581 | if y[i]-y_hat == 1: ## FalseNegative
582 | W[0] += learn_rate*X[i][0]
583 | W[1] += learn_rate*X[i][1]
584 | b += learn_rate
585 | elif y[i]-y_hat == -1: ## FalsePositive
586 | W[0] -= learn_rate*X[i][0]
587 | W[1] -= learn_rate*X[i][1]
588 | b -= learn_rate
589 | return W, b
590 | ```
591 | This function runs the perceptron algorithm repeatedly on the dataset, and returns a few of the boundary lines obtained in the iterations for plotting purposes.
592 | > Play with the **learning rate** and the **num_epochs**.
593 | - 'boundary_lines' are the solution lines that get plotted below.
594 | - In each epoch, we apply the perceptron step.
595 |
596 |
597 |
598 | ```
599 | def trainPerceptronAlgorithm(X, y, learn_rate = 0.01, num_epochs = 25):
600 | x_min, x_max = min(X.T[0]), max(X.T[0])
601 | y_min, y_max = min(X.T[1]), max(X.T[1])
602 | W = np.array(np.random.rand(2,1))
603 | b = np.random.rand(1)[0] + x_max
604 | boundary_lines = []
605 | for i in range(num_epochs):
606 | W, b = perceptronStep(X, y, W, b, learn_rate)
607 | boundary_lines.append((-W[0]/W[1], -b/W[1]))
608 | return boundary_lines
609 | ```
610 |
611 |
612 | # We want to improve our model!
613 | ## 2> Gradient Descent Trick(with Sigmoid function)
614 | Move from the discrete to the continuous!
615 |
616 |
617 | ### > MaximumLikelihood and Error Function
618 | Let's say we want to calculate the **probability** that the four points are of the **colors that they actually are**. We assume the colors of the points are independent events; then the probability of the **`whole arrangement`** is the product of the probabilities of the four points. If the models are given by these probability spaces, then the **probability that the points have these colors** offers a clue about which model is better.
619 |
620 |
621 | > Q. So how to maximize the probability?
622 |
623 | > Q. So how to minimize the Error-Function?
624 |
625 | > Q. Can we obtain an Error-Function from the probability? Can a maximized probability yield a minimized Error-Function?
626 |
627 | > In general, an Error-Function tells us how far we are from the solution(it's a distance).
628 | > - It should be continuous!
629 | > - It should be differentiable! (just like minimizing SSE in linear model.)
630 |
631 |
632 | > Q. **What if the number of data points is astronomical?** Then taking products is not a good idea. We again need a log function that turns products into sums... and remember, when the inputs range from 0 to 1 (because they are probabilities), the logarithm is negative, so we take the negative of the sum of logs. This is the **Entropy** function.
633 |
634 | ### > Cross Entropy
635 | As always, we don't like a huge entropy value..
636 |
637 | Here we define the **Error-Function** using Cross-Entropy. If I have a bunch of events and probabilities, Cross-Entropy says **how likely those events are, given the probabilities**. If they are highly likely, then we have a small Cross-Entropy. If they are unlikely, we have a large Cross-Entropy.
638 | - A good model gives a low cross-entropy and a bad model gives a high cross-entropy. So our goal has changed:
639 | - **Minimize the Cross Entropy = Maximize the probability**
640 |
641 |
642 |
643 |
644 | ```
645 | def cross_entropy(Y, P):
646 | Y = np.float_(Y)
647 | P = np.float_(P)
648 | return(-np.sum(Y*np.log(P) + (1-Y)*np.log(1-P)))
649 | ```
650 |
651 |
652 | ### > Multiclass Cross-Entropy
653 | - 1. **Softmax Function**: What if the problem has 3 or more classes? How do we turn all the scores (WX+b) into probabilities?
654 |   - Note that scores can often be negative, while probabilities cannot. `exp()` turns the scores into positive values.
655 |
656 |
657 | Let's say we have 'n' classes and our linear model (WX+b) gives us the scores Z_1...Z_n, one score for each class. Let's turn them into probabilities. The function takes as input a list of numbers (scores) and returns the list of values (probabilities) given by the softmax function.
658 | ```
659 | def softmax(L):
660 | expL = np.exp(L)
661 | S_expL = sum(expL)
662 | result=[]
663 | for i in expL:
664 | result.append(i/S_expL)
665 | return(result)
666 |
667 | def softmax(L):
668 | expL = np.exp(L)
669 | return(np.divide(expL, sum(expL)))
670 | ```
671 | - 2. **One hot encoding**: What if some input data is not numerical?
672 |
673 |
674 | - 3. formula
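Since the formula itself appears in a figure that is not reproduced here, this is a sketch of the multiclass cross-entropy it presumably refers to, assuming one-hot labels `Y` and predicted probabilities `P` of shape (n_samples, n_classes):
```
import numpy as np

def multiclass_cross_entropy(Y, P):
    # Y: one-hot labels, P: predicted probabilities, both (n_samples, n_classes).
    Y = np.asarray(Y, dtype=float)
    P = np.asarray(P, dtype=float)
    return -np.sum(Y * np.log(P))

# One sample from each of three classes, with made-up predicted probabilities.
Y = [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
P = [[0.7, 0.2, 0.1], [0.1, 0.8, 0.1], [0.2, 0.2, 0.6]]
print(multiclass_cross_entropy(Y, P))   # -(ln 0.7 + ln 0.8 + ln 0.6) ~ 1.09
```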
675 |
676 |
677 | ### Again, Cross Entropy is a connection between probabilities and error functions. We define our Error Function, using 'Cross Entropy'.
678 | - Error = each element of Cross Entropy `-ln(p)` or `-ln(q)`
679 | - What we need is just to minimize the Error-Function (minimize `Cross_Entropy / n`) !
680 |
681 |
682 | ### > Minimization of our Error-Function
683 | How to minimize the Error?
684 | - knowing the current inputs into the model(the current weight and bias), the derivatives of the loss function tell us which direction to nudge W and b in order to minimize the error.
685 |
686 |
687 |
688 | > Example(Binary Classification)
689 | - Implementing the functions that build the gradient descent algorithm:
690 | - **sigmoid**: The sigmoid activation function.
691 | - **output_formula**: The formula for the prediction.
692 | - **error_formula**: The formula for the error at a point.
693 | - **update_weights**: The function that updates the parameters with one gradient descent step.
694 |
695 |
696 | - Some helper functions for plotting and drawing lines.
697 | ```
698 | import matplotlib.pyplot as plt
699 | import numpy as np
700 | import pandas as pd
701 |
702 | def plot_points(X, y):
703 | admitted = X[np.argwhere(y==1)]
704 | rejected = X[np.argwhere(y==0)]
705 | plt.scatter([s[0][0] for s in rejected], [s[0][1] for s in rejected], s = 25, color = 'blue', edgecolor = 'k')
706 | plt.scatter([s[0][0] for s in admitted], [s[0][1] for s in admitted], s = 25, color = 'red', edgecolor = 'k')
707 |
708 | def display(m, b, color='g--'):
709 | plt.xlim(-0.05,1.05)
710 | plt.ylim(-0.05,1.05)
711 | x = np.arange(-10, 10, 0.1)
712 | plt.plot(x, m*x+b, color)
713 | ```
714 |
715 |
716 | ```
717 | # Activation (sigmoid) function
718 | def sigmoid(x):
719 | return(1 / (1 + np.exp(-x)))
720 |
721 | # Output (prediction) formula
722 | def output_formula(features, weights, bias):
723 | return(sigmoid(np.dot(features, weights) + bias))
724 |
725 | # Error (log-loss) formula
726 | def error_formula(y, output):
727 | return(- y*np.log(output) - (1-y)*np.log(1-output))
728 |
729 | # Gradient descent step
730 | def update_weights(x, y, weights, bias, learnrate):
731 | output = output_formula(x, weights, bias)
732 | d_error = -(y - output)
733 | weights -= learnrate * d_error * x
734 | bias -= learnrate * d_error
735 | return(weights, bias)
736 | ```
737 | - Iterate the gradient descent algorithm through all the data, for a number of epochs. Plot the data, and obtain some of the boundary lines as we run the algorithm.
738 | ```
739 | np.random.seed(44)
740 |
741 | epochs = 100
742 | learnrate = 0.01
743 |
744 | def train(features, targets, epochs, learnrate, graph_lines=False):
745 |
746 | errors = []
747 | n_records, n_features = features.shape
748 | last_loss = None
749 | weights = np.random.normal(scale=1 / n_features**.5, size=n_features)
750 | bias = 0
751 | for e in range(epochs):
752 | del_w = np.zeros(weights.shape)
753 | for x, y in zip(features, targets):
754 | output = output_formula(x, weights, bias)
755 | error = error_formula(y, output)
756 | weights, bias = update_weights(x, y, weights, bias, learnrate)
757 |
758 | # Printing out the log-loss error on the training set
759 | out = output_formula(features, weights, bias)
760 | loss = np.mean(error_formula(targets, out))
761 | errors.append(loss)
762 | if e % (epochs / 10) == 0:
763 | print("\n========== Epoch", e,"==========")
764 | if last_loss and last_loss < loss:
765 | print("Train loss: ", loss, " WARNING - Loss Increasing")
766 | else:
767 | print("Train loss: ", loss)
768 | last_loss = loss
769 | predictions = out > 0.5
770 | accuracy = np.mean(predictions == targets)
771 | print("Accuracy: ", accuracy)
772 | if graph_lines and e % (epochs / 100) == 0:
773 | display(-weights[0]/weights[1], -bias/weights[1])
774 |
775 |
776 | # Plotting the solution boundary
777 | plt.title("Solution boundary")
778 | display(-weights[0]/weights[1], -bias/weights[1], 'black')
779 |
780 | # Plotting the data
781 | plot_points(features, targets)
782 | plt.show()
783 |
784 | # Plotting the error
785 | plt.title("Error Plot")
786 | plt.xlabel('Number of epochs')
787 | plt.ylabel('Error')
788 | plt.plot(errors)
789 | plt.show()
790 | ```
791 | - Train the algorithm!
792 | - 10 updates with the current training loss and accuracy.
793 | - A plot of the data and some of the boundary lines obtained. The final one is in black. The lines get closer and closer to the best fit, as we go through more epochs.
794 | - A plot of the error function, which decreases as we go through more epochs.
795 |
796 |
797 |
798 |
799 |
--------------------------------------------------------------------------------