├── .gitignore ├── README.md ├── choose_your_own ├── class_vis.py ├── class_vis.pyc ├── prep_terrain_data.py ├── prep_terrain_data.pyc ├── test.png └── your_algorithm.py ├── datasets_questions └── explore_enron_data.py ├── decision_tree └── dt_author_id.py ├── evaluation └── evaluate_poi_identifier.py ├── feature_selection ├── email_authors.pkl ├── email_authors_overfit.pkl ├── find_signature.py ├── word_data.pkl └── word_data_overfit.pkl ├── k_means ├── clusters.pdf ├── clusters_before_scaling.pdf └── k_means_cluster.py ├── naive_bayes └── nb_author_id.py ├── outliers ├── enron_outliers.py ├── outlier_cleaner.py ├── outlier_removal_regression.py ├── practice_outliers_ages.pkl └── practice_outliers_net_worths.pkl ├── pca └── eigenfaces.py ├── regression └── finance_regression.py ├── svm └── svm_author_id.py ├── text_learning ├── from_chris.txt ├── from_sara.txt ├── test_email.txt └── vectorize_text.py ├── tools ├── email_authors.pkl ├── email_preprocess.py ├── email_preprocess.pyc ├── feature_format.py ├── parse_out_email_text.py ├── startup.py └── word_data.pkl └── validation └── validate_poi.py /.gitignore: -------------------------------------------------------------------------------- 1 | tools/feature_format.pyc 2 | tools/parse_out_email_text.pyc 3 | outliers/outlier_cleaner.pyc 4 | enron_mail_20110402.tgz 5 | enron_mail_20110402/ 6 | text_learning/your_word_data.pkl 7 | text_learning/your_email_authors.pkl 8 | my_classifier.pkl 9 | my_dataset.pkl 10 | my_feature_list.pkl 11 | *final_project/ 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Udacity - Intro to Machine Learning 2 | =================================== 3 | 4 | Project/Quiz codes for the udacity course "Intro to Machine Learning". 5 | -------------------------------------------------------------------------------- /choose_your_own/class_vis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import pylab as pl 6 | 7 | def prettyPicture(clf, X_test, y_test): 8 | x_min = 0.0; x_max = 1.0 9 | y_min = 0.0; y_max = 1.0 10 | 11 | # Plot the decision boundary. For that, we will assign a color to each 12 | # point in the mesh [x_min, m_max]x[y_min, y_max]. 
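# (added descriptive note, not in the original file: the mesh below covers [x_min, x_max] x [y_min, y_max]
# in steps of h; every grid point is run through clf.predict(), and pcolormesh colours each cell by its
# predicted class, which is what draws the decision boundary behind the scattered test points.)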
13 | h = .01 # step size in the mesh 14 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 15 | Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) 16 | 17 | # Put the result into a color plot 18 | Z = Z.reshape(xx.shape) 19 | plt.xlim(xx.min(), xx.max()) 20 | plt.ylim(yy.min(), yy.max()) 21 | 22 | plt.pcolormesh(xx, yy, Z, cmap=pl.cm.seismic) 23 | 24 | # Plot also the test points 25 | grade_sig = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==0] 26 | bumpy_sig = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==0] 27 | grade_bkg = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==1] 28 | bumpy_bkg = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==1] 29 | 30 | plt.scatter(grade_sig, bumpy_sig, color = "b", label="fast") 31 | plt.scatter(grade_bkg, bumpy_bkg, color = "r", label="slow") 32 | plt.legend() 33 | plt.xlabel("bumpiness") 34 | plt.ylabel("grade") 35 | 36 | plt.savefig("test.png") 37 | 38 | import base64 39 | import json 40 | import subprocess 41 | 42 | def output_image(name, format, bytes): 43 | image_start = "BEGIN_IMAGE_f9825uweof8jw9fj4r8" 44 | image_end = "END_IMAGE_0238jfw08fjsiufhw8frs" 45 | data = {} 46 | data['name'] = name 47 | data['format'] = format 48 | data['bytes'] = base64.encodestring(bytes) 49 | print image_start+json.dumps(data)+image_end 50 | 51 | -------------------------------------------------------------------------------- /choose_your_own/class_vis.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangheraio/MachineLearning/1de218e3be186e8689f5526b4834d73cd5952e53/choose_your_own/class_vis.pyc -------------------------------------------------------------------------------- /choose_your_own/prep_terrain_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import random 3 | 4 | 5 | def makeTerrainData(n_points=1000): 6 | ############################################################################### 7 | ### make the toy dataset 8 | random.seed(42) 9 | grade = [random.random() for ii in range(0,n_points)] 10 | bumpy = [random.random() for ii in range(0,n_points)] 11 | error = [random.random() for ii in range(0,n_points)] 12 | y = [round(grade[ii]*bumpy[ii]+0.3+0.1*error[ii]) for ii in range(0,n_points)] 13 | for ii in range(0, len(y)): 14 | if grade[ii]>0.8 or bumpy[ii]>0.8: 15 | y[ii] = 1.0 16 | 17 | ### split into train/test sets 18 | X = [[gg, ss] for gg, ss in zip(grade, bumpy)] 19 | split = int(0.75*n_points) 20 | X_train = X[0:split] 21 | X_test = X[split:] 22 | y_train = y[0:split] 23 | y_test = y[split:] 24 | 25 | grade_sig = [X_train[ii][0] for ii in range(0, len(X_train)) if y_train[ii]==0] 26 | bumpy_sig = [X_train[ii][1] for ii in range(0, len(X_train)) if y_train[ii]==0] 27 | grade_bkg = [X_train[ii][0] for ii in range(0, len(X_train)) if y_train[ii]==1] 28 | bumpy_bkg = [X_train[ii][1] for ii in range(0, len(X_train)) if y_train[ii]==1] 29 | 30 | training_data = {"fast":{"grade":grade_sig, "bumpiness":bumpy_sig} 31 | , "slow":{"grade":grade_bkg, "bumpiness":bumpy_bkg}} 32 | 33 | 34 | grade_sig = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==0] 35 | bumpy_sig = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==0] 36 | grade_bkg = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==1] 37 | bumpy_bkg = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==1] 38 | 39 | test_data = 
{"fast":{"grade":grade_sig, "bumpiness":bumpy_sig} 40 | , "slow":{"grade":grade_bkg, "bumpiness":bumpy_bkg}} 41 | 42 | return X_train, y_train, X_test, y_test 43 | 44 | -------------------------------------------------------------------------------- /choose_your_own/prep_terrain_data.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangheraio/MachineLearning/1de218e3be186e8689f5526b4834d73cd5952e53/choose_your_own/prep_terrain_data.pyc -------------------------------------------------------------------------------- /choose_your_own/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangheraio/MachineLearning/1de218e3be186e8689f5526b4834d73cd5952e53/choose_your_own/test.png -------------------------------------------------------------------------------- /choose_your_own/your_algorithm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import matplotlib.pyplot as plt 4 | from prep_terrain_data import makeTerrainData 5 | from class_vis import prettyPicture 6 | from time import time 7 | from sklearn.metrics import accuracy_score 8 | 9 | features_train, labels_train, features_test, labels_test = makeTerrainData() 10 | 11 | ### the training data (features_train, labels_train) have both "fast" and "slow" points mixed 12 | ### in together--separate them so we can give them different colors in the scatterplot, 13 | ### and visually identify them 14 | grade_fast = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii]==0] 15 | bumpy_fast = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii]==0] 16 | grade_slow = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii]==1] 17 | bumpy_slow = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii]==1] 18 | 19 | #### initial visualization 20 | plt.xlim(0.0, 1.0) 21 | plt.ylim(0.0, 1.0) 22 | plt.scatter(bumpy_fast, grade_fast, color = "b", label="fast") 23 | plt.scatter(grade_slow, bumpy_slow, color = "r", label="slow") 24 | plt.legend() 25 | plt.xlabel("bumpiness") 26 | plt.ylabel("grade") 27 | plt.show() 28 | ################################################################################# 29 | 30 | ### your code here! 
name your classifier object clf if you want the 31 | ### visualization code (prettyPicture) to show you the decision boundary 32 | 33 | ''' 34 | # K-Nearest-Neighbors Classifer 35 | from sklearn.neighbors import KNeighborsClassifier 36 | clf = KNeighborsClassifier(n_neighbors=22) 37 | t0 = time() 38 | clf.fit(features_train,labels_train) 39 | print "Training: ", round(time() - t0, 3), "s" 40 | t1 = time() 41 | pred = clf.predict(features_test) 42 | print "Prediction: ", round(time() - t1, 3), "s" 43 | print "Accuracy: ", accuracy_score(pred, labels_test) 44 | # Accuracy: 0.944 45 | # Training time: 0.001s 46 | # Prediction time: 0.002s 47 | ''' 48 | ''' 49 | # AdaBoost Classifier 50 | from sklearn.ensemble import AdaBoostClassifier 51 | clf = AdaBoostClassifier(n_estimators=50) 52 | t0 = time() 53 | clf.fit(features_train, labels_train) 54 | print "Training: ", round(time() - t0, 3), "s" 55 | t1 = time() 56 | pred = clf.predict(features_test) 57 | print "Prediction: ", round(time() - t1, 3), "s" 58 | print accuracy_score(pred, labels_test) 59 | # Accuracy: 0.924 60 | # Training time: 0.072s 61 | # Prediction time: 0.007s 62 | ''' 63 | # Random Forests Classifer 64 | from sklearn.ensemble import RandomForestClassifier 65 | clf = RandomForestClassifier(n_estimators=19) 66 | t0 = time() 67 | clf = clf.fit(features_train, labels_train) 68 | print "Training: ", round(time() - t0, 3), "s" 69 | t1 = time() 70 | pred = clf.predict(features_test) 71 | print "Prediction: ", round(time() - t1, 3), "s" 72 | print "Accuracy: ", accuracy_score(pred, labels_test) 73 | # Accuracy: 0.928 74 | # Training time: 0.022s 75 | # Prediction time: 0.003s 76 | 77 | try: 78 | prettyPicture(clf, features_test, labels_test) 79 | except NameError: 80 | pass -------------------------------------------------------------------------------- /datasets_questions/explore_enron_data.py: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/python 3 | 4 | """ 5 | starter code for exploring the Enron dataset (emails + finances) 6 | loads up the dataset (pickled dict of dicts) 7 | 8 | the dataset has the form 9 | enron_data["LASTNAME FIRSTNAME MIDDLEINITIAL"] = { features_dict } 10 | 11 | {features_dict} is a dictionary of features associated with that person 12 | you should explore features_dict as part of the mini-project, 13 | but here's an example to get you started: 14 | 15 | enron_data["SKILLING JEFFREY K"]["bonus"] = 5600000 16 | 17 | """ 18 | 19 | import pickle 20 | import numpy as np 21 | import pandas as pd 22 | 23 | enron_data = pickle.load(open("../final_project/final_project_dataset.pkl", "r")) 24 | 25 | #data = enron_data['PRENTICE JAMES'] 26 | #data = enron_data['COLWELL WESLEY'] 27 | #data = enron_data['LAY KENNETH L'] 28 | #data = enron_data['FASTOW ANDREW S'] 29 | #data = enron_data['SKILLING JEFFREY K'] 30 | 31 | payments = sum([item["total_payments"]=='NaN' for item in enron_data.values()]) 32 | percent = (float(payments)/len(enron_data)) * 100 33 | print percent 34 | 35 | pois = 0 36 | count = 0 37 | for v in enron_data.values(): 38 | if v["poi"]: 39 | pois += 1 40 | if v["total_payments"] != 'NaN': 41 | count += 1 42 | 43 | print payments 44 | print pois 45 | print count 46 | print float(count)/pois * 100 -------------------------------------------------------------------------------- /decision_tree/dt_author_id.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | this is the code to accompany 
the Lesson 3 (decision tree) mini-project 5 | 6 | use an DT to identify emails from the Enron corpus by their authors 7 | 8 | Sara has label 0 9 | Chris has label 1 10 | 11 | """ 12 | 13 | import sys 14 | from time import time 15 | sys.path.append("../tools/") 16 | from email_preprocess import preprocess 17 | from sklearn import tree 18 | from sklearn.metrics import accuracy_score 19 | ### features_train and features_test are the features for the training 20 | ### and testing datasets, respectively 21 | ### labels_train and labels_test are the corresponding item labels 22 | features_train, features_test, labels_train, labels_test = preprocess() 23 | 24 | 25 | print len(features_train[0]) 26 | ## 3785 features with 10 percentile 27 | ## 379 features with 1 percentile 28 | ######################################################### 29 | ### your code goes here ### 30 | clf = tree.DecisionTreeClassifier(min_samples_split=40) 31 | clf = clf.fit(features_train, labels_train) 32 | prediction = clf.predict(features_test) 33 | print accuracy_score(prediction, labels_test) 34 | ## Accuracy: 0.977815699659 (10 percentile) 35 | ## Accuracy: 0.967007963595 (1 percentile) 36 | ######################################################### 37 | 38 | 39 | -------------------------------------------------------------------------------- /evaluation/evaluate_poi_identifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | 4 | """ 5 | starter code for the evaluation mini-project 6 | start by copying your trained/tested POI identifier from 7 | that you built in the validation mini-project 8 | 9 | the second step toward building your POI identifier! 10 | 11 | start by loading/formatting the data 12 | 13 | """ 14 | 15 | import pickle 16 | import sys 17 | import numpy as np 18 | sys.path.append("../tools/") 19 | from feature_format import featureFormat, targetFeatureSplit 20 | 21 | data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "r") ) 22 | 23 | ### add more features to features_list! 
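### (illustrative sketch, not part of the original starter code: any key of the person-level
### dictionaries can be appended after "poi", e.g.
###     features_list = ["poi", "salary", "bonus", "exercised_stock_options"]
### featureFormat() extracts the values in that order, and targetFeatureSplit() keeps the
### first entry, "poi", as the label.)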
24 | features_list = ["poi", "salary"] 25 | 26 | data = featureFormat(data_dict, features_list) 27 | labels, features = targetFeatureSplit(data) 28 | 29 | 30 | 31 | ### your code goes here 32 | from sklearn import cross_validation 33 | features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels, test_size=0.3, random_state=42) 34 | 35 | # Fit data with sklearn decision trees algorithm 36 | from sklearn import tree 37 | clf = tree.DecisionTreeClassifier() 38 | clf = clf.fit(features_train, labels_train) 39 | 40 | # Get the accuracy 41 | from sklearn.metrics import accuracy_score 42 | prediction = clf.predict(features_test) 43 | print "Prediction: ", prediction 44 | print "Accuracy: ", accuracy_score(prediction, labels_test) 45 | print "Number of POI's: ", np.count_nonzero(prediction) 46 | print "People in Test Set: ", len(prediction) 47 | print "Accuracy if all zeros: ", accuracy_score([0]*29, labels_test) 48 | 49 | from collections import Counter 50 | confusion_matrix = Counter() 51 | 52 | #truth = labels_test 53 | prediction = [0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1] 54 | truth = [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0] 55 | positives = [1] 56 | 57 | binary_truth = [x in positives for x in truth] 58 | binary_prediction = [x in positives for x in prediction] 59 | for t, p in zip(binary_truth, binary_prediction): 60 | confusion_matrix[t,p] += 1 61 | 62 | print confusion_matrix 63 | 64 | from sklearn.metrics import precision_score 65 | print "Precision Score: ", precision_score(prediction, truth) 66 | from sklearn.metrics import recall_score 67 | print "Recall Score: ", recall_score(prediction, truth) 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /feature_selection/email_authors.pkl: -------------------------------------------------------------------------------- 1 | (lp0 2 | I0 3 | aI0 4 | aI0 5 | aI0 6 | aI0 7 | aI0 8 | aI0 9 | aI0 10 | aI0 11 | aI0 12 | aI0 13 | aI0 14 | aI0 15 | aI0 16 | aI0 17 | aI0 18 | aI0 19 | aI0 20 | aI0 21 | aI0 22 | aI0 23 | aI0 24 | aI0 25 | aI0 26 | aI0 27 | aI0 28 | aI0 29 | aI0 30 | aI0 31 | aI0 32 | aI0 33 | aI0 34 | aI0 35 | aI0 36 | aI0 37 | aI0 38 | aI0 39 | aI0 40 | aI0 41 | aI0 42 | aI0 43 | aI0 44 | aI0 45 | aI0 46 | aI0 47 | aI0 48 | aI0 49 | aI0 50 | aI0 51 | aI0 52 | aI0 53 | aI0 54 | aI0 55 | aI0 56 | aI0 57 | aI0 58 | aI0 59 | aI0 60 | aI0 61 | aI0 62 | aI0 63 | aI0 64 | aI0 65 | aI0 66 | aI0 67 | aI0 68 | aI0 69 | aI0 70 | aI0 71 | aI0 72 | aI0 73 | aI0 74 | aI0 75 | aI0 76 | aI0 77 | aI0 78 | aI0 79 | aI0 80 | aI0 81 | aI0 82 | aI0 83 | aI0 84 | aI0 85 | aI0 86 | aI0 87 | aI0 88 | aI0 89 | aI0 90 | aI0 91 | aI0 92 | aI0 93 | aI0 94 | aI0 95 | aI0 96 | aI0 97 | aI0 98 | aI0 99 | aI0 100 | aI0 101 | aI0 102 | aI0 103 | aI0 104 | aI0 105 | aI0 106 | aI0 107 | aI0 108 | aI0 109 | aI0 110 | aI0 111 | aI0 112 | aI0 113 | aI0 114 | aI0 115 | aI0 116 | aI0 117 | aI0 118 | aI0 119 | aI0 120 | aI0 121 | aI0 122 | aI0 123 | aI0 124 | aI0 125 | aI0 126 | aI0 127 | aI0 128 | aI0 129 | aI0 130 | aI0 131 | aI0 132 | aI0 133 | aI0 134 | aI0 135 | aI0 136 | aI0 137 | aI0 138 | aI0 139 | aI0 140 | aI0 141 | aI0 142 | aI0 143 | aI0 144 | aI0 145 | aI0 146 | aI0 147 | aI0 148 | aI0 149 | aI0 150 | aI0 151 | aI0 152 | aI0 153 | aI0 154 | aI0 155 | aI0 156 | aI0 157 | aI0 158 | aI0 159 | aI0 160 | aI0 161 | aI0 162 | aI0 163 | aI0 164 | aI0 165 | aI0 166 | aI0 167 | aI0 168 | aI0 169 | aI0 170 | 
aI0 171 | aI0 172 | aI0 173 | aI0 174 | aI0 175 | aI0 176 | aI0 177 | aI0 178 | aI0 179 | aI0 180 | aI0 181 | aI0 182 | aI0 183 | aI0 184 | aI0 185 | aI0 186 | aI0 187 | aI0 188 | aI0 189 | aI0 190 | aI0 191 | aI0 192 | aI0 193 | aI0 194 | aI0 195 | aI0 196 | aI0 197 | aI0 198 | aI0 199 | aI0 200 | aI0 201 | a. -------------------------------------------------------------------------------- /feature_selection/find_signature.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import pickle 4 | import numpy 5 | numpy.random.seed(42) 6 | 7 | 8 | ### the words (features) and authors (labels), already largely processed 9 | ### these files should have been created from the previous (Lesson 10) mini-project. 10 | words_file = "../text_learning/your_word_data.pkl" 11 | authors_file = "../text_learning/your_email_authors.pkl" 12 | word_data = pickle.load( open(words_file, "r")) 13 | authors = pickle.load( open(authors_file, "r") ) 14 | 15 | 16 | ### test_size is the percentage of events assigned to the test set (remainder go into training) 17 | from sklearn import cross_validation 18 | features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42) 19 | 20 | 21 | from sklearn.feature_extraction.text import TfidfVectorizer 22 | vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, 23 | stop_words='english') 24 | features_train = vectorizer.fit_transform(features_train).toarray() 25 | features_test = vectorizer.transform(features_test).toarray() 26 | 27 | 28 | ### a classic way to overfit is to use a small number 29 | ### of data points and a large number of features 30 | ### train on only 150 events to put ourselves in this regime 31 | features_train = features_train[:150] 32 | labels_train = labels_train[:150] 33 | 34 | 35 | ### your code goes here 36 | from sklearn.tree import DecisionTreeClassifier 37 | clf = DecisionTreeClassifier() 38 | clf.fit(features_train,labels_train) 39 | print "Score: ",clf.score(features_test,labels_test) 40 | ### Accuracy 1.0 on overfit data 41 | importances = clf.feature_importances_ 42 | import numpy as np 43 | indices = np.argsort(importances)[::-1] 44 | print 'Feature Ranking: ' 45 | for i in range(10): 46 | print "{} feature no.{} ({})".format(i+1,indices[i],importances[indices[i]]) 47 | 48 | -------------------------------------------------------------------------------- /k_means/clusters.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangheraio/MachineLearning/1de218e3be186e8689f5526b4834d73cd5952e53/k_means/clusters.pdf -------------------------------------------------------------------------------- /k_means/clusters_before_scaling.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangheraio/MachineLearning/1de218e3be186e8689f5526b4834d73cd5952e53/k_means/clusters_before_scaling.pdf -------------------------------------------------------------------------------- /k_means/k_means_cluster.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | skeleton code for k-means clustering mini-project 5 | 6 | """ 7 | 8 | import pickle 9 | import numpy 10 | import matplotlib.pyplot as plt 11 | import sys 12 | sys.path.append("../tools/") 13 | from feature_format import featureFormat, targetFeatureSplit 14 | 15 | 16 | 
def Draw(pred, features, poi, mark_poi=False, name="image.png", f1_name="feature 1", f2_name="feature 2"): 17 | """ some plotting code designed to help you visualize your clusters """ 18 | 19 | ### plot each cluster with a different color--add more colors for 20 | ### drawing more than 4 clusters 21 | colors = ["b", "c", "k", "m", "g"] 22 | for ii, pp in enumerate(pred): 23 | plt.scatter(features[ii][0], features[ii][1], color = colors[pred[ii]]) 24 | 25 | ### if you like, place red stars over points that are POIs (just for funsies) 26 | if mark_poi: 27 | for ii, pp in enumerate(pred): 28 | if poi[ii]: 29 | plt.scatter(features[ii][0], features[ii][1], color="r", marker="*") 30 | plt.xlabel(f1_name) 31 | plt.ylabel(f2_name) 32 | plt.savefig(name) 33 | plt.show() 34 | 35 | 36 | ### load in the dict of dicts containing all the data on each person in the dataset 37 | data_dict = pickle.load( open("../final_project/final_project_dataset.pkl", "r") ) 38 | ### there's an outlier--remove it! 39 | data_dict.pop("TOTAL", 0) 40 | 41 | ### the input features we want to use 42 | ### can be any key in the person-level dictionary (salary, director_fees, etc.) 43 | feature_1 = "salary" 44 | feature_2 = "exercised_stock_options" 45 | feature_3 = "total_payments" 46 | poi = "poi" 47 | features_list = [poi, feature_1, feature_2, feature_3] 48 | data = featureFormat(data_dict, features_list ) 49 | poi, finance_features = targetFeatureSplit( data ) 50 | 51 | ### in the "clustering with 3 features" part of the mini-project, 52 | ### you'll want to change this line to 53 | ### for f1, f2, _ in finance_features: 54 | for f1, f2, f3 in finance_features: 55 | plt.scatter( f1, f2) 56 | plt.show() 57 | 58 | 59 | from sklearn.cluster import KMeans 60 | features_list = ["poi", feature_1, feature_2, feature_3] 61 | data2 = featureFormat(data_dict, features_list ) 62 | poi, finance_features = targetFeatureSplit( data2 ) 63 | clf = KMeans(n_clusters=3) 64 | pred = clf.fit_predict( finance_features ) 65 | Draw(pred, finance_features, poi, name="clusters_before_scaling.pdf", f1_name=feature_1, f2_name=feature_2) 66 | 67 | 68 | ### cluster here; create predictions of the cluster labels 69 | ### for the data and store them to a list called pred 70 | 71 | try: 72 | Draw(pred, finance_features, poi, mark_poi=False, name="clusters.pdf", f1_name=feature_1, f2_name=feature_2) 73 | except NameError: 74 | print "no predictions object named pred found, no clusters to plot" 75 | -------------------------------------------------------------------------------- /naive_bayes/nb_author_id.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | this is the code to accompany the Lesson 1 (Naive Bayes) mini-project 5 | 6 | use a Naive Bayes Classifier to identify emails by their authors 7 | 8 | authors and labels: 9 | Sara has label 0 10 | Chris has label 1 11 | 12 | """ 13 | 14 | import sys 15 | from time import time 16 | sys.path.append("../tools/") 17 | from email_preprocess import preprocess 18 | from sklearn.naive_bayes import GaussianNB 19 | from sklearn.metrics import accuracy_score 20 | 21 | ### features_train and features_test are the features for the training 22 | ### and testing datasets, respectively 23 | ### labels_train and labels_test are the corresponding item labels 24 | features_train, features_test, labels_train, labels_test = preprocess() 25 | 26 | ######################################################### 27 | ### your code goes here ### 28 | 29 | clf = 
GaussianNB() 30 | clf.fit(features_train, labels_train) 31 | ## training time: 2.848 s 32 | prediction = clf.predict(features_test) 33 | ## prediction time: 0.301 s 34 | 35 | print accuracy_score(prediction, labels_test) 36 | ######################################################### 37 | 38 | 39 | -------------------------------------------------------------------------------- /outliers/enron_outliers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import pickle 4 | import sys 5 | import matplotlib.pyplot 6 | sys.path.append("../tools/") 7 | from feature_format import featureFormat, targetFeatureSplit 8 | 9 | 10 | ### read in data dictionary, convert to numpy array 11 | data_dict = pickle.load( open("../final_project/final_project_dataset.pkl", "r") ) 12 | features = ["salary", "bonus"] 13 | data_dict.pop("TOTAL", 0) 14 | data = featureFormat(data_dict, features) 15 | 16 | ### your code below 17 | for point in data: 18 | salary = point[0] 19 | bonus = point[1] 20 | matplotlib.pyplot.scatter( salary, bonus) 21 | 22 | matplotlib.pyplot.xlabel("salary") 23 | matplotlib.pyplot.ylabel("bonus") 24 | matplotlib.pyplot.show() -------------------------------------------------------------------------------- /outliers/outlier_cleaner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | 4 | def outlierCleaner(predictions, ages, net_worths): 5 | """ 6 | clean away the 10% of points that have the largest 7 | residual errors (different between the prediction 8 | and the actual net worth) 9 | 10 | return a list of tuples named cleaned_data where 11 | each tuple is of the form (age, net_worth, error) 12 | """ 13 | cleaned_data = [] 14 | 15 | ### your code goes here 16 | errors = (net_worths-predictions)**2 17 | cleaned_data = zip(ages,net_worths,errors) 18 | cleaned_data = sorted(cleaned_data,key=lambda x:x[2][0],reverse=True) 19 | limit = int(len(net_worths)*0.1) 20 | 21 | return cleaned_data[limit:] 22 | 23 | -------------------------------------------------------------------------------- /outliers/outlier_removal_regression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import random 4 | import numpy 5 | import matplotlib.pyplot as plt 6 | import pickle 7 | 8 | from outlier_cleaner import outlierCleaner 9 | 10 | 11 | ### load up some practice data with outliers in it 12 | ages = pickle.load( open("practice_outliers_ages.pkl", "r") ) 13 | net_worths = pickle.load( open("practice_outliers_net_worths.pkl", "r") ) 14 | 15 | 16 | 17 | ### ages and net_worths need to be reshaped into 2D numpy arrays 18 | ### second argument of reshape command is a tuple of integers: (n_rows, n_columns) 19 | ### by convention, n_rows is the number of data points 20 | ### and n_columns is the number of features 21 | ages = numpy.reshape( numpy.array(ages), (len(ages), 1)) 22 | net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1)) 23 | from sklearn.cross_validation import train_test_split 24 | ages_train, ages_test, net_worths_train, net_worths_test = train_test_split(ages, net_worths, test_size=0.1, random_state=42) 25 | 26 | ### fill in a regression here! 
Name the regression object reg so that 27 | ### the plotting code below works, and you can see what your regression looks like 28 | from sklearn import linear_model 29 | reg = linear_model.LinearRegression() 30 | reg.fit(ages_train, net_worths_train) 31 | print "Slope ", reg.coef_ 32 | print "Score ", reg.score(ages_test, net_worths_test) 33 | 34 | try: 35 | plt.plot(ages, reg.predict(ages), color="blue") 36 | except NameError: 37 | pass 38 | plt.scatter(ages, net_worths) 39 | plt.show() 40 | 41 | 42 | ### identify and remove the most outlier-y points 43 | cleaned_data = [] 44 | try: 45 | predictions = reg.predict(ages_train) 46 | cleaned_data = outlierCleaner( predictions, ages_train, net_worths_train ) 47 | except NameError: 48 | print "your regression object doesn't exist, or isn't name reg" 49 | print "can't make predictions to use in identifying outliers" 50 | 51 | ### only run this code if cleaned_data is returning data 52 | if len(cleaned_data) > 0: 53 | ages, net_worths, errors = zip(*cleaned_data) 54 | ages = numpy.reshape( numpy.array(ages), (len(ages), 1)) 55 | net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1)) 56 | 57 | ### refit your cleaned data! 58 | try: 59 | reg.fit(ages, net_worths) 60 | plt.plot(ages, reg.predict(ages), color="blue") 61 | except NameError: 62 | print "you don't seem to have regression imported/created," 63 | print " or else your regression object isn't named reg" 64 | print " either way, only draw the scatter plot of the cleaned data" 65 | plt.scatter(ages, net_worths) 66 | plt.xlabel("ages") 67 | plt.ylabel("net worths") 68 | plt.show() 69 | 70 | print "New Slope Without Outliers ", reg.coef_ 71 | print "New Score Without Outliers ", reg.score(ages_test, net_worths_test) 72 | 73 | else: 74 | print "outlierCleaner() is returning an empty list, no refitting to be done" 75 | 76 | -------------------------------------------------------------------------------- /outliers/practice_outliers_ages.pkl: -------------------------------------------------------------------------------- 1 | (lp0 2 | I49 3 | aI21 4 | aI32 5 | aI30 6 | aI53 7 | aI51 8 | aI61 9 | aI23 10 | aI39 11 | aI21 12 | aI30 13 | aI43 14 | aI21 15 | aI29 16 | aI49 17 | aI45 18 | aI30 19 | aI47 20 | aI57 21 | aI20 22 | aI57 23 | aI52 24 | aI35 25 | aI27 26 | aI64 27 | aI35 28 | aI24 29 | aI24 30 | aI58 31 | aI47 32 | aI57 33 | aI53 34 | aI44 35 | aI64 36 | aI37 37 | aI45 38 | aI58 39 | aI48 40 | aI59 41 | aI46 42 | aI52 43 | aI22 44 | aI30 45 | aI33 46 | aI23 47 | aI30 48 | aI24 49 | aI32 50 | aI49 51 | aI36 52 | aI37 53 | aI29 54 | aI32 55 | aI63 56 | aI49 57 | aI48 58 | aI27 59 | aI53 60 | aI27 61 | aI37 62 | aI65 63 | aI49 64 | aI45 65 | aI51 66 | aI58 67 | aI55 68 | aI30 69 | aI21 70 | aI34 71 | aI32 72 | aI29 73 | aI63 74 | aI60 75 | aI34 76 | aI50 77 | aI38 78 | aI62 79 | aI41 80 | aI32 81 | aI31 82 | aI45 83 | aI32 84 | aI46 85 | aI61 86 | aI38 87 | aI30 88 | aI65 89 | aI43 90 | aI24 91 | aI22 92 | aI25 93 | aI48 94 | aI56 95 | aI39 96 | aI22 97 | aI37 98 | aI65 99 | aI44 100 | aI64 101 | aI59 102 | a. 
-------------------------------------------------------------------------------- /outliers/practice_outliers_net_worths.pkl: -------------------------------------------------------------------------------- 1 | (lp0 2 | F316.18428306022463 3 | aF128.4847139765763 4 | aF212.95377076201385 5 | aF217.96059712816052 6 | aF326.56693250553326 7 | aF314.0672608610164 8 | aF412.83425631014785 9 | aF159.09869458305818 10 | aF234.36051228130097 11 | aF142.1012008717193 12 | aF178.23164614375077 13 | aF259.43540492859483 14 | aF136.08924543132068 15 | aF142.98439510684403 16 | aF271.75164334973937 17 | aF270.00424941518054 18 | aF167.24337759331152 19 | aF300.0349466519055 20 | aF338.08951848957577 21 | aF96.75392597329417 22 | aF385.5629753784311 23 | aF320.4844739902693 24 | aF220.10056409375846 25 | aF140.25503627573087 26 | aF389.1123455094963 27 | aF220.96845179419734 28 | aF126.98012845155395 29 | aF157.51396036691344 30 | aF350.4872262016239 31 | aF287.91612500413447 32 | aF344.21586775541203 33 | aF368.29556369017877 34 | aF274.7300555052413 35 | aF378.84578142088196 36 | aF247.70089824206377 37 | aF256.83312700057957 38 | aF366.6772719000951 39 | aF260.8065975224045 40 | aF342.1862790220314 41 | aF291.4372247173825 42 | aF339.7693315999082 43 | aF140.9273656237994 44 | aF185.1870343522352 45 | aF200.22792608821422 46 | aF114.17956019265145 47 | aF173.10311583210583 48 | aF140.78722458080426 49 | aF221.1424445243783 50 | aF313.12236579136925 51 | aF189.73919689274533 52 | aF237.7316793878959 53 | aF173.54835439167368 54 | aF186.46155999388083 55 | aF405.9835257768174 56 | aF326.86999044991904 57 | aF318.62560238232396 58 | aF151.96564953554724 59 | aF325.0657524829757 60 | aF175.37526862807127 61 | aF250.76090254244718 62 | aF396.6665152430942 63 | aF302.53682046672367 64 | aF259.1233005198794 65 | aF294.8258675183866 66 | aF378.75051644788397 67 | aF370.8748005714165 68 | aF186.05979756839332 69 | aF151.32065795784047 70 | aF219.7327205009527 71 | aF187.0976049078975 72 | aF188.4779121101683 73 | aF424.51073132931936 74 | aF374.28347921780096 75 | aF243.79287311628013 76 | aF260.1050979182051 77 | aF253.93805008750448 78 | aF389.24094136476344 79 | aF250.26985299068266 80 | aF201.83521553071006 81 | aF153.99862170798215 82 | aF276.85656224324975 83 | aF207.14225143023492 84 | aF317.0578808948303 85 | aF370.88459563452705 86 | aF221.33012794213624 87 | aF177.46485912830926 88 | aF424.5580423540415 89 | aF275.3250221931937 90 | aF139.40479592465923 91 | aF147.76534866226712 92 | aF133.88384977349668 93 | aF80.9013362873476 94 | aF121.11602327460938 95 | aF134.41369498852399 96 | aF47.70052124577958 97 | aF16.507788679151513 98 | aF34.19027438129125 99 | aF64.06616829393845 100 | aF122.70221488837397 101 | aF129.10958748845152 102 | a. -------------------------------------------------------------------------------- /pca/eigenfaces.py: -------------------------------------------------------------------------------- 1 | """ 2 | =================================================== 3 | Faces recognition example using eigenfaces and SVMs 4 | =================================================== 5 | 6 | The dataset used in this example is a preprocessed excerpt of the 7 | "Labeled Faces in the Wild", aka LFW_: 8 | 9 | http://vis-www.cs.umass.edu/lfw/lfw-funneled.tgz (233MB) 10 | 11 | .. 
_LFW: http://vis-www.cs.umass.edu/lfw/ 12 | 13 | original source: http://scikit-learn.org/stable/auto_examples/applications/face_recognition.html 14 | 15 | """ 16 | 17 | 18 | 19 | print __doc__ 20 | 21 | from time import time 22 | import logging 23 | import pylab as pl 24 | import numpy as np 25 | 26 | from sklearn.cross_validation import train_test_split 27 | from sklearn.datasets import fetch_lfw_people 28 | from sklearn.grid_search import GridSearchCV 29 | from sklearn.metrics import classification_report 30 | from sklearn.metrics import confusion_matrix 31 | from sklearn.decomposition import RandomizedPCA 32 | from sklearn.svm import SVC 33 | 34 | # Display progress logs on stdout 35 | logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') 36 | 37 | 38 | ############################################################################### 39 | # Download the data, if not already on disk and load it as numpy arrays 40 | 41 | lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4) 42 | 43 | # introspect the images arrays to find the shapes (for plotting) 44 | n_samples, h, w = lfw_people.images.shape 45 | np.random.seed(42) 46 | 47 | # fot machine learning we use the 2 data directly (as relative pixel 48 | # positions info is ignored by this model) 49 | X = lfw_people.data 50 | n_features = X.shape[1] 51 | 52 | # the label to predict is the id of the person 53 | y = lfw_people.target 54 | target_names = lfw_people.target_names 55 | n_classes = target_names.shape[0] 56 | 57 | print "Total dataset size:" 58 | print "n_samples: %d" % n_samples 59 | print "n_features: %d" % n_features 60 | print "n_classes: %d" % n_classes 61 | 62 | 63 | ############################################################################### 64 | # Split into a training set and a test set using a stratified k fold 65 | 66 | # split into a training and testing set 67 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42) 68 | 69 | 70 | ############################################################################### 71 | # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled 72 | # dataset): unsupervised feature extraction / dimensionality reduction 73 | 74 | #n_components = 10 # crashes 75 | #n_components = 15 # f1 score average of 0.65 76 | #n_components = 25 # f1 score average of 0.74 77 | #n_components = 50 # f1 score average of 0.81 78 | #n_components = 100 # f1 score average of 0.85 79 | #n_components = 150 # f1 score average of 0.83 80 | n_components = 200 # f1 score average of 0.85 81 | #n_components = 300 # f1 score average of 0.8 82 | 83 | 84 | print "Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0]) 85 | t0 = time() 86 | pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train) 87 | print "done in %0.3fs" % (time() - t0) 88 | 89 | eigenfaces = pca.components_.reshape((n_components, h, w)) 90 | 91 | print "Projecting the input data on the eigenfaces orthonormal basis" 92 | t0 = time() 93 | X_train_pca = pca.transform(X_train) 94 | X_test_pca = pca.transform(X_test) 95 | print "done in %0.3fs" % (time() - t0) 96 | print "Variance", pca.explained_variance_ratio_[:2] 97 | 98 | ############################################################################### 99 | # Train a SVM classification model 100 | 101 | print "Fitting the classifier to the training set" 102 | t0 = time() 103 | param_grid = { 104 | 'C': [1e3, 5e3, 1e4, 5e4, 1e5], 105 | 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], 
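# (added note: C is the SVC error penalty and gamma the RBF kernel width; GridSearchCV below
# fits one model per C/gamma pair in this grid and keeps the combination with the best
# cross-validated score.)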
106 | } 107 | clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid) 108 | clf = clf.fit(X_train_pca, y_train) 109 | print "done in %0.3fs" % (time() - t0) 110 | print "Best estimator found by grid search:" 111 | print clf.best_estimator_ 112 | 113 | 114 | ############################################################################### 115 | # Quantitative evaluation of the model quality on the test set 116 | 117 | print "Predicting the people names on the testing set" 118 | t0 = time() 119 | y_pred = clf.predict(X_test_pca) 120 | print "done in %0.3fs" % (time() - t0) 121 | 122 | print classification_report(y_test, y_pred, target_names=target_names) 123 | print confusion_matrix(y_test, y_pred, labels=range(n_classes)) 124 | 125 | 126 | ############################################################################### 127 | # Qualitative evaluation of the predictions using matplotlib 128 | 129 | def plot_gallery(images, titles, h, w, n_row=3, n_col=4): 130 | """Helper function to plot a gallery of portraits""" 131 | pl.figure(figsize=(1.8 * n_col, 2.4 * n_row)) 132 | pl.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35) 133 | for i in range(n_row * n_col): 134 | pl.subplot(n_row, n_col, i + 1) 135 | pl.imshow(images[i].reshape((h, w)), cmap=pl.cm.gray) 136 | pl.title(titles[i], size=12) 137 | pl.xticks(()) 138 | pl.yticks(()) 139 | 140 | # plot the result of the prediction on a portion of the test set 141 | 142 | def title(y_pred, y_test, target_names, i): 143 | pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1] 144 | true_name = target_names[y_test[i]].rsplit(' ', 1)[-1] 145 | return 'predicted: %s\ntrue: %s' % (pred_name, true_name) 146 | 147 | prediction_titles = [title(y_pred, y_test, target_names, i) 148 | for i in range(y_pred.shape[0])] 149 | 150 | plot_gallery(X_test, prediction_titles, h, w) 151 | 152 | # plot the gallery of the most significative eigenfaces 153 | 154 | eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])] 155 | plot_gallery(eigenfaces, eigenface_titles, h, w) 156 | 157 | pl.show() 158 | -------------------------------------------------------------------------------- /regression/finance_regression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | starter code for the regression mini-project 5 | 6 | loads up/formats a modified version of the dataset 7 | (why modified? 
we've removed some trouble points 8 | that you'll find yourself in the outliers mini-project) 9 | 10 | draws a little scatterplot of the training/testing data 11 | 12 | you fill in the regression code where indicated 13 | 14 | """ 15 | 16 | 17 | import sys 18 | import pickle 19 | sys.path.append("../tools/") 20 | from feature_format import featureFormat, targetFeatureSplit 21 | dictionary = pickle.load( open("../final_project/final_project_dataset_modified.pkl", "r") ) 22 | 23 | ### list the features you want to look at--first item in the 24 | ### list will be the "target" feature 25 | features_list = ["bonus", "long_term_incentive"] 26 | data = featureFormat( dictionary, features_list, remove_any_zeroes=True) 27 | target, features = targetFeatureSplit( data ) 28 | 29 | ### training-testing split needed in regression, just like classification 30 | from sklearn.cross_validation import train_test_split 31 | feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42) 32 | train_color = "b" 33 | test_color = "r" 34 | 35 | ### your regression goes here! 36 | ### please name it reg, so that the plotting code below picks it up and 37 | ### plots it correctly 38 | from sklearn import linear_model 39 | reg = linear_model.LinearRegression() 40 | reg.fit(feature_train, target_train) 41 | 42 | ### draw the scatterplot, with color-coded training and testing points 43 | import matplotlib.pyplot as plt 44 | for feature, target in zip(feature_test, target_test): 45 | plt.scatter( feature, target, color=test_color ) 46 | for feature, target in zip(feature_train, target_train): 47 | plt.scatter( feature, target, color=train_color ) 48 | 49 | ### labels for the legend 50 | plt.scatter(feature_test[0], target_test[0], color=test_color, label="test") 51 | plt.scatter(feature_test[0], target_test[0], color=train_color, label="train") 52 | 53 | print "Slope ", reg.coef_ 54 | print "Intercept ", reg.intercept_ 55 | print "Score ", reg.score(feature_test, target_test) 56 | 57 | 58 | ### draw the regression line, once it's coded 59 | try: 60 | plt.plot( feature_test, reg.predict(feature_test) ) 61 | except NameError: 62 | pass 63 | reg.fit(feature_test, target_test) 64 | print "Slope ", reg.coef_ 65 | plt.plot(feature_train, reg.predict(feature_train), color="r") 66 | plt.xlabel(features_list[1]) 67 | plt.ylabel(features_list[0]) 68 | plt.legend() 69 | plt.show() 70 | -------------------------------------------------------------------------------- /svm/svm_author_id.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | this is the code to accompany the Lesson 2 (SVM) mini-project 5 | 6 | use an SVM to identify emails from the Enron corpus by their authors 7 | 8 | Sara has label 0 9 | Chris has label 1 10 | 11 | """ 12 | 13 | import sys 14 | from time import time 15 | sys.path.append("../tools/") 16 | from email_preprocess import preprocess 17 | from sklearn import svm 18 | from sklearn.metrics import accuracy_score 19 | ### features_train and features_test are the features for the training 20 | ### and testing datasets, respectively 21 | ### labels_train and labels_test are the corresponding item labels 22 | features_train, features_test, labels_train, labels_test = preprocess() 23 | #features_train = features_train[:len(features_train)/100] 24 | #labels_train = labels_train[:len(labels_train)/100] 25 | 26 | ######################################################### 27 | ### your code 
goes here ### 28 | clf = svm.SVC(kernel="rbf", C=10500) 29 | t0 = time() 30 | clf.fit(features_train, labels_train) 31 | print "Training time:", round(time()-t0, 3), "s" 32 | # Training Time: 213 seconds 33 | # Training Time w/ subset of data: 0.117 seconds 34 | # Training Time w/ rbf kernel: 0.164 seconds 35 | # Training Time w/ rbf kernel & C=10: 0.127 seconds 36 | # Training Time w/ rbf kernel & C=100: 0.138 seconds 37 | # Training Time w/ rbf kernel & C=1000: 0.127 seconds 38 | # Training Time w/ rbf kernel & C=10000: 0.129 seconds 39 | # Training Time (full) rbf kernel & C=10000: 137.442 seconds 40 | t1 = time() 41 | prediction = clf.predict(features_test) 42 | print "Prediction time:", round(time()-t1, 3), "s" 43 | # Prediction Time: 23 second 44 | # Prediction Time w/ subset of data: 0.88 seconds 45 | # Prediction Time w/ rbf kernel: 1.475 seconds 46 | # Prediction Time w/ rbf kernel & C=10: 1.465 seconds 47 | # Prediction Time w/ rbf kernel & C=100: 1.338 seconds 48 | # Prediction Time w/ rbf kernel & C=1000: 1.279 seconds 49 | # Prediction Time w/ rbf kernel & C=10000: 1.159 seconds 50 | # Prediction Time (full) rbf kernel & C=10000: 14.656 seconds 51 | print accuracy_score(prediction, labels_test) 52 | # Accuracy: 0.984072810011 53 | # Accuracy w/ subset: 0.884527872582 54 | # Accuracy w/ rbf kernel: 0.616040955631 55 | # Accuracy w/ rbf kernel & C=10 or 100: 0.616040955631 56 | # Accuracy w/ rbf kernel & C=1000: 0.821387940842 57 | # Accuracy w/ rbf kernel & C=10000: 0.892491467577 58 | # Accuracy (full) rbf kernel & C=10000: 0.990898748578 59 | 60 | # Prediction for feature_test[10][26][50] = [1][0][1] 61 | # print clf.predict(features_test[50]) 62 | 63 | chris = [] 64 | # Get number of predicted emails written by Chris. Ans: 877 65 | for i in prediction: 66 | if i == 1: 67 | chris.append(i) 68 | 69 | print len(chris) 70 | ######################################################### 71 | 72 | 73 | -------------------------------------------------------------------------------- /text_learning/test_email.txt: -------------------------------------------------------------------------------- 1 | To: Katie_and_Sebastians_Excellent_Students@udacity.com 2 | From: katie@udacity.com 3 | X-FileName: 4 | 5 | Hi Everyone! If you can read this message, you're properly using parseOutText. Please proceed to the next part of the project! 
6 | -------------------------------------------------------------------------------- /text_learning/vectorize_text.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import pickle 5 | import re 6 | import sys 7 | 8 | sys.path.append( "../tools/" ) 9 | from parse_out_email_text import parseOutText 10 | 11 | """ 12 | starter code to process the emails from Sara and Chris to extract 13 | the features and get the documents ready for classification 14 | 15 | the list of all the emails from Sara are in the from_sara list 16 | likewise for emails from Chris (from_chris) 17 | 18 | the actual documents are in the Enron email dataset, which 19 | you downloaded/unpacked in Part 0 of the first mini-project 20 | 21 | the data is stored in lists and packed away in pickle files at the end 22 | 23 | """ 24 | ### Parse out test email 25 | testEmail = open("test_email.txt", "r") 26 | print parseOutText(testEmail) 27 | 28 | 29 | from_sara = open("from_sara.txt", "r") 30 | from_chris = open("from_chris.txt", "r") 31 | 32 | from_data = [] 33 | word_data = [] 34 | 35 | ### temp_counter is a way to speed up the development--there are 36 | ### thousands of emails from Sara and Chris, so running over all of them 37 | ### can take a long time 38 | ### temp_counter helps you only look at the first 200 emails in the list 39 | temp_counter = 0 40 | 41 | 42 | for name, from_person in [("sara", from_sara), ("chris", from_chris)]: 43 | for path in from_person: 44 | ### only look at first 200 emails when developing 45 | ### once everything is working, remove this line to run over full dataset 46 | temp_counter += 1 47 | if temp_counter < 200: 48 | path = os.path.join('..', path[:-1]) 49 | print path 50 | email = open(path, "r") 51 | 52 | ### use parseOutText to extract the text from the opened email 53 | ### use str.replace() to remove any instances of the words 54 | ### ["sara", "shackleton", "chris", "germani"] 55 | words = parseOutText(email) 56 | remove = ["sara", "shackleton", "chris", "germani"] 57 | #list_rep = ["sara", "shackleton", "chris", "germani","sshacklensf","cgermannsf"] 58 | for word in remove: 59 | words = words.replace(word,"") 60 | ### append the text to word_data 61 | word_data.append(words) 62 | ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris 63 | from_data.append(0 if name == "sara" else 1) 64 | 65 | email.close() 66 | 67 | print "Entry #152: ", word_data[152] 68 | 69 | print "emails processed" 70 | from_sara.close() 71 | from_chris.close() 72 | 73 | pickle.dump( word_data, open("your_word_data.pkl", "w") ) 74 | pickle.dump( from_data, open("your_email_authors.pkl", "w") ) 75 | 76 | 77 | ### in Part 4, do TfIdf vectorization here 78 | from nltk.corpus import stopwords 79 | sw = stopwords.words("english") 80 | 81 | from sklearn.feature_extraction.text import TfidfVectorizer 82 | vectorizer = TfidfVectorizer(stop_words="english",lowercase=True) 83 | vectorizer.fit_transform(word_data) 84 | # bag_words = vectorizer.transform(word_data) 85 | 86 | # Get how many unique words there are in the emails 87 | print len(vectorizer.get_feature_names()) 88 | 89 | # Word number 34597 90 | print vectorizer.get_feature_names()[34597] -------------------------------------------------------------------------------- /tools/email_preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import pickle 4 | import numpy 5 | 6 | from sklearn import 
cross_validation 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from sklearn.feature_selection import SelectPercentile, f_classif 9 | 10 | 11 | 12 | def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"): 13 | """ 14 | this function takes a pre-made list of email texts (by default word_data.pkl) 15 | and the corresponding authors (by default email_authors.pkl) and performs 16 | a number of preprocessing steps: 17 | -- splits into training/testing sets (10% testing) 18 | -- vectorizes into tfidf matrix 19 | -- selects/keeps most helpful features 20 | 21 | after this, the feaures and labels are put into numpy arrays, which play nice with sklearn functions 22 | 23 | 4 objects are returned: 24 | -- training/testing features 25 | -- training/testing labels 26 | 27 | """ 28 | 29 | ### the words (features) and authors (labels), already largely preprocessed 30 | ### this preprocessing will be repeated in the text learning mini-project 31 | word_data = pickle.load( open(words_file, "r")) 32 | authors = pickle.load( open(authors_file, "r") ) 33 | 34 | ### test_size is the percentage of events assigned to the test set (remainder go into training) 35 | features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42) 36 | 37 | 38 | 39 | ### text vectorization--go from strings to lists of numbers 40 | vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, 41 | stop_words='english') 42 | features_train_transformed = vectorizer.fit_transform(features_train) 43 | features_test_transformed = vectorizer.transform(features_test) 44 | 45 | 46 | 47 | ### feature selection, because text is super high dimensional and 48 | ### can be really computationally chewy as a result 49 | selector = SelectPercentile(f_classif, percentile=1) 50 | selector.fit(features_train_transformed, labels_train) 51 | features_train_transformed = selector.transform(features_train_transformed).toarray() 52 | features_test_transformed = selector.transform(features_test_transformed).toarray() 53 | 54 | ### info on the data 55 | print "no. of Chris training emails:", sum(labels_train) 56 | print "no. of Sara training emails:", len(labels_train)-sum(labels_train) 57 | 58 | 59 | return features_train_transformed, features_test_transformed, labels_train, labels_test 60 | -------------------------------------------------------------------------------- /tools/email_preprocess.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangheraio/MachineLearning/1de218e3be186e8689f5526b4834d73cd5952e53/tools/email_preprocess.pyc -------------------------------------------------------------------------------- /tools/feature_format.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | A general tool for converting data from the 5 | dictionary format to an (n x k) python list that's 6 | ready for training an sklearn algorithm 7 | 8 | n--no. of key-value pairs in dictonary 9 | k--no. 
of features being extracted 10 | 11 | dictionary keys are names of persons in dataset 12 | dictionary values are dictionaries, where each 13 | key-value pair in the dict is the name 14 | of a feature, and its value for that person 15 | 16 | In addition to converting a dictionary to a numpy 17 | array, you may want to separate the labels from the 18 | features--this is what targetFeatureSplit is for 19 | 20 | so, if you want to have the poi label as the target, 21 | and the features you want to use are the person's 22 | salary and bonus, here's what you would do: 23 | 24 | feature_list = ["poi", "salary", "bonus"] 25 | data_array = featureFormat( data_dictionary, feature_list ) 26 | label, features = targetFeatureSplit(data_array) 27 | 28 | the line above (targetFeatureSplit) assumes that the 29 | label is the _first_ item in feature_list--very important 30 | that poi is listed first! 31 | """ 32 | 33 | 34 | import numpy as np 35 | 36 | def featureFormat( dictionary, features, remove_NaN=True, remove_all_zeroes=True, remove_any_zeroes=False, sort_keys = False): 37 | """ convert dictionary to numpy array of features 38 | remove_NaN=True will convert "NaN" string to 0.0 39 | remove_all_zeroes=True will omit any data points for which 40 | all the features you seek are 0.0 41 | remove_any_zeroes=True will omit any data points for which 42 | any of the features you seek are 0.0 43 | """ 44 | 45 | 46 | return_list = [] 47 | 48 | if sort_keys: 49 | keys = sorted(dictionary.keys()) 50 | else: 51 | keys = dictionary.keys() 52 | 53 | for key in keys: 54 | tmp_list = [] 55 | append = False 56 | for feature in features: 57 | try: 58 | dictionary[key][feature] 59 | except KeyError: 60 | print "error: key ", feature, " not present" 61 | return 62 | value = dictionary[key][feature] 63 | if value=="NaN" and remove_NaN: 64 | value = 0 65 | tmp_list.append( float(value) ) 66 | 67 | ### if all features are zero and you want to remove 68 | ### data points that are all zero, do that here 69 | if remove_all_zeroes: 70 | all_zeroes = True 71 | for item in tmp_list: 72 | if item != 0 and item != "NaN": 73 | append = True 74 | 75 | ### if any features for a given data point are zero 76 | ### and you want to remove data points with any zeroes, 77 | ### handle that here 78 | if remove_any_zeroes: 79 | any_zeroes = False 80 | if 0 in tmp_list or "NaN" in tmp_list: 81 | append = False 82 | if append: 83 | return_list.append( np.array(tmp_list) ) 84 | 85 | 86 | return np.array(return_list) 87 | 88 | 89 | def targetFeatureSplit( data ): 90 | """ 91 | given a numpy array like the one returned from 92 | featureFormat, separate out the first feature 93 | and put it into its own list (this should be the 94 | quantity you want to predict) 95 | 96 | return targets and features as separate lists 97 | 98 | (sklearn can generally handle both lists and numpy arrays as 99 | input formats when training/predicting) 100 | """ 101 | 102 | target = [] 103 | features = [] 104 | for item in data: 105 | target.append( item[0] ) 106 | features.append( item[1:] ) 107 | 108 | return target, features 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /tools/parse_out_email_text.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from nltk.stem.snowball import SnowballStemmer 4 | import string 5 | 6 | def parseOutText(f): 7 | """ given an opened email file f, parse out all text below the 8 | metadata block at the top 9 | 
(in Part 2, you will also add stemming capabilities) 10 | and return a string that contains all the words 11 | in the email (space-separated) 12 | 13 | example use case: 14 | f = open("email_file_name.txt", "r") 15 | text = parseOutText(f) 16 | 17 | """ 18 | 19 | 20 | f.seek(0) ### go back to beginning of file (annoying) 21 | all_text = f.read() 22 | 23 | ### split off metadata 24 | content = all_text.split("X-FileName:") 25 | words = "" 26 | if len(content) > 1: 27 | ### remove punctuation 28 | text_string = content[1].translate(string.maketrans("", ""), string.punctuation) 29 | 30 | ### project part 2: comment out the line below 31 | ## words = text_string 32 | 33 | ### split the text string into individual words, stem each word, 34 | ### and append the stemmed word to words (make sure there's a single 35 | ### space between each stemmed word) 36 | from nltk.stem.snowball import SnowballStemmer 37 | words = [] 38 | for word in text_string.split(): 39 | stemmer = SnowballStemmer("english") 40 | stemWord = stemmer.stem(word) 41 | words.append(stemWord) 42 | 43 | return string.join(words) 44 | 45 | 46 | 47 | def main(): 48 | ff = open("../text_learning/test_email.txt", "r") 49 | text = parseOutText(ff) 50 | print text 51 | 52 | 53 | 54 | if __name__ == '__main__': 55 | main() 56 | 57 | -------------------------------------------------------------------------------- /tools/startup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | print 4 | print "checking for nltk" 5 | try: 6 | import nltk 7 | except ImportError: 8 | print "you should install nltk before continuing" 9 | 10 | print "checking for numpy" 11 | try: 12 | import numpy 13 | except ImportError: 14 | print "you should install numpy before continuing" 15 | 16 | print "checking for sklearn" 17 | try: 18 | import sklearn 19 | except: 20 | print "you should install sklearn before continuing" 21 | 22 | print 23 | print "downloading the Enron dataset (this may take a while)" 24 | print "to check on progress, you can cd up one level, then execute " 25 | print "Enron dataset should be last item on the list, along with its current size" 26 | print "download will complete at about 423 MB" 27 | import urllib 28 | url = "https://www.cs.cmu.edu/~./enron/enron_mail_20110402.tgz" 29 | urllib.urlretrieve(url, filename="../enron_mail_20110402.tgz") 30 | print "download complete!" 31 | 32 | 33 | print 34 | print "unzipping Enron dataset (this may take a while)" 35 | import tarfile 36 | import os 37 | os.chdir("..") 38 | tfile = tarfile.open("enron_mail_20110402.tgz", "r:gz") 39 | tfile.extractall(".") 40 | 41 | print "you're ready to go!" 42 | -------------------------------------------------------------------------------- /validation/validate_poi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | 4 | """ 5 | starter code for the validation mini-project 6 | the first step toward building your POI identifier! 7 | 8 | start by loading/formatting the data 9 | 10 | after that, it's not our code anymore--it's yours! 11 | """ 12 | 13 | import pickle 14 | import sys 15 | sys.path.append("../tools/") 16 | from feature_format import featureFormat, targetFeatureSplit 17 | 18 | data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "r") ) 19 | 20 | ### first element is our labels, any added elements are predictor 21 | ### features. 
Keep this the same for the mini-project, but you'll 22 | ### have a different feature list when you do the final project. 23 | features_list = ["poi", "salary"] 24 | 25 | data = featureFormat(data_dict, features_list) 26 | labels, features = targetFeatureSplit(data) 27 | 28 | ### it's all yours from here forward! 29 | 30 | # Split data into training and testing 31 | from sklearn import cross_validation 32 | features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels, test_size=0.3, random_state=42) 33 | 34 | # Fit the data with the sklearn decision tree algorithm 35 | from sklearn import tree 36 | clf = tree.DecisionTreeClassifier() 37 | clf = clf.fit(features_train, labels_train) 38 | 39 | # Get the accuracy 40 | from sklearn.metrics import accuracy_score 41 | prediction = clf.predict(features_test) 42 | print accuracy_score(prediction, labels_test) 43 | # Accuracy when training and testing on the same data (overfit): 0.989583333333 --------------------------------------------------------------------------------
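# Appended note (not part of the original repository): the scripts above target a scikit-learn
# release that still shipped sklearn.cross_validation, sklearn.grid_search and RandomizedPCA,
# and they use the Python 2 print statement. A minimal sketch of the train/test split from
# validate_poi.py on a current scikit-learn install, assuming the `features` and `labels`
# lists built by featureFormat/targetFeatureSplit as above, is:
#
#     from sklearn.model_selection import train_test_split   # replaces sklearn.cross_validation
#     features_train, features_test, labels_train, labels_test = train_test_split(
#         features, labels, test_size=0.3, random_state=42)
#
# sklearn.grid_search.GridSearchCV (used in pca/eigenfaces.py) likewise lives in
# sklearn.model_selection on current releases; the fitting and accuracy_score() calls
# themselves are unchanged.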