├── .gitignore ├── README.md ├── choose_your_own ├── class_vis.py ├── class_vis.pyc ├── prep_terrain_data.py ├── prep_terrain_data.pyc ├── test.png └── your_algorithm.py ├── datasets_questions └── explore_enron_data.py ├── decision_tree └── dt_author_id.py ├── evaluation └── evaluate_poi_identifier.py ├── feature_selection ├── email_authors.pkl ├── email_authors_overfit.pkl ├── find_signature.py ├── word_data.pkl └── word_data_overfit.pkl ├── k_means ├── clusters.pdf ├── clusters_before_scaling.pdf └── k_means_cluster.py ├── naive_bayes └── nb_author_id.py ├── outliers ├── enron_outliers.py ├── outlier_cleaner.py ├── outlier_removal_regression.py ├── practice_outliers_ages.pkl └── practice_outliers_net_worths.pkl ├── pca └── eigenfaces.py ├── regression └── finance_regression.py ├── svm └── svm_author_id.py ├── text_learning ├── from_chris.txt ├── from_sara.txt ├── test_email.txt └── vectorize_text.py ├── tools ├── email_authors.pkl ├── email_preprocess.py ├── email_preprocess.pyc ├── feature_format.py ├── parse_out_email_text.py ├── startup.py └── word_data.pkl └── validation └── validate_poi.py /.gitignore: -------------------------------------------------------------------------------- 1 | tools/feature_format.pyc 2 | tools/parse_out_email_text.pyc 3 | outliers/outlier_cleaner.pyc 4 | enron_mail_20110402.tgz 5 | enron_mail_20110402/ 6 | text_learning/your_word_data.pkl 7 | text_learning/your_email_authors.pkl 8 | my_classifier.pkl 9 | my_dataset.pkl 10 | my_feature_list.pkl 11 | *final_project/ 12 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Udacity - Intro to Machine Learning 2 | =================================== 3 | 4 | Project/Quiz codes for the udacity course "Intro to Machine Learning". 5 | -------------------------------------------------------------------------------- /choose_your_own/class_vis.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import pylab as pl 6 | 7 | def prettyPicture(clf, X_test, y_test): 8 | x_min = 0.0; x_max = 1.0 9 | y_min = 0.0; y_max = 1.0 10 | 11 | # Plot the decision boundary. For that, we will assign a color to each 12 | # point in the mesh [x_min, m_max]x[y_min, y_max]. 
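# (added descriptive note, not in the original file: the mesh below covers [x_min, x_max] x [y_min, y_max]
# in steps of h; every grid point is run through clf.predict(), and pcolormesh colours each cell by its
# predicted class, which is what draws the decision boundary behind the scattered test points.)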
13 | h = .01 # step size in the mesh 14 | xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) 15 | Z = clf.predict(np.c_[xx.ravel(), yy.ravel()]) 16 | 17 | # Put the result into a color plot 18 | Z = Z.reshape(xx.shape) 19 | plt.xlim(xx.min(), xx.max()) 20 | plt.ylim(yy.min(), yy.max()) 21 | 22 | plt.pcolormesh(xx, yy, Z, cmap=pl.cm.seismic) 23 | 24 | # Plot also the test points 25 | grade_sig = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==0] 26 | bumpy_sig = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==0] 27 | grade_bkg = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==1] 28 | bumpy_bkg = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==1] 29 | 30 | plt.scatter(grade_sig, bumpy_sig, color = "b", label="fast") 31 | plt.scatter(grade_bkg, bumpy_bkg, color = "r", label="slow") 32 | plt.legend() 33 | plt.xlabel("bumpiness") 34 | plt.ylabel("grade") 35 | 36 | plt.savefig("test.png") 37 | 38 | import base64 39 | import json 40 | import subprocess 41 | 42 | def output_image(name, format, bytes): 43 | image_start = "BEGIN_IMAGE_f9825uweof8jw9fj4r8" 44 | image_end = "END_IMAGE_0238jfw08fjsiufhw8frs" 45 | data = {} 46 | data['name'] = name 47 | data['format'] = format 48 | data['bytes'] = base64.encodestring(bytes) 49 | print image_start+json.dumps(data)+image_end 50 | 51 | -------------------------------------------------------------------------------- /choose_your_own/class_vis.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangheraio/MachineLearning/1de218e3be186e8689f5526b4834d73cd5952e53/choose_your_own/class_vis.pyc -------------------------------------------------------------------------------- /choose_your_own/prep_terrain_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import random 3 | 4 | 5 | def makeTerrainData(n_points=1000): 6 | ############################################################################### 7 | ### make the toy dataset 8 | random.seed(42) 9 | grade = [random.random() for ii in range(0,n_points)] 10 | bumpy = [random.random() for ii in range(0,n_points)] 11 | error = [random.random() for ii in range(0,n_points)] 12 | y = [round(grade[ii]*bumpy[ii]+0.3+0.1*error[ii]) for ii in range(0,n_points)] 13 | for ii in range(0, len(y)): 14 | if grade[ii]>0.8 or bumpy[ii]>0.8: 15 | y[ii] = 1.0 16 | 17 | ### split into train/test sets 18 | X = [[gg, ss] for gg, ss in zip(grade, bumpy)] 19 | split = int(0.75*n_points) 20 | X_train = X[0:split] 21 | X_test = X[split:] 22 | y_train = y[0:split] 23 | y_test = y[split:] 24 | 25 | grade_sig = [X_train[ii][0] for ii in range(0, len(X_train)) if y_train[ii]==0] 26 | bumpy_sig = [X_train[ii][1] for ii in range(0, len(X_train)) if y_train[ii]==0] 27 | grade_bkg = [X_train[ii][0] for ii in range(0, len(X_train)) if y_train[ii]==1] 28 | bumpy_bkg = [X_train[ii][1] for ii in range(0, len(X_train)) if y_train[ii]==1] 29 | 30 | training_data = {"fast":{"grade":grade_sig, "bumpiness":bumpy_sig} 31 | , "slow":{"grade":grade_bkg, "bumpiness":bumpy_bkg}} 32 | 33 | 34 | grade_sig = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==0] 35 | bumpy_sig = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==0] 36 | grade_bkg = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==1] 37 | bumpy_bkg = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==1] 38 | 39 | test_data = 
{"fast":{"grade":grade_sig, "bumpiness":bumpy_sig} 40 | , "slow":{"grade":grade_bkg, "bumpiness":bumpy_bkg}} 41 | 42 | return X_train, y_train, X_test, y_test 43 | 44 | -------------------------------------------------------------------------------- /choose_your_own/prep_terrain_data.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangheraio/MachineLearning/1de218e3be186e8689f5526b4834d73cd5952e53/choose_your_own/prep_terrain_data.pyc -------------------------------------------------------------------------------- /choose_your_own/test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangheraio/MachineLearning/1de218e3be186e8689f5526b4834d73cd5952e53/choose_your_own/test.png -------------------------------------------------------------------------------- /choose_your_own/your_algorithm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import matplotlib.pyplot as plt 4 | from prep_terrain_data import makeTerrainData 5 | from class_vis import prettyPicture 6 | from time import time 7 | from sklearn.metrics import accuracy_score 8 | 9 | features_train, labels_train, features_test, labels_test = makeTerrainData() 10 | 11 | ### the training data (features_train, labels_train) have both "fast" and "slow" points mixed 12 | ### in together--separate them so we can give them different colors in the scatterplot, 13 | ### and visually identify them 14 | grade_fast = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii]==0] 15 | bumpy_fast = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii]==0] 16 | grade_slow = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii]==1] 17 | bumpy_slow = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii]==1] 18 | 19 | #### initial visualization 20 | plt.xlim(0.0, 1.0) 21 | plt.ylim(0.0, 1.0) 22 | plt.scatter(bumpy_fast, grade_fast, color = "b", label="fast") 23 | plt.scatter(grade_slow, bumpy_slow, color = "r", label="slow") 24 | plt.legend() 25 | plt.xlabel("bumpiness") 26 | plt.ylabel("grade") 27 | plt.show() 28 | ################################################################################# 29 | 30 | ### your code here! 
name your classifier object clf if you want the 31 | ### visualization code (prettyPicture) to show you the decision boundary 32 | 33 | ''' 34 | # K-Nearest-Neighbors Classifer 35 | from sklearn.neighbors import KNeighborsClassifier 36 | clf = KNeighborsClassifier(n_neighbors=22) 37 | t0 = time() 38 | clf.fit(features_train,labels_train) 39 | print "Training: ", round(time() - t0, 3), "s" 40 | t1 = time() 41 | pred = clf.predict(features_test) 42 | print "Prediction: ", round(time() - t1, 3), "s" 43 | print "Accuracy: ", accuracy_score(pred, labels_test) 44 | # Accuracy: 0.944 45 | # Training time: 0.001s 46 | # Prediction time: 0.002s 47 | ''' 48 | ''' 49 | # AdaBoost Classifier 50 | from sklearn.ensemble import AdaBoostClassifier 51 | clf = AdaBoostClassifier(n_estimators=50) 52 | t0 = time() 53 | clf.fit(features_train, labels_train) 54 | print "Training: ", round(time() - t0, 3), "s" 55 | t1 = time() 56 | pred = clf.predict(features_test) 57 | print "Prediction: ", round(time() - t1, 3), "s" 58 | print accuracy_score(pred, labels_test) 59 | # Accuracy: 0.924 60 | # Training time: 0.072s 61 | # Prediction time: 0.007s 62 | ''' 63 | # Random Forests Classifer 64 | from sklearn.ensemble import RandomForestClassifier 65 | clf = RandomForestClassifier(n_estimators=19) 66 | t0 = time() 67 | clf = clf.fit(features_train, labels_train) 68 | print "Training: ", round(time() - t0, 3), "s" 69 | t1 = time() 70 | pred = clf.predict(features_test) 71 | print "Prediction: ", round(time() - t1, 3), "s" 72 | print "Accuracy: ", accuracy_score(pred, labels_test) 73 | # Accuracy: 0.928 74 | # Training time: 0.022s 75 | # Prediction time: 0.003s 76 | 77 | try: 78 | prettyPicture(clf, features_test, labels_test) 79 | except NameError: 80 | pass -------------------------------------------------------------------------------- /datasets_questions/explore_enron_data.py: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/python 3 | 4 | """ 5 | starter code for exploring the Enron dataset (emails + finances) 6 | loads up the dataset (pickled dict of dicts) 7 | 8 | the dataset has the form 9 | enron_data["LASTNAME FIRSTNAME MIDDLEINITIAL"] = { features_dict } 10 | 11 | {features_dict} is a dictionary of features associated with that person 12 | you should explore features_dict as part of the mini-project, 13 | but here's an example to get you started: 14 | 15 | enron_data["SKILLING JEFFREY K"]["bonus"] = 5600000 16 | 17 | """ 18 | 19 | import pickle 20 | import numpy as np 21 | import pandas as pd 22 | 23 | enron_data = pickle.load(open("../final_project/final_project_dataset.pkl", "r")) 24 | 25 | #data = enron_data['PRENTICE JAMES'] 26 | #data = enron_data['COLWELL WESLEY'] 27 | #data = enron_data['LAY KENNETH L'] 28 | #data = enron_data['FASTOW ANDREW S'] 29 | #data = enron_data['SKILLING JEFFREY K'] 30 | 31 | payments = sum([item["total_payments"]=='NaN' for item in enron_data.values()]) 32 | percent = (float(payments)/len(enron_data)) * 100 33 | print percent 34 | 35 | pois = 0 36 | count = 0 37 | for v in enron_data.values(): 38 | if v["poi"]: 39 | pois += 1 40 | if v["total_payments"] != 'NaN': 41 | count += 1 42 | 43 | print payments 44 | print pois 45 | print count 46 | print float(count)/pois * 100 -------------------------------------------------------------------------------- /decision_tree/dt_author_id.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | this is the code to accompany 
the Lesson 3 (decision tree) mini-project 5 | 6 | use an DT to identify emails from the Enron corpus by their authors 7 | 8 | Sara has label 0 9 | Chris has label 1 10 | 11 | """ 12 | 13 | import sys 14 | from time import time 15 | sys.path.append("../tools/") 16 | from email_preprocess import preprocess 17 | from sklearn import tree 18 | from sklearn.metrics import accuracy_score 19 | ### features_train and features_test are the features for the training 20 | ### and testing datasets, respectively 21 | ### labels_train and labels_test are the corresponding item labels 22 | features_train, features_test, labels_train, labels_test = preprocess() 23 | 24 | 25 | print len(features_train[0]) 26 | ## 3785 features with 10 percentile 27 | ## 379 features with 1 percentile 28 | ######################################################### 29 | ### your code goes here ### 30 | clf = tree.DecisionTreeClassifier(min_samples_split=40) 31 | clf = clf.fit(features_train, labels_train) 32 | prediction = clf.predict(features_test) 33 | print accuracy_score(prediction, labels_test) 34 | ## Accuracy: 0.977815699659 (10 percentile) 35 | ## Accuracy: 0.967007963595 (1 percentile) 36 | ######################################################### 37 | 38 | 39 | -------------------------------------------------------------------------------- /evaluation/evaluate_poi_identifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | 4 | """ 5 | starter code for the evaluation mini-project 6 | start by copying your trained/tested POI identifier from 7 | that you built in the validation mini-project 8 | 9 | the second step toward building your POI identifier! 10 | 11 | start by loading/formatting the data 12 | 13 | """ 14 | 15 | import pickle 16 | import sys 17 | import numpy as np 18 | sys.path.append("../tools/") 19 | from feature_format import featureFormat, targetFeatureSplit 20 | 21 | data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "r") ) 22 | 23 | ### add more features to features_list! 
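### (illustrative sketch, not part of the original starter code: any key of the person-level
### dictionaries can be appended after "poi", e.g.
###     features_list = ["poi", "salary", "bonus", "exercised_stock_options"]
### featureFormat() extracts the values in that order, and targetFeatureSplit() keeps the
### first entry, "poi", as the label.)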
24 | features_list = ["poi", "salary"] 25 | 26 | data = featureFormat(data_dict, features_list) 27 | labels, features = targetFeatureSplit(data) 28 | 29 | 30 | 31 | ### your code goes here 32 | from sklearn import cross_validation 33 | features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels, test_size=0.3, random_state=42) 34 | 35 | # Fit data with sklearn decision trees algorithm 36 | from sklearn import tree 37 | clf = tree.DecisionTreeClassifier() 38 | clf = clf.fit(features_train, labels_train) 39 | 40 | # Get the accuracy 41 | from sklearn.metrics import accuracy_score 42 | prediction = clf.predict(features_test) 43 | print "Prediction: ", prediction 44 | print "Accuracy: ", accuracy_score(prediction, labels_test) 45 | print "Number of POI's: ", np.count_nonzero(prediction) 46 | print "People in Test Set: ", len(prediction) 47 | print "Accuracy if all zeros: ", accuracy_score([0]*29, labels_test) 48 | 49 | from collections import Counter 50 | confusion_matrix = Counter() 51 | 52 | #truth = labels_test 53 | prediction = [0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1] 54 | truth = [0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0] 55 | positives = [1] 56 | 57 | binary_truth = [x in positives for x in truth] 58 | binary_prediction = [x in positives for x in prediction] 59 | for t, p in zip(binary_truth, binary_prediction): 60 | confusion_matrix[t,p] += 1 61 | 62 | print confusion_matrix 63 | 64 | from sklearn.metrics import precision_score 65 | print "Precision Score: ", precision_score(prediction, truth) 66 | from sklearn.metrics import recall_score 67 | print "Recall Score: ", recall_score(prediction, truth) 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /feature_selection/email_authors.pkl: -------------------------------------------------------------------------------- 1 | (lp0 2 | I0 3 | aI0 4 | aI0 5 | aI0 6 | aI0 7 | aI0 8 | aI0 9 | aI0 10 | aI0 11 | aI0 12 | aI0 13 | aI0 14 | aI0 15 | aI0 16 | aI0 17 | aI0 18 | aI0 19 | aI0 20 | aI0 21 | aI0 22 | aI0 23 | aI0 24 | aI0 25 | aI0 26 | aI0 27 | aI0 28 | aI0 29 | aI0 30 | aI0 31 | aI0 32 | aI0 33 | aI0 34 | aI0 35 | aI0 36 | aI0 37 | aI0 38 | aI0 39 | aI0 40 | aI0 41 | aI0 42 | aI0 43 | aI0 44 | aI0 45 | aI0 46 | aI0 47 | aI0 48 | aI0 49 | aI0 50 | aI0 51 | aI0 52 | aI0 53 | aI0 54 | aI0 55 | aI0 56 | aI0 57 | aI0 58 | aI0 59 | aI0 60 | aI0 61 | aI0 62 | aI0 63 | aI0 64 | aI0 65 | aI0 66 | aI0 67 | aI0 68 | aI0 69 | aI0 70 | aI0 71 | aI0 72 | aI0 73 | aI0 74 | aI0 75 | aI0 76 | aI0 77 | aI0 78 | aI0 79 | aI0 80 | aI0 81 | aI0 82 | aI0 83 | aI0 84 | aI0 85 | aI0 86 | aI0 87 | aI0 88 | aI0 89 | aI0 90 | aI0 91 | aI0 92 | aI0 93 | aI0 94 | aI0 95 | aI0 96 | aI0 97 | aI0 98 | aI0 99 | aI0 100 | aI0 101 | aI0 102 | aI0 103 | aI0 104 | aI0 105 | aI0 106 | aI0 107 | aI0 108 | aI0 109 | aI0 110 | aI0 111 | aI0 112 | aI0 113 | aI0 114 | aI0 115 | aI0 116 | aI0 117 | aI0 118 | aI0 119 | aI0 120 | aI0 121 | aI0 122 | aI0 123 | aI0 124 | aI0 125 | aI0 126 | aI0 127 | aI0 128 | aI0 129 | aI0 130 | aI0 131 | aI0 132 | aI0 133 | aI0 134 | aI0 135 | aI0 136 | aI0 137 | aI0 138 | aI0 139 | aI0 140 | aI0 141 | aI0 142 | aI0 143 | aI0 144 | aI0 145 | aI0 146 | aI0 147 | aI0 148 | aI0 149 | aI0 150 | aI0 151 | aI0 152 | aI0 153 | aI0 154 | aI0 155 | aI0 156 | aI0 157 | aI0 158 | aI0 159 | aI0 160 | aI0 161 | aI0 162 | aI0 163 | aI0 164 | aI0 165 | aI0 166 | aI0 167 | aI0 168 | aI0 169 | aI0 170 | 
aI0 171 | aI0 172 | aI0 173 | aI0 174 | aI0 175 | aI0 176 | aI0 177 | aI0 178 | aI0 179 | aI0 180 | aI0 181 | aI0 182 | aI0 183 | aI0 184 | aI0 185 | aI0 186 | aI0 187 | aI0 188 | aI0 189 | aI0 190 | aI0 191 | aI0 192 | aI0 193 | aI0 194 | aI0 195 | aI0 196 | aI0 197 | aI0 198 | aI0 199 | aI0 200 | aI0 201 | a. -------------------------------------------------------------------------------- /feature_selection/find_signature.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import pickle 4 | import numpy 5 | numpy.random.seed(42) 6 | 7 | 8 | ### the words (features) and authors (labels), already largely processed 9 | ### these files should have been created from the previous (Lesson 10) mini-project. 10 | words_file = "../text_learning/your_word_data.pkl" 11 | authors_file = "../text_learning/your_email_authors.pkl" 12 | word_data = pickle.load( open(words_file, "r")) 13 | authors = pickle.load( open(authors_file, "r") ) 14 | 15 | 16 | ### test_size is the percentage of events assigned to the test set (remainder go into training) 17 | from sklearn import cross_validation 18 | features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42) 19 | 20 | 21 | from sklearn.feature_extraction.text import TfidfVectorizer 22 | vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, 23 | stop_words='english') 24 | features_train = vectorizer.fit_transform(features_train).toarray() 25 | features_test = vectorizer.transform(features_test).toarray() 26 | 27 | 28 | ### a classic way to overfit is to use a small number 29 | ### of data points and a large number of features 30 | ### train on only 150 events to put ourselves in this regime 31 | features_train = features_train[:150] 32 | labels_train = labels_train[:150] 33 | 34 | 35 | ### your code goes here 36 | from sklearn.tree import DecisionTreeClassifier 37 | clf = DecisionTreeClassifier() 38 | clf.fit(features_train,labels_train) 39 | print "Score: ",clf.score(features_test,labels_test) 40 | ### Accuracy 1.0 on overfit data 41 | importances = clf.feature_importances_ 42 | import numpy as np 43 | indices = np.argsort(importances)[::-1] 44 | print 'Feature Ranking: ' 45 | for i in range(10): 46 | print "{} feature no.{} ({})".format(i+1,indices[i],importances[indices[i]]) 47 | 48 | -------------------------------------------------------------------------------- /k_means/clusters.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangheraio/MachineLearning/1de218e3be186e8689f5526b4834d73cd5952e53/k_means/clusters.pdf -------------------------------------------------------------------------------- /k_means/clusters_before_scaling.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangheraio/MachineLearning/1de218e3be186e8689f5526b4834d73cd5952e53/k_means/clusters_before_scaling.pdf -------------------------------------------------------------------------------- /k_means/k_means_cluster.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | skeleton code for k-means clustering mini-project 5 | 6 | """ 7 | 8 | import pickle 9 | import numpy 10 | import matplotlib.pyplot as plt 11 | import sys 12 | sys.path.append("../tools/") 13 | from feature_format import featureFormat, targetFeatureSplit 14 | 15 | 16 | 
def Draw(pred, features, poi, mark_poi=False, name="image.png", f1_name="feature 1", f2_name="feature 2"): 17 | """ some plotting code designed to help you visualize your clusters """ 18 | 19 | ### plot each cluster with a different color--add more colors for 20 | ### drawing more than 4 clusters 21 | colors = ["b", "c", "k", "m", "g"] 22 | for ii, pp in enumerate(pred): 23 | plt.scatter(features[ii][0], features[ii][1], color = colors[pred[ii]]) 24 | 25 | ### if you like, place red stars over points that are POIs (just for funsies) 26 | if mark_poi: 27 | for ii, pp in enumerate(pred): 28 | if poi[ii]: 29 | plt.scatter(features[ii][0], features[ii][1], color="r", marker="*") 30 | plt.xlabel(f1_name) 31 | plt.ylabel(f2_name) 32 | plt.savefig(name) 33 | plt.show() 34 | 35 | 36 | ### load in the dict of dicts containing all the data on each person in the dataset 37 | data_dict = pickle.load( open("../final_project/final_project_dataset.pkl", "r") ) 38 | ### there's an outlier--remove it! 39 | data_dict.pop("TOTAL", 0) 40 | 41 | ### the input features we want to use 42 | ### can be any key in the person-level dictionary (salary, director_fees, etc.) 43 | feature_1 = "salary" 44 | feature_2 = "exercised_stock_options" 45 | feature_3 = "total_payments" 46 | poi = "poi" 47 | features_list = [poi, feature_1, feature_2, feature_3] 48 | data = featureFormat(data_dict, features_list ) 49 | poi, finance_features = targetFeatureSplit( data ) 50 | 51 | ### in the "clustering with 3 features" part of the mini-project, 52 | ### you'll want to change this line to 53 | ### for f1, f2, _ in finance_features: 54 | for f1, f2, f3 in finance_features: 55 | plt.scatter( f1, f2) 56 | plt.show() 57 | 58 | 59 | from sklearn.cluster import KMeans 60 | features_list = ["poi", feature_1, feature_2, feature_3] 61 | data2 = featureFormat(data_dict, features_list ) 62 | poi, finance_features = targetFeatureSplit( data2 ) 63 | clf = KMeans(n_clusters=3) 64 | pred = clf.fit_predict( finance_features ) 65 | Draw(pred, finance_features, poi, name="clusters_before_scaling.pdf", f1_name=feature_1, f2_name=feature_2) 66 | 67 | 68 | ### cluster here; create predictions of the cluster labels 69 | ### for the data and store them to a list called pred 70 | 71 | try: 72 | Draw(pred, finance_features, poi, mark_poi=False, name="clusters.pdf", f1_name=feature_1, f2_name=feature_2) 73 | except NameError: 74 | print "no predictions object named pred found, no clusters to plot" 75 | -------------------------------------------------------------------------------- /naive_bayes/nb_author_id.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | this is the code to accompany the Lesson 1 (Naive Bayes) mini-project 5 | 6 | use a Naive Bayes Classifier to identify emails by their authors 7 | 8 | authors and labels: 9 | Sara has label 0 10 | Chris has label 1 11 | 12 | """ 13 | 14 | import sys 15 | from time import time 16 | sys.path.append("../tools/") 17 | from email_preprocess import preprocess 18 | from sklearn.naive_bayes import GaussianNB 19 | from sklearn.metrics import accuracy_score 20 | 21 | ### features_train and features_test are the features for the training 22 | ### and testing datasets, respectively 23 | ### labels_train and labels_test are the corresponding item labels 24 | features_train, features_test, labels_train, labels_test = preprocess() 25 | 26 | ######################################################### 27 | ### your code goes here ### 28 | 29 | clf = 
GaussianNB() 30 | clf.fit(features_train, labels_train) 31 | ## training time: 2.848 s 32 | prediction = clf.predict(features_test) 33 | ## prediction time: 0.301 s 34 | 35 | print accuracy_score(prediction, labels_test) 36 | ######################################################### 37 | 38 | 39 | -------------------------------------------------------------------------------- /outliers/enron_outliers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import pickle 4 | import sys 5 | import matplotlib.pyplot 6 | sys.path.append("../tools/") 7 | from feature_format import featureFormat, targetFeatureSplit 8 | 9 | 10 | ### read in data dictionary, convert to numpy array 11 | data_dict = pickle.load( open("../final_project/final_project_dataset.pkl", "r") ) 12 | features = ["salary", "bonus"] 13 | data_dict.pop("TOTAL", 0) 14 | data = featureFormat(data_dict, features) 15 | 16 | ### your code below 17 | for point in data: 18 | salary = point[0] 19 | bonus = point[1] 20 | matplotlib.pyplot.scatter( salary, bonus) 21 | 22 | matplotlib.pyplot.xlabel("salary") 23 | matplotlib.pyplot.ylabel("bonus") 24 | matplotlib.pyplot.show() -------------------------------------------------------------------------------- /outliers/outlier_cleaner.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | 4 | def outlierCleaner(predictions, ages, net_worths): 5 | """ 6 | clean away the 10% of points that have the largest 7 | residual errors (different between the prediction 8 | and the actual net worth) 9 | 10 | return a list of tuples named cleaned_data where 11 | each tuple is of the form (age, net_worth, error) 12 | """ 13 | cleaned_data = [] 14 | 15 | ### your code goes here 16 | errors = (net_worths-predictions)**2 17 | cleaned_data = zip(ages,net_worths,errors) 18 | cleaned_data = sorted(cleaned_data,key=lambda x:x[2][0],reverse=True) 19 | limit = int(len(net_worths)*0.1) 20 | 21 | return cleaned_data[limit:] 22 | 23 | -------------------------------------------------------------------------------- /outliers/outlier_removal_regression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import random 4 | import numpy 5 | import matplotlib.pyplot as plt 6 | import pickle 7 | 8 | from outlier_cleaner import outlierCleaner 9 | 10 | 11 | ### load up some practice data with outliers in it 12 | ages = pickle.load( open("practice_outliers_ages.pkl", "r") ) 13 | net_worths = pickle.load( open("practice_outliers_net_worths.pkl", "r") ) 14 | 15 | 16 | 17 | ### ages and net_worths need to be reshaped into 2D numpy arrays 18 | ### second argument of reshape command is a tuple of integers: (n_rows, n_columns) 19 | ### by convention, n_rows is the number of data points 20 | ### and n_columns is the number of features 21 | ages = numpy.reshape( numpy.array(ages), (len(ages), 1)) 22 | net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1)) 23 | from sklearn.cross_validation import train_test_split 24 | ages_train, ages_test, net_worths_train, net_worths_test = train_test_split(ages, net_worths, test_size=0.1, random_state=42) 25 | 26 | ### fill in a regression here! 
Name the regression object reg so that 27 | ### the plotting code below works, and you can see what your regression looks like 28 | from sklearn import linear_model 29 | reg = linear_model.LinearRegression() 30 | reg.fit(ages_train, net_worths_train) 31 | print "Slope ", reg.coef_ 32 | print "Score ", reg.score(ages_test, net_worths_test) 33 | 34 | try: 35 | plt.plot(ages, reg.predict(ages), color="blue") 36 | except NameError: 37 | pass 38 | plt.scatter(ages, net_worths) 39 | plt.show() 40 | 41 | 42 | ### identify and remove the most outlier-y points 43 | cleaned_data = [] 44 | try: 45 | predictions = reg.predict(ages_train) 46 | cleaned_data = outlierCleaner( predictions, ages_train, net_worths_train ) 47 | except NameError: 48 | print "your regression object doesn't exist, or isn't name reg" 49 | print "can't make predictions to use in identifying outliers" 50 | 51 | ### only run this code if cleaned_data is returning data 52 | if len(cleaned_data) > 0: 53 | ages, net_worths, errors = zip(*cleaned_data) 54 | ages = numpy.reshape( numpy.array(ages), (len(ages), 1)) 55 | net_worths = numpy.reshape( numpy.array(net_worths), (len(net_worths), 1)) 56 | 57 | ### refit your cleaned data! 58 | try: 59 | reg.fit(ages, net_worths) 60 | plt.plot(ages, reg.predict(ages), color="blue") 61 | except NameError: 62 | print "you don't seem to have regression imported/created," 63 | print " or else your regression object isn't named reg" 64 | print " either way, only draw the scatter plot of the cleaned data" 65 | plt.scatter(ages, net_worths) 66 | plt.xlabel("ages") 67 | plt.ylabel("net worths") 68 | plt.show() 69 | 70 | print "New Slope Without Outliers ", reg.coef_ 71 | print "New Score Without Outliers ", reg.score(ages_test, net_worths_test) 72 | 73 | else: 74 | print "outlierCleaner() is returning an empty list, no refitting to be done" 75 | 76 | -------------------------------------------------------------------------------- /outliers/practice_outliers_ages.pkl: -------------------------------------------------------------------------------- 1 | (lp0 2 | I49 3 | aI21 4 | aI32 5 | aI30 6 | aI53 7 | aI51 8 | aI61 9 | aI23 10 | aI39 11 | aI21 12 | aI30 13 | aI43 14 | aI21 15 | aI29 16 | aI49 17 | aI45 18 | aI30 19 | aI47 20 | aI57 21 | aI20 22 | aI57 23 | aI52 24 | aI35 25 | aI27 26 | aI64 27 | aI35 28 | aI24 29 | aI24 30 | aI58 31 | aI47 32 | aI57 33 | aI53 34 | aI44 35 | aI64 36 | aI37 37 | aI45 38 | aI58 39 | aI48 40 | aI59 41 | aI46 42 | aI52 43 | aI22 44 | aI30 45 | aI33 46 | aI23 47 | aI30 48 | aI24 49 | aI32 50 | aI49 51 | aI36 52 | aI37 53 | aI29 54 | aI32 55 | aI63 56 | aI49 57 | aI48 58 | aI27 59 | aI53 60 | aI27 61 | aI37 62 | aI65 63 | aI49 64 | aI45 65 | aI51 66 | aI58 67 | aI55 68 | aI30 69 | aI21 70 | aI34 71 | aI32 72 | aI29 73 | aI63 74 | aI60 75 | aI34 76 | aI50 77 | aI38 78 | aI62 79 | aI41 80 | aI32 81 | aI31 82 | aI45 83 | aI32 84 | aI46 85 | aI61 86 | aI38 87 | aI30 88 | aI65 89 | aI43 90 | aI24 91 | aI22 92 | aI25 93 | aI48 94 | aI56 95 | aI39 96 | aI22 97 | aI37 98 | aI65 99 | aI44 100 | aI64 101 | aI59 102 | a. 
-------------------------------------------------------------------------------- /outliers/practice_outliers_net_worths.pkl: -------------------------------------------------------------------------------- 1 | (lp0 2 | F316.18428306022463 3 | aF128.4847139765763 4 | aF212.95377076201385 5 | aF217.96059712816052 6 | aF326.56693250553326 7 | aF314.0672608610164 8 | aF412.83425631014785 9 | aF159.09869458305818 10 | aF234.36051228130097 11 | aF142.1012008717193 12 | aF178.23164614375077 13 | aF259.43540492859483 14 | aF136.08924543132068 15 | aF142.98439510684403 16 | aF271.75164334973937 17 | aF270.00424941518054 18 | aF167.24337759331152 19 | aF300.0349466519055 20 | aF338.08951848957577 21 | aF96.75392597329417 22 | aF385.5629753784311 23 | aF320.4844739902693 24 | aF220.10056409375846 25 | aF140.25503627573087 26 | aF389.1123455094963 27 | aF220.96845179419734 28 | aF126.98012845155395 29 | aF157.51396036691344 30 | aF350.4872262016239 31 | aF287.91612500413447 32 | aF344.21586775541203 33 | aF368.29556369017877 34 | aF274.7300555052413 35 | aF378.84578142088196 36 | aF247.70089824206377 37 | aF256.83312700057957 38 | aF366.6772719000951 39 | aF260.8065975224045 40 | aF342.1862790220314 41 | aF291.4372247173825 42 | aF339.7693315999082 43 | aF140.9273656237994 44 | aF185.1870343522352 45 | aF200.22792608821422 46 | aF114.17956019265145 47 | aF173.10311583210583 48 | aF140.78722458080426 49 | aF221.1424445243783 50 | aF313.12236579136925 51 | aF189.73919689274533 52 | aF237.7316793878959 53 | aF173.54835439167368 54 | aF186.46155999388083 55 | aF405.9835257768174 56 | aF326.86999044991904 57 | aF318.62560238232396 58 | aF151.96564953554724 59 | aF325.0657524829757 60 | aF175.37526862807127 61 | aF250.76090254244718 62 | aF396.6665152430942 63 | aF302.53682046672367 64 | aF259.1233005198794 65 | aF294.8258675183866 66 | aF378.75051644788397 67 | aF370.8748005714165 68 | aF186.05979756839332 69 | aF151.32065795784047 70 | aF219.7327205009527 71 | aF187.0976049078975 72 | aF188.4779121101683 73 | aF424.51073132931936 74 | aF374.28347921780096 75 | aF243.79287311628013 76 | aF260.1050979182051 77 | aF253.93805008750448 78 | aF389.24094136476344 79 | aF250.26985299068266 80 | aF201.83521553071006 81 | aF153.99862170798215 82 | aF276.85656224324975 83 | aF207.14225143023492 84 | aF317.0578808948303 85 | aF370.88459563452705 86 | aF221.33012794213624 87 | aF177.46485912830926 88 | aF424.5580423540415 89 | aF275.3250221931937 90 | aF139.40479592465923 91 | aF147.76534866226712 92 | aF133.88384977349668 93 | aF80.9013362873476 94 | aF121.11602327460938 95 | aF134.41369498852399 96 | aF47.70052124577958 97 | aF16.507788679151513 98 | aF34.19027438129125 99 | aF64.06616829393845 100 | aF122.70221488837397 101 | aF129.10958748845152 102 | a. -------------------------------------------------------------------------------- /pca/eigenfaces.py: -------------------------------------------------------------------------------- 1 | """ 2 | =================================================== 3 | Faces recognition example using eigenfaces and SVMs 4 | =================================================== 5 | 6 | The dataset used in this example is a preprocessed excerpt of the 7 | "Labeled Faces in the Wild", aka LFW_: 8 | 9 | http://vis-www.cs.umass.edu/lfw/lfw-funneled.tgz (233MB) 10 | 11 | .. 
_LFW: http://vis-www.cs.umass.edu/lfw/ 12 | 13 | original source: http://scikit-learn.org/stable/auto_examples/applications/face_recognition.html 14 | 15 | """ 16 | 17 | 18 | 19 | print __doc__ 20 | 21 | from time import time 22 | import logging 23 | import pylab as pl 24 | import numpy as np 25 | 26 | from sklearn.cross_validation import train_test_split 27 | from sklearn.datasets import fetch_lfw_people 28 | from sklearn.grid_search import GridSearchCV 29 | from sklearn.metrics import classification_report 30 | from sklearn.metrics import confusion_matrix 31 | from sklearn.decomposition import RandomizedPCA 32 | from sklearn.svm import SVC 33 | 34 | # Display progress logs on stdout 35 | logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') 36 | 37 | 38 | ############################################################################### 39 | # Download the data, if not already on disk and load it as numpy arrays 40 | 41 | lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4) 42 | 43 | # introspect the images arrays to find the shapes (for plotting) 44 | n_samples, h, w = lfw_people.images.shape 45 | np.random.seed(42) 46 | 47 | # fot machine learning we use the 2 data directly (as relative pixel 48 | # positions info is ignored by this model) 49 | X = lfw_people.data 50 | n_features = X.shape[1] 51 | 52 | # the label to predict is the id of the person 53 | y = lfw_people.target 54 | target_names = lfw_people.target_names 55 | n_classes = target_names.shape[0] 56 | 57 | print "Total dataset size:" 58 | print "n_samples: %d" % n_samples 59 | print "n_features: %d" % n_features 60 | print "n_classes: %d" % n_classes 61 | 62 | 63 | ############################################################################### 64 | # Split into a training set and a test set using a stratified k fold 65 | 66 | # split into a training and testing set 67 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42) 68 | 69 | 70 | ############################################################################### 71 | # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled 72 | # dataset): unsupervised feature extraction / dimensionality reduction 73 | 74 | #n_components = 10 # crashes 75 | #n_components = 15 # f1 score average of 0.65 76 | #n_components = 25 # f1 score average of 0.74 77 | #n_components = 50 # f1 score average of 0.81 78 | #n_components = 100 # f1 score average of 0.85 79 | #n_components = 150 # f1 score average of 0.83 80 | n_components = 200 # f1 score average of 0.85 81 | #n_components = 300 # f1 score average of 0.8 82 | 83 | 84 | print "Extracting the top %d eigenfaces from %d faces" % (n_components, X_train.shape[0]) 85 | t0 = time() 86 | pca = RandomizedPCA(n_components=n_components, whiten=True).fit(X_train) 87 | print "done in %0.3fs" % (time() - t0) 88 | 89 | eigenfaces = pca.components_.reshape((n_components, h, w)) 90 | 91 | print "Projecting the input data on the eigenfaces orthonormal basis" 92 | t0 = time() 93 | X_train_pca = pca.transform(X_train) 94 | X_test_pca = pca.transform(X_test) 95 | print "done in %0.3fs" % (time() - t0) 96 | print "Variance", pca.explained_variance_ratio_[:2] 97 | 98 | ############################################################################### 99 | # Train a SVM classification model 100 | 101 | print "Fitting the classifier to the training set" 102 | t0 = time() 103 | param_grid = { 104 | 'C': [1e3, 5e3, 1e4, 5e4, 1e5], 105 | 'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1], 
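# (added note: C is the SVC error penalty and gamma the RBF kernel width; GridSearchCV below
# fits one model per C/gamma pair in this grid and keeps the combination with the best
# cross-validated score.)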
106 | } 107 | clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid) 108 | clf = clf.fit(X_train_pca, y_train) 109 | print "done in %0.3fs" % (time() - t0) 110 | print "Best estimator found by grid search:" 111 | print clf.best_estimator_ 112 | 113 | 114 | ############################################################################### 115 | # Quantitative evaluation of the model quality on the test set 116 | 117 | print "Predicting the people names on the testing set" 118 | t0 = time() 119 | y_pred = clf.predict(X_test_pca) 120 | print "done in %0.3fs" % (time() - t0) 121 | 122 | print classification_report(y_test, y_pred, target_names=target_names) 123 | print confusion_matrix(y_test, y_pred, labels=range(n_classes)) 124 | 125 | 126 | ############################################################################### 127 | # Qualitative evaluation of the predictions using matplotlib 128 | 129 | def plot_gallery(images, titles, h, w, n_row=3, n_col=4): 130 | """Helper function to plot a gallery of portraits""" 131 | pl.figure(figsize=(1.8 * n_col, 2.4 * n_row)) 132 | pl.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35) 133 | for i in range(n_row * n_col): 134 | pl.subplot(n_row, n_col, i + 1) 135 | pl.imshow(images[i].reshape((h, w)), cmap=pl.cm.gray) 136 | pl.title(titles[i], size=12) 137 | pl.xticks(()) 138 | pl.yticks(()) 139 | 140 | # plot the result of the prediction on a portion of the test set 141 | 142 | def title(y_pred, y_test, target_names, i): 143 | pred_name = target_names[y_pred[i]].rsplit(' ', 1)[-1] 144 | true_name = target_names[y_test[i]].rsplit(' ', 1)[-1] 145 | return 'predicted: %s\ntrue: %s' % (pred_name, true_name) 146 | 147 | prediction_titles = [title(y_pred, y_test, target_names, i) 148 | for i in range(y_pred.shape[0])] 149 | 150 | plot_gallery(X_test, prediction_titles, h, w) 151 | 152 | # plot the gallery of the most significative eigenfaces 153 | 154 | eigenface_titles = ["eigenface %d" % i for i in range(eigenfaces.shape[0])] 155 | plot_gallery(eigenfaces, eigenface_titles, h, w) 156 | 157 | pl.show() 158 | -------------------------------------------------------------------------------- /regression/finance_regression.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | starter code for the regression mini-project 5 | 6 | loads up/formats a modified version of the dataset 7 | (why modified? 
we've removed some trouble points 8 | that you'll find yourself in the outliers mini-project) 9 | 10 | draws a little scatterplot of the training/testing data 11 | 12 | you fill in the regression code where indicated 13 | 14 | """ 15 | 16 | 17 | import sys 18 | import pickle 19 | sys.path.append("../tools/") 20 | from feature_format import featureFormat, targetFeatureSplit 21 | dictionary = pickle.load( open("../final_project/final_project_dataset_modified.pkl", "r") ) 22 | 23 | ### list the features you want to look at--first item in the 24 | ### list will be the "target" feature 25 | features_list = ["bonus", "long_term_incentive"] 26 | data = featureFormat( dictionary, features_list, remove_any_zeroes=True) 27 | target, features = targetFeatureSplit( data ) 28 | 29 | ### training-testing split needed in regression, just like classification 30 | from sklearn.cross_validation import train_test_split 31 | feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42) 32 | train_color = "b" 33 | test_color = "r" 34 | 35 | ### your regression goes here! 36 | ### please name it reg, so that the plotting code below picks it up and 37 | ### plots it correctly 38 | from sklearn import linear_model 39 | reg = linear_model.LinearRegression() 40 | reg.fit(feature_train, target_train) 41 | 42 | ### draw the scatterplot, with color-coded training and testing points 43 | import matplotlib.pyplot as plt 44 | for feature, target in zip(feature_test, target_test): 45 | plt.scatter( feature, target, color=test_color ) 46 | for feature, target in zip(feature_train, target_train): 47 | plt.scatter( feature, target, color=train_color ) 48 | 49 | ### labels for the legend 50 | plt.scatter(feature_test[0], target_test[0], color=test_color, label="test") 51 | plt.scatter(feature_test[0], target_test[0], color=train_color, label="train") 52 | 53 | print "Slope ", reg.coef_ 54 | print "Intercept ", reg.intercept_ 55 | print "Score ", reg.score(feature_test, target_test) 56 | 57 | 58 | ### draw the regression line, once it's coded 59 | try: 60 | plt.plot( feature_test, reg.predict(feature_test) ) 61 | except NameError: 62 | pass 63 | reg.fit(feature_test, target_test) 64 | print "Slope ", reg.coef_ 65 | plt.plot(feature_train, reg.predict(feature_train), color="r") 66 | plt.xlabel(features_list[1]) 67 | plt.ylabel(features_list[0]) 68 | plt.legend() 69 | plt.show() 70 | -------------------------------------------------------------------------------- /svm/svm_author_id.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | this is the code to accompany the Lesson 2 (SVM) mini-project 5 | 6 | use an SVM to identify emails from the Enron corpus by their authors 7 | 8 | Sara has label 0 9 | Chris has label 1 10 | 11 | """ 12 | 13 | import sys 14 | from time import time 15 | sys.path.append("../tools/") 16 | from email_preprocess import preprocess 17 | from sklearn import svm 18 | from sklearn.metrics import accuracy_score 19 | ### features_train and features_test are the features for the training 20 | ### and testing datasets, respectively 21 | ### labels_train and labels_test are the corresponding item labels 22 | features_train, features_test, labels_train, labels_test = preprocess() 23 | #features_train = features_train[:len(features_train)/100] 24 | #labels_train = labels_train[:len(labels_train)/100] 25 | 26 | ######################################################### 27 | ### your code 
goes here ### 28 | clf = svm.SVC(kernel="rbf", C=10500) 29 | t0 = time() 30 | clf.fit(features_train, labels_train) 31 | print "Training time:", round(time()-t0, 3), "s" 32 | # Training Time: 213 seconds 33 | # Training Time w/ subset of data: 0.117 seconds 34 | # Training Time w/ rbf kernel: 0.164 seconds 35 | # Training Time w/ rbf kernel & C=10: 0.127 seconds 36 | # Training Time w/ rbf kernel & C=100: 0.138 seconds 37 | # Training Time w/ rbf kernel & C=1000: 0.127 seconds 38 | # Training Time w/ rbf kernel & C=10000: 0.129 seconds 39 | # Training Time (full) rbf kernel & C=10000: 137.442 seconds 40 | t1 = time() 41 | prediction = clf.predict(features_test) 42 | print "Prediction time:", round(time()-t1, 3), "s" 43 | # Prediction Time: 23 second 44 | # Prediction Time w/ subset of data: 0.88 seconds 45 | # Prediction Time w/ rbf kernel: 1.475 seconds 46 | # Prediction Time w/ rbf kernel & C=10: 1.465 seconds 47 | # Prediction Time w/ rbf kernel & C=100: 1.338 seconds 48 | # Prediction Time w/ rbf kernel & C=1000: 1.279 seconds 49 | # Prediction Time w/ rbf kernel & C=10000: 1.159 seconds 50 | # Prediction Time (full) rbf kernel & C=10000: 14.656 seconds 51 | print accuracy_score(prediction, labels_test) 52 | # Accuracy: 0.984072810011 53 | # Accuracy w/ subset: 0.884527872582 54 | # Accuracy w/ rbf kernel: 0.616040955631 55 | # Accuracy w/ rbf kernel & C=10 or 100: 0.616040955631 56 | # Accuracy w/ rbf kernel & C=1000: 0.821387940842 57 | # Accuracy w/ rbf kernel & C=10000: 0.892491467577 58 | # Accuracy (full) rbf kernel & C=10000: 0.990898748578 59 | 60 | # Prediction for feature_test[10][26][50] = [1][0][1] 61 | # print clf.predict(features_test[50]) 62 | 63 | chris = [] 64 | # Get number of predicted emails written by Chris. Ans: 877 65 | for i in prediction: 66 | if i == 1: 67 | chris.append(i) 68 | 69 | print len(chris) 70 | ######################################################### 71 | 72 | 73 | -------------------------------------------------------------------------------- /text_learning/test_email.txt: -------------------------------------------------------------------------------- 1 | To: Katie_and_Sebastians_Excellent_Students@udacity.com 2 | From: katie@udacity.com 3 | X-FileName: 4 | 5 | Hi Everyone! If you can read this message, you're properly using parseOutText. Please proceed to the next part of the project! 
6 | -------------------------------------------------------------------------------- /text_learning/vectorize_text.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import os 4 | import pickle 5 | import re 6 | import sys 7 | 8 | sys.path.append( "../tools/" ) 9 | from parse_out_email_text import parseOutText 10 | 11 | """ 12 | starter code to process the emails from Sara and Chris to extract 13 | the features and get the documents ready for classification 14 | 15 | the list of all the emails from Sara are in the from_sara list 16 | likewise for emails from Chris (from_chris) 17 | 18 | the actual documents are in the Enron email dataset, which 19 | you downloaded/unpacked in Part 0 of the first mini-project 20 | 21 | the data is stored in lists and packed away in pickle files at the end 22 | 23 | """ 24 | ### Parse out test email 25 | testEmail = open("test_email.txt", "r") 26 | print parseOutText(testEmail) 27 | 28 | 29 | from_sara = open("from_sara.txt", "r") 30 | from_chris = open("from_chris.txt", "r") 31 | 32 | from_data = [] 33 | word_data = [] 34 | 35 | ### temp_counter is a way to speed up the development--there are 36 | ### thousands of emails from Sara and Chris, so running over all of them 37 | ### can take a long time 38 | ### temp_counter helps you only look at the first 200 emails in the list 39 | temp_counter = 0 40 | 41 | 42 | for name, from_person in [("sara", from_sara), ("chris", from_chris)]: 43 | for path in from_person: 44 | ### only look at first 200 emails when developing 45 | ### once everything is working, remove this line to run over full dataset 46 | temp_counter += 1 47 | if temp_counter < 200: 48 | path = os.path.join('..', path[:-1]) 49 | print path 50 | email = open(path, "r") 51 | 52 | ### use parseOutText to extract the text from the opened email 53 | ### use str.replace() to remove any instances of the words 54 | ### ["sara", "shackleton", "chris", "germani"] 55 | words = parseOutText(email) 56 | remove = ["sara", "shackleton", "chris", "germani"] 57 | #list_rep = ["sara", "shackleton", "chris", "germani","sshacklensf","cgermannsf"] 58 | for word in remove: 59 | words = words.replace(word,"") 60 | ### append the text to word_data 61 | word_data.append(words) 62 | ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris 63 | from_data.append(0 if name == "sara" else 1) 64 | 65 | email.close() 66 | 67 | print "Entry #152: ", word_data[152] 68 | 69 | print "emails processed" 70 | from_sara.close() 71 | from_chris.close() 72 | 73 | pickle.dump( word_data, open("your_word_data.pkl", "w") ) 74 | pickle.dump( from_data, open("your_email_authors.pkl", "w") ) 75 | 76 | 77 | ### in Part 4, do TfIdf vectorization here 78 | from nltk.corpus import stopwords 79 | sw = stopwords.words("english") 80 | 81 | from sklearn.feature_extraction.text import TfidfVectorizer 82 | vectorizer = TfidfVectorizer(stop_words="english",lowercase=True) 83 | vectorizer.fit_transform(word_data) 84 | # bag_words = vectorizer.transform(word_data) 85 | 86 | # Get how many unique words there are in the emails 87 | print len(vectorizer.get_feature_names()) 88 | 89 | # Word number 34597 90 | print vectorizer.get_feature_names()[34597] -------------------------------------------------------------------------------- /tools/email_preprocess.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import pickle 4 | import numpy 5 | 6 | from sklearn import 
cross_validation 7 | from sklearn.feature_extraction.text import TfidfVectorizer 8 | from sklearn.feature_selection import SelectPercentile, f_classif 9 | 10 | 11 | 12 | def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/email_authors.pkl"): 13 | """ 14 | this function takes a pre-made list of email texts (by default word_data.pkl) 15 | and the corresponding authors (by default email_authors.pkl) and performs 16 | a number of preprocessing steps: 17 | -- splits into training/testing sets (10% testing) 18 | -- vectorizes into tfidf matrix 19 | -- selects/keeps most helpful features 20 | 21 | after this, the feaures and labels are put into numpy arrays, which play nice with sklearn functions 22 | 23 | 4 objects are returned: 24 | -- training/testing features 25 | -- training/testing labels 26 | 27 | """ 28 | 29 | ### the words (features) and authors (labels), already largely preprocessed 30 | ### this preprocessing will be repeated in the text learning mini-project 31 | word_data = pickle.load( open(words_file, "r")) 32 | authors = pickle.load( open(authors_file, "r") ) 33 | 34 | ### test_size is the percentage of events assigned to the test set (remainder go into training) 35 | features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42) 36 | 37 | 38 | 39 | ### text vectorization--go from strings to lists of numbers 40 | vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, 41 | stop_words='english') 42 | features_train_transformed = vectorizer.fit_transform(features_train) 43 | features_test_transformed = vectorizer.transform(features_test) 44 | 45 | 46 | 47 | ### feature selection, because text is super high dimensional and 48 | ### can be really computationally chewy as a result 49 | selector = SelectPercentile(f_classif, percentile=1) 50 | selector.fit(features_train_transformed, labels_train) 51 | features_train_transformed = selector.transform(features_train_transformed).toarray() 52 | features_test_transformed = selector.transform(features_test_transformed).toarray() 53 | 54 | ### info on the data 55 | print "no. of Chris training emails:", sum(labels_train) 56 | print "no. of Sara training emails:", len(labels_train)-sum(labels_train) 57 | 58 | 59 | return features_train_transformed, features_test_transformed, labels_train, labels_test 60 | -------------------------------------------------------------------------------- /tools/email_preprocess.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangheraio/MachineLearning/1de218e3be186e8689f5526b4834d73cd5952e53/tools/email_preprocess.pyc -------------------------------------------------------------------------------- /tools/feature_format.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | """ 4 | A general tool for converting data from the 5 | dictionary format to an (n x k) python list that's 6 | ready for training an sklearn algorithm 7 | 8 | n--no. of key-value pairs in dictonary 9 | k--no. 
of features being extracted 10 | 11 | dictionary keys are names of persons in dataset 12 | dictionary values are dictionaries, where each 13 | key-value pair in the dict is the name 14 | of a feature, and its value for that person 15 | 16 | In addition to converting a dictionary to a numpy 17 | array, you may want to separate the labels from the 18 | features--this is what targetFeatureSplit is for 19 | 20 | so, if you want to have the poi label as the target, 21 | and the features you want to use are the person's 22 | salary and bonus, here's what you would do: 23 | 24 | feature_list = ["poi", "salary", "bonus"] 25 | data_array = featureFormat( data_dictionary, feature_list ) 26 | label, features = targetFeatureSplit(data_array) 27 | 28 | the line above (targetFeatureSplit) assumes that the 29 | label is the _first_ item in feature_list--very important 30 | that poi is listed first! 31 | """ 32 | 33 | 34 | import numpy as np 35 | 36 | def featureFormat( dictionary, features, remove_NaN=True, remove_all_zeroes=True, remove_any_zeroes=False, sort_keys = False): 37 | """ convert dictionary to numpy array of features 38 | remove_NaN=True will convert "NaN" string to 0.0 39 | remove_all_zeroes=True will omit any data points for which 40 | all the features you seek are 0.0 41 | remove_any_zeroes=True will omit any data points for which 42 | any of the features you seek are 0.0 43 | """ 44 | 45 | 46 | return_list = [] 47 | 48 | if sort_keys: 49 | keys = sorted(dictionary.keys()) 50 | else: 51 | keys = dictionary.keys() 52 | 53 | for key in keys: 54 | tmp_list = [] 55 | append = False 56 | for feature in features: 57 | try: 58 | dictionary[key][feature] 59 | except KeyError: 60 | print "error: key ", feature, " not present" 61 | return 62 | value = dictionary[key][feature] 63 | if value=="NaN" and remove_NaN: 64 | value = 0 65 | tmp_list.append( float(value) ) 66 | 67 | ### if all features are zero and you want to remove 68 | ### data points that are all zero, do that here 69 | if remove_all_zeroes: 70 | all_zeroes = True 71 | for item in tmp_list: 72 | if item != 0 and item != "NaN": 73 | append = True 74 | 75 | ### if any features for a given data point are zero 76 | ### and you want to remove data points with any zeroes, 77 | ### handle that here 78 | if remove_any_zeroes: 79 | any_zeroes = False 80 | if 0 in tmp_list or "NaN" in tmp_list: 81 | append = False 82 | if append: 83 | return_list.append( np.array(tmp_list) ) 84 | 85 | 86 | return np.array(return_list) 87 | 88 | 89 | def targetFeatureSplit( data ): 90 | """ 91 | given a numpy array like the one returned from 92 | featureFormat, separate out the first feature 93 | and put it into its own list (this should be the 94 | quantity you want to predict) 95 | 96 | return targets and features as separate lists 97 | 98 | (sklearn can generally handle both lists and numpy arrays as 99 | input formats when training/predicting) 100 | """ 101 | 102 | target = [] 103 | features = [] 104 | for item in data: 105 | target.append( item[0] ) 106 | features.append( item[1:] ) 107 | 108 | return target, features 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /tools/parse_out_email_text.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from nltk.stem.snowball import SnowballStemmer 4 | import string 5 | 6 | def parseOutText(f): 7 | """ given an opened email file f, parse out all text below the 8 | metadata block at the top 9 | 
(in Part 2, you will also add stemming capabilities) 10 | and return a string that contains all the words 11 | in the email (space-separated) 12 | 13 | example use case: 14 | f = open("email_file_name.txt", "r") 15 | text = parseOutText(f) 16 | 17 | """ 18 | 19 | 20 | f.seek(0) ### go back to beginning of file (annoying) 21 | all_text = f.read() 22 | 23 | ### split off metadata 24 | content = all_text.split("X-FileName:") 25 | words = "" 26 | if len(content) > 1: 27 | ### remove punctuation 28 | text_string = content[1].translate(string.maketrans("", ""), string.punctuation) 29 | 30 | ### project part 2: comment out the line below 31 | ## words = text_string 32 | 33 | ### split the text string into individual words, stem each word, 34 | ### and append the stemmed word to words (make sure there's a single 35 | ### space between each stemmed word) 36 | from nltk.stem.snowball import SnowballStemmer 37 | words = [] 38 | for word in text_string.split(): 39 | stemmer = SnowballStemmer("english") 40 | stemWord = stemmer.stem(word) 41 | words.append(stemWord) 42 | 43 | return string.join(words) 44 | 45 | 46 | 47 | def main(): 48 | ff = open("../text_learning/test_email.txt", "r") 49 | text = parseOutText(ff) 50 | print text 51 | 52 | 53 | 54 | if __name__ == '__main__': 55 | main() 56 | 57 | -------------------------------------------------------------------------------- /tools/startup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | print 4 | print "checking for nltk" 5 | try: 6 | import nltk 7 | except ImportError: 8 | print "you should install nltk before continuing" 9 | 10 | print "checking for numpy" 11 | try: 12 | import numpy 13 | except ImportError: 14 | print "you should install numpy before continuing" 15 | 16 | print "checking for sklearn" 17 | try: 18 | import sklearn 19 | except: 20 | print "you should install sklearn before continuing" 21 | 22 | print 23 | print "downloading the Enron dataset (this may take a while)" 24 | print "to check on progress, you can cd up one level, then execute " 25 | print "Enron dataset should be last item on the list, along with its current size" 26 | print "download will complete at about 423 MB" 27 | import urllib 28 | url = "https://www.cs.cmu.edu/~./enron/enron_mail_20110402.tgz" 29 | urllib.urlretrieve(url, filename="../enron_mail_20110402.tgz") 30 | print "download complete!" 31 | 32 | 33 | print 34 | print "unzipping Enron dataset (this may take a while)" 35 | import tarfile 36 | import os 37 | os.chdir("..") 38 | tfile = tarfile.open("enron_mail_20110402.tgz", "r:gz") 39 | tfile.extractall(".") 40 | 41 | print "you're ready to go!" 42 | -------------------------------------------------------------------------------- /validation/validate_poi.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | 4 | """ 5 | starter code for the validation mini-project 6 | the first step toward building your POI identifier! 7 | 8 | start by loading/formatting the data 9 | 10 | after that, it's not our code anymore--it's yours! 11 | """ 12 | 13 | import pickle 14 | import sys 15 | sys.path.append("../tools/") 16 | from feature_format import featureFormat, targetFeatureSplit 17 | 18 | data_dict = pickle.load(open("../final_project/final_project_dataset.pkl", "r") ) 19 | 20 | ### first element is our labels, any added elements are predictor 21 | ### features. 
Keep this the same for the mini-project, but you'll 22 | ### have a different feature list when you do the final project. 23 | features_list = ["poi", "salary"] 24 | 25 | data = featureFormat(data_dict, features_list) 26 | labels, features = targetFeatureSplit(data) 27 | 28 | ### it's all yours from here forward! 29 | 30 | # Split data into training and testing 31 | from sklearn import cross_validation 32 | features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels, test_size=0.3, random_state=42) 33 | 34 | # Fit the data with the sklearn decision tree algorithm 35 | from sklearn import tree 36 | clf = tree.DecisionTreeClassifier() 37 | clf = clf.fit(features_train, labels_train) 38 | 39 | # Get the accuracy 40 | from sklearn.metrics import accuracy_score 41 | prediction = clf.predict(features_test) 42 | print accuracy_score(prediction, labels_test) 43 | # Accuracy when training and testing on the same data (overfit): 0.989583333333 --------------------------------------------------------------------------------
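# Appended note (not part of the original repository): the scripts above target a scikit-learn
# release that still shipped sklearn.cross_validation, sklearn.grid_search and RandomizedPCA,
# and they use the Python 2 print statement. A minimal sketch of the train/test split from
# validate_poi.py on a current scikit-learn install, assuming the `features` and `labels`
# lists built by featureFormat/targetFeatureSplit as above, is:
#
#     from sklearn.model_selection import train_test_split   # replaces sklearn.cross_validation
#     features_train, features_test, labels_train, labels_test = train_test_split(
#         features, labels, test_size=0.3, random_state=42)
#
# sklearn.grid_search.GridSearchCV (used in pca/eigenfaces.py) likewise lives in
# sklearn.model_selection on current releases; the fitting and accuracy_score() calls
# themselves are unchanged.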