├── Feature Selection.py
├── Fraud Detection from ENRON dataset.ipynb
├── K means.py
├── Outliers Removal.py
├── README.md
├── Regression.py
├── enron61702insiderpay.pdf
├── evaluate_poi_identifier.py
├── img
│   ├── feature.png
│   ├── k_means.png
│   ├── outliers.png
│   └── regression.png
└── validate_poi.py

--------------------------------------------------------------------------------
/Feature Selection.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[83]:

import pickle
import numpy
numpy.random.seed(42)

# In[84]:

word_file = "C:/Users/Geekquad/ud120-projects/feature_selection/word_data_modified_unix.pkl"
author_file = "C:/Users/Geekquad/ud120-projects/feature_selection/email_authors_modified_unix.pkl"
word_data = pickle.load(open(word_file, "rb"))
author_data = pickle.load(open(author_file, "rb"))

# In[85]:

# sklearn.cross_validation was removed in scikit-learn 0.20; model_selection is its replacement
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(word_data, author_data, test_size=0.1, random_state=42)

# In[86]:

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test).toarray()

# In[87]:

# train on only 150 data points to push the model into the overfitting regime
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]

# In[88]:

print('number of training points: ', len(features_train))

# In[89]:

"""Overfit the decision tree and check the accuracy."""
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

clf = DecisionTreeClassifier()
clf.fit(features_train, labels_train)
y_pred = clf.predict(features_test)

print(confusion_matrix(labels_test, y_pred))
print(classification_report(labels_test, y_pred))
print(accuracy_score(labels_test, y_pred))

# Yes, the accuracy is much higher than it should be.
# Hence, find the most important features.

# In[90]:

# identify the most important features
import numpy as np
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature Ranking")
for i in range(10):
    print("{} feature no.{} ({})".format(i + 1, indices[i], importances[indices[i]]))

# In[91]:

# look up the word behind the dominant feature
vectorizer.get_feature_names()[2802]

# This word seems like an outlier in a certain sense, so let’s remove it and refit.
# In[99]:

word_file = "C:/Users/Geekquad/ud120-projects/feature_selection/word_data_overfit_modified_unix.pkl"
author_file = "C:/Users/Geekquad/ud120-projects/feature_selection/email_authors_overfit_modified_unix.pkl"
word_data = pickle.load(open(word_file, "rb"))
author_data = pickle.load(open(author_file, "rb"))

# In[100]:

from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(word_data, author_data, test_size=0.1, random_state=42)

# In[101]:

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test).toarray()

# In[102]:

# train on only 150 data points to push the model into the overfitting regime
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]

# In[103]:

"""Overfit the decision tree and check the accuracy."""
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

clf = DecisionTreeClassifier()
clf.fit(features_train, labels_train)
y_pred = clf.predict(features_test)

print(confusion_matrix(labels_test, y_pred))
print(classification_report(labels_test, y_pred))
print(accuracy_score(labels_test, y_pred))

# In[104]:

# identify the most important features
import numpy as np
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature Ranking")
for i in range(10):
    print("{} feature no.{} ({})".format(i + 1, indices[i], importances[indices[i]]))

# In[105]:

vectorizer.get_feature_names()[33604]

--------------------------------------------------------------------------------
/K means.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[22]:

import pickle
import numpy
import sys
import matplotlib.pyplot as plt
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

# In[23]:

def Draw(pred, features, poi, mark_poi=False, name='image.png', f1_name='feature 1', f2_name='feature 2'):
    """Scatter-plot the clusters, optionally marking POIs with a red star."""
    colors = ["b", "c", "k", "m", "g"]
    for ii, pp in enumerate(pred):
        plt.scatter(features[ii][0], features[ii][1], color=colors[pred[ii]])

    if mark_poi:
        for ii, pp in enumerate(pred):
            if poi[ii]:
                plt.scatter(features[ii][0], features[ii][1], color="r", marker="*")

    plt.xlabel(f1_name)
    plt.ylabel(f2_name)
    plt.savefig(name)
    plt.show()

# In[24]:

data_dict = pickle.load(open("C:/Users/Geekquad/final_project_dataset_modified_unix.pkl", "rb"))
data_dict.pop("TOTAL", 0)  # "TOTAL" is a spreadsheet aggregation row, not a person

# In[25]:

feature_1 = "salary"
feature_2 = "exercised_stock_options"
poi = "poi"
features_list = [poi, feature_1, feature_2]
data = featureFormat(data_dict, features_list)
poi, finance_features = targetFeatureSplit(data)
# In[26]:

for f1, f2 in finance_features:
    plt.scatter(f1, f2)
plt.show()

# ## Clustering using K-means:

# In[27]:

from sklearn.cluster import KMeans
features_list = ["poi", feature_1, feature_2]
data2 = featureFormat(data_dict, features_list)
poi, finance_features = targetFeatureSplit(data2)
clf = KMeans(n_clusters=2)
pred = clf.fit_predict(finance_features)
Draw(pred, finance_features, poi, name="clusters_before_scaling.pdf", f1_name=feature_1, f2_name=feature_2)

--------------------------------------------------------------------------------
/Outliers Removal.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[51]:

import random
import numpy as np
import pickle
import matplotlib.pyplot as plt
from outlier_cleaner import outlierCleaner

# In[52]:

ages = pickle.load(open("practice_outliers_ages_modified_unix.pkl", "rb"))
net_worths = pickle.load(open("practice_outliers_net_worths_modified_unix.pkl", "rb"))

# In[53]:

ages = np.reshape(np.array(ages), (len(ages), 1))
net_worths = np.reshape(np.array(net_worths), (len(net_worths), 1))

from sklearn.model_selection import train_test_split
ages_train, ages_test, net_worths_train, net_worths_test = train_test_split(ages, net_worths, test_size=0.1, random_state=42)

# In[54]:

from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(ages_train, net_worths_train)
reg.predict(ages_test)
print('coef:', reg.coef_)
print('intercept:', reg.intercept_)
print('score:', reg.score(ages_test, net_worths_test))

# In[55]:

plt.scatter(ages_train, net_worths_train, color='blue')
plt.scatter(ages_test, net_worths_test, color='red')
plt.plot(ages_train, reg.predict(ages_train), color='green')
plt.xlabel('Ages')
plt.ylabel('Net Worth')
plt.show()

# ## Removing the most extreme outlier points:

# In[56]:

cleaned_data = []
try:
    predictions = reg.predict(ages_train)
    cleaned_data = outlierCleaner(predictions, ages_train, net_worths_train)
except NameError:
    print("your regression object doesn't exist, or isn't named reg")
    print("can't make predictions to use in identifying outliers")

# In[57]:

if len(cleaned_data) > 0:
    ages, net_worths, errors = zip(*cleaned_data)
    ages = np.reshape(np.array(ages), (len(ages), 1))
    net_worths = np.reshape(np.array(net_worths), (len(net_worths), 1))

# In[58]:

if len(cleaned_data) > 0:  # guard repeated so this cell is valid on its own
    try:
        reg.fit(ages, net_worths)
        plt.plot(ages, reg.predict(ages), color="blue")
    except NameError:
        print("you don't seem to have regression imported/created")
        print("or else your regression object isn't named reg")
        print("either way, only draw the scatter plot of the cleaned data")
    plt.scatter(ages, net_worths)
    plt.xlabel("ages")
    plt.ylabel("net worths")
    plt.show()

    print('coef after cleaning:', reg.coef_)
    print('intercept after cleaning:', reg.intercept_)
    print('score after cleaning:', reg.score(ages_test, net_worths_test))

else:
    print("outlierCleaner() is returning an empty list, no refitting to be done")
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# Fraud-Detection
The Enron fraud is the largest case of corporate fraud in American history. Founded in 1985, Enron Corporation went bankrupt by the end of 2001 due to widespread corporate fraud and corruption. Before its fall, Fortune magazine had named Enron “America’s most innovative company” for six consecutive years.

**Dataset**: https://www.cs.cmu.edu/~./enron/
## Goal of the Project:
The goal of the project is to go through the thought process of data exploration (learning, cleaning, and preparing the data);
feature selection/engineering (selecting the features that influence the target the most,
creating new features that explain the target better than the existing ones do, and
reducing the dimensionality of the data using principal component analysis (PCA));
and picking and tuning a supervised machine learning algorithm, then validating it to get an accurate person-of-interest identifier model.

## Data Exploration
The features in the data fall into three major types, namely:
- financial features,
- email features, and
- POI labels.

There are 146 samples with 20 features and a binary classification target ("poi").
Among the 146 samples, there are
- 18 POI and
- 128 non-POI.
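As a quick sanity check on those counts, a minimal sketch like the following tallies the POI labels straight from the data dictionary (the filename is illustrative; point it at your local copy of the pickled dataset):

```python
import pickle

# Illustrative path -- use your local copy of the project dataset.
with open("final_project_dataset.pkl", "rb") as f:
    data_dict = pickle.load(f)

n_poi = sum(1 for person in data_dict.values() if person["poi"])
print("total samples:", len(data_dict))                      # expected: 146
print("POI:", n_poi, "| non-POI:", len(data_dict) - n_poi)   # expected: 18 | 128
```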
## Optimize Feature Selection/Engineering
While working on the project, I played with different features and models. One strategy was to standardize the features,
apply principal component analysis, and use a GaussianNB classifier; another was to use a decision tree classifier, including choosing the
features with the feature-importances attribute and tuning the model.

### Create new features
For both strategies I tried to create new features as fractions of almost all the financial variables (e.g. a fractional bonus:
the ratio of bonus to total_payments, etc.). The logic behind the email feature creation was to look at the fraction of emails sent to POIs
out of all sent emails, and the fraction of emails received from POIs out of all received emails.
I ended up using one new feature, fraction_to_POI.
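A minimal sketch of that email feature, assuming each record carries `from_this_person_to_poi` and `from_messages` counts and that missing values appear as the string `'NaN'` (as in the course version of this dataset):

```python
def compute_fraction(numerator, denominator):
    # Treat the dataset's 'NaN' placeholder strings (and empty denominators) as 0.
    if numerator == "NaN" or denominator in ("NaN", 0):
        return 0.0
    return float(numerator) / float(denominator)

for person in data_dict.values():
    # fraction of this person's sent emails that were addressed to a POI
    person["fraction_to_POI"] = compute_fraction(
        person.get("from_this_person_to_poi", "NaN"),
        person.get("from_messages", "NaN"),
    )
```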
## Pick and Tune an Algorithm:
I played with 7 machine learning algorithms:
- Naive Bayes (GaussianNB)
- SVC
- RandomForestClassifier
- ExtraTreesClassifier
- AdaBoostClassifier
- LogisticRegression
- LinearSVC

### Comparing Classifiers based on cross-validation scores:
- 1st tier: SVC, RandomForestClassifier
- 2nd tier: GaussianNB, ExtraTreesClassifier, AdaBoostClassifier
- 3rd tier: LogisticRegression, LinearSVC

### Tuning the algorithm:
The bias-variance tradeoff is one of the key dilemmas in machine learning. High-bias algorithms have no capacity to learn; high-variance algorithms
react poorly to data they have not seen before. A predictive model should be tuned to achieve a compromise between the two. The process of changing an algorithm's parameters is called
algorithm tuning, and it lets us find the golden mean and the best result. If I don't tune the algorithm well, I don't get the best result I could.
An algorithm can be tuned manually, by iteratively changing the parameters and tracking the results, or automatically with GridSearchCV.
I tuned the parameters of my decision tree classifier sequentially, parameter by parameter, and got the best F1 score using those parameters.
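A minimal GridSearchCV sketch for the decision tree, scored on F1 as in the rest of this project; the parameter grid here is illustrative, not the exact grid I searched:

```python
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Illustrative grid -- not the exact values tuned in the project.
param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [None, 2, 4, 8],
    "min_samples_split": [2, 5, 10, 20],
}

search = GridSearchCV(DecisionTreeClassifier(random_state=42),
                      param_grid, scoring="f1", cv=5)
search.fit(features_train, labels_train)
print(search.best_params_, search.best_score_)
```

GridSearchCV simply automates the manual loop described above: it fits one model per parameter combination and keeps the one with the best cross-validated score.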
## Validate and Evaluate
### Usage of Evaluation Metrics
In the project I used the F1 score as the key measure of the algorithms' accuracy. It considers both the precision and the recall of the test to compute the score:
F1 = 2 * (precision * recall) / (precision + recall).
Precision is the ability of the classifier not to label as positive a sample that is negative.
Recall is the ability of the classifier to find all the positive samples.
The F1 score can be interpreted as a weighted average of precision and recall, where an F1 score reaches its best value at 1 and its worst at 0.

### Validation Strategy
Validation is the process of evaluating model performance. A classic mistake is to train the model on too small a data set, or to validate the model on the same data it was trained on.
There are a number of strategies for validating a model. One is to split the available data into train and test sets; another is cross-validation: split the data into k folds of equal size, run a learning experiment with each fold held out as a test set in turn, and average the test results.
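A minimal sketch of that k-fold procedure, assuming `features` and `labels` built with featureFormat/targetFeatureSplit as in the scripts below; StratifiedKFold is my choice here because the POI classes are heavily imbalanced, and may differ from the exact splitter used in the project:

```python
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Stratified folds keep the rare POI class represented in every fold.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(DecisionTreeClassifier(random_state=42),
                         features, labels, scoring="f1", cv=cv)
print("F1 per fold:", scores)
print("mean F1:", scores.mean())
```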
## Conclusions:
Before starting this project I was completely sure that building a machine learning model was about picking the right algorithm
out of a black box, plus some magic. Working on the person-of-interest identifier, I recursively went through the process
of data exploration, outlier detection, and algorithm tuning, and spent most of the time on data preparation. The model's performance rose
significantly after missing-value imputation, extra feature creation, and feature selection, and less after algorithm tuning, which showed me
once again how important it is to feed the model good data.
This experience can be applied to other fraud detection tasks. I think the model could be improved further by
using and tuning alternative algorithms like Random Forest.

## Limitations of the study:
It’s important to identify and acknowledge the limitations of the study. My conclusions are based only on the provided
data set, which represents just 146 persons. To get at the real causation, I would have to gather all the financial and email information
about everyone at Enron, which is most probably not possible. Missing email values were imputed with the median, so the modes of the distributions
of the email features are shifted to the medians. The algorithms were tuned sequentially (I changed one parameter to achieve better performance
and then switched to another parameter; there is a chance that other parameter combinations might give better model accuracy).

## References:
- Enron data set: https://www.cs.cmu.edu/~./enron/
- FindLaw financial data: http://www.findlaw.com
- Visualization of POI: http://www.nytimes.com/packages/html/national/20061023_ENRON_TABLE/index.html
- Enron on Wikipedia: https://en.wikipedia.org/wiki/Enron
- F1 score on Wikipedia: https://en.wikipedia.org/wiki/F1_score

--------------------------------------------------------------------------------
/Regression.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[10]:

import sys
import pickle
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

dictionary = pickle.load(open("C:/Users/Geekquad/ud120-projects/final_project/final_project_dataset_modified_unix.pkl", "rb"))

# In[11]:

features_list = ["bonus", "salary"]
data = featureFormat(dictionary, features_list, remove_any_zeroes=True)
target, features = targetFeatureSplit(data)

# In[12]:

from sklearn.model_selection import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"  # red test points so they are distinguishable from the blue training points

# In[15]:

from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(feature_train, target_train)
print('coef', reg.coef_)
print('intercept', reg.intercept_)

# In[25]:

import matplotlib.pyplot as plt
for feature, target in zip(feature_test, target_test):
    plt.scatter(feature, target, color=test_color)

for feature, target in zip(feature_train, target_train):
    plt.scatter(feature, target, color=train_color)

# re-plot one representative point from each split so the legend has labels
plt.scatter(feature_test[0], target_test[0], color=test_color, label='test')
plt.scatter(feature_train[0], target_train[0], color=train_color, label='train')
plt.plot(feature_test, reg.predict(feature_test))
plt.xlabel(features_list[1])
plt.ylabel(features_list[0])
plt.legend()
plt.show()

--------------------------------------------------------------------------------
/enron61702insiderpay.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekquad/Fraud-Detection/4806f6dc8f01cc285f90bf70d4ba1e14603b7ec5/enron61702insiderpay.pdf

--------------------------------------------------------------------------------
/evaluate_poi_identifier.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[1]:

import pickle
import sys
from feature_format import featureFormat, targetFeatureSplit

# In[2]:

data_dict = pickle.load(open("C:/Users/Geekquad/ud120-projects/final_project/final_project_dataset_modified_unix.pkl", "rb"))

# In[3]:

features_list = ['poi', 'salary']
data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)

# In[10]:

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# In[6]:

features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.3, random_state=42)

# In[7]:

clf = DecisionTreeClassifier()
clf.fit(features_train, labels_train)
print("Accuracy:", clf.score(features_test, labels_test))
print(clf.predict(features_test))

# In[12]:

print('np.array(labels_test):')
print(np.array(labels_test))

# In[16]:

print('POI predictions:', clf.predict(features_test))
print('Number of true POIs in the test set:', len([e for e in labels_test if e == 1.0]))

# In[17]:

print("Number of test samples:", len(labels_test))

# In[21]:

from sklearn.metrics import precision_score, recall_score
print("precision:", precision_score(labels_test, clf.predict(features_test)))
print("recall:", recall_score(labels_test, clf.predict(features_test)))

--------------------------------------------------------------------------------
/img/feature.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekquad/Fraud-Detection/4806f6dc8f01cc285f90bf70d4ba1e14603b7ec5/img/feature.png

--------------------------------------------------------------------------------
/img/k_means.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekquad/Fraud-Detection/4806f6dc8f01cc285f90bf70d4ba1e14603b7ec5/img/k_means.png

--------------------------------------------------------------------------------
/img/outliers.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekquad/Fraud-Detection/4806f6dc8f01cc285f90bf70d4ba1e14603b7ec5/img/outliers.png

--------------------------------------------------------------------------------
/img/regression.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekquad/Fraud-Detection/4806f6dc8f01cc285f90bf70d4ba1e14603b7ec5/img/regression.png

--------------------------------------------------------------------------------
/validate_poi.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[1]:

import pickle
import sys
from feature_format import featureFormat, targetFeatureSplit

# In[2]:

data_dict = pickle.load(open("C:/Users/Geekquad/ud120-projects/final_project/final_project_dataset_modified_unix.pkl", "rb"))

# In[3]:

features_list = ["poi", "salary"]

# In[5]:

data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)

# In[7]:

from sklearn.model_selection import train_test_split
from sklearn import svm

# In[16]:

features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.3, random_state=42)

# In[17]:

from sklearn.model_selection import GridSearchCV

# search a small grid of SVM kernels and regularization strengths
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
svr = svm.SVC()
clf = GridSearchCV(svr, parameters)
clf.fit(features_train, labels_train)
print(clf.score(features_test, labels_test))

--------------------------------------------------------------------------------