├── Feature Selection.py
├── Fraud Detection from ENRON dataset.ipynb
├── K means.py
├── Outliers Removal.py
├── README.md
├── Regression.py
├── enron61702insiderpay.pdf
├── evaluate_poi_identifier.py
├── img
│   ├── feature.png
│   ├── k_means.png
│   ├── outliers.png
│   └── regression.png
└── validate_poi.py
/Feature Selection.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[83]:
5 |
6 |
7 | import pickle
8 | import numpy
9 | numpy.random.seed(42)
10 |
11 |
12 | # In[84]:
13 |
14 |
15 | word_file = "C:/Users/Geekquad/ud120-projects/feature_selection/word_data_modified_unix.pkl"
16 | author_file = "C:/Users/Geekquad/ud120-projects/feature_selection/email_authors_modified_unix.pkl"
17 | word_data = pickle.load(open(word_file, "rb"))
18 | author_data = pickle.load(open(author_file, "rb"))
19 |
20 |
21 | # In[85]:
22 |
23 |
 24 | # cross_validation was removed in newer scikit-learn; use model_selection instead
 25 | from sklearn.model_selection import train_test_split
26 | features_train, features_test, labels_train, labels_test = train_test_split(word_data, author_data, test_size=0.1, random_state=42)
27 |
28 |
29 | # In[86]:
30 |
31 |
32 | from sklearn.feature_extraction.text import TfidfVectorizer
 33 | vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")
34 | features_train = vectorizer.fit_transform(features_train)
35 | features_test = vectorizer.transform(features_test).toarray()
36 |
37 |
38 | # In[87]:
39 |
40 |
41 | #### training only on 150 data points to put myself into overfit regime
42 | features_train = features_train[:150].toarray()
43 | labels_train = labels_train[:150]
44 |
45 |
46 | # In[88]:
47 |
48 |
49 | print('number of training points: ', len(features_train))
50 |
51 |
52 | # In[89]:
53 |
54 |
55 | """overfitting the Decision Tree and cehcking the accuracy"""
56 | from sklearn.tree import DecisionTreeClassifier
57 | from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
58 |
59 | clf = DecisionTreeClassifier()
60 | clf.fit(features_train, labels_train)
61 | y_pred = clf.predict(features_test)
62 |
63 |
64 | print(confusion_matrix(labels_test, y_pred))
65 | print(classification_report(labels_test, y_pred))
66 | print(accuracy_score(labels_test, y_pred))
67 |
68 |
69 | # Yes, it has an accuracy much higher than it should be.
70 | # Hence, finding the most important features.
71 |
72 | # In[90]:
73 |
74 |
75 | # identifying the most important features:
76 | import numpy as np
77 | importances = clf.feature_importances_
78 | indices = np.argsort(importances)[::-1]
79 | print("Feature Ranking")
80 | for i in range(10):
 81 |     print("{} feature no. {} ({})".format(i + 1, indices[i], importances[indices[i]]))
82 |
83 |
84 | # In[91]:
85 |
86 |
 87 | print(vectorizer.get_feature_names()[2802])  # "vect" was undefined; use the fitted vectorizer
88 |
89 |
90 | # This word seems like an outlier in a certain sense, so let’s remove it and refit.
91 |
92 | # In[99]:
93 |
94 |
95 | word_file = "C:/Users/Geekquad/ud120-projects/feature_selection/word_data_overfit_modified_unix.pkl"
96 | author_file = "C:/Users/Geekquad/ud120-projects/feature_selection/email_authors_overfit_modified_unix.pkl"
97 | word_data = pickle.load(open(word_file, "rb"))
98 | author_data = pickle.load(open(author_file, "rb"))
99 |
100 |
101 | # In[100]:
102 |
103 |
 104 | # cross_validation was removed in newer scikit-learn; use model_selection instead
 105 | from sklearn.model_selection import train_test_split
106 | features_train, features_test, labels_train, labels_test = train_test_split(word_data, author_data, test_size=0.1, random_state=42)
107 |
108 |
109 | # In[101]:
110 |
111 |
112 | from sklearn.feature_extraction.text import TfidfVectorizer
 113 | vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")
114 | features_train = vectorizer.fit_transform(features_train)
115 | features_test = vectorizer.transform(features_test).toarray()
116 |
117 |
118 | # In[102]:
119 |
120 |
121 | #### training only on 150 data points to put myself into overfit regime
122 | features_train = features_train[:150].toarray()
123 | labels_train = labels_train[:150]
124 |
125 |
126 | # In[103]:
127 |
128 |
129 | """overfitting the Decision Tree and cehcking the accuracy"""
130 | from sklearn.tree import DecisionTreeClassifier
131 | from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
132 |
133 | clf = DecisionTreeClassifier()
134 | clf.fit(features_train, labels_train)
135 | y_pred = clf.predict(features_test)
136 |
137 |
138 | print(confusion_matrix(labels_test, y_pred))
139 | print(classification_report(labels_test, y_pred))
140 | print(accuracy_score(labels_test, y_pred))
141 |
142 |
143 | # In[104]:
144 |
145 |
146 | # identifying the most important features:
147 | import numpy as np
148 | importances = clf.feature_importances_
149 | indices = np.argsort(importances)[::-1]
150 | print("Feature Ranking")
151 | for i in range(10):
 152 |     print("{} feature no. {} ({})".format(i + 1, indices[i], importances[indices[i]]))
153 |
154 |
155 | # In[105]:
156 |
157 |
 158 | print(vectorizer.get_feature_names()[33604])
159 |
160 |
--------------------------------------------------------------------------------
/K means.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[22]:
5 |
6 |
7 | import pickle
8 | import numpy
9 | import sys
10 | import matplotlib.pyplot as plt
 11 | sys.path.append("../tools/")  # course tools directory, as in the other scripts
12 | from feature_format import featureFormat, targetFeatureSplit
13 |
14 |
15 | # In[23]:
16 |
17 |
18 | def Draw(pred, features, poi, mark_poi=False, name='image.png', f1_name='feature 1', f2_name ='feature 2'):
19 | colors = ["b", "c", "k", "m", "g"]
20 | for ii, pp in enumerate(pred):
21 | plt.scatter(features[ii][0], features[ii][1], color= colors[pred[ii]])
22 |
23 | if mark_poi:
24 | for ii, pp in enumerate(pred):
25 | if poi[ii]:
26 | plt.scatter(features[ii][0], features[ii][1], color="r", marker="*")
27 |
28 | plt.xlabel(f1_name)
29 | plt.ylabel(f2_name)
30 | plt.savefig(name)
31 | plt.show()
32 |
33 |
34 | # In[24]:
35 |
36 |
37 | data_dict = pickle.load(open("C:/Users/Geekquad/final_project_dataset_modified_unix.pkl", "rb"))
38 | data_dict.pop("TOTAL", 0)
39 |
40 |
41 | # In[25]:
42 |
43 |
44 | feature_1 = 'salary'
45 | feature_2 = "exercised_stock_options"
46 | poi = "poi"
47 | features_list = [poi, feature_1, feature_2]
48 | data = featureFormat(data_dict, features_list)
49 | poi, finance_features = targetFeatureSplit(data)
50 |
51 |
52 | # In[26]:
53 |
54 |
55 | for f1, f2 in finance_features:
 56 |     plt.scatter(f1, f2)
57 | plt.show()
58 |
59 |
60 | # ## Clustering using K-means:
61 |
62 | # In[27]:
63 |
64 |
65 | from sklearn.cluster import KMeans
66 | features_list = ["poi", feature_1, feature_2]
67 | data2 = featureFormat(data_dict, features_list)
68 | poi, finance_features = targetFeatureSplit(data2)
69 | clf = KMeans(n_clusters=2)
70 | pred = clf.fit_predict(finance_features)
71 | Draw(pred, finance_features, poi, name="clusters_before_scaling.pdf", f1_name = feature_1, f2_name=feature_2)
72 |
73 |
--------------------------------------------------------------------------------
/Outliers Removal.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[51]:
5 |
6 |
7 | import random
8 | import numpy as np
9 | import pickle
10 | import matplotlib.pyplot as plt
11 | from outlier_cleaner import outlierCleaner
12 |
13 |
14 | # In[52]:
15 |
16 |
17 | ages = pickle.load( open("practice_outliers_ages_modified_unix.pkl", "rb") )
18 | net_worths = pickle.load( open("practice_outliers_net_worths_modified_unix.pkl", "rb") )
19 |
20 |
21 | # In[53]:
22 |
23 |
24 | ages = np.reshape(np.array(ages), (len(ages), 1))
 25 | net_worths = np.reshape(np.array(net_worths), (len(net_worths), 1))  # was misspelled "new_worths"
26 |
 27 | from sklearn.model_selection import train_test_split
28 | ages_train, ages_test, net_worths_train, net_worths_test = train_test_split(ages, net_worths, test_size=0.1, random_state = 42)
29 |
30 |
31 | # In[54]:
32 |
33 |
34 | from sklearn.linear_model import LinearRegression
35 | reg = LinearRegression()
36 | reg.fit(ages_train, net_worths_train)
37 | reg.predict(ages_test)
38 | print('coef:', reg.coef_)
39 | print('intercept:', reg.intercept_)
40 | print('scores:', reg.score(ages_test, net_worths_test))
41 |
42 |
43 | # In[55]:
44 |
45 |
46 | plt.scatter(ages_train, net_worths_train, color='blue')
47 | plt.scatter(ages_test, net_worths_test, color='red')
48 | plt.plot(ages_train, reg.predict(ages_train), color='green')
49 | plt.xlabel('Ages')
50 | plt.ylabel('Net Worth')
51 | plt.show()
52 |
53 |
54 | # ## Removing the most outlier points:
55 |
56 | # In[56]:
57 |
58 |
59 | cleaned_data = []
60 | try:
61 | predictions = reg.predict(ages_train)
62 | cleaned_data = outlierCleaner( predictions, ages_train, net_worths_train )
63 | except NameError:
 64 |     print("your regression object doesn't exist, or isn't named reg")
65 | print("can't make predictions to use in identifying outliers")
66 |
67 |
68 | # In[57]:
69 |
70 |
71 | if len(cleaned_data) > 0:
72 | ages, net_worths, errors = zip(*cleaned_data)
 73 |     ages = np.reshape(np.array(ages), (len(ages), 1))
 74 |     net_worths = np.reshape(np.array(net_worths), (len(net_worths), 1))
75 |
76 |
77 | # In[58]:
78 |
79 |
80 | try:
81 | reg.fit(ages, net_worths)
82 | plt.plot(ages, reg.predict(ages), color="blue")
83 | except NameError:
84 | print("you don't seem to have regression imported/created")
85 | print("or else your regression object isn't named reg")
86 | print("either way, only draw the scatter plot of the cleaned data")
87 | plt.scatter(ages, net_worths)
88 | plt.xlabel("ages")
89 | plt.ylabel("net worths")
90 | plt.show()
91 |
92 | print('coef-after cleaned:', reg.coef_)
93 | print('intercept-after cleaned:', reg.intercept_)
94 | print('score-after cleaned:', reg.score(ages_test, net_worths_test))
95 |
96 | else:
97 | print("outlierCleaner() is returning an empty list, no refitting to be done")
98 |
99 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Fraud-Detection
 2 | The Enron fraud is the largest case of corporate fraud in American history. Founded in 1985, Enron Corporation went bankrupt by the end of 2001 due to widespread corporate fraud and corruption. Before its fall, Fortune magazine had named Enron “America’s most innovative company” for six consecutive years.
3 |
4 | **Dataset**: https://www.cs.cmu.edu/~./enron/
5 |
6 |
 7 | ## Goal of the Project:
 8 | The goal of the project is to go through the thought process of data exploration (learning, cleaning, and preparing the data);
 9 | feature selection/engineering (selecting the features that influence the target most,
 10 | creating new features that explain the target better than the existing ones, and
 11 | reducing the dimensionality of the data using principal component analysis (PCA));
 12 | picking and tuning one of the supervised machine learning algorithms; and validating it to get an accurate person of interest identifier model.
13 |
14 | ## Data Exploration
15 | The features in the data fall into three major types, namely
16 | - financial features,
17 | - email features
18 | - POI labels.
19 |
 20 | There are 146 samples with 20 features and a binary classification label ("poi").
 21 | Among the 146 samples, there are
 22 | - 18 POI and
 23 | - 128 non-POI.
24 |
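These counts can be reproduced straight from the project pickle. A minimal sketch, assuming the `final_project_dataset.pkl` file from the ud120 course repo is in the working directory:

```python
import pickle

# The dataset is a dict keyed by person name; each value is a dict of
# financial/email features plus the boolean "poi" label.
with open("final_project_dataset.pkl", "rb") as f:
    data_dict = pickle.load(f)

n_poi = sum(1 for person in data_dict.values() if person["poi"])
print("samples:", len(data_dict))          # 146
print("POI:", n_poi)                       # 18
print("non-POI:", len(data_dict) - n_poi)  # 128
```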
25 |
26 |
27 | ## Optimize Feature Selection/Engineering
 28 | While working on the project, I've played with different features and models. One strategy was to standardize the features,
 29 | apply principal component analysis, and use a GaussianNB classifier; another strategy was to use a decision tree classifier, including choosing
 30 | features via its feature_importances_ attribute and tuning the model.
31 |
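A minimal sketch of the first strategy as a scikit-learn Pipeline. It assumes `features` and `labels` have already been produced with featureFormat/targetFeatureSplit as in the scripts in this repo, and the number of PCA components is an illustrative choice, not the one tuned in the project:

```python
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB

features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.3, random_state=42)

# Standardize -> reduce dimensionality with PCA -> classify, as one estimator.
pipe = Pipeline([
    ("scale", StandardScaler()),
    ("pca", PCA(n_components=5)),  # illustrative
    ("clf", GaussianNB()),
])
pipe.fit(features_train, labels_train)
print(pipe.score(features_test, labels_test))
```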
32 |
33 |
34 | ### Create new features
 35 | For both strategies I tried creating new features as fractions of almost all financial variables (e.g., a fractional bonus
 36 | as the ratio of bonus to total_payments, etc.). The logic behind the email feature creation was to check the fraction of emails sent to POIs
 37 | among all sent emails, and of emails received from POIs among all received emails.
 38 | I ended up using one new feature, fraction_to_POI, computed as sketched below.
39 |
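A minimal sketch of how fraction_to_POI can be computed per person, assuming `data_dict` is loaded as in the sketch above (the field names are the standard ud120 dataset keys, where the string "NaN" marks missing values):

```python
def compute_fraction(numerator, denominator):
    """Return numerator/denominator, or 0.0 if either value is missing."""
    if numerator == "NaN" or denominator in ("NaN", 0):
        return 0.0
    return float(numerator) / denominator

for person in data_dict.values():
    person["fraction_to_poi"] = compute_fraction(
        person["from_this_person_to_poi"],  # emails sent by this person to POIs
        person["from_messages"],            # all emails sent by this person
    )
```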
40 |
41 | ## Pick and Tune an Algorithm:
42 | I've played with 7 machine learning algorithms:
43 | - Naive Bayes (GaussianNB)
44 | - SVC
45 | - RandomForestClassifier
46 | - ExtraTreesClassifier
47 | - AdaBoostClassifier
48 | - LogisticRegression
 49 | - LinearSVC
50 |
51 | ### Comparing Classifiers based on cross-validation scores:
52 | - 1st tier: SVC, RandomForestClassifier
53 | - 2nd tier: GaussianNB, ExtraTreesClassifier, AdaBoostClassifier
54 | - 3rd tier: Logistic Regression, LinearSVC
55 |
56 | ### Tuning the algorithm:
 57 | The bias-variance tradeoff is one of the key dilemmas in machine learning. High-bias algorithms have little capacity to learn, while high-variance algorithms
 58 | react poorly to data they haven't seen before. A predictive model should be tuned to achieve a compromise. The process of changing the parameters of an algorithm is
 59 | called algorithm tuning, and it lets us find the golden mean and the best result. If I don't tune the algorithm well, I don't get the best result I could.
 60 | An algorithm can be tuned manually, by iteratively changing the parameters and tracking the results, or with GridSearchCV, which does this automatically.
 61 | I've tuned the parameters of my decision tree classifier sequentially, parameter by parameter, and got the best F1 score with the resulting parameters.
62 |
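A minimal GridSearchCV sketch for the decision tree (the parameter grid here is illustrative, not the exact grid from the project; `features_train`/`labels_train` as produced by a train/test split, e.g. in evaluate_poi_identifier.py):

```python
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [None, 3, 5, 10],
    "min_samples_split": [2, 5, 10, 20],
}
# Score on F1, the key metric used in this project.
search = GridSearchCV(DecisionTreeClassifier(random_state=42),
                      param_grid, scoring="f1", cv=5)
search.fit(features_train, labels_train)
print(search.best_params_)
print(search.best_score_)
```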
63 |
64 | ## Validate and Evaluate
65 | ### Usage of Evaluation Metrics
 66 | In the project I've used the F1 score as the key measure of the algorithms' accuracy. It considers both the precision and the recall of the test to compute the score.
 67 | Precision is the ability of the classifier not to label as positive a sample that is negative.
 68 | Recall is the ability of the classifier to find all the positive samples.
 69 | The F1 score can be interpreted as a weighted average of precision and recall, reaching its best value at 1 and its worst at 0.
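
Concretely, the F1 score is the harmonic mean of the two:

    F1 = 2 * (precision * recall) / (precision + recall)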
70 |
71 | ### Validation Strategy
 72 | Validation is the process of evaluating model performance. A classic mistake is to use a data set that is too small for model training, or to validate the model on the same data set it was trained on.
 73 | There are a number of strategies to validate a model. One is to split the available data into train and test sets; another is to perform cross-validation: split the data into k bins of equal size, run learning experiments on them, repeat this operation a number of times, and take the average test result.
74 |
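A minimal cross-validation sketch; stratified folds are a sensible choice here because POIs are rare. It assumes `features` and `labels` as prepared in the scripts above, and the classifier and fold count are illustrative:

```python
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Stratification keeps the POI/non-POI ratio roughly constant in every fold.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(DecisionTreeClassifier(random_state=42),
                         features, labels, scoring="f1", cv=cv)
print("F1 per fold:", scores)
print("mean F1:", scores.mean())
```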
75 |
76 | ## Conclusions:
 77 | Before starting this project I was completely sure that building a machine learning model is about choosing the right algorithm
 78 | from a black box, plus some magic. Working on the person of interest identifier, I've been recursively going through the process
 79 | of data exploration, outlier detection, and algorithm tuning, and spent most of the time on data preparation. The model performance rose
 80 | significantly after missing-value imputation, extra feature creation, and feature selection, and less after algorithm tuning, which showed me
 81 | once again how important it is to fit the model with good data.
 82 | This experience can be applied to other fraud detection tasks. I think there is room to improve the model by
 83 | using and tuning alternative algorithms like Random Forest.
84 |
85 | ## Limitations of the study:
 86 | It’s important to identify and acknowledge the limitations of the study. My conclusions are based only on the provided
 87 | data set, which represents just 143 persons after cleaning. To establish real causation, I would need to gather all financial and email information
 88 | about all Enron employees, which is most probably not possible. Missing email values were imputed with the median, so the modes of the distributions
 89 | of the email features are shifted to the medians. The algorithms were tuned sequentially (I changed one parameter to achieve better performance
 90 | and then switched to another parameter; there is a chance that other parameter combinations might give a more accurate model).
91 |
92 | ## References:
93 | - Enron data set: https://www.cs.cmu.edu/~./enron/
94 | - FindLaw financial data: http://www.findlaw.com
95 | - Visualization of POI: http://www.nytimes.com/packages/html/national/20061023_ENRON_TABLE/index.html
96 | - Enron on Wikipedia: https://en.wikipedia.org/wiki/Enron
97 | - F1 score on Wikipedia: https://en.wikipedia.org/wiki/F1_score
98 |
99 |
100 |
--------------------------------------------------------------------------------
/Regression.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[10]:
5 |
6 |
 7 | import sys
 8 | import pickle
 9 | sys.path.append("../tools/")
 10 | from feature_format import featureFormat, targetFeatureSplit
 11 |
12 |
13 | dictionary = pickle.load(open("C:/Users/Geekquad/ud120-projects/final_project/final_project_dataset_modified_unix.pkl", "rb"))
14 |
15 |
16 | # In[11]:
17 |
18 |
19 | features_list = ["bonus", "salary"]
20 | data = featureFormat( dictionary, features_list, remove_any_zeroes=True)
21 | target, features = targetFeatureSplit( data )
22 |
23 |
24 | # In[12]:
25 |
26 |
 27 | from sklearn.model_selection import train_test_split
 28 | feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
 29 | train_color = "b"
 30 | test_color = "r"  # distinct color so test points are visible against the training points
31 |
32 |
33 | # In[15]:
34 |
35 |
36 | from sklearn.linear_model import LinearRegression
37 | reg = LinearRegression()
38 | reg.fit(feature_train, target_train)
39 | print('coef', reg.coef_)
40 | print('intercept', reg.intercept_)
41 |
42 |
43 | # In[25]:
44 |
45 |
46 | import matplotlib.pyplot as plt
47 | for feature, target in zip(feature_test, target_test):
48 | plt.scatter(feature, target, color = test_color)
49 |
50 | for feature, target in zip(feature_train, target_train):
51 | plt.scatter(feature, target, color= train_color)
52 |
53 | plt.scatter(feature_test[0], target_test[0], color=test_color, label='test')
 54 | plt.scatter(feature_train[0], target_train[0], color=train_color, label='train')
55 | plt.plot(feature_test, reg.predict(feature_test))
56 | plt.xlabel(features_list[1])
57 | plt.ylabel(features_list[0])
58 | plt.legend()
59 | plt.show()
60 |
61 |
--------------------------------------------------------------------------------
/enron61702insiderpay.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekquad/Fraud-Detection/4806f6dc8f01cc285f90bf70d4ba1e14603b7ec5/enron61702insiderpay.pdf
--------------------------------------------------------------------------------
/evaluate_poi_identifier.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[1]:
5 |
6 |
7 | import pickle
8 | import sys
9 | from feature_format import featureFormat, targetFeatureSplit
10 |
11 |
12 | # In[2]:
13 |
14 |
15 | data_dict = pickle.load(open("C:/Users/Geekquad/ud120-projects/final_project/final_project_dataset_modified_unix.pkl", "rb"))
16 |
17 |
18 | # In[3]:
19 |
20 |
21 | features_list = ['poi', 'salary']
22 | data = featureFormat(data_dict, features_list)
23 | labels, features = targetFeatureSplit(data)
24 |
25 |
26 | # In[10]:
27 |
28 |
29 | import sklearn
30 | import numpy as np
 31 | from sklearn.model_selection import train_test_split
32 | from sklearn.tree import DecisionTreeClassifier
33 | from sklearn import svm
34 |
35 |
36 | # In[6]:
37 |
38 |
39 | features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state=42)
40 |
41 |
42 | # In[7]:
43 |
44 |
45 | clf = DecisionTreeClassifier()
46 | clf.fit(features_train, labels_train)
47 | print("Accuracy:", clf.score(features_test, labels_test))
48 | print(clf.predict(features_test))
49 |
50 |
51 | # In[12]:
52 |
53 |
54 | print('np.array(labels_test):')
55 | print(np.array(labels_test))
56 |
57 |
58 | # In[16]:
59 |
60 |
 61 | print('POIs predicted:', clf.predict(features_test))
 62 | print('Number of true POIs in test set:', len([e for e in labels_test if e == 1.0]))
63 |
64 |
65 | # In[17]:
66 |
67 |
68 | print("Number of tests:", len(labels_test))
69 |
70 |
71 | # In[21]:
72 |
73 |
 74 | from sklearn.metrics import precision_score, recall_score
75 | print("precision:", precision_score(labels_test, clf.predict(features_test)))
76 | print("recall:", recall_score(labels_test, clf.predict(features_test)))
77 |
78 |
--------------------------------------------------------------------------------
/img/feature.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekquad/Fraud-Detection/4806f6dc8f01cc285f90bf70d4ba1e14603b7ec5/img/feature.png
--------------------------------------------------------------------------------
/img/k_means.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekquad/Fraud-Detection/4806f6dc8f01cc285f90bf70d4ba1e14603b7ec5/img/k_means.png
--------------------------------------------------------------------------------
/img/outliers.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekquad/Fraud-Detection/4806f6dc8f01cc285f90bf70d4ba1e14603b7ec5/img/outliers.png
--------------------------------------------------------------------------------
/img/regression.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekquad/Fraud-Detection/4806f6dc8f01cc285f90bf70d4ba1e14603b7ec5/img/regression.png
--------------------------------------------------------------------------------
/validate_poi.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[1]:
5 |
6 |
7 | import pickle
8 | import sys
9 | from feature_format import featureFormat, targetFeatureSplit
10 |
11 |
12 | # In[2]:
13 |
14 |
15 | data_dict = pickle.load(open("C:/Users/Geekquad/ud120-projects/final_project/final_project_dataset_modified_unix.pkl", "rb"))
16 |
17 |
18 | # In[3]:
19 |
20 |
21 | features_list = ["poi", "salary"]
22 |
23 |
24 | # In[5]:
25 |
26 |
27 | data = featureFormat(data_dict, features_list)
28 | labels, features = targetFeatureSplit(data)
29 |
30 |
31 | # In[7]:
32 |
33 |
34 | import sklearn
 35 | from sklearn.model_selection import train_test_split
36 | from sklearn import svm
37 |
38 |
39 | # In[16]:
40 |
41 |
42 | features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state=42)
43 |
44 |
45 | # In[17]:
46 |
47 |
48 | from sklearn.model_selection import GridSearchCV
49 |
50 | parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
51 | svr = svm.SVC()
52 | clf = GridSearchCV(svr, parameters)
53 | clf.fit(features_train,labels_train)
54 | print(clf.score(features_test, labels_test))
55 |
56 |
--------------------------------------------------------------------------------