├── Feature Selection.py
├── Fraud Detection from ENRON dataset.ipynb
├── K means.py
├── Outliers Removal.py
├── README.md
├── Regression.py
├── enron61702insiderpay.pdf
├── evaluate_poi_identifier.py
├── img
│   ├── feature.png
│   ├── k_means.png
│   ├── outliers.png
│   └── regression.png
└── validate_poi.py

--------------------------------------------------------------------------------
/Feature Selection.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[83]:

import pickle
import numpy
numpy.random.seed(42)

# In[84]:

word_file = "C:/Users/Geekquad/ud120-projects/feature_selection/word_data_modified_unix.pkl"
author_file = "C:/Users/Geekquad/ud120-projects/feature_selection/email_authors_modified_unix.pkl"
word_data = pickle.load(open(word_file, "rb"))
author_data = pickle.load(open(author_file, "rb"))

# In[85]:

# sklearn.cross_validation was removed in scikit-learn 0.20; model_selection is its replacement
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(word_data, author_data, test_size=0.1, random_state=42)

# In[86]:

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test).toarray()

# In[87]:

# train on only 150 data points to push the model into the overfitting regime
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]

# In[88]:

print('number of training points: ', len(features_train))

# In[89]:

"""Overfit the decision tree and check the accuracy."""
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

clf = DecisionTreeClassifier()
clf.fit(features_train, labels_train)
y_pred = clf.predict(features_test)

print(confusion_matrix(labels_test, y_pred))
print(classification_report(labels_test, y_pred))
print(accuracy_score(labels_test, y_pred))

# Yes, the accuracy is much higher than it should be.
# Hence, find the most important features.

# In[90]:

# identify the most important features
import numpy as np
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature Ranking")
for i in range(10):
    print("{} feature no.{} ({})".format(i + 1, indices[i], importances[indices[i]]))

# In[91]:

# look up the word behind the dominant feature
vectorizer.get_feature_names()[2802]

# This word seems like an outlier in a certain sense, so let’s remove it and refit.
# In[99]:

word_file = "C:/Users/Geekquad/ud120-projects/feature_selection/word_data_overfit_modified_unix.pkl"
author_file = "C:/Users/Geekquad/ud120-projects/feature_selection/email_authors_overfit_modified_unix.pkl"
word_data = pickle.load(open(word_file, "rb"))
author_data = pickle.load(open(author_file, "rb"))

# In[100]:

from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(word_data, author_data, test_size=0.1, random_state=42)

# In[101]:

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test).toarray()

# In[102]:

# train on only 150 data points to push the model into the overfitting regime
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]

# In[103]:

"""Overfit the decision tree and check the accuracy."""
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

clf = DecisionTreeClassifier()
clf.fit(features_train, labels_train)
y_pred = clf.predict(features_test)

print(confusion_matrix(labels_test, y_pred))
print(classification_report(labels_test, y_pred))
print(accuracy_score(labels_test, y_pred))

# In[104]:

# identify the most important features
import numpy as np
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature Ranking")
for i in range(10):
    print("{} feature no.{} ({})".format(i + 1, indices[i], importances[indices[i]]))

# In[105]:

vectorizer.get_feature_names()[33604]

--------------------------------------------------------------------------------
/K means.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[22]:

import pickle
import numpy
import sys
import matplotlib.pyplot as plt
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

# In[23]:

def Draw(pred, features, poi, mark_poi=False, name='image.png', f1_name='feature 1', f2_name='feature 2'):
    """Scatter-plot the clusters, optionally marking POIs with a red star."""
    colors = ["b", "c", "k", "m", "g"]
    for ii, pp in enumerate(pred):
        plt.scatter(features[ii][0], features[ii][1], color=colors[pred[ii]])

    if mark_poi:
        for ii, pp in enumerate(pred):
            if poi[ii]:
                plt.scatter(features[ii][0], features[ii][1], color="r", marker="*")

    plt.xlabel(f1_name)
    plt.ylabel(f2_name)
    plt.savefig(name)
    plt.show()

# In[24]:

data_dict = pickle.load(open("C:/Users/Geekquad/final_project_dataset_modified_unix.pkl", "rb"))
data_dict.pop("TOTAL", 0)  # "TOTAL" is a spreadsheet aggregation row, not a person

# In[25]:

feature_1 = "salary"
feature_2 = "exercised_stock_options"
poi = "poi"
features_list = [poi, feature_1, feature_2]
data = featureFormat(data_dict, features_list)
poi, finance_features = targetFeatureSplit(data)
# In[26]:

for f1, f2 in finance_features:
    plt.scatter(f1, f2)
plt.show()

# ## Clustering using K-means:

# In[27]:

from sklearn.cluster import KMeans
features_list = ["poi", feature_1, feature_2]
data2 = featureFormat(data_dict, features_list)
poi, finance_features = targetFeatureSplit(data2)
clf = KMeans(n_clusters=2)
pred = clf.fit_predict(finance_features)
Draw(pred, finance_features, poi, name="clusters_before_scaling.pdf", f1_name=feature_1, f2_name=feature_2)

--------------------------------------------------------------------------------
/Outliers Removal.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[51]:

import random
import numpy as np
import pickle
import matplotlib.pyplot as plt
from outlier_cleaner import outlierCleaner

# In[52]:

ages = pickle.load(open("practice_outliers_ages_modified_unix.pkl", "rb"))
net_worths = pickle.load(open("practice_outliers_net_worths_modified_unix.pkl", "rb"))

# In[53]:

ages = np.reshape(np.array(ages), (len(ages), 1))
net_worths = np.reshape(np.array(net_worths), (len(net_worths), 1))

from sklearn.model_selection import train_test_split
ages_train, ages_test, net_worths_train, net_worths_test = train_test_split(ages, net_worths, test_size=0.1, random_state=42)

# In[54]:

from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(ages_train, net_worths_train)
reg.predict(ages_test)
print('coef:', reg.coef_)
print('intercept:', reg.intercept_)
print('score:', reg.score(ages_test, net_worths_test))

# In[55]:

plt.scatter(ages_train, net_worths_train, color='blue')
plt.scatter(ages_test, net_worths_test, color='red')
plt.plot(ages_train, reg.predict(ages_train), color='green')
plt.xlabel('Ages')
plt.ylabel('Net Worth')
plt.show()

# ## Removing the most extreme outlier points:

# In[56]:

cleaned_data = []
try:
    predictions = reg.predict(ages_train)
    cleaned_data = outlierCleaner(predictions, ages_train, net_worths_train)
except NameError:
    print("your regression object doesn't exist, or isn't named reg")
    print("can't make predictions to use in identifying outliers")

# In[57]:

if len(cleaned_data) > 0:
    ages, net_worths, errors = zip(*cleaned_data)
    ages = np.reshape(np.array(ages), (len(ages), 1))
    net_worths = np.reshape(np.array(net_worths), (len(net_worths), 1))

# In[58]:

if len(cleaned_data) > 0:  # guard repeated so this cell is valid on its own
    try:
        reg.fit(ages, net_worths)
        plt.plot(ages, reg.predict(ages), color="blue")
    except NameError:
        print("you don't seem to have regression imported/created")
        print("or else your regression object isn't named reg")
        print("either way, only draw the scatter plot of the cleaned data")
    plt.scatter(ages, net_worths)
    plt.xlabel("ages")
    plt.ylabel("net worths")
    plt.show()

    print('coef after cleaning:', reg.coef_)
    print('intercept after cleaning:', reg.intercept_)
    print('score after cleaning:', reg.score(ages_test, net_worths_test))

else:
    print("outlierCleaner() is returning an empty list, no refitting to be done")
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# Fraud-Detection
The Enron fraud is the largest case of corporate fraud in American history. Founded in 1985, Enron Corporation went bankrupt by the end of 2001 due to widespread corporate fraud and corruption. Before its fall, Fortune magazine had named Enron “America’s most innovative company” for six consecutive years.

**Dataset**: https://www.cs.cmu.edu/~./enron/
## Goal of the Project:
The goal of the project is to go through the thought process of data exploration (learning, cleaning, and preparing the data);
feature selection/engineering (selecting the features that influence the target the most,
creating new features that explain the target better than the existing ones do, and
reducing the dimensionality of the data using principal component analysis (PCA));
and picking and tuning a supervised machine learning algorithm, then validating it to get an accurate person-of-interest identifier model.

## Data Exploration
The features in the data fall into three major types, namely:
- financial features,
- email features, and
- POI labels.

There are 146 samples with 20 features and a binary classification target ("poi").
Among the 146 samples, there are
- 18 POI and
- 128 non-POI.
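As a quick sanity check on those counts, a minimal sketch like the following tallies the POI labels straight from the data dictionary (the filename is illustrative; point it at your local copy of the pickled dataset):

```python
import pickle

# Illustrative path -- use your local copy of the project dataset.
with open("final_project_dataset.pkl", "rb") as f:
    data_dict = pickle.load(f)

n_poi = sum(1 for person in data_dict.values() if person["poi"])
print("total samples:", len(data_dict))                      # expected: 146
print("POI:", n_poi, "| non-POI:", len(data_dict) - n_poi)   # expected: 18 | 128
```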
## Optimize Feature Selection/Engineering
While working on the project, I played with different features and models. One strategy was to standardize the features,
apply principal component analysis, and use a GaussianNB classifier; another was to use a decision tree classifier, including choosing the
features with the feature-importances attribute and tuning the model.

### Create new features
For both strategies I tried to create new features as fractions of almost all the financial variables (e.g. a fractional bonus:
the ratio of bonus to total_payments, etc.). The logic behind the email feature creation was to look at the fraction of emails sent to POIs
out of all sent emails, and the fraction of emails received from POIs out of all received emails.
I ended up using one new feature, fraction_to_POI.
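A minimal sketch of that email feature, assuming each record carries `from_this_person_to_poi` and `from_messages` counts and that missing values appear as the string `'NaN'` (as in the course version of this dataset):

```python
def compute_fraction(numerator, denominator):
    # Treat the dataset's 'NaN' placeholder strings (and empty denominators) as 0.
    if numerator == "NaN" or denominator in ("NaN", 0):
        return 0.0
    return float(numerator) / float(denominator)

for person in data_dict.values():
    # fraction of this person's sent emails that were addressed to a POI
    person["fraction_to_POI"] = compute_fraction(
        person.get("from_this_person_to_poi", "NaN"),
        person.get("from_messages", "NaN"),
    )
```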
## Pick and Tune an Algorithm:
I played with 7 machine learning algorithms:
- Naive Bayes (GaussianNB)
- SVC
- RandomForestClassifier
- ExtraTreesClassifier
- AdaBoostClassifier
- LogisticRegression
- LinearSVC

### Comparing Classifiers based on cross-validation scores:
- 1st tier: SVC, RandomForestClassifier
- 2nd tier: GaussianNB, ExtraTreesClassifier, AdaBoostClassifier
- 3rd tier: LogisticRegression, LinearSVC

### Tuning the algorithm:
The bias-variance tradeoff is one of the key dilemmas in machine learning. High-bias algorithms have no capacity to learn; high-variance algorithms
react poorly to data they have not seen before. A predictive model should be tuned to achieve a compromise between the two. The process of changing an algorithm's parameters is called
algorithm tuning, and it lets us find the golden mean and the best result. If I don't tune the algorithm well, I don't get the best result I could.
An algorithm can be tuned manually, by iteratively changing the parameters and tracking the results, or automatically with GridSearchCV.
I tuned the parameters of my decision tree classifier sequentially, parameter by parameter, and got the best F1 score using those parameters.
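A minimal GridSearchCV sketch for the decision tree, scored on F1 as in the rest of this project; the parameter grid here is illustrative, not the exact grid I searched:

```python
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Illustrative grid -- not the exact values tuned in the project.
param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [None, 2, 4, 8],
    "min_samples_split": [2, 5, 10, 20],
}

search = GridSearchCV(DecisionTreeClassifier(random_state=42),
                      param_grid, scoring="f1", cv=5)
search.fit(features_train, labels_train)
print(search.best_params_, search.best_score_)
```

GridSearchCV simply automates the manual loop described above: it fits one model per parameter combination and keeps the one with the best cross-validated score.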
## Validate and Evaluate
### Usage of Evaluation Metrics
In the project I used the F1 score as the key measure of the algorithms' accuracy. It considers both the precision and the recall of the test to compute the score:
F1 = 2 * (precision * recall) / (precision + recall).
Precision is the ability of the classifier not to label as positive a sample that is negative.
Recall is the ability of the classifier to find all the positive samples.
The F1 score can be interpreted as a weighted average of precision and recall, where an F1 score reaches its best value at 1 and its worst at 0.

### Validation Strategy
Validation is the process of evaluating model performance. A classic mistake is to train the model on too small a data set, or to validate the model on the same data it was trained on.
There are a number of strategies for validating a model. One is to split the available data into train and test sets; another is cross-validation: split the data into k folds of equal size, run a learning experiment with each fold held out as a test set in turn, and average the test results.
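A minimal sketch of that k-fold procedure, assuming `features` and `labels` built with featureFormat/targetFeatureSplit as in the scripts below; StratifiedKFold is my choice here because the POI classes are heavily imbalanced, and may differ from the exact splitter used in the project:

```python
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Stratified folds keep the rare POI class represented in every fold.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(DecisionTreeClassifier(random_state=42),
                         features, labels, scoring="f1", cv=cv)
print("F1 per fold:", scores)
print("mean F1:", scores.mean())
```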
## Conclusions:
Before starting this project I was completely sure that building a machine learning model was about picking the right algorithm
out of a black box, plus some magic. Working on the person-of-interest identifier, I recursively went through the process
of data exploration, outlier detection, and algorithm tuning, and spent most of the time on data preparation. The model's performance rose
significantly after missing-value imputation, extra feature creation, and feature selection, and less after algorithm tuning, which showed me
once again how important it is to feed the model good data.
This experience can be applied to other fraud detection tasks. I think the model could be improved further by
using and tuning alternative algorithms like Random Forest.

## Limitations of the study:
It’s important to identify and acknowledge the limitations of the study. My conclusions are based only on the provided
data set, which represents just 146 persons. To get at the real causation, I would have to gather all the financial and email information
about everyone at Enron, which is most probably not possible. Missing email values were imputed with the median, so the modes of the distributions
of the email features are shifted to the medians. The algorithms were tuned sequentially (I changed one parameter to achieve better performance
and then switched to another parameter; there is a chance that other parameter combinations might give better model accuracy).

## References:
- Enron data set: https://www.cs.cmu.edu/~./enron/
- FindLaw financial data: http://www.findlaw.com
- Visualization of POI: http://www.nytimes.com/packages/html/national/20061023_ENRON_TABLE/index.html
- Enron on Wikipedia: https://en.wikipedia.org/wiki/Enron
- F1 score on Wikipedia: https://en.wikipedia.org/wiki/F1_score

--------------------------------------------------------------------------------
/Regression.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[10]:

import sys
import pickle
sys.path.append("../tools/")
from feature_format import featureFormat, targetFeatureSplit

dictionary = pickle.load(open("C:/Users/Geekquad/ud120-projects/final_project/final_project_dataset_modified_unix.pkl", "rb"))

# In[11]:

features_list = ["bonus", "salary"]
data = featureFormat(dictionary, features_list, remove_any_zeroes=True)
target, features = targetFeatureSplit(data)

# In[12]:

from sklearn.model_selection import train_test_split
feature_train, feature_test, target_train, target_test = train_test_split(features, target, test_size=0.5, random_state=42)
train_color = "b"
test_color = "r"  # red test points so they are distinguishable from the blue training points

# In[15]:

from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(feature_train, target_train)
print('coef', reg.coef_)
print('intercept', reg.intercept_)

# In[25]:

import matplotlib.pyplot as plt
for feature, target in zip(feature_test, target_test):
    plt.scatter(feature, target, color=test_color)

for feature, target in zip(feature_train, target_train):
    plt.scatter(feature, target, color=train_color)

# re-plot one representative point from each split so the legend has labels
plt.scatter(feature_test[0], target_test[0], color=test_color, label='test')
plt.scatter(feature_train[0], target_train[0], color=train_color, label='train')
plt.plot(feature_test, reg.predict(feature_test))
plt.xlabel(features_list[1])
plt.ylabel(features_list[0])
plt.legend()
plt.show()

--------------------------------------------------------------------------------
/enron61702insiderpay.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekquad/Fraud-Detection/4806f6dc8f01cc285f90bf70d4ba1e14603b7ec5/enron61702insiderpay.pdf

--------------------------------------------------------------------------------
/evaluate_poi_identifier.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[1]:

import pickle
import sys
from feature_format import featureFormat, targetFeatureSplit

# In[2]:

data_dict = pickle.load(open("C:/Users/Geekquad/ud120-projects/final_project/final_project_dataset_modified_unix.pkl", "rb"))

# In[3]:

features_list = ['poi', 'salary']
data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)

# In[10]:

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# In[6]:

features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.3, random_state=42)

# In[7]:

clf = DecisionTreeClassifier()
clf.fit(features_train, labels_train)
print("Accuracy:", clf.score(features_test, labels_test))
print(clf.predict(features_test))

# In[12]:

print('np.array(labels_test):')
print(np.array(labels_test))

# In[16]:

print('POI predictions:', clf.predict(features_test))
print('Number of true POIs in the test set:', len([e for e in labels_test if e == 1.0]))

# In[17]:

print("Number of test samples:", len(labels_test))

# In[21]:

from sklearn.metrics import precision_score, recall_score
print("precision:", precision_score(labels_test, clf.predict(features_test)))
print("recall:", recall_score(labels_test, clf.predict(features_test)))

--------------------------------------------------------------------------------
/img/feature.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekquad/Fraud-Detection/4806f6dc8f01cc285f90bf70d4ba1e14603b7ec5/img/feature.png

--------------------------------------------------------------------------------
/img/k_means.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekquad/Fraud-Detection/4806f6dc8f01cc285f90bf70d4ba1e14603b7ec5/img/k_means.png

--------------------------------------------------------------------------------
/img/outliers.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekquad/Fraud-Detection/4806f6dc8f01cc285f90bf70d4ba1e14603b7ec5/img/outliers.png

--------------------------------------------------------------------------------
/img/regression.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/geekquad/Fraud-Detection/4806f6dc8f01cc285f90bf70d4ba1e14603b7ec5/img/regression.png

--------------------------------------------------------------------------------
/validate_poi.py:
--------------------------------------------------------------------------------

# coding: utf-8

# In[1]:

import pickle
import sys
from feature_format import featureFormat, targetFeatureSplit

# In[2]:

data_dict = pickle.load(open("C:/Users/Geekquad/ud120-projects/final_project/final_project_dataset_modified_unix.pkl", "rb"))

# In[3]:

features_list = ["poi", "salary"]

# In[5]:

data = featureFormat(data_dict, features_list)
labels, features = targetFeatureSplit(data)

# In[7]:

from sklearn.model_selection import train_test_split
from sklearn import svm

# In[16]:

features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.3, random_state=42)

# In[17]:

from sklearn.model_selection import GridSearchCV

# search a small grid of SVM kernels and regularization strengths
parameters = {'kernel': ('linear', 'rbf'), 'C': [1, 10]}
svr = svm.SVC()
clf = GridSearchCV(svr, parameters)
clf.fit(features_train, labels_train)
print(clf.score(features_test, labels_test))

--------------------------------------------------------------------------------