├── Report
    ├── Human_Resource_Analytics_Report.docx
    └── Human_Resource_Analytics_Report.pdf
├── code
    ├── scaler.m
    ├── data_preparation.py
    └── code.m
└── README.md


/Report/Human_Resource_Analytics_Report.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryankarlos/Human-Resource-Analytics/HEAD/Report/Human_Resource_Analytics_Report.docx


--------------------------------------------------------------------------------
/Report/Human_Resource_Analytics_Report.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ryankarlos/Human-Resource-Analytics/HEAD/Report/Human_Resource_Analytics_Report.pdf


--------------------------------------------------------------------------------
/code/scaler.m:
--------------------------------------------------------------------------------
 1 | function [Xscaled, mu, stddev] = scaler(X)
 2 | % This function standardizes the features to mu=0 and stddev=1.
 3 | 
 4 | Xscaled = X;
 5 | mu = zeros(1, size(X, 2));
 6 | stddev = zeros(1, size(X, 2));
 7 | 
 8 | % Perform feature scaling for every feature
 9 | for i=1:size(mu,2)
10 |     mu(1,i) = mean(X(:,i)); % calculate the mean
11 |     stddev(1,i) = std(X(:,i)); % calculate the stddev
12 |     Xscaled(:,i) = (X(:,i)-mu(1,i))/stddev(1,i); % subtract the mean and devide by stddev
13 | end


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # Human Resource Analytics-Kaggle-Dataset
 3 | ## Authors - Ryan Nazareth and Hannes Draxl
 4 | 
 5 | A group project carried out on a dataset freely available on Kaggle https://www.kaggle.com/ludobenistant/d/ludobenistant/hr-analytics/hr-analytics 
 6 | 
 7 | Fields in the dataset include:
 8 | 
 9 | * Employee satisfaction level
10 | * Last evaluation
11 | * Number of projects
12 | * Average monthly hours
13 | * Time spent at the company
14 | * Whether they have had a work accident
15 | * Whether they have had a promotion in the last 5 years
16 | * Department
17 | * Salary
18 | * Whether the employee has left
19 | 
 20 | The goal is to predict whether the best and most experienced employees will leave prematurely, based on the features listed above, using the following machine learning techniques:
21 | 
22 | * SVM 
23 | * Multi Layer Perceptron with Backpropagation 
24 | 
 25 | The original dataset is stored in the 'Original Kaggle Dataset' folder. The cleaned data and code are stored in the 'cleaned data' folder. All programming was carried out in Matlab.
26 | 
 27 | This work will also be ported to one of the open-source deep learning frameworks (Keras/TensorFlow) to run more sophisticated techniques not available in Matlab.
28 | 
29 | 
30 | 


--------------------------------------------------------------------------------
/code/data_preparation.py:
--------------------------------------------------------------------------------
 1 | 
 2 | # import dependencies
 3 | import numpy as np
 4 | import pandas as pd
 5 | import seaborn as sns
 6 | import matplotlib.pyplot as plt
 7 | #% matplotlib inline
 8 | 
 9 | 
# Directory that holds the raw data file; change this to match your machine.
path = "/Users/Hannes/Desktop/City University/Term2/NN/Coursework/data/"
# load data
df = pd.read_csv(path + "data_original.csv")

# One-hot encode the two categorical columns (department is stored in the
# "sales" column) and cast the indicator columns to integers.
sales_dummies = pd.get_dummies(df.sales, prefix="sales_").astype("int")
salary_dummies = pd.get_dummies(df.salary, prefix="salary_").astype("int")

# Assemble the final table: dummy columns first, then the remaining features
# (the raw categorical columns are dropped), and the target "left" as the
# LAST column.
data = pd.concat([sales_dummies, salary_dummies, df.drop("left", axis=1)], axis=1).drop(["sales", "salary"], axis=1)
data = pd.concat([data, df.left], axis=1)


################### PLOTS
# Correlation heatmap. Only the lower triangle is drawn: the boolean mask
# hides the upper triangle including the diagonal.
cor = data.corr()
mask = np.zeros_like(cor, dtype=bool)
mask[np.triu_indices_from(mask)] = True
with sns.axes_style("white"):
    ax = sns.heatmap(cor, mask=mask, vmax=.3, square=True)
plt.show()


# Split violin plot of satisfaction level against promotion status, one row
# per salary band, coloured by whether the employee left.
violin_kwargs = dict(x="satisfaction_level", y="promotion_last_5years",
                     hue="left", row="salary",
                     data=df,
                     orient="h", aspect=3.5, palette="Set3",
                     kind="violin", split=True, cut=0, bw=.2)
if hasattr(sns, "catplot"):
    # Current seaborn: factorplot was renamed to catplot and its `size`
    # argument to `height`.
    g = sns.catplot(height=2, **violin_kwargs)
else:
    # Older seaborn releases that only provide factorplot/size.
    g = sns.factorplot(size=2, **violin_kwargs)
plt.show()



# Save clean data set to csv (no header, no index)
# data.to_csv("data_clean.csv", index=False, header=False)
46 | 


--------------------------------------------------------------------------------
/code/code.m:
--------------------------------------------------------------------------------
  1 | clear; clc;
  2 | 
  3 | % load data and shuffle
  4 | df = importdata('data_clean.csv');
  5 | rng(10);
  6 | n = randperm(length(df));
  7 | data = df(n, :);  % permuatation
  8 | 
  9 | % Split data set into 70 % training and 30 % testing
 10 | Xtrain = data(1:10500, 1:20);
 11 | Xtest = data(10501: end, 1:20);
 12 | ytrain = data(1:10500, end);
 13 | ytest = data(10501:end, end);
 14 | 
 15 | % renaming the labels as nntoolbox does not negative or zero values 
 16 | ytrain(ytrain == 1) = 2; 
 17 | ytrain(ytrain == 0) = 1;
 18 | ytest(ytest == 1) = 2;
 19 | ytest(ytest == 0) = 1;
 20 | 
 21 | % plot class distribution 
 22 | fprintf('\nClass distribution training data:')
 23 | fprintf('\n')
 24 | tabulate(ytrain)
 25 | 
 26 | fprintf('\nClass distributtion test data:')
 27 | fprintf('\n')
 28 | tabulate(ytest)
 29 | 
 30 | %% Feature Scaling 
 31 | 
 32 | % Apply feature scaling with mu=0 and stddev=1 to the features. 
 33 | % This is crucial for the subsequent SVM and Neural Net.
 34 | 
 35 | % Save the output as the scaled Xtrain matrix as well as the corresponding 
 36 | % mu and stddev of every feature column.
 37 | 
 38 | [Xtrain, mu, stddev] = scaler(Xtrain); 
 39 | 
 40 | % Standardize the test set with mu and std of the training set
 41 | % It is crucial to use mu and stddev from the training set as
 42 | % our test set represents an unseen set of samples.
 43 | 
 44 | for i=1:size(Xtest, 2)
 45 |     Xtest(:,i) = (Xtest(:,i)-mu(1,i))/stddev(1,i);
 46 | end
 47 | 
 48 | fprintf('\nStandardizing training and test data with mu=0 and std=1:')
 49 | fprintf('\n')
 50 | fprintf('\nMean of standardized Xtrain: %.3f\nStd of standardized Xtrain: %.3f',...
 51 |     mean(mean(Xtrain(:,:))), mean(std(Xtrain(:,:))))
 52 | fprintf('\n')
 53 | fprintf('\nMean of standardized Xtest: %.3f\nStd of standardized Xtest: %.3f\n',...
 54 |     mean(mean(Xtest(:,:))), mean(std(Xtest(:,:))))
 55 | 
 56 | 
 57 | 
 58 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 59 | %% Support Vector Machines (SVM)
 60 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 61 | 
 62 | %1) HP grid search 1) Search for best kernel function between linear,
 63 | %gaussian and polynomial kernels
 64 | 
 65 | % Create random partition for stratified 5-fold cross validation. Each fold
 66 | % roughly has the same class proportions.
 67 | 
 68 | cv = cvpartition(ytrain,'Kfold',5);
 69 | 
 70 | % loop over different kernel functions with 5 fold stratified cross
 71 | % validation
 72 | for i = {'linear', 'gaussian', 'polynomial'}
 73 |     % fitcecoc requires an SVM template
 74 |     t = templateSVM('KernelFunction', i{1});
 75 |     svm = fitcecoc(Xtrain, ytrain, 'learners', t, 'CVPartition', cv);
 76 |     accuracy = 1- kfoldLoss(svm);
 77 |     fprintf('\nAccuracy score of SVM with %s Kernel: %0.2f %', i{1}, accuracy)
 78 | end
 79 | 
 80 | % results:
 81 | %Accuracy score of SVM with linear Kernel: 0.78 
 82 | %Accuracy score of SVM with gaussian Kernel: 0.97 
 83 | %Accuracy score of SVM with polynomial Kernel: 0.95 
 84 | 
 85 | %% Continue with gaussian kernel and tune C and sigma
 86 | 
 87 | % create HP object
 88 | params = hyperparameters('fitcecoc', Xtrain, ytrain, 'svm');
 89 | % change range of C
 90 | params(2).Range = [0.1, 1];
 91 | % change range of sigma
 92 | params(3).Range = [0.1, 1];
 93 | 
 94 | % fit random search 
 95 | fitcecoc(Xtrain, ytrain, 'OptimizeHyperparameters', params,...
 96 |     'HyperparameterOptimizationOptions',struct('AcquisitionFunctionName',...
 97 |     'expected-improvement-plus', 'Optimizer', 'randomsearch', 'MaxObjectiveEvaluations',...
 98 |     10, 'CVPartition', cv));
 99 | 
100 | 
%% train on best HP and evaluate generalisation performance on test

% Gaussian-kernel SVM template with the hyperparameter values selected above.
t = templateSVM('KernelFunction', 'gaussian', ...
                'KernelScale', 1, ...
                'BoxConstraint', 1);

% Fixed seed, then fit on the full training set.
rng(10);
svm = fitcecoc(Xtrain, ytrain, 'learners', t);

% Training-set loss, reported as accuracy in percent.
train_error_svm = loss(svm, Xtrain, ytrain);
fprintf('\nSVM train accuracy: %0.2f\n', (1 - train_error_svm) * 100)

% Held-out test set: keep the predicted labels and scores for the
% confusion-matrix section, then report accuracy in percent.
[ypred_svm, score_svm] = predict(svm, Xtest);
test_error_svm = loss(svm, Xtest, ytest);
fprintf('\nSVM test accuracy: %0.2f\n', (1 - test_error_svm) * 100)
114 | 
%% Confusion Matrix, Precision, Recall, F1 Score for SVM 

% confusionmat(ytest, ypred_svm): first argument is the known labels,
% second the predictions. Class 2 is treated as the positive class, so
% Csvm(2,2) is the true-positive count.
[Csvm, order] = confusionmat(ytest, ypred_svm);
precision_svm = Csvm(2,2)./(Csvm(1,2)+Csvm(2,2));   % TP / (FP + TP)
recall_svm    = Csvm(2,2)./(Csvm(2,1)+Csvm(2,2));   % TP / (FN + TP)
f1Score_svm   = 2*(precision_svm.*recall_svm)./(precision_svm+recall_svm);

fprintf('Precision: %0.3f\n', precision_svm) 
fprintf('Recall: %0.3f\n', recall_svm) 
fprintf('F1: %0.3f\n', f1Score_svm) 

% Plot the confusion matrix.
% plotconfusion is given class-indicator matrices (one row per class, one
% column per sample, a single 1 per column), so the integer label vectors
% are converted first.
isLabels = unique(ytest);
nLabels = numel(isLabels);
[n,p] = size(Xtest);
sampleIdx = (1:n)';

% Predicted labels -> indicator matrix. ismember gives each prediction's
% position in isLabels; sparse() places a 1 at (position, sample).
[~,grpOOF] = ismember(ypred_svm,isLabels); 
oofLabelMat = full(sparse(grpOOF, sampleIdx, 1, nLabels, n));

% True labels -> indicator matrix, built the same way.
[~,grpY] = ismember(ytest,isLabels); 
YMat = full(sparse(grpY, sampleIdx, 1, nLabels, n));

figure;
plotconfusion(YMat,oofLabelMat);
% Label the axes with the actual class values; the extra empty label is for
% the summary row/column.
h = gca;
h.XTickLabel = [num2cell(isLabels); {''}];
h.YTickLabel = [num2cell(isLabels); {''}];
146 | 
147 | 
148 | 
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Multi Layer Perceptron (MLP)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

x_train = Xtrain'; % training data, one column per sample
label_train = ytrain'; % labels as a row vector

% Convert each label to a separate indicator column, the format required by
% the toolbox.
vec = ind2vec(label_train) ;    
t_train = full(vec);

%% Hyperparameter tuning and k fold cross validation

%%%%%% Total time for hyperparameter tuning reported by the authors: 2.5 - 3hrs 

clear train   % make sure no variable named "train" shadows the train() function

numFolds = 5;
epochs = 500; % maximum number of epochs 

% Fold assignment for 5-fold cross validation, stratified by class label.
% It is drawn once so that every hyperparameter combination is scored on
% the same folds.
indices = crossvalind('Kfold', label_train, numFolds);

for hiddenLayerSize = [10 20 40 60 80]  % number of hidden neurons
    for lr = [0.05 0.1 0.3 0.6 0.9]  % learning rate 
        for numLayers = [3 5 7]  % number of layers in the MLP 

            % Untrained template for this hyperparameter combination. Each
            % fold trains its own copy, so no weights are carried over
            % between folds or between combinations.
            baseNet = feedforwardnet(hiddenLayerSize, 'trainscg'); % scaled conjugate gradient
            baseNet.trainParam.epochs = epochs;	% maximum number of epochs to train
            % NOTE(review): confirm that trainscg actually uses an "lr"
            % training parameter; if not, this grid dimension has no effect.
            baseNet.trainParam.lr = lr; % learning rate
            baseNet.trainParam.goal = 0.01;	% stop training if error goal reached
            % NOTE(review): numLayers is set after construction without
            % sizing or connecting the added layers - verify the resulting
            % architecture with view(baseNet).
            baseNet.numLayers = numLayers; % number of layers in the MLP
            baseNet.divideFcn = 'divideind'; % divide the samples by explicit indices

            performance_cv = zeros(1, numFolds);
            for j = 1:numFolds  % for each fold
                testIdx = (indices == j); % samples held out in fold j
                trInd = find(~testIdx);   % training sample indices
                tstInd = find(testIdx);   % held-out sample indices

                net = baseNet;            % fresh, untrained copy
                net.divideParam.trainInd = trInd;
                net.divideParam.testInd = tstInd;

                % Train the network (only the trInd samples are used for fitting)
                [net,tr] = train(net, x_train, t_train);

                % Score the fold on its HELD-OUT samples only.
                pred_cv = net(x_train(:, tstInd));
                % difference between target and predicted values
                e = gsubtract(t_train(:, tstInd), pred_cv);
                % performance of the network for this fold
                performance_cv(j) = perform(net, t_train(:, tstInd), pred_cv);

                % View the Network
                %view(net)

                % Plots
                % Uncomment these lines to enable various plots.
                %figure, plotperform(tr)
                %figure, plottrainstate(tr)
                %figure, ploterrhist(e)
            end 

            % Average of the per-fold performance values (x100) for this setting.
            fprintf('Average CV performance for following parameter settings: hidden layer:%d, epochs:%d,lr rate:%.2f,num Layers:%d, = %.4f \n', hiddenLayerSize,epochs, lr,numLayers, 100*(mean(performance_cv)));

        end 
    end
end


% Results from hyperparameter tuning (average CV scores) as reported by the
% authors:
% best HP: hiddenlayersize : 60, lr 0.1, numLayers 3 
219 | 
220 | 
%% Re-train the best classifier after hyperparameter tuning

% total time for training model with best HP reported by the authors: ~30 mins

trainFcn = 'trainbr'; % Bayesian regularisation backpropagation
hiddenLayerSize = 60;  % number of hidden neurons
epochs = 500; % maximum number of epochs
lr = 0.1; % learning rate 
numLayers = 3;
% create the MLP with the chosen hidden layer size and training algorithm
net = feedforwardnet(hiddenLayerSize, trainFcn);  

% setting other parameters for the MLP

net.trainParam.epochs = epochs;	% maximum number of epochs to train
net.trainParam.lr = lr; % learning rate	
net.trainParam.goal = 0.01;	% stop training if error goal reached
net.numLayers  = numLayers; % number of layers in MLP
% Division of the training data: 70 % for fitting, 30 % for validation.
% testRatio is set to 0 explicitly; otherwise its default share is still
% carved out of x_train and the split is not the 70/30 stated here.
net.divideParam.trainRatio = 70/100;
net.divideParam.valRatio = 30/100;
net.divideParam.testRatio = 0;

% Train the network
[net,tr] = train(net, x_train, t_train);

% network outputs for the training data
pred_train = net(x_train);
% difference between the targets and the predictions
e = gsubtract(t_train, pred_train);

% Evaluate performance (mean squared error and cross entropy) on the train
% data. The missing semicolons are intentional: the values are displayed.
perf_train_mse = mse(net,t_train,pred_train) %MSE 
perf_train_crossentropy =  crossentropy(net,t_train,pred_train)% crossentropy 

% authors' observation: mse and cross entropy give the same values in this case 
256 | 
257 | 
%% Calculating the training accuracy 

% Winner-takes-all: for every sample (column) the class row with the larger
% network output becomes 1 and the other 0. If the first row is not
% strictly larger, the second row wins.
firstRowWins = pred_train(1,:) > pred_train(2,:);
pred_train(1,:) = firstRowWins;
pred_train(2,:) = ~firstRowWins;

% A sample is correct when the first indicator row agrees with the target.
count = sum(pred_train(1,:) == t_train(1,:));

train_accuracy = count/size(pred_train,2); % proportion of correct classifications

fprintf('Training accuracy = %.4f \n', train_accuracy);
287 | 
%% Test the model on unseen test data 

% View the Network
%view(net)

% Plots
% Uncomment these lines to enable various plots.
%figure(), plotperform(tr)
%figure(i), plottrainstate(tr)
%figure(i), ploterrhist(e)

%%%%%%% Test the network on unseen test data %%%%%%%%%%%%%%%

% Bring the held-out test set into the toolbox layout: one column per
% sample, labels expanded into indicator columns.
x_test = Xtest';
label_test = ytest';
vec = ind2vec(label_test) ;
t_test = full(vec) ;

% network outputs for the test data
pred_test = net(x_test);
% Performance (mean squared error and cross entropy) on the test data;
% both values are displayed on purpose (no semicolon).
perf_test_mse = mse(net,t_test,pred_test) %MSE 
perf_test_crossentropy =  crossentropy(net,t_test,pred_test)% crossentropy 

% authors' observation: mse and cross entropy give the same values in this case 

% Test accuracy.
% Winner-takes-all per sample: the row with the larger output becomes 1,
% the other 0; if the first row is not strictly larger the second row wins.
firstRowWins = pred_test(1,:) > pred_test(2,:);
pred_test(1,:) = firstRowWins;
pred_test(2,:) = ~firstRowWins;

% A sample is correct when the first indicator row agrees with the target.
count = sum(pred_test(1,:) == t_test(1,:));

test_accuracy = count/size(pred_test,2); % proportion of correct classifications 


fprintf('Test accuracy is %.4f \n', test_accuracy);
340 | 
%% Confusion Matrix, Precision, Recall, F1 Score for Neural Net 

% plot the confusion matrix 
plotconfusion(t_test,pred_test)

% c   - fraction of samples misclassified
% Cnn - 2 x 2 confusion matrix; Cnn(i,j) counts samples whose TARGET class
%       is i and whose PREDICTED class is j (rows = targets, columns =
%       predictions). Both values are displayed on purpose (no semicolon).
[c,Cnn] = confusion(t_test,pred_test) 

% Precision, recall and F1 score with class 2 as the positive class, using
% the same definitions as the SVM section:
%   precision = TP / (TP + FP)  -> column 2 of Cnn (everything predicted as 2)
%   recall    = TP / (TP + FN)  -> row 2 of Cnn    (everything that truly is 2)
% The two formulas were previously swapped for this matrix layout.
precision_nn = Cnn(2,2)./(Cnn(2,2)+Cnn(1,2));
recall_nn =  Cnn(2,2)./(Cnn(2,2)+Cnn(2,1));
f1Score_nn =  2*(precision_nn.*recall_nn)./(precision_nn+recall_nn);
fprintf('Precision: %0.3f\n', precision_nn) 
fprintf('Recall: %0.3f\n', recall_nn) 
fprintf('F1: %0.3f\n', f1Score_nn) 
357 | 
358 | 


--------------------------------------------------------------------------------