function [Xscaled, mu, stddev] = scaler(X)
%SCALER Standardize every feature (column) of X to mean 0 and stddev 1.
%
%   [Xscaled, mu, stddev] = scaler(X)
%
%   Input:
%     X        n-by-p numeric matrix; one observation per row, one feature
%              per column.
%
%   Outputs:
%     Xscaled  n-by-p matrix whose column i is (X(:,i) - mu(i)) / stddev(i).
%     mu       1-by-p row vector with the mean of every column of X.
%     stddev   1-by-p row vector with the divisor used for every column:
%              std(X(:,i)), or 1 when that standard deviation is exactly 0.
%
%   A constant column has a standard deviation of 0. Dividing by it would
%   turn the whole column into NaN (0/0), and would do the same to any other
%   data later scaled with the returned stddev. Such columns are therefore
%   only centred (they become all zeros) and report a divisor of 1.

nFeatures = size(X, 2);
Xscaled = X;
mu = zeros(1, nFeatures);
stddev = zeros(1, nFeatures);

% Perform feature scaling for every feature
for i = 1:nFeatures
    mu(1,i) = mean(X(:,i));        % calculate the mean
    stddev(1,i) = std(X(:,i));     % calculate the stddev
    if stddev(1,i) == 0
        stddev(1,i) = 1;           % constant feature: centre only, avoid 0/0
    end
    Xscaled(:,i) = (X(:,i) - mu(1,i)) / stddev(1,i); % subtract the mean and divide by stddev
end
"""Build the one-hot encoded HR data set and draw two exploratory plots.

Reads ``data_original.csv``, replaces the categorical ``sales`` and ``salary``
columns with dummy variables, moves the ``left`` target to the last column,
then shows a correlation heatmap and a violin plot of the satisfaction level.
"""

# import dependencies
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
#% matplotlib inline


# Folder that holds data_original.csv. Set the HR_DATA_DIR environment
# variable to use another location without editing this file; the default is
# the location the script was originally written for.
path = os.environ.get(
    "HR_DATA_DIR",
    "/Users/Hannes/Desktop/City University/Term2/NN/Coursework/data/",
)
# load data
df = pd.read_csv(os.path.join(path, "data_original.csv"))

# create dummy variables for the categorical features
sales_dummies = pd.get_dummies(df.sales, prefix="sales_").astype("int")
salary_dummies = pd.get_dummies(df.salary, prefix="salary_").astype("int")

# stack the dummy sets and the remaining features together; the target
# column "left" goes last
features = df.drop(["left", "sales", "salary"], axis=1)
data = pd.concat([sales_dummies, salary_dummies, features, df.left], axis=1)


################### PLOTS
# Correlation Heatmap
cor = data.corr()
# hide the upper triangle (diagonal included): it only mirrors the lower one
mask = np.zeros_like(cor, dtype=bool)
mask[np.triu_indices_from(mask)] = True
with sns.axes_style("white"):
    ax = sns.heatmap(cor, mask=mask, vmax=.3, square=True)
plt.show()


# Violin plot of satisfaction level, split by "left", one row per salary band
violin_kwargs = dict(
    x="satisfaction_level", y="promotion_last_5years",
    hue="left", row="salary",
    data=df,
    orient="h", aspect=3.5, palette="Set3",
    kind="violin", split=True, cut=0, bw=.2,
)
if hasattr(sns, "catplot"):
    # seaborn >= 0.9 renamed factorplot to catplot and `size` to `height`
    g = sns.catplot(height=2, **violin_kwargs)
else:
    # older seaborn releases only provide the original names
    g = sns.factorplot(size=2, **violin_kwargs)
plt.show()



# Save clean data set to csv
# data.to_csv("data_clean.csv", index=False, header=False)
clear; clc;

%% Load, shuffle and split the data

% load data and shuffle the rows (fixed seed so the split is reproducible)
df = importdata('data_clean.csv');
rng(10);
n = randperm(size(df, 1));   % permutation of the row indices
data = df(n, :);

% Split data set into 70 % training and 30 % testing.
% The last column is the label, every column before it is a feature.
nTrain = 10500;
nFeatures = size(data, 2) - 1;
Xtrain = data(1:nTrain, 1:nFeatures);
Xtest = data(nTrain+1:end, 1:nFeatures);
ytrain = data(1:nTrain, end);
ytest = data(nTrain+1:end, end);

% Rename the labels 0/1 to 1/2: ind2vec, used for the neural net further
% down, needs positive class indices. The order matters: 1 -> 2 has to
% happen before 0 -> 1.
ytrain(ytrain == 1) = 2;
ytrain(ytrain == 0) = 1;
ytest(ytest == 1) = 2;
ytest(ytest == 0) = 1;

% print class distribution
fprintf('\nClass distribution training data:')
fprintf('\n')
tabulate(ytrain)

fprintf('\nClass distributtion test data:')
fprintf('\n')
tabulate(ytest)

%% Feature Scaling

% Apply feature scaling with mu=0 and stddev=1 to the features.
% This is crucial for the subsequent SVM and Neural Net.

% scaler returns the scaled Xtrain matrix as well as the mu and stddev it
% used for every feature column.
[Xtrain, mu, stddev] = scaler(Xtrain);

% Standardize the test set with mu and stddev of the TRAINING set: the test
% set stands in for unseen samples, so none of its own statistics are used.
% mu and stddev are row vectors with one entry per feature; implicit
% expansion applies them to every row of Xtest.
Xtest = (Xtest - mu) ./ stddev;

fprintf('\nStandardizing training and test data with mu=0 and std=1:')
fprintf('\n')
fprintf('\nMean of standardized Xtrain: %.3f\nStd of standardized Xtrain: %.3f',...
    mean(mean(Xtrain(:,:))), mean(std(Xtrain(:,:))))
fprintf('\n')
fprintf('\nMean of standardized Xtest: %.3f\nStd of standardized Xtest: %.3f\n',...
    mean(mean(Xtest(:,:))), mean(std(Xtest(:,:))))



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Support Vector Machines (SVM)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% 1) HP grid search: search for the best kernel function between linear,
% gaussian and polynomial kernels

% Create random partition for stratified 5-fold cross validation. Each fold
% roughly has the same class proportions.
cv = cvpartition(ytrain,'Kfold',5);

% loop over different kernel functions with 5 fold stratified cross
% validation
kernels = {'linear', 'gaussian', 'polynomial'};
for k = 1:numel(kernels)
    kernelName = kernels{k};
    % fitcecoc requires an SVM template
    t = templateSVM('KernelFunction', kernelName);
    svm = fitcecoc(Xtrain, ytrain, 'learners', t, 'CVPartition', cv);
    accuracy = 1 - kfoldLoss(svm);
    % accuracy is a fraction in [0, 1]; the format must not end in a lone
    % '%', which is not a valid conversion specifier
    fprintf('\nAccuracy score of SVM with %s Kernel: %0.2f', kernelName, accuracy)
end

% results:
%Accuracy score of SVM with linear Kernel: 0.78
%Accuracy score of SVM with gaussian Kernel: 0.97
%Accuracy score of SVM with polynomial Kernel: 0.95

%% Continue with gaussian kernel and tune C and sigma

% create HP object
params = hyperparameters('fitcecoc', Xtrain, ytrain, 'svm');
% change range of C
params(2).Range = [0.1, 1];
% change range of sigma
params(3).Range = [0.1, 1];

% fit random search
% NOTE(review): no 'Learners' template is passed here, so the search
% presumably runs with the default SVM kernel rather than the gaussian one
% selected above - confirm and pass a gaussian templateSVM if so.
fitcecoc(Xtrain, ytrain, 'OptimizeHyperparameters', params,...
    'HyperparameterOptimizationOptions',struct('AcquisitionFunctionName',...
    'expected-improvement-plus', 'Optimizer', 'randomsearch', 'MaxObjectiveEvaluations',...
    10, 'CVPartition', cv));


%% train on best HP and evaluate generalisation performance on test

% train on best HP values
t = templateSVM('KernelFunction', 'gaussian', 'KernelScale', 1, 'BoxConstraint', 1);
rng(10);
svm = fitcecoc(Xtrain, ytrain, 'learners', t);
% compute loss on the training set
train_error_svm = loss(svm, Xtrain, ytrain);
fprintf('\nSVM train accuracy: %0.2f\n', (1 - train_error_svm) * 100)
% predict and compute loss on the test set
[ypred_svm, score_svm] = predict(svm, Xtest);
test_error_svm = loss(svm, Xtest, ytest);
fprintf('\nSVM test accuracy: %0.2f\n', (1 - test_error_svm) * 100)

%% Confusion Matrix, Precision, Recall, F1 Score for SVM

% svm
% confusionmat(known, predicted): rows are true classes, columns are
% predicted classes; class 2 (employee left) is the positive class.
Csvm = confusionmat(ytest, ypred_svm);
precision_svm = Csvm(2,2)./(Csvm(2,2)+Csvm(1,2));   % TP / (TP + FP)
recall_svm = Csvm(2,2)./(Csvm(2,2)+Csvm(2,1));      % TP / (TP + FN)
f1Score_svm = 2*(precision_svm.*recall_svm)./(precision_svm+recall_svm);

fprintf('Precision: %0.3f\n', precision_svm)
fprintf('Recall: %0.3f\n', recall_svm)
fprintf('F1: %0.3f\n', f1Score_svm)

% Plot the confusion matrix
% Convert the integer label vector to a class-identifier matrix.
isLabels = unique(ytest);
nLabels = numel(isLabels);
n = size(Xtest, 1);                              % number of test samples
[~,grpOOF] = ismember(ypred_svm,isLabels);       % predicted label -> row index
oofLabelMat = zeros(nLabels,n);
idxLinear = sub2ind([nLabels n],grpOOF,(1:n)');
oofLabelMat(idxLinear) = 1; % Flags the row corresponding to the class
[~,grpY] = ismember(ytest,isLabels);             % true label -> row index
YMat = zeros(nLabels,n);
idxLinearY = sub2ind([nLabels n],grpY,(1:n)');
YMat(idxLinearY) = 1;

figure;
plotconfusion(YMat,oofLabelMat);
h = gca;
h.XTickLabel = [num2cell(isLabels); {''}];
h.YTickLabel = [num2cell(isLabels); {''}];



%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% Multi Layer Perceptron (MLP)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% the toolbox expects one sample per column
x_train = Xtrain';      % training data
label_train = ytrain';  % labels

% convert each label to a separate one-hot column as per the format
% required by the toolbox
vec = ind2vec(label_train);
t_train = full(vec);

%% Hyperparameter tuning and k fold cross validation

%%%%%% Total time for hyperparameter tuning: 2.5 - 3hrs

clear train   % make sure no variable called "train" hides the train function
nFolds = 5;
epochs = 500;                                    % maximum number of epochs
for hiddenLayerSize = [10 20 40 60 80]           % number of hidden neurons
    for lr = [0.05 0.1 0.3 0.6 0.9]              % learning rate
        for numLayers = [3 5 7]                  % number of layers in the MLP
            net = feedforwardnet(hiddenLayerSize, 'trainscg'); % Scaled conjugate gradient
            net.trainParam.epochs = epochs;      % Maximum number of epochs to train
            % NOTE(review): lr is not among the documented trainscg
            % parameters - confirm it influences training at all.
            net.trainParam.lr = lr;              % learning rate
            net.trainParam.goal = 0.01;          % stop training if error goal reached
            % NOTE(review): raising numLayers on an existing network
            % presumably adds layers without connecting them; a deeper MLP
            % is normally requested via feedforwardnet([h h ...]) - confirm.
            net.numLayers = numLayers;           % number of layers in the MLP
            untrainedNet = net;                  % pristine copy to restart every fold from

            % generate cross validation indices for a partition of the data
            % into folds, grouped by class label so every fold has roughly
            % the same class proportions
            indices = crossvalind('Kfold', label_train, nFolds);
            performance_cv = zeros(1, nFolds);
            for j = 1:nFolds                     % for each fold
                tstInd = find(indices == j);     % samples held out in fold j
                trInd = find(indices ~= j);      % all remaining samples

                % start from the untrained network, otherwise weights fitted
                % on earlier folds (which contained this fold's held-out
                % samples) would leak into the score
                net = untrainedNet;
                net.divideFcn = 'divideind';       % divide the samples using explicit indices
                net.divideParam.trainInd = trInd;  % training samples
                net.divideParam.testInd = tstInd;  % held-out samples

                % Train the Network
                [net,tr] = train(net, x_train, t_train);

                % Predict the held-out fold only, so the score measures
                % generalisation instead of the fit on the training samples
                pred_cv = net(x_train(:, tstInd));
                % difference between target and predicted values
                e = gsubtract(t_train(:, tstInd), pred_cv);
                % value of the network performance function for this fold
                performance_cv(j) = perform(net, t_train(:, tstInd), pred_cv);

                % View the Network
                %view(net)

                % Plots
                % Uncomment these lines to enable various plots.
                %figure(i), plotperform(tr)
                %figure(i), plottrainstate(tr)
                %figure(i), ploterrhist(e)

            end

            % average cross validation performance over the folds.
            % NOTE(review): perform returns the network's performance
            % function value, presumably an error where lower is better,
            % not an accuracy - confirm before ranking settings by it.
            fprintf('Average CV performance for following parameter settings: hidden layer:%d, epochs:%d,lr rate:%.2f,num Layers:%d, = %.4f \n', hiddenLayerSize,epochs, lr,numLayers, 100*(mean(performance_cv)));

        end
    end
end


% Results from hyperparameter tuning (Average CV scores)
% best HP: hiddenlayersize : 60, lr 0.1, numLayers 3


%% Re-Train the best classifier after hyperparameter tuning again

% total time for training model with best HP: ~30 mins

trainFcn = 'trainbr';   % Here we apply a more robust method: Bayesian Regularisation Backpropagation
hiddenLayerSize = 60;   % number of hidden neurons
epochs = 500;           % maximum number of epochs
lr = 0.1;               % learning rate
numLayers = 3;
% creating a MLP object setting hidden layer size and backprop algorithm
net = feedforwardnet(hiddenLayerSize, trainFcn);

% setting other parameters for the MLP
net.trainParam.epochs = epochs;  % Maximum number of epochs to train
net.trainParam.lr = lr;          % learning rate
net.trainParam.goal = 0.01;      % stop training if error goal reached
net.numLayers = numLayers;       % number of layers in MLP
% Setup Division of Data for Training and Validation
net.divideParam.trainRatio = 70/100; % keep 70% of data for training the model
net.divideParam.valRatio = 30/100;   % keep 30% of the training data for model evaluation
% NOTE(review): the test share keeps its default unless it is cleared, which
% presumably makes the effective split differ from 70/30 - confirm.
net.divideParam.testRatio = 0;

% Train the network
[net,tr] = train(net, x_train, t_train);

% predict the training data with the trained model
pred_train = net(x_train);
% compute the difference between the target and predicted values
e = gsubtract(t_train, pred_train);

% evaluate performance (mean square error and cross entropy) on the train data
perf_train_mse = mse(net,t_train,pred_train) %MSE
perf_train_crossentropy = crossentropy(net,t_train,pred_train)% crossentropy

% mse and cross entropy give the same values in this case


%% Calculating the training accuracy

% For each observation, the class with the higher predicted value is set to
% 1 and the other one to 0 (ties go to the second class).
isClass1 = pred_train(1,:) > pred_train(2,:);
pred_train = double([isClass1; ~isClass1]);

% Accuracy: with two one-hot rows, comparing the first row of the
% predictions with the first row of the targets is enough.
count = sum(pred_train(1,:) == t_train(1,:));
train_accuracy = count/size(pred_train,2); % proportion of correct classifications

fprintf('Training accuracy = %.4f \n', train_accuracy);

%% Test the model on unseen test data

% View the Network
%view(net)

% Plots
% Uncomment these lines to enable various plots.
%figure(), plotperform(tr)
%figure(i), plottrainstate(tr)
%figure(i), ploterrhist(e)

%%%%%%% Test the network on unseen test data %%%%%%%%%%%%%%%

% Here we test the model on the completely unseen test dataset.
% The toolbox expects one sample per column.
x_test = Xtest';
label_test = ytest';
vec = ind2vec(label_test);  % convert each label to a separate one-hot column
t_test = full(vec);         % as nn toolbox requires it in this format

% predict the test data with the trained model
pred_test = net(x_test);
% evaluate performance (mean square error and cross entropy) on the test data
perf_test_mse = mse(net,t_test,pred_test) %MSE
perf_test_crossentropy = crossentropy(net,t_test,pred_test)% crossentropy

% mse and cross entropy give the same values in this case

% Calculate the test accuracy

% For each observation, the class with the higher predicted value is set to
% 1 and the other one to 0 (ties go to the second class). The one-hot
% result is reused by plotconfusion and confusion below.
isClass1 = pred_test(1,:) > pred_test(2,:);
pred_test = double([isClass1; ~isClass1]);

% With two one-hot rows, comparing the first row of the predictions with
% the first row of the targets is enough.
count = sum(pred_test(1,:) == t_test(1,:));
test_accuracy = count/size(pred_test,2); % compute accuracy as proportion of correct classifications


fprintf('Test accuracy is %.4f \n', test_accuracy);

%% Confusion Matrix, Precision, Recall, F1 Score for Neural Net

% plot the confusion matrix
plotconfusion(t_test,pred_test)

% c is the fraction of samples misclassified
% Cnn is the 2 x 2 confusion matrix; Cnn(i,j) counts the samples whose
% target is class i and that were classified as class j
[c,Cnn] = confusion(t_test,pred_test)

% computing the precision, recall and F1 score for class 2 (employee left).
% Rows are targets and columns are outputs, so the false positives are in
% Cnn(1,2) and the false negatives in Cnn(2,1) - the same layout as the
% confusionmat result used for the SVM metrics.
precision_nn = Cnn(2,2)./(Cnn(2,2)+Cnn(1,2));   % TP / (TP + FP)
recall_nn = Cnn(2,2)./(Cnn(2,2)+Cnn(2,1));      % TP / (TP + FN)
f1Score_nn = 2*(precision_nn.*recall_nn)./(precision_nn+recall_nn);
fprintf('Precision: %0.3f\n', precision_nn)
fprintf('Recall: %0.3f\n', recall_nn)
fprintf('F1: %0.3f\n', f1Score_nn)