├── README.md
├── Noise_Remover.m
├── sample.m
├── SMOTE.m
├── ClusterSMOTE.m
├── license.txt
├── ASUWO_Main.m
├── Orig_agg_cluster.m
├── BorSMOTE.m
├── Safe_Level_SMOTE.m
├── padadd.m
├── Num_OV_Finder.m
├── ASUWO.m
├── Mod_AggCluster.m
└── nearestneighbour.m


/README.md:
--------------------------------------------------------------------------------
# MATLAB-Source-Code-Oversampling-Methods
This repository contains the source code for four oversampling methods to address imbalanced binary data classification that I wrote in MATLAB: 1) SMOTE 2) Borderline SMOTE 3) Safe Level SMOTE 4) ASUWO (Adaptive Semi-Unsupervised Weighted Oversampling)

For more details, please check out my paper at:
http://www.sciencedirect.com/science/article/pii/S0957417415007356

--------------------------------------------------------------------------------
/Noise_Remover.m:
--------------------------------------------------------------------------------
function [ClearData, ClearLabel] = Noise_Remover(WholeDataInst, WholeDataLable, KNN)
%NOISE_REMOVER Remove instances that have no same-class neighbour.
%   [ClearData, ClearLabel] = Noise_Remover(WholeDataInst, WholeDataLable, KNN)
%
%   WholeDataInst  - N-by-D feature matrix (one instance per row).
%   WholeDataLable - N-by-1 label vector (only column 1 is read).
%   KNN            - number of neighbours requested from knnsearch. Column 1
%                    of the result is skipped (the original code assumes it
%                    is the instance itself), so KNN-1 neighbours are checked.
%
%   An instance is kept only if at least one of the inspected neighbours
%   carries the same label.

nInst = size(WholeDataInst,1);
if nInst == 0
    ClearData  = WholeDataInst;
    ClearLabel = WholeDataLable;
    return
end

Ins_neighbors = knnsearch(WholeDataInst, WholeDataInst, 'k', KNN);

% Use "2:end" rather than "2:KNN": knnsearch returns fewer than KNN columns
% when the data set has fewer than KNN rows, and 2:KNN would index past them.
otherNbrs = Ins_neighbors(:,2:end);
nbrLabels = reshape(WholeDataLable(otherNbrs,1), nInst, []);
Safe_Level = sum(bsxfun(@eq, nbrLabels, WholeDataLable(:,1)), 2);

keep = Safe_Level > 0;
ClearData  = WholeDataInst(keep,:);
ClearLabel = WholeDataLable(keep,:);

end
--------------------------------------------------------------------------------
/sample.m:
--------------------------------------------------------------------------------
function d=sample(I,P,N)
% This code is from the authors of the paper MWMOTE. The paper can be found
% in the following link:
% http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=6361394&tag=1
%
%SAMPLE Draw N rows of I (with replacement) according to the weights P.
%   d = sample(I, P, N)
%
%   I - M-by-D matrix; row j is returned with probability proportional to P(j).
%   P - vector of M non-negative weights (row or column). It is normalised
%       internally, so it does not have to sum to exactly 1.
%   N - number of rows to draw.
%
%   d - N-by-D matrix of drawn rows ([] when N < 1).
%
%   Changes from the original implementation:
%     * P may be a row vector (the original took m = size(P,1), so a row
%       vector made it look at a single entry only).
%     * The cumulative distribution is normalised and the last row is used
%       as a fallback, so floating-point round-off in sum(P) can no longer
%       make a draw return nothing (which silently shortened d).

P = P(:);
total = sum(P);
if isempty(P) || any(isnan(P)) || any(P < 0) || total <= 0
    error('sample:InvalidProbabilities', ...
        'P must contain non-negative weights with a positive sum');
end

if N < 1
    d = [];
    return
end

C = cumsum(P) / total;
m = numel(C);

picks = zeros(N,1);
for i = 1:N
    j = find(rand(1) <= C, 1, 'first');
    if isempty(j)
        j = m;      % guards against round-off leaving C(end) just below 1
    end
    picks(i) = j;
end
d = I(picks,:);

--------------------------------------------------------------------------------
/SMOTE.m:
--------------------------------------------------------------------------------
function [final_features ,final_mark] = SMOTE(original_features, original_mark)
%SMOTE Balance a binary data set by synthesising minority instances.
%   [final_features, final_mark] = SMOTE(original_features, original_mark)
%
%   original_features - N-by-D feature matrix.
%   original_mark     - N-by-1 labels; -1 marks the class that is
%                       oversampled, +1 the other class.
%
%   |#(-1) - #(+1)| synthetic rows labelled -1 are appended. Each one lies on
%   the segment between a randomly drawn -1 instance and one of its nearest
%   -1 neighbours.
%
%   Fix: the neighbour count used to be capped by size(original_features,2)
%   (the number of FEATURES). It is now capped by the number of minority
%   instances, so low-dimensional data no longer loses neighbours (1-D data
%   used to crash with an empty candidate list).

KNN = 5;    % neighbours requested, including the seed itself

P = original_features(original_mark == -1,:);
nMinority = size(P,1);
Num_Ov = abs(nMinority - nnz(original_mark == 1));

final_features = original_features;
final_mark = original_mark;
if Num_Ov == 0
    return
end
if nMinority < 2
    error('SMOTE:TooFewMinorityInstances', ...
        'At least two instances labelled -1 are needed to interpolate');
end

nNeighbours = min(KNN, nMinority);
synthetic = zeros(Num_Ov, size(P,2));
for s = 1:Num_Ov
    S2 = P(randi(nMinority),:);
    Condidates = nearestneighbour(S2', P', 'NumberOfNeighbours', nNeighbours);
    % The first hit is dropped, as in the original: presumably it is the
    % seed itself (distance 0).
    Condidates(:,1) = [];
    g = P(Condidates(randi(numel(Condidates))),:);
    synthetic(s,:) = S2 + rand(1).*(g - S2);
end

final_features = [original_features; synthetic];
final_mark = [original_mark; -1 * ones(Num_Ov,1)];

--------------------------------------------------------------------------------
/ClusterSMOTE.m:
--------------------------------------------------------------------------------
function [final_features ,final_mark] = ClusterSMOTE(original_features, original_mark, Ncluster)
%CLUSTERSMOTE SMOTE restricted to k-means clusters of the minority class.
%   [final_features, final_mark] = ClusterSMOTE(original_features, original_mark, Ncluster)
%
%   original_features - N-by-D feature matrix.
%   original_mark     - N-by-1 labels; -1 is oversampled, +1 is the other class.
%   Ncluster          - number of k-means clusters built on the -1 instances.
%
%   |#(-1) - #(+1)| synthetic rows labelled -1 are appended; each is an
%   interpolation between a -1 instance and a neighbour from the SAME cluster.
%
%   Fixes:
%     * The neighbour index returned by nearestneighbour refers to the
%       cluster subset (Min_cand); the original used it to index the full
%       minority matrix, so an unrelated instance was interpolated.
%     * Seeds are drawn only from clusters with at least two members. The
%       original re-drew until it hit such a cluster, which never terminated
%       when all clusters were singletons.

KNN = 6;    % neighbours requested, including the seed itself

Min_instances = original_features(original_mark == -1,:);
min_clusters = kmeans(Min_instances,Ncluster);

Num_Ov = abs(nnz(original_mark == -1) - nnz(original_mark == 1));

final_features = original_features;
final_mark = original_mark;
if Num_Ov == 0
    return
end

clusterSizes = accumarray(min_clusters, 1);
eligible = find(clusterSizes(min_clusters) >= 2);
if isempty(eligible)
    error('ClusterSMOTE:NoUsableCluster', ...
        'Every minority cluster has a single member; nothing to interpolate');
end

synthetic = zeros(Num_Ov, size(Min_instances,2));
for s = 1:Num_Ov
    idx = eligible(randi(numel(eligible)));
    S2 = Min_instances(idx,:);
    Min_cand = Min_instances(min_clusters == min_clusters(idx),:);
    Limit = size(Min_cand,1);
    Condidates = nearestneighbour(S2', Min_cand', 'NumberOfNeighbours', min(KNN,Limit));
    Condidates(:,1) = [];   % drop the first hit (presumably the seed itself)
    g = Min_cand(Condidates(randi(numel(Condidates))),:);
    synthetic(s,:) = S2 + rand(1).*(g - S2);
end

final_features = [original_features; synthetic];
final_mark = [original_mark; -1 * ones(Num_Ov,1)];
--------------------------------------------------------------------------------
/license.txt:
--------------------------------------------------------------------------------
Copyright (c) 2015, Iman Nekooeimehr
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the distribution

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
--------------------------------------------------------------------------------
/ASUWO_Main.m:
--------------------------------------------------------------------------------
% ASUWO_Main  Demo script: runs the four oversamplers on every training fold
% of a repeated stratified k-fold split of the Fisher iris data.
clear
clc
close all

% Loading the example dataset
load fisheriris
X = meas;
%Y = [ones(100,1); -1 * ones(50,1)];
Y = [ones(50,1); -1 * ones(50,1);ones(50,1)];

% Min-max scale every feature to [-1, 1], then add a little Gaussian noise
featMin = min(X,[],1);
featRange = max(X,[],1) - featMin;
X_scaled = 2*bsxfun(@rdivide, bsxfun(@minus, X, featMin), featRange) - 1;
X_scaled = X_scaled + normrnd(0,0.01,size(X_scaled));

NumberFolds = 3;
NumIteration = 2;

%% Building the models
for ite = 1:NumIteration
    C = cvpartition(Y,'k',NumberFolds);
    for num = 1:NumberFolds
        trainData = X_scaled(training(C,num),:);
        trainLabel = Y(training(C,num),:);
        testData = X_scaled(test(C,num),:);
        testLabel = Y(test(C,num),:);

        %% Oversampling using SMOTE
        disp('SMOTE:')
        [trainDataSMOTE, trainLabelSMOTE] = SMOTE(trainData,trainLabel);

        %% Oversampling using Borderline SMOTE
        disp('Borderline SMOTE:')
        NNC = 5;
        [borderMin_BorSMOTE, trainDatanewBorSMOTE, trainLabelnewBorSMOTE] = BorSMOTE(trainData,trainLabel,NNC);

        %% Oversampling using Safe-level SMOTE
        disp('Safe-level SMOTE:')
        NNC = 5;
        [trainDatanewSafeSMOTE, trainLabelnewSafeSMOTE] = Safe_Level_SMOTE(trainData,trainLabel,NNC);

        %% Oversampling using ASUWO
        disp('ASUWO:')
        CThresh = 1;
        K = 3;
        NN = 5;
        NS = 5;
        [trainDatanewASUWO, trainLabelnewASUWO] = ASUWO(trainData,trainLabel, CThresh , K, NN, NS);
    end
end
--------------------------------------------------------------------------------
/Orig_agg_cluster.m:
--------------------------------------------------------------------------------
function labels = Orig_agg_cluster(data, CThresh)
%ORIG_AGG_CLUSTER Complete-linkage clustering with a data-driven cut-off.
%   labels = Orig_agg_cluster(data, CThresh)
%
%   data    - N-by-D matrix, one point per row (the layout pdist expects).
%   CThresh - multiplier applied to the automatically derived cut-off.
%
%   labels  - N-by-1 cluster index per point.
%
%   The cut-off is CThresh times the mean, over points, of the median
%   distance from that point to the others.
%
%   Fix: N was taken as size(data,2), i.e. the number of FEATURES. Only the
%   first D diagonal entries of the distance matrix were then overwritten,
%   so the zero self-distances of the remaining points pulled the medians
%   (and the cut-off) down.

N = size(data,1);
if N < 2
    % Nothing to link: zero or one point forms (at most) one cluster.
    labels = ones(N,1);
    return
end

% the distance between each pair of points
D = pdist(data,'euclidean');
point_dist2 = squareform(D);
% Replace every self-distance by a large constant (100, as in the original)
% so that it does not take part in the median as a zero.
point_dist2(1:N+1:end) = 100;
thresh = mean(median(point_dist2)).* CThresh;

Z = linkage(D,'complete');
labels = cluster(Z,'cutoff',thresh, 'criterion', 'distance');

% The three local functions below are not called anywhere in this file.

function d = point_distance(X)
% Pairwise squared Euclidean distances between the COLUMNS of X.
N = size(X,2);
d = sum(X.^2,1);
d = ones(N,1)*d + d'*ones(1,N) - 2*X'*X;

%//////////////////////////////////////////////////////////
% d = cluster_distance(c1,c2,point_dist,version)
% Computes the distance between clusters c1 and c2 (index
% vectors) from the point distance info in point_dist.
% version 1 = single (min), 2 = average (mean),
% anything else = complete (max) linkage.
%----------------------------------------------------------
function d = cluster_distance(c1,c2,point_dist,version)
% The original multiplied each result by max(|c1|,|c2|)^0, which is always
% 1; that no-op factor has been dropped.
d = point_dist(c1,c2);
if version == 1
    d = min(d(:));
elseif version == 2
    d = mean(d(:));
else
    d = max(d(:));
end

%//////////////////////////////////////////////////////////
% clusters = merge_clusters(clusters, indicies)
% Merge the clusters indicated by the entries indicies(1)
% and indicies(2) of cell array 'clusters'.
%----------------------------------------------------------
function clusters = merge_clusters(clusters, indicies)
clusters{indicies(1)} = [clusters{indicies(1)} clusters{indicies(2)}];
clusters(indicies(2)) = [];

--------------------------------------------------------------------------------
/BorSMOTE.m:
--------------------------------------------------------------------------------
function [border_min, final_features, final_mark] = BorSMOTE(original_features, original_mark, NNC)
%BORSMOTE Borderline-SMOTE oversampling of the class labelled -1.
%   [border_min, final_features, final_mark] = BorSMOTE(original_features, original_mark, NNC)
%
%   original_features - N-by-D feature matrix.
%   original_mark     - N-by-1 labels; -1 is oversampled, +1 is the other class.
%   NNC               - neighbours requested per minority instance when
%                       looking for borderline instances. The first hit is
%                       skipped, so NNC-1 neighbours are inspected.
%
%   border_min     - row indices (into original_features) of the minority
%                    instances treated as borderline.
%   final_features - original rows followed by |#(-1) - #(+1)| synthetic rows.
%   final_mark     - matching labels (synthetic rows are labelled -1).
%
%   A minority instance is borderline when more than half of its inspected
%   neighbours are labelled +1. If fewer than four such instances exist the
%   majority threshold is relaxed step by step; if there are still fewer
%   than four, plain SMOTE is used instead.
%
%   Fix: the relaxation loop had no lower bound, so a data set with fewer
%   than four minority instances looped forever and the SMOTE fallback could
%   never be reached.

GEN_NEIGHBOURS = 4;     % neighbours (incl. the seed) used when generating;
                        % the original hard-coded NNC = 5 and asked for NNC-1

Minority_index = find(original_mark == -1);
Minority_features = original_features(Minority_index,:);
nMin = numel(Minority_index);
if nMin == 0
    error('BorSMOTE:NoMinorityInstances', ...
        'original_mark contains no instance labelled -1');
end

% Neighbours of every minority instance within the whole data set
Minority_neighbors = nearestneighbour(Minority_features', original_features', 'NumberOfNeighbours', NNC);
% nearestneighbour flattens vector results into a row; restore the
% neighbours-by-instances layout.
Minority_neighbors = reshape(Minority_neighbors, [], nMin);

% Number of +1 (majority) neighbours per minority instance, first hit skipped
nbrLabels = reshape(original_mark(Minority_neighbors(2:end,:),1), [], nMin);
num_maj_neighbor = sum(nbrLabels == 1, 1);

% Relax the "more than half" rule until at least four borderline instances
% are found. At k = 0 every minority instance qualifies, so stop there.
k = NNC;
border_min = Minority_index(num_maj_neighbor > (k-1)/2,1);
while size(border_min,1) < 4 && k > 0
    k = k - 1;
    border_min = Minority_index(num_maj_neighbor > (k-1)/2,1);
end
Border_min_features = original_features(border_min,:);

Num_Ov = abs(nnz(original_mark == -1) - nnz(original_mark == 1));
Limit = size(Border_min_features,1);

if Limit > 3
    synthetic = zeros(Num_Ov, size(original_features,2));
    for s = 1:Num_Ov
        S2 = Border_min_features(randi(Limit),:);
        Condidates = nearestneighbour(S2', Minority_features', 'NumberOfNeighbours', min(GEN_NEIGHBOURS,Limit));
        Condidates(:,1) = [];   % drop the first hit (presumably the seed itself)
        g = Minority_features(Condidates(randi(numel(Condidates))),:);
        synthetic(s,:) = S2 + rand(1).*(g - S2);
    end
    final_features = [original_features; synthetic];
    final_mark = [original_mark; -1 * ones(Num_Ov,1)];
else
    [final_features ,final_mark] = SMOTE(original_features, original_mark);
end

--------------------------------------------------------------------------------
/Safe_Level_SMOTE.m:
--------------------------------------------------------------------------------
function [final_features ,final_mark] = Safe_Level_SMOTE(original_features, original_mark, KNN)
%SAFE_LEVEL_SMOTE Safe-Level-SMOTE oversampling of the class labelled -1.
%   [final_features, final_mark] = Safe_Level_SMOTE(original_features, original_mark, KNN)
%
%   original_features - N-by-D feature matrix.
%   original_mark     - N-by-1 labels; -1 is oversampled, +1 is the other class.
%   KNN               - number of neighbours per instance (one extra is
%                       requested internally and the first hit is skipped).
%
%   |#(-1) - #(+1)| synthetic rows labelled -1 are appended. The position of
%   each synthetic point between the seed p and its neighbour n depends on
%   their safe levels (number of -1 instances among their neighbours):
%       sl(n) == 0, sl(p) == 0 : nothing is generated, a new pair is drawn
%       sl(n) == 0, sl(p) ~= 0 : gap = 0 (copy of p)
%       sl(p)/sl(n) == 1       : gap uniform in [0, 1]
%       sl(p)/sl(n) >  1       : gap uniform in [0, sl(n)/sl(p)]
%       sl(p)/sl(n) <  1       : gap uniform in [1 - sl(p)/sl(n), 1]
%
%   Fix: when every minority instance has safe level 0 no pair can ever
%   produce a point and the original loop never terminated. That case now
%   warns and falls back to plain SMOTE (the same fallback BorSMOTE uses).

Min_ins = original_features(original_mark == -1,:);
KNN = KNN + 1;
Limit = size(Min_ins,1);

Num_Ov = abs(nnz(original_mark == -1) - nnz(original_mark == 1));

final_features = original_features;
final_mark = original_mark;
if Num_Ov == 0
    return
end
if Limit < 2
    error('Safe_Level_SMOTE:TooFewMinorityInstances', ...
        'At least two instances labelled -1 are needed to interpolate');
end

Safe_Level = safe_level_Finder(Min_ins, original_features, original_mark, KNN);
if ~any(Safe_Level)
    warning('Safe_Level_SMOTE:NoSafeInstances', ...
        'All minority instances have safe level 0; falling back to SMOTE');
    [final_features ,final_mark] = SMOTE(original_features, original_mark);
    return
end

synthetic = zeros(Num_Ov, size(Min_ins,2));
made = 0;
while made < Num_Ov
    idx = randi(Limit);
    FirstCand = Min_ins(idx,:);
    Safe_Level_cand1 = Safe_Level(idx);

    Condidates = nearestneighbour(FirstCand', Min_ins', 'NumberOfNeighbours', min(KNN,Limit));
    Condidates(:,1) = [];   % drop the first hit (presumably the seed itself)
    Sel_index = Condidates(randi(numel(Condidates)));
    SecondCand = Min_ins(Sel_index,:);
    Safe_Level_cand2 = Safe_Level(Sel_index);

    if Safe_Level_cand2 == 0
        if Safe_Level_cand1 == 0
            continue        % both look like noise: draw another pair
        end
        gap = 0;
    else
        Safe_level_ratio = Safe_Level_cand1/Safe_Level_cand2;
        if Safe_level_ratio == 1
            gap = rand(1);
        elseif Safe_level_ratio > 1
            gap = rand(1)*(1/Safe_level_ratio);
        else
            gap = rand(1) * Safe_level_ratio + 1 - Safe_level_ratio;
        end
    end

    made = made + 1;
    synthetic(made,:) = FirstCand + gap.*(SecondCand - FirstCand);
end

final_features = [original_features; synthetic];
final_mark = [original_mark; -1 * ones(Num_Ov,1)];
end

function Safe_Level = safe_level_Finder(Minority_features, WholeDataInst, WholeDataLable, KNN)
% Safe level of every minority instance: the number of instances labelled
% -1 among its neighbours in the whole data set (first hit skipped).
% Returns a 1-by-M row vector, M = number of minority instances.

nMin = size(Minority_features,1);
Ins_neighbors = nearestneighbour(Minority_features', WholeDataInst', 'NumberOfNeighbours', KNN);
% nearestneighbour flattens vector results into a row; restore the
% neighbours-by-instances layout.
Ins_neighbors = reshape(Ins_neighbors, [], nMin);

nbrLabels = reshape(WholeDataLable(Ins_neighbors(2:end,:),1), [], nMin);
Safe_Level = sum(nbrLabels == -1, 1);

end
--------------------------------------------------------------------------------
/padadd.m:
--------------------------------------------------------------------------------
function [output] = padadd(A, x, index)
% PADADD Adds data columns to an array even column lengths don't match.
% Missmatched areas of data array are padded with NaNs.
%
% answer = padadd(A, x)
% appends "x" column vector as the last column of "A"
%
% answer = padadd(A, x, index)
% assigns "x" to the column specified by "index" in "A"
% by overwriting any existing data.
%
% If "x" is a matrix, "index" specifies the leftmost column written to.
%
% The result is saved recursively to "A" if the output argument is omitted
% and "A" is a defined variable
%
%Example:
% padadd( eye(2,2), 2*ones(4,1) )
%
% ans =
%
% 1 0 2
% 0 1 2
% NaN NaN 2
% NaN NaN 2
%
%Author: HDJ

%check input argument number
if (nargin < 2)
    error('not enough input arguments')
end

%transpose 'x' if it is a row vector
% Note: "&" binds tighter than "|", so this reads
%   size(x,1)==1  OR  (size(x,2)==1 AND size(x,2) > size(x,1)).
if (size(x,1) == 1) | (size(x,2) == 1) & (size(x,2) > size(x,1))
    x = x';
end

%get sizes of 'A' and 'x'
dAr = size(A,1);
dAc = size(A,2);
dxr = size(x,1);
dxc = size(x,2);


if nargin == 2
    %if index is not specified
    %index = dAc + 1; %default to adding a column to the end
    index = dAc + (1:dxc); %default to adding all columns to the end
else
    %create index array from index argument
    index = index(1)+ (0:dxc-1);
end

%%%%%%BEGIN PADDING SECTION%%%%%%
%if index is outside current size of 'A' then pad whole columns of 'A'
if dAc < index(end)
    answer = [A,NaN*ones(dAr,index(end)-dAc)];
else
    answer = A;
end

%if 'x' is shorter or the same height as 'A' then pad 'x' as necessary
if dAr >= dxr,
    %answer(:,index) = [ x(:,1); NaN*ones(dAr-dxr,1)];
    answer(:,index) = [ x; NaN*ones(dAr-dxr,dxc)];
end

%if 'x' is taller than 'A' then pad 'A'
if dAr < dxr,
    answer = [answer; NaN*ones(dxr-dAr,size(answer,2))];
    %answer(:,index) = x(:,1);
    answer(:,index) = x;
end
%%%%%%END PADDING SECTION%%%%%%

%%%%%%DECIDE OUTPUT METHOD%%%%%%
%get input arguments name
ARGIN = inputname(1);
%if no output argument, ouput to A is available
if (nargout == 0)
    %if ARG is a variable
    if ~(isempty(ARGIN))
        % overwrite the caller's variable in place
        assignin('caller', ARGIN, answer);
        return
    end
end

%default action if either there is an ouput argument
%or if input is not a variable
output = answer;
%%%%%%END DECIDE OUTPUT METHOD%%%%%%
--------------------------------------------------------------------------------
/Num_OV_Finder.m:
--------------------------------------------------------------------------------
function [Kmin2, rand_matrix, Final_Ov] = Num_OV_Finder(IDX_min, Majority_features, Minority_features, m_each_min, Kmin, folds, Out_Th)
%NUM_OV_FINDER Decide how many synthetic instances each minority cluster gets.
%   [Kmin2, rand_matrix, Final_Ov] = Num_OV_Finder(IDX_min, Majority_features, ...
%       Minority_features, m_each_min, Kmin, folds, Out_Th)
%
%   IDX_min           - cluster index (1..Kmin) of every minority instance.
%   Majority_features - majority instances, one per row.
%   Minority_features - minority instances, one per row.
%   m_each_min        - Kmin-by-1 vector with the size of every cluster.
%   Kmin              - number of minority clusters.
%   folds             - number of folds the minority clusters are split into.
%   Out_Th            - clusters with at most this many members are discarded.
%
%   Kmin2       - number of clusters that were kept.
%   rand_matrix - one column per kept cluster holding its (shuffled) member
%                 indices into Minority_features, NaN padded at the bottom.
%   Final_Ov    - 1-by-Kmin2 number of synthetic instances per cluster.
%
%   Only one split is evaluated: the first folds-1 folds (SMOTE-balanced
%   against the majority class) train a discriminant classifier and the last
%   fold is the validation set. A cluster's share grows with its validation
%   error rate and shrinks with its size (size^0.2).
%
%   Fixes:
%     * randsample(v, k) treats a scalar v as the population 1:v, so
%       single-member index vectors were replaced by random wrong indices.
%       randperm / randi on positions are used instead.
%     * rand_matrix is NaN padded to "folds" rows before short clusters are
%       topped up. Letting the assignment grow the matrix zero-filled the
%       other columns, and the zeros were later used as row indices.
%     * padadd results are assigned explicitly instead of relying on its
%       assignin side effect; the sums state their dimension; the two final
%       statements no longer print to the console.

pow = 0.2;
Maj_size = size(Majority_features,1);
Min_size = size(Minority_features,1);

% Randomly permute the members of each minority cluster
rand_matrix = [];
for i = 1:Kmin
    members = find(IDX_min == i);
    if isempty(members)
        % keep the column aligned with m_each_min; padadd cannot take []
        perm = NaN;
    else
        perm = members(randperm(numel(members)));
    end
    rand_matrix = padadd(rand_matrix, perm);
end

% Discard clusters that are too small
tooSmall = (m_each_min <= Out_Th);
rand_matrix(:, tooSmall) = [];
m_each_min(tooSmall) = [];
Kmin2 = size(rand_matrix,2);

% Top up clusters that have fewer members than folds by re-drawing members
% with replacement, so that every fold receives at least one of them.
% NOTE(review): the comparison on this line was lost when the file was
% extracted; "m_each_min < folds" is reconstructed from the padding logic
% below - confirm against the upstream source.
LessFoldsIn = find(m_each_min < folds);
if Kmin2 > 0 && size(rand_matrix,1) < folds
    rand_matrix(end+1:folds,:) = NaN;
end
for fk = 1:numel(LessFoldsIn)
    col = LessFoldsIn(fk);
    temp1 = rand_matrix(~isnan(rand_matrix(:,col)),col);
    nHave = size(temp1,1);
    Added = temp1(randi(nHave, folds-nHave, 1));
    rand_matrix((nHave+1):folds,col) = Added;
end

% Split each minority cluster and put one portion of each into every fold;
% the last fold also takes whatever is left over.
folds_matrix = [];
for i = 1:folds
    buffer = [];
    for j = 1:Kmin2
        temp = rand_matrix(~isnan(rand_matrix(:,j)),j);
        division = floor(size(temp,1)/folds);
        if i < folds
            rows = ((i-1)*division+1):(i*division);
        else
            rows = ((folds-1)*division+1):size(temp,1);
        end
        buffer = [buffer; temp(rows,1)];
    end
    folds_matrix = padadd(folds_matrix, buffer);
end

% Finding the number of misclassified instances.
% The original built nchoosek(1:folds,folds-1) and used its first row only,
% i.e. folds 1..folds-1 for training and the last fold for validation.
A_min = folds_matrix(:,1:folds-1);
B_min = folds_matrix(:,folds);
trainIdx = A_min(~isnan(A_min));
validIdx = B_min(~isnan(B_min));
Min_Feat_Train = Minority_features(trainIdx,:);
Min_Feat_Valid = Minority_features(validIdx,:);

% Train the classifier on the SMOTE-balanced training part
Feat_Train_whole = [Min_Feat_Train; Majority_features];
trainLabel_whole = [-1*ones(size(Min_Feat_Train,1),1);ones(Maj_size,1)];
[trainDatanew, trainLabelnew] = SMOTE(Feat_Train_whole, trainLabel_whole);
%model = svmtrain(trainLabelnew, trainDatanew, Options);
model = fitcdiscr(trainDatanew, trainLabelnew);

% Use the LDA/SVM model to classify the validation instances
predict_label_SMOTE = predict(model, Min_Feat_Valid);
% predict_label_SMOTE = svmpredict(testLabel, Min_Feat_Valid, model, '-q');
misclassified = validIdx(predict_label_SMOTE == 1);
errorCluster_min = sum(ismember(rand_matrix,misclassified), 1);

NeedOv = Maj_size - Min_size;
Pow_m_each = m_each_min .^ pow;
Ratio_Size = (1./Pow_m_each)/sum(1./Pow_m_each,1);

ratio_min = errorCluster_min./sum(~isnan(rand_matrix), 1);
ratio_min(ratio_min <= 0.1)= 0.1;      % floor so no cluster is starved
ratio_min2 = ratio_min/sum(ratio_min);

New_Ratio = ratio_min2 .* Ratio_Size';
ratio_min_scaled = New_Ratio/sum(New_Ratio);
Final_Ov = floor(NeedOv * ratio_min_scaled);

end
--------------------------------------------------------------------------------
/ASUWO.m:
--------------------------------------------------------------------------------
function [final_features, final_mark] = ASUWO(original_features, original_mark, CThresh , K, NN, NS)
%ASUWO Adaptive Semi-Unsupervised Weighted Oversampling.
%
%Inputs:
% original_features: The features of original dataset needed to be oversampled.
% original_mark: The label of original dataset needed to be oversampled
%                (-1 = minority, +1 = majority).
% CThresh: Coefficient to tune the threshold for clustering.
% K: Number of folds in the K-fold Cross Validation.
% NN: Number of nearest majority neighbors found for each minority instance
%     to determine the weights.
% NS: Number of nearest neighbors used to identify noisy instances.
%
%Outputs:
% final_features: The features of dataset after being oversampled
%                 (minority rows, then synthetic rows, then majority rows).
% final_mark: The label of dataset after being oversampled.
%
% Fix: NN is clamped to the number of majority instances; asking for more
% nearest majority neighbours than exist used to index past the end.
%
% Copyright 2015 Iman Nekooeimehr. This code may be freely used and
% distributed, so long as it maintains this copyright line.

%Removing noisy instances for both minority and majority class:
[Clean_orig_inst, Clean_orig_mark] = Noise_Remover(original_features, original_mark, NS);

NNC = 5;        % neighbours (incl. the seed) used when interpolating
Out_Th = 2;     % clusters with at most this many members are ignored

% Separating Minority and Majority instances
Majority_features = Clean_orig_inst(Clean_orig_mark == 1,:);
Minority_features = Clean_orig_inst(Clean_orig_mark == -1,:);
Maj_size = size(Majority_features,1);

%% Clustering the minority instances by considering majority instances:
[IDX_min] = Mod_AggCluster(Majority_features, Minority_features ,CThresh);
Kmin = size(unique(IDX_min),1);
m_each_min = histc(IDX_min,1:Kmin);

%% Finding cluster sizes for minority class using K fold cross validation
[Kmin2, rand_matrix, num_cluster_min] = Num_OV_Finder(IDX_min, Majority_features, Minority_features, m_each_min, Kmin, K, Out_Th);

final_features = Minority_features;

%% find selection probability and oversample within minority clusters
nnMaj = min(NN, Maj_size);

for i=1:Kmin2
    minority_clustered = rand_matrix(~isnan(rand_matrix(:,i)),i);
    Minority_clustered_features = Minority_features(minority_clustered,:);
    [m,n]=size(Minority_clustered_features);

    % Column i2 holds the nnMaj smallest squared distances from minority
    % instance i2 to the majority class, in ascending order.
    dist_vec = zeros(nnMaj, m);
    for i2=1:m
        diffs = bsxfun(@minus, Majority_features, Minority_clustered_features(i2,:));
        distm = sort(sum(diffs.^2, 2));
        dist_vec(:,i2) = distm(1:nnMaj);
    end

    % Cap the distances at the median nearest-majority distance, normalise
    % by the number of features, and weight every instance by its mean
    % inverse distance: instances close to the majority class are picked
    % more often.
    thresh = quantile(dist_vec(1,:),0.5);
    dist_vec(dist_vec > thresh) = thresh;
    dist_vec = dist_vec./n;
    mean_dis = mean(1./dist_vec,1);
    P = mean_dis ./ sum(mean_dis);
    %end of our selection probability algorithm

    nNew = num_cluster_min(1,i);
    synthetic = zeros(max(nNew,0), n);
    for j2 = 1:nNew
        S2=sample(Minority_clustered_features,P',1);
        Condidates = nearestneighbour(S2', Minority_clustered_features', 'NumberOfNeighbours', min(NNC,m));
        Condidates(:,1) = [] ;  % drop the first hit (presumably the seed itself)
        rn=ceil(rand(1)*(size(Condidates,2)));
        g = Minority_clustered_features(Condidates(:,rn),:);
        alpha=rand(1) ;
        synthetic(j2,:) = S2(1,:) + alpha.*(g-S2(1,:));
    end
    final_features = [final_features;synthetic];
end

r = size(final_features,1);
MinMark = -1 * ones(r,1);
MaxMark = ones(Maj_size,1);
final_mark = [MinMark; MaxMark];
final_features = [final_features; Majority_features];

-------------------------------------------------------------------------------- /Mod_AggCluster.m: -------------------------------------------------------------------------------- 1 | function [min_clusters] = Mod_AggCluster(Majority_features, Minority_features ,CThresh) 2 | 3 | % This code is a modification of the source code for Hierachical Clustering 4 | % implemented by David Ross 5 | % The source code for the original Hierachical Clustering can be found in: 6 | % http://www.cs.toronto.edu/~dross/code/ 7 | 8 | SizeMin = size(Minority_features,1); 9 | min_clusters = (1:SizeMin)'; 10 | 11 | %%
Clustering the majority class using Hierachical Clustering 12 | maj_clusters = Orig_agg_cluster(Majority_features, CThresh); 13 | 14 | % Kmaj = size(unique(maj_clusters),1); 15 | % m_each_maj = histc(maj_clusters,1:Kmaj); 16 | 17 | Whole_data_min = [Minority_features; Majority_features]; 18 | D = pdist(Whole_data_min,'euclidean'); 19 | point_dist_min = squareform(D); 20 | 21 | %% Clustering the Minority instances using majority clusters 22 | min_clusters = inside_AggCluster(Minority_features', min_clusters, maj_clusters, point_dist_min, CThresh); 23 | 24 | function labels = inside_AggCluster(data, same_clusters, other_clusters, point_dist_whole, CThresh) 25 | Num_Reject = 0; 26 | N = size(data,2); 27 | Exist_Clus = unique(same_clusters); 28 | M = size(Exist_Clus ,1); 29 | 30 | % the distance between each pair of points 31 | point_dist = point_dist_whole(1:N,1:N); 32 | point_dist2 = point_dist; 33 | for i=1:N 34 | point_dist2(i,i) = 100; 35 | end 36 | 37 | % Measuring the threshold 38 | thresh = mean(median(point_dist2)).* CThresh; 39 | 40 | % Clusters is a cell array of vectors. Each vector contains the 41 | % indicies of the points belonging to that cluster. 42 | % Initially, each point is in it's own cluster. 
43 | clusters = cell(M,1); 44 | for cc = 1:M 45 | clusters{cc} = find(same_clusters == Exist_Clus(cc))'; 46 | end 47 | 48 | % until the termination condition is met 49 | mm = 0; 50 | while mm < thresh 51 | 52 | % compute the distances between all pairs of clusters 53 | cluster_dist = inf*ones(length(clusters)); 54 | for c1 = 1:length(clusters) 55 | for c2 = (c1+1):length(clusters) 56 | cluster_dist(c1,c2) = cluster_distance(clusters{c1}, clusters{c2}, point_dist, 3); 57 | end 58 | end 59 | 60 | % merge the two nearest clusters 61 | [mm ii] = min(cluster_dist(:)); 62 | [ii(1) ii(2)] = ind2sub(size(cluster_dist), ii(1)); 63 | 64 | if mm > thresh || length(clusters) < 3, 65 | break 66 | end 67 | % find the distance of nearest clusters to other class clusters: 68 | Unique_Other = unique(other_clusters); 69 | num_clus = size(Unique_Other,1); 70 | 71 | for k = 1:num_clus 72 | MN2other(k) = cluster_distance_maj(clusters{ii(1)}, N + find(other_clusters == Unique_Other(k)), point_dist_whole, 3); 73 | end 74 | flag = 1; 75 | Distr = histc(other_clusters,1:max(other_clusters)); 76 | Distr(Distr == 0) = [] ; 77 | near_other_ind = find(MN2other < mm & Distr' > 3); 78 | for t = 1:length(near_other_ind) 79 | check_dis = cluster_distance_maj(clusters{ii(2)}, N + find(other_clusters == Unique_Other(near_other_ind(t))) , point_dist_whole, 3); 80 | if check_dis size(X, 1) && ... 121 | ~fIndexed && ... 122 | userParams.Radius == inf; 123 | case 'off' 124 | fDelaunay = false; 125 | case 'auto' 126 | fDelaunay = userParams.NumberOfNeighbours == 1 && ... 127 | ~fIndexed && ... 128 | size(X, 2) > size(X, 1) && ... 129 | userParams.Radius == inf && ... 130 | ( ~isempty(userParams.Triangulation) || delaunaytest(nX, nP, dim) ); 131 | end 132 | 133 | % Try doing Delaunay, if fDelaunay. 
134 | fDone = false; 135 | if fDelaunay 136 | tri = userParams.Triangulation; 137 | if isempty(tri) 138 | try 139 | tri = delaunayn(X'); 140 | catch 141 | msgId = 'NearestNeighbour:DelaunayFail'; 142 | msg = ['Unable to compute delaunay triangulation, not using it. ',... 143 | 'Set the DelaunayMode parameter to ''off''']; 144 | warning(msgId, msg); 145 | end 146 | end 147 | if ~isempty(tri) 148 | try 149 | idx = dsearchn(X', tri, P')'; 150 | fDone = true; 151 | catch 152 | warning('NearestNeighbour:DSearchFail', ... 153 | 'dsearchn failed on triangulation, not using Delaunay'); 154 | end 155 | end 156 | else % if fDelaunay 157 | tri = []; 158 | end 159 | 160 | % If it didn't use Delaunay triangulation, find the neighbours directly by 161 | % finding minimum distances 162 | if ~fDone 163 | idx = zeros(userParams.NumberOfNeighbours, size(P, 2)); 164 | 165 | % Loop through the set of points P, finding the neighbours 166 | Y = zeros(size(X)); 167 | for iPoint = 1:size(P, 2) 168 | x = P(:, iPoint); 169 | 170 | % This is the faster than using repmat based techniques such as 171 | % Y = X - repmat(x, 1, size(X, 2)) 172 | for i = 1:size(Y, 1) 173 | Y(i, :) = X(i, :) - x(i); 174 | end 175 | 176 | % Find the closest points, and remove matches beneath a radius 177 | dSq = sum(abs(Y).^2, 1); 178 | iRad = find(dSq < userParams.Radius^2); 179 | if ~fIndexed 180 | iSorted = iRad(minn(dSq(iRad), userParams.NumberOfNeighbours)); 181 | else 182 | iSorted = iRad(minn(dSq(iRad), userParams.NumberOfNeighbours + 1)); 183 | iSorted = iSorted(2:end); 184 | end 185 | 186 | % Remove any bad ones 187 | idx(1:length(iSorted), iPoint) = iSorted'; 188 | end 189 | %while ~isempty(idx) && isequal(idx(end, :), zeros(1, size(idx, 2))) 190 | % idx(end, :) = []; 191 | %end 192 | idx( all(idx == 0, 2), :) = []; 193 | end % if ~fDone 194 | if isvector(idx) 195 | idx = idx(:)'; 196 | end 197 | end % nearestneighbour 198 | 199 | 200 | 201 | 202 | %DELAUNAYTEST Work out whether the combination of 
% dimensions makes it
%fastest to use a Delaunay triangulation in conjunction with dsearchn.
%These parameters have been determined empirically on a Pentium M 1.6G /
%WinXP / 512MB / Matlab R14SP3 platform. Their precision is not
%particularly important
%
% tf = delaunaytest(nx, np, dim) returns true when np exceeds
% min(c*nx, cap), with (c, cap) = (1.5, 400), (4, 1200), (40, 5000) for
% dim = 2, 3, 4 respectively, and false for every other dim.
function tf = delaunaytest(nx, np, dim)
switch dim
    case 2
        tf = np > min(1.5 * nx, 400);
    case 3
        tf = np > min(4 * nx , 1200);
    case 4
        tf = np > min(40 * nx , 5000);

        % if the dimension is higher than 4, it is almost invariably better not
        % to try to use the Delaunay triangulation
    otherwise
        tf = false;
end % switch
end % delaunaytest




%MINN find the n most negative elements in x, and return their indices
%  in ascending order
%
% I = minn(x, n): x is indexed and concatenated as a row vector below;
% I(k) is the position in x of the k-th smallest value. n is clamped to
% length(x), so fewer than n indices may be returned.
function I = minn(x, n)

% Make sure n is no larger than length(x)
n = min(n, length(x));

% Sort the first n
[xsn, I] = sort(x(1:n));

% Go through the rest of the entries, and insert them into the sorted block
% if they are negative enough
for i = (n+1):length(x)
    % walk j down from the end of the block to the last slot whose value
    % is not larger than x(i)
    j = n;
    while j > 0 && x(i) < xsn(j)
        j = j - 1;
    end

    if j < n
        % x(i) should go into the (j+1) position
        % (the current largest entry, slot n, falls off the end)
        xsn = [xsn(1:j), x(i), xsn((j+1):(n-1))];
        I = [I(1:j), i, I((j+1):(n-1))];
    end
end

end %minn


%PARSEINPUTS Support function for nearestneighbour
%
% [P, X, fIndexed, userParams] = parseinputs(userParams, varargin)
%
% Splits varargin into the query points P, the candidate points X and a
% list of Property/Value pairs, validates the pairs and stores them in
% userParams (fields written here: NumberOfNeighbours, DelaunayMode,
% Radius, Triangulation).
%
% fIndexed is true when the query points are themselves columns of X:
% either only one numeric array was supplied (P = X), or P was a row of
% indices into the columns of X.
function [P, X, fIndexed, userParams] = parseinputs(userParams, varargin)
if length(varargin) == 1 || ~isnumeric(varargin{2})
    % single data argument: every point of X is also a query point
    P = varargin{1};
    X = varargin{1};
    fIndexed = true;
    varargin(1) = [];
else
    P = varargin{1};
    X = varargin{2};
    varargin(1:2) = [];

    % Check the dimensions of X and P
    if size(X, 1) ~= 1
        % Check to see whether P is in fact a vector of indices
        if size(P, 1) == 1
            try
                P = X(:, P);
            catch
                error('NearestNeighbour:InvalidIndexVector', ...
                    'Unable to index matrix using index vector');
            end
            fIndexed = true;
        else
            fIndexed = false;
        end % if size(P, 1) == 1
    else % if size(X, 1) ~= 1
        % X has a single row, so a single-row P is taken as coordinates
        fIndexed = false;
    end

    if ~fIndexed && size(P, 1) ~= size(X, 1)
        error('NearestNeighbour:DimensionMismatch', ...
            'No. of rows of input arrays doesn''t match');
    end
end
% Parse the Property/Value pairs
if rem(length(varargin), 2) ~= 0
    error('NearestNeighbour:propertyValueNotPair', ...
        'Additional arguments must take the form of Property/Value pairs');
end

propertyNames = {'numberofneighbours', 'delaunaymode', 'triangulation', ...
    'radius'};
while length(varargin) ~= 0
    property = varargin{1};
    value = varargin{2};

    % If the property has been supplied in a shortened form, lengthen it
    iProperty = find(strncmpi(property, propertyNames, length(property)));
    if isempty(iProperty)
        error('NearestNeighbour:InvalidProperty', 'Invalid Property');
    elseif length(iProperty) > 1
        error('NearestNeighbour:AmbiguousProperty', ...
            'Supplied shortened property name is ambiguous');
    end
    property = propertyNames{iProperty};

    switch property
        case 'numberofneighbours'
            % NOTE: length(X) is the LARGER of X's two dimensions, so the
            % upper bound checked here is max(rows, columns) of X and not
            % strictly the number of points (columns).
            if rem(value, 1) ~= 0 || ...
                    value > length(X) - double(fIndexed) || ...
                    value < 1
                error('NearestNeighbour:InvalidNumberOfNeighbours', ...
                    'Number of Neighbours must be an integer, and smaller than the no. of points in X');
            end
            userParams.NumberOfNeighbours = value;

        case 'delaunaymode'
            % 'on'/'auto' are downgraded to 'off' when more than one
            % neighbour is requested, when X has fewer than rows+1 columns,
            % or when X has a single row; only 'on' warns about it.
            fOn = strcmpi(value, 'on');
            if strcmpi(value, 'off')
                userParams.DelaunayMode = 'off';
            elseif fOn || strcmpi(value, 'auto')
                if userParams.NumberOfNeighbours ~= 1
                    if fOn
                        warning('NearestNeighbour:TooMuchForDelaunay', ...
                            'Delaunay Triangulation method works only for one neighbour');
                    end
                    userParams.DelaunayMode = 'off';
                elseif size(X, 2) < size(X, 1) + 1
                    if fOn
                        warning('NearestNeighbour:TooFewDelaunayPoints', ...
                            'Insufficient points to compute Delaunay triangulation');
                    end
                    userParams.DelaunayMode = 'off';

                elseif size(X, 1) == 1
                    if fOn
                        warning('NearestNeighbour:DelaunayDimensionOne', ...
                            'Cannot compute Delaunay triangulation for 1D input');
                    end
                    userParams.DelaunayMode = 'off';
                else
                    userParams.DelaunayMode = value;
                end
            else
                warning('NearestNeighbour:InvalidOption', ...
                    'Invalid Option');
            end % if strcmpi(value, 'off')

        case 'radius'
            if isscalar(value) && isnumeric(value) && isreal(value) && value > 0
                userParams.Radius = value;
                % with a radius but no explicit count, consider every
                % point of X (minus the query itself when indexed)
                if isempty(userParams.NumberOfNeighbours)
                    userParams.NumberOfNeighbours = size(X, 2) - double(fIndexed);
                end
            else
                error('NearestNeighbour:InvalidRadius', ...
                    'Radius must be a positive real number');
            end


        case 'triangulation'
            % accepted only if it has rows+1 columns and references every
            % column index of X
            if isnumeric(value) && size(value, 2) == size(X, 1) + 1 && ...
                    all(ismember(1:size(X, 2), value))
                userParams.Triangulation = value;
            else
                error('NearestNeighbour:InvalidTriangulation', ...
                    'Triangulation not a valid Delaunay Triangulation');
            end
    end % switch property

    varargin(1:2) = [];
end % while
% default: a single nearest neighbour
if isempty(userParams.NumberOfNeighbours)
    userParams.NumberOfNeighbours = 1;
end
end %parseinputs
--------------------------------------------------------------------------------