├── Customer_Segmentation.R ├── Customer_Segmentation.pdf ├── Mall_Customers.csv ├── README.md ├── ReadMe.docx └── Screenshots ├── Annual income histogram.PNG ├── BoxPlotofdescriptiveanalysisofSpendingscore.PNG ├── Boxplot_age_distrub.PNG ├── Capture.PNG ├── Database.PNG ├── Density_plot annualincome.PNG ├── Intra_clusters.PNG ├── SpendingScore_Histogram.PNG ├── age_distribution.PNG ├── k_means_output.PNG ├── optimal_no_of_clusters.PNG └── pct_gender_comaprison.PNG /Customer_Segmentation.R: -------------------------------------------------------------------------------- 1 | customer_data <- read.csv("Mall_Customers.csv")  # relative path: run the script from the folder that holds Mall_Customers.csv 2 | str(customer_data) 3 | # Column names and a preview of the first rows 4 | names(customer_data) 5 | head(customer_data) 6 | # Five-number summary and standard deviation of age, annual income and spending score 7 | summary(customer_data$Age) 8 | sd(customer_data$Age) 9 | summary(customer_data$Annual.Income..k..) 10 | sd(customer_data$Annual.Income..k..) 11 | summary(customer_data$Spending.Score..1.100.) 12 | sd(customer_data$Spending.Score..1.100.) 13 | 14 | 15 | #Customer Gender Visualization 16 | # Count customers per gender and draw the counts as a bar chart 17 | a <- table(customer_data$Gender) 18 | barplot(a,main="Using BarPlot to display Gender Comparision", 19 | ylab="Count", 20 | xlab="Gender", 21 | col=rainbow(2), 22 | legend=rownames(a)) 23 | 24 | # Percentage share per gender; labels are taken from the table so they always match its order 25 | pct <- round(a/sum(a)*100) 26 | lbs <- paste(names(a)," ",pct,"%",sep=" ") 27 | library(plotrix)  # provides pie3D() 28 | pie3D(a,labels=lbs, 29 | main="Pie Chart Depicting Ratio of Female and Male") 30 | 31 | 32 | #Visualization of Age Distribution 33 | 34 | 35 | summary(customer_data$Age) 36 | # Histogram (with bar counts) and boxplot of customer age 37 | hist(customer_data$Age, 38 | col="blue", 39 | main="Histogram to Show Count of Age Class", 40 | xlab="Age Class", 41 | ylab="Frequency", 42 | labels=TRUE) 43 | 44 | boxplot(customer_data$Age, 45 | col="#ff0066", 46 | main="Boxplot for Descriptive Analysis of Age") 47 | 48 | #Analysis of the Annual Income of the Customers 49 | 50 | summary(customer_data$Annual.Income..k..) 
51 | hist(customer_data$Annual.Income..k.., 52 | col="#660033", 53 | main="Histogram for Annual Income", 54 | xlab="Annual Income Class", 55 | ylab="Frequency", 56 | labels=TRUE) 57 | # Kernel density of annual income: draw the curve, then fill the area under it 58 | plot(density(customer_data$Annual.Income..k..), 59 | col="yellow", 60 | main="Density Plot for Annual Income", 61 | xlab="Annual Income Class", 62 | ylab="Density") 63 | polygon(density(customer_data$Annual.Income..k..), 64 | col="#ccff66") 65 | # Spending score: horizontal boxplot followed by a histogram 66 | boxplot(customer_data$Spending.Score..1.100., 67 | horizontal=TRUE, 68 | col="#990000", 69 | main="BoxPlot for Descriptive Analysis of Spending Score") 70 | 71 | hist(customer_data$Spending.Score..1.100., 72 | main="HistoGram for Spending Score", 73 | xlab="Spending Score Class", 74 | ylab="Frequency", 75 | col="#6600cc", 76 | labels=TRUE) 77 | 78 | #K-means Algorithm 79 | # Elbow method: compare the total within-cluster sum of squares for k = 1..10 80 | library(purrr) 81 | set.seed(123) 82 | # iss(k): fit k-means with k clusters on columns 3:5 (Age, Annual Income, Spending Score) and return tot.withinss 83 | iss <- function(k) { 84 | kmeans(customer_data[,3:5],k,iter.max=100,nstart=100,algorithm="Lloyd" )$tot.withinss 85 | } 86 | 87 | k.values <- 1:10 88 | 89 | # One iss value per candidate k, collected into a numeric vector 90 | iss_values <- map_dbl(k.values, iss) 91 | # Plot iss against k; the bend (elbow) in the curve is read as the suggested number of clusters 92 | plot(k.values, iss_values, 93 | type="b", pch = 19, frame = FALSE, 94 | xlab="Number of clusters K", 95 | ylab="Total intra-clusters sum of squares") 96 | 97 | #Average Silhouette Method 98 | # For each k: fit k-means on columns 3:5, then plot the silhouette of its clusters using Euclidean distances 99 | library(cluster) 100 | library(gridExtra) 101 | library(grid) 102 | # NOTE(review): s2, s3, ... only capture what plot() returns and are never read again in this script 103 | 104 | k2<-kmeans(customer_data[,3:5],2,iter.max=100,nstart=50,algorithm="Lloyd") 105 | s2<-plot(silhouette(k2$cluster,dist(customer_data[,3:5],"euclidean"))) 106 | 107 | k3<-kmeans(customer_data[,3:5],3,iter.max=100,nstart=50,algorithm="Lloyd") 108 | s3<-plot(silhouette(k3$cluster,dist(customer_data[,3:5],"euclidean"))) 109 | 110 | k4<-kmeans(customer_data[,3:5],4,iter.max=100,nstart=50,algorithm="Lloyd") 111 | s4<-plot(silhouette(k4$cluster,dist(customer_data[,3:5],"euclidean"))) 112 | 113 | k5<-kmeans(customer_data[,3:5],5,iter.max=100,nstart=50,algorithm="Lloyd") 114 | 
s5<-plot(silhouette(k5$cluster,dist(customer_data[,3:5],"euclidean"))) 115 | 116 | k6<-kmeans(customer_data[,3:5],6,iter.max=100,nstart=50,algorithm="Lloyd") 117 | s6<-plot(silhouette(k6$cluster,dist(customer_data[,3:5],"euclidean"))) 118 | 119 | k7<-kmeans(customer_data[,3:5],7,iter.max=100,nstart=50,algorithm="Lloyd") 120 | s7<-plot(silhouette(k7$cluster,dist(customer_data[,3:5],"euclidean"))) 121 | 122 | k8<-kmeans(customer_data[,3:5],8,iter.max=100,nstart=50,algorithm="Lloyd") 123 | s8<-plot(silhouette(k8$cluster,dist(customer_data[,3:5],"euclidean"))) 124 | 125 | k9<-kmeans(customer_data[,3:5],9,iter.max=100,nstart=50,algorithm="Lloyd") 126 | s9<-plot(silhouette(k9$cluster,dist(customer_data[,3:5],"euclidean"))) 127 | 128 | k10<-kmeans(customer_data[,3:5],10,iter.max=100,nstart=50,algorithm="Lloyd") 129 | s10<-plot(silhouette(k10$cluster,dist(customer_data[,3:5],"euclidean"))) 130 | 131 | # Cross-check the number of clusters with the average-silhouette and gap-statistic plots 132 | 133 | library(NbClust) 134 | library(factoextra) 135 | library(ggplot2)  # ggplot() is called below; load it explicitly rather than relying on another package to attach it 136 | fviz_nbclust(customer_data[,3:5], kmeans, method = "silhouette") 137 | # Gap statistic for up to 10 clusters (K.max = 10, B = 50), seeded for reproducibility 138 | set.seed(125) 139 | stat_gap <- clusGap(customer_data[,3:5], FUN = kmeans, nstart = 25, 140 | K.max = 10, B = 50) 141 | fviz_gap_stat(stat_gap) 142 | # Refit the 6-cluster model; this k6 (not the one from the silhouette runs above) is what the plots below use 143 | k6<-kmeans(customer_data[,3:5],6,iter.max=100,nstart=50,algorithm="Lloyd") 144 | k6 145 | 146 | 147 | #Visualizing the Clustering Results using the First Two Principal Components 148 | # PCA on the unscaled columns 3:5; `scale.` is spelled out in full instead of relying on partial argument matching 149 | pcclust <- prcomp(customer_data[,3:5],scale.=FALSE) #principal component analysis 150 | summary(pcclust) 151 | # Loadings of the first two principal components 152 | pcclust$rotation[,1:2] 153 | 154 | 155 | # Annual income against spending score, one colour per k-means cluster 156 | set.seed(1) 157 | ggplot(customer_data, aes(x =Annual.Income..k.., y = Spending.Score..1.100.)) + 158 | geom_point(stat = "identity", aes(color = as.factor(k6$cluster))) + 159 | scale_color_discrete(name=" ", 160 | breaks=c("1", "2", "3", "4", "5","6"), 161 | labels=c("Cluster 1", "Cluster 2", "Cluster 3", "Cluster 4", "Cluster 5","Cluster 6")) + 162 | ggtitle("Segments of Mall Customers", subtitle = "Using K-means Clustering") 163 | 164 
| 165 | # Same clusters, shown as spending score against age 166 | ggplot(customer_data, aes(x =Spending.Score..1.100., y =Age)) + 167 | geom_point(stat = "identity", aes(color = as.factor(k6$cluster))) + 168 | scale_color_discrete(name=" ", 169 | breaks=c("1", "2", "3", "4", "5","6"), 170 | labels=c("Cluster 1", "Cluster 2", "Cluster 3", "Cluster 4", "Cluster 5","Cluster 6")) + 171 | ggtitle("Segments of Mall Customers", subtitle = "Using K-means Clustering") 172 | 173 | # kCols(vec): build one rainbow colour per distinct value of vec and return the colour of every element 174 | 175 | kCols=function(vec){cols=rainbow (length (unique (vec))) 176 | return (cols[as.numeric(as.factor(vec))])} 177 | # Cluster id of every customer, as integers (digCluster) and as strings (dignm) 178 | digCluster<-k6$cluster; dignm<-as.character(digCluster); # K-means clusters 179 | # Scatter of the first two principal-component scores, coloured by cluster; the axes are PC1 and PC2 180 | plot(pcclust$x[,1:2], col =kCols(digCluster),pch =19,xlab ="PC1",ylab="PC2") 181 | legend("bottomleft",unique(dignm),fill=unique(kCols(digCluster))) 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | -------------------------------------------------------------------------------- /Customer_Segmentation.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TejasSangle/R-Programming-Customer-Segmentation/d4bfaa29de8efce12292114b5843849c1ac68240/Customer_Segmentation.pdf -------------------------------------------------------------------------------- /Mall_Customers.csv: -------------------------------------------------------------------------------- 1 | CustomerID,Gender,Age,Annual Income (k$),Spending Score (1-100) 2 | 1,Male,19,15,39 3 | 2,Male,21,15,81 4 | 3,Female,20,16,6 5 | 4,Female,23,16,77 6 | 5,Female,31,17,40 7 | 6,Female,22,17,76 8 | 7,Female,35,18,6 9 | 8,Female,23,18,94 10 | 9,Male,64,19,3 11 | 
10,Female,30,19,72 12 | 11,Male,67,19,14 13 | 12,Female,35,19,99 14 | 13,Female,58,20,15 15 | 14,Female,24,20,77 16 | 15,Male,37,20,13 17 | 16,Male,22,20,79 18 | 17,Female,35,21,35 19 | 18,Male,20,21,66 20 | 19,Male,52,23,29 21 | 20,Female,35,23,98 22 | 21,Male,35,24,35 23 | 22,Male,25,24,73 24 | 23,Female,46,25,5 25 | 24,Male,31,25,73 26 | 25,Female,54,28,14 27 | 26,Male,29,28,82 28 | 27,Female,45,28,32 29 | 28,Male,35,28,61 30 | 29,Female,40,29,31 31 | 30,Female,23,29,87 32 | 31,Male,60,30,4 33 | 32,Female,21,30,73 34 | 33,Male,53,33,4 35 | 34,Male,18,33,92 36 | 35,Female,49,33,14 37 | 36,Female,21,33,81 38 | 37,Female,42,34,17 39 | 38,Female,30,34,73 40 | 39,Female,36,37,26 41 | 40,Female,20,37,75 42 | 41,Female,65,38,35 43 | 42,Male,24,38,92 44 | 43,Male,48,39,36 45 | 44,Female,31,39,61 46 | 45,Female,49,39,28 47 | 46,Female,24,39,65 48 | 47,Female,50,40,55 49 | 48,Female,27,40,47 50 | 49,Female,29,40,42 51 | 50,Female,31,40,42 52 | 51,Female,49,42,52 53 | 52,Male,33,42,60 54 | 53,Female,31,43,54 55 | 54,Male,59,43,60 56 | 55,Female,50,43,45 57 | 56,Male,47,43,41 58 | 57,Female,51,44,50 59 | 58,Male,69,44,46 60 | 59,Female,27,46,51 61 | 60,Male,53,46,46 62 | 61,Male,70,46,56 63 | 62,Male,19,46,55 64 | 63,Female,67,47,52 65 | 64,Female,54,47,59 66 | 65,Male,63,48,51 67 | 66,Male,18,48,59 68 | 67,Female,43,48,50 69 | 68,Female,68,48,48 70 | 69,Male,19,48,59 71 | 70,Female,32,48,47 72 | 71,Male,70,49,55 73 | 72,Female,47,49,42 74 | 73,Female,60,50,49 75 | 74,Female,60,50,56 76 | 75,Male,59,54,47 77 | 76,Male,26,54,54 78 | 77,Female,45,54,53 79 | 78,Male,40,54,48 80 | 79,Female,23,54,52 81 | 80,Female,49,54,42 82 | 81,Male,57,54,51 83 | 82,Male,38,54,55 84 | 83,Male,67,54,41 85 | 84,Female,46,54,44 86 | 85,Female,21,54,57 87 | 86,Male,48,54,46 88 | 87,Female,55,57,58 89 | 88,Female,22,57,55 90 | 89,Female,34,58,60 91 | 90,Female,50,58,46 92 | 91,Female,68,59,55 93 | 92,Male,18,59,41 94 | 93,Male,48,60,49 95 | 94,Female,40,60,40 96 | 95,Female,32,60,42 97 | 
96,Male,24,60,52 98 | 97,Female,47,60,47 99 | 98,Female,27,60,50 100 | 99,Male,48,61,42 101 | 100,Male,20,61,49 102 | 101,Female,23,62,41 103 | 102,Female,49,62,48 104 | 103,Male,67,62,59 105 | 104,Male,26,62,55 106 | 105,Male,49,62,56 107 | 106,Female,21,62,42 108 | 107,Female,66,63,50 109 | 108,Male,54,63,46 110 | 109,Male,68,63,43 111 | 110,Male,66,63,48 112 | 111,Male,65,63,52 113 | 112,Female,19,63,54 114 | 113,Female,38,64,42 115 | 114,Male,19,64,46 116 | 115,Female,18,65,48 117 | 116,Female,19,65,50 118 | 117,Female,63,65,43 119 | 118,Female,49,65,59 120 | 119,Female,51,67,43 121 | 120,Female,50,67,57 122 | 121,Male,27,67,56 123 | 122,Female,38,67,40 124 | 123,Female,40,69,58 125 | 124,Male,39,69,91 126 | 125,Female,23,70,29 127 | 126,Female,31,70,77 128 | 127,Male,43,71,35 129 | 128,Male,40,71,95 130 | 129,Male,59,71,11 131 | 130,Male,38,71,75 132 | 131,Male,47,71,9 133 | 132,Male,39,71,75 134 | 133,Female,25,72,34 135 | 134,Female,31,72,71 136 | 135,Male,20,73,5 137 | 136,Female,29,73,88 138 | 137,Female,44,73,7 139 | 138,Male,32,73,73 140 | 139,Male,19,74,10 141 | 140,Female,35,74,72 142 | 141,Female,57,75,5 143 | 142,Male,32,75,93 144 | 143,Female,28,76,40 145 | 144,Female,32,76,87 146 | 145,Male,25,77,12 147 | 146,Male,28,77,97 148 | 147,Male,48,77,36 149 | 148,Female,32,77,74 150 | 149,Female,34,78,22 151 | 150,Male,34,78,90 152 | 151,Male,43,78,17 153 | 152,Male,39,78,88 154 | 153,Female,44,78,20 155 | 154,Female,38,78,76 156 | 155,Female,47,78,16 157 | 156,Female,27,78,89 158 | 157,Male,37,78,1 159 | 158,Female,30,78,78 160 | 159,Male,34,78,1 161 | 160,Female,30,78,73 162 | 161,Female,56,79,35 163 | 162,Female,29,79,83 164 | 163,Male,19,81,5 165 | 164,Female,31,81,93 166 | 165,Male,50,85,26 167 | 166,Female,36,85,75 168 | 167,Male,42,86,20 169 | 168,Female,33,86,95 170 | 169,Female,36,87,27 171 | 170,Male,32,87,63 172 | 171,Male,40,87,13 173 | 172,Male,28,87,75 174 | 173,Male,36,87,10 175 | 174,Male,36,87,92 176 | 175,Female,52,88,13 177 | 
176,Female,30,88,86 178 | 177,Male,58,88,15 179 | 178,Male,27,88,69 180 | 179,Male,59,93,14 181 | 180,Male,35,93,90 182 | 181,Female,37,97,32 183 | 182,Female,32,97,86 184 | 183,Male,46,98,15 185 | 184,Female,29,98,88 186 | 185,Female,41,99,39 187 | 186,Male,30,99,97 188 | 187,Female,54,101,24 189 | 188,Male,28,101,68 190 | 189,Female,41,103,17 191 | 190,Female,36,103,85 192 | 191,Female,34,103,23 193 | 192,Female,32,103,69 194 | 193,Male,33,113,8 195 | 194,Female,38,113,91 196 | 195,Female,47,120,16 197 | 196,Female,35,120,79 198 | 197,Female,45,126,28 199 | 198,Male,32,126,74 200 | 199,Male,32,137,18 201 | 200,Male,30,137,83 202 | 201,Male,19,15,39 203 | 202,Male,21,15,81 204 | 203,Female,20,16,6 205 | 204,Female,23,16,77 206 | 205,Female,31,17,40 207 | 206,Female,22,17,76 208 | 207,Female,35,18,6 209 | 208,Female,23,18,94 210 | 209,Male,64,19,3 211 | 210,Female,30,19,72 212 | 211,Male,67,19,14 213 | 212,Female,35,19,99 214 | 213,Female,58,20,15 215 | 214,Female,24,20,77 216 | 215,Male,37,20,13 217 | 216,Male,22,20,79 218 | 217,Female,35,21,35 219 | 218,Male,20,21,66 220 | 219,Male,52,23,29 221 | 220,Female,35,23,98 222 | 221,Male,35,24,35 223 | 222,Male,25,24,73 224 | 223,Female,46,25,5 225 | 224,Male,31,25,73 226 | 225,Female,54,28,14 227 | 226,Male,29,28,82 228 | 227,Female,45,28,32 229 | 228,Male,35,28,61 230 | 229,Female,40,29,31 231 | 230,Female,23,29,87 232 | 231,Male,60,30,4 233 | 232,Female,21,30,73 234 | 233,Male,53,33,4 235 | 234,Male,18,33,92 236 | 235,Female,49,33,14 237 | 236,Female,21,33,81 238 | 237,Female,42,34,17 239 | 238,Female,30,34,73 240 | 239,Female,36,37,26 241 | 240,Female,20,37,75 242 | 241,Female,65,38,35 243 | 242,Male,24,38,92 244 | 243,Male,48,39,36 245 | 244,Female,31,39,61 246 | 245,Female,49,39,28 247 | 246,Female,24,39,65 248 | 247,Female,50,40,55 249 | 248,Female,27,40,47 250 | 249,Female,29,40,42 251 | 250,Female,31,40,42 252 | 251,Female,49,42,52 253 | 252,Male,33,42,60 254 | 253,Female,31,43,54 255 | 254,Male,59,43,60 256 | 
255,Female,50,43,45 257 | 256,Male,47,43,41 258 | 257,Female,51,44,50 259 | 258,Male,69,44,46 260 | 259,Female,27,46,51 261 | 260,Male,53,46,46 262 | 261,Male,70,46,56 263 | 262,Male,19,46,55 264 | 263,Female,67,47,52 265 | 264,Female,54,47,59 266 | 265,Male,63,48,51 267 | 266,Male,18,48,59 268 | 267,Female,43,48,50 269 | 268,Female,68,48,48 270 | 269,Male,19,48,59 271 | 270,Female,32,48,47 272 | 271,Male,70,49,55 273 | 272,Female,47,49,42 274 | 273,Female,60,50,49 275 | 274,Female,60,50,56 276 | 275,Male,59,54,47 277 | 276,Male,26,54,54 278 | 277,Female,45,54,53 279 | 278,Male,40,54,48 280 | 279,Female,23,54,52 281 | 280,Female,49,54,42 282 | 281,Male,57,54,51 283 | 282,Male,38,54,55 284 | 283,Male,67,54,41 285 | 284,Female,46,54,44 286 | 285,Female,21,54,57 287 | 286,Male,48,54,46 288 | 287,Female,55,57,58 289 | 288,Female,22,57,55 290 | 289,Female,34,58,60 291 | 290,Female,50,58,46 292 | 291,Female,68,59,55 293 | 292,Male,18,59,41 294 | 293,Male,48,60,49 295 | 294,Female,40,60,40 296 | 295,Female,32,60,42 297 | 296,Male,24,60,52 298 | 297,Female,47,60,47 299 | 298,Female,27,60,50 300 | 299,Male,48,61,42 301 | 300,Male,20,61,49 302 | 301,Female,23,62,41 303 | 301,Female,49,62,48 304 | 303,Male,67,62,59 305 | 304,Male,26,62,55 306 | 305,Male,49,62,56 307 | 306,Female,21,62,42 308 | 307,Female,66,63,50 309 | 308,Male,54,63,46 310 | 309,Male,68,63,43 311 | 310,Male,66,63,48 312 | 311,Male,65,63,52 313 | 312,Female,19,63,54 314 | 313,Female,38,64,42 315 | 314,Male,19,64,46 316 | 315,Female,18,65,48 317 | 316,Female,19,65,50 318 | 317,Female,63,65,43 319 | 318,Female,49,65,59 320 | 319,Female,51,67,43 321 | 320,Female,50,67,57 322 | 321,Male,27,67,56 323 | 322,Female,38,67,40 324 | 323,Female,40,69,58 325 | 324,Male,39,69,91 326 | 325,Female,23,70,29 327 | 326,Female,31,70,77 328 | 327,Male,43,71,35 329 | 328,Male,40,71,95 330 | 329,Male,59,71,11 331 | 330,Male,38,71,75 332 | 331,Male,47,71,9 333 | 332,Male,39,71,75 334 | 333,Female,25,72,34 335 | 334,Female,31,72,71 
336 | 335,Male,20,73,5 337 | 336,Female,29,73,88 338 | 337,Female,44,73,7 339 | 338,Male,32,73,73 340 | 339,Male,19,74,10 341 | 340,Female,35,74,72 342 | 341,Female,57,75,5 343 | 342,Male,32,75,93 344 | 343,Female,28,76,40 345 | 344,Female,32,76,87 346 | 345,Male,25,77,12 347 | 346,Male,28,77,97 348 | 347,Male,48,77,36 349 | 348,Female,32,77,74 350 | 349,Female,34,78,22 351 | 350,Male,34,78,90 352 | 351,Male,43,78,17 353 | 352,Male,39,78,88 354 | 353,Female,44,78,20 355 | 354,Female,38,78,76 356 | 355,Female,47,78,16 357 | 356,Female,27,78,89 358 | 357,Male,37,78,1 359 | 358,Female,30,78,78 360 | 359,Male,34,78,1 361 | 360,Female,30,78,73 362 | 361,Female,56,79,35 363 | 362,Female,29,79,83 364 | 363,Male,19,81,5 365 | 364,Female,31,81,93 366 | 365,Male,50,85,26 367 | 366,Female,36,85,75 368 | 367,Male,42,86,20 369 | 368,Female,33,86,95 370 | 369,Female,36,87,27 371 | 370,Male,32,87,63 372 | 371,Male,40,87,13 373 | 372,Male,28,87,75 374 | 373,Male,36,87,10 375 | 374,Male,36,87,92 376 | 375,Female,52,88,13 377 | 376,Female,30,88,86 378 | 377,Male,58,88,15 379 | 378,Male,27,88,69 380 | 379,Male,59,93,14 381 | 380,Male,35,93,90 382 | 381,Female,37,97,32 383 | 382,Female,32,97,86 384 | 383,Male,46,98,15 385 | 384,Female,29,98,88 386 | 385,Female,41,99,39 387 | 386,Male,30,99,97 388 | 387,Female,54,101,24 389 | 388,Male,28,101,68 390 | 389,Female,41,103,17 391 | 390,Female,36,103,85 392 | 391,Female,34,103,23 393 | 392,Female,32,103,69 394 | 393,Male,33,113,8 395 | 394,Female,38,113,91 396 | 395,Female,47,120,16 397 | 396,Female,35,120,79 398 | 397,Female,45,126,28 399 | 398,Male,32,126,74 400 | 399,Male,32,137,18 401 | 400,Male,30,137,83 402 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # R-Programming-Customer-Segmentation 2 | Customer-Segmentation Project 3 | # Introduction 4 | Customer Segmentation is one the most important 
applications of unsupervised learning. Using clustering techniques, companies can identify distinct segments of customers, allowing them to target their potential user base. In this machine learning project, we will make use of k-means clustering, which is an essential algorithm for clustering unlabelled datasets. 5 | # SCOPE 6 | Whenever you need to find your best customer, customer segmentation is the ideal methodology. We will perform one of the most essential applications of machine learning – Customer Segmentation. In this project, we will implement customer segmentation in R. 7 | # PROJECT SPECIFICATION 8 | - R Studio version 1.2.5033 9 | # HARDWARE SPECIFICATIONS 10 | - Microsoft® Windows® 7/8/10 (32- or 64-bit) 11 | - 3 GB RAM minimum, 8 GB RAM recommended; 12 | - 2 GB of available disk space minimum 13 | - Core i3 processor or above. 14 | # DATASET 15 | - Mall_Customers.csv 16 | # PACKAGES REQUIRED: 17 | - plotrix 18 | - purrr 19 | - cluster 20 | - gridExtra 21 | - grid 22 | - NbClust 23 | - factoextra 24 | - ggplot2 25 | - dplyr 26 | 27 | # What is Customer Segmentation? 28 | Customer Segmentation is the process of dividing the customer base into several groups of individuals that are similar in ways relevant to marketing, such as gender, age, interests, and miscellaneous spending habits. 29 | Companies that deploy customer segmentation are under the notion that every customer has different requirements and requires a specific marketing effort to address them appropriately. 30 | 31 | Companies aim to gain a deeper understanding of the customers they are targeting. Therefore, their aim has to be specific and should be tailored to address the requirements of each and every individual customer. Furthermore, through the data collected, companies can gain a deeper understanding of customer preferences as well as the requirements for discovering valuable segments that would reap them maximum profit. 
This way, they can strategize their marketing techniques more efficiently and minimize the possibility of risk to their investment. The technique of customer segmentation is dependent on several key differentiators that divide customers into groups to be targeted. 32 | 33 | Data related to demographics, geography, economic status as well as behavioral patterns plays a crucial role in determining the company's direction towards addressing the various segments. 34 | # Customer Gender Visualization: 35 | 36 | In this section, we will create a bar plot and a pie chart to show the gender distribution across our customer_data dataset. A bar chart represents data in rectangular bars, with the length of each bar proportional to the value of the variable. R uses the function barplot() to create bar charts. R can draw both vertical and horizontal bars in the bar chart. In a bar chart, each bar can be given a different color. 37 | # K-means Algorithm 38 | 39 | While using the k-means clustering algorithm, the first step is to indicate the number of clusters (k) that we wish to produce in the final output. The algorithm starts by randomly selecting k objects from the dataset that will serve as the initial centers for our clusters. These selected objects are the cluster means, also known as centroids. Then, each remaining object is assigned to its closest centroid, where closeness is measured by the Euclidean distance between the object and the cluster mean. We refer to this step as “cluster assignment”. 40 | 41 | When the assignment is complete, the algorithm proceeds to calculate the new mean value of each cluster present in the data. After the recalculation of the centers, each observation is checked to see whether it is now closer to a different cluster. Using the updated cluster means, the objects undergo reassignment. This goes on repeatedly through several iterations until the cluster assignments stop changing. 
At that point, the clusters in the current iteration are the same as the ones obtained in the previous iteration. Summing up the K-means clustering – 42 | - We specify the number of clusters that we need to create. 43 | - The algorithm selects k objects at random from the dataset. These objects are the initial cluster centers (means). 44 | - Each observation is assigned to its closest centroid. We base this assignment on the Euclidean distance between the object and the centroid. 45 | - For each of the k clusters, the centroid is updated by calculating the new mean of all the data points in the cluster. The k-th cluster’s centroid is a vector of length p that contains the means of all variables for the observations in the k-th cluster, where p denotes the number of variables. 46 | - The total within-cluster sum of squares is minimized iteratively: the assignment and update steps are repeated until the cluster assignments stop changing or the maximum number of iterations is reached. R's kmeans() uses a default of 10 for the maximum number of iterations. 47 | - To choose k, we run the clustering algorithm for several values of k, here varying k from 1 to 10 clusters. We then calculate the total intra-cluster sum of squares (iss) for each k and plot iss against the number of clusters k. 48 | 49 | This plot indicates the appropriate number of 50 | clusters required in our model. In the plot, the location of a bend (knee) indicates the optimum number of clusters. 
Let us implement this in R as follows – 51 | 52 | #### Code: `library(purrr); set.seed(123)` 53 | 54 | Function to calculate the total intra-cluster sum of squares: `iss <- function(k) { kmeans(customer_data[,3:5],k,iter.max=100,nstart=100,algorithm="Lloyd" )$tot.withinss }` 55 | 56 | 57 | `k.values <- 1:10; iss_values <- map_dbl(k.values, iss)` 58 | 59 | `plot(k.values, iss_values, type="b", pch = 19, frame = FALSE, xlab="Number of clusters K", ylab="Total intra-clusters sum of squares")` 60 | 61 | # Visualizing the Clustering Results using the First Two Principal Components 62 | A line chart or line plot or line graph or curve chart is a type of chart which displays information as a series of data points called 'markers' connected by straight line segments. It is a basic type of chart common in many fields. Used across many fields, this type of graph can be quite helpful in depicting the changes in values over time. We are going to use ggplot2 here; note that the code below draws a scatter plot (`geom_point`) of the customers, coloured by their k-means cluster, rather than a line plot. 63 | 64 | Code: `set.seed(1); ggplot(customer_data, aes(x =Annual.Income..k.., y = Spending.Score..1.100.)) + geom_point(stat = "identity", aes(color = as.factor(k6$cluster))) + scale_color_discrete(name=" ", breaks=c("1", "2", "3", "4", "5","6"), labels=c("Cluster 1", "Cluster 2", "Cluster 3", "Cluster 4", "Cluster 5","Cluster 6")) + ggtitle("Segments of Mall Customers", subtitle = "Using K-means Clustering")` 65 | # CONCLUSION 66 | In this data science project, we went through the customer segmentation model. We developed this using a class of machine learning known as unsupervised learning. Specifically, we made use of a clustering algorithm called K-means clustering. We analysed and visualized the data and then proceeded to implement our algorithm. 
Hope you enjoyed this customer segmentation project of machine learning using R 67 | -------------------------------------------------------------------------------- /ReadMe.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TejasSangle/R-Programming-Customer-Segmentation/d4bfaa29de8efce12292114b5843849c1ac68240/ReadMe.docx -------------------------------------------------------------------------------- /Screenshots/Annual income histogram.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TejasSangle/R-Programming-Customer-Segmentation/d4bfaa29de8efce12292114b5843849c1ac68240/Screenshots/Annual income histogram.PNG -------------------------------------------------------------------------------- /Screenshots/BoxPlotofdescriptiveanalysisofSpendingscore.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TejasSangle/R-Programming-Customer-Segmentation/d4bfaa29de8efce12292114b5843849c1ac68240/Screenshots/BoxPlotofdescriptiveanalysisofSpendingscore.PNG -------------------------------------------------------------------------------- /Screenshots/Boxplot_age_distrub.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TejasSangle/R-Programming-Customer-Segmentation/d4bfaa29de8efce12292114b5843849c1ac68240/Screenshots/Boxplot_age_distrub.PNG -------------------------------------------------------------------------------- /Screenshots/Capture.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TejasSangle/R-Programming-Customer-Segmentation/d4bfaa29de8efce12292114b5843849c1ac68240/Screenshots/Capture.PNG -------------------------------------------------------------------------------- /Screenshots/Database.PNG: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/TejasSangle/R-Programming-Customer-Segmentation/d4bfaa29de8efce12292114b5843849c1ac68240/Screenshots/Database.PNG -------------------------------------------------------------------------------- /Screenshots/Density_plot annualincome.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TejasSangle/R-Programming-Customer-Segmentation/d4bfaa29de8efce12292114b5843849c1ac68240/Screenshots/Density_plot annualincome.PNG -------------------------------------------------------------------------------- /Screenshots/Intra_clusters.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TejasSangle/R-Programming-Customer-Segmentation/d4bfaa29de8efce12292114b5843849c1ac68240/Screenshots/Intra_clusters.PNG -------------------------------------------------------------------------------- /Screenshots/SpendingScore_Histogram.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TejasSangle/R-Programming-Customer-Segmentation/d4bfaa29de8efce12292114b5843849c1ac68240/Screenshots/SpendingScore_Histogram.PNG -------------------------------------------------------------------------------- /Screenshots/age_distribution.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TejasSangle/R-Programming-Customer-Segmentation/d4bfaa29de8efce12292114b5843849c1ac68240/Screenshots/age_distribution.PNG -------------------------------------------------------------------------------- /Screenshots/k_means_output.PNG: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/TejasSangle/R-Programming-Customer-Segmentation/d4bfaa29de8efce12292114b5843849c1ac68240/Screenshots/k_means_output.PNG -------------------------------------------------------------------------------- /Screenshots/optimal_no_of_clusters.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TejasSangle/R-Programming-Customer-Segmentation/d4bfaa29de8efce12292114b5843849c1ac68240/Screenshots/optimal_no_of_clusters.PNG -------------------------------------------------------------------------------- /Screenshots/pct_gender_comaprison.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TejasSangle/R-Programming-Customer-Segmentation/d4bfaa29de8efce12292114b5843849c1ac68240/Screenshots/pct_gender_comaprison.PNG --------------------------------------------------------------------------------