├── README.md
├── main.R
├── 《数据挖掘实验》课程设计--周伟.pdf
├── 《数据挖掘实验》课程设计PPT--周伟.pptx
└── 口红-data.xlsx


/README.md:
--------------------------------------------------------------------------------
 1 | # data-mining-R
 2 | 从网站爬取口红销售数据，分析影响销售数据的重要因素以及根据销售因素建模预测其销售量。 
 3 | 本文先将数据进行预处理得到实验数据， 
 4 | 然后着重分析朴素贝叶斯判别分析算法、 AdaBoost 算法以及随机森林算法在口红销量预测中的效果， 并在随机森林算法中进行模型优化。 
 5 | 通过实验结果表明总评价数、 价格和描述分这三个因素对销售量的影响较大， 
 6 | 对三个算法对比分析得出随机森林算法预测错误率最低，有较好的预测效果。
 7 | 
 8 | Crawling lipstick sales data from the website, 
 9 | analyzing the important factors affecting sales data and predicting sales volume according to sales factors modeling. 
10 | In this paper, 
11 | we first preprocess the data to get the experimental data, 
12 | and then focus on the analysis of Naive Bayesian Discriminant Analysis (Naive Bayesian Discriminant Analysis), 
13 | AdaBoost algorithm and random forest algorithm in lipstick sales forecasting effect, 
14 | and in the random forest algorithm to optimize the model. 
15 | The experimental results show that the total evaluation number, 
16 | price and description of these three factors have a greater impact on sales. 
17 | The comparison of the three algorithms shows that the random forest algorithm has the lowest prediction error rate, 
18 | and has a better prediction effect.
19 | 
20 | 
21 | main.R文件是 代码的源文件。
22 | 


--------------------------------------------------------------------------------
/main.R:
--------------------------------------------------------------------------------
  1 | setwd("E:\\R_workspace\\数据挖掘大作业")
  2 | # 安装包
  3 | # install.packages("openxlsx")
  4 | library(xlsx)  
  5 | library(openxlsx)
  6 | #文件名+sheet的序号
  7 | data_OR<- read.xlsx("口红.xlsx", sheet = 1)
  8 | View(data_OR)
  9 | data_OR<-data.frame(data_OR)
 10 | data<-data_OR[,c(-6,-11,-14,-15,-18)]
 11 | View(data)
 12 | 
 13 | 
 14 | # ###########################################################
 15 | # 数据清洗
 16 | # ###########################################################
 17 | # 对价格进行处理，将 ￥~￥的记录，取平均值
 18 | x_price<-data[,6]
 19 | x_price=data.frame(x_price)
 20 | View(x_price)
 21 | x_price<-apply(x_price,1,function(i){
 22 |   flag=grepl(pattern="~",x=i)
 23 |   if(flag){
 24 |     # 转换为字符串
 25 |     y=as.character(i)
 26 |     # 字符分割
 27 |     n=strsplit(y,split = "~",fixed = T)
 28 |     # 字符截取
 29 |     n<-sapply(n, function(x){
 30 |       substring(x, 2, nchar(x))
 31 |     })
 32 |     # 转换为数字
 33 |     n=apply(n,2,as.numeric)
 34 |     # 求平均
 35 |     n_mean=apply(n,2,mean)
 36 |     i=n_mean
 37 |     j="¥"
 38 |     i=paste(j, i, sep = "")
 39 |   }else{
 40 |     i=i
 41 |   }
 42 |   
 43 | })
 44 | data[,6]<-x_price
 45 | View(data)
 46 | View(data[,6])
 47 | # 去除价格的"￥"
 48 | data[,6]<-sapply(data[,6], function(x){
 49 |   substring(x, 2, nchar(x))
 50 | })
 51 | 
 52 | View(data)
 53 | # 查看结构信息
 54 | summary(data)
 55 | # 将字符串转换为数值
 56 | # dflme1[,2:60]<-lapply(dflme1[,2:60],as.numeric)
 57 | data[,2:8]<-lapply(data[,2:8], as.numeric)
 58 | summary(data)
 59 | # 将字符串转换为因子
 60 | data[,c(1,9:13)]<-lapply(data[,c(1,9:13)], as.factor)
 61 | summary(data)
 62 | View(data)
 63 | # typeof(data$国家)
 64 | # data$国家
 65 | # data$是否进口
 66 | 
 67 | #########################################################
 68 | # 缺失值处理
 69 | ########################################################
 70 | # 缺失值的可视化
 71 | library(mice)
 72 | # 查看缺失数据的情况，各行各列统计
 73 | md_data=md.pattern(data)
 74 | # 可以观测到，缺失的数据，都是非数值的变量
 75 | fix(md_data)
 76 | 
 77 | # 缺失值的缺失行的记录
 78 | data.lack<-which(!complete.cases(data))
 79 | # 转换为 数据框
 80 | data.lack<-data.frame(data.lack)
 81 | # 缺失值的总记录数
 82 | data.lack.num<-nrow(data.lack)
 83 | data.lack.num
 84 | data<-data.frame(data)
 85 | summary(data)
 86 | # 把缺失的记录全部删除
 87 | data_clean<-na.omit(data)
 88 | # 重新查看--缺失值的总个数
 89 | n=sum(is.na(data_clean))
 90 | n
 91 | # 非空数据的行数
 92 | (nrow(data_clean))
 93 | 
 94 | # # 求NA和的函数
 95 | # na.count<-function(x){
 96 | #   sum(is.na(x))
 97 | # }
 98 | # # 缺失值的属性超过所有属性数的30%
 99 | # (idx<-which(apply(data,1,na.count)>=ncol(data2)*0.3))
100 | # # 去除缺失30%属性的记录
101 | # data_clean_01<-data[-idx,]
102 | # 
103 | # 
104 | # # 去除防晒缺失的数据和进口缺失的数据
105 | # idx_Sunscreen<-which((!complete.cases(data_clean_01[,11])))
106 | # idx_Imported<-which((!complete.cases(data_clean_01[,13])))
107 | # data_clean_02<-data_clean_01[c(-idx_Sunscreen,-idx_Imported),]
108 | # 
109 | # # 数据清洗完毕
110 | # data_clean<-data_clean_02
111 | # # 查看缺失的数据
112 | # which(!complete.cases(text_deal))
113 | 
114 | ########################################################
115 | # 文件操作
116 | #######################################################
117 | # 写入文件
118 | write.csv(data_clean,file = "mydata.csv",row.names = F)
119 | # 读文件，共1540条记录
120 | data_handle<-read.table("mydata.csv",header=T, sep=",")
121 | data<-data_handle
122 | View(data)
123 | 
124 | 
125 | #############################################################
126 | # 查看喜欢---颜色----的词云
127 | #############################################################
128 | library(Rwordseg)
129 | library(wordcloud)
130 | library(tm)
131 | library(RColorBrewer)
132 | library(wordcloud)
133 | library(rJava)
134 | # 分词包
135 | # install.packages("jiebaR")
136 | library(jiebaR)
137 | library(jiebaRD) 
138 | 
139 | x_color<- data[,9]
140 | x_color<- data.frame(x_color)
141 | View(x_color)
142 | # 导出 .txt 数据格式
143 | # 这里的col.names=F是不保存列名，col.names=T 为保存列名
144 | # 空格分隔
145 | write.table(x_color, file = "color_ciyun.txt",sep ="\t",
146 |             col.name=F,row.names = F, quote = F) 
147 | # 读文件
148 | # 解决：line 3 did not have 3 elements
149 | # 加入参数：blank.lines.skip=F
150 | color_text<-read.table("color_ciyun.txt",head=FALSE,
151 |                        blank.lines.skip=F,quote = "")
152 | View(color_text)
153 | 
154 | 
155 | #读入数据分隔符是‘\n’，字符编码是‘UTF-8’，what=''表示以字符串类型读入
156 | f_color <- scan('color_ciyun.txt',sep='\n',what='',encoding="GB2312",fileEncoding='GB2312')
157 | #将数据字符串化
158 | f_color <-as.character(f_color) 
159 | View(f_color)
160 | # 造工厂
161 | wk<-worker()
162 | word_color<-c('朱红色','粉红色','粉色','玫红','樱桃红','西瓜红','姨妈色','玫紫','经典红','胭脂红','豆沙色','橘子红','哑光',
163 |               '草莓红','酒红色','玫瑰红','暖橙红','中国红','砖红色','复古红','双色','小样','PBF','M0','80','NO.',
164 |               'SE0','RD0','CR0','PK0','OR','PBG','豆沙红','姨妈','复合','CR','水光','MHS','裸红','RD','CR','1#',
165 |               '2#','3#','4#','5#','6#','7#','8#','9#','10#','11#','12#','13#','14#','15#','#1','#2','#3','#4','#5',
166 |               '#6','#7','#8','#9','#10','#11','#12','#13','#14','#15','#20','#30','#40','#50','#60','#70','#80','#90','M0')
167 | # 使用new_user_word函数
168 | # R语言worker()参数user不起作用
169 | new_user_word(wk,word_color)
170 | seg_color<-wk[f_color]
171 | 
172 | #统计词频
173 | seg_color <- table(seg_color)
174 | #去除单个没有意义的字：‘色’
175 | seg_color <- seg_color[!grepl('色',names(seg_color))] 
176 | #去除单个没有意义的字：‘号’
177 | seg_color <- seg_color[!grepl('号',names(seg_color))] 
178 | #去除NA
179 | seg_color <- seg_color[!grepl('NA',names(seg_color))] 
180 | # seg_color
181 | #查看处理完后剩余的词数
182 | length(seg_color) 
183 | #降序排序，并提取出现次数最多的前100个词语
184 | seg_color<- sort(seg_color, decreasing = TRUE)[1:100]
185 | #查看100个词频最高的
186 | seg_color 
187 | View(seg_color)
188 | data_color=data.frame(seg_color)
189 | # 制作词云
190 | wordcloud(data_color$seg_color , data_color$Freq, colors = rainbow(100), random.order=F)
191 | 
192 | # ######################################################
193 | # 查看喜欢的功效词云
194 | # ######################################################
195 | x_save<-data[,10]
196 | x_save<-data.frame(x_save)
197 | View(x_save)
198 | nrow(x_save)
199 | # 导出 .txt 数据格式
200 | # 这里的col.names=F是不保存列名，col.names=T 为保存列名
201 | # 不输出列名、行名:col.names = FALSE,row.names = FALSE
202 | # 代表字符串的双引号——加参数：quote = FALSE
203 | # 空格分隔
204 | write.table(x_save, file = "effect_ciyun.txt",sep ="\t",
205 |             col.name=F,row.names = F, quote = F) 
206 | 
207 | # 读文件
208 | effect_text<-read.table("effect_ciyun.txt",head=FALSE,sep="\t")
209 | View(effect_text)
210 | 
211 | #读入数据分隔符是‘\n’，字符编码是‘UTF-8’，what=''表示以字符串类型读入
212 | effect_f <- scan('effect_ciyun.txt',sep='\n',what='',encoding="GB2312",fileEncoding='GB2312')
213 | #将数据字符串化
214 | effect_f <-as.character(effect_f) 
215 | # 造工厂
216 | wk<-worker()
217 | word_fenci<-c("易上色","易卸妆","不沾杯","不脱妆","防脱色","非小样唇膏","均匀肤色","防脱妆","不掉色",
218 |               "双色口红","雾面哑光","不脱色","温和卸妆","不粘杯","姨妈色","咬唇妆","提亮肤色","均匀肤色","清爽不油腻",
219 |               "哑光唇","液体唇蜜","粘杯唇膏","唇部去死皮","哑光豆沙色","其他功效","哑光口红","按钮唇膏","唇颊两用",
220 |               "豆沙色","植物成份")
221 | # 使用new_user_word函数
222 | # R语言worker()参数user不起作用
223 | new_user_word(wk,word_fenci)
224 | effect_seg<-wk[effect_f]
225 | 
226 | #去除字符长度小于2的词语
227 | effect_seg <- effect_seg[nchar(effect_seg)>1] 
228 | #统计词频
229 | effect_seg <- table(effect_seg)
230 | #去除数字
231 | effect_seg <- effect_seg[!grepl('[0-9]+',names(effect_seg))] 
232 | #去除字母
233 | effect_seg <- effect_seg[!grepl('[a-zA-Z]{2}',names(effect_seg))] 
234 | effect_seg
235 | #查看处理完后剩余的词数
236 | length(effect_seg) 
237 | #降序排序，并提取出现次数最多的前100个词语
238 | effect_seg <- sort(effect_seg, decreasing = TRUE)[1:100]
239 | #查看100个词频最高的
240 | effect_seg
241 | View(effect_seg)
242 | data_Effect=data.frame(effect_seg)
243 | # 制作词云
244 | wordcloud(data_Effect$effect_seg , data_Effect$Freq, colors = rainbow(100), random.order=F)
245 | 
246 | ########################################################
247 | # 对sales_num进行分类处理
248 | ########################################################
249 | # 查看 总销量的分布图
250 | data3<-data
251 | summary(data3)
252 | nrow(data3)
253 | # 转化为英文字段
254 | names(data3) <- c('name','describe_score','price_score','quality_score',
255 |                   'service_score','price','evaluate_num','sales_num',
256 |                   'color','effect','sunScreen','country','imported')
257 | View(data3)
258 | # 原始数据的统计直方图
259 | hist(data3$sales_num)
260 | # 查看销量的排名
261 | sales_num_sorted<-sort(data3[,8],decreasing = TRUE)
262 | View(sales_num_sorted)
263 | # ###################################################
264 | # 对数据进行分级处理
265 | # 将数据分为A/B/C三个等级
266 | # 设置中间变量对处理后的向量进行临时存储
267 | grade=0
268 | num_sales=nrow(data3)
269 | for(i in 1:num_sales){
270 |   if(data3[i,8]>100){
271 |     grade[i]="A"
272 |   }
273 |   else if(data3[i,8]>10){
274 |     grade[i]="B"
275 |   }
276 |   else{
277 |     grade[i]="C"
278 |   }
279 | }
280 | # 将字符型变量转化为含有因子的变量并复制给数据集data3
281 | grade
282 | data3[,8]=factor(grade)
283 | # 查看三个等级数据的数量
284 | summary(data3$sales_num)
285 | # A   B   C 
286 | # 467 566 507 
287 | View(data3)
288 | 
289 | ############################################################
290 | # 文本处理：功效
291 | #############################################################
292 | #读入数据分隔符是‘\n’，字符编码是‘UTF-8’，what=''表示以字符串类型读入
293 | text_deal<-scan('effect_ciyun.txt',sep='\n',what='',encoding="GB2312",fileEncoding='GB2312')
294 | # 将数据字符串化
295 | text_deal<-as.character(text_deal)
296 | text_deal<-data.frame(text_deal)
297 | # which(!complete.cases(text_deal))
298 | # # 把缺失的记录全部删除
299 | # text_deal2<-na.omit(text_deal)
300 | # View(data3)
301 | # nrow(data3)
302 | # text_deal<-data3
303 | n=sum(is.na(text_deal))
304 | n
305 | View(text_deal)
306 | # 判断行数
307 | num_text<-nrow(text_deal)
308 | # 设置中间变量对处理后的向量进行临时存储
309 | weight_save=0
310 | # 计算权重函数
311 | value_weight=function(x){
312 |   temp=0
313 |   if(grepl("易上色",x)==TRUE){
314 |     temp=temp+4
315 |   }
316 |   if(grepl("滋润",x)==TRUE){
317 |     temp=temp+5
318 |   }
319 |   if(grepl("保湿",x)==TRUE){
320 |     temp=temp+5
321 |   }
322 |   if(grepl("防脱色",x)==TRUE){
323 |     temp=temp+4
324 |   }
325 |   if(grepl("易卸妆",x)==TRUE){
326 |     temp=temp+4
327 |   }
328 |   if(grepl("补水",x)==TRUE){
329 |     temp=temp+5
330 |   }
331 |   if(grepl("温和卸妆",x)==TRUE){
332 |     temp=temp+5
333 |   }
334 |   if(grepl("均匀肤色",x)==TRUE){
335 |     temp=temp+4
336 |   }
337 |   if(grepl("不掉色",x)==TRUE){
338 |     temp=temp+3
339 |   }
340 |   if(grepl("防水",x)==TRUE){
341 |     temp=temp+4
342 |   }
343 |   if(grepl("不沾杯",x)==TRUE){
344 |     temp=temp+3
345 |   }
346 |   if(grepl("哑光",x)==TRUE){
347 |     temp=temp+2
348 |   }
349 |   if(grepl("咬唇妆",x)==TRUE){
350 |     temp=temp+4
351 |   }
352 |   if(grepl("不脱色",x)==TRUE){
353 |     temp=temp+3
354 |   }
355 |   if(grepl("提亮肤色",x)==TRUE){
356 |     temp=temp+5
357 |   }
358 |   if(grepl("持久",x)==TRUE){
359 |     temp=temp+4
360 |   }
361 |   if(grepl("防晒",x)==TRUE){
362 |     temp=temp+3
363 |   }
364 |   if(grepl("其他功能",x)==TRUE){
365 |     temp=temp+1
366 |   }
367 |   return(temp)
368 | }
369 | 
370 | # 依次计算每条记录的权重
371 | for(i in 1:num_text){
372 |   weight_save[i]=value_weight(text_deal[i,])
373 | }
374 | weight_save
375 | # 赋予权重
376 | data3[,10]<-weight_save
377 | # 查看信息
378 | summary(data3[,10])
379 | View(data3)
380 | View(data3[,10])
381 | 
382 | ########################################################
383 | # 数据归一化方法: 最大最小值方法
384 | ########################################################
385 | scale_norma=function(x){
386 |   # 提取预处理样本集中特征变量个数
387 |   ncol=dim(x)[2]
388 |   # 提取预处理样本的样本总量
389 |   nrow=dim(x)[1]
390 |   # 建立用于保存新样本集的矩阵
391 |   new=matrix(0,nrow,ncol)
392 |   
393 |   for(i in 1:ncol){
394 |     # 提取每个变量的最大值
395 |     max=max(x[,i])
396 |     # 提取每个变量的最小值
397 |     min=min(x[,i])
398 |     # 对每个变量的所有样本数据进行归一化处理
399 |     for(j in 1:nrow){
400 |       new[j,i]=(x[j,i]-min)/(max-min)
401 |     }
402 |   }
403 |   new
404 | }
405 | 
406 | # data9<-data3
407 | # 进行归一化
408 | data3[,c(2:7,10)]<-scale_norma(data3[,c(2:7,10)])
409 | View(data3)
410 | 
411 | 
412 | ########################################################
413 | # 文件操作
414 | #######################################################
415 | # 写入文件
416 | write.csv(data3[,-9],file = "mydata_clean.csv",row.names = F)
417 | # 读文件
418 | data_handle2<-read.table("mydata_clean.csv",header=T, sep=",")
419 | data3<-data_handle2
420 | View(data3)
421 | 
422 | 
423 | ##################################################################
424 | # 数据抽样，生成训练集和测试集
425 | ###################################################################
426 | # 没有缺失值的数据
427 | View(data3)
428 | # 对数据进行分组抽样
429 | set.seed(50)
430 | num_sales=nrow(data3)
431 | num_sales
432 | # 分层抽样：sampling ，3:1的规则，
433 | library(sampling)
434 | # 保存分组抽取的个数
435 | a=round(1/4*sum(data3$sales_num=="A"))
436 | b=round(1/4*sum(data3$sales_num=="B"))
437 | c=round(1/4*sum(data3$sales_num=="C"))
438 | # 查看分布
439 | a;b;c;
440 | # 分层抽样
441 | samp=strata(data3,stratanames = "sales_num",size=c(a,b,c),method="srswor")
442 | # View(samp)
443 | # 生成训练集
444 | Train_data=data3[-samp$ID_unit,]
445 | # 生成测试集
446 | Test_data=data3[samp$ID_unit,]
447 | # 查看训练集和测试集的个数
448 | nrow(Train_data);nrow(Test_data)
449 | # 1154 386
450 | # 查看维度
451 | dim(Train_data);dim(Test_data)
452 | # 查看抽取样本的信息
453 | View(Train_data)
454 | View(Test_data)
455 | # head(Train_data3)
456 | # 提取训练集和测试集的行号
457 | Train_data_num<-sample(1:nrow(Train_data),nrow(Train_data))
458 | Test_data_num<-sample(1:nrow(Test_data),nrow(Test_data))
459 | # 1540--1154--386
460 | View(Train_data_num)
461 | View(Test_data_num)
462 | 
463 | 
464 | ####################################################
465 | # 模型构建：判别分析----朴素贝叶斯
466 | #####################################################
467 | library(klaR)
468 | # 模型训练
469 | # 对样本进行预处理，去掉 店名name 
470 | data6<-data3[Train_data_num,-1]
471 | View(data6)
472 | # 生成判别规则
473 | data6_Bayes<-NaiveBayes(sales_num~.,data6)
474 | # names(data6_Bayes)
475 | # data6_Bayes$tables
476 | # data6_Bayes$levels
477 | # data6_Bayes$call
478 | # data6_Bayes$usekernel
479 | # data6_Bayes$varnames
480 | 
481 | ############################################################
482 | # 各类别下变量密度可视化
483 | # 'sunScreen','country','imported'
484 | # 对是否防晒各类别绘制密度图
485 | plot(data6_Bayes,vars="describe_score",main="describe_score--密度图",n=20,lwd = 2,col=c("DeepPink","#D55E00","DarkTurquoise","#0033FF","#000000","#009900")) 
486 | # 对价格各类别绘制密度图
487 | plot(data6_Bayes,vars = "price",main="price--密度图",n=50,lwd = 2,col = c("DeepPink","#D55E00","DarkTurquoise","#0033FF","#000000","#009900")) 
488 | # 对是否进口各类别绘制密度图
489 | plot(data6_Bayes,vars = "imported",main="imported--密度图",n=50,col = c("DeepPink","#D55E00","RDarkTurquoise","#0033FF","#000000","#009900")) 
490 | 
491 | # #############################################
492 | # 预测
493 | # 准备测试集
494 | data7<-data3[Test_data_num,-1]
495 | View(data7)
496 | pred_Bayes<-predict(data6_Bayes,data7)
497 | # pred_Bayes
498 | # 混淆矩阵
499 | table(data7$sales_num,pred_Bayes$class)
500 | 
501 | # 计算贝叶斯判别预测错误概率
502 | error_Bayes<-sum(as.numeric(as.numeric(pred_Bayes$class)
503 |                             !=as.numeric(data7$sales_num)))/nrow(data7)
504 | 
505 | # 不满足各变量之间的独立条件
506 | error_Bayes
507 | # 0.5310881
508 | library(ggplot2)
509 | library(corrplot)
510 | cor_Bayes<-cor(data3[,c(2:7,9)])
511 | corrplot(cor_Bayes,method="number")
512 | corrplot(cor_Bayes,method="pie")
513 | 
514 | ################################################################
515 | # 模型构建：集成学习
516 | ################################################################
517 | library(adabag)
518 | library(rpart)
519 | 
520 | # 模型训练
521 | # 对样本进行预处理,训练集
522 | data4<-data3[Train_data_num,-1]
523 | View(data4)
524 | # 模型构建
525 | boost=boosting(sales_num~.,data4,boos = TRUE,mfinal = 500)
526 | # print(boost)
527 | 
528 | # 查看boost所生成的输出项名称
529 | names(boost)
530 | # [1] "formula"    "trees"      "weights"    "votes"      "prob"       "class"      "importance"
531 | # [8] "terms"      "call" 
532 | # 查看树的构成
533 | boost$trees[20]
534 | # 投票情况
535 | boost$votes[200:215,]
536 | # 预测类别
537 | boost$class[200:215]
538 | # "A" "B" "C" "B" "C" "B" "C" "B" "C" "A" "A" "C" "A" "A" "A" "B"
539 | # 模型boost各输入变量的相对重要性
540 | sort(boost$importance,decreasing = TRUE)
541 | 
542 | # 通过control参数控制基分类树的复杂度
543 | boost2=boosting(sales_num~.,data4,boos = TRUE,mfinal = 500,
544 |                 control = rpart.control(maxdepth = 7))
545 | # 查看树的构成
546 | boost2$trees[20]
547 | ################################################
548 | # 预测
549 | data5<-data3[Test_data_num,-1]
550 | # View(data5)
551 | # 预测
552 | pred_boost=predict(boost2,data5)
553 | # 查看测试集的混淆矩阵
554 | pred_boost$confusion
555 | #                 Observed Class
556 | # Predicted Class   A   B   C
557 | #               A 209  18   5
558 | #               B  16  74  15
559 | #               C   3  12  34
560 | 
561 | # 查看测试集的错误率
562 | pred_boost$error
563 | # 0.1787565
564 | # (p=sum(as.numeric(pre_data3!=Test_data$sales_num))/nrow(Test_data))
565 | pred_boost$class<-as.factor(pred_boost$class)
566 | error_boost<-sum(as.numeric(as.numeric(pred_boost$class)
567 |                             !=as.numeric(data5$sales_num)))/nrow(data5)
568 | 
569 | error_boost
570 | # 0.1787565
571 | 
572 | ############################################################
573 | # 模型构建：随机森林的判别模型
574 | ############################################################
575 | # 把缺失的记录全部删除
576 | # data2<-na.omit(data3)
577 | # View(data2)
578 | # md_data2=md.pattern(data2)
579 | # fix(md_data2)
580 | 
581 | # data3<-data2[,-c(9,10)]
582 | # View(data3)
583 | 
584 | # 查看变量的重要值
585 | library(randomForest)
586 | set.seed(50)
587 | num_sales=nrow(data3)
588 | 
589 | set.seed(111)
590 | # 总销量数：sales_num
591 | # 构建决策树为500棵的随机森林模型
592 | data.rf=randomForest(sales_num~.-name,data=data3,ntree=500,
593 |                      importance=TRUE,proximity=TRUE,subset=Train_data_num)
594 | # 展示所构建的随机森林模型
595 | print(data.rf)
596 | # OOB estimate of  error rate: 29.12%
597 | 
598 | # 提取随机森林模型中的重要值
599 | importance(data.rf)
600 | # 提取随机森林中以第一种度量标准得到的重要值
601 | # importance(data.rf,type=1)
602 | # 对重要值进行排序并取值
603 | data.rf_importance<-data.frame(sort(importance(data.rf,type=1)[,1],decreasing = TRUE))
604 | names(data.rf_importance)<-'importance_sorted'
605 | data.rf_importance
606 | # 调用varlmPlot函数绘制变量重要性曲线
607 | varImpPlot(data.rf)
608 | ########################################################
609 | # 随机森林：模型优化
610 | # 自变量的个数，出去店的名字
611 | num_var=ncol(data3)-1
612 | num_var
613 | # 设置模型误判率向量的初始值
614 | rate=1
615 | # 依次逐个增加节点所选变量个数
616 | for(i in 1:num_var){
617 |   set.seed(200)
618 |   # mtry:用来决定随机森林中决策树的每次分支时所选的变量个数
619 |   model=randomForest(sales_num~.-name,data=data3,mtry=i,importance=TRUE,
620 |                      ntree=1000,proximity=TRUE,subset=Train_data_num)
621 |   # 计算基于OOB数据的模型误判率均值
622 |   rate[i]=mean(model$err.rate)
623 |   # 展示模型简要信息
624 |   print(model)
625 | }
626 | # 展示所有模型误判率的均值
627 | rate
628 | # 选择节点数为的作为模型的优化
629 | set.seed(222)
630 | model=randomForest(sales_num~.-name,data=data3,mtry=5,importance=TRUE,ntree=1000,
631 |                    proximity=TRUE,subset=Train_data_num)
632 | # 绘制模型误差与决策树数量的关系图
633 | plot(model,col=1:6)
634 | 
635 | # 为图像添加图例topright
636 | legend(800,0.180,"A",cex=0.9,bty="n")
637 | legend(800,0.352,"B",cex=0.9,bty="n")
638 | legend(800,0.394,"C",cex=0.9,bty="n")
639 | 
640 | # 当决策树大于之后，模型趋于稳定，选择决策树的数量为，进行优化
641 | set.seed(230)
642 | model2=randomForest(sales_num~.-name,data=data3,mtry=5,importance=TRUE,ntree=400,
643 |                     proximity=TRUE,subset=Train_data_num)
644 | # 展示模型的简要信息
645 | print(model2)
646 | # 绘制相应的柱状图，展示随机森林模型中每棵决策树的节点数
647 | hist(treesize(model2))
648 | # 随机森林模型的可视化,注意，要有一个临近矩阵
649 | # 不同类别，使用的符号不一样
650 | MDSplot(data.rf,data3$sales_num,palette = rep(1,6),pch=as.numeric(data3$sales_num))
651 | varImpPlot(model2)
652 | 
653 | #################################################
654 | # 对测试集进行目标变量预测
655 | #################################################
656 | pre_data3=predict(model2,Test_data,type="class")
657 | # 显示预测结果
658 | pre_data3
659 | # 获取混淆矩阵
660 | table(Test_data$sales_num,pre_data3)
661 | # 计算错误
662 | error_randomForest=sum(as.numeric(pre_data3!=Test_data$sales_num))/nrow(Test_data)
663 | error_randomForest
664 | # 错误率为: 0.06994819
665 | 
666 | 
667 | 
668 | 
669 | 
670 | 


--------------------------------------------------------------------------------
/《数据挖掘实验》课程设计--周伟.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leeeric9527/data-mining-R/37279b96fc90b5c926ec8f948ab74822cbac3ef2/《数据挖掘实验》课程设计--周伟.pdf


--------------------------------------------------------------------------------
/《数据挖掘实验》课程设计PPT--周伟.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leeeric9527/data-mining-R/37279b96fc90b5c926ec8f948ab74822cbac3ef2/《数据挖掘实验》课程设计PPT--周伟.pptx


--------------------------------------------------------------------------------
/口红-data.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leeeric9527/data-mining-R/37279b96fc90b5c926ec8f948ab74822cbac3ef2/口红-data.xlsx


--------------------------------------------------------------------------------