├── DDoS检测
│   ├── README.MD
│   ├── client.py
│   ├── gbdt.py
│   ├── model
│   │   ├── RF_clf.m
│   │   ├── back_clf.m
│   │   ├── land_clf.m
│   │   ├── neptune_clf.m
│   │   ├── normal_clf.m
│   │   ├── pod_clf.m
│   │   ├── sumrf_clf.m
│   │   └── teardrop_clf.m
│   ├── mutl_gbdt.py
│   └── paper
│       ├── 1.png
│       ├── 入侵检测数据集KDDCUP99研究_张新有.pdf
│       └── 网络安全态势感知综述.pdf
├── README.MD
└── 用户异常行为检测
    ├── All_User_KNN_Score.py
    ├── All_User_NB_Score.py
    ├── KNN_50.py
    ├── KNN_50_Color.py
    ├── KNN_Result.txt
    ├── MasqueradeDat
    │   ├── User1
    │   ├── User10
    │   ├── User11
    │   ├── User12
    │   ├── User13
    │   ├── User14
    │   ├── User15
    │   ├── User16
    │   ├── User17
    │   ├── User18
    │   ├── User19
    │   ├── User2
    │   ├── User20
    │   ├── User21
    │   ├── User22
    │   ├── User23
    │   ├── User24
    │   ├── User25
    │   ├── User26
    │   ├── User27
    │   ├── User28
    │   ├── User29
    │   ├── User3
    │   ├── User30
    │   ├── User31
    │   ├── User32
    │   ├── User33
    │   ├── User34
    │   ├── User35
    │   ├── User36
    │   ├── User37
    │   ├── User38
    │   ├── User39
    │   ├── User4
    │   ├── User40
    │   ├── User41
    │   ├── User42
    │   ├── User43
    │   ├── User44
    │   ├── User45
    │   ├── User46
    │   ├── User47
    │   ├── User48
    │   ├── User49
    │   ├── User5
    │   ├── User50
    │   ├── User6
    │   ├── User7
    │   ├── User8
    │   ├── User9
    │   └── label.txt
    ├── NB_Result.txt
    ├── NB_all.py
    ├── NB_all_Color.py
    └── README.MD
/DDoS检测/README.MD:
--------------------------------------------------------------------------------
## Predicting DDoS with KDD Cup 99

This was my first attempt at machine learning. The code is heavily commented so beginners can follow it, and the paper folder holds the papers I cite.
The approach, in short:

- ### The KDD Cup 99 dataset in brief:
The dataset contains 25 kinds of attack data plus normal traffic, host configuration information, and log information. Each record carries 41 features. The content-based features are murky for attack prediction and carry a lot of uncertainty, but the features based on connection type and on the number of connections within a 2-second window predict quite well, so the approach has a real advantage for defence at the packet level.
The dataset turned out to predict DDoS rather well; training and test data are split 60/40, and the model learns to recognise and guard against DoS attacks.

[The data is already labelled. The files are too large for the repo, so they are on Baidu Netdisk, password: ojha](https://pan.baidu.com/s/1d7CMe6sO9NqKj2YEygxYjQ)

- ### Feature extraction and data cleaning:
Thanks go to the University of Bergen's kdd99_feature_extractor open-source project, which converts raw network packets into KDD Cup 99-style feature records and gives this project a path towards production deployment.
- ### Model training:
Since the data is already labelled, this is supervised learning with gradient-boosted trees (GBDT). gbdt.py trains the models and client.py loads them for prediction; the remaining files are unused.

- ### Prediction accuracy:
Around 97% under cross-validation.

- ### Shortcomings and future plans:

Supervised learning on such an old dataset is not directly usable in a real production environment, but it demonstrates a way to attack the problem.
Planting a flag here: when I find time I will redo this with the [UNSW-NB15](https://www.unsw.adfa.edu.au/australian-centre-for-cyber-security/cybersecurity/ADFA-NB15-Datasets/) dataset, which is much newer (2015) and would be meaningful to deploy in production.
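
A minimal sketch of how the saved models are consumed (it mirrors client.py; `data/feature.csv` is the labelled data linked above, and on newer scikit-learn `sklearn.externals.joblib` is replaced by the standalone `joblib` package):

```python
import pandas as pd
from sklearn.externals import joblib  # newer scikit-learn: import joblib

clf = joblib.load('model/normal_clf.m')            # 1 = normal, 0 = DDoS
X = pd.read_csv('data/feature.csv').values[:, 1:]  # drop the index column
print(clf.predict(X[:5]))                          # labels for the first five flows
```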
--------------------------------------------------------------------------------
/DDoS检测/client.py:
--------------------------------------------------------------------------------
import pandas as pd
from sklearn.externals import joblib  # newer scikit-learn: import joblib
import time


class DDos_Dection:
    def __init__(self):
        # One binary classifier per traffic class, trained by gbdt.py.
        self.normal_clf = joblib.load('model/normal_clf.m')
        self.back_clf = joblib.load('model/back_clf.m')
        self.land_clf = joblib.load('model/land_clf.m')
        self.neptune_clf = joblib.load('model/neptune_clf.m')
        self.pod_clf = joblib.load('model/pod_clf.m')
        self.sumrf_clf = joblib.load('model/sumrf_clf.m')  # 'sumrf' (sic) = smurf
        self.teardrop_clf = joblib.load('model/teardrop_clf.m')

        # Collect the per-attack classifiers.
        self.clf = []
        self.ddos_type = ['back', 'land', 'neptune', 'pod', 'sumrf', 'teardrop']
        self.clf.append(self.back_clf)
        self.clf.append(self.land_clf)
        self.clf.append(self.neptune_clf)
        self.clf.append(self.pod_clf)
        self.clf.append(self.sumrf_clf)
        self.clf.append(self.teardrop_clf)

    def dection(self, data):
        type1 = []
        normal_flag = self.normal_clf.predict(data)[0]  # first: is the flow normal?
        if normal_flag == 1:
            type1.append('normal')
        elif normal_flag == 0:  # otherwise: which kind(s) of DDoS?
            ddos_type = []
            for i in range(6):
                temp = self.clf[i].predict(data)[0]
                if temp == 1:
                    ddos_type.append(self.ddos_type[i])
            type1 = ddos_type
        return type1


def main():
    data = pd.read_csv('data/feature.csv')
    data = data.values
    data = data[:, 1:]  # keep everything except the first (index) column
    clf = DDos_Dection()  # load the DDoS detector
    for i in range(500):  # try the first 500 records
        feature = data[i].reshape(1, -1)  # predict() expects a 2-D array
        # print('load data finish!')
        # print('load dection finish!')
        print('feature:')
        print(feature)
        result = clf.dection(feature)
        print('dect result :', result)
        time.sleep(5)


if __name__ == '__main__':
    main()
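
# Design note (added comment): detection is a two-stage cascade -- the
# normal-vs-attack model gates first, then six per-attack one-vs-rest models
# each vote, so a single flow can be tagged with more than one attack type;
# mutl_gbdt.py trains the alternative single multiclass model. For throughput,
# predicting the whole matrix at once (e.g. DDos_Dection().normal_clf.predict(data))
# is far faster than these row-by-row calls.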
--------------------------------------------------------------------------------
/DDoS检测/gbdt.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib  # model persistence (dump/load)
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score  # evaluation metrics
# import lightgbm as lgb  # imported in the original but unused here

df1 = pd.read_csv('data/feature.csv')
df2 = pd.read_csv('data/class.csv')

x = df1.values  # .values gives the underlying ndarray (as opposed to .index)
y = df2.values
x = x[:, 1:]  # drop the first (index) column
# y = y[:, 1:]
x = x.astype('float32')  # sample features
y = y.astype('float32')  # sample labels

back = y[:, 0]      # column 0: back labels
land = y[:, 1]      # column 1: land labels, and so on per class
neptune = y[:, 2]
normal = y[:, 3]
pod = y[:, 4]
smurf = y[:, 5]
teardrop = y[:, 6]

print(x.shape)  # (rows, columns)
print(normal.shape)

y = teardrop  # train the teardrop detector first; swap in another column for other classes

# 40% held out for testing; a fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
print('data load finish.....')

print(np.sum(y_train))
print(np.sum(y_test))  # positive counts per split (no cross-validation here)

# GBDT: too many estimators overfits, too few underfits; then learning rate,
# maximum tree depth, and verbose logging
clf = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, max_depth=5, verbose=1)
clf.fit(X_train, y_train)  # train
y_ = clf.predict(X_test)   # predict

score = f1_score(y_test, y_)  # F1 score on the test split

print(score)

joblib.dump(clf, 'model/teardrop_clf.m')
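
# Optional check (sketch added for illustration, not part of the original run):
# the README quotes roughly 97% under cross-validation; a 10-fold estimate of
# the same model on the full data would look like this.
from sklearn.model_selection import cross_val_score

cv_model = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, max_depth=5)
cv_scores = cross_val_score(cv_model, x, y, cv=10, scoring='f1')
print('10-fold CV F1: %.4f +/- %.4f' % (cv_scores.mean(), cv_scores.std()))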
--------------------------------------------------------------------------------
/DDoS检测/model/RF_clf.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aviraonepiece/machine_learning/ed4a286df0d7638a67ba3d833f651fc4fc0aaa7e/DDoS检测/model/RF_clf.m
--------------------------------------------------------------------------------
/DDoS检测/model/back_clf.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aviraonepiece/machine_learning/ed4a286df0d7638a67ba3d833f651fc4fc0aaa7e/DDoS检测/model/back_clf.m
--------------------------------------------------------------------------------
/DDoS检测/model/land_clf.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aviraonepiece/machine_learning/ed4a286df0d7638a67ba3d833f651fc4fc0aaa7e/DDoS检测/model/land_clf.m
--------------------------------------------------------------------------------
/DDoS检测/model/neptune_clf.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aviraonepiece/machine_learning/ed4a286df0d7638a67ba3d833f651fc4fc0aaa7e/DDoS检测/model/neptune_clf.m
--------------------------------------------------------------------------------
/DDoS检测/model/normal_clf.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aviraonepiece/machine_learning/ed4a286df0d7638a67ba3d833f651fc4fc0aaa7e/DDoS检测/model/normal_clf.m
--------------------------------------------------------------------------------
/DDoS检测/model/pod_clf.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aviraonepiece/machine_learning/ed4a286df0d7638a67ba3d833f651fc4fc0aaa7e/DDoS检测/model/pod_clf.m
--------------------------------------------------------------------------------
/DDoS检测/model/sumrf_clf.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aviraonepiece/machine_learning/ed4a286df0d7638a67ba3d833f651fc4fc0aaa7e/DDoS检测/model/sumrf_clf.m
--------------------------------------------------------------------------------
/DDoS检测/model/teardrop_clf.m:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aviraonepiece/machine_learning/ed4a286df0d7638a67ba3d833f651fc4fc0aaa7e/DDoS检测/model/teardrop_clf.m
--------------------------------------------------------------------------------
/DDoS检测/mutl_gbdt.py:
--------------------------------------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier

df1 = pd.read_csv('data/feature.csv')
df2 = pd.read_csv('data/class.csv')

x = df1.values
y = df2.values
x = x[:, 1:]
y = y[:, 1:]
x = x.astype('float32')
y = y.astype('float32')
y = np.argmax(y, axis=1)  # collapse the one-hot label columns to a single class index

print(x.shape)
print(y.shape)


X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
print('data load finish.....')
# print(df2.columns)
# print(np.sum(y_train, axis=0))
# print(np.sum(y_test, axis=0))

clf = GradientBoostingClassifier(n_estimators=50, learning_rate=0.1, max_depth=5, verbose=1)
clf2 = OneVsRestClassifier(clf)  # defined but unused: GBDT handles multiclass natively
clf.fit(X_train, y_train)
joblib.dump(clf, 'model/mutul_gbdt_clf.m')

y_ = clf.predict(X_test)

# recall; the default average='binary' raises an error on multiclass labels,
# so average per class with a macro mean
score = recall_score(y_test, y_, average='macro')
print(score)
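
# Per-class breakdown of the same predictions (sketch added for illustration;
# the class indices follow the column order of data/class.csv):
from sklearn.metrics import classification_report
print(classification_report(y_test, y_))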
--------------------------------------------------------------------------------
/DDoS检测/paper/1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aviraonepiece/machine_learning/ed4a286df0d7638a67ba3d833f651fc4fc0aaa7e/DDoS检测/paper/1.png
--------------------------------------------------------------------------------
/DDoS检测/paper/入侵检测数据集KDDCUP99研究_张新有.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aviraonepiece/machine_learning/ed4a286df0d7638a67ba3d833f651fc4fc0aaa7e/DDoS检测/paper/入侵检测数据集KDDCUP99研究_张新有.pdf
--------------------------------------------------------------------------------
/DDoS检测/paper/网络安全态势感知综述.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aviraonepiece/machine_learning/ed4a286df0d7638a67ba3d833f651fc4fc0aaa7e/DDoS检测/paper/网络安全态势感知综述.pdf
--------------------------------------------------------------------------------
/README.MD:
--------------------------------------------------------------------------------
# Case studies in applying machine-learning algorithms to network security problems

The project originally had a grander name; it grew out of work with my advisor on "Network Security Situational Awareness Based on Big Data":

> 1. Collect large amounts of data about the monitored network, both static and dynamic: topology, environment configuration, and state information, plus traffic data, logs from the various defences, IDS alerts, and other basic operational information.
> 2. Use big-data techniques to analyse the collected traffic and logs, clean and integrate the data, and mine their correlations, looking for the traffic signatures that precede an attack, both at the point of network entry and during privilege escalation.
> 3. Quantify the analysis and build models that predict the security threats implied by different traffic signatures, turning traditional passive defence into active defence, improving incident response, and preventing the most attacks at the lowest cost.

But!! For third-year-undergraduate me, collecting training data was hard, and models are demanding about data quality, so I practised on existing high-quality datasets instead, which also kicked off my machine-learning education.

A few of my own thoughts: **today a machine-learning engine is still an assistant; it has to be combined with something like a rule base before real situational awareness or an active firewall is possible, and that is the mainstream design of current defensive products. The most labour-intensive part of each half is writing regular-expression rules on one side, and cleaning and labelling training data on the other. Reinforcement learning looks promising in the security industry, and I believe it will eventually deliver results in IPS and WAF adversarial settings.**

Here are the demos; click through to each sub-project's introduction. I am still a machine-learning beginner, so corrections are welcome. The experiments all ran on Python 3.6.0 | Anaconda 4.3.1 (64-bit).

- [Predicting DDoS attacks with the KDD Cup 99 dataset](https://github.com/aviraonepiece/machine_learning/tree/master/DDoS%E6%A3%80%E6%B5%8B)

- [User anomalous-behaviour detection on MasqueradeDat](https://github.com/aviraonepiece/machine_learning/tree/master/%E7%94%A8%E6%88%B7%E5%BC%82%E5%B8%B8%E8%A1%8C%E4%B8%BA%E6%A3%80%E6%B5%8B)

--------------------------------------------------------------------------------
/用户异常行为检测/All_User_KNN_Score.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-


from sklearn.metrics import classification_report
import sys
import numpy as np
import nltk

from sklearn.neighbors import KNeighborsClassifier

import pandas as pd
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection
import warnings
warnings.filterwarnings("ignore")

# The training set takes the first N command blocks (counted from the front,
# including the first 50 known-normal ones); the test set is the remaining
# 150-N of the 150 blocks.
N = 100


def load_user_cmd(filename):  # load one user's command file
    cmd_list = []
    dist_max = []
    dist_min = []
    dist = []
    # Read commands one per line; every 100 commands form one block in cmd_list.
    with open(filename) as f:  # shorthand for try: open ... finally: f.close()
        i = 0
        x = []
        for line in f:
            line = line.strip('\n')  # drop the trailing newline
            x.append(line)
            dist.append(line)
            i += 1
            if i == 100:
                cmd_list.append(x)
                # each x here is one block of 100 commands
                x = []
                i = 0
        # cmd_list now holds 15000/100 = 150 blocks, indexed 0-149

    fist = dist[0:5000]  # dist holds all 15000 commands; take the first 5000 (known normal)
    # Overlap features are computed later, so it seems safest to take the
    # most/least frequent commands from known-normal behaviour only.
    fdist = nltk.FreqDist(fist)
    ser = pd.Series(fdist)
    sersort = ser.sort_values()  # ascending by frequency
    dist_min = sersort.index[0:50].tolist()  # the 50 least frequent commands
    dist_max = sersort.index[-50:].tolist()  # the 50 most frequent commands
    return cmd_list, dist_max, dist_min


def get_user_cmd_feature(user_cmd_list, dist_max, dist_min):  # user_cmd_list: 150 blocks, 0-149
    user_cmd_feature = []
    for cmd_block in user_cmd_list:
        f1 = len(set(cmd_block))  # feature 1: number of distinct commands in the block

        fdist = nltk.FreqDist(cmd_block)
        ser = pd.Series(fdist)
        sersort = ser.sort_values()  # ascending by frequency
        f3 = sersort.index[0:10].tolist()  # the block's 10 least frequent commands (feature 3)
        f2 = sersort.index[-10:].tolist()  # the block's 10 most frequent commands (feature 2)

        f2 = len(set(f2) & set(dist_max))  # overlap with the user's top-50 command list
        f3 = len(set(f3) & set(dist_min))  # overlap with the user's bottom-50 command list
        x = [f1, f2, f3]
        user_cmd_feature.append(x)
    return user_cmd_feature
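
# Tiny worked example of the overlap features (illustrative values only, added
# for clarity): if a block's top-10 list is ['ls', 'cd', 'vi', ...] and only
# 'ls' and 'cd' also appear in the user's top-50 list dist_max, then
# f2 = len(set(top10) & set(dist_max)) == 2; f3 works the same way against
# dist_min. Each 100-command block therefore reduces to just [f1, f2, f3].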

def get_label(filename, index=0):
    x = []

    with open(filename) as f:

        for line in f:
            line = line.strip('\n')  # drop the newline so prints stay on one line
            # index selects the label column; e.g. user 3's labels are column 2 (0-based)
            x.append(int(line.split()[index]))

    return x  # one column of labels


if __name__ == '__main__':

    for usernum in range(1, 51):

        user_cmd_list, user_cmd_dist_max, user_cmd_dist_min = load_user_cmd(
            "D:/ml/用户异常行为检测/MasqueradeDat/User%s" % (usernum))  # "./MasqueradeDat/User9"
        # Most/least frequent commands are known; now extract features (data cleaning).
        user_cmd_feature = get_user_cmd_feature(user_cmd_list, user_cmd_dist_max, user_cmd_dist_min)
        # Feature set ready: three numbers per block.
        labels = get_label("D:/ml/用户异常行为检测/MasqueradeDat/label.txt", usernum - 1)
        # Labels ready: one column of label.txt per user.
        y = [0] * 50 + labels  # prepend 50 zeros: the first 50 blocks are known normal

        x_train = user_cmd_feature[0:N]  # training features (N of 150 blocks, incl. the 50 normal)
        y_train = y[0:N]                 # training labels

        x_test = user_cmd_feature[N:150]  # test features: the last 150-N blocks
        y_test = y[N:150]                 # test labels

        # KNN
        neigh = KNeighborsClassifier(n_neighbors=6, algorithm='auto')  # k tuned to 6; 'auto' picks among the available search methods
        neigh.fit(x_train, y_train)
        y_predict = neigh.predict(x_test)  # predict on the test set
        score = np.mean(y_test == y_predict) * 100  # fraction of matching labels
        # print('User%s actual labels for the last 50 blocks (0 = normal):' % (usernum), y_test)
        # print('KNN predicted labels for the last 50 blocks (0 = normal):', y_predict.tolist())
        print('User %s KNN anomaly prediction accuracy:' % (usernum), score)
        target_name = ['normal', 'anomalous']
        # print(classification_report(y_test, y_predict, target_names=target_name))

        # print(model_selection.cross_val_score(neigh, user_cmd_feature, y, n_jobs=-1, cv=10))
        y_predict_knn10 = model_selection.cross_val_predict(neigh, user_cmd_feature, y, n_jobs=-1, cv=10)
        score = np.mean(y_test == y_predict_knn10[-50:]) * 100
        # Compare predictions with the known labels on the last 50 blocks (the test blocks).
        # print('User%s actual labels for the last 50 blocks (0 = normal):' % (usernum), y_test)
        # print('10-fold CV labels for the last 50 blocks (0 = normal):', y_predict_knn10[-50:].tolist())
        print('User %s KNN 10-fold CV anomaly prediction accuracy:' % (usernum), score, '\n')


        # # SVM
        # clfsvm = svm.SVC(kernel='linear', C=1).fit(x_train, y_train)
        # y_predict_svm = clfsvm.predict(x_test)  # (the original predicted with neigh here by mistake)
        # score = np.mean(y_test == y_predict_svm) * 100
        # print('SVM actual labels for the last 50 blocks (0 = normal):', y_test)
        # print('SVM predicted labels for the last 50 blocks (0 = normal):', y_predict_svm.tolist())
        # print('SVM anomaly prediction accuracy:', score)
        # target_name = ['normal', 'anomalous']
        # print(classification_report(y_test, y_predict_svm, target_names=target_name))
        #
        #
        # # NB
        # clfnb = GaussianNB().fit(x_train, y_train)
        # y_predict_nb = clfnb.predict(x_test)
        # score = np.mean(y_test == y_predict_nb) * 100
        # print('NB actual labels for the last 50 blocks (0 = normal):', y_test)
        # print('NB predicted labels for the last 50 blocks (0 = normal):', y_predict_nb.tolist())
        # print('NB anomaly prediction accuracy:', score)
        # target_name = ['normal', 'anomalous']
        # print(classification_report(y_test, y_predict_nb, target_names=target_name))
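
        # How a value like k = 6 might be chosen (illustrative sketch, not the
        # original tuning procedure -- kept commented out like the experiments above):
        # for k in range(1, 11):
        #     acc = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train).score(x_test, y_test)
        #     print('k=%d accuracy=%.3f' % (k, acc))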
--------------------------------------------------------------------------------
/用户异常行为检测/All_User_NB_Score.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-


import numpy as np
from sklearn.metrics import classification_report
import sys

from nltk.probability import FreqDist

from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection
import warnings
warnings.filterwarnings("ignore")

# The training set takes the first N command blocks (counted from the front,
# including the first 50 known-normal ones); the test set is the remaining
# 150-N of the 150 blocks.
N = 100


def load_user_cmd_all(filename):  # load one user's command file
    cmd_list = []
    dist = []
    with open(filename) as f:  # every 100 commands form one block in cmd_list
        i = 0
        x = []
        for line in f:
            line = line.strip('\n')
            x.append(line)
            dist.append(line)
            i += 1
            if i == 100:
                cmd_list.append(x)
                # each x here is one block of 100 commands
                x = []
                i = 0
        # cmd_list holds 15000/100 = 150 blocks, indexed 0-149

    fdist = list(FreqDist(dist).keys())  # dist holds all 15000 commands; this deduplicates them
    return cmd_list, fdist  # 150 blocks, plus the deduplicated command vocabulary


def get_user_cmd_feature_all(user_cmd_list, dist):  # dist is the vocabulary fdist from above
    user_cmd_feature = []

    for cmd_list in user_cmd_list:  # iterate over the 150 blocks, 100 commands each
        v = [0] * len(dist)  # zero vector, one slot per vocabulary entry (107 for User3)

        for i in range(0, len(dist)):  # for each vocabulary command...
            if dist[i] in cmd_list:    # ...present in this block?
                v[i] += 1              # set-of-words: each slot ends up 0 or 1
        user_cmd_feature.append(v)     # one vector per block

    return user_cmd_feature  # 150 vectors of len(dist) components each (107 for User3)
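
def get_user_cmd_feature_all_sklearn(user_cmd_list, dist):
    # Sketch added for comparison (not called below): the same set-of-words
    # encoding via scikit-learn, assuming blocks are lists of command strings.
    from sklearn.feature_extraction.text import CountVectorizer
    vec = CountVectorizer(vocabulary=dist, binary=True,
                          analyzer=lambda block: block)  # blocks are pre-tokenised
    return vec.transform(user_cmd_list).toarray().tolist()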

def get_label(filename, index=0):
    x = []
    with open(filename) as f:  # shorthand for try: open ... finally: f.close()
        for line in f:
            line = line.strip('\n')
            x.append(int(line.split()[index]))  # one column of label.txt = one user's block labels
    return x


if __name__ == '__main__':

    for usernum in range(1, 51):

        user_cmd_list, dist = load_user_cmd_all("D:/ml/用户异常行为检测/MasqueradeDat/User%s" % (usernum))  # dist: deduplicated vocabulary
        # print("This user's deduplicated vocabulary Dist: (%s)" % dist)
        user_cmd_feature = get_user_cmd_feature_all(user_cmd_list, dist)  # 150 vectors of len(dist) 0/1 components

        labels = get_label("D:/ml/用户异常行为检测/MasqueradeDat/label.txt", usernum - 1)
        y = [0] * 50 + labels  # prepend the 50 known-normal block labels

        x_train = user_cmd_feature[0:N]  # first N(100) block vectors for training
        y_train = y[0:N]                 # matching training labels

        x_test = user_cmd_feature[N:150]  # test features
        y_test = y[N:150]                 # test labels


        clf = GaussianNB().fit(x_train, y_train)
        y_predict = clf.predict(x_test)
        score = np.mean(y_test == y_predict) * 100


        # print('User%s actual labels for the last 50 blocks (0 = normal):' % (usernum), y_test)
        # print('NB predicted labels for the last 50 blocks (0 = normal):', y_predict.tolist())
        print('User %s NB anomaly prediction accuracy:' % (usernum), score)

        target_name = ['normal', 'anomalous']
        # print(classification_report(y_test, y_predict, target_names=target_name))
        # print(model_selection.cross_val_score(clf, user_cmd_feature, y, n_jobs=-1, cv=10))


        y_predict_nb10 = model_selection.cross_val_predict(clf, user_cmd_feature, y, n_jobs=-1, cv=10)
        score = np.mean(y_test == y_predict_nb10[-50:]) * 100
        # Compare predictions with the known labels on the last 50 blocks (the test blocks).
        # print('User%s actual labels for the last 50 blocks (0 = normal):' % (usernum), y_test)
        # print('10-fold CV labels for the last 50 blocks (0 = normal):', y_predict_nb10[-50:].tolist())
        print('User %s NB 10-fold CV anomaly prediction accuracy:' % (usernum), score, '\n')
--------------------------------------------------------------------------------
/用户异常行为检测/KNN_50.py:
--------------------------------------------------------------------------------
# -*- coding:utf-8 -*-


from sklearn.metrics import classification_report
import sys
import numpy as np
import nltk

from sklearn.neighbors import KNeighborsClassifier

import pandas as pd
# from sklearn import svm
# from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection
import warnings
warnings.filterwarnings("ignore")

# The training set takes the first N command blocks (counted from the front,
# including the first 50 known-normal ones); the test set is the remaining
# 150-N of the 150 blocks.
N = 100


def load_user_cmd(filename):  # load one user's command file
    cmd_list = []
    dist_max = []
    dist_min = []
    dist = []
    # Read commands one per line; every 100 commands form one block in cmd_list.
    with open(filename) as f:  # shorthand for try: open ... finally: f.close()
        i = 0
        x = []
        for line in f:
            line = line.strip('\n')  # drop the trailing newline
            x.append(line)
            dist.append(line)
            i += 1
            if i == 100:
                cmd_list.append(x)
                # each x here is one block of 100 commands
                x = []
                i = 0
        # cmd_list now holds 15000/100 = 150 blocks, indexed 0-149

    fist = dist[0:5000]  # dist holds all 15000 commands; take the first 5000 (known normal)
    # Overlap features are computed later, so it seems safest to take the
    # most/least frequent commands from known-normal behaviour only.
    fdist = nltk.FreqDist(fist)
    ser = pd.Series(fdist)
    sersort = ser.sort_values()  # ascending by frequency
    dist_min = sersort.index[0:50].tolist()  # the 50 least frequent commands
    dist_max = sersort.index[-50:].tolist()  # the 50 most frequent commands
    return cmd_list, dist_max, dist_min


def get_user_cmd_feature(user_cmd_list, dist_max, dist_min):  # user_cmd_list: 150 blocks, 0-149
    user_cmd_feature = []
    for cmd_block in user_cmd_list:
        f1 = len(set(cmd_block))  # feature 1: number of distinct commands in the block

        fdist = nltk.FreqDist(cmd_block)
        ser = pd.Series(fdist)
        sersort = ser.sort_values()  # ascending by frequency
        f3 = sersort.index[0:10].tolist()  # the block's 10 least frequent commands (feature 3)
        f2 = sersort.index[-10:].tolist()  # the block's 10 most frequent commands (feature 2)

        f2 = len(set(f2) & set(dist_max))  # overlap with the user's top-50 command list
        f3 = len(set(f3) & set(dist_min))  # overlap with the user's bottom-50 command list
        x = [f1, f2, f3]
        user_cmd_feature.append(x)
    return user_cmd_feature


def get_label(filename, index=0):
    x = []

    with open(filename) as f:

        for line in f:
            line = line.strip('\n')  # drop the newline so prints stay on one line
            # index selects the label column; e.g. user 3's labels are column 2 (0-based)
            x.append(int(line.split()[index]))

    return x  # one column of labels

if __name__ == '__main__':
    arg = sys.argv
    try:
        if len(arg) >= 2 and 0 < int(arg[1]) < 51:  # valid user numbers: 1-50
--------------------------------------------------------------------------------
/用户异常行为检测/NB_all.py:
--------------------------------------------------------------------------------
            usernum = int(arg[1])
            user_cmd_list, dist = load_user_cmd_all("D:/ml/用户异常行为检测/MasqueradeDat/User%s" % (usernum))  # dist: deduplicated vocabulary
            # print("This user's deduplicated vocabulary Dist: (%s)" % dist)
            user_cmd_feature = get_user_cmd_feature_all(user_cmd_list, dist)  # 150 vectors of len(dist) 0/1 components

            labels = get_label("D:/ml/用户异常行为检测/MasqueradeDat/label.txt", usernum - 1)
            y = [0] * 50 + labels  # prepend the 50 known-normal block labels

            x_train = user_cmd_feature[0:N]  # first N(100) block vectors for training
            y_train = y[0:N]                 # matching training labels

            x_test = user_cmd_feature[N:150]  # test features
            y_test = y[N:150]                 # test labels


            clf = GaussianNB().fit(x_train, y_train)
            y_predict = clf.predict(x_test)
            score = np.mean(y_test == y_predict) * 100


            print('User%s actual labels for the last 50 blocks (0 = normal):' % (usernum), y_test)

            print('NB predicted labels for the last 50 blocks (0 = normal):', y_predict.tolist())
            print('NB anomaly prediction accuracy:', score)
            target_name = ['normal', 'anomalous']
            print(classification_report(y_test, y_predict, target_names=target_name))
            print(model_selection.cross_val_score(clf, user_cmd_feature, y, n_jobs=-1, cv=10))


            y_predict_nb10 = model_selection.cross_val_predict(clf, user_cmd_feature, y, n_jobs=-1, cv=10)
            score = np.mean(y_test == y_predict_nb10[-50:]) * 100
            # Compare predictions with the known labels on the last 50 blocks (the test blocks).
            print('User%s actual labels for the last 50 blocks (0 = normal):' % (usernum), y_test)
            print('10-fold CV labels for the last 50 blocks (0 = normal):', y_predict_nb10[-50:].tolist())
            print('NB 10-fold CV anomaly prediction accuracy:', score)

    try:
        if len(arg) == 3 and 0
--------------------------------------------------------------------------------
/用户异常行为检测/NB_all_Color.py:
--------------------------------------------------------------------------------
        if len(arg) >= 1 and 0 < int(arg[0]) < 51:  # valid user numbers: 1-50
            usernum = int(arg[0])
            user_cmd_list, dist = load_user_cmd_all("D:/ml/用户异常行为检测/MasqueradeDat/User%s" % (usernum))  # dist: deduplicated vocabulary
            # print("This user's deduplicated vocabulary Dist: (%s)" % dist)
            user_cmd_feature = get_user_cmd_feature_all(user_cmd_list, dist)  # 150 vectors of len(dist) 0/1 components

            labels = get_label("D:/ml/用户异常行为检测/MasqueradeDat/label.txt", usernum - 1)
            y = [0] * 50 + labels  # prepend the 50 known-normal block labels

            x_train = user_cmd_feature[0:N]  # first N(100) block vectors for training
            y_train = y[0:N]                 # matching training labels

            x_test = user_cmd_feature[N:150]  # test features
            y_test = y[N:150]                 # test labels


            clf = GaussianNB().fit(x_train, y_train)
            y_predict = clf.predict(x_test)
            score = np.mean(y_test == y_predict) * 100

            print()
            print('Model: naive Bayes (NB)   Features: each command block encoded as a set-of-words vector')
            print('Detection of the 50 test blocks for User%s (\033[1;30;42m0 \033[0m = normal, i.e. this user; \033[1;30;41m1 \033[0m = anomalous, i.e. someone else):' % (usernum))
            print()
            print('Actual \033[1;30;0mlabels\033[0m of the blocks (50 blocks):   ', end='')
            for i in range(0, (150 - N)):
                if y_test[i] == 1:
                    print('\033[1;30;41m1 \033[0m', end='')
                else:
                    print('\033[1;30;42m0 \033[0m', end='')
            print()
            print('    NB \033[1;30;0mpredictions\033[0m (50 blocks):          ', end='')
            for i in range(0, (150 - N)):
                if y_predict.tolist()[i] == 1:
                    print('\033[1;30;41m1 \033[0m', end='')
                else:
                    print('\033[1;30;42m0 \033[0m', end='')
            print()

            print('NB anomaly prediction accuracy: \033[1;30;0m%s \033[0m ' % (score))

            # print('User%s actual labels for the last 50 blocks (0 = normal):' % (usernum), y_test)
            #
            # print('NB predicted labels for the last 50 blocks (0 = normal):', y_predict.tolist())
            # print('NB anomaly prediction accuracy:', score)

            # NB details
            # target_name = ['normal', 'anomalous']
            # print(classification_report(y_test, y_predict, target_names=target_name))
            # print(model_selection.cross_val_score(clf, user_cmd_feature, y, n_jobs=-1, cv=10))


            y_predict_nb10 = model_selection.cross_val_predict(clf, user_cmd_feature, y, n_jobs=-1, cv=10)
            # cross_val_predict returns the estimator's prediction for every sample,
            # which matters for later model improvement: comparing these outputs
            # with the true labels pinpoints exactly where predictions go wrong,
            # which helps a lot with parameter tuning and debugging.
            score = np.mean(y_test == y_predict_nb10[-(150 - N):]) * 100
            # Compare predictions with the known labels on the last 50 blocks (the test blocks).
            print()
            print()
            print('Model: naive Bayes with 10-fold cross-validation   Features: each command block encoded as a set-of-words vector')
            print('Detection of the 50 test blocks for User%s (\033[1;30;42m0 \033[0m = normal, i.e. this user; \033[1;30;41m1 \033[0m = anomalous, i.e. someone else):' % (usernum))
            print()
            print('   Actual \033[1;30;0mlabels\033[0m of the blocks (50 blocks): ', end='')
            # print('User%s actual labels for the last 50 blocks (0 = normal):' % (usernum), y_test)
            for i in range(0, (150 - N)):
                if y_test[i] == 1:
                    print('\033[1;30;41m1 \033[0m', end='')
                else:
                    print('\033[1;30;42m0 \033[0m', end='')
            print()
            print('NB 10-fold CV \033[1;30;0mpredictions\033[0m (50 blocks): ', end='')
            for i in range(0, (150 - N)):
                if y_predict_nb10[-(150 - N):].tolist()[i] == 1:
                    print('\033[1;30;41m1 \033[0m', end='')
                else:
                    print('\033[1;30;42m0 \033[0m', end='')
            print()
            # print('10-fold CV labels for the last 50 blocks (0 = normal):', y_predict_nb10[-50:].tolist())
            print('NB 10-fold CV prediction accuracy: \033[1;30;0m%s \033[0m ' % (score))

            # print('User%s actual labels for the last 50 blocks (0 = normal):' % (usernum), y_test)
            # print('10-fold CV labels for the last 50 blocks (0 = normal):', y_predict_nb10[-50:].tolist())
            # print('NB 10-fold CV anomaly prediction accuracy:', score)


    try:
        if len(arg) == 2 and 0